Skip to content

Data Manipulation with dplyr

Run the hidden code cell below to import the data used in this course.

# Load the Tidyverse
library(tidyverse)

# Load the course datasets
babynames <- read_rds("datasets/babynames.rds")
counties <- read_rds("datasets/counties.rds")
counties_selected <- counties %>%
	select(county, region, state, population, walk)

Take Notes

Add notes about the concepts you've learned and code cells with code you want to keep.

Add your notes here

# Select() Verb
counties %>%
	select(state, coutny, population, unemployment)
# Select a range
counties %>%
	select(state, county, drive:work_at_home)
# Assigning
counties_selected <- counties %>%
	select(state, county, population, unemployment)

# Arrange() Verb
Sorts observations based on one or more variables 
	counties_selected %>%
		arrange(population)
# Descending order: desc()
	counties_selected %>%
		arrange(desc(population))
# Filter() Verb
	filter plus logical (=), (>), or (<)

# Mutate() Verb
adds column to dataset
counties_selected %>%
	mutate(unemployed_population = population * unemployment / 100)

# Count() Verb
counties %>%
	count()
# Count and sort
counties %>%
	count(state, sort = TRUE)
sorts like desc
# weight
counties %>%
	count(state, wt = population, sort = TRUE)
gives total population 

# Summarize 
counties %>%
	summarize(total_population = sum(population))
# summary functions
* sum(), mean(), median(), min(), max(), n()

# slice_min & slice_max
Allows us to extract most extreme values
#slice_max()
*returns largest observations in each group
	counties _selected <- counties %>%
		select(state, county, population, unemployment, income)
	counties_selected %>%
		group_by(state) %>%
		slice_max(population, n = 1) (here n = # of counties in each state)
#slice_min()
*returns smallest observations in each group
	counties _selected <- counties %>%
		select(state, county, population, unemployment, income)
	counties_selected %>%
		group_by(state) %>%
		slice_min(population, n = 1) (here n = # of counties in each state)


									  
# Selecting
# Contains
counties %>%
	select(state, county, contains("work"))
# Starts with
counties %>%
	select(state, county, starts_with("income"))
# Other helpers
contains(), starts_with(), ends_with(), last_col(), matches()
# removing a variable
counties %>%
	select(-census_id)

# Rename columns
counties_selected %>%
	rename(unemployment_rate = unemployment) or
	select(state, county, population, unemployment_rate = unemployment)

# transmute verb
* combo of select() and mutate()
* returns a subset of columns that are transformed and changed

counties %>%
	transmute(state, co unty, fraction_men = men / population)
# select and calculate
counties %>%
	transmute(state, county, population, unemployed_people = population * unemployment / 100)
# filter for multiple names
* %in%:used to filter for multiple values

babynames_multiple <- babynames %>%
	filter(name %in% c("Amy", "Christopher"))
	
# grouped mutates
babynames %>%
	group_by(year) %>%
	mutate(year_total = sum(number))
# ungroup
babynames %>%
	group_by(year) %>% 
	mutate(year_total = sum(number)) %>% 
	ungroup()
# add the fraction column
babynames %>%
	group_by(year) %>%
	mutate(year_total = sum(number)) %>%
	ungroup() %>%
	mutate(fraction = number / year_total)
# Window functions
v <- c(1, 3, 6, 14)
v
# lag() moves each item to the right by 1
lag(v)
#this should say [NA, 1, 3, 6]
# Compare consecutive steps
v - lag(v)
# what is each value after we have subtracted the previous one?
# should say [NA, 2, 3, 8]
# CHANGES IN POPULARITY OF A NAME
babynames_fraction <- babynames %>%
	group_by(year) %>% 
	mutate(year_total = sum(number)) %>%
	ungroup() %>%
	mutate(fraction = number / year_total)
# MATTHEW
babynames_fraction %>%
	filter(name == "Matthew") %>%
	arrange(year)
	mutate(difference = fraction - lag(fraction))
# CHANGES WITHIN EVERY NAME
babynames_fraction %>%
	arrange(name, year) %>%
	group_by(name) %>%
	mutate(difference = fraction - lag(fraction)) %>%
	ungroup() %>%
	arrange(desc(difference))