Skip to content
Data Manipulation with dplyr
Data Manipulation with dplyr
Run the hidden code cell below to import the data used in this course.
# Load the Tidyverse
library(tidyverse)
# Load the course datasets
babynames <- read_rds("datasets/babynames.rds")
counties <- read_rds("datasets/counties.rds")
counties_selected <- counties %>%
select(county, region, state, population, walk)Take Notes
Add notes about the concepts you've learned and code cells with code you want to keep.
Add your notes here
# Select() Verb
counties %>%
select(state, coutny, population, unemployment)
# Select a range
counties %>%
select(state, county, drive:work_at_home)
# Assigning
counties_selected <- counties %>%
select(state, county, population, unemployment)
# Arrange() Verb
Sorts observations based on one or more variables
counties_selected %>%
arrange(population)
# Descending order: desc()
counties_selected %>%
arrange(desc(population))
# Filter() Verb
filter plus logical (=), (>), or (<)
# Mutate() Verb
adds column to dataset
counties_selected %>%
mutate(unemployed_population = population * unemployment / 100)
# Count() Verb
counties %>%
count()
# Count and sort
counties %>%
count(state, sort = TRUE)
sorts like desc
# weight
counties %>%
count(state, wt = population, sort = TRUE)
gives total population
# Summarize
counties %>%
summarize(total_population = sum(population))
# summary functions
* sum(), mean(), median(), min(), max(), n()
# slice_min & slice_max
Allows us to extract most extreme values
#slice_max()
*returns largest observations in each group
counties _selected <- counties %>%
select(state, county, population, unemployment, income)
counties_selected %>%
group_by(state) %>%
slice_max(population, n = 1) (here n = # of counties in each state)
#slice_min()
*returns smallest observations in each group
counties _selected <- counties %>%
select(state, county, population, unemployment, income)
counties_selected %>%
group_by(state) %>%
slice_min(population, n = 1) (here n = # of counties in each state)
# Selecting
# Contains
counties %>%
select(state, county, contains("work"))
# Starts with
counties %>%
select(state, county, starts_with("income"))
# Other helpers
contains(), starts_with(), ends_with(), last_col(), matches()
# removing a variable
counties %>%
select(-census_id)
# Rename columns
counties_selected %>%
rename(unemployment_rate = unemployment) or
select(state, county, population, unemployment_rate = unemployment)
# transmute verb
* combo of select() and mutate()
* returns a subset of columns that are transformed and changed
counties %>%
transmute(state, co unty, fraction_men = men / population)
# select and calculate
counties %>%
transmute(state, county, population, unemployed_people = population * unemployment / 100)# filter for multiple names
* %in%:used to filter for multiple values
babynames_multiple <- babynames %>%
filter(name %in% c("Amy", "Christopher"))
# grouped mutates
babynames %>%
group_by(year) %>%
mutate(year_total = sum(number))
# ungroup
babynames %>%
group_by(year) %>%
mutate(year_total = sum(number)) %>%
ungroup()
# add the fraction column
babynames %>%
group_by(year) %>%
mutate(year_total = sum(number)) %>%
ungroup() %>%
mutate(fraction = number / year_total)
# Window functions
v <- c(1, 3, 6, 14)
v# lag() moves each item to the right by 1
lag(v)
#this should say [NA, 1, 3, 6]# Compare consecutive steps
v - lag(v)
# what is each value after we have subtracted the previous one?
# should say [NA, 2, 3, 8]# CHANGES IN POPULARITY OF A NAME
babynames_fraction <- babynames %>%
group_by(year) %>%
mutate(year_total = sum(number)) %>%
ungroup() %>%
mutate(fraction = number / year_total)
# MATTHEW
babynames_fraction %>%
filter(name == "Matthew") %>%
arrange(year)
mutate(difference = fraction - lag(fraction))
# CHANGES WITHIN EVERY NAME
babynames_fraction %>%
arrange(name, year) %>%
group_by(name) %>%
mutate(difference = fraction - lag(fraction)) %>%
ungroup() %>%
arrange(desc(difference))