Skip to content
Data Manipulation with dplyr
Case Study: The babynames Dataset
# Load the Tidyverse
library(tidyverse)
# Load the course datasets
babynames <- read_rds("datasets/babynames.rds")
counties <- read_rds("datasets/counties.rds")Hidden output
babynames %>%
# Filter for the year 1990
filter(year==1990) %>%
# Sort the number column in descending order
arrange(desc(number))babynames %>%
# Find the most common name in each year
group_by(year) %>%
slice_max(number,n=1)selected_names <- babynames %>%
# Filter for the names Steven, Thomas, and Matthew
filter(name %in% c("Steven", "Thomas", "Matthew"))
# Plot the names using a different color for each name
ggplot(selected_names, aes(x =year, y = number, color = name)) +
geom_line()# Calculate the fraction of people born each year with the same name
babynames %>%
group_by(year) %>%
mutate(year_total = sum(number)) %>%
ungroup() %>%
mutate(fraction = number / year_total) %>%
# Find the year each name is most common
group_by(name) %>%
slice_max(fraction, n = 1)names_normalized <- babynames %>%
# Add columns name_total and name_max for each name
group_by(name) %>%
mutate(name_total = sum(number),
name_max = max(number)) %>%
# Ungroup the table
ungroup() %>%
# Add the fraction_max column containing the number by the name maximum
mutate(fraction_max=number/name_max)names_filtered <- names_normalized %>%
# Filter for the names Steven, Thomas, and Matthew
filter(name %in% c("Steven","Thomas","Matthew"))
# Visualize the names in names_filtered over time
ggplot(names_filtered,aes(x=year,y=fraction_max,color=name))+
geom_line()babynames_fraction <- babynames %>%
group_by(year) %>%
mutate(year_total = sum(number)) %>%
ungroup() %>%
mutate(fraction = number / year_total)
babynames_fraction %>%
# Arrange the data in order of name, then year
arrange(name,year) %>%
# Group the data by name
group_by(name) %>%
# Add a ratio column that contains the ratio of fraction between each year
mutate(ratio=fraction/lag(fraction))babynames_fraction <- babynames %>%
group_by(year) %>%
mutate(year_total = sum(number)) %>%
ungroup() %>%
mutate(fraction = number / year_total)
babynames_ratios_filtered <- babynames_fraction %>%
arrange(name, year) %>%
group_by(name) %>%
mutate(ratio = fraction / lag(fraction)) %>%
filter(fraction >= 0.00001)
babynames_ratios_filtered %>%
# Extract the largest ratio from each name
slice_max(ratio,n=1) %>%
# Sort the ratio column in descending order
arrange(desc(ratio)) %>%
# Filter for fractions greater than or equal to 0.001
filter(fraction >= 0.001)