Skip to content
1 hidden cell
Joining Data with dplyr
Joining Data with dplyr
Run the hidden code cell below to import the data used in this course.
1 hidden cell
Take Notes
Add notes about the concepts you've learned and code cells with code you want to keep.
Add your notes here
# Add your code snippets here
# Filter for the dplyr and ggplot2 tag names
by_type_year_tag_filtered <- by_type_year_tag %>%
filter(tag_name %in% c("dplyr", "ggplot2"))
# Create a line plot faceted by the tag name
ggplot(by_type_year_tag_filtered, aes(year, n, color = type)) +
geom_line() +
facet_wrap(~ tag_name)
# Combine the two tables into posts_with_tags
posts_with_tags <- bind_rows(questions_with_tags %>% mutate(type = "question"),
answers_with_tags %>% mutate(type = "answer"))
# Add a year column, then count by type, year, and tag_name
posts_with_tags %>%
mutate(year = year(creation_date)) %>%
count(type, year, tag_name)
# Inner join the question_tags and tags tables with the questions table
questions %>%
inner_join(question_tags, by = c("id" = "question_id")) %>%
inner_join(tags, by = c("tag_id" = "id"))
# Inner join the question_tags and tags tables with the answers table
answers %>%
inner_join(question_tags, by = c("question_id" = "question_id")) %>%
inner_join(tags, by = c("tag_id" = "id"))
tagged_answers %>%
# Aggregate by tag_name
group_by(tag_name) %>%
# Summarize questions and average_answers
summarize(questions = n(),
average_answers = mean(n)) %>%
# Sort the questions in descending order
arrange(desc(questions))
question_answer_counts %>%
# Join the question_tags tables
inner_join(question_tags, by = c("id" = "question_id")) %>%
# Join the tags table
inner_join(tags, by = c("tag_id" = "id"))
# Count and sort the question id column in the answers table
answer_counts <- answers %>%
count(question_id, sort = TRUE)
# Combine the answer_counts and questions tables
questions %>%
left_join(answer_counts, by = c("id" = "question_id")) %>%
# Replace the NAs in the n column
replace_na(list(n=0))
questions %>%
# Inner join questions and answers with proper suffixes
inner_join(answers, by = c("id"="question_id"), suffix = c("_question", "_answer")) %>%
# Subtract creation_date_question from creation_date_answer to create gap
mutate(gap = as.integer(creation_date_answer-creation_date_question))
questions_with_tags %>%
# Group by tag_name
group_by(tag_name) %>%
# Get mean score and num_questions
summarize(score = mean(score),
num_questions = n()) %>%
# Sort num_questions in descending order
arrange(desc(num_questions))
# Replace the NAs in the tag_name column
questions %>%
left_join(question_tags, by = c("id" = "question_id")) %>%
left_join(tags, by = c("tag_id" = "id")) %>%
replace_na(list(tag_name="only-r"))
# Join the Batman and Star Wars colors
full_join(star_wars_colors, by = "color_id", suffix = c("_batman", "_star_wars")) %>%
# Replace NAs in the total_batman and total_star_wars columns
replace_na(list(total_batman=0, total_star_wars=0)) %>%
inner_join(colors, by = c("color_id" = "id"))
# Use filter() to extract version 1
version_1_inventories <- inventories %>%
filter(version == 1)
# Use anti_join() to find which set is missing a version 1
sets %>%
anti_join(version_1_inventories, by = "set_num")
# Use inventory_parts to find colors included in at least one set
colors %>%
semi_join(inventory_parts, by = c("id" = "color_id"))
#SEMI
#ANTI
# Filter the batwing set for parts that are also in the batmobile set
batwing %>%
semi_join(batmobile, by = c("part_num"))
# Filter the batwing set for parts that aren't in the batmobile set
batwing %>%
anti_join(batmobile, by = c("part_num"))
# Sort the number of star wars pieces in descending order
arrange(desc(n_star_wars)) %>%
# Join the colors table to the parts_joined table
inner_join(colors, by = c("color_id" = "id")) %>%
# Join the parts table to the previous join
inner_join(parts, by = "part_num", suffix = c("_color", "_part"))
# Combine the star_wars_parts table
full_join(star_wars_parts, by = c("part_num", "color_id"), suffix = c("_batman", "_star_wars")) %>%
# Replace NAs with 0s in the n_batman and n_star_wars columns
replace_na(list(n_batman=0, n_star_wars = 0))
# Count the part number and color id, weight by quantity
batman %>%
count(part_num, color_id, wt = quantity)
star_wars %>%
count(part_num, color_id, wt = quantity)
# Start with inventory_parts_joined table
inventory_parts_joined %>%
# Combine with the sets table
inner_join(sets, by = "set_num") %>%
# Combine with the themes table
inner_join(themes, by = c("theme_id" = "id"), suffix = c("_set", "_theme"))
# Left join the themes table to its own children
left_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
# Filter for themes that have no child themes
filter(is.na(name_child))
# Join themes to itself again to find the grandchild relationships
themes %>%
inner_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
inner_join(themes, by = c("id_child" = "parent_id"), suffix = c("_parent", "_grandchild"))
# Use replace_na to replace missing values in the n column
replace_na(list(n =0))
#right join
parts %>%
count(part_cat_id) %>%
right_join(part_categories, by = c("part_cat_id" = "id")) %>%
# Filter for NA
filter(is.na(n))
# Join versions to sets
sets %>%
left_join(inventory_version_1, by = "set_num") %>%
# Filter for where version is na
filter(is.na(version))
# Left join the Millennium Falcon colors to the Star Destroyer colors
millennium_falcon_colors %>%
left_join(star_destroyer_colors, by = "color_id", suffix= c("_falcon", "_star_destroyer"))
#left join
# Combine the star_destroyer and millennium_falcon tables
millennium_falcon %>%
left_join(star_destroyer, by = c("part_num", "color_id"),
suffix = c("_falcon", "_star_destroyer"))
#inner join
#inner_join(part_categories, by = c("part_cat_id" = "id"),
suffix = c("_part", "_category"))
# Combine the parts and inventory_parts tables
parts %>%
inner_join(inventory_parts, by = "part_num")
sets %>%
# Add inventories using an inner join
inner_join(inventories, by = "set_num") %>%
# Add inventory_parts using an inner join
inner_join(inventory_parts, by = c("id" = "inventory_id"))
# Count the number of colors and sort
sets %>%
inner_join(inventories, by = "set_num") %>%
inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
inner_join(colors, by = c("color_id" = "id"), suffix = c("_set", "_color")) %>%
count(name_color, sort = TRUE)