Skip to content

Joining Data with dplyr

Run the hidden code cell below to import the data used in this course.


1 hidden cell

Take Notes

Add notes about the concepts you've learned and code cells with code you want to keep.

Add your notes here

# Add your code snippets here

# Filter for the dplyr and ggplot2 tag names 
by_type_year_tag_filtered <- by_type_year_tag %>%
  filter(tag_name %in% c("dplyr", "ggplot2"))

# Create a line plot faceted by the tag name 
ggplot(by_type_year_tag_filtered, aes(year, n, color = type)) +
  geom_line() +
  facet_wrap(~ tag_name)


# Combine the two tables into posts_with_tags
posts_with_tags <- bind_rows(questions_with_tags %>% mutate(type = "question"),
                              answers_with_tags %>% mutate(type = "answer"))

# Add a year column, then count by type, year, and tag_name
posts_with_tags %>%
  mutate(year = year(creation_date)) %>%
  count(type, year, tag_name)

# Inner join the question_tags and tags tables with the questions table
questions %>%
  inner_join(question_tags, by = c("id" = "question_id")) %>%
  inner_join(tags, by = c("tag_id" = "id"))

# Inner join the question_tags and tags tables with the answers table
answers %>%
  inner_join(question_tags, by = c("question_id" = "question_id")) %>%
  inner_join(tags, by = c("tag_id" = "id"))
tagged_answers %>%
  # Aggregate by tag_name
  group_by(tag_name)  %>%
  # Summarize questions and average_answers
  summarize(questions = n(),
            average_answers = mean(n)) %>%
  # Sort the questions in descending order
  arrange(desc(questions)) 

question_answer_counts %>%
  # Join the question_tags tables
  inner_join(question_tags, by = c("id" = "question_id")) %>%
  # Join the tags table
  inner_join(tags, by = c("tag_id" = "id"))

# Count and sort the question id column in the answers table
answer_counts <- answers %>%
count(question_id, sort = TRUE)

# Combine the answer_counts and questions tables
questions %>%
   left_join(answer_counts, by = c("id" = "question_id")) %>%
  # Replace the NAs in the n column
replace_na(list(n=0))
questions %>%
  # Inner join questions and answers with proper suffixes
  inner_join(answers, by = c("id"="question_id"), suffix = c("_question", "_answer")) %>%
  # Subtract creation_date_question from creation_date_answer to create gap
  mutate(gap = as.integer(creation_date_answer-creation_date_question))


questions_with_tags %>% 
  # Group by tag_name
  group_by(tag_name) %>%
  # Get mean score and num_questions
  summarize(score = mean(score),
        	num_questions = n()) %>%
  # Sort num_questions in descending order
  arrange(desc(num_questions))

# Replace the NAs in the tag_name column
questions %>%
  left_join(question_tags, by = c("id" = "question_id")) %>%
  left_join(tags, by = c("tag_id" = "id")) %>%
  replace_na(list(tag_name="only-r"))


  # Join the Batman and Star Wars colors
  full_join(star_wars_colors, by = "color_id", suffix = c("_batman", "_star_wars")) %>%
  # Replace NAs in the total_batman and total_star_wars columns
  replace_na(list(total_batman=0, total_star_wars=0)) %>%
  inner_join(colors, by = c("color_id" = "id")) 


# Use filter() to extract version 1 
version_1_inventories <- inventories %>%
filter(version == 1)

# Use anti_join() to find which set is missing a version 1
sets %>%
anti_join(version_1_inventories, by = "set_num")

# Use inventory_parts to find colors included in at least one set
colors %>%
semi_join(inventory_parts, by = c("id" = "color_id"))

#SEMI
#ANTI
	# Filter the batwing set for parts that are also in the batmobile set
	batwing %>%
semi_join(batmobile, by = c("part_num"))

	# Filter the batwing set for parts that aren't in the batmobile set
	batwing %>%
anti_join(batmobile, by = c("part_num"))


# Sort the number of star wars pieces in descending order 
  arrange(desc(n_star_wars)) %>%
  # Join the colors table to the parts_joined table
  inner_join(colors, by = c("color_id" = "id")) %>% 
  # Join the parts table to the previous join 
  inner_join(parts, by = "part_num", suffix = c("_color", "_part"))

# Combine the star_wars_parts table 
  full_join(star_wars_parts, by = c("part_num", "color_id"), suffix = c("_batman", "_star_wars")) %>%
# Replace NAs with 0s in the n_batman and n_star_wars columns 
  replace_na(list(n_batman=0, n_star_wars = 0))


# Count the part number and color id, weight by quantity
batman %>%
  count(part_num, color_id, wt = quantity)


star_wars %>%
  count(part_num, color_id, wt = quantity)


# Start with inventory_parts_joined table
inventory_parts_joined %>%
  # Combine with the sets table 
  inner_join(sets, by = "set_num") %>%
  # Combine with the themes table 
  inner_join(themes, by = c("theme_id" = "id"), suffix = c("_set", "_theme"))
# Left join the themes table to its own children
    left_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
# Filter for themes that have no child themes
    filter(is.na(name_child))



# Join themes to itself again to find the grandchild relationships
themes %>% 
  inner_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
  inner_join(themes, by = c("id_child" = "parent_id"), suffix = c("_parent", "_grandchild"))


# Use replace_na to replace missing values in the n column
  replace_na(list(n =0))


#right join
parts %>%
  count(part_cat_id) %>%
  right_join(part_categories, by = c("part_cat_id" = "id")) %>%


# Filter for NA
  filter(is.na(n))


# Join versions to sets
sets %>%
  left_join(inventory_version_1, by = "set_num") %>%
  # Filter for where version is na
  filter(is.na(version))

# Left join the Millennium Falcon colors to the Star Destroyer colors
millennium_falcon_colors %>%
    left_join(star_destroyer_colors, by = "color_id", suffix= c("_falcon", "_star_destroyer"))

#left join

# Combine the star_destroyer and millennium_falcon tables
millennium_falcon %>%
left_join(star_destroyer, by = c("part_num", "color_id"),
suffix = c("_falcon", "_star_destroyer"))


#inner join
#inner_join(part_categories, by = c("part_cat_id" = "id"), 
suffix = c("_part", "_category"))

# Combine the parts and inventory_parts tables
parts %>%
inner_join(inventory_parts, by = "part_num")

sets %>%
  # Add inventories using an inner join 
  inner_join(inventories, by = "set_num") %>% 
  # Add inventory_parts using an inner join 
  inner_join(inventory_parts, by = c("id" = "inventory_id"))


# Count the number of colors and sort
sets %>%
  inner_join(inventories, by = "set_num") %>%
  inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
  inner_join(colors, by = c("color_id" = "id"), suffix = c("_set", "_color")) %>%
  count(name_color, sort = TRUE)