Skip to content
Joining Data with dplyr
  • AI Chat
  • Code
  • Report
  • Spinner

    Joining Data with dplyr

    Run the hidden code cell below to import the data used in this course.

    Take Notes

    Add notes about the concepts you've learned and code cells with code you want to keep.

    Add your notes here

    # Add your code snippets here
    
    # Filter for the dplyr and ggplot2 tag names 
    by_type_year_tag_filtered <- by_type_year_tag %>%
      filter(tag_name %in% c("dplyr", "ggplot2"))
    
    # Create a line plot faceted by the tag name 
    ggplot(by_type_year_tag_filtered, aes(year, n, color = type)) +
      geom_line() +
      facet_wrap(~ tag_name)
    
    
    # Combine the two tables into posts_with_tags
    posts_with_tags <- bind_rows(questions_with_tags %>% mutate(type = "question"),
                                  answers_with_tags %>% mutate(type = "answer"))
    
    # Add a year column, then count by type, year, and tag_name
    posts_with_tags %>%
      mutate(year = year(creation_date)) %>%
      count(type, year, tag_name)
    
    # Inner join the question_tags and tags tables with the questions table
    questions %>%
      inner_join(question_tags, by = c("id" = "question_id")) %>%
      inner_join(tags, by = c("tag_id" = "id"))
    
    # Inner join the question_tags and tags tables with the answers table
    answers %>%
      inner_join(question_tags, by = c("question_id" = "question_id")) %>%
      inner_join(tags, by = c("tag_id" = "id"))
    tagged_answers %>%
      # Aggregate by tag_name
      group_by(tag_name)  %>%
      # Summarize questions and average_answers
      summarize(questions = n(),
                average_answers = mean(n)) %>%
      # Sort the questions in descending order
      arrange(desc(questions)) 
    
    question_answer_counts %>%
      # Join the question_tags tables
      inner_join(question_tags, by = c("id" = "question_id")) %>%
      # Join the tags table
      inner_join(tags, by = c("tag_id" = "id"))
    
    # Count and sort the question id column in the answers table
    answer_counts <- answers %>%
    count(question_id, sort = TRUE)
    
    # Combine the answer_counts and questions tables
    questions %>%
       left_join(answer_counts, by = c("id" = "question_id")) %>%
      # Replace the NAs in the n column
    replace_na(list(n=0))
    questions %>%
      # Inner join questions and answers with proper suffixes
      inner_join(answers, by = c("id"="question_id"), suffix = c("_question", "_answer")) %>%
      # Subtract creation_date_question from creation_date_answer to create gap
      mutate(gap = as.integer(creation_date_answer-creation_date_question))
    
    
    questions_with_tags %>% 
      # Group by tag_name
      group_by(tag_name) %>%
      # Get mean score and num_questions
      summarize(score = mean(score),
            	num_questions = n()) %>%
      # Sort num_questions in descending order
      arrange(desc(num_questions))
    
    # Replace the NAs in the tag_name column
    questions %>%
      left_join(question_tags, by = c("id" = "question_id")) %>%
      left_join(tags, by = c("tag_id" = "id")) %>%
      replace_na(list(tag_name="only-r"))
    
    
      # Join the Batman and Star Wars colors
      full_join(star_wars_colors, by = "color_id", suffix = c("_batman", "_star_wars")) %>%
      # Replace NAs in the total_batman and total_star_wars columns
      replace_na(list(total_batman=0, total_star_wars=0)) %>%
      inner_join(colors, by = c("color_id" = "id")) 
    
    
    # Use filter() to extract version 1 
    version_1_inventories <- inventories %>%
    filter(version == 1)
    
    # Use anti_join() to find which set is missing a version 1
    sets %>%
    anti_join(version_1_inventories, by = "set_num")
    
    # Use inventory_parts to find colors included in at least one set
    colors %>%
    semi_join(inventory_parts, by = c("id" = "color_id"))
    
    #SEMI
    #ANTI
    	# Filter the batwing set for parts that are also in the batmobile set
    	batwing %>%
    semi_join(batmobile, by = c("part_num"))
    
    	# Filter the batwing set for parts that aren't in the batmobile set
    	batwing %>%
    anti_join(batmobile, by = c("part_num"))
    
    
    # Sort the number of star wars pieces in descending order 
      arrange(desc(n_star_wars)) %>%
      # Join the colors table to the parts_joined table
      inner_join(colors, by = c("color_id" = "id")) %>% 
      # Join the parts table to the previous join 
      inner_join(parts, by = "part_num", suffix = c("_color", "_part"))
    
    # Combine the star_wars_parts table 
      full_join(star_wars_parts, by = c("part_num", "color_id"), suffix = c("_batman", "_star_wars")) %>%
    # Replace NAs with 0s in the n_batman and n_star_wars columns 
      replace_na(list(n_batman=0, n_star_wars = 0))
    
    
    # Count the part number and color id, weight by quantity
    batman %>%
      count(part_num, color_id, wt = quantity)
    
    
    star_wars %>%
      count(part_num, color_id, wt = quantity)
    
    
    # Start with inventory_parts_joined table
    inventory_parts_joined %>%
      # Combine with the sets table 
      inner_join(sets, by = "set_num") %>%
      # Combine with the themes table 
      inner_join(themes, by = c("theme_id" = "id"), suffix = c("_set", "_theme"))
    # Left join the themes table to its own children
        left_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
    # Filter for themes that have no child themes
        filter(is.na(name_child))
    
    
    
    # Join themes to itself again to find the grandchild relationships
    themes %>% 
      inner_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
      inner_join(themes, by = c("id_child" = "parent_id"), suffix = c("_parent", "_grandchild"))
    
    
    # Use replace_na to replace missing values in the n column
      replace_na(list(n =0))
    
    
    #right join
    parts %>%
      count(part_cat_id) %>%
      right_join(part_categories, by = c("part_cat_id" = "id")) %>%
    
    
    # Filter for NA
      filter(is.na(n))
    
    
    # Join versions to sets
    sets %>%
      left_join(inventory_version_1, by = "set_num") %>%
      # Filter for where version is na
      filter(is.na(version))
    
    # Left join the Millennium Falcon colors to the Star Destroyer colors
    millennium_falcon_colors %>%
        left_join(star_destroyer_colors, by = "color_id", suffix= c("_falcon", "_star_destroyer"))
    
    #left join
    
    # Combine the star_destroyer and millennium_falcon tables
    millennium_falcon %>%
    left_join(star_destroyer, by = c("part_num", "color_id"),
    suffix = c("_falcon", "_star_destroyer"))
    
    
    #inner join
    #inner_join(part_categories, by = c("part_cat_id" = "id"), 
    suffix = c("_part", "_category"))
    
    # Combine the parts and inventory_parts tables
    parts %>%
    inner_join(inventory_parts, by = "part_num")
    
    sets %>%
      # Add inventories using an inner join 
      inner_join(inventories, by = "set_num") %>% 
      # Add inventory_parts using an inner join 
      inner_join(inventory_parts, by = c("id" = "inventory_id"))
    
    
    # Count the number of colors and sort
    sets %>%
      inner_join(inventories, by = "set_num") %>%
      inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
      inner_join(colors, by = c("color_id" = "id"), suffix = c("_set", "_color")) %>%
      count(name_color, sort = TRUE)