Skip to content
Joining Data with dplyr
Case Study: Joins on Stack Overflow Data
# Load the Tidyverse
library(tidyverse)
# Load the course datasets
sets <- read_rds("datasets/sets.rds")
themes <- read_rds("datasets/themes.rds")
parts <- read_rds("datasets/parts.rds")
part_categories <- read_rds("datasets/part_categories.rds")
inventories <- read_rds("datasets/inventories.rds")
inventory_parts <- read_rds("datasets/inventory_parts.rds")
colors <- read_rds("datasets/colors.rds")
questions <- read_rds("datasets/questions.rds")
tags <- read_rds("datasets/tags.rds")
question_tags <- read_rds("datasets/question_tags.rds")
answers <- read_rds("datasets/answers.rds")
Hidden output
# Replace the NAs in the tag_name column
questions %>%
left_join(question_tags, by = c("id" = "question_id")) %>%
left_join(tags, by = c("tag_id" = "id")) %>%
replace_na(list(tag_name = "only-r"))
questions_with_tags <- questions %>%
left_join(question_tags, by = c("id" = "question_id")) %>%
left_join(tags, by = c("tag_id" = "id")) %>%
replace_na(list(tag_name = "only-r"))
questions_with_tags %>%
# Group by tag_name
group_by(tag_name) %>%
# Get mean score and num_questions
summarize(score = mean(score),
num_questions = n()) %>%
# Sort num_questions in descending order
arrange(desc(num_questions))
# Using a join, filter for tags that are never on an R question
tags %>%
anti_join(question_tags,by=c("id"="tag_id"))
questions %>%
# Inner join questions and answers with proper suffixes
inner_join(answers,by=c("id"="question_id"),suffix=c("_question","_answer")) %>%
# Subtract creation_date_question from creation_date_answer to create gap
mutate(gap = as.integer(creation_date_question-creation_date_answer))
# Count and sort the question id column in the answers table
answer_counts <- answers %>%
count(question_id)
# Combine the answer_counts and questions tables
questions %>%
left_join(answer_counts,by=c("id"="question_id")) %>%
# Replace the NAs in the n column
replace_na(list(n=0))
answer_counts <- answers %>%
count(question_id, sort = TRUE)
question_answer_counts <- questions %>%
left_join(answer_counts, by = c("id" = "question_id")) %>%
replace_na(list(n = 0))
question_answer_counts %>%
# Join the question_tags tables
inner_join(question_tags,by=c("id"="question_id")) %>%
# Join the tags table
inner_join(tags,by=c("tag_id"="id"))
tagged_answers <- question_answer_counts %>%
inner_join(question_tags, by = c("id" = "question_id")) %>%
inner_join(tags, by = c("tag_id" = "id"))
tagged_answers %>%
# Aggregate by tag_name
group_by(tag_name) %>%
# Summarize questions and average_answers
summarize(questions = n(),
average_answers = mean(n)) %>%
# Sort the questions in descending order
arrange(desc(questions))
# Inner join the question_tags and tags tables with the questions table
questions %>%
inner_join(question_tags, by=c("id"="question_id")) %>%
inner_join(tags, by =c("tag_id"="id"))
# Inner join the question_tags and tags tables with the answers table
answers %>%
inner_join(question_tags, by="question_id") %>%
inner_join(tags, by =c("tag_id"="id"))
questions_with_tags <- questions %>%
inner_join(question_tags, by = c("id" = "question_id")) %>%
inner_join(tags, by = c("tag_id" = "id"))
answers_with_tags <- answers %>%
inner_join(question_tags, by = "question_id") %>%
inner_join(tags, by = c("tag_id" = "id"))
# Combine the two tables into posts_with_tags
posts_with_tags <- bind_rows(questions_with_tags %>% mutate(type = "question"),answers_with_tags %>% mutate(type = "answer"))
# Add a year column, then count by type, year, and tag_name
posts_with_tags %>%
mutate(year=year(creation_date)) %>%
count(type,year,tag_name)
by_type_year_tag <- posts_with_tags %>%
mutate(year = year(creation_date)) %>%
count(type, year, tag_name)
# Filter for the dplyr and ggplot2 tag names
by_type_year_tag_filtered <- by_type_year_tag %>%
filter(tag_name %in% c("dplyr","ggplot2"))
# Create a line plot faceted by the tag name
ggplot(by_type_year_tag_filtered, aes(x=year,y=n , color =type)) +
geom_line() +
facet_wrap(~ tag_name)