Skip to content
Project: Analyzing Super Bowl Viewership and Advertising
The Super Bowl is a spectacle. It is the final game in the NFL that crowns the winner of that season. There is always a little something for everyone. For the fans, there is the game itself. For those tagging along, there are the unique advertisements and the halftime shows with the biggest musicians in the world.
You're going to explore how these elements interact.
The Data
The data has been scraped from Wikipedia and is made up of two CSV files covering a span of all Super Bowls up to 2024. This data does contain missing values. The most relevant columns are noted below.
data/tv.csv
| Column | Description |
|---|---|
super_bowl | Super Bowl number (e.g. the first Super Bowl ever is Super Bowl 1 and the last Super Bowl in 2024 is Super Bowl 58) |
avg_us_viewers | Average # of US viewers |
share_household | Percentage of households watching TV that watched the game |
rating_household | Percentage of all households with TVs that watched the game |
ad_cost | Cost per ad |
data/super_bowls.csv
| Column | Description |
|---|---|
super_bowl | Super Bowl number (e.g. the first Super Bowl ever is Super Bowl 1 and the last Super Bowl in 2024 is Super Bowl 58) |
difference_pts | Point difference for that game |
# Load packages
library(tidyverse)
# Load the CSV data
tv <- read_csv("data/tv.csv", show_col_types=FALSE)
super_bowls <- read_csv("data/super_bowls.csv", show_col_types=FALSE)# Load required libraries
library(dplyr)
library(ggplot2)
library(tidyr)
library(scales) # For better axis formatting
## Load and prepare data
tv <- read.csv("data/tv.csv") %>%
select(super_bowl, avg_us_viewers, rating_household, ad_cost) %>%
mutate(ad_cost_millions = ad_cost / 1e6) # Convert ad cost to millions for better readability
super_bowls <- read.csv("data/super_bowls.csv") %>%
select(super_bowl, difference_pts)
## Question 1: Impact of point differences on viewership
merged_data <- inner_join(tv, super_bowls, by = "super_bowl") %>%
drop_na(avg_us_viewers, difference_pts)
# Calculate correlation
correlation <- cor(merged_data$difference_pts, merged_data$avg_us_viewers)
# Visualize relationship
ggplot(merged_data, aes(x = difference_pts, y = avg_us_viewers/1e6)) +
geom_point(alpha = 0.7, size = 3) +
geom_smooth(method = "lm", color = "red", se = FALSE) +
labs(title = "Super Bowl Point Difference vs. Viewership",
subtitle = paste("Correlation:", round(correlation, 2)),
x = "Point Difference in Game",
y = "Average US Viewers (millions)") +
theme_minimal() +
scale_y_continuous(labels = comma)
# Determine strength
score_impact <- ifelse(abs(correlation) > 0.5, "strong",
ifelse(abs(correlation) > 0.2, "weak", "none"))
## Question 2: Trends in viewers, ratings, and ad costs
# Prepare time series data
time_data <- tv %>%
drop_na(avg_us_viewers, rating_household, ad_cost_millions) %>%
arrange(super_bowl)
# Find first sustained increases
find_first_sustained_increase <- function(values) {
changes <- diff(values)
# Require at least 3 consecutive years of increase above median change
for (i in 2:(length(changes)-1)) {
if (changes[i] > median(changes, na.rm=TRUE) &
changes[i-1] > median(changes, na.rm=TRUE) &
changes[i+1] > median(changes, na.rm=TRUE)) {
return(i)
}
}
return(NA)
}
first_viewer_increase <- find_first_sustained_increase(time_data$avg_us_viewers)
first_rating_increase <- find_first_sustained_increase(time_data$rating_household)
first_ad_increase <- find_first_sustained_increase(time_data$ad_cost_millions)
# Determine which increased first
increase_order <- data.frame(
metric = c("viewers", "ratings", "ads"),
super_bowl = c(first_viewer_increase, first_rating_increase, first_ad_increase)
) %>% arrange(super_bowl)
first_to_increase <- as.character(increase_order$metric[1])
# Visualization
time_data_long <- time_data %>%
pivot_longer(cols = c(avg_us_viewers, rating_household, ad_cost_millions),
names_to = "metric", values_to = "value") %>%
mutate(metric = factor(metric,
levels = c("rating_household", "avg_us_viewers", "ad_cost_millions"),
labels = c("Household Rating (%)", "Viewers (millions)", "Ad Cost ($ millions)")))
ggplot(time_data_long, aes(x = super_bowl, y = value, color = metric)) +
geom_line(size = 1) +
labs(title = "Super Bowl Metrics Over Time",
subtitle = paste("First sustained increase:", first_to_increase),
x = "Super Bowl Number",
y = "Metric Value",
color = "Metric") +
theme_minimal() +
theme(legend.position = "bottom") +
scale_color_brewer(palette = "Set1") +
geom_vline(xintercept = increase_order$super_bowl[1], linetype = "dashed", alpha = 0.5)
## Results
list(
score_impact = score_impact,
first_to_increase = first_to_increase,
correlation_point_diff_viewers = correlation,
first_increases = increase_order
)