Skip to content

The Super Bowl is a spectacle. It is the final game in the NFL that crowns the winner of that season. There is always a little something for everyone. For the fans, there is the game itself. For those tagging along, there are the unique advertisements and the halftime shows with the biggest musicians in the world.

You're going to explore how these elements interact.

The Data

The data has been scraped from Wikipedia and is made up of two CSV files covering a span of all Super Bowls up to 2024. This data does contain missing values. The most relevant columns are noted below.
data/tv.csv

ColumnDescription
super_bowlSuper Bowl number (e.g. the first Super Bowl ever is Super Bowl 1 and the last Super Bowl in 2024 is Super Bowl 58)
avg_us_viewersAverage # of US viewers
share_householdPercentage of households watching TV that watched the game
rating_householdPercentage of all households with TVs that watched the game
ad_costCost per ad

data/super_bowls.csv

ColumnDescription
super_bowlSuper Bowl number (e.g. the first Super Bowl ever is Super Bowl 1 and the last Super Bowl in 2024 is Super Bowl 58)
difference_ptsPoint difference for that game
# Load packages
library(tidyverse)

# Load the CSV data
tv  <-  read_csv("data/tv.csv", show_col_types=FALSE)
super_bowls  <-  read_csv("data/super_bowls.csv", show_col_types=FALSE)
# Load required libraries
library(dplyr)
library(ggplot2)
library(tidyr)
library(scales) # For better axis formatting

## Load and prepare data
tv <- read.csv("data/tv.csv") %>% 
  select(super_bowl, avg_us_viewers, rating_household, ad_cost) %>%
  mutate(ad_cost_millions = ad_cost / 1e6) # Convert ad cost to millions for better readability

super_bowls <- read.csv("data/super_bowls.csv") %>%
  select(super_bowl, difference_pts)

## Question 1: Impact of point differences on viewership
merged_data <- inner_join(tv, super_bowls, by = "super_bowl") %>%
  drop_na(avg_us_viewers, difference_pts)

# Calculate correlation
correlation <- cor(merged_data$difference_pts, merged_data$avg_us_viewers)

# Visualize relationship
ggplot(merged_data, aes(x = difference_pts, y = avg_us_viewers/1e6)) +
  geom_point(alpha = 0.7, size = 3) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(title = "Super Bowl Point Difference vs. Viewership",
       subtitle = paste("Correlation:", round(correlation, 2)),
       x = "Point Difference in Game",
       y = "Average US Viewers (millions)") +
  theme_minimal() +
  scale_y_continuous(labels = comma)

# Determine strength
score_impact <- ifelse(abs(correlation) > 0.5, "strong",
                      ifelse(abs(correlation) > 0.2, "weak", "none"))

## Question 2: Trends in viewers, ratings, and ad costs
# Prepare time series data
time_data <- tv %>%
  drop_na(avg_us_viewers, rating_household, ad_cost_millions) %>%
  arrange(super_bowl)

# Find first sustained increases
find_first_sustained_increase <- function(values) {
  changes <- diff(values)
  # Require at least 3 consecutive years of increase above median change
  for (i in 2:(length(changes)-1)) {
    if (changes[i] > median(changes, na.rm=TRUE) & 
        changes[i-1] > median(changes, na.rm=TRUE) & 
        changes[i+1] > median(changes, na.rm=TRUE)) {
      return(i)
    }
  }
  return(NA)
}

first_viewer_increase <- find_first_sustained_increase(time_data$avg_us_viewers)
first_rating_increase <- find_first_sustained_increase(time_data$rating_household)
first_ad_increase <- find_first_sustained_increase(time_data$ad_cost_millions)

# Determine which increased first
increase_order <- data.frame(
  metric = c("viewers", "ratings", "ads"),
  super_bowl = c(first_viewer_increase, first_rating_increase, first_ad_increase)
) %>% arrange(super_bowl)

first_to_increase <- as.character(increase_order$metric[1])

# Visualization
time_data_long <- time_data %>%
  pivot_longer(cols = c(avg_us_viewers, rating_household, ad_cost_millions),
               names_to = "metric", values_to = "value") %>%
  mutate(metric = factor(metric, 
                        levels = c("rating_household", "avg_us_viewers", "ad_cost_millions"),
                        labels = c("Household Rating (%)", "Viewers (millions)", "Ad Cost ($ millions)")))

ggplot(time_data_long, aes(x = super_bowl, y = value, color = metric)) +
  geom_line(size = 1) +
  labs(title = "Super Bowl Metrics Over Time",
       subtitle = paste("First sustained increase:", first_to_increase),
       x = "Super Bowl Number",
       y = "Metric Value",
       color = "Metric") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  scale_color_brewer(palette = "Set1") +
  geom_vline(xintercept = increase_order$super_bowl[1], linetype = "dashed", alpha = 0.5)

## Results
list(
  score_impact = score_impact,
  first_to_increase = first_to_increase,
  correlation_point_diff_viewers = correlation,
  first_increases = increase_order
)