Skip to content

Tutorial: Predicting the 2024 US Presidential Elections with Python

Data Sources

import pandas as pd	# importing pandas package

# Read the three polling-average files: the general-election averages file
# (filtered to the 2024 cycle below) and the two historical files.
polls_24 = pd.read_csv('presidential_general_averages.csv')
polls_20 = pd.read_csv('presidential_poll_averages_2020.csv')
polls_until_16 = pd.read_csv('pres_pollaverages_1968-2016.csv')

# Keep only the 2024 cycle. The explicit .copy() turns the filtered result
# into an independent DataFrame, so the 'date' assignment further down does
# not write into a slice of the frame that was read from disk (which raises
# SettingWithCopyWarning on pandas versions without copy-on-write).
polls_24 = polls_24[polls_24['cycle'] == 2024].copy()

# Stack the two historical files into one frame with a fresh 0..n-1 index.
polls_until_20 = pd.concat([polls_20, polls_until_16], ignore_index=True)

# Make sure dates are real datetimes. The 2024 file is parsed with an explicit
# ISO format; the historical column is left to pandas' format inference.
polls_24['date'] = pd.to_datetime(polls_24['date'], format='%Y-%m-%d')
polls_until_20['modeldate'] = pd.to_datetime(polls_until_20['modeldate'])

# Keep only the columns of interest from the historical data.
polls_until_20 = polls_until_20[['cycle', 'state', 'modeldate', 'candidate_name', 'pct_estimate', 'pct_trend_adjusted']]
# Election results: the file is semicolon-separated, and only the columns
# listed here are kept.
result_columns = ['cycle', 'state', 'party', 'candidate', 'vote_share']
results_until_20 = pd.read_csv('results.csv', sep=';')[result_columns]

# Cycle restriction: discard polling rows from cycles before start_cycle.
start_cycle = 2000
from_start_cycle = polls_until_20['cycle'] >= start_cycle
polls_until_20 = polls_until_20[from_start_cycle]
# State groupings used to subset the poll data. Besides full state names the
# lists contain entries such as 'ME-1' and 'NE-2' (presumably congressional
# districts reported separately -- confirm against the data files).
swing_states = [
    'Pennsylvania', 'Wisconsin', 'Michigan', 'Georgia',
    'North Carolina', 'Arizona', 'Nevada',
]
blue_states = [
    'District of Columbia', 'Vermont', 'Massachusetts', 'Maryland',
    'Hawaii', 'California', 'ME-1', 'Connecticut', 'Washington',
    'Delaware', 'Rhode Island', 'New York', 'Illinois', 'New Jersey',
    'Oregon', 'Colorado', 'Maine', 'New Mexico', 'Virginia',
    'New Hampshire', 'NE-2', 'Minnesota',
]
red_states = [
    'Wyoming', 'West Virginia', 'Oklahoma', 'North Dakota', 'Idaho',
    'South Dakota', 'Arkansas', 'Kentucky', 'NE-3', 'Alabama',
    'Tennessee', 'Utah', 'Louisiana', 'Nebraska', 'Mississippi',
    'Montana', 'NE-1', 'Indiana', 'Kansas', 'Missouri',
    'South Carolina', 'Alaska', 'Ohio', 'Iowa', 'Texas', 'ME-2',
    'Florida',
]

# Restrict both poll frames to rows whose state is one of the swing states.
historical_in_swing = polls_until_20['state'].isin(swing_states)
current_in_swing = polls_24['state'].isin(swing_states)
swing_until_20 = polls_until_20[historical_in_swing]
swing_24 = polls_24[current_in_swing]

Exploratory Data Analysis and Data Cleaning

# Count missing values per column in both swing-state frames.
print(swing_24.isnull().sum())
print(swing_until_20.isnull().sum())

# Summary of the 2024 data: date range plus the distinct states, parties and
# candidates that occur in it.
print('2024 data:')
print(swing_24['date'].min())                    # earliest polling date
print(swing_24['date'].max())                    # latest polling date
print(swing_24['state'].unique().tolist())       # distinct states
print(swing_24['party'].unique().tolist())       # distinct parties
print(swing_24['candidate'].unique().tolist())   # distinct candidates

# Same summary for the historical data (it has no party column at this point).
print('Historical data:')
print(swing_until_20['modeldate'].min())
print(swing_until_20['modeldate'].max())
print(swing_until_20['state'].unique().tolist())
print(swing_until_20['candidate_name'].unique().tolist())

# Drop rows whose candidate_name starts with 'Convention Bounce'.
# na=False makes a missing candidate_name count as "does not start with", so
# the mask is strictly boolean. Without it, a missing name puts NaN into the
# mask and the `~` inversion can fail on object-dtype columns. Rows with a
# missing candidate_name are therefore kept, not dropped.
is_convention_bounce = swing_until_20['candidate_name'].str.startswith('Convention Bounce', na=False)
swing_until_20 = swing_until_20[~is_convention_bounce]

Estimated vs. trend-adjusted percentage

# Checking correlation between the percentages
adj_corr_swing = swing_until_20['pct_estimate'].corr(swing_until_20['pct_trend_adjusted'])
print('Correlation between estimated and trend-adjusted percentage in swing states: ' + str(adj_corr_swing))

# Mean of (pct_estimate - pct_trend_adjusted) over all rows at once; no
# grouping is applied here.
mean_diff = (swing_until_20['pct_estimate'] - swing_until_20['pct_trend_adjusted']).mean()
print('Mean difference between estimated and trend-adjusted percentage in swing states: ' + str(mean_diff))

# Finding out how often pct_estimate and pct_trend_adjusted saw different
# candidates in the lead in the 2020 race.
# .copy() gives an independent frame, so adding the rank columns below does
# not write into a slice of swing_until_20 (SettingWithCopyWarning).
swing_20 = swing_until_20[swing_until_20['cycle'] == 2020].copy()

# Rank the candidates within every (state, modeldate) group, once per
# percentage column; rank 1 is the highest percentage.
group_keys = ['state', 'modeldate']
swing_20['rank_estimate'] = swing_20.groupby(group_keys)['pct_estimate'].rank(ascending=False)
swing_20['rank_trend_adjusted'] = swing_20.groupby(group_keys)['pct_trend_adjusted'].rank(ascending=False)

# Rows where the two rankings disagree in swing states.
different_rankings_swing = swing_20[swing_20['rank_estimate'] != swing_20['rank_trend_adjusted']]

# Count each (state, modeldate) pair once. This replaces "row count / 2",
# which printed a float and was only right with exactly two rows per pair.
n_differing = different_rankings_swing[group_keys].drop_duplicates().shape[0]
print('Number of observations with differing leader: ' + str(n_differing))
print('Last occurence: ' + str(different_rankings_swing['modeldate'].max()))

Adding party column and historical result data

# Distinct candidate names in the historical swing-state data, in order of
# first appearance.
candidate_names = swing_until_20['candidate_name'].drop_duplicates().tolist()

# Hand-maintained lookup from candidate name to party code. A name that is
# not listed here ends up with a missing (NaN) party after the mapping below.
party_map = {
    'Joseph R. Biden Jr.': 'DEM',
    'Donald Trump': 'REP',
    'Hillary Rodham Clinton': 'DEM',
    'Gary Johnson': 'LIB',
    'Barack Obama': 'DEM',
    'Mitt Romney': 'REP',
    'John McCain': 'REP',
    'Ralph Nader': 'IND',
    'George W. Bush': 'REP',
    'John Kerry': 'DEM',
    'Al Gore': 'DEM',
    'Pat Buchanan': 'REF',
    'Bob Dole': 'REP',
    'Bill Clinton': 'DEM',
    'H. Ross Perot': 'IND',
    'George Bush': 'REP',
    'Michael S. Dukakis': 'DEM',
    'Walter F. Mondale': 'DEM',
    'Ronald Reagan': 'REP',
    'Jimmy Carter': 'DEM',
    'John B. Anderson': 'IND',
    'Gerald R. Ford': 'REP',
    'George S. McGovern': 'DEM',
    'Barry Goldwater': 'REP',
    'Hubert Humphrey, Jr.': 'DEM',
    'Richard M. Nixon': 'REP',
    'George Wallace': 'IND',
    'Eugene McCarthy': 'IND',
}

# One row per candidate, with the party looked up from party_map.
candidate_df = pd.DataFrame(candidate_names, columns=['candidate_name'])
candidate_df = candidate_df.assign(party=candidate_df['candidate_name'].map(party_map))

# Left-join the party onto the poll rows using the 'candidate_name' column,
# so every poll row is kept whether or not its candidate has a party entry.
swing_until_20 = pd.merge(
    swing_until_20,
    candidate_df[['candidate_name', 'party']],
    how='left',
    on='candidate_name',
)