Skip to content
Predicting the 2024 U.S. Presidential Election
Tutorial: Predicting the 2024 US Presidential Elections with Python
Data Sources
import pandas as pd # importing pandas package
# Read the three polling-average CSV files.
polls_24 = pd.read_csv('presidential_general_averages.csv')
polls_20 = pd.read_csv('presidential_poll_averages_2020.csv')
polls_until_16 = pd.read_csv('pres_pollaverages_1968-2016.csv')

# Keep only rows of the 2024 cycle. The explicit .copy() turns the filtered
# slice into an independent DataFrame, so the 'date' assignment below does not
# raise SettingWithCopyWarning on pandas versions without Copy-on-Write.
polls_24 = polls_24[polls_24['cycle'] == 2024].copy()

# Stack the 2020 file on top of the older file into one DataFrame.
polls_until_20 = pd.concat([polls_20, polls_until_16], ignore_index=True)

# making sure dates are in datetime format
polls_24['date'] = pd.to_datetime(polls_24['date'], format='%Y-%m-%d')
polls_until_20['modeldate'] = pd.to_datetime(polls_until_20['modeldate'])

# keeping only the columns of interest
polls_until_20 = polls_until_20[
    ['cycle', 'state', 'modeldate', 'candidate_name', 'pct_estimate', 'pct_trend_adjusted']
]

# importing result data (the file is semicolon-separated)
results_until_20 = pd.read_csv('results.csv', sep=';')
results_until_20 = results_until_20[['cycle', 'state', 'party', 'candidate', 'vote_share']]

# implementing cycle restriction: drop every cycle before start_cycle
start_cycle = 2000
polls_until_20 = polls_until_20[polls_until_20['cycle'] >= start_cycle]

# Defining state lists
swing_states = [
    'Pennsylvania', 'Wisconsin', 'Michigan', 'Georgia', 'North Carolina',
    'Arizona', 'Nevada',
]
blue_states = [
    'District of Columbia', 'Vermont', 'Massachusetts', 'Maryland', 'Hawaii',
    'California', 'ME-1', 'Connecticut', 'Washington', 'Delaware',
    'Rhode Island', 'New York', 'Illinois', 'New Jersey', 'Oregon',
    'Colorado', 'Maine', 'New Mexico', 'Virginia', 'New Hampshire', 'NE-2',
    'Minnesota',
]
red_states = [
    'Wyoming', 'West Virginia', 'Oklahoma', 'North Dakota', 'Idaho',
    'South Dakota', 'Arkansas', 'Kentucky', 'NE-3', 'Alabama', 'Tennessee',
    'Utah', 'Louisiana', 'Nebraska', 'Mississippi', 'Montana', 'NE-1',
    'Indiana', 'Kansas', 'Missouri', 'South Carolina', 'Alaska', 'Ohio',
    'Iowa', 'Texas', 'ME-2', 'Florida',
]

# Defining swing state subset of the poll data
swing_until_20 = polls_until_20[polls_until_20['state'].isin(swing_states)]
swing_24 = polls_24[polls_24['state'].isin(swing_states)]

# --- Exploratory Data Analysis and Data Cleaning ---
# checking for missing values in swing_24 and swing_until_20
print(swing_24.isnull().sum())
print(swing_until_20.isnull().sum())

# Summary of the 2024 swing-state data
print('2024 data:')
print(swing_24['date'].min())  # earliest polling date
print(swing_24['date'].max())  # latest polling date
print(swing_24['state'].unique().tolist())  # distinct states
print(swing_24['party'].unique().tolist())  # distinct parties
print(swing_24['candidate'].unique().tolist())  # distinct candidates

# The same summary for the historical swing-state data
print('Historical data:')
print(swing_until_20['modeldate'].min())
print(swing_until_20['modeldate'].max())
print(swing_until_20['state'].unique().tolist())
print(swing_until_20['candidate_name'].unique().tolist())

# Only keep rows where candidate_name does not start with 'Convention Bounce'.
# na=False makes a missing candidate_name count as "does not start with", so
# the mask is purely boolean and can be inverted with ~ even if the column
# contains NaN; rows with a missing name are kept.
swing_until_20 = swing_until_20[
    ~swing_until_20['candidate_name'].str.startswith('Convention Bounce', na=False)
]

# --- Estimated vs. trend-adjusted percentage ---
# Checking correlation between the percentages
adj_corr_swing = swing_until_20['pct_estimate'].corr(swing_until_20['pct_trend_adjusted'])
print('Correlation between estimated and trend-adjusted percentage in swing states: ' + str(adj_corr_swing))

# Overall mean of (pct_estimate - pct_trend_adjusted) across all rows.
# No grouping is applied: this is a single number for the whole DataFrame.
mean_diff = (swing_until_20['pct_estimate'] - swing_until_20['pct_trend_adjusted']).mean()
print('Mean difference between estimated and trend-adjusted percentage in swing states: ' + str(mean_diff))

# Finding out how often pct_estimate and pct_trend_adjusted saw different
# candidates in the lead in 2020 race.
# .copy() makes swing_20 an independent DataFrame so that adding the rank
# columns below does not raise SettingWithCopyWarning.
swing_20 = swing_until_20[swing_until_20['cycle'] == 2020].copy()

# Rank the candidates within each (state, modeldate) group, once per metric
# (rank 1 = highest percentage).
swing_20['rank_estimate'] = swing_20.groupby(['state', 'modeldate'])['pct_estimate'].rank(ascending=False)
swing_20['rank_trend_adjusted'] = swing_20.groupby(['state', 'modeldate'])['pct_trend_adjusted'].rank(ascending=False)

# Rows where the rankings are different in swing states
different_rankings_swing = swing_20[swing_20['rank_estimate'] != swing_20['rank_trend_adjusted']]

# Count each (state, modeldate) pair once. This replaces dividing the row
# count by 2, which was only correct with exactly two candidates per group
# and produced a float.
differing_dates_count = different_rankings_swing[['state', 'modeldate']].drop_duplicates().shape[0]
print('Number of observations with differing leader: ' + str(differing_dates_count))
print('Last occurence: ' + str(different_rankings_swing['modeldate'].max()))

# --- Adding party column and historical result data ---
# Candidate name -> party abbreviation, as used in the poll files
party_map = {
    'Joseph R. Biden Jr.': 'DEM',
    'Donald Trump': 'REP',
    'Hillary Rodham Clinton': 'DEM',
    'Gary Johnson': 'LIB',
    'Barack Obama': 'DEM',
    'Mitt Romney': 'REP',
    'John McCain': 'REP',
    'Ralph Nader': 'IND',
    'George W. Bush': 'REP',
    'John Kerry': 'DEM',
    'Al Gore': 'DEM',
    'Pat Buchanan': 'REF',
    'Bob Dole': 'REP',
    'Bill Clinton': 'DEM',
    'H. Ross Perot': 'IND',
    'George Bush': 'REP',
    'Michael S. Dukakis': 'DEM',
    'Walter F. Mondale': 'DEM',
    'Ronald Reagan': 'REP',
    'Jimmy Carter': 'DEM',
    'John B. Anderson': 'IND',
    'Gerald R. Ford': 'REP',
    'George S. McGovern': 'DEM',
    'Barry Goldwater': 'REP',
    'Hubert Humphrey, Jr.': 'DEM',
    'Richard M. Nixon': 'REP',
    'George Wallace': 'IND',
    'Eugene McCarthy': 'IND',
}

# Distinct candidate names present in the historical swing-state polls
candidate_names = swing_until_20['candidate_name'].unique().tolist()

# One-row-per-candidate lookup table; names missing from party_map get NaN
candidate_df = pd.DataFrame(data=candidate_names, columns=['candidate_name'])
candidate_df['party'] = candidate_df['candidate_name'].map(party_map)

# Attach the party to every poll row via a left join on 'candidate_name'
swing_until_20 = swing_until_20.merge(
    right=candidate_df[['candidate_name', 'party']],
    how='left',
    on='candidate_name',
)