# NIGERIA ELECTIONS - Outlier Detection
import pandas as pd
import requests
import time
from math import radians, sin, cos, sqrt, atan2
import pandas as pd
from scipy.stats import zscore
import numpy as np
import matplotlib.pyplot as plt
# Load the dataset
file_path = 'data/ADAMAWA_crosschecked.csv'
data = pd.read_csv(file_path)
# Display the first few rows of the dataset
data.head()
# Load the geocoded dataset once. `geocoded_file` always holds the path
# string and `geocoded_data` the parsed DataFrame, so neither name changes
# type part-way through the script and the CSV is not parsed twice.
geocoded_file = 'data/ADAMAWA_geocoded.csv'
geocoded_data = pd.read_csv(geocoded_file)
# View the first few rows of the geocoded dataset
geocoded_data.head()
# Calculate Haversine distance function
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.

    Returns:
        The haversine distance in kilometres, using an Earth radius of
        6371.0 km. NaN inputs yield a NaN result.
    """
    R = 6371.0  # Earth radius in kilometers
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make sqrt(1 - a) raise ValueError. A plain comparison is used
    # (rather than min/max) so that NaN is left untouched and propagates.
    if a > 1.0:
        a = 1.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c
# Identify neighbours within 1 km
radius_km = 1

# Pull the needed columns out once; indexing plain lists is far cheaper than
# building a Series per row with nested iterrows().
pu_codes = geocoded_data['PU-Code'].tolist()
latitudes = geocoded_data['Latitude'].tolist()
longitudes = geocoded_data['Longitude'].tolist()

# The distance is symmetric, so evaluate each unordered pair once and record
# it in both directions (halves the number of haversine calls).
neighbour_pairs = []
for i in range(len(pu_codes)):
    for j in range(i + 1, len(pu_codes)):
        distance = haversine(latitudes[i], longitudes[i], latitudes[j], longitudes[j])
        # A NaN distance (missing coordinates) compares False and is skipped.
        if distance <= radius_km:
            neighbour_pairs.append((i, j, distance))
            neighbour_pairs.append((j, i, distance))

# Restore the row order of a plain nested scan: by source row, then by
# neighbour row.
neighbour_pairs.sort(key=lambda pair: (pair[0], pair[1]))
neighbours = [
    {'PU-Code': pu_codes[src], 'Neighbour_PU-Code': pu_codes[dst], 'Distance_km': dist}
    for src, dst, dist in neighbour_pairs
]

# Convert neighbours list to a DataFrame. The columns are given explicitly so
# the frame keeps its schema (and later groupby('PU-Code') still works) even
# when no two polling units lie within the radius.
neighbours_df = pd.DataFrame(neighbours, columns=['PU-Code', 'Neighbour_PU-Code', 'Distance_km'])
# Calculate outlier scores for each party
party_columns = ['APC', 'LP', 'PDP', 'NNPP']
outlier_scores = []
pu_code_column = geocoded_data['PU-Code']
# A neighbours frame built from an empty list has no columns at all; treat
# that as "no groups" instead of letting groupby raise KeyError.
if 'PU-Code' in neighbours_df.columns:
    neighbour_groups = neighbours_df.groupby('PU-Code')
else:
    neighbour_groups = []
for pu_code, group in neighbour_groups:
    neighbour_codes = list(group['Neighbour_PU-Code'])
    # These row selections do not depend on the party, so do them once per
    # polling unit rather than once per (polling unit, party).
    own_rows = geocoded_data.loc[pu_code_column == pu_code]
    neighbour_rows = geocoded_data.loc[pu_code_column.isin(neighbour_codes)]
    for party in party_columns:
        current_votes = own_rows[party].values[0]
        neighbour_votes = neighbour_rows[party].values
        if len(neighbour_votes) > 0:
            mean_votes = np.mean(neighbour_votes)
            std_votes = np.std(neighbour_votes)  # population std (ddof=0)
            if std_votes > 0:
                z_score = (current_votes - mean_votes) / std_votes
            else:
                z_score = 0  # Avoid division by zero
        else:
            z_score = 0  # No neighbours to compare with
        outlier_scores.append({
            'PU-Code': pu_code,
            'Party': party,
            'Outlier_Score': z_score,
            'Neighbours': list(neighbour_codes),  # fresh list per record
        })
# Convert outlier scores to a DataFrame. Explicit columns keep the schema when
# there are no records, so the sort below does not fail on missing keys.
outlier_scores_df = pd.DataFrame(
    outlier_scores,
    columns=['PU-Code', 'Party', 'Outlier_Score', 'Neighbours'],
)
# Sort the dataset by outlier scores for each party (highest score first)
sorted_outlier_scores_df = outlier_scores_df.sort_values(by=['Party', 'Outlier_Score'], ascending=[True, False])
# Save the sorted outlier scores to an Excel sheet
excel_file = 'data/Sorted_Outlier_Scores.xlsx'
sorted_outlier_scores_df.to_excel(excel_file, index=False)
# Draw one bar chart of outlier scores per party on a 2x2 grid of panels
party_titles = ['APC', 'LP', 'PDP', 'NNPP']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10), sharex=True)
# axes.flat walks the grid row by row, matching the party order.
for panel, party, title in zip(axes.flat, party_columns, party_titles):
    is_party = sorted_outlier_scores_df['Party'] == party
    party_data = sorted_outlier_scores_df[is_party]
    panel.bar(party_data['PU-Code'], party_data['Outlier_Score'])
    panel.set_title(f'{title} Outlier Scores')
    panel.set_xlabel('Polling Unit')
    panel.set_ylabel('Outlier Score')
    # Polling-unit codes are long; turn them vertical so they do not overlap.
    panel.tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()
# Pick the three rows with the highest Outlier_Score across all parties
top_3_outliers = sorted_outlier_scores_df.nlargest(n=3, columns='Outlier_Score')
# Bare expression: shows the result when run interactively (e.g. a notebook cell)
top_3_outliers
# Scatter plot of the three highest-scoring polling units for every party
# Reload the scores that were written to the workbook earlier in the script.
sorted_outlier_scores_df = pd.read_excel('data/Sorted_Outlier_Scores.xlsx')
# head(3) keeps the first three rows of each party; these are the highest
# scores because the sheet was saved sorted by party, then by score descending.
top_3_outliers_df = sorted_outlier_scores_df.groupby('Party').head(3)
party_columns = ['APC', 'LP', 'PDP', 'NNPP']
colors = {'APC': 'red', 'LP': 'green', 'PDP': 'blue', 'NNPP': 'orange'}
fig, ax = plt.subplots(figsize=(15, 10))
for party in party_columns:
    party_mask = top_3_outliers_df['Party'] == party
    party_data = top_3_outliers_df[party_mask]
    ax.scatter(
        party_data['PU-Code'],
        party_data['Outlier_Score'],
        color=colors[party],
        label=party,
        s=100,
    )
# Axis decoration
ax.set_title('Top 3 Outliers for Each Party')
ax.set_xlabel('Polling Unit')
ax.set_ylabel('Outlier Score')
ax.legend()
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()