Skip to content

Olympics

This is a historical dataset on the modern Olympic Games, from Athens 1896 to Rio 2016. Each row represents an individual athlete competing in a single Olympic event, together with the medal won (if any).

import pandas as pd

# Load the athlete-events table; with pandas' default settings the gzip
# compression is inferred from the ".gz" file extension.
olymp = pd.read_csv("data/athlete_events.csv.gz")
# Report the table size as (number of rows, number of columns).
print(olymp.shape)
# Preview the first 100 rows. This is a bare expression: it only shows
# something when it is the last statement of a notebook cell (presumably the
# case here); run as a plain script it has no visible effect.
olymp.head(100)

Data Dictionary

| Column | Explanation |
| --- | --- |
| id | Unique number for each athlete |
| name | Athlete's name |
| sex | M or F |
| age | Age of the athlete |
| height | In centimeters |
| weight | In kilograms |
| team | Team name |
| noc | National Olympic Committee 3-letter code |
| games | Year and season |
| year | Integer |
| season | Summer or Winter |
| city | Host city |
| sport | Sport |
| event | Event |
| medal | Gold, Silver, Bronze, or NA |

Source and license of the dataset. The dataset is a consolidated version of data from www.sports-reference.com.

Exploring the Dataset

In which year and city did the Netherlands win the highest number of medals in their history?

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the dataset so this cell can be run on its own.
olymp = pd.read_csv("data/athlete_events.csv.gz")

# Select Dutch entries by NOC code rather than by the free-text team label:
# the NOC code identifies the country, whereas an exact match on the team
# name only keeps rows whose label is literally "Netherlands".
# NOTE(review): assumes "NED" is the Netherlands' code in this file - confirm
# with olymp.loc[olymp["team"] == "Netherlands", "noc"].unique().
olymp_netherlands = olymp[olymp["noc"] == "NED"]

# Medals per Games (year + host city). count() skips missing values, so only
# rows that actually carry a medal are tallied. Each row is one athlete in
# one event, so a team medal is counted once per team member.
netherlands_medals = (
    olymp_netherlands.groupby(["year", "city"])["medal"].count().reset_index()
)

# Fail with a clear message instead of an IndexError further down.
if netherlands_medals.empty:
    raise ValueError("No rows found for NOC 'NED'; cannot determine the best Games.")

# Max medal: keep every Games that reaches the maximum, so ties are not lost.
max_medal = netherlands_medals[
    netherlands_medals["medal"] == netherlands_medals["medal"].max()
]

# Extract year, city and medal of the first (or only) best Games
best_games = max_medal.iloc[0]
year = best_games["year"]
city = best_games["city"]
medal = best_games["medal"]

# Report every Games tied for the maximum (normally a single line).
for tied in max_medal.itertuples(index=False):
    print(
        f"The Netherlands won their highest number of medals ({tied.medal}) "
        f"in {tied.year}, hosted in {tied.city}"
    )

Visualizing

Plot visualizing the relationship between the number of athletes countries send to an event and the number of medals they receive.

# Group once by National Olympic Committee; both tallies below reuse it.
by_noc = olymp.groupby("noc")

# Entries per NOC: the number of rows with a non-missing "event" value,
# largest first (the ordering only matters for the printout).
olymp_noc_event = by_noc["event"].count().sort_values(ascending=False)
print(olymp_noc_event)

# Medals per NOC: count() ignores missing values, so only rows that carry a
# medal contribute. Also sorted largest first for the printout.
olymp_noc_medal = by_noc["medal"].count().sort_values(ascending=False)
print(olymp_noc_medal)

# The two Series are sorted differently but share the NOC index.
# NOTE(review): this relies on seaborn pairing the x and y values by index
# label rather than by position - confirm for the installed seaborn version.
ax = sns.scatterplot(x=olymp_noc_event, y=olymp_noc_medal)
ax.set_xlabel("Number of athletes in an event")
ax.set_ylabel("Number of medals")
ax.set_title("Medal vs Athletes sent to an event")
plt.show()

In which sports does the height of an athlete increase their chances of earning a medal?

# Welch's t-test comes from SciPy; imported here so the cell runs on its own.
from scipy.stats import ttest_ind

# Work on the loaded `olymp` frame, keeping only the columns this analysis
# needs. Athletes without a recorded height are dropped up front, because a
# missing value would turn the t-test result for the whole sport into NaN.
height_data = olymp[["sport", "height", "medal"]].dropna(subset=["height"]).copy()

# The medal column holds a medal name or is missing, so "won a medal" means
# "medal is not missing" - comparing it to the integers 1 and 0 never matches.
height_data["won_medal"] = height_data["medal"].notna()

# Get the list of unique sports
sports = height_data["sport"].unique()

# Per-sport results: p-value of the test, and how much taller medalists are
# than non-medalists on average (in cm; negative when medalists are shorter).
results = {}
height_gaps = {}

# Perform t-test for each sport
# NOTE(review): athletes appear once per event entered, so the observations
# are not fully independent and ~1 in 20 sports will pass p < 0.05 by chance;
# treat the p-values as indicative.
for sport in sports:
    sport_data = height_data[height_data["sport"] == sport]
    medalists = sport_data.loc[sport_data["won_medal"], "height"]
    non_medalists = sport_data.loc[~sport_data["won_medal"], "height"]

    # A variance estimate needs at least two observations in each group.
    if len(medalists) > 1 and len(non_medalists) > 1:
        # equal_var=False selects Welch's t-test (no equal-variance assumption).
        _, p_value = ttest_ind(medalists, non_medalists, equal_var=False)
        results[sport] = p_value
        height_gaps[sport] = medalists.mean() - non_medalists.mean()

# Convert results to DataFrame
results_df = pd.DataFrame(list(results.items()), columns=["Sport", "P-Value"])
results_df["Height Gap (cm)"] = results_df["Sport"].map(height_gaps)

# Sort by P-Value
results_df = results_df.sort_values(by="P-Value")

# Display the results
print(results_df)

# Keep sports where the difference is significant AND medalists are the
# taller group: the test is two-sided, so the p-value alone would also keep
# sports in which being shorter is the advantage.
significant_sports = results_df[
    (results_df["P-Value"] < 0.05) & (results_df["Height Gap (cm)"] > 0)
]
print("Sports where medalists are significantly taller than non-medalists:")
print(significant_sports)

# Plot height distributions for significant sports. The grid is sized from
# the number of sports, so it neither overflows nor leaves a mostly empty
# figure; nothing is drawn when no sport qualifies.
n_plots = len(significant_sports)
if n_plots:
    n_cols = min(5, n_plots)
    n_rows = (n_plots + n_cols - 1) // n_cols  # ceiling division
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(3 * n_cols, 2.5 * n_rows), squeeze=False
    )
    panels = axes.ravel()

    for ax, sport in zip(panels, significant_sports["Sport"]):
        sport_data = height_data[height_data["sport"] == sport]
        sns.histplot(
            sport_data.loc[sport_data["won_medal"], "height"],
            bins=30, kde=True, color="blue", label="Medalist", ax=ax,
        )
        sns.histplot(
            sport_data.loc[~sport_data["won_medal"], "height"],
            bins=30, kde=True, color="red", label="Non-Medalist", ax=ax,
        )
        ax.set_title(f"{sport}")
        ax.set_xlabel("Height (cm)")
        ax.set_ylabel("Frequency")

    # Hide the unused panels of the last row.
    for ax in panels[n_plots:]:
        ax.set_visible(False)

    # Add a single legend for the whole figure, taking the handles from the
    # first panel so the labels are tied to the artists they describe.
    handles, labels = panels[0].get_legend_handles_labels()
    fig.legend(
        handles, labels,
        loc="upper center", ncol=2, bbox_to_anchor=(0.5, 1.0), frameon=False,
    )

    # Adjust layout, reserving the top strip of the figure for the legend.
    fig.tight_layout(rect=(0, 0, 1, 0.95))
    plt.show()