Olympics Data

Olympics

This is a historical dataset on the modern Olympic Games, from Athens 1896 to Rio 2016. Each row represents an individual athlete competing in a single Olympic event, together with the medal won (if any).

import pandas as pd

# Load the athlete-level results, report (rows, columns), and preview the
# first 100 rows.
olymp = pd.read_csv("data/athlete_events.csv.gz")
print((len(olymp.index), len(olymp.columns)))
olymp.head(n=100)

Data Dictionary

Column	Explanation
id	Unique number for each athlete
name	Athlete's name
sex	M or F
age	Age of the athlete
height	In centimeters
weight	In kilograms
team	Team name
noc	National Olympic Committee 3-letter code
games	Year and season
year	Integer
season	Summer or Winter
city	Host city
sport	Sport
event	Event
medal	Gold, Silver, Bronze, or NA

Source and license of the dataset. The dataset is a consolidated version of data from www.sports-reference.com.

Exploring the Dataset

In which year and city did the Netherlands win the highest number of medals in their history?

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the athlete-level results so this cell can run on its own.
olymp = pd.read_csv("data/athlete_events.csv.gz")

# NOTE(review): this keeps only rows whose team label is exactly
# "Netherlands". If Dutch entries also appear under other team labels,
# filtering on the `noc` column would be more complete -- confirm against
# the data before relying on the result.
olymp_netherlands = olymp[olymp["team"] == "Netherlands"]

# count() ignores missing values, so each (year, city) total is the number of
# rows with a medal recorded (assumes non-medalists are read as missing).
netherlands_medals = (
    olymp_netherlands.groupby(["year", "city"])["medal"].count().reset_index()
)

if netherlands_medals.empty:
    # Without this guard the [0] lookups below fail with an opaque IndexError.
    print("No rows found for the Netherlands")
else:
    # Max medal (keeps every Games tied for the top count)
    max_medal = netherlands_medals[
        netherlands_medals["medal"] == netherlands_medals["medal"].max()
    ]

    # Extract year, city and medal from the first top-scoring Games
    year = max_medal["year"].values[0]
    city = max_medal["city"].values[0]
    medal = max_medal["medal"].values[0]

    print(
        f"The Netherlands won their highest number of medals ({medal}) "
        f"in {year}, hosted in {city}"
    )

Visualizing

Plot visualizing the relationship between the number of athletes countries send to an event and the number of medals they receive.

# Count event entries and medals per NOC in one pass. Each row of `olymp` is
# one athlete competing in one event, and count() ignores missing values, so
# `event` is the number of entries and `medal` the number of medal-winning
# entries. Keeping both in a single NOC-indexed frame guarantees that every
# plotted point pairs a country's entries with that same country's medals,
# without relying on how the plotting library aligns separately sorted Series.
noc_totals = olymp.groupby("noc")[["event", "medal"]].count()

# Get the data teams in events
olymp_noc_event = noc_totals["event"].sort_values(ascending=False)
print(olymp_noc_event)

# Get the data medals by teams
olymp_noc_medal = noc_totals["medal"].sort_values(ascending=False)
print(olymp_noc_medal)

sns.scatterplot(data=noc_totals, x="event", y="medal")
plt.xlabel("Number of athletes in an event")
plt.ylabel("Number of medals")
plt.title("Medal vs Athletes sent to an event")
plt.show()

In which sports does the height of an athlete increase their chances of earning a medal?

# Compare medalist and non-medalist heights sport by sport with Welch's
# t-test, then plot the height distributions for the sports where the
# difference is significant.
from matplotlib.patches import Patch
from scipy.stats import ttest_ind

# Working copy of the results: drop rows without a recorded height (missing
# values would otherwise propagate into the test statistic) and turn `medal`
# into a 0/1 indicator (1 = any medal) so the comparisons below can split
# each sport into medalists and non-medalists.
data = olymp.dropna(subset=["height"]).assign(
    medal=lambda frame: frame["medal"].notna().astype(int)
)

# Get the list of unique sports
sports = data["sport"].unique()

# Dictionaries to store the results
results = {}      # sport -> p-value of the height comparison
height_gaps = {}  # sport -> mean medalist height minus mean non-medalist height

# Perform t-test for each sport
for sport, sport_data in data.groupby("sport"):
    medalists = sport_data.loc[sport_data["medal"] == 1, "height"]
    non_medalists = sport_data.loc[sport_data["medal"] == 0, "height"]

    if len(medalists) > 1 and len(non_medalists) > 1:  # Ensure there are enough data points
        # equal_var=False selects Welch's test (no equal-variance assumption).
        _, p_value = ttest_ind(medalists, non_medalists, equal_var=False)
        results[sport] = p_value
        height_gaps[sport] = medalists.mean() - non_medalists.mean()

# Convert results to DataFrame
results_df = pd.DataFrame(list(results.items()), columns=["Sport", "P-Value"])
# The test is two-sided, so report the sign of the gap as well: a positive
# value means medalists are taller on average in that sport.
results_df["Height Gap (cm)"] = results_df["Sport"].map(height_gaps)

# Sort by P-Value
results_df = results_df.sort_values(by="P-Value")

# Display the results
print(results_df)

# Filter sports with significant height differences
significant_sports = results_df[results_df["P-Value"] < 0.05]
print("Sports where height significantly affects the chances of winning a medal:")
print(significant_sports)

# Plot height distributions for significant sports
n_significant = len(significant_sports)
if n_significant:
    # Size the grid from the number of sports instead of assuming at most 25.
    n_cols = 5
    n_rows = -(-n_significant // n_cols)  # ceiling division
    plt.figure(figsize=(15, max(2 * n_rows, 4)))

    for i, sport in enumerate(significant_sports["Sport"], 1):
        plt.subplot(n_rows, n_cols, i)
        sport_data = data[data["sport"] == sport]
        sns.histplot(sport_data[sport_data["medal"] == 1]["height"], bins=30, kde=True, color="blue", label="Medalist")
        sns.histplot(sport_data[sport_data["medal"] == 0]["height"], bins=30, kde=True, color="red", label="Non-Medalist")
        plt.title(f"{sport}")
        plt.xlabel("Height (cm)")
        plt.ylabel("Frequency")

    # Add a single legend for the whole figure. Explicit handles tie each
    # label to its colour instead of leaving the label/artist pairing to
    # whatever artists the figure happens to contain.
    plt.figlegend(
        handles=[
            Patch(color="blue", label="Medalist"),
            Patch(color="red", label="Non-Medalist"),
        ],
        loc="upper center", ncol=2, bbox_to_anchor=(0.5, 1.0), frameon=False
    )

    # Adjust layout, reserving a strip at the top for the legend
    plt.tight_layout(rect=(0, 0, 1, 0.96))
    plt.show()

Olympics