Skip to content
Olympics Data
Olympics
This is a historical dataset on the modern Olympic Games, from Athens 1896 to Rio 2016. Each row represents an individual athlete competing in an Olympic event and records which medal, if any, was won.
import pandas as pd
# Load the athlete-events table from its gzip-compressed CSV, then report the
# table's dimensions as a (rows, columns) tuple.
olymp = pd.read_csv("data/athlete_events.csv.gz")
n_rows, n_columns = olymp.shape
print((n_rows, n_columns))
olymp.head(100)
Data Dictionary
| Column | Explanation |
|---|---|
| id | Unique number for each athlete |
| name | Athlete's name |
| sex | M or F |
| age | Age of the athlete |
| height | In centimeters |
| weight | In kilograms |
| team | Team name |
| noc | National Olympic Committee 3-letter code |
| games | Year and season |
| year | Integer |
| season | Summer or Winter |
| city | Host city |
| sport | Sport |
| event | Event |
| medal | Gold, Silver, Bronze, or NA |
Source and license of the dataset. The dataset is a consolidated version of data from www.sports-reference.com.
Exploring the Dataset
In which year and city did the Netherlands win the highest number of medals in their history?
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Reload the dataset so this cell can be run on its own.
olymp = pd.read_csv("data/athlete_events.csv.gz")

# Restrict to the rows entered under the team name "Netherlands".
olymp_netherlands = olymp[olymp["team"] == "Netherlands"]

# Tally medals per Games (year + host city). count() ignores missing values,
# so only rows that actually carry a medal are counted -- this assumes the
# "NA" marker was parsed as missing by read_csv (its default behaviour).
# Each row is one athlete in one event, so every member of a medal-winning
# team adds one to the tally.
netherlands_medals = (
    olymp_netherlands.groupby(["year", "city"])["medal"].count().reset_index()
)

# Rows (one per Games) that reach the maximum tally; ties keep every row.
max_medal = netherlands_medals[
    netherlands_medals["medal"] == netherlands_medals["medal"].max()
]

if max_medal.empty:
    # No matching rows at all: report it instead of failing with an
    # IndexError on the positional look-ups below.
    print("No medal data found for the Netherlands.")
else:
    # Extract year, city and medal count from the first best Games.
    year = max_medal["year"].values[0]
    city = max_medal["city"].values[0]
    medal = max_medal["medal"].values[0]
    print(
        f"The Netherlands won their highest number of medals ({medal}) "
        f"in {year}, hosted in {city}"
    )
Visualizing
Plot visualizing the relationship between the number of athletes countries send to an event and the number of medals they receive.
# Number of athlete-event rows per National Olympic Committee (NOC), i.e. how
# many entries each NOC has made, largest first.
olymp_noc_event = olymp.groupby("noc")["event"].count().sort_values(ascending=False)
print(olymp_noc_event)

# Number of medal-winning rows per NOC (count() skips missing medals),
# largest first.
olymp_noc_medal = olymp.groupby("noc")["medal"].count().sort_values(ascending=False)
print(olymp_noc_medal)

# The two Series above are sorted independently, so their row orders differ.
# Combine them into one frame, which aligns both on the NOC index, so every
# point pairs the entry count and medal count of the same NOC.
noc_totals = pd.DataFrame({"entries": olymp_noc_event, "medals": olymp_noc_medal})

sns.scatterplot(data=noc_totals, x="entries", y="medals")
plt.xlabel("Number of athletes in an event")
plt.ylabel("Number of medals")
plt.title("Medal vs Athletes sent to an event")
plt.show()
In which sports does the height of an athlete increase their chances of earning a medal?
from scipy.stats import ttest_ind

# Working copy of the athlete table with `medal` recoded as 1 (won any medal)
# or 0 (no medal) -- the encoding the comparisons below rely on.
data = olymp.assign(medal=olymp["medal"].notna().astype(int))

# Get the list of unique sports
sports = data['sport'].unique()

# Dictionary to store the results: sport -> p-value
results = {}

# Perform Welch's t-test (equal_var=False) on heights for each sport
for sport in sports:
    sport_data = data[data['sport'] == sport]
    # Drop missing heights (if any): ttest_ind propagates NaN by default,
    # which would turn the whole sport's p-value into NaN.
    medalists = sport_data[sport_data['medal'] == 1]['height'].dropna()
    non_medalists = sport_data[sport_data['medal'] == 0]['height'].dropna()
    if len(medalists) > 1 and len(non_medalists) > 1:  # Ensure there are enough data points
        _, p_value = ttest_ind(medalists, non_medalists, equal_var=False)
        results[sport] = p_value

# Convert results to DataFrame
results_df = pd.DataFrame(list(results.items()), columns=['Sport', 'P-Value'])

# Sort by P-Value
results_df = results_df.sort_values(by='P-Value')

# Display the results
print(results_df)

# Filter sports with significant height differences
significant_sports = results_df[results_df['P-Value'] < 0.05]
print("Sports where height significantly affects the chances of winning a medal:")
print(significant_sports)

# Plot height distributions for significant sports
n_significant = len(significant_sports)
if n_significant:
    # Keep the 5x5 grid while it is large enough; add rows only when there
    # are more significant sports than 25 subplot slots.
    n_cols = 5
    n_rows = max(5, (n_significant + n_cols - 1) // n_cols)

    plt.figure(figsize=(15, 10))
    for i, sport in enumerate(significant_sports['Sport'], 1):
        plt.subplot(n_rows, n_cols, i)
        sport_data = data[data['sport'] == sport]
        sns.histplot(sport_data[sport_data['medal'] == 1]['height'], bins=30, kde=True, color='blue', label='Medalist')
        sns.histplot(sport_data[sport_data['medal'] == 0]['height'], bins=30, kde=True, color='red', label='Non-Medalist')
        plt.title(f'{sport}')
        plt.xlabel('Height (cm)')
        plt.ylabel('Frequency')

    # Add a single legend for the whole figure
    plt.figlegend(
        ['Medalist', 'Non-Medalist'],  # Labels
        loc='upper center', ncol=2, bbox_to_anchor=(0.5, 1.02), frameon=False
    )

    # Adjust subplot spacing so titles and axis labels do not overlap
    plt.tight_layout()
    plt.show()