The Nobel Prize has been among the most prestigious international awards since 1901. Each year, awards are bestowed in chemistry, literature, physics, physiology or medicine, economics, and peace. In addition to the honor, prestige, and substantial prize money, the recipient also gets a gold medal with an image of Alfred Nobel (1833 - 1896), who established the prize.
The Nobel Foundation has made available a dataset of all prize winners since the outset of the awards, covering 1901 to 2023. The dataset used in this project comes from the Nobel Prize API and is available in the `nobel.csv` file in the `data` folder.
In this project, you'll get a chance to explore and answer several questions related to this prizewinning data. We then encourage you to explore any further questions that interest you!
# Loading in required libraries
import pandas as pd
import seaborn as sns
import numpy as np
# Load the Nobel laureate records from the CSV file into a DataFrame
path_to_csv = './data/nobel.csv'
nobel = pd.read_csv(filepath_or_buffer=path_to_csv)
# Preview the first rows of the loaded table
nobel.head()
# Tally how often each value occurs in the sex and birth_country columns
gender_counts = nobel['sex'].value_counts()
country_counts = nobel['birth_country'].value_counts()

# The label with the highest tally is the most commonly awarded one
top_gender = gender_counts.idxmax()
print(f"The most commonly awarded gender is {top_gender}")
top_country = country_counts.idxmax()
print(f"The most commonly awarded birth country is {top_country}")
# Flag each row whose birth_country is exactly "United States of America"
nobel['usa_born_winner'] = nobel['birth_country'].eq('United States of America')
# Round each award year down to the start of its decade (e.g. 1987 -> 1980)
nobel['decade'] = (nobel['year'] // 10 * 10).astype(int)
# Mean of the boolean flag per decade = share of US-born winners in that decade;
# the result is a DataFrame with the columns decade and usa_born_winner
prop_usa_winners = (
    nobel
    .groupby('decade', as_index=False)['usa_born_winner']
    .mean()
)
# Visualize the proportion of US-born winners per decade with a relational line plot.
# relplot draws a scatter plot unless kind='line' is requested explicitly.
ax1 = sns.relplot(
    x='decade',
    y='usa_born_winner',
    data=prop_usa_winners,
    kind='line',
)
# Label the axes and title of the figure just created
# (the trailing semicolon is kept from the original statement)
ax1.set(
    xlabel='Decade',
    ylabel='Proportion of US-born winners',
    title='Proportion of US-born Nobel Prize Winners Per Decade',
);
# Flag each row whose sex is exactly "Female"
nobel['female_winner'] = nobel['sex'].eq('Female')
# Mean of the boolean flag per (decade, category) pair = share of female
# laureates; the result has the columns decade, category and female_winner
prop_female_winners = (
    nobel
    .groupby(['decade', 'category'], as_index=False)['female_winner']
    .mean()
)
# Visualize the proportion of female laureates per decade with a relational
# line plot, one colored line per prize category.
# relplot draws a scatter plot unless kind='line' is requested explicitly.
ax2 = sns.relplot(
    x='decade',
    y='female_winner',
    hue='category',
    data=prop_female_winners,
    kind='line',
)
# Label THIS figure (ax2). Calling .set on ax1 here would overwrite the labels
# of the US-born plot and leave the female-laureate plot unlabeled.
# (the trailing semicolon is kept from the original statement)
ax2.set(
    xlabel='Decade',
    ylabel='Proportion of female laureates',
    title='Proportion of female Nobel Prize winners per decade and category',
);
# Keep only the rows whose sex is "Female", ordered by award year
women_laureates = nobel.loc[nobel["sex"] == "Female"].sort_values(by="year")

# The first row after sorting is the earliest female laureate
first_woman = women_laureates.iloc[0]

# Pull out her name and the prize category for the report line
first_woman_name = first_woman.loc["full_name"]
first_woman_category = first_woman.loc["category"]
print(f"The first woman to receive a Nobel Prize was {first_woman_name} and the category was {first_woman_category}")
# Count how many rows (prizes) each full_name appears in
repeats = nobel["full_name"].value_counts()
# Collect every name that appears at least twice, in descending-count order
repeat_list = [name for name, prize_count in repeats.items() if prize_count >= 2]
# Report the repeat laureates
print(f"The individuals or organizations who have won multiple Nobel Prizes are: {repeat_list}")