Skip to content

The Nobel Prize has been among the most prestigious international awards since 1901. Each year, awards are bestowed in chemistry, literature, physics, physiology or medicine, economics, and peace. In addition to the honor, prestige, and substantial prize money, the recipient also gets a gold medal with an image of Alfred Nobel (1833 - 1896), who established the prize.

The Nobel Foundation has made available a dataset of all prize winners, from the first awards in 1901 through 2023. The dataset used in this project comes from the Nobel Prize API and is available in the nobel.csv file in the data folder.

In this project, you'll get a chance to explore and answer several questions related to this prizewinning data. We then encourage you to explore any further questions that interest you!

# Loading in required libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Read the Nobel Prize records from the CSV file into a DataFrame
csv_path = 'data/nobel.csv'
nobel_data = pd.read_csv(csv_path)

# 1. What is the most commonly awarded gender and birth country?
top_gender = nobel_data['sex'].value_counts().idxmax()  # Most common gender

# Tally birth countries once and reuse the result; value_counts() orders the
# counts from most to least frequent, so head(5) yields the five most common.
country_counts = nobel_data['birth_country'].value_counts()
top_country = country_counts.idxmax()  # Most common birth country
top_countries = country_counts.head(5).index  # Labels of the top 5 birth countries
filtered_data = nobel_data[nobel_data['birth_country'].isin(top_countries)]  # Rows for the top 5 countries only

# Count plot of winners by gender
g1 = sns.catplot(x='sex', kind='count', data=nobel_data, hue='sex', palette='Blues')
g1.fig.suptitle("Most Commonly Awarded Gender", fontsize=12, y=1.01)  # Adding title
g1.set_axis_labels("Gender", "Count of Nobel Winners")  # Adding labels
g1.fig.tight_layout()

# Count plot of winners for the top 5 birth countries; `order` draws the bars
# from most to least common rather than in seaborn's default ordering.
g2 = sns.catplot(x='birth_country', kind='count', data=filtered_data, order=top_countries)
g2.fig.suptitle("Top 5 Most Common Birth Countries", fontsize=16, y=1.05)  # Adding title
g2.set_axis_labels("Birth Country", "Count of Nobel Winners")  # Adding labels
plt.xticks(rotation=45)  # Rotate the x-axis labels of the current (g2) axes
g2.fig.tight_layout()  # Run after the rotation so the layout accounts for the rotated labels
print(f"1.Most Commonly Awarded Gender: {top_gender}, Most Commonly Awarded Birth Country: {top_country}")



# 2. Which decade had the highest ratio of US-born Nobel Prize winners to total winners in all categories?
# Flag the rows whose birth country is the USA and bucket every award year
# into the decade it falls in (e.g. 1987 -> 1980).
nobel_data['US-born'] = nobel_data['birth_country'].eq('United States of America')
nobel_data['decade'] = (nobel_data['year'] // 10 * 10).astype(int)

# The mean of a boolean column is the share of True values, i.e. the
# US-born ratio within each decade.
us_born_ratio = nobel_data.groupby('decade', as_index=False)['US-born'].mean()

# Look up the decade on the row holding the largest ratio (first one on ties).
peak_row = us_born_ratio['US-born'].idxmax()
max_decade_usa = us_born_ratio.loc[peak_row, 'decade']

# Line plot of the ratio over time
g3 = sns.relplot(data=us_born_ratio, x='decade', y='US-born', kind='line')
g3.fig.suptitle('US-born Nobel Prize Winners Ratio by Decade', fontsize=12, y=1.01)
g3.set_axis_labels('Decade', 'US-born Ratio')
g3.fig.tight_layout()
plt.grid(axis='y', linestyle='--', linewidth=0.5, color='gray')  # Horizontal guide lines
print(f"\n2.The decade with the highest ratio of US-born Nobel Prize winners to total winners is {max_decade_usa}s")




# 3. Which decade and Nobel Prize category combination had the highest proportion of female laureates?
# Boolean flag per row: True when the laureate's recorded sex is 'Female'.
nobel_data['female_winner'] = nobel_data['sex'].eq('Female')

# Share of female laureates for every (decade, category) pair.
female_winner_prop = nobel_data.groupby(['decade', 'category'], as_index=False)['female_winner'].mean()

# Keep the decade/category of every row that reaches the maximum share,
# then take the first such row for the summary dictionary and message.
top_share = female_winner_prop['female_winner'].max()
is_top = female_winner_prop['female_winner'] == top_share
max_female_decade_category = female_winner_prop.loc[is_top, ['decade', 'category']]
best_decade = max_female_decade_category['decade'].iloc[0]
best_category = max_female_decade_category['category'].iloc[0]
max_female_dict = {best_decade: best_category}  # {decade: category}

# One line per category showing the female share across decades
g4 = sns.relplot(data=female_winner_prop, x='decade', y='female_winner', hue='category', kind='line')
g4.fig.suptitle('Proportion of Female Nobel Laureates by Decade and Category', fontsize=12, y=1.02)
g4.set_axis_labels('Decade', 'Female winners proportion')
g4.fig.tight_layout()
plt.subplots_adjust(right=0.75)  # Shrink the plot area to 75% of the figure width, leaving space on the right
plt.grid(axis='y', linestyle='--', linewidth=0.5, color='gray')  # Horizontal guide lines
print(f"\n3.The combination of decade and Nobel Prize category with the highest proportion of female laureates is {best_decade} and {best_category} with a proportion of {top_share:.2f}.")




# 4. Who was the first woman to receive a Nobel Prize, and in what category?
# Select the female laureate with the earliest award year explicitly rather
# than taking the first matching row, so the answer does not depend on the
# CSV happening to be sorted chronologically.
female_laureates = nobel_data[nobel_data['sex'] == 'Female']
# argmin() returns the position of the first occurrence of the smallest year,
# so ties within the earliest year resolve to the row that appears first.
first_woman_row = female_laureates.iloc[female_laureates['year'].argmin()]
first_woman_name = first_woman_row['full_name']  # Name of the first female laureate
first_woman_category = first_woman_row['category']  # Category of her prize
print(f"\n4.The first woman to win a Nobel Prize was {first_woman_name}, in the category of {first_woman_category}.")



# 5. Which individuals or organizations have won more than one Nobel Prize throughout the years?
# Number of rows (prizes) recorded under each full name.
name_counts = nobel_data['full_name'].value_counts()
# Names that appear at least twice, kept in value_counts() order.
has_multiple_prizes = name_counts >= 2
repeat_list = name_counts[has_multiple_prizes].index.tolist()
print("\n5.The repeat winners are :", repeat_list)