Skip to content

The Nobel Prize has been among the most prestigious international awards since 1901. Each year, awards are bestowed in chemistry, literature, physics, physiology or medicine, economics, and peace. In addition to the honor, prestige, and substantial prize money, the recipient also gets a gold medal with an image of Alfred Nobel (1833 - 1896), who established the prize.

The Nobel Foundation has made available a dataset of all prize winners, from the first awards in 1901 through 2023. The dataset used in this project comes from the Nobel Prize API and is available in the nobel.csv file in the data folder.

In this project, you'll get a chance to explore and answer several questions related to this prizewinning data. We then encourage you to explore further questions that interest you!

# Loading in required libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Load the laureate records, then preview the first rows and the column labels
nobel = pd.read_csv("data/nobel.csv")
print(nobel.head(), nobel.columns, sep="\n")

What is the most commonly awarded gender and birth country?

# Most frequent gender among laureates
## value_counts() orders labels from most to least frequent, so the first label wins
gender_counts = nobel['sex'].value_counts()
top_gender = gender_counts.index[0]
print("The most commonly awarded gender is", top_gender)

# Most frequent birth country among laureates
## Same approach: the first label of the frequency table is the most common one
country_counts = nobel['birth_country'].value_counts()
top_country = country_counts.index[0]
print("The most commonly awarded country is", top_country)

What decade had the highest proportion of US-born winners?

# Which decade had the largest share of US-born laureates?

## Boolean marker: True when the laureate's birth country is the USA
nobel['US_born'] = nobel['birth_country'].eq('United States of America')

## Derive the decade by rounding each award year down to a multiple of ten
nobel['decade'] = np.floor(nobel['year'].div(10)).mul(10).astype(int)

## The mean of the boolean marker within a decade is the US-born proportion
usa_born_winner = nobel.groupby('decade')['US_born'].mean().reset_index()

## Read the decade off the row where that proportion peaks
peak_row = usa_born_winner['US_born'].idxmax()
max_decade_usa = usa_born_winner.loc[peak_row, 'decade']

## Optional plot: proportion of US-born laureates over time
sns.relplot(data=usa_born_winner, x='decade', y='US_born', kind='line')
plt.title("Proportion of US-Born Winners Across Decades")
plt.xlabel("Decade")
plt.ylabel("Proportion of US-Born Winners")
plt.show()

## Report the answer
print(f"The decade which had the highest proportion of US-born winners is {max_decade_usa}.")

What decade and category pair had the highest proportion of female laureates?

# Finding the decade and category pair that had the highest proportion of female laureates
# (relies on the 'decade' column added to `nobel` earlier in this script)

## Flagging winners whose gender is female
nobel['female_winner'] = nobel['sex'] == 'Female'

## Proportion of female winners for every (decade, category) pair:
## the mean of a boolean flag is the share of True values in the group
female_laureates = nobel.groupby(['decade', 'category'], as_index=False)['female_winner'].mean()

## Selecting the row with the highest proportion; .loc with a single row label
## and a list of columns returns a Series holding that row's decade and category
max_decade_category = female_laureates.loc[female_laureates['female_winner'].idxmax(), ['decade', 'category']]

## Building the {decade: category} answer. The row elements are already scalars,
## so they are used directly (no .values[0]); int() keeps the key a plain integer.
max_female_dict = {int(max_decade_category['decade']): max_decade_category['category']}

## Optional plot
sns.relplot(kind='line', data=female_laureates, x='decade', y='female_winner', hue='category')
plt.title("Proportion of Female Laureates Across Decades")
plt.xlabel("Decade")
plt.ylabel("Proportion of Female Laureates")
plt.show()

## Showing the answer to the question
print(f"The decade and category pair which had the highest proportion of female laureates is {max_female_dict}.")

Who was the first woman to receive a Nobel Prize, and in what category?

## First female Nobel laureate and the category of her prize

# Keep only the rows that belong to female laureates
is_female = nobel['sex'].eq('Female')
female_laureates_by_year = nobel[is_female]

# The row with the smallest award year is the earliest female laureate
earliest_label = female_laureates_by_year['year'].idxmin()
first_female_laureate = female_laureates_by_year.loc[earliest_label]

# Pull her name and prize category out of that row
first_woman_name, first_woman_category = (
    first_female_laureate['full_name'],
    first_female_laureate['category'],
)

print(f"The first woman to receive a Nobel Prize is {first_woman_name} in the category of {first_woman_category}.")

Which individuals or organizations have won multiple Nobel Prizes throughout the years?

# Individuals or organizations that appear more than once among the laureates

## Number of prizes recorded per full_name
repeats = nobel['full_name'].value_counts()

## Keep the names whose prize count is two or more, as a plain list
repeat_list = [name for name, prize_count in repeats.items() if prize_count >= 2]

## Report the result
print(f"{repeat_list} have received multiple Nobel prizes throughout the years.")