Skip to content

The Nobel Prize has been among the most prestigious international awards since 1901. Each year, awards are bestowed in chemistry, literature, physics, physiology or medicine, economics, and peace. In addition to the honor, prestige, and substantial prize money, the recipient also gets a gold medal with an image of Alfred Nobel (1833 - 1896), who established the prize.

The Nobel Foundation has made available a dataset of all prize winners, from the first awards in 1901 through 2023. The dataset used in this project comes from the Nobel Prize API and is available in the nobel.csv file in the data folder.

In this project, you'll get a chance to explore and answer several questions related to this prizewinning data. We then encourage you to explore further questions that interest you!

# Loading in required libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Load the prize-winner records from the CSV file into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
nobel = pd.read_csv('data/nobel.csv')
# Bare expression: presumably here so a notebook cell displays the DataFrame;
# it has no effect when the file is run as a plain script.
nobel

What is the most commonly awarded gender and birth country?

  • Store your answers as string variables top_gender and top_country.
# --- Most commonly awarded gender -------------------------------------------
# In the sex column, there are Male, Female and null values; rows with a null
# sex are dropped before counting.
nobel_sex_not_null = nobel.dropna(subset=["sex"])

# Method 1: mode() returns the most frequent value(s); take the first one.
top_gender = nobel_sex_not_null["sex"].mode()[0]

# Method 2: value_counts() orders by descending frequency by default, so the
# first index label is the most common value.
gender_count = nobel_sex_not_null["sex"].value_counts()
top_gender_alternative = gender_count.index[0]

# Visualize total Male and Female Nobel Prize winners.
# x and y are passed explicitly instead of `data=gender_count`: handing a bare
# Series to `data=` relies on seaborn's wide-form inference, which is not
# guaranteed to produce one bar per category in every seaborn release.
sns.barplot(x=gender_count.index, y=gender_count.values)
plt.xlabel("Sex")
plt.ylabel("Number of Nobel Prize Winners")
plt.title("Nobel Prize Winners by Gender")
plt.show()

# --- Most commonly awarded birth country ------------------------------------
# Same two approaches as above, applied to the birth_country column.
nobel_country_not_null = nobel.dropna(subset=["birth_country"])
top_country = nobel_country_not_null["birth_country"].mode()[0]

country_count = nobel_country_not_null["birth_country"].value_counts()
top_country_alternative = country_count.index[0]

# Visualize top ten birth countries for Nobel Prize winners (horizontal bars:
# counts on x, country names on y).
top_ten_countries = country_count.head(10)

sns.barplot(x=top_ten_countries.values, y=top_ten_countries.index)
plt.xlabel("Winners per country")
plt.ylabel("Birth Country")
plt.title("Top 10 Countries of Nobel Prize Winners")
plt.show()

Which decade had the highest ratio of US-born Nobel Prize winners to total winners in all categories?

  • Store this as an integer called max_decade_usa.
# Calculate the decade for each row (e.g. 1903 -> 1900). The column is added
# to `nobel` itself because the female-laureate analysis groups by it too.
nobel["decade"] = (nobel["year"] // 10 * 10).astype(int)

# Split winners into US-born and everyone else. Rows with a missing
# birth_country compare unequal and therefore land in the non-US group.
us_born_mask = nobel["birth_country"] == "United States of America"
nobel_us_born = nobel[us_born_mask]
nobel_not_us_born = nobel[~us_born_mask]

# Winners per decade for each group, as DataFrames with columns
# ["decade", "count"].
nobel_us_decade_count = (
    nobel_us_born["decade"]
    .value_counts()
    .sort_index()
    .rename_axis("decade")
    .reset_index(name="count")
)
nobel_not_us_decade_count = (
    nobel_not_us_born["decade"]
    .value_counts()
    .sort_index()
    .rename_axis("decade")
    .reset_index(name="count")
)

# Outer merge so that a decade missing from one group is kept with a count of
# 0 instead of being silently dropped (an inner merge would discard a decade
# in which every winner was US-born, i.e. the largest possible ratio).
us_ratio = pd.merge(
    nobel_us_decade_count,
    nobel_not_us_decade_count,
    on="decade",
    how="outer",
    suffixes=("_usa", "_not_usa"),
)
count_columns = ["count_usa", "count_not_usa"]
us_ratio[count_columns] = us_ratio[count_columns].fillna(0).astype(int)
us_ratio = us_ratio.sort_values("decade").reset_index(drop=True)

# Ratio of US-born winners to *total* winners in the decade, as the question
# asks (not US-born to non-US-born). Every decade present has at least one
# winner, so the denominator is never zero.
us_ratio["ratio"] = us_ratio["count_usa"] / (
    us_ratio["count_usa"] + us_ratio["count_not_usa"]
)

max_ratio = us_ratio["ratio"].max()
# int(...) converts the NumPy integer to a plain Python int, as requested.
# On a tie, the earliest decade wins because us_ratio is sorted by decade.
max_decade_usa = int(
    us_ratio.loc[us_ratio["ratio"] == max_ratio, "decade"].iloc[0]
)

# Plot absolute winner counts per decade for both groups.
sns.lineplot(x="decade", y="count_usa", data=us_ratio, label="US Born")
sns.lineplot(x="decade", y="count_not_usa", data=us_ratio, label="Non-US Born")
plt.xticks(list(range(1900, 2030, 10)))
plt.xlabel("Decades")
plt.ylabel("Number of Nobel Prize Winners")
plt.show()

Which decade and Nobel Prize category combination had the highest proportion of female laureates?

  • Store this as a dictionary called max_female_dict where the decade is the key and the category is the value. There should only be one key:value pair.
# Split laureates into female and everyone else. Rows with a missing sex
# compare unequal to "Female" and therefore count as non-female here.
nobel_female = nobel[nobel["sex"] == "Female"]
nobel_non_female = nobel[nobel["sex"] != "Female"]

# Laureates per (decade, category) for each group.
female_counts = (
    nobel_female.groupby(["decade", "category"])
    .size()
    .reset_index(name="female_count")
)

non_female_counts = (
    nobel_non_female.groupby(["decade", "category"])
    .size()
    .reset_index(name="non_female_count")
)

# Left merge from the female counts: a (decade, category) pair in which every
# laureate was female has no row in non_female_counts and must be kept with a
# non-female count of 0. An inner merge would drop exactly those pairs, which
# are the ones with the highest possible proportion.
gender_ratio = pd.merge(
    female_counts,
    non_female_counts,
    on=["decade", "category"],
    how="left",
)
gender_ratio["non_female_count"] = (
    gender_ratio["non_female_count"].fillna(0).astype(int)
)

# Proportion of female laureates among all laureates of that decade/category.
# female_count is at least 1 in every row, so the denominator is never zero.
gender_ratio["female_ratio"] = gender_ratio["female_count"] / (
    gender_ratio["female_count"] + gender_ratio["non_female_count"]
)

max_ratio = gender_ratio["female_ratio"].max()
highest_female_ratio = gender_ratio.loc[
    gender_ratio["female_ratio"] == max_ratio
]

# Exactly one key:value pair is required, so only the first row at the maximum
# is used (earliest decade on a tie, since groupby sorts its keys). head(1)
# also yields an empty dict rather than an error when there are no rows.
max_female_dict = {
    int(row["decade"]): row["category"]
    for _, row in highest_female_ratio.head(1).iterrows()
}

print(max_female_dict)

# Number of female laureates per decade, one line per category.
sns.lineplot(data=gender_ratio, x="decade", y="female_count", hue="category")
plt.xlabel("Decades")
plt.ylabel("Number of Female Nobel Prize Winners")
# Render and close this figure so later plots do not draw onto the same axes
# when the file is run as a script.
plt.show()

Who was the first woman to receive a Nobel Prize, and in what category?

  • Save your string answers as first_woman_name and first_woman_category.
# Keep only female laureates, order them by award year, and take the first row.
female_laureates = nobel.loc[nobel["sex"] == "Female"]
first_woman = female_laureates.sort_values("year").iloc[:1]

# Pull the answers out of that single row.
earliest_record = first_woman.iloc[0]
first_woman_name = earliest_record["full_name"]
first_woman_category = earliest_record["category"]

print(f"The first woman to receive a Nobel Prize is {first_woman_name} in the category of {first_woman_category}")

Which individuals or organizations have won more than one Nobel Prize throughout the years?

  • Store the full names in a list named repeat_list.
# Number of prizes recorded for each full_name value.
name_count = nobel["full_name"].value_counts()

# Keep only names that appear more than once, then collect them into a list.
repeat_individuals = name_count.loc[name_count > 1]
repeat_list = repeat_individuals.index.tolist()

# Horizontal bar chart: award count on x, laureate name on y.
sns.barplot(x=repeat_individuals, y=repeat_individuals.index)
plt.xlabel("Number of Awards")
plt.ylabel("Individual / Organization")
plt.title("Individuals / Organizations with multiple Nobel Prize")
# Award counts are small whole numbers, so pin the ticks to 1, 2 and 3.
plt.xticks([1, 2, 3])