Skip to content

The Nobel Prize has been among the most prestigious international awards since 1901. Each year, awards are bestowed in chemistry, literature, physics, physiology or medicine, economics, and peace. In addition to the honor, prestige, and substantial prize money, the recipient also gets a gold medal with an image of Alfred Nobel (1833 - 1896), who established the prize.

The Nobel Foundation has made available a dataset of all prize winners, from the first awards in 1901 through 2023. The dataset used in this project comes from the Nobel Prize API and is available in the nobel.csv file in the data folder.

In this project, you'll get a chance to explore and answer several questions related to this prizewinning data. We then encourage you to explore any further questions that interest you!

# Loading in required libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Start coding here!
# Load the laureate records from the CSV file in the data folder.
nobel_data = pd.read_csv("data/nobel.csv")

# Quick look at the data: the first rows, then the column labels.
print(nobel_data.head())

print(nobel_data.columns)

# value_counts() lists labels from most to least frequent, so the first
# index label is the most common value in the column.
country_counts = nobel_data["birth_country"].value_counts()
top_country = country_counts.index[0]
print("The most commonly awarded birth country:", top_country)

# Same approach for the most frequently recorded sex.
gender_counts = nobel_data["sex"].value_counts()
top_gender = gender_counts.index[0]
print("The most commonly awarded gender:", top_gender)
# us_born column: True where the recorded birth country is the USA.
# The comparison is False for every other value, including missing ones.
born_in_usa = nobel_data["birth_country"] == "United States of America"
nobel_data["us_born"] = born_in_usa
print(nobel_data.loc[born_in_usa, ["birth_country", "us_born"]].head())

# decade column: floor the year to the start of its decade and append "s",
# e.g. 1901 -> "1900s".
decade_start = nobel_data["year"] // 10 * 10
nobel_data["decade"] = decade_start.astype(str) + "s"
print(nobel_data.head())
print(nobel_data.tail())

# us_ratio column: percentage of US-born winners per decade.
# us_born is boolean, so its sum is the number of US-born laureates; the count
# of year gives the number of laureate rows in the decade.
nobel_pivot_table = nobel_data.pivot_table(
    values=["us_born", "year"],
    index="decade",
    aggfunc={"us_born": "sum", "year": "count"},
).rename(columns={"us_born": "USA winners", "year": "Total winners"})

# Keep the exact percentage for ranking. The stored us_ratio column is rounded
# to a whole percent and is meant for display and plotting only.
us_ratio_exact = nobel_pivot_table["USA winners"] / nobel_pivot_table["Total winners"] * 100
nobel_pivot_table["us_ratio"] = us_ratio_exact.round()
print(nobel_pivot_table.sort_values("us_ratio", ascending=False))

# Pick the top decade from the unrounded ratio: after rounding, two decades
# with different true ratios can tie, and the winner would then depend on the
# sort order. idxmax() returns the decade label (e.g. "2000s"); drop the
# trailing "s" to get the integer decade.
top_decade_label = us_ratio_exact.idxmax()
max_decade_usa = int(top_decade_label[:-1])
print("The decade had the highest ratio of US-born Nobel Prize winners to total winners in all categories:", str(max_decade_usa) + "s")

# Line chart of the US-born percentage per decade.
# "decade" is the index of nobel_pivot_table at this point, not a column.
ax = sns.lineplot(data=nobel_pivot_table, x="decade", y="us_ratio", marker="X")
ax.set_title("American Nobel Laureates Through the Decades")
ax.set_xlabel("Decade")
ax.set_ylabel("USA-born winners (%)")
# Rotate the decade labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()

# Bar chart: number of laureates per gender, with one bar per decade.
sns.countplot(
    data=nobel_data,
    x="sex",
    hue="decade",
    order=["Female", "Male"],
    alpha=0.6,
)
plt.xlabel("Gender")
plt.ylabel("Count of Nobel Prize Winners")
plt.title("Distribution of Nobel Prize Winners by Gender Over the Decades")
plt.show()

# Bar chart: number of laureates per prize category, with one bar per decade.
sns.catplot(
    kind="count",
    data=nobel_data,
    x="category",
    hue="decade",
    alpha=0.6,
)
plt.xlabel("Category")
plt.ylabel("Count of Nobel Prize Winners")
plt.title("Distribution of Nobel Prize Winners by Category Over the Decades")
plt.show()
# is_female column: True where the recorded sex is "Female".
nobel_data["is_female"] = nobel_data["sex"] == "Female"
print(nobel_data.loc[nobel_data["is_female"], ["sex", "is_female"]].head())

# female_ratio column: percentage of female winners per (decade, category).
# is_female is boolean, so its sum is the number of female laureates; the count
# of year gives the number of laureate rows in the group. reset_index() turns
# decade and category back into ordinary columns.
nobel_pivot_table = (
    nobel_data.pivot_table(
        values=["is_female", "year"],
        index=["decade", "category"],
        aggfunc={"is_female": "sum", "year": "count"},
    )
    .rename(columns={"is_female": "Female winners", "year": "Total winners"})
    .reset_index()
)

# Keep the exact percentage for ranking. The stored female_ratio column is
# rounded to a whole percent and is meant for display and plotting only.
female_ratio_exact = nobel_pivot_table["Female winners"] / nobel_pivot_table["Total winners"] * 100
nobel_pivot_table["female_ratio"] = female_ratio_exact.round()
print(nobel_pivot_table.sort_values("female_ratio", ascending=False))

# top decade and category
# Select the row from the unrounded ratio: after rounding, groups with
# different true ratios can tie, and the winner would then depend on the sort
# order. Fields are read by column name rather than by position.
top_female_row = nobel_pivot_table.loc[female_ratio_exact.idxmax()]

# Decade labels look like "2020s"; drop the trailing "s" to get the integer.
top_decade_female = int(top_female_row["decade"][:-1])
print("The decade had the highest ratio of female Nobel Prize winners to total winners in all categories:", str(top_decade_female) + "s")

top_category_female = top_female_row["category"]
print("The category had the highest ratio of female Nobel Prize winners to total winners:", top_category_female)

max_female_dict = {top_decade_female: top_category_female}
print(max_female_dict)

# Line chart of the female percentage per decade, one line per prize category.
ax = sns.lineplot(
    data=nobel_pivot_table,
    x="decade",
    y="female_ratio",
    hue="category",
    marker="X",
)
ax.set_title("Nobel Prize-Winning Women Across the Decades")
ax.set_xlabel("Decade")
ax.set_ylabel("Female winners (%)")
# Rotate the decade labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()
# first_woman: the earliest female laureate in the data.
female_winners = nobel_data[nobel_data["sex"] == "Female"]

# Sort once and take the first row. A stable sort keeps the original row order
# among laureates sharing the earliest year, so the pick is deterministic.
first_woman_row = female_winners.sort_values("year", kind="stable").iloc[0]

# Read the fields by column name instead of by position, so the result does
# not depend on the column order of the CSV.
first_woman_name = first_woman_row["full_name"]
print("The first woman to receive a Nobel Prize:", first_woman_name)

# first_woman_category
first_woman_category = first_woman_row["category"]
print("The category:", first_woman_category)
def _prize_counts(group_column):
    """Count laureate rows per distinct value of *group_column*.

    Returns a frame with two columns: the grouping column and "year", where
    "year" holds the number of rows with a non-missing year in that group.
    """
    table = nobel_data.pivot_table(values="year", index=group_column, aggfunc="count")
    return table["year"].reset_index()


# Prize counts per laureate name and per organization name.
ind_pivot = _prize_counts("full_name")
org_pivot = _prize_counts("organization_name")

# Names that appear more than once, taken from the first column (full_name).
multi_winner_names = ind_pivot.loc[ind_pivot["year"] > 1].values[:, 0]
print(multi_winner_names)

repeat_list = list(multi_winner_names)
print(repeat_list)
# us_born (alternative way): the comparison result is stored directly as a
# boolean column.
nobel_data["usa_born_winner"] = nobel_data["birth_country"].eq("United States of America")
print(nobel_data[["birth_country", "usa_born_winner"]].head())

# Replace the string decade labels with the integer start year of the decade,
# e.g. 1901 -> 1900.
decade_index = np.floor(nobel_data["year"] / 10)
nobel_data["decade"] = (decade_index * 10).astype(int)
print(nobel_data[["year", "decade"]].head())

# The mean of a boolean column is the fraction of True values, i.e. the
# proportion of US-born winners in each decade.
prop_usa_winners = nobel_data.groupby("decade", as_index=False)["usa_born_winner"].mean()
print(prop_usa_winners.head())

# Decade of the first row whose proportion equals the maximum.
highest_usa_share = prop_usa_winners["usa_born_winner"].max()
is_top_decade = prop_usa_winners["usa_born_winner"] == highest_usa_share
max_decade_usa = prop_usa_winners.loc[is_top_decade, "decade"].values[0]
print(max_decade_usa)

# NOTE(review): relplot is a figure-level function, so ax1 is presumably a
# FacetGrid rather than an Axes - confirm before calling Axes methods on it.
ax1 = sns.relplot(data=prop_usa_winners, x="decade", y="usa_born_winner", kind="line")
# is_female (alternative way): the comparison result is stored directly as a
# boolean column.
nobel_data["female_winner"] = nobel_data["sex"].eq("Female")
print(nobel_data["female_winner"].head())

# The mean of a boolean column is the fraction of True values, i.e. the
# proportion of female winners in each (decade, category) group.
prop_female_winners = nobel_data.groupby(["decade", "category"], as_index=False)["female_winner"].mean()
print(prop_female_winners.head())

# Keep every (decade, category) row whose proportion equals the maximum.
highest_female_share = prop_female_winners["female_winner"].max()
is_top_group = prop_female_winners["female_winner"] == highest_female_share
max_female_decade_category = prop_female_winners.loc[is_top_group, ["decade", "category"]]
print(max_female_decade_category)

# Map the decade of the first such row to its category.
top_female_decade = max_female_decade_category["decade"].iloc[0]
top_female_category = max_female_decade_category["category"].iloc[0]
max_female_dict = {top_female_decade: top_female_category}
print(max_female_dict)

# NOTE(review): relplot is a figure-level function, so ax2 is presumably a
# FacetGrid rather than an Axes - confirm before calling Axes methods on it.
ax2 = sns.relplot(
    data=prop_female_winners,
    x="decade",
    y="female_winner",
    hue="category",
    kind="line",
)
# first woman (alternative way): filter on the boolean female_winner column.
nobel_women = nobel_data.loc[nobel_data["female_winner"]]
print(nobel_women.head())

# Every row whose year equals the earliest year among female laureates.
earliest_female_year = nobel_women["year"].min()
min_row = nobel_women.loc[nobel_women["year"] == earliest_female_year]
print(min_row)

# Name and category are taken from the first of those rows.
first_woman_name = min_row["full_name"].iloc[0]
print(first_woman_name)

first_woman_category = min_row["category"].iloc[0]
print(first_woman_category)

print(f"\n The first woman to win a Nobel Prize was {first_woman_name}, in the category of {first_woman_category}.")
# more than 1 time (alternative way): count how often each name occurs.
counts = nobel_data["full_name"].value_counts()
print(counts)

# Index labels (names) that occur at least twice.
repeats = counts.loc[counts.ge(2)].index
print(repeats)

repeat_list = repeats.tolist()
print(repeat_list)

print(f"\n The repeat winners are: {', '.join(repeat_list)}")