# Exploratory analysis of Nobel Prize laureates (reads data/nobel.csv)
# Loading in required libraries
import pandas as pd
import seaborn as sns
import numpy as np

# Start coding here!
# Load the laureate records; the sections below all work off this frame.
df = pd.read_csv('data/nobel.csv')

# Tally laureates per gender and per birth country, then keep the label
# carrying the largest tally in each case.
gender_counts = df['sex'].value_counts()
country_counts = df['birth_country'].value_counts()
top_gender = gender_counts.idxmax()
top_country = country_counts.idxmax()

print(
    "The most commonly awarded gender is "
    f"{top_gender} and most commonly awarded birth country is {top_country}"
)


# Identify the decade with the highest ratio of US-born winners.
# Rows whose birth country is missing compare unequal and therefore count
# as not US-born.
df["US-born_winners"] = df['birth_country'] == "United States of America"

# Integer floor division buckets each year into its decade (1987 -> 1980)
# without a round-trip through floating point.
df["decade"] = (df['year'] // 10 * 10).astype(int)

# The mean of a boolean column is the share of True values, i.e. the
# proportion of US-born winners within each decade.
ratio = df.groupby('decade', as_index=False)['US-born_winners'].mean()

# idxmax returns the first row holding the maximum; int() turns the NumPy
# scalar into a plain Python int so it prints and compares like one.
max_decade_usa = int(ratio.loc[ratio['US-born_winners'].idxmax(), 'decade'])

print(f"The decade with the highest ratio of US-born Nobel Prize winners across all categories is {max_decade_usa}")

g = sns.relplot(x='decade', y='US-born_winners', data=ratio, kind="line")
# y > 1 lifts the title above the axes area.
g.fig.suptitle("The proportion of US-born winners by decade", y=1.1)

# Find the decade and category with the highest proportion of female laureates.
# Rows whose sex is missing compare unequal and therefore count as not female.
df['female_winners'] = df['sex'] == 'Female'

# Mean of the boolean flag = share of female laureates per (decade, category).
groupby_dec_cat = df.groupby(["decade", "category"], as_index=False)['female_winners'].mean()

# Keep every (decade, category) pair tied for the maximum share; the first
# such pair is the one reported below.
is_max_share = groupby_dec_cat['female_winners'] == groupby_dec_cat['female_winners'].max()
max_female = groupby_dec_cat.loc[is_max_share, ['decade', 'category']]

# Plain int key (not a NumPy scalar) so the dict compares and prints cleanly.
top_female_decade = int(max_female['decade'].iloc[0])
top_female_category = max_female['category'].iloc[0]
max_female_dict = {top_female_decade: top_female_category}

print(f"The highest proportion of female laureates was in the {top_female_decade}s, in the {top_female_category} category")

r = sns.relplot(x='decade', y='female_winners', data=groupby_dec_cat, kind="line", hue="category")
# y > 1 lifts the title above the axes area.
r.fig.suptitle("The proportion of female laureates per decade and category", y=1.1)

# Identify the first woman to receive a Nobel Prize and in what category.
filter_gender_category = df.loc[df['sex'] == "Female", ["full_name", "category", "year"]]

# A stable sort keeps rows with the same year in their original file order,
# so the pick is deterministic when several women share the earliest year
# (the default quicksort gives no such guarantee).
first_woman_row = filter_gender_category.sort_values('year', kind='mergesort').iloc[0]
first_woman_name = first_woman_row["full_name"]
first_woman_category = first_woman_row["category"]

print(f"The first woman to receive a Nobel Prize is {first_woman_name} in {first_woman_category} category")

# Identify the individuals or organizations that have won more than one
# Nobel Prize: count rows per full name and keep names seen more than once.
full_name_count = df["full_name"].value_counts()
won_more_than_once = full_name_count.gt(1)
repeat_names = full_name_count[won_more_than_once]
repeat_list = repeat_names.index.tolist()
print(repeat_names)