The Nobel Prize has been among the most prestigious international awards since 1901. Each year, awards are bestowed in chemistry, literature, physics, physiology or medicine, economics, and peace. In addition to the honor, prestige, and substantial prize money, the recipient also gets a gold medal with an image of Alfred Nobel (1833 - 1896), who established the prize.
The Nobel Foundation has made a dataset available of all prize winners from the outset of the awards, from 1901 to 2023. The dataset used in this project is from the Nobel Prize API and is available in the `nobel.csv` file in the `data` folder.
In this project, you'll get a chance to explore and answer several questions related to this prizewinning data. We then encourage you to explore further questions that interest you!
Cleaned
# Loading in required libraries
import pandas as pd
import seaborn as sns
import numpy as np
# Load the Nobel Prize dataset; the path is relative to the working directory.
nobel = pd.read_csv('data/nobel.csv')
# Keep the column index and print it so the available fields are visible
# before any analysis is written.
column_names = nobel.columns
print(column_names)
# Columns noted from the printed output: year, category, prize, motivation,
# prize_share, laureate_id, laureate_type, full_name, birth_data (possibly
# birth_date -- TODO confirm against the printed index), birth_country, sex,
# organization_name, organization_country, death_date, death_city,
# death_country
# (typing them out helps with copy/paste, but it is time-consuming)
print('\n')
print('What is the most commonly rewarded gender and birth country?')
# value_counts() orders the counts from most to least frequent, so the first
# index label is the most common value.
top_gender = nobel['sex'].value_counts().index[0]
# The question asks about the laureate's *birth* country, so count
# 'birth_country' rather than 'organization_country'.
top_country = nobel['birth_country'].value_counts().index[0]
print([top_gender], [top_country])
print(f'The most commonly rewarded gender is {top_gender}, and the country is the {top_country}.')
# Flag laureates whose recorded birth country is the United States.
nobel['us_winners'] = nobel['birth_country'].eq('United States of America')
# Bucket each award year into the decade it belongs to (e.g. 1987 -> 1980).
nobel['decade'] = np.floor(nobel['year'] / 10).astype(int) * 10
# The mean of a boolean column is the fraction of True values, i.e. the
# share of US-born laureates within each decade.
prop_usa_winners = nobel.groupby('decade')['us_winners'].mean().reset_index()
print('\n')
print('Which decade has the highest ratio of US-born Nobel Prize Winners?')
# idxmax() yields the row label of the largest share; read that row's decade.
top_row_label = prop_usa_winners['us_winners'].idxmax()
max_decade_us = prop_usa_winners.loc[top_row_label, 'decade']
print('The decade with the highest ratio of US-born Nobel Prize winners is', max_decade_us)
# Optional: line plot of the US-born share across decades.
ax1 = sns.relplot(data=prop_usa_winners, x='decade', y='us_winners', kind='line')
print('\n')
print('Which decade and Nobel Prize category combo had the highest proportion of female laureates?')
# Flag female laureates with a boolean column.
nobel['f_winners'] = nobel['sex'] == 'Female'
# Proportion of female laureates for every (decade, category) pair; the mean
# of a boolean column is the fraction of True values.
prop_f_winners = nobel.groupby(['decade', 'category'], as_index=False)['f_winners'].mean()
# Keep the row(s) whose proportion equals the overall maximum. A tie would
# leave several rows here; the first one is the one reported below.
max_female = prop_f_winners[prop_f_winners['f_winners'] == prop_f_winners['f_winners'].max()]
print('The decade with the highest amount of female winners is', max_female['decade'].values[0], "'s")
# Save the winning combination as {decade: category}.
max_female_dict = {max_female['decade'].values[0]: max_female['category'].values[0]}
# Optional graph: decade on the x-axis with one line per category, so the
# plot shows how the female share changes over time. Plotting x='category'
# collapsed all decades of a category into a single averaged point.
ax2 = sns.relplot(x='decade', y='f_winners', hue='category', data=prop_f_winners, kind='line')
print('\n')
print('Who was the first woman to receive a Nobel Prize and what category?')
# Restrict to female laureates, then pick the row with the smallest year.
# idxmin() returns the label of the first occurrence of the minimum, so the
# result is deterministic even if several women share the earliest year, and
# it avoids sorting the whole frame just to read one row.
female_laureates = nobel[nobel['sex'] == 'Female']
earliest_f = female_laureates.loc[female_laureates['year'].idxmin()]
first_woman_name = earliest_f['full_name']
first_woman_category = earliest_f['category']
print('The first woman to receive a Nobel Prize was', first_woman_name, 'in the category of', first_woman_category)
print('\n')
print('Which individuals or organizations have won more than one Nobel Prize throughout the years?')
# Tally the prizes per name, then keep the names that appear at least twice.
counts = nobel['full_name'].value_counts()
repeats = counts.loc[counts.ge(2)].index.tolist()
# Separate list object holding the same names, used for the final report.
repeat_list = repeats.copy()
print('Individuals or organizations that have won more than one Nobel Prize:', repeat_list)
Saved so my code stops getting edited
# Loading in required libraries
import pandas as pd
import seaborn as sns
import numpy as np
# Start coding here!
# (Hints were needed to get going -- it felt like looking at a blank sheet.)
# Read the csv. The file path is passed as a quoted string, relative to the
# working directory.
nobel = pd.read_csv('data/nobel.csv')
# Peek at the first rows to get a feel for the data.
print(nobel.head())
# head() revealed columns such as year, category, death city and death
# country; listing nobel.columns is a better way to see every column name.
column_names = nobel.columns
print(column_names)
# Columns noted from the printed output: year, category, prize, motivation,
# prize_share, laureate_id, laureate_type, full_name, birth_data (possibly
# birth_date -- TODO confirm against the printed index), birth_country, sex,
# organization_name, organization_country, death_date, death_city,
# death_country
# (typing them out helps with copy/paste, but it is time-consuming)
print('\n')
print('What is the most commonly rewarded gender and birth country?')
# Find the top gender and country.
# Why index[0]? value_counts() orders the counts from most to least frequent,
# so the first index label is the most common value.
# value_counts() is called on the column itself: DataFrame.value_counts()
# expects column labels in `subset`, not a Series.
top_gender = nobel['sex'].value_counts().index[0]
# The question asks about the laureate's *birth* country, so count
# 'birth_country' rather than 'organization_country'.
top_country = nobel['birth_country'].value_counts().index[0]
print([top_gender], [top_country])
print(f'The most commonly rewarded gender is {top_gender}, and the country is the {top_country}.')
# Create the US-born winners column (True where the birth country is the USA).
nobel['us_winners'] = nobel['birth_country'] == 'United States of America'
# Create the decade column: round each year down to the start of its decade.
nobel['decade'] = (np.floor(nobel['year'] / 10) * 10).astype(int)
# The mean of a boolean column is the share of True values, which gives the
# ratio of US-born winners per decade.
prop_usa_winners = nobel.groupby('decade', as_index=False)['us_winners'].mean()
print('\n')
print('Which decade has the highest ratio of US-born Nobel Prize Winners?')
# Notes from earlier attempts:
#   * nobel['decade'].values[0] only reads the decade of the first row.
#   * prop_usa_winners[prop_usa_winners['us_winners'] ==
#     prop_usa_winners['us_winners'].max()['decade'].values[0]] failed because
#     the closing bracket was misplaced: ['decade'] was applied to the scalar
#     returned by .max() instead of to the filtered DataFrame.
# idxmax() gives the row label of the highest ratio; .loc reads its decade.
max_decade_us = prop_usa_winners.loc[prop_usa_winners['us_winners'].idxmax(), 'decade']
# No '+' in front of the value: print() receives it as its own argument, and
# the stray unary plus only worked because the value happens to be numeric.
print('The decade with the highest ratio of US-born Nobel Prize winners is', max_decade_us)
# Optional: make a plot.
ax1 = sns.relplot(x='decade', y='us_winners', data=prop_usa_winners, kind='line')
print('\n')
print('Which decade and Nobel Prize category combo had the highest proportion of female laureates?')
# Earlier attempt: nobel.groupby('decade', as_index=False)['sex' == 'Female']
# was wrong -- 'sex' == 'Female' compares two string literals instead of the
# column, so a boolean column has to be created first.
nobel['f_winners'] = nobel['sex'] == 'Female'
# Find the ratio of female winners for every (decade, category) pair.
prop_f_winners = nobel.groupby(['decade', 'category'], as_index=False)['f_winners'].mean()
# Keep the row(s) whose ratio equals the overall maximum.
max_female = prop_f_winners[prop_f_winners['f_winners'] == prop_f_winners['f_winners'].max()]
# Earlier attempt printed prop_f_winners['decade'].max(), which is simply the
# latest decade in the table, not the decade of the highest ratio.
print('The decade with the highest amount of female winners is', max_female['decade'].values[0], "'s")
# Save to a dictionary as {decade: category}.
max_female_dict = {max_female['decade'].values[0]: max_female['category'].values[0]}
# Plotting notes from earlier attempts:
#   * sns.relplot does not support kind='hist'; histograms use sns.histplot.
#   * Plotting max_female_dict converted to a DataFrame came up with no graph
#     (the dict holds only the single winning decade/category pair).
#   * kind= does not have to come last; in the failed call it had been placed
#     inside pd.DataFrame(...) instead of being passed to sns.relplot.
# Optional graph: decade on the x-axis with one line per category, so the
# plot shows how the female share changes over time. Plotting x='category'
# collapsed all decades of a category into a single averaged point.
ax2 = sns.relplot(x='decade', y='f_winners', hue='category', data=prop_f_winners, kind='line')
print('\n')
print('Who was the first woman to receive a Nobel Prize and what category?')
# Filter the full dataframe; do not use the max_female result or dictionary.
# Earlier attempt: max_female[max_female['decade']] raised a KeyError because
# it passed the decade *values* as if they were column names to select.
# Restrict to female laureates, then pick the row with the smallest year.
# idxmin() returns the label of the first occurrence of the minimum, so the
# result is deterministic even if several women share the earliest year, and
# it avoids sorting the whole frame just to read one row.
female_laureates = nobel[nobel['sex'] == 'Female']
earliest_f = female_laureates.loc[female_laureates['year'].idxmin()]
first_woman_name = earliest_f['full_name']
first_woman_category = earliest_f['category']
print('The first woman to receive a Nobel Prize was', first_woman_name, 'in the category of', first_woman_category)
print('\n')
print('Which individuals or organizations have won more than one Nobel Prize throughout the years?')
# Notes from the earlier attempt: nobel.value_counts(nobel['prize']).index[]
# is invalid Python (an empty subscript). The working approach counts how
# often each 'full_name' occurs and keeps the names seen more than once.
# Count occurrences per name, then keep those with 2 or more.
counts = nobel['full_name'].value_counts()
won_multiple = counts.ge(2)
repeats = counts.loc[won_multiple].index.tolist()
# Separate list object holding the same names, used for the final report.
repeat_list = repeats.copy()
print('Individuals or organizations that have won more than one Nobel Prize:', repeat_list)
#Value Counts Usage:
#Changed nobel.value_counts(nobel['sex']).index[0] to nobel['sex'].value_counts().index[0].
#Changed nobel.value_counts(nobel['organization_country']).index[0] to nobel['organization_country'].value_counts().index[0].
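#DataFrame.value_counts() also exists, but its subset argument expects column labels (e.g. nobel.value_counts('sex')), not a Series; for a single column, call value_counts() on the Series itself.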
#Finding Maximum Decade:
#Fixed the line prop_usa_winners['us_winners'].max()['decade'].values[0]] to max_decade_us = prop_usa_winners.loc[prop_usa_winners['us_winners'].idxmax(), 'decade'].
#The previous code was trying to access the maximum value incorrectly. The correct approach is to use idxmax() to find the index of the maximum value and then use loc to get the corresponding decade.
#Optional Graph for Female Winners:
#Changed the plotting line to ax2 = sns.relplot(x='category', y='f_winners', data=prop_f_winners, kind='line').
#The previous code was trying to plot a dictionary directly, which is not suitable for sns.relplot. Instead, we plot the prop_f_winners DataFrame directly.
Trial and Error
# Loading in required libraries
import pandas as pd
import seaborn as sns
import numpy as np
# Start coding here!
# Hints were needed for this one; starting out felt like facing a blank sheet.
# Read the csv -- the file path has to be passed as a quoted string.
nobel = pd.read_csv('data/nobel.csv')
# Preview the first rows to check which columns are available.
print(nobel.head())
# Boolean flag: was the laureate born in the United States?
nobel['us_winners'] = nobel['birth_country'].eq('United States of America')
# Decade of each award year (e.g. 1987 -> 1980).
nobel['decade'] = np.floor(nobel['year'] / 10).astype(int) * 10
# Averaging the boolean flag per decade gives the ratio of US-born winners.
prop_usa_winners = nobel.groupby('decade')['us_winners'].mean().reset_index()
print('\n')
print('Which decade has the highest ratio of US-born Nobel Prize Winners?')
# Select the decade(s) whose ratio equals the maximum and store the first one.
highest_ratio = prop_usa_winners['us_winners'].max()
is_top_decade = prop_usa_winners['us_winners'] == highest_ratio
max_decade_usa = prop_usa_winners.loc[is_top_decade, 'decade'].values[0]
print('The decade with the highest ratio of US-born Nobel Prize winners is', max_decade_usa)
# Optional: line plot of the ratio across decades.
ax1 = sns.relplot(data=prop_usa_winners, x='decade', y='us_winners', kind='line')
print('\n')
print('Which decade and Nobel Prize category combo had the highest proportion of female laureates?')
# Boolean flag marking female laureates.
nobel['f_winners'] = nobel['sex'].eq('Female')
# Ratio of female winners for each (decade, category) pair.
prop_f_winners = nobel.groupby(['decade', 'category'])['f_winners'].mean().reset_index()
# Rows whose ratio equals the overall maximum.
top_share = prop_f_winners['f_winners'].max()
max_female = prop_f_winners.loc[prop_f_winners['f_winners'] == top_share]
# Take the first matching row's decade and category.
top_decade = max_female['decade'].values[0]
top_category = max_female['category'].values[0]
print('The decade with the highest amount of female winners is', top_decade, "'s")
# Store the result as {decade: category}.
max_female_dict = {top_decade: top_category}
print('\n')
print('Who was the first woman to receive a Nobel Prize and what category?')
# Filter the full dataframe (not the max_female dictionary from above).
# Restrict to female laureates, then pick the row with the smallest year.
# idxmin() returns the label of the first occurrence of the minimum, so the
# result is deterministic even if several women share the earliest year, and
# it avoids sorting the whole frame just to read one row.
female_laureates = nobel[nobel['sex'] == 'Female']
earliest_f = female_laureates.loc[female_laureates['year'].idxmin()]
first_woman_name = earliest_f['full_name']
first_woman_category = earliest_f['category']
print('The first woman to receive a Nobel Prize was', first_woman_name, 'in the category of', first_woman_category)
print('\n')
# Heading spelling fixed ("throughout"), matching the other drafts of this
# script.
print('Which individuals or organizations have won more than one Nobel Prize throughout the years?')
# Count how often each name occurs. The counts get their own name so that
# `repeats` is not first a Series and then a list.
counts = nobel['full_name'].value_counts()
# Names that occur more than once, i.e. two or more prizes.
repeats = counts[counts > 1].index.tolist()
# Also expose the result as `repeat_list`, the name used by the other drafts.
repeat_list = list(repeats)
print('Individuals or organizations that have won more than one Nobel Prize:', repeat_list)