Skip to content
New Workbook
Sign up
Nobel Prize Winners - Project: Visualizing the History

This is an adapted version of a guided DataCamp project, "Visualizing the History of Nobel Prize Winners" by Jasmin Ludolf.

The Nobel Prize has been among the most prestigious international awards since 1901. Each year, awards are bestowed in the following categories:

  • physiology or medicine,
  • chemistry,
  • physics,
  • literature,
  • economics, and
  • peace.

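In his will, Alfred Nobel specified that the bulk of his fortune should be divided into five parts and used for prizes in physics, chemistry, physiology or medicine, literature and peace (the economics prize was added later, in 1968, and is not part of the will), to be awarded to: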

“those who, during the preceding year, shall have conferred the greatest benefit to humankind.” (link)

In addition to the honor, prestige, and substantial prize money, the recipient also gets a gold medal with an image of Alfred Nobel (1833 - 1896), who established the prize in his famous will.

Sources:

  • The Nobel Foundation has made available a dataset of all prize winners, from the first awards in 1901 through 2023. The dataset used in this project comes from the Nobel Prize API and is available as the nobel.csv file in the data folder.
  • In this project, you'll get a chance to explore and answer several questions related to this prizewinning data. And we encourage you then to explore further questions that you're interested in!

Code

  • I begin by loading the required modules and libraries
  • reading the source file
  • previewing the data
# loading in required libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import re

# standardise appearance of visualisations
# sns.set_palette("tab10")
# mpl.rcParams['axes.titleweight'] = 'bold'
# mpl.rcParams['figure.titleweight'] = 'bold'
# mpl.rcParams['font.weight'] = 'regular'
# mpl.rcParams['axes.labelweight'] = 'regular'
# mpl.rcParams['axes.titlesize'] = 12
# mpl.rcParams['axes.labelsize'] = 10
# sns.set_style("ticks")

# read the laureate records from the project's data folder
nobel = pd.read_csv('data/nobel.csv')

# quick look at the first rows, then the column dtypes and non-null counts
nobel.head()
nobel.info()

# parse both date columns; values that cannot be parsed are coerced to NaT
nobel = nobel.assign(
    birth_date=pd.to_datetime(nobel['birth_date'], errors='coerce'),
    death_date=pd.to_datetime(nobel['death_date'], errors='coerce'),
)

# import module 
from datetime import datetime

# birth dates to apply for specific laureates, keyed by full name
birth_date_overrides = {
    'Venkatraman Ramakrishnan': '1952-04-05',
    'Saul Perlmutter': '1959-09-22',
    'Nadia Murad': '1993-03-10',
    'Paul M. Romer': '1955-11-06',
    'Michael Houghton': '1949-06-10',
    'Ardem Patapoutian': '1967-10-02',
    'Abdulrazak Gurnah': '1948-12-20',
    'Dmitry Muratov': '1961-11-30',
    'David Card': '1956-04-24',
    'Morten Meldal': '1954-01-16',
    'Moungi Bawendi': '1961-03-15',
    'Louis Brus': '1943-08-10',
    'Aleksey Yekimov': '1945-02-28',
    'Claudia Goldin': '1946-05-14',
}

# look up an override for every row (NaN when the name is not listed) and
# fall back to the existing birth date wherever no override applies
override_dates = nobel['full_name'].map(birth_date_overrides)
nobel['birth_date'] = override_dates.combine_first(nobel['birth_date'])

# the combined column mixes strings and timestamps, so parse it once more
# to make sure every birth date is a datetime value
nobel['birth_date'] = pd.to_datetime(nobel['birth_date'], errors='coerce')

# estimated age at the prize, assuming it is awarded on 31 Dec of the prize
# year: by that date the laureate's birthday in the prize year has already
# passed, so the age is exactly the difference between the two calendar years.
# (Floor-dividing a day count by 365 ignores leap days and overstates the age
# by one year for laureates born early in January.)
# rows without a birth date get NaN
nobel['age_at_prize_est'] = nobel['year'] - nobel['birth_date'].dt.year

# years lived after the prize: counted up to the year of death, or up to the
# current year when no death date is recorded; rows where 'sex' is null are
# left as NaN
nobel['post_prize_years'] = (
    nobel['death_date'].dt.year.fillna(datetime.now().year) - nobel['year']
).where(nobel['sex'].notna())

# use Australian spelling for the organisation columns to minimise typing errors
nobel = nobel.rename(columns={
    'organization_name': 'organisation_name',
    'organization_city': 'organisation_city',
    'organization_country': 'organisation_country',
})

# column layout used from here on: prize and laureate details first, then the
# organisation, birth and death columns
column_order = [
    'year', 'category', 'laureate_id',
    'full_name', 'sex', 'age_at_prize_est', 'post_prize_years',
    'prize_share', 'laureate_type',
    'organisation_name', 'organisation_city', 'organisation_country',
    'birth_date', 'birth_city', 'birth_country',
    'death_date', 'death_city', 'death_country',
]
nobel = nobel[column_order]
nobel.head()
# rows with no recorded sex form the organisation subset; all others are
# treated as individual laureates
sex_missing = nobel['sex'].isnull()

# tally of laureate types (shown when run as a notebook cell)
nobel.laureate_type.value_counts()

# disabled step: change full name to keep only contents of parentheses
# nobel_orgs['full_name'] = nobel_orgs['full_name'].apply(lambda x: re.search(r'\((.*?)\)', x).group(1) if re.search(r'\((.*?)\)', x) else x)

# organisation subset, reduced to the columns of interest
cols = ['full_name', 'category', 'laureate_id', 'prize_share']
nobel_orgs = nobel.loc[sex_missing, cols]

# display subset
nobel_orgs

# individual subset: every row where sex is recorded
nobel_individuals = nobel.loc[~sex_missing]

# display subset
nobel_individuals
 import re

# function to clean and replace country names
# Captures the text inside the first pair of parentheses, e.g. the
# "Germany" in "Prussia (Germany)".
_PARENTHESISED_TEXT = re.compile(r'\((.*?)\)')

# Country names as they appear after the parenthesised part has been
# extracted, mapped to the name used throughout the analysis. Built once at
# module level because the function is applied to every row. Keys that
# themselves contain parentheses would never be looked up (the parenthesised
# text is extracted first), so none are listed.
_COUNTRY_REPLACEMENTS = {
    'East Germany': 'Germany',
    'Federal Republic of Germany': 'Germany',
    'then Germany, now France': 'France',
    'now Tunisia': 'Tunisia',
    'now Democratic Republic of the Congo': 'Democratic Republic of the Congo',
    "People's Republic of China": 'China',
    'Republic of Macedonia': 'Macedonia',
    'Czech Republic': 'Czechia',  # manual scan shows cities as Prague
    'Czechoslovakia': 'Czechia',  # manual scan shows cities as Prague
    'USA': 'United States',
    'United States of America': 'United States',
    'now Russia': 'Russia',
    'Union of Soviet Socialist Republics': 'Russia',  # manual scan shows cities as Moscow
}


def clean_and_replace_country_name(country):
    """Return a normalised country name.

    If the value contains parentheses, the text inside the first pair is
    used as the country (e.g. "Prussia (Germany)" becomes "Germany"). The
    result is then stripped of surrounding whitespace and passed through a
    table of known replacements.

    Args:
        country: The raw value from a country column. Anything that is not
            a string (such as NaN for a missing value) is returned
            unchanged.

    Returns:
        The replacement name when one is defined, otherwise the cleaned
        input string; non-string input is returned as-is.
    """
    # missing values and other non-string cells pass straight through
    if not isinstance(country, str):
        return country

    match = _PARENTHESISED_TEXT.search(country)
    if match:
        country = match.group(1)

    # surrounding whitespace would otherwise defeat the exact-match lookup
    country = country.strip()
    return _COUNTRY_REPLACEMENTS.get(country, country)

# normalise every country column with the same cleaning function
for country_column in ('birth_country', 'death_country', 'organisation_country'):
    nobel[country_column] = nobel[country_column].apply(clean_and_replace_country_name)

print('preview changes to country:')
# summary of the cleaned birth and organisation country columns, one row per column
country_preview = nobel[['birth_country', 'organisation_country']]
country_preview.describe().T
# youngest and oldest winner in each category, by estimated age at the prize
age_by_category = nobel.groupby('category')['age_at_prize_est']
youngest_winners = nobel.loc[age_by_category.idxmin()]
oldest_winners = nobel.loc[age_by_category.idxmax()]

# winner with the most post prize years in each category
longest_post_prize_idx = nobel.groupby('category')['post_prize_years'].idxmax()
max_post_prize_years_winners = nobel.loc[longest_post_prize_idx]

# winner with the fewest post prize years in each category, restricted to
# rows that have a death date
deceased = nobel[nobel['death_date'].notnull()]
shortest_post_prize_idx = deceased.groupby('category')['post_prize_years'].idxmin()
min_post_prize_years_winners = deceased.loc[shortest_post_prize_idx]

# stack the four selections and order them by category, then age
combined_winners = pd.concat([
    youngest_winners,
    oldest_winners,
    max_post_prize_years_winners,
    min_post_prize_years_winners,
]).sort_values(by=['category', 'age_at_prize_est'])

summary_columns = ['category', 'full_name', 'sex', 'birth_country', 'organisation_country',
                   'year', 'age_at_prize_est', 'post_prize_years']
combined_winners[summary_columns]
# Subset of laureates born on or after 1 January 1970
# NOTE(review): the variable name says 1980, but the cutoff actually applied
# is 1970-01-01 — confirm which year is intended
born_on_or_after_cutoff = nobel['birth_date'] >= '1970-01-01'
nobel_born_after_1980 = nobel.loc[born_on_or_after_cutoff]
nobel_born_after_1980

# names of individual laureates whose birth date is missing
birth_date_missing = nobel_individuals['birth_date'].isna()
nobel_individuals.loc[birth_date_missing, 'full_name'].tolist()
import matplotlib.pyplot as plt
import seaborn as sns

# one wide figure holding both layers of the chart
age_fig, age_ax = plt.subplots(figsize=(15, 8))

# order the categories from lowest to highest median age
median_age = nobel.groupby('category')['age_at_prize_est'].median().sort_values()
sorted_categories = median_age.index

# arguments shared by the box layer and the point layer
shared_plot_args = {
    'data': nobel,
    'x': 'age_at_prize_est',
    'y': 'category',
    'order': sorted_categories,
}

# grey boxes for the distribution, with the individual laureates drawn on top
sns.boxplot(color='gray', ax=age_ax, **shared_plot_args)
sns.swarmplot(color='blue', alpha=0.5, ax=age_ax, **shared_plot_args)

age_ax.set_title('distribution of age for each category')
age_ax.set_xlabel('age at prize')
age_ax.set_ylabel('category')
age_ax.set_xlim(15, 100)
plt.tight_layout()
plt.show()
# laureates whose birth country and organisation country are both known and
# differ from each other
both_countries_known = nobel['birth_country'].notna() & nobel['organisation_country'].notna()
countries_differ = nobel['birth_country'] != nobel['organisation_country']

# most recent prizes first, categories in alphabetical order within a year
nobel_expats = nobel[both_countries_known & countries_differ].sort_values(
    by=['year', 'category'],
    ascending=[False, True],
)

# keep only the identifying, birth and organisation columns
expat_columns = [
    'year', 'category', 'laureate_id', 'full_name', 'sex', 'laureate_type',
    'birth_date', 'birth_country',
    'organisation_name', 'organisation_city', 'organisation_country',
]
nobel_expats = nobel_expats[expat_columns]

# number of such laureates per birth country
nobel_expats[['birth_country']].value_counts().reset_index(name='exports')
import plotly.express as px

# Count the rows of nobel_expats per birth country and give the two
# resulting columns explicit names
birth_country_counts = nobel_expats['birth_country'].value_counts().reset_index()
birth_country_counts.columns = ['birth_country', 'exports']

# Attach the counts to the world table; the left join keeps every row of
# `world`, so rows whose name has no match in the counts get NaN in 'exports'
# NOTE(review): `world` is not defined in the visible part of this file —
# presumably a country-level table with 'name' and 'iso_a3' columns created
# in another cell; confirm it exists before this point
# NOTE(review): `world` is rebound to the merged result, so running this step
# twice would merge the counts in a second time — verify
world = world.merge(birth_country_counts, how='left', left_on='name', right_on='birth_country')

# Draw a choropleth coloured by the 'exports' count, locating each row by
# its 'iso_a3' value
# NOTE(review): the title and labels describe all laureates by birth country,
# but the counts come from nobel_expats only — confirm the intended wording
fig = px.choropleth(world, 
                    locations="iso_a3",
                    color="exports",
                    hover_name="name",
                    color_continuous_scale=px.colors.sequential.OrRd,
                    labels={'exports': 'Number of Nobel Laureates Born'},
                    title='Number of Nobel Laureates by Birth Country')

# Show black coastlines and draw land in white
fig.update_geos(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="white")
# NOTE(review): the colour scale here is continuous, so this legend title may
# have no visible effect — confirm
fig.update_layout(legend_title_text='Number of Nobel Laureates Born')

fig.show()