Skip to content

Gross Domestic Product Data

This project explores a dataset ("gdp_data.csv") on the yearly gross domestic product (GDP) of different countries and regions worldwide. The dataset runs from the 1960s to the 2010s. The GDP values are in current USD.

The country_codes.csv dataset contains country codes and thus allows filtering gdp_data.csv to include only countries and not regions, such as the European Union.

The goal of this project is to do some exploratory data analysis (EDA) and visualize GDP dynamics.

# Import necessary libraries.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FuncFormatter
# Load the yearly GDP records (countries and regions) and preview the first ten rows.
gdp_data = pd.read_csv("gdp_data.csv")
gdp_data.head(n=10)
Hidden output
# Preview the first rows recorded for Germany.
gdp_data[gdp_data['Country Name'] == "Germany"].head()

To analyze only the GDP of countries, you can use country_codes.csv to extract these rows from the dataset:

# Keep only the rows whose country code is listed in country_codes.csv.
codes = pd.read_csv("country_codes.csv", index_col=0).reset_index()

# `df` holds countries only (no regions), re-indexed from zero.
df = gdp_data.loc[gdp_data['Country Code'].isin(codes['Code'])].reset_index(drop=True)
Hidden output
# Regions are the rows whose code is absent from the country code list.
df_regions = gdp_data.loc[~gdp_data['Country Code'].isin(codes['Code'])]
df_regions['Country Name'].unique()  # The distinct region names
# Create a lineplot showing GDP values over time for a given country.

# Replace it with the name of the country you want to analyze
country = 'Spain'
df_country = df[df['Country Name'] == country]

# set_style() returns None, so its result is not bound to a name.
sns.set_style('darkgrid')
# NOTE(review): `ci` is deprecated in seaborn >= 0.12 in favour of
# `errorbar=None`; switch once the minimum seaborn version is confirmed.
g = sns.lineplot(data=df_country, x='Year', y='Value', ci=None)

# Label the y-axis in billions through a formatter instead of
# set_yticklabels(): fixed label strings are not tied to tick positions, so
# they drift (and matplotlib warns) whenever the axis is rescaled or redrawn.
# The formatter also replaces the default scientific notation.
g.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: "{0} billion".format(value / 10**9))
)
plt.ylabel('GDP')
plt.title('GDP of {0} over time'.format(country))
plt.show()
# A lineplot like above, but for a given region

# Replace with the region of interest
region = 'European Union'
df_region = df_regions[df_regions['Country Name'] == region]

# Quick lineplot showing Value against Year, drawn on a cleared figure.
plt.clf()
# NOTE(review): `ci` is deprecated in seaborn >= 0.12 in favour of
# `errorbar=None`; switch once the minimum seaborn version is confirmed.
g1 = sns.lineplot(data=df_region, x='Year', y='Value', ci=None)

# Label the y-axis in billions through a formatter instead of
# set_yticklabels(): fixed label strings are not tied to tick positions, so
# they drift (and matplotlib warns) whenever the axis is rescaled or redrawn.
# The formatter also replaces the default scientific notation.
g1.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: "{0} billion".format(value / 10**9))
)
plt.ylabel('GDP')
# Title and show() mirror the country plot so the figure is labelled and is
# actually rendered when this runs as a plain script.
plt.title('GDP of {0} over time'.format(region))
plt.show()
# Average GDP per country/region and decade.
# A decade is encoded as Year // 10, so 198 stands for the 1980s.
gdp_data['Decade'] = gdp_data['Year'].floordiv(10)

# One row per (Country Name, Decade) pair with the mean of Value.
gdp_by_decade = gdp_data.groupby(['Country Name', 'Decade'], as_index=False)['Value'].mean()

# Peek at a random selection of 100 rows.
gdp_by_decade.sample(n=100)
# Compare the GDPs of two countries by decade

# Replace with the countries of interest
country1 = 'Spain'
country2 = 'Italy'

country1_by_decade = gdp_by_decade[gdp_by_decade['Country Name'] == country1]
country2_by_decade = gdp_by_decade[gdp_by_decade['Country Name'] == country2]
# Inner merge: only decades present for both countries are plotted.
df_merged = country1_by_decade.merge(
    country2_by_decade, on='Decade', suffixes=('_country1', '_country2')
)

# Plot the countries' GDPs by decade on a cleared figure.
# Both calls draw on the current axes, so g and g1 refer to the same Axes.
plt.clf()
g = sns.lineplot(data=df_merged, x='Decade', y='Value_country1',
                 label=country1, marker='o', linestyle='--')
g1 = sns.lineplot(data=df_merged, x='Decade', y='Value_country2',
                  label=country2, marker='o', linestyle='--')

# Label the y-axis in billions through a formatter instead of
# set_yticklabels(): fixed label strings are not tied to tick positions, so
# they drift (and matplotlib warns) whenever the axis is rescaled or redrawn.
# The formatter also replaces the default scientific notation.
g1.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: "{0} billion".format(value / 10**9))
)
plt.ylabel('GDP')
plt.title('GDP of {0} vs {1} by decade'.format(country1, country2))

plt.show()
# Same comparison as above, but between one country and one region.
# Their GDPs can differ by orders of magnitude, so both series are plotted
# on a log10 scale.

# Replace with the country and region of interest
country = 'Spain'
region = 'European Union'

country_by_decade = gdp_by_decade.loc[gdp_by_decade['Country Name'] == country]
region_by_decade = gdp_by_decade.loc[gdp_by_decade['Country Name'] == region]
df_merged = pd.merge(
    country_by_decade,
    region_by_decade,
    on='Decade',
    suffixes=('_country', '_region'),
)

# Add the log10-transformed GDP columns.
df_merged = df_merged.assign(
    Value_country_log=np.log10(df_merged['Value_country']),
    Value_region_log=np.log10(df_merged['Value_region']),
)

# Draw both series on a cleared figure with a shared line style.
plt.clf()
line_style = dict(marker='o', linestyle='--')
g = sns.lineplot(data=df_merged, x='Decade', y='Value_country_log',
                 label=country, **line_style)
g1 = sns.lineplot(data=df_merged, x='Decade', y='Value_region_log',
                  label=region, **line_style)
plt.ticklabel_format(style='plain', axis='y')
plt.ylabel('log10 GDP')
plt.title('GDP of {0} vs {1} by decade'.format(country, region))
plt.show()
# Find the country with the highest percentage growth in GDP in a given decade

# The decade of interest, expressed as Year // 10 (198 -> the 1980s).
decade = 198

# A country is skipped when it has fewer than this many yearly GDP values in
# the decade, i.e. when more than 3 of the 10 years are missing.
min_years = 7

df_countries = gdp_data[gdp_data['Country Code'].isin(codes['Code'])]
# Select the decade from `Year` using the `decade` variable, so changing
# `decade` takes effect and the cell does not rely on a precomputed
# 'Decade' column.
df = df_countries[df_countries['Year'] // 10 == decade]
# sort=False keeps countries in their order of first appearance.
df_grouped = df.groupby('Country Name', sort=False)

# Create a dictionary where the keys are country names and the values are the
# percentage GDP growth between the first and last available year of the decade.
dctn = {}
for country, rows in df_grouped:  # one iteration per country, not per row
    # Order by year explicitly so "first" and "last" do not depend on file order,
    # and ignore years without a GDP value.
    gdps = rows.dropna(subset=['Value']).sort_values('Year')['Value'].to_numpy()
    if len(gdps) < min_years:
        continue
    # Percentage change: (last / first - 1) * 100. Without the "- 1" the
    # number is the final GDP as a percentage of the initial one, not growth.
    dctn[country] = round((gdps[-1] / gdps[0] - 1) * 100, 2)

# Find the country with the highest GDP growth (None when no country qualifies).
country = max(dctn, key=dctn.get) if dctn else None
dctn.get(country)

# You might want to look at the list of countries sorted by the GDP growth
lst = sorted(((value, key) for key, value in dctn.items()), reverse=True)
print(lst)

len(dctn)
# Find recession years. A year counts as a recession year if
# its GDP value is lower than the GDP value of the previous year.

# Work on an explicit copy so the new columns are added to an independent
# frame rather than to a filtered slice of gdp_data.
df_countries = gdp_data[gdp_data['Country Code'].isin(codes['Code'])].copy()

# shift(1) returns the previous ROW, so compute the lag on data ordered by
# country and year. The results are assigned back by index label, which leaves
# the row order of df_countries unchanged.
by_year = df_countries.sort_values(['Country Name', 'Year'])
grouped = by_year.groupby('Country Name')
prev_value = grouped['Value'].shift(1)
prev_year = grouped['Year'].shift(1)

# Value_lagged stores the GDP value of the previous calendar year. It is NaN
# when the country has no row for that year (first observation or a gap).
df_countries['Value_lagged'] = prev_value.where(prev_year == by_year['Year'] - 1)

# Indicator column: 1 if a given year is a recession year, otherwise 0.
# Comparisons with NaN are False, so years without a previous-year value get 0.
df_countries['Recession'] = np.where(
    df_countries['Value'] < df_countries['Value_lagged'], 1, 0
)