Skip to content
Exploratory Data Analysis and Visualization: Gross Domestic Product Data
Gross Domestic Product Data
This project explores a dataset ("gdp_data.csv") on the yearly gross domestic product (GDP) of different countries and regions worldwide. The dataset runs from the 1960s to the 2010s. The GDP values are in current USD.
The country_codes.csv dataset contains country codes and thus allows filtering gdp_data.csv to include only countries and not regions, such as the European Union.
The goal of this project is to do some exploratory data analysis (EDA) and visualize GDP dynamics.
# Import necessary libraries.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FuncFormatter
# Load the GDP dataset; index_col=None means no CSV column is used as the
# row index, so every column stays a regular column.
gdp_data = pd.read_csv(
    "gdp_data.csv",
    index_col=None,
)
# Preview the first ten rows.
gdp_data.head(n=10)
Hidden output
# Spot-check a single country: select the rows for Germany and preview them.
germany_rows = gdp_data.query('`Country Name` == "Germany"')
germany_rows.head()
To analyze only the GDP of countries, you can use country_codes.csv
to extract these rows from the dataset:
# Keep only the rows of gdp_data that belong to actual countries.
# country_codes.csv is read with its first column as the index and the index
# is then reset, so that column ends up as a regular column again.
codes = pd.read_csv("country_codes.csv", index_col=0).reset_index()
is_country = gdp_data['Country Code'].isin(codes['Code'])
# Renumber the surviving rows from 0 so the index has no gaps.
df = gdp_data.loc[is_country].reset_index(drop=True)
Hidden output
# Rows whose code is NOT listed in country_codes.csv are treated as regions.
is_region = ~gdp_data['Country Code'].isin(codes['Code'])
df_regions = gdp_data.loc[is_region]
# Distinct region names.
df_regions['Country Name'].unique()
# Create a line plot showing GDP values over time for a given country.
# Replace it with the name of the country you want to analyze.
country = 'Spain'
df_country = df[df['Country Name'] == country]

# set_style() is called for its side effect only; it returns None, so its
# result is no longer assigned to the axes variable.
sns.set_style('darkgrid')
g = sns.lineplot(data=df_country, x='Year', y='Value', ci=None)

# Label the y-axis in billions with a formatter function instead of
# set_yticklabels(): fixed label strings are not recomputed when the tick
# locations change (resize, zoom, new limits) and would then show wrong
# values. The formatter also replaces the default scientific notation, so a
# separate plt.ticklabel_format() call is not needed.
g.yaxis.set_major_formatter(
    FuncFormatter(lambda y, pos: "{0} billion".format(y / 10**9))
)
plt.ylabel('GDP')
plt.title('GDP of {0} over time'.format(country))
plt.show()
# A line plot like the country plot, but for a given region.
# Replace with the region of interest.
region = 'European Union'
df_region = df_regions[df_regions['Country Name'] == region]

# Clear the current figure, then draw Value against Year.
plt.clf()
g1 = sns.lineplot(data=df_region, x='Year', y='Value', ci=None)

# Label the y-axis in billions with a formatter function instead of
# set_yticklabels(): fixed label strings are not recomputed when the tick
# locations change and would then show wrong values. The formatter also
# replaces the default scientific notation.
g1.yaxis.set_major_formatter(
    FuncFormatter(lambda y, pos: "{0} billion".format(y / 10**9))
)
plt.ylabel('GDP')
# Title and show() mirror the other plots in this file; without show() the
# figure is never displayed when the code runs as a plain script, because the
# next plt.clf() clears it.
plt.title('GDP of {0} over time'.format(region))
plt.show()
# Average GDP per name (country or region) and decade.
# Floor-dividing the year by 10 gives the decade key, e.g. 1985 -> 198.
gdp_data['Decade'] = gdp_data['Year'].floordiv(10)
gdp_by_decade = (
    gdp_data
    .groupby(['Country Name', 'Decade'])['Value']
    .mean()
    .reset_index()  # turn the group keys back into regular columns
)
# Eyeball a random sample of 100 rows of the aggregated table.
gdp_by_decade.sample(n=100)
# Compare the GDPs of two countries by decade.
# Replace with the countries of interest.
country1 = 'Spain'
country2 = 'Italy'
country1_by_decade = gdp_by_decade[gdp_by_decade['Country Name'] == country1]
country2_by_decade = gdp_by_decade[gdp_by_decade['Country Name'] == country2]
# Put the two countries side by side, matched on Decade; the suffixes tell
# the two 'Value' (and 'Country Name') columns apart.
df_merged = country1_by_decade.merge(
    country2_by_decade, on='Decade', suffixes=('_country1', '_country2')
)

# Plot the countries' GDPs by decade.
plt.clf()
g = sns.lineplot(data=df_merged, x='Decade', y='Value_country1',
                 label=country1, marker='o', linestyle='--')
g1 = sns.lineplot(data=df_merged, x='Decade', y='Value_country2',
                  label=country2, marker='o', linestyle='--')

# Label the y-axis in billions with a formatter function instead of
# set_yticklabels(): fixed label strings are not recomputed when the tick
# locations change and would then show wrong values. The formatter also
# replaces the default scientific notation.
g1.yaxis.set_major_formatter(
    FuncFormatter(lambda y, pos: "{0} billion".format(y / 10**9))
)
plt.ylabel('GDP')
plt.title('GDP of {0} vs {1} by decade'.format(country1, country2))
plt.show()
# Same kind of comparison, but between a country and a region. Their GDPs can
# differ by orders of magnitude, so both series are plotted as log10 values.
# Replace with the country and region of interest.
country = 'Spain'
region = 'European Union'

name_column = gdp_by_decade['Country Name']
country_by_decade = gdp_by_decade[name_column == country]
region_by_decade = gdp_by_decade[name_column == region]

# Match the two series on Decade, then add the log10-transformed columns.
df_merged = country_by_decade.merge(
    region_by_decade,
    on='Decade',
    suffixes=('_country', '_region'),
)
df_merged['Value_country_log'] = np.log10(df_merged['Value_country'])
df_merged['Value_region_log'] = np.log10(df_merged['Value_region'])

plt.clf()
line_look = {'marker': 'o', 'linestyle': '--'}  # shared look for both lines
g = sns.lineplot(data=df_merged, x='Decade', y='Value_country_log',
                 label=country, **line_look)
g1 = sns.lineplot(data=df_merged, x='Decade', y='Value_region_log',
                  label=region, **line_look)
plt.ticklabel_format(style='plain', axis='y')  # no scientific notation
plt.ylabel('log10 GDP')
plt.title('GDP of {0} vs {1} by decade'.format(country, region))
plt.show()
# Find the country with the highest percentage growth in GDP in a given decade.
# The decade of interest, expressed as Year // 10 (198 covers 1980-1989).
decade = 198
df_countries = gdp_data[gdp_data['Country Code'].isin(codes['Code'])]
# Reference the variable with @ so that changing `decade` above really changes
# the selection; the value was previously hard-coded in the query string.
df = df_countries.query('Decade == @decade')
# Sort by Year before grouping so that, within each country, the first and
# last rows are the earliest and latest available years of the decade.
df_grouped = df.sort_values('Year').groupby('Country Name')[['Country Name', 'Year', 'Value']]

# Create a dictionary where the keys are country names and the values are the
# GDP change over the decade.
dctn = {}
# unique() visits each country once; iterating the raw column would repeat
# the same computation for every yearly row of a country.
for country in df['Country Name'].unique():
    df_country = df_grouped.get_group(country)
    gdps = df_country['Value'].values
    if len(gdps) < 7:
        # A country with fewer than 7 yearly rows in the decade (i.e. more
        # than 3 years missing) is not added to the dictionary.
        continue
    # NOTE: this is the last available year's GDP as a percentage of the first
    # available year's GDP (100 means unchanged); subtract 100 for the net
    # growth. The ranking of countries is the same either way.
    gdp_growth_perc = round(gdps[-1] / gdps[0] * 100, 2)
    dctn[country] = gdp_growth_perc

# Find the country with the highest GDP growth.
country = max(dctn, key=dctn.get)
dctn[country]
# Ranking of the countries by their decade GDP figure, highest first.
# Each entry is a (value, country) tuple so the sort orders by value first.
lst = sorted(
    ((value, key) for key, value in dctn.items()),
    reverse=True,
)
print(lst)
# Number of countries that made it into the ranking.
len(dctn)
# Find recession years. A year counts as a recession year if
# its GDP value is lower than the GDP value of the previous year.
# .copy() yields an independent frame, so adding columns below does not raise
# pandas' SettingWithCopyWarning for writing to a filtered slice of gdp_data.
df_countries = gdp_data[gdp_data['Country Code'].isin(codes['Code'])].copy()

# Work on a view sorted by country and year so that shift(1) really returns
# the chronologically previous row of the same country. The results are
# assigned back by index label, so the row order of df_countries is unchanged
# (this relies on the index labels being unique).
ordered = df_countries.sort_values(['Country Name', 'Year'])
by_country = ordered.groupby('Country Name')
previous_value = by_country['Value'].shift(1)
previous_year = by_country['Year'].shift(1)

# Value_lagged stores the GDP value of the previous year. It is kept only when
# the previous row is exactly one year earlier; if a country has no data for
# the previous year (first row or a gap), the value is NaN.
df_countries['Value_lagged'] = previous_value.where(
    previous_year == ordered['Year'] - 1
)

# Indicator column: 1 if a given year is a recession year, otherwise 0.
# Comparisons against NaN are False, so a year without a previous-year value
# is also assigned 0.
df_countries['Recession'] = np.where(
    df_countries['Value'] < df_countries['Value_lagged'], 1, 0
)