Skip to content

I downloaded a dataset of Gross Domestic Product (GDP) by country. I identified recessions based on GDP criteria and used bivariate regressions and a moving heatmap to provide a nuanced understanding of global economic landscapes over time. Using linear regression, I built predictive models whose leading indicators point towards sustained GDP growth for China and the USA, while Russia displays a steady decline in the same regard. It is important to note that Russia does not adhere to the same economic reporting standards. I calculated coefficients, p-values, and R-squared values, contributing to a comprehensive understanding of the relationships between time and GDP. No statistically significant relationship was identified between the USA and China, which was expected. I wanted to calculate p-values with the statsmodels API in Python because I had only used R to find statistical significance in the past. This was a rewarding independent exploration of international economic data where I:

  1. Utilized Plotly Express to create interactive and dynamic visualizations of GDP trends.
  2. Explored time series plots to visualize the change in GDP over the years for specific countries.
  3. Proposed challenges for further exploration, such as identifying recessions, creating specific country visualizations, and calculating the highest percentage growth in GDP.
  4. Utilized statistical libraries for bivariate regression analysis within selected countries.
  5. Used linear regression analysis to explore relationships between variables using the Statsmodels library. This opens avenues for further predictive modeling and in-depth exploration of global economic data.
  6. Created a moving heat map for visualizing changing GDP trends over time.

Python analysis involved data loading, exploration, and dynamic visualization. Challenges and future exploration ideas point towards a deeper understanding of GDP trends, statistical analysis, and predictive modeling.

Source and license of dataset.

import pandas as pd
# Load the raw GDP table; index_col=None keeps pandas' default integer index.
gdp_data = pd.read_csv("gdp_data.csv", index_col=None)
# head() has to be called: printing the bare attribute shows the bound-method
# repr rather than the first five rows.
print(gdp_data.head())
# Bare expression: only rendered when it is the last line of a notebook cell.
gdp_data.head(100)
# Country-code lookup table, indexed by its first column.
codes = pd.read_csv("country_codes.csv", index_col=0)
codes
import pandas as pd
# Load the raw GDP table; index_col=None keeps pandas' default integer index.
gdp_data = pd.read_csv("gdp_data.csv", index_col=None)
# head() has to be called: printing the bare attribute shows the bound-method
# repr rather than the first five rows.
print(gdp_data.head())
# Bare expression: only rendered when it is the last line of a notebook cell.
gdp_data.head(100)

# Keep only the rows whose 'Country Code' appears in the index of the
# country-code table (presumably this drops regional aggregates -- confirm
# against the contents of country_codes.csv).
codes = pd.read_csv("country_codes.csv", index_col=0)
gdp_countries = gdp_data[gdp_data['Country Code'].isin(codes.index)]
print(gdp_countries.head())

# Add up every 'Value' row per country, rank the totals from largest to
# smallest, and show the five biggest.
total_gdp_by_country = (
    gdp_countries
    .groupby('Country Code')['Value']
    .sum()
    .reset_index()
)
sorted_gdp = total_gdp_by_country.sort_values(by='Value', ascending=False)
top_5_countries = sorted_gdp.iloc[:5]
print(top_5_countries)
import pandas as pd
# Re-read the country-code table (indexed by its first column) and rebuild
# the filtered frame: only rows whose 'Country Code' is in that index remain.
codes = pd.read_csv("country_codes.csv", index_col=0)

gdp_countries = gdp_data.loc[
    gdp_data['Country Code'].isin(codes.index)
]

def identify_recessions(df, threshold=-0.03, consecutive_years=2):
    """Return the rows at which a country is in a sustained GDP decline.

    A row counts as a decline when its ``Value`` changed by less than
    ``threshold`` relative to the previous row of the same country.  A row is
    returned once it is at least the ``consecutive_years``-th decline of an
    unbroken run, so a two-row decline yields its second row, a three-row
    decline its second and third rows, and so on.

    Args:
        df: DataFrame with ``'Country Code'`` and ``'Value'`` columns.  Rows
            of each country are compared in the order they appear, so they
            are expected to be in chronological order.
        threshold: Fractional change below which a row counts as a decline
            (``-0.03`` means a drop of more than 3 %).
        consecutive_years: Minimum position within a decline run that a row
            must have reached before it is reported.

    Returns:
        DataFrame with the same columns as ``df`` holding the matching rows
        (original index labels preserved); empty when nothing qualifies.
    """
    recessions = []
    # sort=False keeps countries in order of first appearance.
    for _, country_data in df.groupby('Country Code', sort=False):
        declining = country_data['Value'].pct_change() < threshold
        # Give every unbroken run of equal flags its own id, then number the
        # rows inside each run starting at 1.
        run_id = (declining != declining.shift()).cumsum()
        run_position = declining.groupby(run_id).cumcount() + 1
        # Runs of non-declining rows get numbered as well; zero them out so
        # that only genuine decline streaks can reach consecutive_years.
        decline_streak = run_position.where(declining, 0)
        recessions.append(country_data[decline_streak >= consecutive_years])

    if not recessions:
        # pd.concat raises on an empty list; hand back an empty frame with
        # the caller's columns instead.
        return df.iloc[0:0]
    return pd.concat(recessions)

# Scan every retained country with the function's default threshold and run
# length, then print the rows it flags.
identified_recessions = identify_recessions(df=gdp_countries)
print(identified_recessions)
import matplotlib.pyplot as plt

# United States GDP
selected_country_code = 'USA'
selected_country_data = gdp_countries.loc[
    gdp_countries['Country Code'] == selected_country_code
]

# Ten-year window, both end years included.
start_year = 2012
end_year = 2021
selected_country_data_decade = selected_country_data[
    selected_country_data['Year'].between(start_year, end_year)
]

# Line chart of the yearly values, scaled by 1e9 to match the axis label.
plt.figure(figsize=(10, 6))
plt.plot(
    selected_country_data_decade['Year'],
    selected_country_data_decade['Value'] / 1e9,
    marker='o',
    linestyle='-',
    color='b',
)
plt.title(f"GDP Change Over the Past Decade - {selected_country_code}")
plt.xlabel("Year")
plt.ylabel("GDP (in billions USD)")
plt.grid(True)
plt.show()
import matplotlib.pyplot as plt

# Russia GDP (scatter version)
selected_country_code = 'RUS'
selected_country_data = gdp_countries.loc[
    gdp_countries['Country Code'] == selected_country_code
]

# Ten-year window, both end years included.
start_year = 2012
end_year = 2021
selected_country_data_decade = selected_country_data.loc[
    (selected_country_data['Year'] <= end_year)
    & (selected_country_data['Year'] >= start_year)
]

# The author flagged the scatter plot as a poor fit for this series; it is
# kept as-is alongside the line-chart version.
plt.figure(figsize=(10, 6))
plt.scatter(
    selected_country_data_decade['Year'],
    selected_country_data_decade['Value'] / 1e9,
    color='b',
    label=f"{selected_country_code}",
)
plt.title(f"GDP Change Over the Past Decade - {selected_country_code}")
plt.xlabel("Year")
plt.ylabel("GDP (in billions USD)")
plt.legend()
plt.grid(True)
plt.show()
import matplotlib.pyplot as plt

# Russia GDP (line version)
selected_country_code = 'RUS'
selected_country_data = gdp_countries.loc[
    gdp_countries['Country Code'] == selected_country_code
]

# Ten-year window, both end years included.
start_year = 2012
end_year = 2021
selected_country_data_decade = selected_country_data[
    selected_country_data['Year'].between(start_year, end_year)
]

# Line chart of the yearly values, scaled by 1e9 to match the axis label.
plt.figure(figsize=(10, 6))
plt.plot(
    selected_country_data_decade['Year'],
    selected_country_data_decade['Value'] / 1e9,
    marker='o',
    linestyle='-',
    color='b',
)
plt.title(f"GDP Change Over the Past Decade - {selected_country_code}")
plt.xlabel("Year")
plt.ylabel("GDP (in billions USD)")
plt.grid(True)
plt.show()
import matplotlib.pyplot as plt

# China GDP
selected_country_code = 'CHN'
selected_country_data = gdp_countries.loc[
    gdp_countries['Country Code'] == selected_country_code
]

# Ten-year window, both end years included.
start_year = 2012
end_year = 2021
selected_country_data_decade = selected_country_data.loc[
    (selected_country_data['Year'] <= end_year)
    & (selected_country_data['Year'] >= start_year)
]

# Line chart of the yearly values, scaled by 1e9 to match the axis label.
plt.figure(figsize=(10, 6))
plt.plot(
    selected_country_data_decade['Year'],
    selected_country_data_decade['Value'] / 1e9,
    marker='o',
    linestyle='-',
    color='b',
)
plt.title(f"GDP Change Over the Past Decade - {selected_country_code}")
plt.xlabel("Year")
plt.ylabel("GDP (in billions USD)")
plt.grid(True)
plt.show()

Algorithm for computing each country's percentage growth in GDP over the past decade

# gdp_countries is a boolean-filtered slice of gdp_data; take an explicit
# copy so that adding a column does not raise SettingWithCopyWarning.
gdp_countries = gdp_countries.copy()

# Year-over-year growth (in percent) within each country.  pct_change
# compares each row with the previous one of the same country, so the rows
# are ordered by year for the computation; the assignment aligns on the
# index, which leaves the row order of gdp_countries itself untouched.
gdp_countries['Percentage Growth'] = (
    gdp_countries.sort_values('Year').groupby('Country Code')['Value'].pct_change() * 100
)

# The message reports on the past decade, so the maximum is searched only
# among those years (same window as the decade plots) rather than over the
# whole history.
# NOTE(review): this is the largest single-year increase inside the window,
# not the cumulative growth across it -- confirm which one is intended.
start_year = 2012
end_year = 2021
past_decade = gdp_countries[gdp_countries['Year'].between(start_year, end_year)]
highest_growth_country = gdp_countries.loc[past_decade['Percentage Growth'].idxmax()]

# These were two statements fused onto one line, which is a syntax error.
print(f"The country with the highest percentage growth in GDP over the past decade is {highest_growth_country['Country Name']} ({highest_growth_country['Country Code']}).")
print(f"Percentage Growth: {highest_growth_country['Percentage Growth']:.2f}%")

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


# Linear trend of GDP against year for one country, with an extrapolation
# for 2022-2029.
selected_country_code = 'USA'
selected_country_data = gdp_countries[gdp_countries['Country Code'] == selected_country_code]

# rename() returns a new frame, so the slice of gdp_countries is not mutated.
model_data = selected_country_data[['Year', 'Value']].rename(columns={'Value': 'GDP'})

# Hold out 20 % of the years for evaluation.  The earlier test_size=0.9
# fitted the line on only a tenth of the rows and scored it on the rest.
train_data, test_data = train_test_split(model_data, test_size=0.2, random_state=42)
# The split shuffles the rows; order the hold-out set by year so the fitted
# line is drawn from left to right.
test_data = test_data.sort_values('Year')

X_train = train_data[['Year']]
y_train = train_data['GDP']
X_test = test_data[['Year']]
y_test = test_data['GDP']

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Both scores are computed on the held-out years, in the raw units of 'GDP'.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Same column name as the training frame so predict() accepts it.
future_years = pd.DataFrame({'Year': range(2022, 2030)})
future_predictions = model.predict(future_years)

# Plot in trillions so the axis label states the real scale; the raw values
# are assumed to be in USD, as in the decade plots that label them so.
plt.figure(figsize=(10, 5))
plt.scatter(test_data['Year'], y_test / 1e12, color='blue', label='Actual')
plt.plot(test_data['Year'], y_pred / 1e12, color='red', linewidth=2, label='Predicted')
plt.scatter(future_years['Year'], future_predictions / 1e12, color='green', marker='x', label='Future Predictions')
plt.title(f'GDP Prediction for {selected_country_code}')
plt.xlabel('Year')
plt.ylabel('GDP (in trillions USD)')
plt.legend()
plt.show()
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


# Linear trend of GDP against year for one country, with an extrapolation
# for 2022-2029.
selected_country_code = 'CHN'
selected_country_data = gdp_countries[gdp_countries['Country Code'] == selected_country_code]

# rename() returns a new frame, so the slice of gdp_countries is not mutated.
model_data = selected_country_data[['Year', 'Value']].rename(columns={'Value': 'GDP'})

# Hold out 20 % of the years for evaluation.  The earlier test_size=0.9
# fitted the line on only a tenth of the rows and scored it on the rest.
train_data, test_data = train_test_split(model_data, test_size=0.2, random_state=42)
# The split shuffles the rows; order the hold-out set by year so the fitted
# line is drawn from left to right.
test_data = test_data.sort_values('Year')

X_train = train_data[['Year']]
y_train = train_data['GDP']
X_test = test_data[['Year']]
y_test = test_data['GDP']

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Both scores are computed on the held-out years, in the raw units of 'GDP'.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Same column name as the training frame so predict() accepts it.
future_years = pd.DataFrame({'Year': range(2022, 2030)})
future_predictions = model.predict(future_years)

# Actual vs. predicted values, plotted in trillions so the axis label states
# the real scale; the raw values are assumed to be in USD, as in the decade
# plots that label them so.
plt.figure(figsize=(10, 6))
plt.scatter(test_data['Year'], y_test / 1e12, color='blue', label='Actual')
plt.plot(test_data['Year'], y_pred / 1e12, color='red', linewidth=2, label='Predicted')
plt.scatter(future_years['Year'], future_predictions / 1e12, color='green', marker='x', label='Future Predictions')
plt.title(f'GDP Prediction for {selected_country_code}')
plt.xlabel('Year')
plt.ylabel('GDP (in trillions USD)')
plt.legend()
plt.show()
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt


# Fit one linear trend of GDP against year per country (USA and China) over
# the years both have in common, and draw them on a shared chart.
usa_data = gdp_countries[gdp_countries['Country Code'] == 'USA'][['Year', 'Value']]
china_data = gdp_countries[gdp_countries['Country Code'] == 'CHN'][['Year', 'Value']]

# Default inner merge: only years present for both countries survive, with
# the value columns suffixed per country.
merged_data = pd.merge(usa_data, china_data, on='Year', suffixes=('_USA', '_CHN'))

# scikit-learn expects a 2-D feature matrix, hence the double brackets.
X = merged_data[['Year']]
y_usa = merged_data['Value_USA']
y_chn = merged_data['Value_CHN']

model_usa = LinearRegression()
model_chn = LinearRegression()

model_usa.fit(X, y_usa)
model_chn.fit(X, y_chn)

# In-sample fitted values; there is no hold-out set in this comparison.
pred_usa = model_usa.predict(X)
pred_chn = model_chn.predict(X)

plt.figure(figsize=(10, 6))

# Use the 1-D 'Year' column for the x axis instead of the 2-D feature frame,
# and plot in trillions so the axis label states the real scale (raw values
# are assumed to be in USD, as in the decade plots that label them so).
plt.scatter(merged_data['Year'], y_usa / 1e12, color='blue', label='USA - Actual')
plt.plot(merged_data['Year'], pred_usa / 1e12, color='red', linewidth=2, label='USA - Predicted')
plt.scatter(merged_data['Year'], y_chn / 1e12, color='green', label='CHN - Actual')
plt.plot(merged_data['Year'], pred_chn / 1e12, color='purple', linewidth=2, label='CHN - Predicted')
plt.title('Bivariate Regression: USA vs CHN GDP')
plt.xlabel('Year')
plt.ylabel('GDP (in trillions USD)')
plt.legend()
plt.show()