I downloaded a dataset of Gross Domestic Product (GDP) by country. I identified recessions based on GDP criteria and used bivariate regressions and a moving heatmap to provide a nuanced understanding of global economic landscapes over time. Using linear regression, I defined leading indicators through predictive modeling that point towards sustained growth for the GDP of China and the USA, while Russia displays a steady decline in the same regard. It is important to note that Russia does not adhere to the same economic reporting standards. I calculated coefficients, p-values, and R-squared values, contributing to a comprehensive understanding of the relationships between time and GDP. Statistical significance was not identified between the USA and China, which was expected. I wanted to calculate p-values with the statsmodels API in Python because I had only used R to find statistical significance in the past. This was a rewarding independent exploration of international economic data where I:
- Utilized Plotly Express to create interactive and dynamic visualizations of GDP trends.
- Explored time series plots to visualize the change in GDP over the years for specific countries.
- Proposed challenges for further exploration, such as identifying recessions, creating specific country visualizations, and calculating the highest percentage growth in GDP.
- Utilized statistical libraries for bivariate regression analysis within selected countries.
- Used linear regression analysis to explore relationships between variables using the Statsmodels library. This opens avenues for further predictive modeling and in-depth exploration of global economic data.
- Created a moving heat map for visualizing changing GDP trends over time.
Python analysis involved data loading, exploration, and dynamic visualization. Challenges and future exploration ideas point towards a deeper understanding of GDP trends, statistical analysis, and predictive modeling.
import pandas as pd
# Load the raw GDP table and preview it.
gdp_data = pd.read_csv("gdp_data.csv", index_col=None)
# head is a method: call it so the first rows are printed instead of the
# bound-method repr.
print(gdp_data.head())
gdp_data.head(100)  # notebook display only; prints nothing when run as a script

# Load the country-code lookup table, using its first column as the index.
codes = pd.read_csv("country_codes.csv", index_col=0)
codes  # notebook display only; prints nothing when run as a script

# Import that opens the next notebook cell (was fused onto the line above).
import pandas as pd
# Load the GDP table, keep only rows whose code appears in the country-code
# table, and rank countries by their summed 'Value'.
import pandas as pd

gdp_data = pd.read_csv("gdp_data.csv", index_col=None)
# head is a method: call it so the first rows are printed instead of the
# bound-method repr.
print(gdp_data.head())

codes = pd.read_csv("country_codes.csv", index_col=0)

# Keep rows whose 'Country Code' is in the index of the codes table.
# .copy() gives later cells an independent frame, so adding columns to
# gdp_countries never writes into a slice of gdp_data.
gdp_countries = gdp_data[gdp_data['Country Code'].isin(codes.index)].copy()
print(gdp_countries.head())

# Sum 'Value' over all rows of each country, then take the five largest totals.
total_gdp_by_country = gdp_countries.groupby('Country Code')['Value'].sum().reset_index()
sorted_gdp = total_gdp_by_country.sort_values(by='Value', ascending=False)
top_5_countries = sorted_gdp.head(5)
print(top_5_countries)
def identify_recessions(df, threshold=-0.03, consecutive_years=2):
    """Return the rows where a country's value has fallen for several rows in a row.

    For each distinct 'Country Code', the row-to-row fractional change of
    'Value' is computed in the order the rows appear in *df* (rows are
    assumed to be in chronological order; they are not re-sorted here).
    A row is "in decline" when that change is below *threshold*. A row is
    returned when it is in decline AND it is at least the
    *consecutive_years*-th row of an unbroken run of declining rows.

    Args:
        df: DataFrame with at least 'Country Code' and 'Value' columns.
        threshold: Fractional change below which a row counts as a decline
            (-0.03 means a drop of more than 3%).
        consecutive_years: Minimum length a declining run must have reached
            for a row to be reported.

    Returns:
        DataFrame with the same columns as *df* holding the qualifying rows,
        grouped by country in order of first appearance. Empty (with the
        same columns) when nothing qualifies or *df* has no rows.
    """
    recessions = []
    for country_code in df['Country Code'].unique():
        country_data = df[df['Country Code'] == country_code]
        # The first row of each country has no predecessor: pct_change is NaN
        # there and NaN < threshold is False, so it never counts as a decline.
        in_decline = country_data['Value'].pct_change() < threshold
        # A new run starts wherever the flag differs from the previous row.
        run_id = (in_decline != in_decline.shift()).cumsum()
        # 1-based position of each row inside its run.
        position_in_run = in_decline.groupby(run_id).cumcount() + 1
        # Runs of NON-declining rows are numbered too, so the position test
        # alone would also report ordinary years; require the decline flag.
        sustained = in_decline & (position_in_run >= consecutive_years)
        country_recession = country_data[sustained]
        if not country_recession.empty:
            recessions.append(country_recession)
    if not recessions:
        # pd.concat([]) raises; return an empty frame with df's columns instead.
        return df.iloc[0:0]
    return pd.concat(recessions)
# Run the recession scan over every country row and show the flagged rows.
identified_recessions = identify_recessions(gdp_countries)
print(identified_recessions)

# Import that opens the next notebook cell (was fused onto the line above).
import matplotlib.pyplot as plt
# Line chart of one country's GDP (in billions) for 2012-2021 inclusive.
selected_country_code = 'USA'
selected_country_data = gdp_countries[gdp_countries['Country Code'] == selected_country_code]
start_year = 2012
end_year = 2021
# between() includes both endpoints, matching the >= / <= pair it replaces.
selected_country_data_decade = selected_country_data[
    selected_country_data['Year'].between(start_year, end_year)
]

plt.figure(figsize=(10, 6))
# 'Value' is divided by 1e9 so the axis reads in billions.
plt.plot(
    selected_country_data_decade['Year'],
    selected_country_data_decade['Value'] / 1e9,
    marker='o',
    linestyle='-',
    color='b',
)
plt.title(f"GDP Change Over the Past Decade - {selected_country_code}")
plt.xlabel("Year")
plt.ylabel("GDP (in billions USD)")
plt.grid(True)
plt.show()

# Import that opens the next notebook cell (was fused onto the line above).
import matplotlib.pyplot as plt
# Russia GDP
# Scatter chart of one country's GDP (in billions) for 2012-2021 inclusive.
selected_country_code = 'RUS'
selected_country_data = gdp_countries[gdp_countries['Country Code'] == selected_country_code]
start_year = 2012
end_year = 2021
# between() includes both endpoints, matching the >= / <= pair it replaces.
selected_country_data_decade = selected_country_data[
    selected_country_data['Year'].between(start_year, end_year)
]

# Bad use of a scatter plot...
plt.figure(figsize=(10, 6))
# 'Value' is divided by 1e9 so the axis reads in billions.
plt.scatter(
    selected_country_data_decade['Year'],
    selected_country_data_decade['Value'] / 1e9,
    color='b',
    label=f"{selected_country_code}",
)
plt.title(f"GDP Change Over the Past Decade - {selected_country_code}")
plt.xlabel("Year")
plt.ylabel("GDP (in billions USD)")
plt.legend()
plt.grid(True)
plt.show()

# Import that opens the next notebook cell (was fused onto the line above).
import matplotlib.pyplot as plt
# Russia GDP
# Line chart of one country's GDP (in billions) for 2012-2021 inclusive.
selected_country_code = 'RUS'
selected_country_data = gdp_countries[gdp_countries['Country Code'] == selected_country_code]
start_year = 2012
end_year = 2021
# between() includes both endpoints, matching the >= / <= pair it replaces.
selected_country_data_decade = selected_country_data[
    selected_country_data['Year'].between(start_year, end_year)
]

plt.figure(figsize=(10, 6))
# 'Value' is divided by 1e9 so the axis reads in billions.
plt.plot(
    selected_country_data_decade['Year'],
    selected_country_data_decade['Value'] / 1e9,
    marker='o',
    linestyle='-',
    color='b',
)
plt.title(f"GDP Change Over the Past Decade - {selected_country_code}")
plt.xlabel("Year")
plt.ylabel("GDP (in billions USD)")
plt.grid(True)
plt.show()

# Import that opens the next notebook cell (was fused onto the line above).
import matplotlib.pyplot as plt
# China GDP
# Line chart of one country's GDP (in billions) for 2012-2021 inclusive.
selected_country_code = 'CHN'
selected_country_data = gdp_countries[gdp_countries['Country Code'] == selected_country_code]
start_year = 2012
end_year = 2021
# between() includes both endpoints, matching the >= / <= pair it replaces.
selected_country_data_decade = selected_country_data[
    selected_country_data['Year'].between(start_year, end_year)
]

plt.figure(figsize=(10, 6))
# 'Value' is divided by 1e9 so the axis reads in billions.
plt.plot(
    selected_country_data_decade['Year'],
    selected_country_data_decade['Value'] / 1e9,
    marker='o',
    linestyle='-',
    color='b',
)
plt.title(f"GDP Change Over the Past Decade - {selected_country_code}")
plt.xlabel("Year")
plt.ylabel("GDP (in billions USD)")
plt.grid(True)
plt.show()

# Algo for percentage growth for each country over past decade
# Find the country whose GDP grew the most, in percent, between the first and
# last year of the 2012-2021 window used by the decade charts.
growth_start_year = 2012
growth_end_year = 2021

# Take an explicit copy so the new column is not written into a slice of
# another frame.
gdp_countries = gdp_countries.copy()
# Row-to-row (year-over-year) change within each country, kept as a column.
# This is NOT the decade figure: its maximum is the single largest one-year
# jump anywhere in the data, which is what was previously reported as
# "growth over the past decade".
gdp_countries['Percentage Growth'] = (
    gdp_countries.groupby('Country Code')['Value'].pct_change() * 100
)

# One row per country with its GDP at both ends of the window.
window_ends = gdp_countries[
    gdp_countries['Year'].isin([growth_start_year, growth_end_year])
]
endpoint_gdp = (
    window_ends.pivot_table(
        index=['Country Code', 'Country Name'],
        columns='Year',
        values='Value',
        aggfunc='first',
    )
    # Guarantee both year columns exist even if one year has no data at all.
    .reindex(columns=[growth_start_year, growth_end_year])
    # Countries missing either endpoint cannot be compared.
    .dropna()
)
# A zero starting value would make the percentage infinite.
endpoint_gdp = endpoint_gdp[endpoint_gdp[growth_start_year] != 0]
decade_growth = (
    endpoint_gdp[growth_end_year] / endpoint_gdp[growth_start_year] - 1
) * 100

if decade_growth.empty:
    raise ValueError(
        f"No country has GDP values for both {growth_start_year} and {growth_end_year}."
    )

top_country_code, top_country_name = decade_growth.idxmax()
# Same three fields the report below reads, now holding the decade figure.
highest_growth_country = pd.Series({
    'Country Name': top_country_name,
    'Country Code': top_country_code,
    'Percentage Growth': decade_growth.max(),
})
print(f"The country with the highest percentage growth in GDP over the past decade is {highest_growth_country['Country Name']} ({highest_growth_country['Country Code']}).")
print(f"Percentage Growth: {highest_growth_country['Percentage Growth']:.2f}%")
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Fit a straight-line trend (GDP as a function of Year) for one country,
# score it on held-out rows, and extend the line to 2022-2029.
selected_country_code = 'USA'
selected_country_data = gdp_countries[gdp_countries['Country Code'] == selected_country_code]
# rename() returns a new frame, so the filtered rows above are left untouched.
model_data = selected_country_data[['Year', 'Value']].rename(columns={'Value': 'GDP'})

# NOTE(review): test_size=0.9 holds out 90% of the rows, so the line is fitted
# on only 10% of the data - confirm this split is intended.
train_data, test_data = train_test_split(model_data, test_size=0.9, random_state=42)
X_train = train_data[['Year']]
y_train = train_data['GDP']
X_test = test_data[['Year']]
y_test = test_data['GDP']

model = LinearRegression()
model.fit(X_train, y_train)

# Score the fitted line on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Extend the fitted line to years 2022 through 2029.
future_years = pd.DataFrame({'Year': range(2022, 2030)})
future_predictions = model.predict(future_years)

plt.figure(figsize=(10, 5))
plt.scatter(test_data['Year'], y_test, color='blue', label='Actual')
plt.plot(test_data['Year'], y_pred, color='red', linewidth=2, label='Predicted')
plt.scatter(future_years['Year'], future_predictions, color='green', marker='x', label='Future Predictions')
plt.title(f'GDP Prediction for {selected_country_code}')
plt.xlabel('Year')
# The plotted values are the unscaled 'Value' column, so the axis is plain USD.
plt.ylabel('GDP (USD)')
plt.legend()
plt.show()

# Import that opens the next notebook cell (was fused onto the line above).
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Fit a straight-line trend (GDP as a function of Year) for one country,
# score it on held-out rows, and extend the line to 2022-2029.
selected_country_code = 'CHN'
selected_country_data = gdp_countries[gdp_countries['Country Code'] == selected_country_code]
# rename() returns a new frame, so the filtered rows above are left untouched.
model_data = selected_country_data[['Year', 'Value']].rename(columns={'Value': 'GDP'})

# NOTE(review): test_size=0.9 holds out 90% of the rows, so the line is fitted
# on only 10% of the data - confirm this split is intended.
train_data, test_data = train_test_split(model_data, test_size=0.9, random_state=42)
X_train = train_data[['Year']]
y_train = train_data['GDP']
X_test = test_data[['Year']]
y_test = test_data['GDP']

model = LinearRegression()
model.fit(X_train, y_train)

# Score the fitted line on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Extend the fitted line to years 2022 through 2029.
future_years = pd.DataFrame({'Year': range(2022, 2030)})
future_predictions = model.predict(future_years)

# plotting actual vs. predicted values
plt.figure(figsize=(10, 6))
plt.scatter(test_data['Year'], y_test, color='blue', label='Actual')
plt.plot(test_data['Year'], y_pred, color='red', linewidth=2, label='Predicted')
plt.scatter(future_years['Year'], future_predictions, color='green', marker='x', label='Future Predictions')
plt.title(f'GDP Prediction for {selected_country_code}')
plt.xlabel('Year')
# The plotted values are the unscaled 'Value' column, so the axis is plain USD.
plt.ylabel('GDP (USD)')
plt.legend()
plt.show()

# Import that opens the next notebook cell (was fused onto the line above).
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
# Fit one straight-line trend per country (GDP as a function of Year) over the
# years both countries have data for, and draw both fits on one chart.
usa_data = gdp_countries[gdp_countries['Country Code'] == 'USA'][['Year', 'Value']]
china_data = gdp_countries[gdp_countries['Country Code'] == 'CHN'][['Year', 'Value']]

# Default (inner) merge on 'Year': only years present for both countries
# remain, with the value columns suffixed _USA / _CHN.
merged_data = pd.merge(usa_data, china_data, on='Year', suffixes=('_USA', '_CHN'))

X = merged_data[['Year']]  # 2-D feature frame for scikit-learn
years = merged_data['Year']  # the same years as a 1-D series for plotting
y_usa = merged_data['Value_USA']
y_chn = merged_data['Value_CHN']

model_usa = LinearRegression()
model_chn = LinearRegression()
model_usa.fit(X, y_usa)
model_chn.fit(X, y_chn)

# In-sample fitted values: each model predicts on the years it was trained on.
pred_usa = model_usa.predict(X)
pred_chn = model_chn.predict(X)

plt.figure(figsize=(10, 6))
plt.scatter(years, y_usa, color='blue', label='USA - Actual')
plt.plot(years, pred_usa, color='red', linewidth=2, label='USA - Predicted')
plt.scatter(years, y_chn, color='green', label='CHN - Actual')
plt.plot(years, pred_chn, color='purple', linewidth=2, label='CHN - Predicted')
plt.title('Bivariate Regression: USA vs CHN GDP')
plt.xlabel('Year')
# The plotted values are the unscaled 'Value' columns, so the axis is plain USD.
plt.ylabel('GDP (USD)')
plt.legend()
plt.show()