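# "Skip to content" -- stray page-navigation text (apparently left over from a web export); commented out because it is not valid Python.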
import pandas as pd
!pip install pandas openpyxl xlrd

# Load the 'Dataset' sheet of the Excel workbook into a DataFrame
df = pd.read_excel('Module_1.xlsx', sheet_name='Dataset')

# Explore dataframe - describe the data.
# head() only returns the preview rows, so print them explicitly; as a bare
# expression followed by another statement the result was silently discarded.
print(df.head())
# info() writes its report to stdout itself, so no print() is needed here.
df.info()

# Tabulate df["Not_Country"]: absolute counts, relative frequencies,
# and the number of missing entries.
Not_Country_Count = df['Not_Country'].value_counts()
frequency_table = df['Not_Country'].value_counts(normalize=True)
missing_values = df['Not_Country'].isna().sum()
for report in (Not_Country_Count, frequency_table, missing_values):
    print(report)

# Recode missing "Not_Country" values as 0, the value this script counts as "country".
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df['Not_Country'] selection: in-place modification of a selected column is
# chained assignment, which emits a FutureWarning in pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled.
df['Not_Country'] = df['Not_Country'].fillna(0)

# Re-check: no missing values should remain, and every 0 is counted as a country.
missing_values = df['Not_Country'].isnull().sum()
country_count = df['Not_Country'].value_counts().get(0, 0)
print(f'Updated count of missing values is {missing_values}')
print(f'The number of countries is {country_count}')

# Sort by GDP_2023 in ascending order (smallest values first)
df_sorted = df.sort_values(by='GDP_2023', ascending=True)
# Print the preview explicitly: a bare df_sorted.head() discards its result
# when run as a script, and the rest of this file reports via print().
print(df_sorted.head())
# Rows for regions/not countries, i.e. where Not_Country equals 1
is_not_country = df['Not_Country'].eq(1)
not_countries = df.loc[is_not_country]
print(not_countries)

# The same rows, restricted to the identifying columns and the GDP columns
gdp_report_columns = ['Country', 'Code', 'Region', 'GDP_2000', 'GDP_2010', 'GDP_2020', 'GDP_2022', 'GDP_2023']
not_country_GDPs = df.loc[is_not_country, gdp_report_columns]
print(not_country_GDPs)

# Summary statistics of GDP_2023 over every row of df (not yet restricted to countries)
gdp_2023_column = df['GDP_2023']
gdp_2023_summary = gdp_2023_column.describe()
print(gdp_2023_summary)

# Isolate observations for only countries (Not_Country == 0).
# .copy() makes `countries` an independent DataFrame: new columns are added to
# it further down, and doing that on a boolean-filtered slice of df raises
# SettingWithCopyWarning in pandas versions without Copy-on-Write.
countries = df[df['Not_Country'] == 0].copy()
# Non-null count per column for the country rows
print(countries.count())

# Calculate summary statistics for countries; only the GDP_2023 column is shown
countries_summary = countries.describe()
print(countries_summary['GDP_2023'])

# Number of countries per region and per income group
print(countries['Region'].value_counts())
print(countries['Income'].value_counts())

# Calculate 2022 -> 2023 GDP growth rates in percent, rounded to two decimals.
growth_rate_22_23 = (
    (countries['GDP_2023'] - countries['GDP_2022']) / countries['GDP_2022'] * 100
).round(2)
# assign() returns a new DataFrame that is rebound to `countries`. Writing the
# column with countries[...] = ... would set a value on a filtered slice of df
# and trigger SettingWithCopyWarning.
countries = countries.assign(growth_rate_22_23=growth_rate_22_23)
print(countries.head())

# Growth-rate summary statistics: overall first, then per income group
growth_col = 'growth_rate_22_23'
print(countries[growth_col].describe())
summary_growth = countries.groupby('Income')[growth_col].describe()
print(summary_growth)

# The same summary statistics, grouped by region
summary_regions = countries.groupby('Region')[growth_col].describe()
print(summary_regions)
import seaborn as sns
import matplotlib.pyplot as plt

# Dot plot of the mean growth rate per region, taken from the per-region summary table
region_summary_rows = summary_regions.reset_index()
plt.figure(figsize=(10, 6))
# stripplot draws on the current axes and returns them
mean_ax = sns.stripplot(data=region_summary_rows, x='mean', y='Region', size=10, jitter=True)

# Axis labels and title
mean_ax.set_ylabel('Region')
mean_ax.set_xlabel('Mean Growth Rate 2022-2023')
mean_ax.set_title('Mean Growth Rate by Region (2022-2023)')

# Render the figure
plt.show()
import plotly.express as px

# Strip (dot) plot with one point per country: growth rate on x, region on y.
# Hovering a point shows the country name and its 2022/2023 GDP values.
fig = px.strip(
    data_frame=countries,
    x='growth_rate_22_23',
    y='Region',
    hover_data=['Country', 'GDP_2022', 'GDP_2023'],
    labels={'growth_rate_22_23': 'Growth Rate (%)'},
    title='Growth Rate by Region (2022-2023)',
)

# Display the figure
fig.show()
# Calculate if countries' GDP has recovered from the COVID recession, i.e.
# whether GDP_2023 exceeds GDP_2019.
# 'recovered' is True or False, or '.' when either year's value is missing.
# This is a vectorized equivalent of the former row-wise apply(); that lambda's
# final `None` branch could never be reached, so no value is lost here.
gdp_2023 = countries['GDP_2023']
gdp_2019 = countries['GDP_2019']
both_present = gdp_2023.notna() & gdp_2019.notna()
# where() keeps the comparison result for complete rows and substitutes '.'
# for the rest (a comparison against NaN would otherwise read as False).
recovered = (gdp_2023 > gdp_2019).where(both_present, '.')
# assign() returns a new frame, avoiding a column write on a filtered slice of df.
countries = countries.assign(recovered=recovered)

# Number of countries per recovery status within each region
print(countries.groupby('Region')['recovered'].value_counts())
import plotly.express as px

# Count countries for every (Region, recovered) combination
recovered_counts = (
    countries
    .groupby(['Region', 'recovered'])
    .size()
    .reset_index(name='counts')
)

# Horizontal bar chart of those counts, coloured by recovery status
fig = px.bar(
    data_frame=recovered_counts,
    x='counts',
    y='Region',
    color='recovered',
    orientation='h',
    labels={'counts': 'Number of Countries', 'recovered': 'Recovered Status'},
    title='Number of Countries Recovered from COVID Recession by Region',
)

# Display the figure
fig.show()
# Keep only the count rows whose recovery status is True
is_recovered = recovered_counts['recovered'].eq(True)
recovered_true_counts = recovered_counts.loc[is_recovered]

# Horizontal bar chart restricted to the recovered countries
fig = px.bar(
    data_frame=recovered_true_counts,
    x='counts',
    y='Region',
    color='recovered',
    orientation='h',
    labels={'counts': 'Number of Countries', 'recovered': 'Recovered Status'},
    title='Number of Countries Recovered from COVID Recession by Region (Recovered Only)',
)

# Display the figure
fig.show()
import plotly.express as px

# Calculate the share of countries with GDP_2023 > GDP_2019 by region.
# The comparison is computed once for the whole frame and then averaged per
# region. This replaces groupby('Region').apply(lambda ...), which is
# deprecated in recent pandas because the applied frame includes the
# grouping column.
# NOTE: a country with a missing GDP value compares as False, so it stays in
# the denominator and counts as "not recovered" (same as before).
exceeds_2019 = countries['GDP_2023'] > countries['GDP_2019']
share_recovered = (
    exceeds_2019
    .groupby(countries['Region'])
    .mean()
    .reset_index(name='share_recovered')
)

# Sort the data in descending order of the recovered share
share_recovered = share_recovered.sort_values(by='share_recovered', ascending=False)

# Create a horizontal bar chart of the share per region
fig = px.bar(share_recovered, x='share_recovered', y='Region',
             orientation='h',
             title='Share of Countries with GDP_2023 > GDP_2019 by Region',
             labels={'share_recovered': 'Share of Countries'})

# Show the plot
fig.show()