Skip to content

World Population Data

This dataset has the total population numbers for every country from 1960 to 2020. Additionally, there is a table that contains country information, including region, income group, and any special notes.

Not sure where to begin? Scroll to the bottom to find challenges!

import pandas as pd
# Read the population table and show it exactly as loaded
world_pop = pd.read_csv("world_pop_data.csv")
display(world_pop)

# Remove the two indicator columns, then show the trimmed table
world_pop = world_pop.drop(columns=['Indicator Name', 'Indicator Code'])
display(world_pop)
import pandas as pd

# Read the country metadata and show each country code beside its table name
country_data = pd.read_csv('metadata_country.csv')
display(country_data[['Country Code', 'TableName']])

# Discard the notes and table-name columns from the metadata
country_data = country_data.drop(columns=['SpecialNotes', 'TableName'])

Source and license of dataset.

Don't know where to start?

Challenges are brief tasks designed to help you practice specific skills:

  • ๐Ÿ—บ๏ธ Explore: Which countries have experienced the highest population growth?
  • ๐Ÿ“Š Visualize: Create a plot that visualizes the population growth of countries over time grouped by region.
  • ๐Ÿ”Ž Analyze: How does income group affect a country's population growth?

Scenarios are broader questions to help you develop an end-to-end project for your portfolio:

You are working for a regional organization in East Asia and the Pacific that analyzes population statistics and makes recommendations to relevant governments. There are concerns that some countries in the region are experiencing a decline in populations.

Your manager has asked you to prepare a population forecast for the region for 5, 10, and 15 years into the future. They have also asked you to identify the five countries with the lowest population growth (or greatest decline), and perform similar forecasts for these five countries individually.

You will need to prepare a report that is accessible to a broad audience. It should outline your motivation, steps, findings, and conclusions.

Read, clean and reorganize data

# Join the population table with the country metadata on the country code,
# keeping only codes that are present in both tables
WC_pop = pd.merge(world_pop, country_data, on='Country Code', how='inner')

# Store the country code as a categorical column
WC_pop['Country Code'] = WC_pop['Country Code'].astype('category')

# For each grouping column: fill missing entries with a placeholder label,
# print the labels that occur, then store the column as categorical.
# Missing regions become 'World'; missing income groups become 'Unknown'.
for column, placeholder in (('Region', 'World'), ('IncomeGroup', 'Unknown')):
    WC_pop[column] = WC_pop[column].fillna(placeholder)
    print(WC_pop[column].unique())
    WC_pop[column] = WC_pop[column].astype('category')

# Show the first rows of the cleaned table
display(WC_pop.head())


Set Region, IncomeGroup and CountryCode as indices

import matplotlib.pyplot as plt
import seaborn as sns

# Use region, income group and country code as a three-level row index
index_levels = ['Region', 'IncomeGroup', 'Country Code']
WC_region = WC_pop.set_index(index_levels)
print(WC_region)



Plot population growth of selected countries for illustration purposes


# Helper used to draw one line per country on a shared figure
def cntry_pop(cntr_code):
    """Add the population curve of one country to the current plot.

    The rows of the module-level ``WC_region`` table that match
    ``cntr_code`` on the third index level are selected, the 'Region' and
    'IncomeGroup' columns are removed after resetting the index, and the
    remaining values are flattened into a single sequence. That sequence is
    plotted against the column labels of ``WC_region`` converted to
    integers (assumed to be the years -- confirm against the CSV header).

    Args:
        cntr_code: Country code to select; also used as the line's legend
            label.
    """
    years = WC_region.columns.astype(int)

    selected = WC_region.loc[:, :, cntr_code].reset_index()
    population = selected.drop(columns=['Region', 'IncomeGroup']).to_numpy().flatten()

    sns.lineplot(x=years, y=population, label=cntr_code)
    plt.title('Population growth for selected countries')
    plt.xlabel('Year')
    plt.ylabel('Population')

# Plot population growth in selected countries, one line per country code

for code in ('RUS', 'TUR', 'ALB', 'JPN', 'USA'):
    cntry_pop(code)
plt.show()

Plot population growth over the years for different income groups


import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd  # Import pandas for sorting functionality

# Create a function for repeated plots
rate = []    # average change per year, one entry per popex() call
labels = []  # income group labels, in the same order as `rate`


def popex(income_level):
    """Plot the mean population over time for one income group.

    Selects the rows of the module-level ``WC_region`` table whose second
    index level equals ``income_level``, averages every column over those
    rows and draws the averages as a line with seaborn. The column labels
    are converted to integers and used as the x values (assumed to be the
    years -- confirm against the CSV header).

    As a side effect, the average absolute change per year between the
    first and the last column is appended to the module-level ``rate`` list
    and ``income_level`` is appended to ``labels``.

    Args:
        income_level: Income group label to select; also used as the
            line's legend label.
    """
    group_rows = WC_region.loc[:, income_level, :]
    mean_by_year = group_rows.mean()

    cols = mean_by_year.index.astype('int').values
    val = mean_by_year.values

    # Divide by the span actually covered by the columns instead of a fixed
    # 60, so the figure stays correct if the year range of the data changes.
    # A zero span (single column) yields 0.0, as the fixed divisor did.
    elapsed_years = cols[-1] - cols[0]
    ratei = (val[-1] - val[0]) / elapsed_years if elapsed_years else 0.0
    rate.append(ratei)
    labels.append(income_level)

    sns.lineplot(x=cols, y=val, label=income_level)
    plt.title('Average population increase for different income groups')
    plt.xlabel('Year')
    plt.ylabel('Population')
    plt.xticks(np.arange(1960, 2030, 20))

# Plot different income groups, one line per group
for income_group in (
    'Lower middle income',
    'Upper middle income',
    'Low income',
    'High income',
):
    popex(income_group)
plt.show()


# Generate bar plot of average population growth rates

# Pair each collected rate with its income group and order from largest down
sortex = pd.Series(data=rate, index=labels).sort_values(ascending=False)
print(sortex)

sns.barplot(x=sortex.index, y=sortex.values, hue=sortex.index)
plt.title('Average Population growth per year for different income groups')
plt.xlabel('Income groups')
plt.ylabel('Rate')
plt.xticks(rotation=15)
plt.show()

Plot population growth rates for different regions



import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Create a function for repeated plots
rate = []    # average change per year, one entry per popreg() call
labels = []  # region labels, in the same order as `rate`


def popreg(region):
    """Plot the mean population over time for one region.

    Selects the rows of the module-level ``WC_region`` table whose first
    index level equals ``region``, averages every column over those rows
    and draws the averages as a line with seaborn. The column labels are
    converted to integers and used as the x values (assumed to be the
    years -- confirm against the CSV header).

    As a side effect, the average absolute change per year between the
    first and the last column is appended to the module-level ``rate`` list
    and ``region`` is appended to ``labels``.

    Args:
        region: Region label to select; also used as the line's legend
            label.
    """
    region_rows = WC_region.loc[region, :, :]
    mean_by_year = region_rows.mean()

    cols = mean_by_year.index.astype('int').values
    val = mean_by_year.values

    # Divide by the span actually covered by the columns instead of a fixed
    # 60, so the figure stays correct if the year range of the data changes.
    # A zero span (single column) yields 0.0, as the fixed divisor did.
    elapsed_years = cols[-1] - cols[0]
    ratei = (val[-1] - val[0]) / elapsed_years if elapsed_years else 0.0
    rate.append(ratei)
    labels.append(region)

    sns.lineplot(x=cols, y=val, label=region)
    plt.title('Average population increase for different regions')
    plt.xlabel('Year')
    plt.ylabel('Population')
    plt.xticks(np.arange(1960, 2030, 20))

# Plot population rise over time in different regions, one line per region

for region_name in (
    'South Asia',
    'North America',
    'East Asia & Pacific',
    'Sub-Saharan Africa',
    'Middle East & North Africa',
    'Latin America & Caribbean',
    'Europe & Central Asia',
):
    popreg(region_name)
plt.show()

# Generate bar plot for average population growth rate in different regions

# Pair each collected rate with its region and order from largest down
sortex = pd.Series(data=rate, index=labels).sort_values(ascending=False)

sns.barplot(x=sortex.index, y=sortex.values, hue=sortex.index)
plt.title('Average Population growth per year for different regions')
plt.xlabel('Regions')
plt.ylabel('Rate')
plt.xticks(rotation=15)
plt.show()
โ€Œ
โ€Œ
โ€Œ