Skip to content
How Much of the World Has Access to the Internet?
Import Packages
# Import pandas
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import scipy as sp
from scipy import stats
from sklearn.mixture import GaussianMixture
How Much of the World Has Access to the Internet?
Now let's move on to the competition and challenge.
📖 Background
You work for a policy consulting firm. One of the firm's principals is preparing to give a presentation on the state of internet access in the world. She needs your help answering some questions about internet accessibility across the world.
Loading data
# Internet table: share of each entity's population using the internet, per year
internet = pd.read_csv('data/internet.csv')

# People table: number of internet users per entity and year
people = pd.read_csv('data/people.csv')

# Broadband table: fixed broadband subscriptions per entity and year
broadband = pd.read_csv('data/broadband.csv')

# Country and region according to the World Bank data: https://data.worldbank.org/country
# The 'Year' column is discarded right after loading.
regions = pd.read_csv('world-regions-according-to-the-world-bank.csv').drop(columns=['Year'])
💾 The data
The research team compiled the following tables (source):
internet
- "Entity" - The name of the country, region, or group.
- "Code" - Unique id for the country (null for other entities).
- "Year" - Year from 1990 to 2019.
- "Internet_usage" - The share of the entity's population who have used the internet in the last three months.
people
- "Entity" - The name of the country, region, or group.
- "Code" - Unique id for the country (null for other entities).
- "Year" - Year from 1990 to 2020.
- "Users" - The number of people who have used the internet in the last three months for that country, region, or group.
broadband
- "Entity" - The name of the country, region, or group.
- "Code" - Unique id for the country (null for other entities).
- "Year" - Year from 1998 to 2020.
- "Broadband_Subscriptions" - The number of fixed subscriptions to high-speed internet at downstream speeds >= 256 kbit/s for that country, region, or group.
regions
- "Entity" - The name of the country
- "Code" - Unique id for the country
- "World Region according to the World Bank" - The region of the country
Acknowledgments: Max Roser, Hannah Ritchie, and Esteban Ortiz-Ospina (2015) - "Internet." OurWorldInData.org.
💪 Challenge
Create a report to answer the principal's questions. Include:
- What are the top 5 countries with the highest internet use (by population share)?
- How many people had internet access in those countries in 2019?
- What are the top 5 countries with the highest internet use for each of the following regions: 'Middle East & North Africa', 'Latin America & Caribbean', 'East Asia & Pacific', 'South Asia', 'North America', 'Europe & Central Asia'?
- Create a visualization for those six regions' internet usage over time.
- What are the 5 countries with the most internet users?
- What is the correlation between internet usage (population share) and broadband subscriptions for 2019?
- Summarize your findings.
Note: This is how the World Bank defines the different regions.
⌛️ Time is ticking. Good luck!
Helper functions
def gauss_mix_model(df, var, components, random_state=None):
    """Fit a one-dimensional Gaussian mixture on ``df[var]`` and report each group.

    For every fitted component the minimum, maximum and mean of the values
    assigned to it are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the variable to cluster.
    var : str
        Name of the column of ``df`` to model.
    components : int
        Number of mixture components to fit.
    random_state : int or None, optional
        Seed forwarded to ``GaussianMixture``. Group labels are otherwise
        free to change from one run to the next.

    Returns
    -------
    tuple
        ``(min_0, max_0, min_1, max_1)``: the value range of groups 0 and 1.
        A group that does not exist or received no observation yields NaN.
    """
    X = np.array(df[var]).reshape((df[var].shape[0], 1))
    gmm = GaussianMixture(n_components=components, random_state=random_state).fit(X)
    labels = gmm.predict(X)
    values = pd.Series(X[:, 0])

    bounds = {}
    for group in range(components):
        members = values[labels == group]
        # Series.min()/max() give NaN on an empty group instead of raising.
        bounds[group] = (members.min(), members.max())
        print('Group ', group)
        print(' ** Min : ', bounds[group][0])
        print(' ** Max : ', bounds[group][1])
        # means_ has shape (n_components, 1): index the scalar explicitly
        # rather than %-formatting a 1-element array.
        print(' ** Means : ', "%.2f" % gmm.means_[group, 0])

    # The return value stays limited to the first two groups so existing
    # callers that unpack four values keep working.
    min_0, max_0 = bounds.get(0, (np.nan, np.nan))
    min_1, max_1 = bounds.get(1, (np.nan, np.nan))
    return min_0, max_0, min_1, max_1
def uni_analyse_quant(df, row_number):
    """Print and plot a univariate summary of one quantitative column.

    The function prints ``describe()``, draws a histogram (with KDE) and a
    boxplot, counts outliers with the 1.5 * IQR rule and runs a Shapiro-Wilk
    normality test at the 5 % level.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the column to analyse.
    row_number : int
        Position of the column in ``df.columns`` (despite the name, this is a
        column index, not a row index).

    Returns
    -------
    None
        Results are printed and drawn on a new matplotlib figure.
    """
    column = df.columns[row_number]
    data = df[column]
    print(data.describe())

    plt.figure(figsize=(15, 10))
    grid = plt.GridSpec(2, 2, wspace=1, hspace=0.3)
    plt.subplot(grid[0, 0])
    sns.histplot(data, kde=True)
    plt.title('Distribution of ' + column)
    plt.subplot(grid[0, 1])
    sns.boxplot(x=data)
    plt.title('Boxplot of ' + column)

    # Missing values would turn the quantiles (and therefore both fences)
    # into NaN, so that no outlier is ever detected: work on observed values.
    observed = data.dropna()
    q1 = observed.quantile(0.25)
    q3 = observed.quantile(0.75)
    iqr = q3 - q1
    lim_up = q3 + 1.5 * iqr
    lim_down = q1 - 1.5 * iqr

    compteur_sup = int((observed > lim_up).sum())
    compteur_inf = int((observed < lim_down).sum())
    # Share of outliers among all rows of the column, expressed in percent.
    outlier_percent = 100 * (compteur_sup + compteur_inf) / len(data)

    print('Number of outliers :' + str(compteur_sup + compteur_inf))
    print('Outlier sup ' + str(round(lim_up, 2)) + ' in dataset is ' + str(compteur_sup))
    print('Outlier inf ' + str(round(lim_down, 2)) + ' in dataset is ' + str(compteur_inf)
          + ' Soit ' + "%.2f" % outlier_percent + '%')
    print("************************************************************************")

    # Shapiro-Wilk: a p-value below 0.05 rejects the normality hypothesis.
    if stats.shapiro(observed)[1] < 0.05:
        print('➛ The variable ' + column + ' does not follow a normal distribution')
    else:
        print('➛ The variable ' + column + ' follow a normal distribution')
#Checking percentage of missing value
def percent_missing_value(df):
    """Print a missing-value report for ``df`` and draw a missingno matrix.

    The report lists, for every column, the number and percentage of missing
    values, then the columns with more than 50 % missing values, the overall
    count and percentage of missing cells, and the shape of the table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    None
        Everything is printed or plotted.
    """
    separator = '▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️▪️ '

    # Without rows or columns every percentage below would be 0/0.
    if df.empty:
        print('The DataFrame is empty: no missing values to report')
        return

    # Compute the per-column counts once and reuse them for the totals.
    missing_per_column = df.isna().sum()
    n_rows = len(df)
    n_cells = df.shape[0] * df.shape[1]
    total_missing = int(missing_per_column.sum())

    feature_50 = []
    for i, number in missing_per_column.items():
        percentage = number / n_rows * 100
        print(i, ' :', number, ' soit ', "%.2f" % percentage + '% de valeurs manquantes')
        if percentage > 50:
            feature_50.append(i)

    print(separator)
    # The threshold above is a percentage, so the message says "50%".
    print('List of variables with more than 50% missing values ', feature_50)
    print(separator)
    print('Number of missing values ', "{:,}".format(total_missing), ' on ', "{:,}".format(n_cells))
    print(separator)
    print('Percentage of missing values ', "%.2f" % (total_missing / n_cells * 100), ' %')
    print(separator)
    print("Heatmap for viewing missing values", '\n', '➙ Number of rows : ', "{:,}".format(df.shape[0]),
          '\n', '➙ Number of columns : ', "{:,}".format(df.shape[1]))
    msno.matrix(df, filter='bottom', figsize=(25, 10), sparkline=False, label_rotation=90,
                color=(0.25, 0.25, 0.25))
Data preprocessing
Set up a database with all of them
- Internet
- People
- Broadband
- Regions