Skip to content
📖 Background
💾 The data
import pandas as pd
import numpy as np
import geopandas as gpd
import missingno as msno
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.features import PCA
# Load the trees table from CSV into a pandas DataFrame.
trees = pd.read_csv('data/trees.csv')
# Preview the first two rows (NOTE: as a bare expression this only renders
# when it is the last statement of a notebook cell; it has no effect in a script).
trees.head(2)
# Load the shapefile with geopandas (presumably neighborhood boundaries,
# judging by the variable name; confirm against the data description).
neighborhoods = gpd.read_file('data/nta.shp')
# Preview the first two rows of the loaded shapefile table.
neighborhoods.head(2)
💪 Challenge
# Check the percentage of missing values per column and overall
def percent_missing_value(df):
feature_50 = []
for i in df.columns:
number = df[i].isna().sum()
percentage = number/len(df)*100
print(i, ' :', number, ' soit ', "%.2f" % percentage
+ '% missing values')
if percentage > 50:
feature_50.append(i)
print('-'*40)
print('List of variables with more than 50 missing values ', feature_50)
print('-'*40)
print('Number of missing values ', "{:,}".format(df.isna().sum().sum()),
' on ', "{:,}".format(df.shape[0]*df.shape[1]))
print('-'*40)
print('Percentage of missing values ',
str("%.2f" % (df.isna().sum().sum() /
(df.shape[0] * df.shape[1])*100)), ' %')
print('-'*40)
print("Heatmap for viewing missing values", '\n',
'** Number of rows : ', "{:,}".format(df.shape[0]), '\n',
'** Number of columns : ',
"{:,}".format(df.shape[1]))
msno.matrix(df, filter='bottom', figsize=(25, 10),
sparkline=False, label_rotation=90)
‌
‌
‌
‌
‌