Competition - Breath and trees in Manhattan

📖 Background

💾 The data

import pandas as pd
import numpy as np
import geopandas as gpd
import missingno as msno
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.features import PCA

# Load the tree records from the CSV file into a pandas DataFrame.
trees = pd.read_csv('data/trees.csv')
# Preview the first two rows (presumably shown as cell output in the
# original notebook).
trees.head(2)

# Load the neighborhood shapefile with geopandas.
neighborhoods = gpd.read_file('data/nta.shp')
# Preview the first two rows of the neighborhood data.
neighborhoods.head(2)

💪 Challenge

# Check the percentage of missing values in each column

def percent_missing_value(df):
    feature_50 = []
    for i in df.columns:
        number = df[i].isna().sum()
        percentage = number/len(df)*100
        print(i, ' :', number, ' soit ', "%.2f" % percentage
              + '% missing values')
        if percentage > 50:
            feature_50.append(i)
    print('-'*40)
    print('List of variables with more than 50 missing values ', feature_50)
    print('-'*40)
    print('Number of missing values ', "{:,}".format(df.isna().sum().sum()),
          ' on ', "{:,}".format(df.shape[0]*df.shape[1]))
    print('-'*40)
    print('Percentage of missing values ',
          str("%.2f" % (df.isna().sum().sum() /
                        (df.shape[0] * df.shape[1])*100)), ' %')
    print('-'*40)
    print("Heatmap for viewing missing values", '\n',
          '** Number of rows : ', "{:,}".format(df.shape[0]), '\n',
          '** Number of columns : ',
          "{:,}".format(df.shape[1]))
    msno.matrix(df, filter='bottom', figsize=(25, 10),
                sparkline=False, label_rotation=90)

‌
‌
‌