Skip to content
Competition - Breath and trees in Manhattan
0
  • AI Chat
  • Code
  • Report
  • 📖 Background

    💾 The data

    import pandas as pd
    import numpy as np
    import geopandas as gpd
    import missingno as msno
    import seaborn as sns
    from sklearn.preprocessing import PowerTransformer
    from sklearn.preprocessing import LabelEncoder
    from sklearn.cluster import KMeans
    from yellowbrick.cluster import KElbowVisualizer
    from yellowbrick.features import PCA

    # Load the trees table from a CSV file (path is relative to the working
    # directory) into a pandas DataFrame.
    trees = pd.read_csv('data/trees.csv')
    # Preview the first two rows. As a bare expression this only renders when
    # it is the last statement of a notebook cell.
    trees.head(2)
    # Load the 'nta' shapefile with geopandas.
    # NOTE(review): presumably neighborhood boundary geometries, judging by the
    # variable name — confirm against the data description.
    neighborhoods = gpd.read_file('data/nta.shp')
    # Preview the first two rows (same notebook-display caveat as above).
    neighborhoods.head(2)

    💪 Challenge

    # Check the percentage of missing values
    
    def percent_missing_value(df: pd.DataFrame, threshold: float = 50) -> None:
        """Print a missing-value report for ``df`` and draw a missingno matrix.

        For every column, prints the count and percentage of missing (NaN)
        entries, then the list of columns whose missing percentage is strictly
        greater than ``threshold``, the overall missing count and percentage,
        and the frame's dimensions. Finally calls ``msno.matrix`` on ``df``.

        Args:
            df: The DataFrame to inspect.
            threshold: Percentage (0-100) above which a column is listed as
                heavily missing. Defaults to 50, the previously hard-coded
                cutoff.

        Returns:
            None. The report is written to stdout and the plot is produced as
            a side effect.
        """
        n_rows, n_cols = df.shape
        total_cells = n_rows * n_cols

        # Compute the per-column missing counts once instead of re-running
        # isna() for each column and again for the totals. Iterating the
        # resulting Series also copes with duplicated column labels, where
        # df[label] would return a DataFrame rather than a Series.
        missing_per_column = df.isna().sum()

        above_threshold = []
        for column, number in missing_per_column.items():
            # Guard against a frame with no rows (division by zero).
            percentage = number / n_rows * 100 if n_rows else 0.0
            print(column, ' :', number, ' soit ',
                  "%.2f" % percentage + '% missing values')
            if percentage > threshold:
                above_threshold.append(column)

        total_missing = int(missing_per_column.sum())
        # Guard against a frame with no cells at all (no rows or no columns).
        overall_percentage = (total_missing / total_cells * 100
                              if total_cells else 0.0)

        separator = '-' * 40
        print(separator)
        # The cutoff is a percentage, so the message carries the '%' sign.
        print('List of variables with more than '
              f'{threshold}% missing values ', above_threshold)
        print(separator)
        print('Number of missing values ', "{:,}".format(total_missing),
              ' on ', "{:,}".format(total_cells))
        print(separator)
        print('Percentage of missing values ',
              "%.2f" % overall_percentage, ' %')
        print(separator)
        print("Heatmap for viewing missing values", '\n',
              '** Number of rows : ', "{:,}".format(n_rows), '\n',
              '** Number of columns : ',
              "{:,}".format(n_cols))
        # Plot arguments are passed through unchanged from the original call.
        msno.matrix(df, filter='bottom', figsize=(25, 10),
                    sparkline=False, label_rotation=90)