Skip to content
Diabetes Prediction Using Random Forest Algorithm
  • AI Chat
  • Code
  • Report
    # Importing the required libraries
    import pandas as pd
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix,accuracy_score
    from sklearn import metrics
    from sklearn.preprocessing import StandardScaler
    import warnings
    # Silence library warnings so the exploratory output below stays readable.
    # NOTE(review): this also hides deprecation notices; consider narrowing
    # the filter to specific categories.
    warnings.filterwarnings('ignore')

    # Loading the data
    diab_df = pd.read_csv("diabetes-dataset.csv")
    print(diab_df.head())

    print(diab_df.tail())

    print(diab_df.shape)

    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None".
    diab_df.info()

    print(diab_df.isnull().sum())

    # Zeros in these columns are converted to NaN so that they are counted as
    # missing values and imputed below.  np.nan is used because the np.NaN
    # alias no longer exists in NumPy 2.0.
    zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
    diab_df[zero_as_missing] = diab_df[zero_as_missing].replace(0, np.nan)
    print(diab_df.isnull().sum())

    # Histogram of every column; the returned Axes array is not printed.
    diab_df.hist(figsize=(11, 11), color="#008080")
    plt.show()

    # Impute the missing values: mean for Glucose and BloodPressure, median
    # for SkinThickness, Insulin and BMI.  A single DataFrame.fillna() call
    # with a per-column mapping replaces the chained
    # `diab_df[col].fillna(..., inplace=True)` form, which may operate on a
    # temporary copy and leave the frame unchanged under pandas Copy-on-Write.
    diab_df = diab_df.fillna({
        'Glucose': diab_df['Glucose'].mean(),
        'BloodPressure': diab_df['BloodPressure'].mean(),
        'SkinThickness': diab_df['SkinThickness'].median(),
        'Insulin': diab_df['Insulin'].median(),
        'BMI': diab_df['BMI'].median(),
    })
    print(diab_df.isnull().sum())

    diab_df.info()
    
    
    # Colours used for Outcome 0 and Outcome 1 in every plot below.
    outcome_palette = ('#23C552', '#C52219')

    # Count of records per Outcome value.
    plt.figure(figsize=(10,5))
    plt.title('Diabetes Plot Yes/No', fontsize=14)
    sns.countplot(x="Outcome", data=diab_df, palette=outcome_palette)
    plt.xlabel("Diabetes (0 = No, 1= Yes)", fontsize=12)
    plt.ylabel("Count", fontsize=12)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()

    # One box plot per independent variable, split by Outcome, laid out on a
    # 2x4 grid.  axes.flat walks the grid row by row, so the features land in
    # the same cells as listed here.
    fig, axes = plt.subplots(2, 4, figsize=(18, 12))
    fig.suptitle('Diabetes Outcome Distribution WRT All Independent Variables', fontsize=16)
    boxplot_features = [
        'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
        'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
    ]
    for ax, feature in zip(axes.flat, boxplot_features):
        sns.boxplot(ax=ax, x=diab_df['Outcome'], y=diab_df[feature],
                    hue=diab_df['Outcome'], palette=outcome_palette)
        ax.set_title(f"Diabetes Outcome vs {feature}", fontsize=12)
    plt.show()

    # Pairwise relationships between all columns, coloured by Outcome.  The
    # returned grid object is not printed; only the figure is shown.
    sns.pairplot(diab_df, hue='Outcome', palette=outcome_palette)
    plt.show()

    # Correlation matrix of all columns, annotated with the coefficients.
    plt.figure(figsize=(12,10))
    sns.heatmap(diab_df.corr(), annot=True, cmap='RdYlGn')
    plt.title("Feature Correlation Matrix",fontsize=20)
    plt.show()
    
    
    print(diab_df.describe())

    print(diab_df['Outcome'].value_counts())

    # Features are every column except the target column 'Outcome'.
    x = diab_df.drop(['Outcome'],axis=1)
    y = diab_df['Outcome']

    # Split BEFORE scaling so the scaler never sees the held-out rows.
    # Fitting StandardScaler on the full dataset would leak the test set's
    # mean/variance into training.  The same test_size and random_state are
    # used, so the rows assigned to each side are unchanged.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Learn the scaling statistics from the training rows only, then apply
    # that same transformation to the test rows.
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    # Full feature matrix scaled with the train-only statistics; kept defined
    # so any later code that reads `x_scaled` still works.
    x_scaled = sc.transform(x)

    print(x_train.shape, y_train.shape)

    print(x_test.shape, y_test.shape)
    
    
    # Applying random forest
    # max_features='sqrt' replaces 'auto': for classifiers the two were
    # equivalent, and 'auto' is rejected by scikit-learn 1.3 and later.
    random_forest = RandomForestClassifier(criterion = "gini",
                                           min_samples_leaf = 1,
                                           min_samples_split = 10,
                                           n_estimators=100,
                                           max_features='sqrt',
                                           oob_score=True,
                                           random_state=1,
                                           n_jobs=-1)

    random_forest.fit(x_train, y_train)
    y_pred = random_forest.predict(x_test)

    # Compute the confusion matrix once, with rows/columns ordered by the
    # classes the model was trained on, and reuse it for both the printed
    # table and the plot.
    confmat1 = confusion_matrix(y_test, y_pred, labels=random_forest.classes_)
    print(confmat1)

    cm = metrics.ConfusionMatrixDisplay(confusion_matrix=confmat1,
                                        display_labels=random_forest.classes_)
    # The display object returned by plot() is not printed; only the figure
    # is shown.
    cm.plot(cmap="magma")
    plt.show()

    # Accuracy on the held-out test split.
    print(accuracy_score(y_test, y_pred))

    # oob_score=True makes the forest compute an out-of-bag accuracy estimate
    # during fit(); report it alongside the test accuracy.
    print("OOB score:", random_forest.oob_score_)