Skip to content
# Data Manipulation
import numpy as np
import pandas as pd

# Data Visualization
import seaborn as sns
from matplotlib import pyplot as plt

# PCA
from sklearn.decomposition import PCA


# Carrega a função SMOTE
import imblearn
from imblearn.over_sampling import SMOTE

# Ensemble Classifiers
from sklearn.ensemble import VotingClassifier, BaggingClassifier

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, RocCurveDisplay
from sklearn.metrics import accuracy_score


from sklearn.pipeline import Pipeline

%matplotlib inline 
import warnings
warnings.filterwarnings("ignore")
# Functions
def plot_sklearn_roc_curve(y_real, y_pred, auc, model_name):
    """Draw a ROC curve on a new 12x12-inch figure.

    The curve is computed with ``roc_curve(y_real, y_pred)`` and drawn with
    ``RocCurveDisplay``; a green diagonal from (0, 0) to (1, 1) is added as a
    visual reference. Nothing is returned: the figure is left as the current
    matplotlib figure.

    Args:
        y_real: Ground-truth labels, forwarded to ``roc_curve`` as its first
            argument.
        y_pred: Predicted scores, forwarded to ``roc_curve`` as its second
            argument.
        auc: Pre-computed area under the curve; only displayed in the legend
            (it is not recomputed here). Note that inside this function the
            name shadows any module-level ``auc``.
        model_name: Text appended to the ``'AUC: '`` figure title.
    """
    fpr, tpr, _ = roc_curve(y_real, y_pred)

    _fig, ax = plt.subplots(1, 1, figsize=(12, 12))

    # Hand the AUC to the display so the legend entry is attached to the ROC
    # curve itself; previously the label was attached to the diagonal line,
    # which made the legend swatch point at the wrong line.
    RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc).plot(ax=ax)

    # Unlabelled diagonal reference line, kept out of the legend on purpose.
    ax.plot([0, 1], [0, 1], color='g')

    ax.set_title('AUC: ' + model_name)
    # Re-create the legend to apply the location (4 = lower right) and size.
    ax.legend(loc=4, fontsize='xx-large')
    

    

Carregando os dados e primeiras impressões

# Load the data
# The CSV path is relative, so the file must sit in the working directory.
# NOTE(review): the bracketed numbers between statements look like notebook
# cell counters left over from an export — confirm before running as a script.
dados = pd.read_csv('indian_liver_patient.csv')
[7]
# Preview the first rows of the table.
dados.head()
[8]
# (number of rows, number of columns).
dados.shape
[9]
# Column labels.
dados.columns
[10]
# Data type pandas inferred for each column.
dados.dtypes
[11]
# Count of missing values per column.
dados.isnull().sum()
[12]
# Duplicated rows

# keep=False flags every member of a duplicate group (first occurrence
# included), so each repeated row is listed together with its copies.
dados_duplicados = dados[dados.duplicated(keep = False)]

dados_duplicados
[13]
# Shape of the duplicate-only subset: its row count is the number of rows
# involved in duplication, not the number of rows that would be dropped.
dados_duplicados.shape
Análise:
  • Com exceção da variável 'Gender', todas as outras se apresentam na forma numérica
  • Há poucas ocorrências de dados vazios (NAs) no conjunto
  • Existem duas variáveis com nomes similares ('Total_Bilirubin' e 'Direct_Bilirubin'): verificar se são redundantes.

Análise Exploratória

Variáveis Numéricas

[14]
# Summary statistics for the table; with default arguments pandas restricts
# the summary to the numeric columns when the frame has mixed dtypes.
dados.describe()