Skip to content
Indian Liver Patient (AUC 83%)
# Data Manipulation
import numpy as np
import pandas as pd
# Data Visualization
import seaborn as sns
from matplotlib import pyplot as plt
# PCA
from sklearn.decomposition import PCA
# Carrega a função SMOTE
import imblearn
from imblearn.over_sampling import SMOTE
# Ensemble Classifiers
from sklearn.ensemble import VotingClassifier, BaggingClassifier
# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, RocCurveDisplay
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
%matplotlib inline
import warnings
# Globally suppress every warning category (this also hides deprecation notices).
warnings.filterwarnings("ignore")

# Functions
def plot_sklearn_roc_curve(y_real, y_pred, auc, model_name):
    """Plot a ROC curve with its AUC on a new 12x12-inch figure.

    Args:
        y_real: True labels, forwarded to ``roc_curve``.
        y_pred: Predicted scores or labels, forwarded to ``roc_curve``.
        auc: Precomputed area under the curve; rounded to two decimals for
            the legend. NOTE: inside this function the parameter shadows the
            ``auc`` function imported from ``sklearn.metrics``.
        model_name: Text appended to the plot title.

    Returns:
        None. The figure is drawn on the current matplotlib backend.
    """
    fpr, tpr, _ = roc_curve(y_real, y_pred)
    _fig, ax = plt.subplots(1, 1, figsize=(12, 12))
    roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax)
    # The AUC legend entry belongs to the ROC curve itself, not to the
    # diagonal reference line, so label the curve's own line artist.
    roc_display.line_.set_label("AUC=" + str(round(auc, 2)))
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], color='g', label="Chance")
    ax.set_title('AUC: ' + model_name)
    ax.legend(loc=4, fontsize='xx-large')
Carregando os dados e primeiras impressões
# Load the data.
# Bracketed numbers below are notebook cell counters; they are kept as
# comments so that Python does not parse them as subscripts on the
# preceding expression.
dados = pd.read_csv('indian_liver_patient.csv')
# [7]
dados.head()
# [8]
dados.shape
# [9]
dados.columns
# [10]
dados.dtypes
# [11]
dados.isnull().sum()
# [12]
# Duplicated rows: keep=False flags every member of a duplicate group,
# not only the repeats after the first occurrence.
dados_duplicados = dados[dados.duplicated(keep=False)]
dados_duplicados
# [13]
dados_duplicados.shape
Análise:
- Com exceção da variável 'Gender', todas as outras se apresentam na forma numérica
- Há poucas ocorrências de dados vazios (NAs) no conjunto
- Existem duas variáveis com nomes similares ('Total_Bilirubin' e 'Direct_Bilirubin'): verificar se são redundantes.
Análise Exploratória
Variáveis Numéricas
[14]
# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# pandas treats as numeric; shown as the notebook cell's output.
dados.describe()