# Cardiovascular Disease Model
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Load the raw dataset from the CSV export.
df = pd.read_csv('1645792390_cep1_dataset.csv')

# Preview the first rows and the column dtypes. The preview is printed
# explicitly because a bare expression is only rendered inside a notebook;
# df.info() already writes its report to stdout on its own.
print(df.head(20))
df.info()

# Convert category columns into category types
# (sex, cp, fbs, restecg, exang, slope, ca, thal).
# NOTE(review): 'target' is left exactly as loaded. The plotting code further
# down reads numerical_df['target'], which only works while 'target' keeps a
# numeric dtype -- confirm before casting it to category as well.
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
df = df.astype({column: 'category' for column in categorical_columns})

# detect duplicates
# Collect every row that is duplicated somewhere in the frame. keep=False
# flags all members of a duplicate group, not only the repeated occurrences,
# so the printed report shows the full set of matching rows.
duplicates = df[df.duplicated(keep=False)]
if duplicates.empty:
    print("No duplicates found in the DataFrame.")
else:
    print("Duplicate rows in the DataFrame:")
    print(duplicates)

# Eliminate duplicate rows (drop_duplicates keeps the first occurrence by default).
df = df.drop_duplicates()

# Split df into numeric and categoric dfs
# Partition the columns by dtype: int64/float64 columns on one side,
# category columns on the other.
num_columns = df.select_dtypes(include=['int64', 'float64'])
cat_columns = df.select_dtypes(include='category')

# Re-select from df by column label so each frame is built from df itself.
numerical_df = df.loc[:, num_columns.columns]
cat_df = df.loc[:, cat_columns.columns]
# Disabled in the original, kept for reference:
# numerical_df_with_target = pd.concat([numerical_df, df['target']],axis=1)
# Analysis of Numerical Data
# Histogram - numeric features
numerical_df.hist(figsize=(10, 8))
# Flush the histogram figure so later plots do not draw onto its axes.
plt.show()

# Numerical data summary statistics (printed so they are visible when the
# file runs as a script rather than as a notebook cell).
print(numerical_df.describe())

correlation_matrix = numerical_df.corr()
# Display the correlation matrix
print(correlation_matrix)

# Heatmap of the Pearson correlations, annotated with the coefficient values.
pearsoncorr_relevant = numerical_df.corr(method='pearson')
plt.figure()
sns.heatmap(
    pearsoncorr_relevant,
    xticklabels=pearsoncorr_relevant.columns,
    yticklabels=pearsoncorr_relevant.columns,
    annot=True,
    # seaborn's documented name for the cell-separator width is `linewidths`.
    linewidths=0.5,
)
plt.show()
# Display a boxplot of each numeric feature, grouped by the target class.
# The cast of 'target' to category does not depend on the loop variable,
# so it is done once up front.
target_classes = numerical_df['target'].astype('category')
for column in numerical_df.columns:
    if column == 'target':
        # Plotting the target against itself carries no information.
        continue
    sns.boxplot(y=target_classes, x=column, data=df)
    # NOTE(review): the original indentation was lost; show() is placed inside
    # the loop on the assumption that each feature should get its own figure
    # rather than every boxplot being drawn onto the same axes.
    plt.show()

# visualize positive/negative distribution
# One stacked histogram per int64/float64 column of df, coloured by 'target'.
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(5, 5))
    sns.histplot(x=column, hue='target', data=df, bins=15, multiple='stack')
    plt.xticks(rotation=45)
    plt.show()

# Analysis of categorical data