# Skip to content
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Load the dataset; the path is relative to the current working directory.
df = pd.read_csv('1645792390_cep1_dataset.csv')

# Preview the first 20 rows.  A bare `df.head(20)` expression is evaluated
# and thrown away when this file runs as a script, so print it explicitly.
print(df.head(20))

# Per-column dtype / non-null summary (info() writes to stdout by itself).
df.info()
# Cast the discrete-coded columns to the pandas `category` dtype in one pass.
# NOTE: `target` is not part of this mapping; the numeric analysis further
# down reads it via `numerical_df['target']`, so it has to stay numeric here.
df = df.astype(dict.fromkeys(
    ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'],
    'category',
))
# Detect duplicates: keep=False flags every member of a duplicate group,
# so the report shows all copies rather than only the repeated ones.
duplicates = df.loc[df.duplicated(keep=False)]

if not duplicates.empty:
    print("Duplicate rows in the DataFrame:")
    print(duplicates)
else:
    print("No duplicates found in the DataFrame.")

# Eliminate duplicate rows, retaining the first occurrence of each.
df = df.drop_duplicates(keep='first')
# Split df into a numeric frame and a categorical frame, keyed on dtype:
# int64/float64 columns on one side, `category` columns on the other.
# Despite their names, `num_columns` / `cat_columns` are DataFrames; the
# final subsets are re-selected from `df` by column label.
num_columns = df.select_dtypes(include=['int64', 'float64'])
cat_columns = df.select_dtypes(include='category')

numerical_df = df[list(num_columns.columns)]
cat_df = df[list(cat_columns.columns)]

# Analysis of Numerical Data

# Histogram - numeric features

# One histogram per column of numerical_df, laid out on a single figure.
numerical_df.hist(figsize=(10, 8))
# Show the histogram figure right away.  Without this, the next plotting
# call that falls back to the current axes would draw into this figure's
# last subplot when the file is run as a plain script.
plt.show()

# Numerical data summary statistics.  Printed explicitly because a bare
# expression is discarded outside an interactive notebook cell.
print(numerical_df.describe())
# Pairwise correlation of the columns in numerical_df.  DataFrame.corr()
# defaults to method='pearson', so one computation serves both names below
# (the original computed the same matrix twice).
correlation_matrix = numerical_df.corr(method='pearson')

# Display the correlation matrix
print(correlation_matrix)

# Kept as a separate name for backward compatibility with later code.
pearsoncorr_relevant = correlation_matrix

# Give the heatmap its own figure: sns.heatmap draws on the current axes,
# which would otherwise be whatever the previous plotting call left behind.
plt.figure()
sns.heatmap(
    pearsoncorr_relevant,
    xticklabels=pearsoncorr_relevant.columns,
    yticklabels=pearsoncorr_relevant.columns,
    annot=True,
    linewidth=0.5,
)
plt.show()
# Display a boxplot of each numerical feature, grouped by target class.
# The categorical cast of the label is loop-invariant, so do it once.
target_classes = numerical_df['target'].astype('category')

for column in numerical_df.columns:
    if column == 'target':
        # `target` is itself part of numerical_df; plotting the label
        # against itself carries no information, so skip it.
        continue
    # Start a fresh figure so the plot never lands on axes left over from
    # an earlier plotting call.
    plt.figure()
    sns.boxplot(y=target_classes, x=column, data=df)
    plt.show()
# Visualize the positive/negative distribution: one stacked histogram per
# int64/float64 column of df, with bars split by the value of `target`.
for column in df.columns:
    if df[column].dtype not in ['int64', 'float64']:
        continue  # non-numeric columns are not plotted here
    plt.figure(figsize=(5, 5))
    sns.histplot(data=df, x=column, hue='target', multiple='stack', bins=15)
    plt.xticks(rotation=45)
    plt.show()

# Analysis of categorical data