# DATATHON 2024
# import libraries
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols
from statsmodels.formula.api import logit
from statsmodels.graphics.mosaicplot import mosaic
from itertools import product
from scipy.optimize import minimize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans# import data
# Load the training and test sets. read_csv already returns a DataFrame, so
# the extra pd.DataFrame(...) wrapper is unnecessary. low_memory=False makes
# pandas infer each column's dtype from the whole file instead of chunk by
# chunk, which avoids mixed-type columns.
train = pd.read_csv('train.csv', low_memory=False)
test = pd.read_csv('test_x.csv', low_memory=False)

# look data
print(train.columns)
print(len(train))

# look na
print(train.isna().sum())

# Let's calculate the percentage of NA values in each column:
# (total number of NA) / (length of column)
# write a function for calculating na percentage
def na_per(df: pd.DataFrame) -> pd.Series:
    """Return the fraction of missing values in each column of ``df``.

    The result is a fraction (NA count divided by row count), not a
    percentage; multiply by 100 to get percent.

    Args:
        df: DataFrame to inspect.

    Returns:
        A Series indexed by column name holding
        (number of NA values) / (number of rows) for every column.
    """
    # Return directly instead of binding a local; a local named like the
    # module-level ``na_percentages`` would only shadow it.
    return df.isna().sum() / len(df)
# Convert the per-column NA fractions returned by na_per into percentages.
na_percentages = na_per(train) * 100
# Originally a bare notebook display cell; printed so the output also
# appears when the file is run as a script.
print(na_percentages)

# Columns in which more than half of the values are missing.
na_columns = na_percentages[na_percentages > 50]
print(na_columns)

# Drop the columns whose NA percentage is higher than 50%.
train = train.drop(columns=na_columns.index)
print(train.head())
print(train.columns)

# We will investigate the data column by column.
# Start with the columns that have at most 20% missing values.
na_columns_20 = na_percentages[na_percentages <= 20]
print(na_columns_20)

# Correct typos
# Normalise capitalisation (first letter upper-case, the rest lower-case) so
# that differently-cased spellings of the same value become one category.
train['Cinsiyet'] = train['Cinsiyet'].str.capitalize()
# Drop the rows where Cinsiyet is still missing.
train = train.dropna(subset=['Cinsiyet'])

# hist
plt.figure(figsize=(10, 6))
sns.histplot(train['Cinsiyet'], bins=2, edgecolor='black')
# NOTE(review): the label describes a 1/0 encoding, but the visible code only
# capitalises the strings and never maps them to numbers - confirm the label.
plt.xlabel('Cinsiyet (1: Erkek, 0: Kadin)')
plt.title('Distribution of Cinsiyet')
plt.show()

# train = pd.get_dummies(train, columns=['Cinsiyet'], drop_first=False)

# This was its own notebook cell; when the cells were flattened it was glued
# onto the comment after plt.show() and never ran.
print(train['Cinsiyet'].value_counts())