# imports
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

gender_submission = pd.read_csv('gender_submission.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

# data cleaning for "test"
train['Sex'] = pd.Categorical(train['Sex'], categories=['male', 'female'], ordered=False)
train['Embarked']= pd.Categorical(train['Embarked'], categories=['C', 'Q', 'S'], ordered=False)
# With a single feature, KNNImputer has no other columns to measure distance with,
# so missing Ages are effectively filled with the training-set mean; the fillna()
# calls below are only a safety net.
imputer = KNNImputer(n_neighbors=5)
train['Age'] = imputer.fit_transform(train[['Age']]).round(0)
test['Age'] = imputer.transform(test[['Age']]).round(0)
train['Age'] = train['Age'].fillna(train['Age'].mean().round(2))
test['Age'] = test['Age'].fillna(test['Age'].mean().round(2))
# assign instead of using inplace fillna on a column selection, which triggers
# chained-assignment warnings in recent pandas versions
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
test['Fare'] = test['Fare'].fillna(test['Fare'].median())
train = train.drop('Cabin', axis=1)
test = test.drop('Cabin', axis=1)
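
# Optional sanity check (an addition, not in the original script): confirm that
# the imputation steps above left no missing values in the cleaned columns.
print(train[['Age', 'Embarked']].isna().sum())
print(test[['Age', 'Fare']].isna().sum())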

# preparing for predictive models
x_train = train.drop(['Survived'], axis=1)
y_train = train['Survived']
x_train = pd.get_dummies(data=x_train, columns=['Sex', 'Embarked'])
x_train = x_train.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = test.copy()
x_test['Sex'] = pd.Categorical(x_test['Sex'], categories=['male', 'female'], ordered=False)
x_test['Embarked'] = pd.Categorical(x_test['Embarked'], categories=['C', 'Q', 'S'], ordered=False)
x_test = pd.get_dummies(data=x_test, columns=['Sex', 'Embarked'])
x_test = x_test.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
x_test = scaler.transform(x_test)
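
# Optional sanity check (an addition, not in the original script): because Sex
# and Embarked were declared as Categoricals with fixed categories, get_dummies
# produces the same columns for train and test, so the scaled matrices should
# have the same number of features.
assert x_train.shape[1] == x_test.shape[1], 'train/test feature counts differ'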

# Hyperparameter tuning for Logistic Regression
logreg = LogisticRegression()
logreg_params = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}
logreg_grid = GridSearchCV(logreg, logreg_params, cv=5, scoring='accuracy')
logreg_grid.fit(x_train, y_train)
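# Optional (an addition, not in the original script): inspect the selected
# hyperparameters and the mean cross-validated accuracy; the same attributes
# are available on every grid search below.
print('LogisticRegression best params:', logreg_grid.best_params_)
print('LogisticRegression CV accuracy:', round(logreg_grid.best_score_, 3))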
logreg_best = logreg_grid.best_estimator_
logreg_pred = logreg_best.predict(x_test)
logreg_submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': logreg_pred
})
logreg_submission.to_csv('logreg_submission.csv', index=False)

# Hyperparameter tuning for Decision Tree Classifier
dt = DecisionTreeClassifier()
dt_params = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10]
}
dt_grid = GridSearchCV(dt, dt_params, cv=5, scoring='accuracy')
dt_grid.fit(x_train, y_train)
dt_best = dt_grid.best_estimator_
dt_pred = dt_best.predict(x_test)
dt_submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': dt_pred
})
dt_submission.to_csv('dt_submission.csv', index=False)

# Hyperparameter tuning for Random Forest Classifier
rf = RandomForestClassifier()
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10]
}
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='accuracy')
rf_grid.fit(x_train, y_train)
rf_best = rf_grid.best_estimator_
rf_pred = rf_best.predict(x_test)
rf_submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': rf_pred
})
rf_submission.to_csv('rf_submission.csv', index=False)

# Hyperparameter tuning for Gradient Boosting Classifier
gb = GradientBoostingClassifier()
gb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10]
}
gb_grid = GridSearchCV(gb, gb_params, cv=5, scoring='accuracy')
gb_grid.fit(x_train, y_train)
gb_best = gb_grid.best_estimator_
gb_pred = gb_best.predict(x_test)
gb_submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': gb_pred
})
gb_submission.to_csv('gb_submission.csv', index=False)

# Hyperparameter tuning for AdaBoost Classifier
ada = AdaBoostClassifier()
ada_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2]
}
ada_grid = GridSearchCV(ada, ada_params, cv=5, scoring='accuracy')
ada_grid.fit(x_train, y_train)
ada_best = ada_grid.best_estimator_
ada_pred = ada_best.predict(x_test)
ada_submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': ada_pred
})
ada_submission.to_csv('ada_submission.csv', index=False)

# Hyperparameter tuning for Extra Trees Classifier
et = ExtraTreesClassifier()
et_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10]
}
et_grid = GridSearchCV(et, et_params, cv=5, scoring='accuracy')
et_grid.fit(x_train, y_train)
et_best = et_grid.best_estimator_
et_pred = et_best.predict(x_test)
et_submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': et_pred
})
et_submission.to_csv('et_submission.csv', index=False)

# Hyperparameter tuning for Support Vector Classifier
svc = SVC()
svc_params = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf']
}
svc_grid = GridSearchCV(svc, svc_params, cv=5, scoring='accuracy')
svc_grid.fit(x_train, y_train)
svc_best = svc_grid.best_estimator_
svc_pred = svc_best.predict(x_test)
svc_submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': svc_pred
})
svc_submission.to_csv('svc_submission.csv', index=False)

# Scoring the models against gender_submission.csv. Note that this file is
# Kaggle's sample baseline (survival predicted from sex alone), not the true
# test labels, so these numbers measure agreement with that baseline rather
# than real test accuracy.
y_true = gender_submission['Survived']
logreg_accuracy = round(accuracy_score(y_true, logreg_pred), 2)
dt_accuracy = round(accuracy_score(y_true, dt_pred), 2)
rf_accuracy = round(accuracy_score(y_true, rf_pred), 2)
gb_accuracy = round(accuracy_score(y_true, gb_pred), 2)
ada_accuracy = round(accuracy_score(y_true, ada_pred), 2)
et_accuracy = round(accuracy_score(y_true, et_pred), 2)
svc_accuracy = round(accuracy_score(y_true, svc_pred), 2)
print(logreg_accuracy, dt_accuracy, rf_accuracy, gb_accuracy, ada_accuracy, et_accuracy, svc_accuracy)
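
# Optional addition (not in the original script): the same scores collected in
# a small table so the models are easier to compare side by side.
results = pd.DataFrame({
    'model': ['LogisticRegression', 'DecisionTree', 'RandomForest',
              'GradientBoosting', 'AdaBoost', 'ExtraTrees', 'SVC'],
    'agreement_with_gender_submission': [logreg_accuracy, dt_accuracy, rf_accuracy,
                                         gb_accuracy, ada_accuracy, et_accuracy, svc_accuracy]
}).sort_values('agreement_with_gender_submission', ascending=False)
print(results.to_string(index=False))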