Commercial banks receive a lot of applications for credit cards. Many of them are rejected for reasons such as high loan balances, low income levels, or too many inquiries on an individual's credit report. Manually analyzing these applications is mundane, error-prone, and time-consuming (and time is money!). Luckily, this task can be automated with machine learning, and pretty much every commercial bank does so nowadays. In this workbook, you will build an automatic credit card approval predictor using machine learning techniques, just like real banks do.
The Data
The data is a small subset of the Credit Card Approval dataset from the UCI Machine Learning Repository showing the credit card applications a bank receives. This dataset has been loaded as a pandas DataFrame called cc_apps. The last column in the dataset is the target value.
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix
)
from sklearn.model_selection import GridSearchCV, KFold
# Load the dataset (read without treating the first row as a header)
cc_apps = pd.read_csv("cc_approvals.data", header=None)

print('Getting to know the data:\n')
print('----------------------------------')
print('cc_apps shape:', cc_apps.shape)
print('----------------------------------')
print('cc_apps columns:\n', cc_apps.columns)
print('----------------------------------')
print('cc_apps dtypes:\n', cc_apps.dtypes)
print('----------------------------------')
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own instead of being passed to print() (which
# would emit the report before the label, followed by a stray "None").
print('cc_apps info:')
cc_apps.info()
print('----------------------------------')
print('cc_apps head:\n', cc_apps.head())
print('----------------------------------')

# Let's first provide a good name for the columns. The last one is the target
cc_apps.columns = [f'col_{i}' for i in range(cc_apps.shape[1] - 1)] + ['target']
# Let's also map the target to be 1 for + and 0 for -
cc_apps['target'] = cc_apps['target'].map({'+': 1, '-': 0})


def build_pipeline(model, num_cols: list, cat_cols: list) -> Pipeline:
    """
    Assemble a preprocessing + estimator pipeline for the given model.

    Numerical columns are standardized only when the model is an instance of a
    scale-sensitive estimator (LogisticRegression, RidgeClassifier, SVC or
    KNeighborsClassifier); for any other model they are passed through
    unchanged. Categorical columns are always one-hot encoded, ignoring
    categories not seen during fitting.

    Args:
        model (class object): The estimator instance used as the final step.
        num_cols (list): Names of the numerical columns.
        cat_cols (list): Names of the categorical columns.
    Returns:
        pipeline (Pipeline): Pipeline with the steps 'preprocessor' and 'model'.
    """
    scale_sensitive_types = (LogisticRegression, RidgeClassifier, SVC, KNeighborsClassifier)

    # Scale-insensitive models receive the raw numerical values.
    numeric_step = StandardScaler() if isinstance(model, scale_sensitive_types) else 'passthrough'
    categorical_step = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_step, num_cols),
            ('cats', categorical_step, cat_cols),
        ]
    )
    return Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
def apply_grid_search(models: list, df: pd.DataFrame, num_cols: list, cat_cols: list):
    """
    Run a grid search for every candidate model and compare them on a held-out test set.

    The dataframe is split once into train (80%) and test (20%) sets. For each
    (model, param_grid) tuple a pipeline is built with build_pipeline, tuned with
    a 3-fold GridSearchCV scored on accuracy using the training set, and the
    refitted best estimator is then evaluated on the test set.

    Args:
        models (list): List of tuples containing the model and the parameters to be tested.
        df (pd.DataFrame): The dataframe to be used. It must contain a 'target' column.
        num_cols (list): List of numerical columns.
        cat_cols (list): List of categorical columns.
    Returns:
        scores_df (pd.DataFrame): One row per model with the columns 'model',
            'best_params', 'accuracy', 'precision', 'recall', 'f1_score' and
            'confusion_matrix', sorted by test accuracy in descending order.
            An empty DataFrame with these columns is returned when `models` is empty.
    """
    # Column order is part of the contract: callers may index the result positionally.
    metric_columns = [
        'model', 'best_params', 'accuracy', 'precision',
        'recall', 'f1_score', 'confusion_matrix',
    ]

    if not models:
        # Without this guard an empty, column-less frame would be built and
        # sorting it by 'accuracy' would raise a KeyError.
        return pd.DataFrame(columns=metric_columns)

    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=['target']),
        df['target'],
        test_size=0.2,
        random_state=42
    )

    results = []
    for model, param_grid in models:
        pipeline = build_pipeline(model, num_cols, cat_cols)
        model_grid = GridSearchCV(
            pipeline,
            param_grid=param_grid,
            cv=kf,
            scoring='accuracy',
            n_jobs=-1,
            verbose=1
        )
        model_grid.fit(X_train, y_train)

        # predict() uses the best estimator found by the search.
        y_pred = model_grid.predict(X_test)
        model_name = model.__class__.__name__
        best_params = model_grid.best_params_
        acc = accuracy_score(y_test, y_pred)

        results.append({
            'model': model_name,
            'best_params': best_params,
            'accuracy': acc,
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1_score': f1_score(y_test, y_pred),
            # Stored as a nested list so it fits in a single DataFrame cell.
            'confusion_matrix': confusion_matrix(y_test, y_pred).tolist(),
        })

        print(f'Model: {model_name}')
        print(f'Best parameters: {best_params}')
        print(f'Best score: {model_grid.best_score_}')
        print(f'Accuracy for the test set: {acc}')

    scores_df = pd.DataFrame(results, columns=metric_columns).sort_values('accuracy', ascending=False)
    print('----------------------------------')
    print('Scores for all models:\n', scores_df)
    print('----------------------------------')
    return scores_df
# Applying the GridSearch: each entry pairs an estimator with its hyperparameter grid.
# Parameter names carry the 'model__' prefix because the estimator is the
# 'model' step of the pipeline being tuned.
models = [
    (LogisticRegression(), {'model__C': [.001, .01, .1, 1, 10, 100],
                            'model__max_iter': [100, 500, 1000],
                            'model__penalty': ['l2'],
                            'model__solver': ['liblinear', 'lbfgs']}),
    (RidgeClassifier(), {'model__alpha': [.001, .01, .1, 1, 10, 100]}),
    (DecisionTreeClassifier(), {'model__max_depth': range(1, 16),
                                'model__min_samples_split': np.arange(.1, .5, .05),
                                'model__min_samples_leaf': np.arange(.3, .5, .05),
                                'model__max_features': ['sqrt', 'log2', None],
                                'model__criterion': ['gini', 'entropy']}),
    (SVC(), {'model__C': [0.1, 1, 10, 100, 1000],
             'model__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
             'model__kernel': ['linear', 'rbf', 'poly']}),
    (GaussianNB(), {'model__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]}),
    (KNeighborsClassifier(), {'model__weights': ['uniform', 'distance'],
                              'model__n_neighbors': range(1, 26)}),
]

# Remove the target before selecting by dtype so it can never end up among
# the feature columns, whatever its dtype happens to be.
features = cc_apps.drop(columns='target')
num_cols = features.select_dtypes(include='number').columns.tolist()
cat_cols = features.select_dtypes(exclude='number').columns.tolist()

df_models = apply_grid_search(models, cc_apps, num_cols, cat_cols)
print(df_models)

# The results are sorted by descending test accuracy, so the first row holds
# the best model; read the value by column label rather than by position.
best_score = df_models['accuracy'].iloc[0]
print(best_score)