Commercial banks receive a large number of credit card applications, and many are rejected for reasons such as high loan balances, low income, or too many inquiries on the applicant's credit report. Manually analyzing these applications is mundane, error-prone, and time-consuming (and time is money!). Luckily, this task can be automated with machine learning, and nearly every commercial bank does so nowadays. In this workbook, you will build an automatic credit card approval predictor using machine learning techniques, just like real banks do.

The Data

The data is a small subset of the Credit Card Approval dataset from the UCI Machine Learning Repository showing the credit card applications a bank receives. This dataset has been loaded as a pandas DataFrame called cc_apps. The last column in the dataset is the target value.

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import randint, uniform
RANDOM_STATE = 42

# Load the dataset (the raw file has no header row)
cc_apps = pd.read_csv("cc_approvals.data", header=None)
cc_apps.head()
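# The original UCI Credit Approval file marks missing values with '?'
# (an assumption carried over from the full UCI dataset); convert them
# to NaN so the imputers below can handle them.
cc_apps = cc_apps.replace('?', np.nan)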
# ---------------------------
# Target = last column
# ---------------------------
data = cc_apps.copy()  # work on a copy so the raw DataFrame stays untouched

# Columns read in as strings may really be numeric: convert a column
# when the majority of its values parse as numbers
for col in data.columns:
    if data[col].dtype == object:
        converted = pd.to_numeric(data[col].astype(str).str.strip(), errors='coerce')
        # Replace the column if more than 50% of rows convert cleanly
        if converted.notna().sum() > 0.5 * len(data):
            data[col] = converted

X = data.iloc[:, :-1].copy()
y = data.iloc[:, -1].copy()

# Map target from '+'/'-' to 1/0 if needed
if y.dtype == object:
    y = y.map({'+': 1, '-': 0}).astype(int)

print("Target distribution (counts):")
print(y.value_counts())
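
# A majority-class baseline puts the CV/test accuracies below in context:
# any useful model should beat this number.
print(f"Majority-class baseline accuracy: {y.value_counts(normalize=True).max():.4f}")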

# ---------------------------
# Identify numeric vs categorical
# ---------------------------
numeric_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

print("Numeric cols:", numeric_cols)
print("Categorical cols:", categorical_cols)

# ---------------------------
# Preprocessing pipelines
# ---------------------------
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
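    # sparse_output was introduced in scikit-learn 1.2; on older
    # versions use sparse=False instead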
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# ---------------------------
# Train/test split
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)

# ---------------------------
# Model search (light)
# ---------------------------
cv_folds = 4
results = []

# 1) Logistic Regression (grid)
pipe_lr = Pipeline([
    ('pre', preprocessor),
    ('clf', LogisticRegression(max_iter=2000, solver='liblinear', random_state=RANDOM_STATE))
])
param_grid_lr = {'clf__C': [0.01, 0.1, 1, 10], 'clf__penalty': ['l1', 'l2']}
gs_lr = GridSearchCV(pipe_lr, param_grid_lr, cv=cv_folds, scoring='accuracy', n_jobs=-1)
gs_lr.fit(X_train, y_train)
results.append(('LogisticRegression', gs_lr.best_score_, gs_lr.best_estimator_))

# 2) Random Forest (randomized)
pipe_rf = Pipeline([('pre', preprocessor), ('clf', RandomForestClassifier(random_state=RANDOM_STATE))])
param_dist_rf = {'clf__n_estimators': randint(50, 150),
                 'clf__max_depth': randint(3, 15),
                 'clf__min_samples_split': randint(2, 8)}
rs_rf = RandomizedSearchCV(pipe_rf, param_dist_rf, n_iter=12, cv=cv_folds, scoring='accuracy', random_state=RANDOM_STATE, n_jobs=-1)
rs_rf.fit(X_train, y_train)
results.append(('RandomForest', rs_rf.best_score_, rs_rf.best_estimator_))

# 3) Gradient Boosting (randomized)
pipe_gb = Pipeline([('pre', preprocessor), ('clf', GradientBoostingClassifier(random_state=RANDOM_STATE))])
param_dist_gb = {'clf__n_estimators': randint(50, 200),
                 'clf__learning_rate': uniform(0.01, 0.5),
                 'clf__max_depth': randint(1, 5)}
rs_gb = RandomizedSearchCV(pipe_gb, param_dist_gb, n_iter=12, cv=cv_folds, scoring='accuracy', random_state=RANDOM_STATE, n_jobs=-1)
rs_gb.fit(X_train, y_train)
results.append(('GradientBoosting', rs_gb.best_score_, rs_gb.best_estimator_))

# 4) SVC (smaller grid)
pipe_svc = Pipeline([('pre', preprocessor), ('clf', SVC(random_state=RANDOM_STATE))])
param_grid_svc = {'clf__C': [0.1, 1, 10], 'clf__kernel': ['linear', 'rbf']}
gs_svc = GridSearchCV(pipe_svc, param_grid_svc, cv=cv_folds, scoring='accuracy', n_jobs=-1)
gs_svc.fit(X_train, y_train)
results.append(('SVC', gs_svc.best_score_, gs_svc.best_estimator_))

# Print CV results
for name, cvscore, est in results:
    print(f"{name}: CV best accuracy = {cvscore:.4f}")

# ---------------------------
# Pick best by CV, retrain and evaluate on test
# ---------------------------
best_name, best_cv_score, best_estimator = max(results, key=lambda x: x[1])
print(f"\nSelected best model by CV: {best_name} (CV acc={best_cv_score:.4f})")

# The search objects already refit the winning estimator on the full
# training set (refit=True by default), so this explicit fit is redundant
# but kept for clarity.
best_model = best_estimator
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print(f"Test set accuracy for chosen model ({best_name}): {test_acc:.4f}")

# Keep the final test accuracy around as a plain Python float
best_score = float(test_acc)

print("\nClassification report (test):")
print(classification_report(y_test, y_pred))

print("Confusion matrix (test):")
print(confusion_matrix(y_test, y_pred))

# Show the best classifier's hyperparameters inside the pipeline
params = best_model.get_params()
clf_params = {k: v for k, v in params.items() if k.startswith('clf__')}
print("\nBest classifier hyperparameters (clf__*):")
print(clf_params)

# The final test accuracy lives in the numeric variable 'best_score';
# print it explicitly for reference:
print("\nNumeric variable 'best_score' =", best_score)