Skip to content

Commercial banks receive a lot of applications for credit cards. Many of them are rejected for a variety of reasons, such as high loan balances, low income levels, or too many inquiries on an individual's credit report. Manually analyzing these applications is mundane, error-prone, and time-consuming (and time is money!). Luckily, this task can be automated with the power of machine learning, and pretty much every commercial bank does so nowadays. In this workbook, you will build an automatic credit card approval predictor using machine learning techniques, just like real banks do.

The Data

The data is a small subset of the Credit Card Approval dataset from the UCI Machine Learning Repository showing the credit card applications a bank receives. This dataset has been loaded as a pandas DataFrame called cc_apps. The last column in the dataset is the target value.

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Load the dataset
# header=None: the first line of the file is treated as data, so pandas
# assigns integer column labels.
cc_apps = pd.read_csv(
    "cc_approvals.data",
    header=None,
)
# Preview the first rows of the loaded frame.
cc_apps.head()
# Preprocess the data
def preprocess_data(data_path):
    """Load the applications file and return scaled features plus the target.

    Steps, in order:
      1. Read the header-less CSV at ``data_path``.
      2. Impute missing values: column mean for numeric columns, most
         frequent value for every other column.
      3. Split off the last column as the target.
      4. One-hot encode the remaining non-numeric feature columns.
      5. Standardize the features with ``StandardScaler``.

    Args:
        data_path: Location of the data file, passed straight to
            ``pd.read_csv``.

    Returns:
        A ``(X_scaled, y)`` tuple. ``X_scaled`` is the output of
        ``StandardScaler.fit_transform`` on the encoded features; ``y`` is a
        pandas Series holding the (imputed) values of the last column.
    """
    # Read from the path the caller supplied rather than a hard-coded name.
    cc_apps = pd.read_csv(data_path, header=None)

    # NOTE(review): only values pandas parses as NaN are imputed below. If the
    # file marks missing entries with a placeholder string (e.g. "?"), those
    # columns load as text and get one-hot encoded - confirm against the data.

    # Handle missing values. The filled column is assigned back to the frame:
    # calling fillna(inplace=True) on a column selection is chained assignment,
    # which is deprecated and does not modify the frame under Copy-on-Write.
    for col in cc_apps.columns:
        if pd.api.types.is_numeric_dtype(cc_apps[col]):
            fill_value = cc_apps[col].mean()  # Replace numeric with mean
        else:
            modes = cc_apps[col].mode()
            if modes.empty:
                # Column has no observed values, so there is nothing to
                # impute from (mode()[0] would raise here).
                continue
            fill_value = modes[0]  # Replace categorical with mode
        cc_apps[col] = cc_apps[col].fillna(fill_value)

    # Split data into features (X) and target variable (y) BEFORE encoding.
    # get_dummies appends the dummy columns at the end of the frame, so
    # encoding first would make iloc[:, -1] pick a dummy indicator instead of
    # the target, and would leave target-derived dummies inside X.
    X = cc_apps.iloc[:, :-1]
    y = cc_apps.iloc[:, -1]

    # One-hot encode categorical features only.
    categorical_cols = [
        col for col in X.columns if not pd.api.types.is_numeric_dtype(X[col])
    ]
    X = pd.get_dummies(X, columns=categorical_cols)

    # Ensure feature names are strings (mixed int/str names are rejected by
    # scikit-learn estimators).
    X.columns = X.columns.astype(str)

    # Feature scaling.
    # NOTE(review): the scaler is fit on every row, so a train/test split made
    # afterwards shares statistics between the two sets.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y
# Prepare data for modeling
def prepare_data(X_scaled, y, test_size=0.2):
    """Split features and target into train and test partitions.

    Args:
        X_scaled: Feature matrix.
        y: Target values aligned with the rows of ``X_scaled``.
        test_size: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The split uses
        ``random_state=42``, so it is reproducible across runs.
    """
    splits = train_test_split(
        X_scaled,
        y,
        test_size=test_size,
        random_state=42,
    )
    features_train, features_test, target_train, target_test = splits
    return features_train, features_test, target_train, target_test
# Train the model (Logistic Regression example)
def train_model(X_train, y_train):
    """Fit a ``LogisticRegression`` with default settings and return it.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    classifier = LogisticRegression()
    # fit() returns the estimator itself, so the fitted model can be returned
    # directly.
    return classifier.fit(X_train, y_train)
# Finding the best scoring model
def find_best_model(X_train, y_train, X_test, y_test):
    """Compare a default logistic regression with a grid-searched one.

    A baseline model is trained via ``train_model`` and its test accuracy is
    printed. A 5-fold ``GridSearchCV`` over the regularization parameter ``C``
    is then fit on the training data, and the test accuracy of its best
    estimator is printed as well.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        X_test: Held-out feature matrix.
        y_test: Held-out target values.

    Returns:
        ``(best_model, best_score)``: the best estimator found by the grid
        search and its accuracy on the test data.
    """
    # Baseline: default-hyperparameter logistic regression.
    baseline = train_model(X_train, y_train)
    baseline_accuracy = accuracy_score(y_test, baseline.predict(X_test))
    print("Logistic Regression Accuracy:", baseline_accuracy)

    # Hyperparameter tuning over C with 5-fold cross-validation.
    search = GridSearchCV(
        LogisticRegression(),
        {'C': [0.01, 0.1, 1, 10, 100]},
        cv=5,
    )
    search.fit(X_train, y_train)

    # Score the tuned estimator on the held-out data.
    best_model = search.best_estimator_
    best_score = accuracy_score(y_test, best_model.predict(X_test))
    print("Best Hyperparameter Tuned Logistic Regression Accuracy:", best_score)

    return best_model, best_score

# Load data and preprocess
X_scaled, y = preprocess_data('cc_approvals.data')

# Prepare data for modeling (default 80/20 split)
X_train, X_test, y_train, y_test = prepare_data(X_scaled=X_scaled, y=y)

# Train the baseline model and the grid-searched model; keep the tuned one
best_model, best_score = find_best_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

# Check if accuracy meets the 0.75 target
if not best_score >= 0.75:
    print("Target accuracy not achieved. Consider trying other models or hyperparameter tuning.")
else:
    print("Target accuracy achieved! Best model accuracy:", best_score)