Skip to content

(Invalid URL)

Context

Data from borrowers that took loans - some paid back and others are still in progress.

library(baguette)
library(corrplot)
library(DataExplorer)
library(GGally)
library(ggmosaic)
library(lubridate)
library(ranger)
library(skimr)
library(themis)
library(tidymodels)
library(tidyverse)
library(vip)
library(zoo)

t <- theme(axis.text.x = element_text(size = 15),
           axis.text.y = element_text(size = 15),
           title = element_text(size = 16))

options(scipen = 999)
tidyverse_logo()
loans <- readr::read_csv('data/loans.csv.gz')
glimpse(loans)
# skimr::skim(loans)

print(loans, n = Inf)

Data Definition

Variableclassdescription
not_fully_paidfctloan is not fully paid = 1; otherwise = 0
credit_policyfctcustomer meets credit underwriting criteria = 1; otherwise = 0
pub_recnumborrower's number of derogatory public records (5) class
installmentnummonthly payment (if funded)
int_ratenuminterest rate (6% - 22%)
log_annual_incnumborrower (self reported) annual income (natural log)
ficonumborrower FICO credit score
days_with_cr_linenumborrower # days has had credit line
revol_balnumborrower revolving balance (amount unpaid/end of credit card billing cycle)
inq_last_6mthsnumborrower number of inquiries by creditors in the last 6 months
delinq_2yrsnumborrower times had been 30+ days past due on a payment in the past 2 years
revol_utilnumborrower revolving line utilization (0-100%)
dtinumborrower debt-to-income ratio (0-30%)
purposechrloan purpose

Extract useful insights and visualize them in the most interesting way possible

Proportions of being paid versus meeting underwriting policy

Proprotion Paid 84%, Not Paid 16%
There is a large imbalance in the categories
loans %>% count(not_fully_paid) %>% mutate(proportion = n / sum(n))

cp <- loans %>%
  mutate(credit = if_else(credit_policy  == 1, "Meets underwriting policy", "Does not meet policy"),
         paid   = if_else(not_fully_paid == 1, "Not Fully Paid ", " Paid")) %>%
  select(credit, paid)
# Contingency table
table(cp$credit, cp$paid)

cp %>%
  ggplot() +
  geom_mosaic(aes(x = product(paid, credit), fill = credit)) +
  theme(legend.position = 'none') +
  labs(x = 'Underwriting Policy', y = 'Loans which are paid') +
  ggtitle("Borrowers meeting policy vs. having the loan paid") + t
  
Users who meet underwriting policy typically have their loans paid more than users who do not meet policy.
There are many more users overall who meet policy than not.
What type of borrowers do not meet our underwriting policy?
  • high interest rates
  • short installment loans
  • lower income
  • high debt to income ratio
  • low FICO scores
  • short credit history
  • tend to not have loans fully paid
The opposite is true about users who do meet our underwriting policy.
suppressWarnings(print(loans %>% select(credit_policy, int_rate, installment, log_annual_inc, 
                 dti, fico, days_with_cr_line, not_fully_paid) %>%
  GGally::ggpairs(mapping = aes(color = factor(credit_policy)))))

Correlations

corrplot(cor(loans %>% select_if(is.numeric)))

Log Regression Elimination: Most important factors in having loans that are paid

summary(glm(not_fully_paid ~ credit_policy + pub_rec + installment + log_annual_inc + fico +
              revol_bal + inq_last_6mths + int_rate, loans, family = 'binomial'))
Best predictors of loans that are currently paid:
  • low interest rates
  • low number of public records
  • low number of inquiries in the past 6 months
  • high income
  • high FICO scores
loans %>%
    mutate(paid = as.factor(case_when(not_fully_paid == 0 ~ 'Yes', 
                                      not_fully_paid == 1 ~ 'No'))) %>%
    ggplot(aes(int_rate, fill = paid)) +
    geom_histogram(bins = 42, position = 'identity', alpha = 0.42, color = 'black') +
    scale_fill_manual(values = c("black", "chartreuse")) +
    scale_y_continuous(breaks = seq(0, 850,50)) +
    scale_x_continuous(labels = scales::percent_format()) +
    labs(y = '', x = "Interest Rate", fill = "Fully Paid") +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    ggtitle("Distribution of paid loans by interest rate") + t

loans %>%
    mutate(paid = as.factor(case_when(not_fully_paid == 0 ~ 'Yes', 
                                      not_fully_paid == 1 ~ 'No'))) %>%
    ggplot(aes(log_annual_inc, fill = paid)) +
    geom_histogram(bins = 42, position = 'identity', alpha = 0.42, color = 'black') +
    scale_fill_manual(values = c("black", "chartreuse")) +
    scale_y_continuous(breaks = seq(0, 20000,1000)) +
    scale_x_continuous(breaks = seq(0, 13, 1)) +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    labs(x = "Log Scale Annual Income", y = "", fill = "Fully Paid") +
    ggtitle("Distribution of paid loans by income") + t
fit <- glm(not_fully_paid ~ int_rate + pub_rec + log_annual_inc, loans, family = 'binomial')

newdata <- with(loans, expand.grid(int_rate = seq(min(int_rate), max(int_rate), length = 22),
                                   pub_rec = unique(pub_rec),
                                   log_annual_inc = quantile(log_annual_inc)))

newdata$probability <- predict(fit, newdata = newdata, type = 'response')

suppressWarnings(print(newdata %>%
    ggplot(aes(int_rate, probability, color = factor(pub_rec))) +
    geom_point() +
    geom_line() +
    geom_hline(yintercept = 0.1, size = 1.5)+
    geom_hline(yintercept = 0.5, size = 1.5)+
    facet_grid(~ log_annual_inc) +
    geom_smooth(method = 'glm', method.args = list(family = 'binomial'), 
                formula = y ~ x, se = FALSE) +
    scale_x_continuous(labels = scales::percent_format()) +
    labs(x = "Interest Rate", y = 'Probability of Payment', color = "# of derogatory marks") +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.text.x = element_text(angle = 45, hjust = 1)) +
    ggtitle("Probability of payment with Interest Rate & Public Records") + t ))