Skip to content
Competition - Loan Data
(Invalid URL)
Context
Data from borrowers that took loans - some paid back and others are still in progress.
library(baguette)
library(corrplot)
library(DataExplorer)
library(GGally)
library(ggmosaic)
library(lubridate)
library(ranger)
library(skimr)
library(themis)
library(tidymodels)
library(tidyverse)
library(vip)
library(zoo)
t <- theme(axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 15),
title = element_text(size = 16))
options(scipen = 999)
tidyverse_logo()
loans <- readr::read_csv('data/loans.csv.gz')
glimpse(loans)
# skimr::skim(loans)
print(loans, n = Inf)
Data Definition
Variable | class | description |
---|---|---|
not_fully_paid | fct | loan is not fully paid = 1; otherwise = 0 |
credit_policy | fct | customer meets credit underwriting criteria = 1; otherwise = 0 |
pub_rec | num | borrower's number of derogatory public records (5) class |
installment | num | monthly payment (if funded) |
int_rate | num | interest rate (6% - 22%) |
log_annual_inc | num | borrower (self reported) annual income (natural log) |
fico | num | borrower FICO credit score |
days_with_cr_line | num | borrower # days has had credit line |
revol_bal | num | borrower revolving balance (amount unpaid/end of credit card billing cycle) |
inq_last_6mths | num | borrower number of inquiries by creditors in the last 6 months |
delinq_2yrs | num | borrower times had been 30+ days past due on a payment in the past 2 years |
revol_util | num | borrower revolving line utilization (0-100%) |
dti | num | borrower debt-to-income ratio (0-30%) |
purpose | chr | loan purpose |
Extract useful insights and visualize them in the most interesting way possible
Proportions of being paid versus meeting underwriting policy
Proprotion Paid 84%, Not Paid 16%
There is a large imbalance in the categories
loans %>% count(not_fully_paid) %>% mutate(proportion = n / sum(n))
cp <- loans %>%
mutate(credit = if_else(credit_policy == 1, "Meets underwriting policy", "Does not meet policy"),
paid = if_else(not_fully_paid == 1, "Not Fully Paid ", " Paid")) %>%
select(credit, paid)
# Contingency table
table(cp$credit, cp$paid)
cp %>%
ggplot() +
geom_mosaic(aes(x = product(paid, credit), fill = credit)) +
theme(legend.position = 'none') +
labs(x = 'Underwriting Policy', y = 'Loans which are paid') +
ggtitle("Borrowers meeting policy vs. having the loan paid") + t
Users who meet underwriting policy typically have their loans paid more than users who do not meet policy.
There are many more users overall who meet policy than not.
What type of borrowers do not meet our underwriting policy?
- high interest rates
- short installment loans
- lower income
- high debt to income ratio
- low FICO scores
- short credit history
- tend to not have loans fully paid
The opposite is true about users who do meet our underwriting policy.
suppressWarnings(print(loans %>% select(credit_policy, int_rate, installment, log_annual_inc,
dti, fico, days_with_cr_line, not_fully_paid) %>%
GGally::ggpairs(mapping = aes(color = factor(credit_policy)))))
Correlations
corrplot(cor(loans %>% select_if(is.numeric)))
Log Regression Elimination: Most important factors in having loans that are paid
summary(glm(not_fully_paid ~ credit_policy + pub_rec + installment + log_annual_inc + fico +
revol_bal + inq_last_6mths + int_rate, loans, family = 'binomial'))
Best predictors of loans that are currently paid:
- low interest rates
- low number of public records
- low number of inquiries in the past 6 months
- high income
- high FICO scores
loans %>%
mutate(paid = as.factor(case_when(not_fully_paid == 0 ~ 'Yes',
not_fully_paid == 1 ~ 'No'))) %>%
ggplot(aes(int_rate, fill = paid)) +
geom_histogram(bins = 42, position = 'identity', alpha = 0.42, color = 'black') +
scale_fill_manual(values = c("black", "chartreuse")) +
scale_y_continuous(breaks = seq(0, 850,50)) +
scale_x_continuous(labels = scales::percent_format()) +
labs(y = '', x = "Interest Rate", fill = "Fully Paid") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
ggtitle("Distribution of paid loans by interest rate") + t
loans %>%
mutate(paid = as.factor(case_when(not_fully_paid == 0 ~ 'Yes',
not_fully_paid == 1 ~ 'No'))) %>%
ggplot(aes(log_annual_inc, fill = paid)) +
geom_histogram(bins = 42, position = 'identity', alpha = 0.42, color = 'black') +
scale_fill_manual(values = c("black", "chartreuse")) +
scale_y_continuous(breaks = seq(0, 20000,1000)) +
scale_x_continuous(breaks = seq(0, 13, 1)) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
labs(x = "Log Scale Annual Income", y = "", fill = "Fully Paid") +
ggtitle("Distribution of paid loans by income") + t
fit <- glm(not_fully_paid ~ int_rate + pub_rec + log_annual_inc, loans, family = 'binomial')
newdata <- with(loans, expand.grid(int_rate = seq(min(int_rate), max(int_rate), length = 22),
pub_rec = unique(pub_rec),
log_annual_inc = quantile(log_annual_inc)))
newdata$probability <- predict(fit, newdata = newdata, type = 'response')
suppressWarnings(print(newdata %>%
ggplot(aes(int_rate, probability, color = factor(pub_rec))) +
geom_point() +
geom_line() +
geom_hline(yintercept = 0.1, size = 1.5)+
geom_hline(yintercept = 0.5, size = 1.5)+
facet_grid(~ log_annual_inc) +
geom_smooth(method = 'glm', method.args = list(family = 'binomial'),
formula = y ~ x, se = FALSE) +
scale_x_continuous(labels = scales::percent_format()) +
labs(x = "Interest Rate", y = 'Probability of Payment', color = "# of derogatory marks") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1)) +
ggtitle("Probability of payment with Interest Rate & Public Records") + t ))