import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load data
loan   = pd.read_csv('2018_state_NH.csv')
loan1  = pd.read_csv('2019_state_NH.csv')
loan2  = pd.read_csv('2020_state_NH.csv')
loan3  = pd.read_csv('2021_state_NH.csv')
loan4  = pd.read_csv('2022_state_NH.csv')
LEI = pd.read_csv('LEI.csv', encoding='latin1', on_bad_lines='skip')  
CC     = pd.read_csv('county_codes.csv')

# Step 2: Explore
print(loan.info())
print(loan.describe())
print(loan.head())
print(loan.shape)
print(LEI.info())

print(loan['action_taken'].value_counts())
print(loan['county_code'].value_counts())
print(loan['derived_ethnicity'].value_counts())

if 'applicant_race.1' in loan.columns:
    print(loan['applicant_race.1'].value_counts())
else:
    # Try to find a similar column (e.g., 'applicant_race_1' or 'applicant_race')
    possible_cols = [col for col in loan.columns if 'applicant_race' in col]
    print(f"'applicant_race.1' not found. Available applicant_race columns: {possible_cols}")
    # Optionally, print value counts for the first found similar column
    if possible_cols:
        print(loan[possible_cols[0]].value_counts())

# Step 3: Combine datasets
loan_m = pd.concat([loan, loan1, loan2, loan3, loan4], ignore_index=True)
print(loan_m.shape)
print(loan_m['action_taken'].value_counts())

# Step 4: Missing values
na_count = loan_m.isna().sum()
print(na_count)

# Step 5: Clean columns (drop ranges)
drop_cols = [
    'denial_reason.2','denial_reason.3','denial_reason.4',
    'co.applicant_race.2','co.applicant_race.3','co.applicant_race.4','co.applicant_race.5',
    'co.applicant_ethnicity.2','co.applicant_ethnicity.3','co.applicant_ethnicity.4','co.applicant_ethnicity.5',
    'applicant_ethnicity.2','applicant_ethnicity.3','applicant_ethnicity.4','applicant_ethnicity.5',
    'applicant_race.2','applicant_race.3','applicant_race.4','applicant_race.5',
    'aus.2','aus.3','aus.4','aus.5',
    'multifamily_affordable_units','total_points_and_fees',
    'prepayment_penalty_term','intro_rate_period',
    'rate_spread','lender_credits',
    'negative_amortization','other_nonamortizing_features',
    'construction_method','manufactured_home_secured_property_type',
    'manufactured_home_land_property_interest',
    'submission_of_application','initially_payable_to_institution'
]
loan_m1 = loan_m.drop(columns=[c for c in drop_cols if c in loan_m.columns])

# Scale income from thousands (as reported in HMDA) to dollars
loan_m1['income'] = loan_m1['income'] * 1000

# Step 6: Filter out business purpose and reverse mortgage
loan_m1 = loan_m1[(loan_m1['business_or_commercial_purpose'] != 1) &
                  (loan_m1['business_or_commercial_purpose'] != 1111)]
loan_m1 = loan_m1[(loan_m1['reverse_mortgage'] != 1) &
                  (loan_m1['reverse_mortgage'] != 1111)]
loan_m1 = loan_m1.drop(columns=['reverse_mortgage','business_or_commercial_purpose'], errors='ignore')

# Step 7: Focus on institutions with >5000 records
if 'lei' not in loan_m1.columns:
    # Try to find a similar column (case-insensitive)
    possible_lei_cols = [col for col in loan_m1.columns if col.lower() == 'lei']
    if possible_lei_cols:
        lei_col = possible_lei_cols[0]
        print(f"'lei' column not found, using '{lei_col}' instead.")
        loan_m1 = loan_m1.rename(columns={lei_col: 'lei'})
    else:
        raise KeyError("No 'lei' column found in loan_m1. Please check your data columns.")

loan_m1L = loan_m1.groupby('lei').filter(lambda x: len(x) > 5000)

# Also check for 'lei' column in LEI lookup table ---
if 'lei' not in LEI.columns:
    possible_lei_cols = [col for col in LEI.columns if col.lower() == 'lei']
    if possible_lei_cols:
        lei_col = possible_lei_cols[0]
        print(f"'lei' column not found in LEI, using '{lei_col}' instead.")
        LEI = LEI.rename(columns={lei_col: 'lei'})
    else:
        raise KeyError("No 'lei' column found in LEI lookup table. Please check your LEI.csv columns.")

# Step 8: Merge lookup tables
loan_m1L = loan_m1L.merge(LEI, on='lei', how='left')

# Also check for 'county_code' column in CC lookup table ---
if 'county_code' not in CC.columns:
    possible_cc_cols = [col for col in CC.columns if col.lower() == 'county_code']
    if possible_cc_cols:
        cc_col = possible_cc_cols[0]
        print(f"'county_code' column not found in CC, using '{cc_col}' instead.")
        CC = CC.rename(columns={cc_col: 'county_code'})
    else:
        raise KeyError("No 'county_code' column found in county_codes.csv. Please check your columns.")

loan_m1L = loan_m1L.merge(CC, on='county_code', how='left')

# Step 9: Convert to numeric
for col in ['income','property_value','interest_rate','loan_term','loan_to_value_ratio']:
    loan_m1L[col] = pd.to_numeric(loan_m1L[col], errors='coerce')

# Step 10: Feature engineering
# debt_to_income_ratio is reported as ranges (e.g. "<20%"), so rank the
# values first, then cut the ranks into 8 equal-sized bins
loan_m1L['DTI_Bins'] = pd.qcut(loan_m1L['debt_to_income_ratio'].rank(method='first'), 8, labels=False)
# Recode action_taken to binary: 1 = loan originated, 0 = everything else
loan_m1L['action_taken'] = np.where(loan_m1L['action_taken'] > 1, 0, 1)

# Step 11: Export
loan_m1.to_csv("loan_m1.csv", index=False)
loan_m1L.to_csv("loan_m1L.csv", index=False)

# Step 12: Visualization examples
plt.hist(loan_m['action_taken'], bins=10)
plt.title("Histogram of Action Taken")
plt.show()

# Check for applicant_race.1 column existence before plotting ---
if 'applicant_race.1' in loan_m.columns:
    sns.countplot(x='applicant_race.1', data=loan_m)
    plt.title("Applicant Race Distribution")
    plt.show()
else:
    # Try to find a similar column (e.g., 'applicant_race_1' or 'applicant_race')
    possible_cols = [col for col in loan_m.columns if 'applicant_race' in col]
    print(f"'applicant_race.1' not found in loan_m. Available applicant_race columns: {possible_cols}")
    if possible_cols:
        sns.countplot(x=possible_cols[0], data=loan_m)
        plt.title(f"Applicant Race Distribution ({possible_cols[0]})")
        plt.show()

# Step 13: Used SAS for predictive modeling, but will run some regressions and ensemble models here
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Features and target
# Define predictor variables (X) for the residential-focused model
feature_cols = [
    # Loan/Application Features
    'loan_amount','loan_type','loan_purpose','lien_status',
    'interest_rate','loan_term','conforming_loan_limit',
    'open-end_line_of_credit','interest_only_payment','balloon_payment',

    # Applicant Demographics
    'derived_ethnicity','derived_race','derived_sex',
    'applicant_age','applicant_age_above_62','income','debt_to_income_ratio',
    'applicant_credit_score_type','co-applicant_credit_score_type',

    # Property & Geography
    'property_value','occupancy_type','state_code','county_code'
]

# One-hot encode the categorical columns and drop rows with missing values,
# since LogisticRegression requires a fully numeric matrix with no NaNs
model_df = pd.get_dummies(loan_m1L[feature_cols + ['action_taken']],
                          columns=[c for c in feature_cols
                                   if loan_m1L[c].dtype == 'object'],
                          drop_first=True).dropna()

# Target variable
X = model_df.drop(columns='action_taken')
y = model_df['action_taken']

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train logistic regression (a higher max_iter helps convergence on
# unscaled features)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
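# The report's champion model was an ensemble built in SAS Enterprise Miner.
# As a rough local point of comparison, here is a minimal scikit-learn sketch
# of a random forest on the same encoded matrix; the hyperparameters are
# illustrative defaults, not tuned values from the report.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print("RF Accuracy:", accuracy_score(y_test, rf_pred))
print("RF Confusion Matrix:\n", confusion_matrix(y_test, rf_pred))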

Data 670 Data Analytics
Brian J. Freund
Professor Jon McKeeby
Assignment 6
6th August 2024

Executive Summary

The project scope is to identify issues in the risk management of financial institutions doing business in the New Hampshire market via Home Mortgage Disclosure Act data ranging from 2018 to 2022. The following report identifies deficiencies in operations that help the non-minority population obtain favorable loan terms. The data will then be used to determine which loans will be originated by the lender, informing future credit policy to acquire more loans. The data has been prepared by examining the different variables and determining the best course of action. The data is skewed toward twenty major financial institutions in New Hampshire, which hold over 50% of the market share out of 843 different lenders in the market. Visualizations show that Rockingham County has the highest property values and saw a 48% increase in property values. Continued modeling will look for ways to improve and identify opportunities in the market to increase this exposure. The modeling seeks the champion model that can be put in place to predict originated loans based on the information provided; the champion model is an ensemble model that is able to determine 95% of the time that the loan should be originated while mitigating the loans that will not be originated. The findings in this analysis show that home values increased by almost 50% in New Hampshire, well above the 29% national average. The analysis also found that minorities were less likely to apply for mortgages during this period based on the census data. The data shows a strong market in New Hampshire, but there are concerns about the rapid increase in home values coupled with a market that still caters disproportionately to white borrowers.

Table of Contents

Executive Summary
Project Scope
  Data Analytics Project/Project Scope
  Problem Importance
  Research/Problem Analysis
  Critical Success Factors (CSFs)
  Key Performance Indicators
  Project Insights of your Data Analysis
  Project Milestones
  Completion History
  Lessons Learned
Data Set Description
  High-Level Data Diagram
  Data Definition/Data Profile
Data Preparation/Cleansing/Transformation
  Data Preparation
  Data Cleansing
  Data Transformation
Data Analysis
Data Visualization
  Descriptive Statistics
  Data Visualization Definitions
  Data Visualization 1
  Data Visualization 2
  Data Visualization 3
  Data Visualizations 4
  Proposed Visualizations
Predictive Models
  Data Modeling Definitions
  Predictive Model 1
  Predictive Model 2
  Predictive Model 3
  Predictive Model 4
  Predictive Model Review
Final Results
  Analysis Justification
  Findings
  Review of KPIs
  Review Significance
Recommendations for Future Analysis
References

List of Figures

Figure 1: Action Taken on the Application
Figure 2: Median Property Values
Figure 3: Percentage of Minority
Figure 4: Top 20 Race Breakdown
Figure 5: Rockingham Bubble
Figure 6: Hyperplane Visualization (Statinov A.)
Figure 7: SVM Results
Figure 8: SVM Confusion Matrix
Figure 9: SVM Classifier Results
Figure 10: SVM Scores
Figure 11: Random Forest Results
Figure 12: Random Forest Confusion Matrix
Figure 13: Random Forest Classifier Distribution
Figure 14: Random Forest Scores
Figure 15: Neural Network Diagram
Figure 16: Neural Network Results
Figure 17: Neural Network Confusion Matrix
Figure 18: Graph of Results
Figure 19: Neural Network Scores
Figure 20: Ensemble and Underlying Models Results
Figure 21: Ensemble Confusion Matrix
Figure 22: Ensemble Classifier Chart
Figure 23: Ensemble and Underlying Models Scores
Figure 24: Champion Model Comparison
Figure 25: ROC Chart for All Top Models
Figure 26: Top 20 Lenders
Figure 27: Median Property Values by Year
Figure 28: Rockingham Loan Data Top 20

Project Scope

Data Analytics Project/Project Scope

The onset of COVID in March 2020 and the push for work from home (WFH) created an exodus of people from large cities like Boston and New York City to more remote locations. As immigration became more restrictive in the late 2010s, major US cities showed slowing growth, which accelerated with the onset of the pandemic (Frey, 2022). During the pandemic, the state of New Hampshire saw a significant rise in home sales driven by this exodus from the large cities on the US East Coast. Over 62% of homes in the US have a mortgage on them (Ortiz, 2024). This gives a good starting point to see how people have moved from the larger cities to New Hampshire. The Home Mortgage Disclosure Act (HMDA) provides the 'most comprehensive data set' of the mortgage industry (CFPB, n.d.). In these reports, a large amount of data is collected, including race, census tract, income, and other variables that help provide more insight into the economy. The years chosen for this analysis are 2018 to 2022; this five-year period shows the trend before COVID and then the aftermath.

Working as a risk management analyst for a large commercial bank, New Hampshire has become a large area of interest, along with what the trends are showing. The analyst will also look for changes in population density utilizing Census data to ensure that all races are able to attain mortgages. The federal government, through the Community Reinvestment Act (CRA), looks to ensure that all lenders are creating opportunity for low- and moderate-income areas as well as across all races (Kenton, 2024). The analysis performed will look at trends across single-family homes from 2018-2022, cross-referenced with census data showing the population distribution. The goal is to look for trends in the lending of the home mortgage industry and whether the needs of all borrowers are covered. The initial conclusion is that minorities in New Hampshire are less likely to apply for and obtain mortgages compared to the census breakdown of race. Based on these findings, the bank can explore ways to be more inclusive and educate minorities on home ownership.

Problem Importance

This project was selected for a few reasons. It has been shown that home ownership is less diverse and that minority groups own fewer homes. In the US, 64.6% of Americans own a home while only 42.1% of Black Americans own a home (USAFacts, 2023). The analysis will consist of looking at the data in New Hampshire from 2018 to 2022 to identify trends that could help with the understanding of this disparity. Are more white Americans able to obtain a mortgage than minorities when all other variables are equal? Access to affordable lending is important, as the weight of loans sometimes bears unfortunately on those who can least afford to pay higher rates.

This analysis will help financial institutions ensure that they are compliant with the HMDA and CRA standards of the federal government. The analysis will also look at the trends in the New Hampshire market and whether any financial institutions are overleveraged in that market and susceptible to a market correction. This will benefit not only the businesses themselves; since banks were catalysts in the last financial crisis, it will also help ensure the soundness of restrictions in lending. The data offers many options for exploration. Lastly, a predictive analysis of loans that were denied and approved will look for any correlation to what made a good decision. From the start of COVID in March of 2020 through the end of 2021, a moratorium was in place to protect those impacted by COVID-19 and ensure they could stay in their homes (Consumer Finance Protection Bureau, 2021).

Research/Problem Analysis

The HMDA datasets that have been selected can be used to see the overall health of the mortgage lending business and to look for ways that financial institutions may not be conforming to the CRA by not providing lending to certain areas (consumerfinance.gov, 2024). On Kaggle, a user analyzed the data from New York to glean some understanding of the market in 2015 (Bukun, n.d.). The gap this analysis fills is how the pandemic affected a smaller market like New Hampshire, which saw a 4.6% increase in population from the 2010 to 2020 census (America Counts Staff, 2021). Were the needs of the minority population met by the mortgage lending industry? New Hampshire is predominantly white, with 88.3% identifying as only white in the state (America Counts Staff, 2021). This lack of diversity could cause a lender to be less involved in education or less motivated to acquire a more diverse portfolio of loans.

Critical Success Factors (CSFs)

Critical Success Factors are 'a business strategy' (Saira Naseer, 2022). CSFs are a way to keep the project on track to meet deliverables. CSFs are also "factors that contribute to an organization's success and are important for the achievement of an organization's mission" (Abid Haleem, 2011). The main objectives of this project are as follows:

  1. Home ownership access for more minorities
  2. Understanding the trends in the mortgage industry
  3. Developing more focused marketing

These CSFs will help make this project an actionable item for the business. The goal is to understand where the market is currently and anything that can be gleaned from this analysis.

Key Performance Indicators

Key Performance Indicators (KPIs) are measurable targets that help a business determine whether it is meeting its goals or mission (Hennigan, 2023). By setting measured goals as KPIs, these benchmarks can help to communicate results and whether the strategy is working. In this project, a few KPIs are employed to ensure that the project stays aligned:

  • Gauge the proportions of applications across demographics compared to the census data; the two should align.
  • Compare home prices, interest rates, and census tracts to see if a sharp increase happened from 2020 to 2022 due to the pandemic.
  • Focus marketing on counties where homes would have larger equity, to promote a home equity line of credit.
  • Build predictive modeling for future demographic growth as well as increasing home values.

These KPIs are explained further in the following sections.

The first KPI is gauging the proportions of applications and how they compare to the census. Mortgage lenders should demonstrate that they serve the entire community and not just a smaller percentage of it. If a lender is found to be out of compliance, it can be fined by the proper regulators; Bank of America was fined $12 million for reporting false mortgage data (Consumer Financial Protection Bureau, 2023).
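As a minimal sketch of how this first KPI could be checked in the Python workflow above: compare application shares by derived race against census proportions. Only the 88.3% white share is cited in this report; the other figures in the dictionary below are hypothetical placeholders to be replaced with real 2020 census values.

# Hypothetical census shares -- substitute actual 2020 census figures for NH
census_share = {'White': 0.883, 'Asian': 0.03, 'Black or African American': 0.015}
app_share = loan_m['derived_race'].value_counts(normalize=True)
for race, expected in census_share.items():
    observed = app_share.get(race, 0)
    print(f"{race}: applications {observed:.1%} vs census {expected:.1%}")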

The next KPI is comparing home prices, interest rates, and census tracts to see if a sharp increase happened due to the pandemic. The increase in applications was driven by lower interest rates and rapid increases in home prices. This led to people taking out second mortgages or home equity lines of credit during this period. If a financial institution does not routinely inspect its outstanding portfolio for home value risk, any change in home values could put the bank at risk of loans not being paid back in full.

The third KPI revolves around using the data to look for trends in areas where home values have increased the most and where owners could have equity built up in their homes. A home equity line of credit can help lenders manage interest rate risk, as the rate is variable and can increase when prime rises. Currently, some banks sit with exceptionally low interest rate mortgages while having to pay more to keep deposits; this is net interest spread, and it is a large area that banks must continue to monitor to stay solvent. If a lender can identify a need in a certain area, it can saturate that area with marketing for a home equity line of credit to offset the continued increase in rates paid on deposits.

Lastly, a KPI surrounds the predictive modeling of future loan and demographic growth and how lenders can market and gear products to this changing demographic. Does a lender need to become more conservative with its credit standards, or is it too conservative compared to other lenders?

Project Insights of your Data Analysis

The data analysis will start by looking at the dataset through ETL operations, as well as using the secondary datasets to gain a fuller understanding of market conditions in the New Hampshire market. Since this data has already happened, we will be able to see how median home values increased in a record-defying manner from 2018 to 2022. This increase is unsustainable. The data will also show how the demographics may have changed and what types of people moved away from the denser Manchester/Concord area to outlying regions, which may help in understanding the market.

Since the data will still be showing the effects of the pandemic on the housing market, with interest rates still at lows because the Fed had not yet begun raising the base rate, we could still see a prediction that shows climbing home values. It will also be shown that the demographics have shifted in New Hampshire, along with where people call home. It is also expected that white males are more likely to secure mortgages than minorities, even accounting for the demographics.

Project Milestones

  1. Define the scope of the work and how to interpret the data set.
  2. Using R, pare the data down to just single-family homes.
  3. Cleanse and deal with NA variables and other multiple-choice variables, especially race.
  4. Compare proportions of the cleansed dataset in R: demographics of loan origination versus the 2020 census data.
  5. Run ensemble models in SAS to look for census growth and equity opportunity due to growth in home values.
  6. Present findings in a presentation using Tableau and other modeling techniques.

Completion History

Week 1: Worked on finding a data set that would work for lending in a financial institution.
Week 2: Upgraded R and began working with the data set and combining it.
Week 3: Explored the data set using R.
Week 4: Continued to explore and began to shape the full problem to ensure that it is a manageable data analysis.
Week 5: Completed a rough draft of the presentation. Recorded a presentation in PowerPoint. Completed some initial analysis on my current employer. Started removing some columns that are statistically insignificant or redundant.
Week 6: Posted the presentation. Began exploring and preparing the data in R to build an understanding of issues that may arise.
Week 7: Found issues with the data regarding redundant variables and appended the columns. Normalized the income and property values due to issues with outliers. Created a narrower dataset looking at the top 20 lenders in the New Hampshire market.
Week 8: Completed more data cleansing after noticing more issues with the data and dropping columns. Experimented with many different visualizations and had trouble deciding which ones would be impactful enough to bring into the paper. Want to get a map involved since the data is from the state of New Hampshire.
Week 9: Worked on changing the action_taken variable to a binary classification. Completed Presentation 2.
Week 10: Built multiple models in SAS to find the champion model. Had to update the dataset a few times in R to fix items that did not process correctly: numeric conversions and binning the DTI column to remove the 20 different answers to the question.
Week 11: Wrote the results and prepared the visualizations for the presentation. Made a few more Tableau changes to get crisper visuals for the PowerPoint.

Lessons Learned

Week 1: Found I was going to need to revisit my RStudio skills and began working with some DataCamp projects.
Week 2: Learned a lot about the HMDA and the information this dataset holds, which makes it a remarkably interesting choice.
Week 3: More R practice on combining datasets and how to search for NA values.
Week 4: Found I was a little behind where I wanted to be, since the next two weeks have two deliverables. Also found that the bank I work for took the most applications for mortgage lending in the five years observed.
Week 5: Learned how to record in PowerPoint, as I had not used that feature. Gained a deeper understanding of working with R to explore the data. After getting the last assignment back, I began recording Lessons Learned and Tasks Completed throughout the week to remember what I had done.
Week 6: Developed an understanding of recording video using PowerPoint. Found a huge disparity amongst the 853 different lenders in the state of New Hampshire: only 20 of them make up over 50% of the dataset.
Week 7: Still getting my feet back under me with R. Spent plenty of time searching for ways to complete tasks that I could easily do in Excel, but wanted the practice of getting the answers in R.
Week 8: As I was building the visualizations, noticed that more data cleansing was needed to make the visualizations meaningful. Learned more R tricks to help with this and feel more comfortable with the data cleansing process in R.
Week 9: Continued to grow my R knowledge and found, as mentioned before, that even when I thought I had covered everything, more issues would arise during modeling; this week it was binning and items being in character format instead of numeric.
Week 10: Got a good understanding of machine learning principles. Really enjoyed all the modeling I was able to create and seeing how the little tweaks were able to create the champion model.
Week 11: Learned more about the data and how it fits into the macro part of the economy and the country. This was really eye-opening.
Week 12: I learned a lot via this project: creating the scope of what I wanted to find out, and that you need to be agile because it might not work out to what you are hoping to find. I also found myself going down other rabbit holes the data invited, but they were not in the scope, which kept me a little grounded.

Data Set Description

For this project, option two was selected for the data set. The data was obtained by accessing the public data from the Home Mortgage Disclosure Act (Consumer Finance Protection Bureau, n.d.). The data is collected from any lender who issues mortgages; it tracks not only demographic information but captures a great deal more. The years 2018 through 2022 were chosen and had to be downloaded as five separate datasets for the state of New Hampshire. The data was combined in R using the rbind function to create the big dataset, which is called loan_m. The dataset grows to 459,438 observations with over ninety-nine variables collected; 2021 had the most applications with 119,667, while 2018 had the fewest at 70,901. The increase in applications during 2020 and 2021 can be attributed to the lower interest rates (Consumer Financial Protection Bureau, 2021).

The combined five-year data set incorporates all loan applications reported for the state. The data is meant to ensure that lenders are lending according to their demographics and that no redlining or discriminatory practices are happening. The lender information is included in the lei variable to distinguish the different financial institutions. The importance of the dataset is that it can show the overall health of the housing market. If lenders are denying more applications, borrowers are stretched and unable to meet current criteria. Alternatively, lenders could be declining to issue new debt because they are concerned about, or overleveraged in, the market and worried about that exposure should an issue arise in the local housing market.

The next few sections present a visualization of the data and then the definition of each variable. During the initial analysis, it was noted that variables with multiple options are rarely filled after the first instance. Take denial_reason-1: it has no null values, but denial_reason-2 is missing for 446,039 records, nearly the entire dataset. This will be dealt with during cleansing, as these columns will be removed. The same pattern is seen in a few other variables like race and ethnicity; removing them will reduce the variable count but not the desired information. Another interesting point is that applicants between the ages of 35-54 make up over half of the respondents across the five years. A lot of interesting information has been collected and can be used by institutions, regulators, and the public to understand how mortgage lending is occurring.

High-Level Data Diagram

A high-level data diagram is shown below. The tools to be used are R for the ETL, SAS for predictive modeling, and Tableau to help with the visualization of the data outcomes.

The diagram shows that five separate data sets, one for each year of data in this study, will be combined in R. The resulting dataset was described above, and close to 500,000 rows will need to be processed. Starting from the left, the five different datasets are merged in R to create the combined dataset. The next step is the ETL and data cleansing process on the combined dataset. Afterwards, the dataset is exported into SAS and Tableau to allow for predictive modeling and visualizations of the data, ensuring a firm understanding of the purpose of the data.

Data Definition/Data Profile

Below are the variables for the data set that was chosen. Regarding ethnicity, race, AUS, and denial reason, the data collected after the first entry is heavy with nulls. This will be addressed and removed in the data cleansing due to its statistical insignificance.
activity_year (integer): the calendar year the data submission covers. Nulls: 0
lei (character): a financial institution's Legal Entity Identifier. Nulls: 0
derived_msa-md (integer): the 5-digit derived MSA (metropolitan statistical area) or MD (metropolitan division) code. Nulls: 0
state_code (character): two-letter state code. Nulls: 0
county_code (integer): state-county FIPS code. Nulls: 1291
census_tract (integer): 11-digit census tract number. Nulls: 1867
derived_loan_product_type (character): product type from loan type (FHA, conventional, etc.). Nulls: 0
derived_dwelling_category (character): dwelling type (single family, manufactured, etc.). Nulls: 0
conforming_loan_limit (character): indicates whether the reported loan amount exceeds the GSE (government-sponsored enterprise) conforming loan limit. Nulls: 1977
derived_ethnicity (character): single aggregated ethnicity categorization derived from applicant ethnicity fields. Nulls: 0
derived_race (character): aggregated race categorization from borrower race fields. Nulls: 0
derived_sex (character): aggregated sex categorization from borrower's sex fields. Nulls: 0
action_taken (integer): the action taken on the loan (target variable). Nulls: 0
purchaser_type (integer): type of entity purchasing a covered loan from the institution. Nulls: 0
preapproval (integer): whether the covered loan had a preapproval. Nulls: 0
loan_type (integer): the type of covered loan or application. Nulls: 0
loan_purpose (integer): the purpose of the covered loan. Nulls: 0
lien_status (integer): lien status of the property securing the loan. Nulls: 0
reverse_mortgage (integer): whether the covered loan or application is for a reverse mortgage. Nulls: 0
open-end_line_of_credit (integer): whether the covered loan is an open-end line of credit or HELOC. Nulls: 0
business_or_commercial_purpose (integer): whether the loan is for a business or commercial purpose. Nulls: 0
loan_amount (integer): amount of the covered loan. Nulls: 0
combined_loan_to_value_ratio (character): combined loan-to-value ratio. Nulls: 131166
interest_rate (character): interest rate for the covered loan. Nulls: 135282
rate_spread (character): difference between the covered loan's APR and the APOR for a comparable transaction as of the date the interest rate was set. Nulls: 184604
hoepa_status (integer): whether it is a high-cost mortgage. Nulls: 0
total_loan_costs (character): the total of loan costs. Nulls: 194346
total_points_and_fees (character): total points and fees charged in connection with the covered loan. Nulls: 445916
origination_charges (character): itemized amounts that are borrower-paid at or before closing. Nulls: 193310
discount_points (character): points paid to the creditor to reduce the interest rate. Nulls: 199232
lender_credits (character): any lender credits. Nulls: 197833
loan_term (character): number of months in the term of the loan. Nulls: 3362
prepayment_penalty_term (character): any prepayment penalty term. Nulls: 420211
intro_rate_period (character): number of months before the rate may change after closing. Nulls: 379632
negative_amortization (integer): a term that could cause the covered loan to be a negative amortization loan. Nulls: 0
interest_only_payment (integer): whether there are interest-only payments. Nulls: 0
balloon_payment (integer): whether there is a balloon payment when the contract ends. Nulls: 0
other_nonamortizing_features (integer): other non-amortizing features. Nulls: 0
property_value (character): value of the property used to secure the loan. Nulls: 87010
construction_method (integer): construction method for the dwelling. Nulls: 0
occupancy_type (integer): occupancy type for the dwelling. Nulls: 0
manufactured_home_secured_property_type (integer): whether the loan is covered by a manufactured home and land or a manufactured home only. Nulls: 0
manufactured_home_land_property_interest (integer): whether the borrower owns the land. Nulls: 0
total_units (character): total number of individual dwellings in the property. Nulls: 0
applicant_age (integer): age of the applicant. Nulls: 0
multifamily_affordable_units (character): value of each individual unit. Nulls: 445942
income (integer): income of the applicant. Nulls: 47992
debt_to_income_ratio (character): monthly debt to monthly income. Nulls: 133636
applicant_credit_score_type (integer): credit scoring model used for the applicant. Nulls: 0
co-applicant_credit_score_type (integer): name and version of the credit scoring model used to generate the co-applicant's score. Nulls: 0
applicant_ethnicity-1 (2-5) (integer): ethnicity of the applicant (can choose up to 5). Nulls: 121
co-applicant_ethnicity-1 (2-5) (integer): ethnicity of the co-applicant (can choose up to 5). Nulls: 42
applicant_ethnicity_observed (integer): ethnicity collected by visual observation of the applicant. Nulls: 0
co-applicant_ethnicity_observed (integer): ethnicity collected by visual observation of the co-applicant. Nulls: 0
applicant_race-1 (2-5) (integer): race of the applicant (can choose up to five). Nulls: 23
co-applicant_race-1 (2-5) (integer): race of the co-applicant (can choose up to five). Nulls: 6
applicant_race_observed (integer): race collected by visual observation of the applicant. Nulls: 0
co-applicant_race_observed (integer): race collected by visual observation of the co-applicant. Nulls: 0
applicant_sex (integer): sex of the applicant. Nulls: 0
co-applicant_sex (integer): sex of the co-applicant. Nulls: 0
applicant_sex_observed (integer): sex of the applicant collected by visual observation. Nulls: 0
co-applicant_sex_observed (integer): sex of the co-applicant collected by visual observation. Nulls: 0
applicant_age_above_62 (character): whether the applicant is above the age of 62 (yes/no). Nulls: 30075
co-applicant_age (character): co-applicant age, in bins. Nulls: 0
co-applicant_age_above_62 (character): whether the co-applicant is above 62 (yes/no). Nulls: 250704
submission_of_application (integer): whether the application was submitted directly to the financial institution. Nulls: 0
initially_payable_to_institution (integer): whether initially payable to the institution. Nulls: 0
aus-1 (2-5) (integer): the automated underwriting system used (up to five choices). Nulls: 0
denial_reason-1 (2-4) (integer): denial reason (up to four choices). Nulls: 0
tract_population (integer): total population in the tract. Nulls: 0
tract_minority_population_percent (integer): percentage of minority population to total population for the tract. Nulls: 0
ffiec_msa_md_median_family_income (integer): FFIEC median family income for the MSA/MD. Nulls: 0
tract_to_msa_income_percentage (integer): tract median family income compared to MSA/MD median family income. Nulls: 0
tract_owner_occupied_units (integer): total number of dwellings occupied by owners. Nulls: 0
tract_one_to_four_family_homes (integer): dwellings built to house fewer than five families. Nulls: 0
tract_median_age_of_housing_units (integer): tract median age of the dwellings. Nulls: 0

(Consumer Finance Protection Bureau, n.d.)

It is also noted that the information surrounding loan costs to the consumer has many nulls. It will be interesting to see what is causing this, or whether it should be ignored during the data preparation phase of the assignment. A nice aspect of the dataset is that many variables have no nulls at all; this will allow for meaningful insights to be derived from the data provided.

Data Preparation/Cleansing/Transformation

Data Preparation

Data preparation is the process of getting the data ready for analysis (Ibarrera, 2018). It is different from using Extract, Transform, Load (ETL) as a tool, which has been seen as more of an IT process (Ibarrera, 2018). The data set employed in this analysis was pulled from five separate years of data from the HMDA website. This data is already easily manipulated and can stay in the form it took at download. The data is combined using R to make the larger dataset described before. This data set will be compared to the general census data, along with a lookup of certain Legal Entity Identifiers (LEI).

Of the variables included, a few are central to the analysis to be performed. Two categorical variables seem especially important. The action_taken variable makes it possible to look at how race, age, and ethnicity affected the loan approval process. Future forecasting will look at the variables that affect the action taken while removing race, age, and ethnicity from the predictors; if credit guidelines are to be effective, demographics cannot be included in that analysis. Lastly, the open-end_line_of_credit variable is used to look for areas where, from a marketing standpoint, more of these loans could be captured. As stated earlier, from an interest rate risk standpoint, low-rate mortgages taken out during the pandemic are weighing on banks as they pay higher interest rates on deposits to keep money deposited. First Republic Bank suffered from low-cost mortgages and did not navigate the swift interest rate rises from the Fed (Saul, 2023). Open-end lines of credit have a variable rate that allows banks to modulate their interest rate risk as it follows the Fed funds rate.

As the study compares against single-family homes, another step is eliminating applications that were for business use. The scope of the project is to look at the health of the residential home market, both in property value increases and in how the entire population is served by financial institutions. The top twenty lenders operating in the state of New Hampshire will help with identifying any failures of these institutions to lend to the public of New Hampshire based on the census. The importance of preparing and exploring the data before any analysis is that the data is cleansed to ensure issues inside the data have been dealt with.

Data Cleansing

Data cleaning, also called data cleansing or scrubbing, is the process of identifying and dealing with errors and other issues to improve the overall quality of the data (Rahm & Do). The first step in ensuring that data cleaning is completed efficiently is understanding the data: an initial look into missing values and then into any outliers in the dataset. In dealing with missing values, the data analyst must decide whether to remove them or 'fix' them with the mean of the column or another number. Use of the mean can raise concerns about overfitting the data. Overfitting is when the analysis of the data set hurts the exploration of new data being brought in, thus affecting predictive models; if a column has 20% missing values and the solution is to fill them with the mean, the model may swing one way or the other depending on the outliers and the overall data.
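As one possible sketch of this step in the Python workflow above: inspect the missing-value share per column first, and if imputation is chosen, prefer a robust statistic such as the median given the extreme outliers noted later in income and property value. (The report ultimately drops the sparsest columns rather than imputing.)

# Share of missing values per column, worst first
missing_share = loan_m.isna().mean().sort_values(ascending=False)
print(missing_share.head(15))

# Example median imputation for one numeric column; the median is less
# sensitive to outliers than the mean discussed above
income_numeric = pd.to_numeric(loan_m['income'], errors='coerce')
print(income_numeric.fillna(income_numeric.median()).describe())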

As described in the previous section, the data set was found to have some columns that are redundant. The columns that contain additional options for demographics, age, and denial reason are removed using R. The reasoning is that, with most of the data being null, each such column would merely be a continuation of the first column of its series, with nulls wherever the first column already holds the answer. The columns concerning loan costs are removed as well, since that data is very incomplete and does not carry statistically significant information for this analysis. The rate spread, the difference between the loan's APR and the average prime offer rate for a comparable transaction, is not needed in this risk analysis. Using R, the subset feature is used to remove these areas.

Another step in data cleansing is to focus the dataset on residential mortgages. The data includes business- or commercial-purpose lending, which is not needed for assessing the overall health of the market. A concern in the residential market has been large companies, like BlackRock, driving prices by buying up single-family homes; investment firms like Invitation Homes focus more on certain markets, and currently only 15% of total homes are owned by these types of companies (Botella, 2021). Though these homes are normally bought with cash and would not trigger an application captured by the HMDA, they are a factor that could help explain a rapid rise in home values. Removing business-purpose properties like strip malls will help to focus the data on individuals instead of business entities.

Data Transformation

Data transformation is the process that converts the data into a unified format; it helps to ensure the compatibility and usability of the data for analysis (Hayes & Downie, 2024). Data transformation looks to standardize the dataset to allow for more meaningful analysis. With big data, the data must be formatted to fit the analysis at hand. Transformed data also allows for improved data visualization. Data normalization is a way to standardize the data into groupings, which helps to eliminate the effect of outliers on the outcomes (Hayes & Downie, 2024). Data splitting is another way to find meaningful information, e.g., a training set and test set for machine learning (Hayes & Downie, 2024).

In the HMDA dataset, data splitting was used to create two datasets: the whole dataset and a dataset with the top twenty lenders in the state. The latter were chosen as those with over five thousand applications in the five years surveyed, which means these organizations were taking almost twenty applications a week during this time. The top lenders in the state were averaging over one hundred applications per week during this period.

Another area to address was data entry errors in the income and property values. The income variable, which is reported in thousands, has a maximum of over 31,255,000, an extreme value when the census data for home values shows that only 3.8% of home values are over $1 million. Outliers in the dataset are handled using bins that place the data into equal groupings, which helps to normalize it. Income and home value are placed into six bins for the analysis to offset the outliers in the data while not losing the other demographic information.
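A minimal pandas sketch of the six-bin grouping described above, mirroring the DTI binning in the script at the top (ranking first avoids qcut errors from duplicate bin edges):

# Six quantile-based bins for income and property value to blunt outliers
for col in ['income', 'property_value']:
    vals = pd.to_numeric(loan_m1L[col], errors='coerce')
    loan_m1L[col + '_bin'] = pd.qcut(vals.rank(method='first'), 6, labels=False)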

Data Analysis

As the volume of data has increased this century, determining the best tools for this analysis matters. The scope of the project is to look for deficiencies in risk management and compliance with lending in the New Hampshire market over the years 2018 through 2022, and to model where home values will grow over the next few years and which areas will see increases in home equity, making them better targets for marketing. The tools used are R for the initial data exploration and cleaning, Tableau for the data visualizations, and SAS Enterprise Miner for the predictive modeling.

R is a good start for the data exploration, as it does a wonderful job of visualizing data as well as statistical analysis (Miller, 2021). R is an open-source technology, and the exploration was processed through this tool. Since exploration looks for possible issues in the data, like outliers and null values, R does a solid job of allowing the data scientist to identify and handle them. Other advantages include the different packages available; the dplyr library is used in this analysis. The package helps to process, filter, and summarize the data.

Tableau is a leader in data visualization. It can handle large amounts of data and produce visualizations quickly. Tableau also works with other programming languages like R, which is used in the data exploration phase, and it does not require a large amount of programming knowledge (Kar, 2023). It was chosen for its ability to build dashboards and powerful visualizations, like geographical maps, that will show where in New Hampshire the greatest application volume and home value increases are occurring.

SAS Enterprise Miner is a multi-faceted tool that allows for both descriptive and predictive modeling. The next step of the project is to deliver an understanding of where home values are increasing and where people can afford to take out open-end lines of credit. Using the five years of data supplied, SAS should be able to model and predict, given that interest rates have climbed and home values have increased, which census tract is the best place to invest marketing spend to increase open-end lines of credit.

Data Visualization

Descriptive Statistics

The HMDA dataset has many types of variables, from categorical to quantitative. Categorical variables can be used to create frequency tables to see how the data is distributed (Bobbitt, 2021). This was completed on the action_taken variable for the loan; the results are below.

Figure 1: Action Taken on the Application

Each number corresponds to a different action, from one (loan originated or completed) to three (application denied) to five (application closed due to incompleteness). It is interesting that about 60% of the applications are originated, and about as many applications are denied as are withdrawn by the applicant. The dataset assists with interpreting certain columns, as the derived race, ethnicity, and sex columns move from a raw categorical code to the actual label. For the scope of the project, whether this information affects origination, or whether the loan is accepted, is important for determining if the financial institutions are serving their communities.
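In the Python workflow above, this frequency table is a short computation; a small sketch showing both counts and proportions:

# Frequency table for the categorical target, as counts and proportions
counts = loan_m['action_taken'].value_counts().sort_index()
props = loan_m['action_taken'].value_counts(normalize=True).sort_index()
print(pd.DataFrame({'count': counts, 'proportion': props.round(3)}))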

Next, the HMDA has many quantitative variables. Quantitative variables are ones that have a numeric value, on which measures like mean, median, and standard deviation can be computed to understand more of the data (Bobbitt, 2021). These tasks were performed in RStudio for this analysis using the summary function. As discussed during the cleansing and transformation of the data, the focus was on non-commercial properties, to understand the residential housing market and how it affects financial institutions. The income variable for those applying for the loans is interesting compared to the median for the entire population of New Hampshire: the median income of those applying for these loans is $101,900. Applicants falling below the state median income could be a cause for concern that people may be stretched. During visualization, it will be interesting to see how those who applied for a loan, and those who were approved, compare to the census information, and also to look through the lens of each year to see whether lower interest rates let people afford homes who could later be in trouble as rising property values push up property taxes. Next, the property value and loan amount are interesting areas in the statistics. The aim is to look for trends in property values during the five years observed; this will be visualized and discussed further, as part of the analysis is to look for trends in the New Hampshire market. The median loan amount for the dataset was 315,000, and the maximum amount borrowed is 31 million; this maximum is quite an outlier compared to the rest of the dataset and could skew summary measures.

A brief discussion of a few of the variables has been given, with further exploration to come during the analysis. Two datasets will be used: the entire dataset, and the dataset condensed to the top twenty financial institutions, which issue over 50% of the applications in the state of New Hampshire. Though only a few visualizations will be shown to understand the dataset more fully, additional depth can be explored to ensure that nothing is being missed.

Data Visualization Definitions

Data visualization 'is the graphical representation of information and data' (Tableau). Visualizing the data can help with understanding trends, outliers, or correlations between variables more easily. A visualization should be easily interpreted and not over-formatted; too much information on a visualization can lead to it being too busy and missing its point. When crafting a visualization, the designer should determine the story behind the representation of the data. Tableau is an excellent tool here, as it helps with 'choosing' the options that may represent the data best.

For the next visualizations, a few techniques will be used. One technique is the line graph; line graphs are excellent because of their simplicity while still being able to show trends in the data easily (Cisneros, 2020). Bar graphs will also be used to interpret the data; bar graphs compare data or frequencies across distinct categories (SplashLearn). Sometimes a simple chart of the data can help to explain the goal as well; it does not need to be flashy, just able to convey the information visually for the client. A bubble chart will be employed to help understand the distribution of the applications; a bubble chart is an excellent way to show differences in data, though not when values are close together (Ricks, 2021). Color variations in a bubble chart can also draw attention to the meaningful dimensions.

Data Visualization 1

The first visualization technique used is the line graph. As described above, it is an excellent way to convey trends in the data, and one of the project's goals is to look at property values increasing and how that affects the market. The graph below shows 2018 through 2022 and the median property values for each county. The insights that can be taken from it, and any resulting changes to the scope of this project, follow.

Figure 2: Median Property Values

There are ten counties in New Hampshire, each represented by a separate color to provide visual distinction. The y-axis shows median property values ranging up to 550K, and the x-axis shows the years 2017 through 2023. There is no forecasting ability due to the amount of data contained; adding prior years could improve forecasting by showing additional trends.

Initial insights from the visualization are that Rockingham County has the highest property values and saw a steep rise in home values from 2018 to 2022. Coos County had the lowest property values and stayed flat from 2021 to 2022, while all the other counties saw some increase. Hillsborough County saw the second-largest rise in property value from 2018 to 2022. The overall market looks strong, as property values increased from 2018 to 2022 in all counties.

Digging into the results, Rockingham County had a 47.8% increase in median property value from 2018 to 2022. The concern is that this increase is not sustainable, and any pullback in the market could cause issues. Homeowners in Rockingham County, though, can look at maximizing the equity in their homes or 'cashing out' against their increased property value. This increase in property values could be attributed to a hot housing market in the Seacoast region of New Hampshire, while other regions, like Coos County, were not seen as favorably.
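A sketch of how this county-level change could be computed in the Python workflow above; 'county_name' is an assumed column contributed by the county_codes.csv lookup merged earlier:

# Median property value by county and year, then the 2018-to-2022 change
pv = (loan_m1L.groupby(['county_name', 'activity_year'])['property_value']
      .median()
      .unstack('activity_year'))
pct_change = (pv[2022] / pv[2018] - 1) * 100
print(pct_change.sort_values(ascending=False).round(1))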

This added information shows that Rockingham County will be a focus when discussing increasing property values and where to aim open-end line of credit marketing. Rockingham County, along with Carroll County, is also the most exposed to a pullback. Carroll moved from sixth among counties in median home values to third in 2022. Loans in Carroll County, and the financial institutions holding them, could be in trouble if a pullback in home values happens.

When looking at the distribution of races in the counties, it is worth looking especially at Rockingham and Carroll to see if the increases priced out minorities. Also, were applications increasing for minorities through the years, or were more white applicants looking to move to New Hampshire? COVID brought about a mass exodus of people from cities, moving people to more rural areas and to places with less restrictive pandemic measures. Carroll County especially encompasses the Lakes Region of New Hampshire, and people were looking to get away from city life.

Data Visualization 2

Tableau allows for the creation of dashboards, which display several types of visualization in one place (Tableau). This allows for a comparison of different information in the dataset, presented in a style that conveys it to the user. The visualization below shows the distribution of all applications and of approved applications, contrasted with the overall minority population. It contains three tables and a bubble chart representing the frequency of white applicants in the data set.

Figure 3: Percentage of Minority

The top left corner shows all applications and their derived race in the form of a bubble chart; bubble charts visualize the frequency of observations in a way that highlights differences in frequency. The All Applications table shows the derived race and its distribution as a percentage in each county. The Only Approved Applications table shows the same information for approved applications only. In the bottom left corner is the table showing the minority population mix in each county.

The first insight is that the distribution of white applicants is almost equal across all ten counties, even though each county's share of the overall population has a broad range, from 31.39% in Hillsborough to 1.55% in Coos. The data shows that the minority population is branching out to the more rural and less diverse counties of New Hampshire. It is concerning that Rockingham, with 27.55% of the overall population, has only 4% of the approved applications in the county coming from minority applicants.

Another insight is that the Asian population is moving into the southern part of the state, primarily in three counties: Hillsborough, Rockingham, and Merrimack. These counties are close to the interstates, both 93 and 95, which allow easier access to Boston. These counties also have the largest minority populations. Among other possibilities, this could mean that more bilingual bankers may be needed in these areas to help the growing Asian population.

A KPI for this project was to get minority approvals to 12% of applications, matching the minority percentage of the entire state. After looking at the initial evidence, it is clear that this goal is far from achievable within this analysis. The three counties mentioned above that house the highest minority populations lagged the rest of the state in minority applications. The project scope should shift to a more meaningful KPI that is achievable in the near term, like 8% statewide, while pushing Hillsborough, Merrimack, and Rockingham closer to 10%, since those counties hold close to 70% of the minority population.

The project scope was to look at how the population is served by financial institutions and their lending practices. It is noted that the financial institutions are not performing to the goal of this project scope, and the scope will need meaningful changes that look at the risk and at how improvements can be made to better address the disparities between the population and those taking out home loans.

Data Visualization 3

The exploration continues to look for issues with lending practices in the state of New Hampshire. An analysis of the top 20 lenders in the state will help surface those issues; these twenty financial institutions account for over half of the applications taken in the state. Understanding their practices and any outliers could be a good place to start for regulators and compliance teams. The table below shows the derived race percentages for each of these twenty financial institutions.

Figure 4: Top 20 Race Breakdown

The chart lists the twenty financial institutions in the first column and is sorted in descending order by the percentage of white applicants. The remaining columns are ordered from the smallest percentage of applicants up to the last column, white. There are seven race columns, from 'two or more races' through Asian.
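A hedged sketch of how a table like Figure 4 could be reproduced from the merged data in the Python workflow above, assuming the LEI lookup contributed an institution-name column (called 'name' here purely for illustration):

# Race mix per lender as row percentages, mirroring the Figure 4 table
race_mix = pd.crosstab(loan_m1L['name'], loan_m1L['derived_race'],
                       normalize='index') * 100
print(race_mix.sort_values('White', ascending=False).round(1))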

Over 95% of the applicants in this dataset were white. Asian applicants were the next largest group at 2.3%, and it is concerning that less than 1% of applicants were African American. This shows that the entire community is not being served by these institutions, so looking for ways an organization can improve its reach into minority communities is important. Bank of America had over 4% of its applications from Asian applicants, and Freedom Mortgage had close to 2% from African American applicants.

Northeast Credit Union has the highest share of white applicants, at almost 97%. Northeast Credit Union can look for ways to make its client base more inclusive of minorities; one idea is to review where its branches are located and whether they serve the entire community or lean toward white communities. TD Bank and Service Credit Union are also over 96% white applicants. Interestingly, Freedom Mortgage, at just below 95% white applicants, had the second-lowest Asian percentage but the largest share of African American applicants; a financial institution looking to grow in the African American segment can study what Freedom Mortgage is doing.

The percentage of minority applicants is well below the actual census data: close to 12% of New Hampshire's population is a minority, but just under 5% of applicants over the five years analyzed were minorities. Approved applications did not improve those numbers much, which suggests that education on how to obtain a loan may be needed in these communities. A line graph or bar graph could show whether this has always trended this way or whether more minorities are now applying for mortgages.
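As a minimal sketch of that trend check in the same Python stack used for the data preparation, the snippet below assumes the cleaned, combined 2018-2022 frame is named df and carries the standard HMDA fields activity_year and derived_race; those names are assumptions about the working frame, not values confirmed in this report.

import matplotlib.pyplot as plt

# Assumption: df is the cleaned 2018-2022 frame with 'activity_year'
# and 'derived_race'. Rows where race was not reported are excluded
# so they are not miscounted as minority.
known = df[df['derived_race'] != 'Race Not Available']

minority_share = (
    known.assign(is_minority=known['derived_race'].ne('White'))
         .groupby('activity_year')['is_minority']
         .mean() * 100
)

minority_share.plot(marker='o')
plt.xlabel('Year')
plt.ylabel('Minority share of applications (%)')
plt.title('Minority application share by year, 2018-2022')
plt.show()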

The next step is to explore the data for further trends in these areas, bringing in interest rates and debt-to-income to test for correlation. Can the credit guidelines each organization uses be determined? Do financial institutions need a review to mitigate risk in their organizations? Presenting these initial visualizations, and building models that can predict the next steps in the housing market in New Hampshire, can help position an institution to be more profitable.

Data Visualization 4

Again using a dashboard in Tableau, the picture below represents an analysis of the Top 20 lenders in Rockingham County. On the left is a bubble chart showing the distribution of approved loan applications. The chart on the right shows a few of the variables: the financial institution, how many applications were approved during the five-year period, the average home loan amount, and the total portfolio value for each institution. Across these twenty institutions, over $17 billion in loans were extended during this time.

Figure 5: Rockingham Bubble Chart. Citizens Bank had the most approved applications by percentage, with just under 15% of the approved applications in Rockingham County; Rocket Mortgage had the next largest share at 13.5%. These two institutions are an interesting contrast: Citizens Bank has a physical presence in the market while Rocket Mortgage does not, and the two operate different lending models. Rocket Mortgage focuses on refinances of existing mortgages, which saw a boom from 2020 through 2022 as interest rates bottomed out. Citizens Bank operates in the open-end line of credit space, or home equity lines of credit, which account for almost 75% of the applications it received. Notably, of the top six lenders in the market, only Citizens Bank has a physical presence in the state of New Hampshire.

Next, the project scope looked at the risk in the market if a steep decline in property values were to happen, and which lenders could be in trouble. The loan-to-value variable helps show the risk each lender carries: the higher the loan-to-value at origination, the more likely the borrower could end up underwater, as was seen in the 2008 financial crisis. A few lenders show cause for concern if property values were to decrease. Residential Mortgage Service averages an 80% loan-to-value on $783 million in originated loans and could be susceptible to market conditions in Rockingham County dropping property values.

This visualization helped show that the market carries some of the risk seen in the First Republic Bank failure: low interest rates on the loan side, combined with having to pay more to keep money on deposit, eliminated net interest income and turned it negative. Rocket Mortgage, with the most assets under management here, is paying an average of 3.41% on $337.9 million in MBS backed by prime borrowers with high median incomes in order to raise funds (Mitchell, 2022). Is this going to be another Big Short, as financial institutions look for ways to raise capital to issue new loans now that the federal government has stopped buying mortgage securities?

This visualization supports the need for this analysis, to ensure that regulations are not rolled back in ways that allow a repeat of the issues that arose when the housing market burst. The next steps will build predictive models: if home values decrease by 15%, what percentage of mortgages would be underwater, and which lenders would be in trouble? Rocket Mortgage issuing MBS backed by Bank of America could put one of the Big 4 banks at undue risk. The predictive modeling will therefore examine at what point a decrease in home values, or interest rates remaining at fifteen-year highs, would cause problems in the risky part of the market. This is a slight change to the project scope, driven by the risk identified through this analysis.

Proposed Visualizations

The visualizations above establish the scope of this project: improving the understanding of the housing market and how individual financial institutions are positioned within it. A heat map would visually show how property values increased by census tract, and it can be contrasted with the included census data to show divergence from the population.

A heat map shows frequency counts across two variables so that increases can be seen visually. The data has shown that white applicants dominate this dataset, which could mean more whites are pursuing home ownership. To understand where minority homebuyers are concentrating, a heat map could be keyed to specific minority groups by census tract within each county. Do minorities see certain areas as a hotbed of people who are like them?
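A minimal pandas/seaborn sketch of this idea follows, assuming the cleaned frame df carries county_code and derived_race; swapping county_code for the HMDA census_tract field would give the tract-level view proposed above. The frame name and the county-level grain are assumptions for illustration.

import seaborn as sns
import matplotlib.pyplot as plt

# Assumption: df holds the cleaned applications with 'county_code'
# and 'derived_race'. Counts per county/race pair feed the heat map.
counts = (
    df.groupby(['county_code', 'derived_race'])
      .size()
      .unstack(fill_value=0)
)

sns.heatmap(counts, annot=True, fmt='d', cmap='YlOrRd')
plt.title('Applications by county and derived race')
plt.tight_layout()
plt.show()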

An organization can use this information to find centers of influence for getting information out to prospective clients. Homeownership can be a confusing and overwhelming process, but with education and outreach this gap could be closed. The heat map can show which tracts would benefit most from focused marketing. Lastly, this visualization supports the CRA compliance portion for the financial institution if it can unlock the minority community applying for loans through its organization.

The next visualization is a map of New Hampshire broken down to the census tract, which is even more granular than the counties. Tableau's map function can handle this and allows the user to dive inside the counties. Measures such as median property value, median income, and derived race can be added and color coded to show where property values and incomes are highest.

The map would use a background layer to show where the population centers are, helping viewers understand where the cities and concentrations of population sit in New Hampshire. Showing roadways, especially the large highways that move people around the state and into the major metropolitan area of Boston, adds another layer of understanding for those unfamiliar with the state. Are home values increasing around these highways, or do people want to be more rural?

Another layer on this map would overlay the physical branches of the different financial institutions, which could reveal whether a physical presence corresponds to an uptick in loan volume. As pointed out, Rockingham County has the highest property values in New Hampshire, so financial institutions would be wise to put physical locations in those areas. The median income layer can indicate the affluence of the client base in each census tract, positioning financial institutions to be proactive in the market.

Predictive Models

Data Modeling Definitions

Data modeling "is the process of creating a visual representation of either a whole information system or parts of it to communicate connections between data points and structures" (IBM). The process organizes the data so it can be recalled and used for different business purposes. In this dataset, the data is joined in very logical terms, allowing easy-to-understand inferences between the variables. This structure allows machine learning models to be built to determine the best variables for predicting loan origination.

Machine learning is a branch of artificial intelligence (AI) and computer science that uses data to create algorithms the AI can learn from, improving its predictions over time (IBM). SAS provides a system in which multiple models can be developed quickly to determine a champion model. The focus of modeling is the model's accuracy against the actual data; the model that predicts best becomes the champion model. The models that can be developed in SAS range from decision trees and logistic regression to more complex models like neural networks and support vector machines (SVMs). These types of models are built and discussed in the following pages with the goal of finding the champion model.

The target variable is action_taken, with originated loans coded as one and all other loans as zero. Overall accuracy of the model is the least important measure; the focus will be on the sensitivity and specificity of the modeling. Sensitivity is the true positive rate, true positives versus all actual positives, while specificity is the true negative rate, true negatives versus all actual negatives. Other metrics will be used to understand how each model performs. As stated before, SAS allows multiple models to be built in quick succession, with easy tuning of parameters.

Predictive Model 1

A support vector machine (SVM) uses a line, or in higher dimensions a hyperplane, as its classifier; it maximizes the distance between the classes in a dimensional space (IBM.com). SVMs are particularly good at classification problems because they look for relationships between the variables and how those relationships separate on a plot. SVMs are summarized as either linear or nonlinear models. Kernels are mathematical functions that determine how to weigh the data in order to find the best plane for classification; the nonlinear kernels include polynomial, radial basis function (RBF), and sigmoid, each looking for the best possible separation of the data. Below is a representation of a hyperplane.

Figure 6: Hyperplane visualization (Statinov A.). The important point is that data is not always linearly separable and can be dissected to find the best possible decision surface, for example to say these people have cancer and these do not.

In this project, SAS made it easy to create multiple models quickly and compare them inside one diagram. Multiple SVMs were created, and the best one was the linear model. This makes sense because the variables used, like loan-to-value (LTV) and debt-to-income (DTI), follow a linear trajectory: a lower LTV or DTI gives the borrower a better chance that the loan will be approved. This pattern will recur in the other models, as the more elaborate models were not able to predict loan approval more accurately.

The dataset is separated into training and validation sets, with 70% in training and the other 30% in validation. Note also that the dataset is not badly imbalanced, which avoided the need for a model that accounts for an unbalanced dataset: around 60% of the loans were originated and the rest were not. One caveat to discuss further in the review is that some loans were approved but the borrower did not take the loan, possibly affecting the results.
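The modeling itself was done in SAS; as an illustrative, non-authoritative Python equivalent, the sketch below assumes a numeric feature frame X (e.g., LTV, DTI, income, loan amount) alongside the cleaned frame df, and reproduces the target coding and the 70/30 split before fitting a linear SVM. X and df are assumed names, and HMDA's action_taken code 1 means originated.

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Assumption: X holds the numeric predictors; HMDA codes 1 = originated.
y = (df['action_taken'] == 1).astype(int)

# 70/30 split, stratified so both sets keep the ~60/40 class mix.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.70, stratify=y, random_state=42)

# Linear SVM; features are standardized because SVMs are scale-sensitive.
svm = make_pipeline(StandardScaler(), LinearSVC())
svm.fit(X_train, y_train)
print(f"validation accuracy: {svm.score(X_valid, y_valid):.4f}")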

A confusion matrix is used to determine the overall accuracy of the predictive model (Murel & Kavlakpglu, 2024). Below is a table showing how the data was computed.

Model        Data Type   False Negative   True Negative   False Positive   True Positive   Observations
Linear SVM   TRAIN       2717             31970           6479             52882           94048
Linear SVM   VALIDATE    1948             23958           4879             39750           70535

Figure 7: SVM results. The model predicts originated loans well but lets too many loans that should not be originated slip through as positives. The confusion matrix is also the place to check for overfitting between training and validation. Overfitting is when the training and validation data have a significant difference in accuracy; it can mean the training data contains noise or information that is not needed (IBM). Note that when all variables were left in, interest rate became the dominant factor in whether a loan was originated. That variable was removed, because a person will simply take the lowest interest rate offered once their loan is approved.

SVM - Linear (Train)    Predicted 0   Predicted 1
Actual 0                31970         6479
Actual 1                2717          52882
Accuracy: 90.22%

SVM - Linear (Validation)   Predicted 0   Predicted 1
Actual 0                    23958         4879
Actual 1                    1948          39750
Accuracy: 90.32%

Figure 8: SVM Confusion Matrices. As seen above, the SVM has not been overfit, as both sets predict at close to the same rate.
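As a quick arithmetic check, the accuracies in Figure 8 follow directly from the Figure 7 counts; this small sketch just reproduces the division.

# Figure 7 counts for the linear SVM.
train = {'tn': 31970, 'fp': 6479, 'fn': 2717, 'tp': 52882}
valid = {'tn': 23958, 'fp': 4879, 'fn': 1948, 'tp': 39750}

def accuracy(c):
    # Accuracy = correct predictions over all observations.
    return (c['tp'] + c['tn']) / sum(c.values())

print(f"train accuracy:      {accuracy(train):.4f}")  # ~0.9022
print(f"validation accuracy: {accuracy(valid):.4f}")  # ~0.9032
# Near-identical train/validation accuracy is the overfitting check above.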

SAS creates multiple diagrams to help determine overall accuracy. The confusion matrix is a good table for understanding what happened, but as mentioned earlier, the model predicted originated loans well while allowing many loans that should not be originated to cross over into the originated class. This is shown in the bar graph below.

Figure 9: SVM Classifier Results. This model predicts a noticeable number of false positives, which are loans that should not be originated.

Accuracy shows how the model does at predicting all outcomes (Evidently AI). Precision shows how often the model's positive predictions are correct; it is the ratio of true positives to all predicted positives (Evidently AI). Recall, or sensitivity, is the ratio of true positives to all actual positives; it shows how well the model captures the outcomes that should be predicted rather than losing them (Evidently AI). Lastly, specificity is how well the model predicts the zeros, or negatives, which in this case are the loans not originated.

Model        Data Type   Accuracy   Precision   Recall (Sensitivity)   F1 Score   ROC Index
Linear SVM   TRAIN       0.90222    0.890854    0.951132               0.920007   0.97
Linear SVM   VALIDATE    0.903211   0.890676    0.953283               0.920917   0.97

Figure 10: SVM scores. As mentioned, the model is accurate 90.32% of the time, and 89.07% of its positive predictions are true positives. The best statistic, though, is that it predicts 95.33% of the loans being originated, which is a strong measure.

Two other measures in the chart above are the F1 score and the ROC index. The F1 score captures the relationship between precision and recall (Rose, 2021), helping show both how well the model predicts and how much the loans that should not be originated are affecting it. The F1 score here is particularly good at 92.09%. Lastly, when the models are compared later to find the champion model, the receiver operating characteristic (ROC) curve plots the true positive rate against the false positive rate at different classification thresholds (Agarwel, 2024); the area under the curve represents how accurately the model separates the true positives from the true negatives and gives a picture of its overall accuracy.
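These scores can be reproduced from the Figure 7 validation counts; the sketch below also adds specificity, which Figure 10 does not list.

# Validation counts from Figure 7.
tn, fp, fn, tp = 23958, 4879, 1948, 39750

precision   = tp / (tp + fp)   # ~0.8907: predicted positives that are real
recall      = tp / (tp + fn)   # ~0.9533: real positives that are found
specificity = tn / (tn + fp)   # true negative rate
f1          = 2 * precision * recall / (precision + recall)  # ~0.9209

print(f"precision={precision:.4f} recall={recall:.4f} "
      f"specificity={specificity:.4f} f1={f1:.4f}")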

The project scope was to determine the variables that make for a successful originated loan so that the decision could be generated by the computer. In the process, the goal was to eliminate as many of the loans that should not be originated while keeping as many of the good ones as possible.

Predictive Model 2

Random forest is another machine learning technique; it utilizes multiple decision trees to reach a single result and works for both classification and regression problems. Because a random forest contains many different decision trees, multiple initial questions can be asked of the data; the idea is that more knowledge can be gleaned from the masses than from a single process (IBM). A random forest model has three hyperparameters that need to be set: node size, number of trees, and the number of features sampled (IBM).

SAS allows these hyperparameters to be changed and rerun easily to develop the best possible model. Each decision tree samples the data separately and randomly, which introduces variation across trees; as the trees are constructed to look for information in the data, the sum of the parts is greater than any individual part of the model. This approach helps reduce the risk of overfitting, though the resulting forest of decision trees can be difficult to interpret (IBM).

The random forest model used the same construction as the other models, including dropping the race, age, and ethnicity variables. Keeping the parameters consistent matters because the winner might not be the model type itself but the tuning of the variables that achieves the desired result. The model that won out built two hundred trees and used a random 50% of the data for each tree.
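For illustration only (the actual model was built in SAS), a rough scikit-learn analogue of the winning 50/200 configuration might look like the following, reusing the X_train/y_train split from the earlier SVM sketch; the parameter names are scikit-learn's, not SAS's.

from sklearn.ensemble import RandomForestClassifier

# Rough analogue of the SAS HP Forest "50/200" setup: 200 trees, each
# fit on a random 50% sample of the training rows.
forest = RandomForestClassifier(
    n_estimators=200,     # number of trees in the forest
    max_samples=0.5,      # fraction of rows bootstrapped per tree
    max_features='sqrt',  # features considered at each split
    random_state=42,
)
forest.fit(X_train, y_train)
print(f"validation accuracy: {forest.score(X_valid, y_valid):.4f}")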

Below are the initial results of the raw data from the random forest.

Model                Data Type   False Negative   True Negative   False Positive   True Positive   Observations
HP Forest - 50/200   TRAIN       3472             27618           10831            52127           94048
HP Forest - 50/200   VALIDATE    2601             20736           8101             39097           70535

Figure 11: Random Forest Results. At first glance the false positive count is higher than the SVM's, but the model identified the true positives at close to the same level. The confusion matrices show that the accuracy is lower.

Random Forest (Train)   Predicted 0   Predicted 1
Actual 0                27618         10831
Actual 1                3472          52127
Accuracy: 84.79%

Random Forest (Validation)   Predicted 0   Predicted 1
Actual 0                     20736         8101
Actual 1                     2601          39097
Accuracy: 84.83%

Figure 12: Random Forest Confusion Matrices. Diving further into the confusion matrices, the training and validation data produced nearly identical results, so overfitting does not seem to have seeped into the model. Getting the same results on held-out data suggests the model can be used against new data.

As mentioned, understanding how well the model predicts the loan origination decision is important to the scope of the project. Below is how the data is distributed across positives and negatives.

Figure 13: Random Forest Classifier distribution. The model does a respectable job of identifying the positives, but it also captures a sizable number of false positives.

Moving on to the more advanced metrics of accuracy, recall, and precision: the model captures a sizable number of false positives, which pulls precision down to 82.78%.

Model                Data Type   Accuracy   Precision   Recall (Sensitivity)   F1 Score   ROC Index
HP Forest - 50/200   TRAIN       0.847918   0.827965    0.937553               0.879358   0.97
HP Forest - 50/200   VALIDATE    0.848274   0.828361    0.937623               0.879612   0.97

Figure 14: Random Forest scores. Recall is close to the previous model's, as the forest did an impressive job of identifying the actual positives in the dataset. The concern is the large number of false positives, meaning more loans that should not be originated are captured and precision suffers. The F1 score levels this out, since it evaluates the model on both precision and recall, and the ROC index is strong at 0.97. Overall, this is an effective model when finding the true positives is all that matters and the cost of extra false positives is acceptable, such as screening for cancer patients or mitigating risk in insurance; but in this study the scope looks to increase the loans being originated, and including loans that should not be originated is a concern.

The scope of the project remains unchanged by these data points. The goal is still to find a model that achieves both strong precision and strong recall at the same time; approving too many loans that should not be originated could lend to people who will default. So another model is explored, looking for one that executes on both principles.

Predictive Model 3

Neural networks are machine learning models that work and make decisions in a way similar to the human brain; they use weights to determine when to act, just as the brain does (IBM). A neural network starts with the input variables and passes them through hidden nodes, or layers, weighting the decisions to find the best possible outcomes. The network continues to run until it finds the iteration that minimizes the error in the system, measured as the mean squared error (MSE) (IBM). The best-performing neural network here was set to run up to one thousand iterations to find the best one.
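Continuing the illustrative scikit-learn sketches (the real model is a SAS HP Neural node), a rough analogue with one small hidden layer and a 1,000-iteration cap might look like this; the three-node layer mirrors the diagram described below and is an assumption about the SAS architecture, not a confirmed setting.

from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# One hidden layer of three nodes, capped at 1,000 iterations.
# early_stopping approximates SAS keeping its best iteration.
net = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(3,), max_iter=1000,
                  early_stopping=True, random_state=42),
)
net.fit(X_train, y_train)
print(f"validation accuracy: {net.score(X_valid, y_valid):.4f}")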

The neural network results follow. Notice the neural network diagram: the weights lead into three hidden nodes through which the variables flow, like neurons branching off to find the best course of action. The purple lines designate the offshoots of the neurons proceeding to the different nodes with their weights, finishing with the action_taken decision.

Figure 15: Neural Network Diagram. The weights, as mentioned before, determine how much each variable affects the classification. The neural network ran 295 iterations until it found the best possible scenario; SAS allows this cap to be varied to ensure the data is not overfit. The results are stated below.

Model       Data Type   False Negative   True Negative   False Positive   True Positive   Observations
HP Neural   TRAIN       2831             32965            5484             52768           94048
HP Neural   VALIDATE    2097             24631            4206             39601           70535

Figure 16: Neural Network Results. Initially, the neural network does an impressive job of finding the true positives while not bringing over as many false positives as the models seen before.

The confusion matrix checks for overfitting between the training and validation data; the goal is for accuracy to remain about the same, which shows the data has not been overfit.

Neural Network (Train)   Predicted 0   Predicted 1
Actual 0                 32965         5484
Actual 1                 2831          52768
Accuracy: 91.16%

Neural Network (Validation)   Predicted 0   Predicted 1
Actual 0                      24631         4206
Actual 1                      2097          39601
Accuracy: 91.06%

Figure 17: Neural Network Confusion Matrices. The accuracy changed little between the training and validation sets, so it can be stated that the model is not overfit.

Another visualization is the bar graph; the raw numbers in the confusion matrix do not convey how the data breaks down. As shown, in both the training and validation datasets the model captures most of the true positives while fewer false positives make their way into the positive class.

Figure 18: Graph of results. This matters because the overall goal of the project is to capture as many true positives as possible while minimizing the saturation of false positives.

Now look at the table below to see how accuracy, precision, and recall performed for the neural network.

Model       Data Type   Accuracy   Precision   Recall (Sensitivity)   F1 Score   ROC Index
HP Neural   TRAIN       0.911588   0.905857    0.949082               0.926966   0.97
HP Neural   VALIDATE    0.91064    0.903988    0.94971                0.926285   0.97

Figure 19: Neural Network scores. The neural network has the best performance metrics so far. Precision is over 90%, meaning it performed best at avoiding false positives on the target variable. Recall, though not quite the best (that belongs to the linear SVM), still predicts close to 95% of the actual positives. Lastly, it posts the best F1 score shown so far; the F1 score captures the relationship between precision and recall and helps gauge the model's efficiency at predicting the desired variable.

Once again, the project scope is predicting which variables and which applications will make it to loan origination. The business importance is that credit policies can be changed to meet current demand. Eventually, running the model against market scenarios, such as changed DTI or LTV limits if a housing bubble bursts, can show how the overall credit policy would hold up.

Predictive Model 4

Ensemble models combine multiple models to gather more information and find the best solution; they are a machine learning approach that uses each model's strengths to improve the overall result (Alhamid, 2022). When important decisions are made, gathering information from multiple sources helps produce a more informed decision, and ensemble models operate under that same premise. The models combined here are a decision tree, a logistic regression model, and the aforementioned neural network. A decision tree is a machine learning model that uses branches and leaves to reach the correct decision; decision trees were touched on when random forests were discussed, and they are easier to understand because the decision extends from branches into leaves that reveal what can be determined from the data (IBM).
Logistic regression, another machine learning model, estimates the probability of an event occurring based on independent variables (IBM). It uses that probability to decide whether the result should be a 0 or a 1: below .5 rounds down to zero, above rounds up to one. Logistic regression is great for classification, whether binary or multiclass, and with this project's binary target it was a good model to add to the ensemble. The initial results of the three individual models, and how they improved when put together, are found below.

Model                   Data Type   False Negative   True Negative   False Positive   True Positive   Observations
Tree                    TRAIN       2532             30996            7453             53067           94048
Tree                    VALIDATE    1846             23236            5601             39852           70535
Logistic Regression     TRAIN       3013             32624            5825             52586           94048
Logistic Regression     VALIDATE    2241             24423            4414             39457           70535
HP Neural               TRAIN       2831             32965            5484             52768           94048
HP Neural               VALIDATE    2097             24631            4206             39601           70535
Ensemble - Tree & Log   TRAIN       2931             33039            5410             52668           94048
Ensemble - Tree & Log   VALIDATE    2160             24726            4111             39538           70535

Figure 20: Ensemble and underlying model results. All three models did an excellent job of predicting the true positives, though each had varying success at sorting out the false positives. None of the three appears to have been overfit, which can be a particular concern for logistic regression models.
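Continuing the illustrative scikit-learn sketches, the same three-model ensemble could be approximated with soft voting, which averages predicted probabilities in the spirit of SAS's posterior-probability averaging; the tree depth shown is a hypothetical setting, not the SAS one, and net is the pipeline from the earlier neural network sketch.

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Decision tree + logistic regression + neural network, combined by
# averaging predicted probabilities (soft voting).
ensemble = VotingClassifier(
    estimators=[
        ('tree', DecisionTreeClassifier(max_depth=6, random_state=42)),
        ('logit', LogisticRegression(max_iter=1000)),
        ('neural', net),  # MLP pipeline from the previous sketch
    ],
    voting='soft',
)
ensemble.fit(X_train, y_train)
print(f"validation accuracy: {ensemble.score(X_valid, y_valid):.4f}")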

The confusion matrix below shows that combining the three models into the ensemble did not introduce overfitting.

Ensemble - Log, Neural, Tree (Train)   Predicted 0   Predicted 1
Actual 0                               33039         5410
Actual 1                               2931          52668
Accuracy: 91.13%

Ensemble - Log, Neural, Tree (Validation)   Predicted 0   Predicted 1
Actual 0                                    24726         4111
Actual 1                                    2160          39538
Accuracy: 91.11%

Figure 21: Ensemble Confusion Matrices. The accuracy of the training and validation sets is almost equal, tighter than in the other predictive models. Though accuracy, as stated earlier, is not the key statistic in this process, it suggests that both precision and recall will see gains.

The graphical representation below shows that the ensemble does not capture too many of the false positives.

Figure 22: Ensemble Classifier Chart. The business importance of predicting whether a loan will be originated is in minimizing the cost of these loans: the goal is that once an application goes through the entire process, it is originated rather than pulled due to extenuating circumstances.

The chart below shows the underlying statistics for the individual models as well as the ensemble.

Model                   Data Type   Accuracy   Precision   Recall (Sensitivity)   F1 Score   ROC Index
Tree                    TRAIN       0.893831   0.876851    0.95446                0.914011   0.96
Tree                    VALIDATE    0.894421   0.876774    0.955729               0.914551   0.96
Logistic Regression     TRAIN       0.906027   0.900276    0.945808               0.92248    0.97
Logistic Regression     VALIDATE    0.90565    0.899387    0.946256               0.922227   0.97
HP Neural               TRAIN       0.911588   0.905857    0.949082               0.926966   0.97
HP Neural               VALIDATE    0.91064    0.903988    0.94971                0.926285   0.97
Ensemble - Tree & Log   TRAIN       0.911311   0.906849    0.947283               0.926625   0.97
Ensemble - Tree & Log   VALIDATE    0.911094   0.905817    0.948199               0.926523   0.97

Figure 23: Ensemble and underlying model scores. The neural network was covered in the last section, so consider the other two models that went into this ensemble. The decision tree was selected because it had the best recall of any model explored, with the goal of improving identification of the true positives. The logistic regression was close to being a champion model overall and did an excellent job of limiting false positives, with a precision score remarkably close to the neural network's.

An ensemble model's goal is to use multiple machine learning models so that each can detect patterns in the variables differently; the sum is greater than the parts. Combining the three models improved the F1 score slightly, along with the accuracy, precision, and recall. The importance of this small improvement is over a hundred applications moved from the false positive category back to the true negatives, meaning resources are not wasted on loans that would never originate in the first place.

The project scope is maintained, and a champion model has been determined from the results presented. The business purpose of automating which loans will be originated is to consolidate resources and focus on a higher percentage of loans being originated. Based on the recall figures presented, the business can expect to originate almost 95% of its loans by using this model, a business success story that will eliminate some costs.

Predictive Model Review

Many predictive models were created in different iterations; the four discussed above were the champions of their style and hyperparameters. The grid below shows the results of all the models in one chart.

Model                   Data Type   Accuracy   Precision   Recall (Sensitivity)   F1 Score   ROC Index
Tree                    TRAIN       0.893831   0.876851    0.95446                0.914011   0.96
Tree                    VALIDATE    0.894421   0.876774    0.955729               0.914551   0.96
Logistic Regression     TRAIN       0.906027   0.900276    0.945808               0.92248    0.97
Logistic Regression     VALIDATE    0.90565    0.899387    0.946256               0.922227   0.97
HP Neural               TRAIN       0.911588   0.905857    0.949082               0.926966   0.97
HP Neural               VALIDATE    0.91064    0.903988    0.94971                0.926285   0.97
HP Forest - 50/200      TRAIN       0.847918   0.827965    0.937553               0.879358   0.97
HP Forest - 50/200      VALIDATE    0.848274   0.828361    0.937623               0.879612   0.97
Linear SVM              TRAIN       0.90222    0.890854    0.951132               0.920007   0.97
Linear SVM              VALIDATE    0.903211   0.890676    0.953283               0.920917   0.97
Ensemble - Tree & Log   TRAIN       0.911311   0.906849    0.947283               0.926625   0.97
Ensemble - Tree & Log   VALIDATE    0.911094   0.905817    0.948199               0.926523   0.97

Figure 24: Champion Model Comparison. The ensemble model combining the decision tree, logistic regression, and neural network was the champion. It did not have the best recall (the linear SVM did), but it classified fewer loans that should not be originated, giving it the best precision. The difference from the neural network was nominal, but the decision went to the ensemble because it had the best F1 score, which captures the relationship between recall and precision.

Figure 25: ROC Chart for all top models. The ROC curves show little difference between the models. The HP Forest takes the longest to catch the others, falling below them on the curve. Remember that the ROC curve shows how a model performs in both sensitivity and specificity across different thresholds.

The champion model is the ensemble, as mentioned before. It has the strongest F1 score, the relationship between recall and precision. The scope of the project was to find a model that could predict which loans should be originated; having the best F1 score means the model captures enough of the true positives without capturing as many false positives. Lenders must ensure they can lend quickly and easily.

During the exploration stage, it was noted that four of the top five lenders during this period were online banks without a physical presence in the state. This automation matters in the continually changing New Hampshire market because not all mortgage markets react to stressors the same way; setting credit policy based on the entire United States does not make for good lending policy when each market has different stressors.

Lastly, the champion model gives assurance that almost 95% of loans will originate based on the variables provided. Automating this process can consolidate resources onto just the loans that will be originated: the lender only makes money when a loan reaches origination, and time spent on loans that never originate is a loss.

Final Results

Analysis Justification

HMDA data is collected to ensure that financial institutions are serving the whole community, and specifically that no institution is harming any of the prohibited classes. The penalty an institution can face is not only reputational but financial: Bank of America was fined $12 million in 2023 because its loan officers did not ask about race and reported that the customers did not want to report it (Consumer Finance Protection Bureau, 2023). Lenders must be confident they are reporting correctly or face steep penalties and fines like Bank of America's.

As financial institutions face more competition for lending, the lifeblood of bank revenue, determining which loans will originate is important. Fee income from insufficient-funds charges and other fees has lessened due to regulatory protections, so banks must get better at determining which loans will succeed. For institutions handling the volumes the Top 20 lenders in New Hampshire did from 2018 to 2022, it is imperative to have a model that determines which loans will be successful. If certain variables show that a loan will not succeed, credit policy can be changed to not pre-approve it; this saves money for the institution, which can focus on the loans that will get across the proverbial finish line.

The analysis of the small state of New Hampshire, which generated over four hundred thousand applications during this period, shows not only how applications were dispersed but also trends in the different counties. The analysis surfaces marketplace trends, from rising property values to changing incomes and demographics. This matters to a financial institution for understanding the clients who will be coming in, and for knowing how the market has inflated and what risk may arise if a bubble bursts as in 2008. Further investigation is needed to isolate determining factors, but the analysis shows that people's homes grew in value, which increased their net worth.

Findings

The analysis shed light on the mortgage business in New Hampshire during 2018-2022. Almost 60% of the loans originated came from twenty lenders, charted below.

Financial Institution            Total Applications
Citizens Bank, NA                29665
Rocket Mortgage                  27248
CMG Mortgage                     15396
Residential Mortgage Servicing   13803
Wells Fargo                      13371
TD Bank, NA                      12717
United Wholesale Mortgage        11764
St Mary's Bank CU                10381
Harborne Mortgage                10373
Digital                          10197
Freedom Mortgage                 9802
Bank of America                  9405
Newrez                           8314
Loan Depot.com                   8195
Service Federal CU               8087
Amerisave                        7561
Pennymac Loan                    7501
Northeast CU                     7449
Chase                            7321
Nationstar Mortgage              6717

Figure 26: Top 20 Lenders. Some of the top five lenders do not have an active presence in the New Hampshire market, meaning people are willing to make the largest purchase of their lives without working with someone local. The top lender, Citizens Bank, originated about 75% of its loans as open-end lines of credit, or home equity lines of credit; Citizens Bank accounted for over 50% of these types of loans, which carry a variable rather than a fixed rate like a traditional 30-year mortgage.

Next, property value increases were a risk concern. If property values increase too quickly, a bubble can form, and a disruption in the market could cause property values to crater. Below is how median property values performed during the period.

Figure 27: Median Property Values by Year. There is a sharp spike, especially in the Rockingham County market, from 2020 to 2022. The increase in Rockingham opened a gap with the other counties: the median value of property sold in Rockingham exceeded half a million dollars, raising questions about risk if the market were to turn and property values change.

Figure 28: Rockingham Loan Data, Top 20. The data shows that Rocket Mortgage holds the highest loan value in Rockingham County in its portfolio, with about $1.5 billion in loans delivered during this period. Falling home values are a concern where the loan-to-value of the loans is high; this is demonstrated by Residential Mortgage Services, where the median loan-to-value of originated loans was 80%. If home values in Rockingham were to decrease significantly, some of the loans issued would go underwater. Most mortgages amortize differently from other term loans, with payments weighted toward interest rather than principal early on, so a regulator keeping an eye on this spread can see which lenders could come under distress. Lastly, interest rates have risen over the past few years to two-decade highs; the net interest spread on these loans, compared to what a financial institution must pay to keep deposits, can be disproportionate. Amerisave holds loans averaging under 3% while the national average is now above 6%; to raise funds through mortgage-backed securities (MBS), Amerisave would have to issue them at a discount because of the lower yield offered to those injecting cash into the business.

Lastly, the findings on race in the New Hampshire dataset showed that the minority population was underrepresented relative to the community as a whole. Figure 3 demonstrated that the top three counties did not have significantly more minorities filling out loan applications, and that the Asian population was moving into New Hampshire at a greater rate than any other minority class. The concern is that part of the population may not be able to afford homes in this market, which keeps people from obtaining their largest source of net worth; the Federal Reserve finds that homeowners have roughly forty times the net worth of non-homeowners (Knueven & Acevado, 2024). This is why analyzing HMDA data to see how minorities are participating in building wealth is important.

Review of KPIs

Four Key Performance Indicators (KPIs) were identified at the beginning of this project. The first KPI was that 12% of approved loans in New Hampshire would match the demographics of the state. It was found that just over 5% of applicants and approved loans were minorities; as discussed earlier, this failure to purchase homes could keep minorities from increasing their wealth. Interestingly, the demographics were changing as more people moved to less expensive counties, and the census data showed some counties saw increases in minority applications. This can be seen as a trend that helps diversify communities and offers affordable housing for minorities and others.

The next KPI benchmarked the 29% increase in property values for the entire country over this period. New Hampshire's median property value reached $425,000 in 2022, an increase of 49.1%, better than the national average. Rockingham had the largest dollar increase of all the counties, rising to $525,000, a 47.9% increase that is still above the national average. This quick ascent in property values increases the wealth metrics of those homeowners. Coos County, the northernmost county, saw property values increase but remain below the US median home value of $348,000.

Another KPI was that one third of homes would have an open line of credit. There are 653,069 housing units in New Hampshire (United States Census), and there were only about 56,000 applications for a home equity line of credit. While this does not prove that only roughly 10% of homes carry a home equity line, it shows the product can be grown in this market. Since home values have increased about 50% in the past five years, equity in homes has increased too, allowing people to tap into this newfound equity. A lender could exploit this as a strategy, offsetting low fixed rates with variable-rate home equity lines of credit.

The last KPI was a predicted 6.4% increase in home values in 2024. Though the dataset does not include 2023 or the first half of 2024, such an increase would take the median home value in New Hampshire to roughly $452,000. These numbers continue to make home ownership tougher for the overall population, and it will be interesting to see whether these prices push buyers out of the market. Coos County had the lowest property value at just over $200,000, which could pull buyers toward that area to own a home. The property value prediction could run higher still, as New Hampshire grew at almost twice the national average during the pandemic; owning a home in New Hampshire before the pandemic looks like a great investment given the returns of the past few years.

Review Significance

The big takeaway from the dataset is that home values have increased in the state of New Hampshire. The close-to-50% increase in property values during this analysis is a rapid five-year increase and a concern. This increase, while raising homeowners' net worth, has also raised the property taxes homeowners must pay, which can make a home unaffordable; this author has seen almost a 20% increase in property taxes as his tax-assessed value rose close to 50%. It will be interesting to see whether homes come on the market as people decline to pay the higher property tax and cash in on the newfound equity in their homes.

Another finding of importance was the growth of the Asian population, with applications of this race accounting for around 4% of originations. The importance is in knowing that the client base is changing and that census demographics may be shifting; if this trend continues over the next few years, this group would become the second largest population next to the white population. Black homeownership, by contrast, lagged when cross-referenced with the census data: New Hampshire's population is just under 2% Black, while only 1.2% of applications came from applicants identified as Black. It is concerning that these loans are not being originated for this group, which could mean this minority is either not being marketed to or does not understand the process.

In the predictive modeling, interest rate was the largest factor in whether a borrower would originate the loan. Keeping a firm eye on competitors' lending rates is a huge part of an analyst's everyday job in the mortgage business. Interest rate was removed from the models by level-setting, on the premise that a lender could match a competitor's rate through a pricing process. The resulting predictive model can then predict with about 95% confidence that a loan will be originated when these factors are present. A future step is to take this application data and compare it against loans that go into default: what factors led the borrower to default, an increase in DTI or a decrease in credit score? These factors could inform a better credit policy.

Recommendations for Future Analysis

As was seen with the pandemic and the shifting of employment and living arrangements, a future analysis of HMDA data could include Massachusetts, Maine, and Vermont. It could look for trends such as whether home values decreased in any market during this period as people migrated from larger population centers to more rural areas to live out the pandemic. Adding the years 2023 and 2024 will help show whether home values have decreased with the increased lending rates. As the surrounding states are added, it will be interesting to see those trends and whether any state fell below the national average in home value increases during this time.

Another future analysis would look at how a recession like 2008, when the national average home value plummeted almost 16% (Padgitt & Merchak, 2010), would affect property values in New Hampshire. Pitting the effects of that recession against the current numbers shows what a decrease would do to the loan-to-value of existing loans. Homes whose mortgages exceed their value are underwater, meaning a sale could not pay off the mortgage; this could snowball as homes go into foreclosure and are sold at lower cost while lenders attempt to recoup losses, driving property values down further. A 16% loss against the current data points would see New Hampshire's 2022 median home value fall by roughly $67,000, giving back almost half of the gains in home values seen in the data.
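A minimal sketch of that stress test follows, assuming the cleaned frame df retains the public HMDA loan-to-value field combined_loan_to_value_ratio (expressed in percent); both the frame name and the column are assumptions about the working data.

import pandas as pd

# Stress test: if property values fall 16%, which loans go underwater?
decline = 0.16

ltv = pd.to_numeric(df['combined_loan_to_value_ratio'],
                    errors='coerce').dropna()
stressed_ltv = ltv / (1 - decline)       # same balance, smaller value
underwater_pct = (stressed_ltv > 100).mean() * 100

print(f"loans underwater after a {decline:.0%} decline: "
      f"{underwater_pct:.1f}%")
# Example: an 80% LTV loan moves to 80 / 0.84, about 95%; any loan
# originated above roughly 84% LTV crosses 100% and is underwater.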

Lastly, the demographics analysis should continue to be built upon. The minority population was underrepresented in the data compared to the set as a whole. By working with the community, a lender could look for ways to increase the visibility of its products. Making home ownership a key to building net worth, so people can retire comfortably, is important; the disparity in home ownership among minorities is well documented in other Federal Reserve studies. If a lender can use this data to develop relationships with consumers in the state and understand what factors make people originate a loan, more loans would happen.

References

Abid Haleem, M. A. (2011, November 10). Analysis of critical success factors of world-class manufacturing practices: An application of interpretative structural modelling and interpretative ranking process. https://www.tandfonline.com/doi/full/10.1080/09537287.2011.642134

Agarwel, R. (2024, March 29). ROC curves and AUC: The ultimate guide. builtin.com. https://builtin.com/data-science/roc-curves-auc

Alhamid, M. (2022, May 12). Ensemble models: What are they and when should you use them? builtin.com. https://builtin.com/machine-learning/ensemble-model

America Counts Staff. (2021, August 25). New Hampshire's population grew 4.6% last decade. https://www.census.gov/library/stories/state-by-state/new-hampshire-population-change-between-census-decade.html

Bobbitt, Z. (2021, March 31). Categorical vs. quantitative variables: Definition + examples. Statology.org. https://www.statology.org/categorical-vs-quantitative/

Botella, E. (2021, June 19). Investment firms aren't buying all the houses. But they are buying the most important ones. slate.com. https://slate.com/business/2021/06/blackrock-invitation-houses-investment-firms-real-estate.html

Bukun. (n.d.). EDA Home Mortgage NY (with feature analysis). kaggle.com. https://www.kaggle.com/code/ambarish/eda-home-mortgage-ny-with-feature-analysis

Cisneros, M. (2020, March 24). What is a line graph? storytellingwithdata.com. https://www.storytellingwithdata.com/blog/2020/3/24/what-is-a-line-graph/

Consumer Finance Protection Bureau. (2023, November 28). CFPB orders Bank of America to pay $12 million for reporting false mortgage data. consumerfinance.gov. https://www.consumerfinance.gov/about-us/newsroom/cfpb-orders-bank-of-america-to-pay-12-million-for-reporting-false-mortgage-data/

Consumer Finance Protection Bureau. (n.d.). Home Mortgage Disclosure Act (HMDA). Retrieved June 8, 2024, from ffiec.cfpb.gov: https://ffiec.cfpb.gov/data-browser/

Consumer Financial Protection Bureau. (2021, August). Data point: 2020 mortgage market activity and trends. https://files.consumerfinance.gov/f/documents/cfpb_2020-mortgage-market-activity-trends_report_2021-08.pdf

consumerfinance.gov. (2024, May 27). About HMDA. https://www.consumerfinance.gov/data-research/hmda/

Evidently AI. (n.d.). Accuracy vs. precision vs. recall in machine learning: What's the difference? Retrieved July 23, 2024, from evidentlyai.com: https://www.evidentlyai.com/classification-metrics/accuracy-precision-recall

Hayes, M., & Downie, A. (2024, June 19). What is data transformation? ibm.com. https://www.ibm.com/think/topics/data-transformation

Hennigan, L. (2023, April 24). What is a KPI? Definition & examples. forbes.com. https://www.forbes.com/advisor/business/what-is-a-kpi-definition-examples/

Ibarrera. (2018, September 26). Understanding the difference between ETL and data preparation. dataladder.com. https://dataladder.com/understanding-difference-etl-data-preparation/

IBM. (n.d.). What is a neural network? Retrieved July 23, 2024, from ibm.com: https://www.ibm.com/topics/neural-networks

IBM. (n.d.). What is data modeling? Retrieved July 23, 2024, from ibm.com: https://www.ibm.com/topics/data-modeling

IBM. (n.d.). What is a decision tree? Retrieved July 23, 2024, from ibm.com: https://www.ibm.com/topics/decision-trees

IBM. (n.d.). What is logistic regression? Retrieved July 23, 2024, from ibm.com: https://www.ibm.com/topics/logistic-regression

IBM. (n.d.). What is machine learning (ML)? Retrieved July 23, 2024, from ibm.com: https://www.ibm.com/topics/machine-learning

IBM. (n.d.). What is overfitting? Retrieved July 23, 2024, from ibm.com: https://www.ibm.com/topics/overfitting

IBM. (n.d.). What is random forest? Retrieved July 23, 2024, from ibm.com: https://www.ibm.com/topics/random-forest

IBM.com. (n.d.). What are SVMs? Retrieved July 23, 2024, from ibm.com: https://www.ibm.com/topics/support-vector-machine

Kar, U. (2023, October 10). Pros & cons of using Tableau for your projects. knowledgehut.com. https://www.knowledgehut.com/blog/business-intelligence-and-visualization/tableau-advantages-disadvantages

Knueven, L., & Acevado, S. (2024, July 23). Understanding the average American net worth: Insights and analysis. businessinsider.com. https://www.businessinsider.com/personal-finance/banking/average-american-net-worth

Miller, S. (2021, June 25). What is R used for? codecademy.com. https://www.codecademy.com/resources/blog/what-is-r-used-for/

Mitchell, D. M. (2022, May 23). Rocket Mortgage aims to raise $337.9 million in MBS. americanbanker.com. https://asreport.americanbanker.com/news/rocket-mortgage-raises-337-9-million-in-mbs

Murel, J., & Kavlakpglu, E. (2024, January 19). What is a confusion matrix? ibm.com. https://www.ibm.com/topics/confusion-matrix

Padgitt, K., & Merchak, A. (2010, August 31). Property tax revenue increased as property values fell. taxfoundation.org. https://taxfoundation.org/research/all/state/property-tax-revenue-increased-property-values-fell/

Rahm, E., & Ho, H. H. (n.d.). Data cleaning: Problems and current approaches. https://www.betterevaluation.org/sites/default/files/data_cleaning.pdf

Ricks, E. (2021, May 17). What is a bubble chart? storytellingwithdata.com. https://www.storytellingwithdata.com/blog/2021/5/11/what-is-a-bubble-chart

Rose, M. (2021, October 11). An intuitive guide to the F1 score. towardsdatascience.com. https://towardsdatascience.com/an-intuitive-guide-to-the-f1-score-55fe8233c79e

Saira Naseer, K. A. (2022, June 7). Impact of critical success factors on project success through the mediation of knowledge creation. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9211757/

Saul, D. (2023, May 1). First Republic Bank failure: A timeline of what led to the second-largest bank collapse in U.S. history. forbes.com. https://www.forbes.com/sites/dereksaul/2023/05/01/first-republic-bank-failure-a-timeline-of-what-led-to-the-second-largest-bank-collapse-in-us-history/

SplashLearn. (n.d.). Bar graph: Definition, types, examples, practice problems, facts. Retrieved July 10, 2024, from splashlearn.com: https://www.splashlearn.com/math-vocabulary/geometry/bar-graph

Tableau. (n.d.). Data visualisation beginner's guide: A definition, examples and learning resources. Retrieved July 10, 2024, from tableau.com: https://www.tableau.com/en-gb/learn/articles/data-visualization

Tableau. (n.d.). What is a dashboard? A complete overview. Retrieved July 10, 2024, from tableau.com: https://www.tableau.com/learn/articles/dashboards/what-is

United States Census. (n.d.). QuickFacts: New Hampshire. Retrieved August 6, 2024, from census.gov: https://www.census.gov/quickfacts/fact/table/NH/PST040221

USAFacts. (2023, October 3). Homeownership rates show that Black Americans are currently the least likely group to own homes. usafacts.org. https://usafacts.org/articles/homeownership-rates-by-race/