# Skip to content
# ChatGPT For Data Science Project (copy)
# Exploratory Data Analysis (EDA)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the loan dataset
loan_df = pd.read_csv("loan_data.csv")

# Preview the first rows. print() is required here: a bare expression is only
# rendered when it is the last statement of a notebook cell.
print(loan_df.head())

# Display the number of rows and columns in the dataset
print("Number of rows and columns:", loan_df.shape)

# Display summary statistics for numerical variables
print(loan_df.describe())

# Display the number of missing values in each column
print(loan_df.isnull().sum())

# Visualize the distribution of the target variable
sns.countplot(x="not.fully.paid", data=loan_df)
plt.show()

# Visualize the correlation between variables.
# Only numeric columns are correlated: DataFrame.corr() no longer drops
# non-numeric columns silently (pandas >= 2.0 raises), and the frame still
# contains the categorical "purpose" column at this point.
corr = loan_df.select_dtypes(include="number").corr()
sns.heatmap(corr, annot=True, cmap="coolwarm")
plt.show()

# Visualize the distribution of loan purpose
sns.countplot(x="purpose", data=loan_df)
plt.xticks(rotation=90)  # category labels are long; rotate to avoid overlap
plt.show()

# Visualize the distribution of interest rates by loan purpose
sns.boxplot(x="purpose", y="int.rate", data=loan_df)
plt.xticks(rotation=90)
plt.show()

# Feature Engineering
# Perform feature engineering

# Monthly installment relative to income.
# NOTE(review): the divisor is "log.annual.inc", which by its name looks like
# the *log* of annual income rather than the raw income - confirm that this
# ratio is what is intended.
loan_df["installment_to_income_ratio"] = (
    loan_df["installment"] / loan_df["log.annual.inc"]
)

# Count of derogatory events (delinquencies in the last 2 years plus public
# records) divided by the FICO score.
loan_df["credit_history"] = (
    loan_df["delinq.2yrs"] + loan_df["pub.rec"]
) / loan_df["fico"]

# Preprocessing and Balancing the Data
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

# Drop unnecessary columns
loan_df = loan_df.drop(['credit.policy', 'days.with.cr.line', 'purpose'], axis=1)

# Convert categorical variables to numerical using LabelEncoder
# (re-encodes the target's distinct values as consecutive integers 0..n-1).
le = LabelEncoder()
loan_df['not.fully.paid'] = le.fit_transform(loan_df['not.fully.paid'])

# Scale the numerical variables using StandardScaler
scaler = StandardScaler()
numerical_cols = [
    'int.rate',
    'installment',
    'log.annual.inc',
    'dti',
    'fico',
    'revol.bal',
    'revol.util',
    'inq.last.6mths',
    'delinq.2yrs',
    'pub.rec',
    'credit_history',
    'installment_to_income_ratio',
]
# NOTE(review): the scaler is fitted on the full dataset; if a train/test
# split happens later in the file, fit it on the training portion only.
loan_df[numerical_cols] = scaler.fit_transform(loan_df[numerical_cols])

# Handle class imbalance by oversampling the minority class
sm = SMOTE(random_state=42)  # fixed seed so the synthetic rows are reproducible
X = loan_df.drop('not.fully.paid', axis=1)
y = loan_df['not.fully.paid']
X_resampled, y_resampled = sm.fit_resample(X, y)

# Re-assemble features and target into a single balanced frame.
loan_df = pd.concat([X_resampled, y_resampled], axis=1)

# Confirm that both classes now have the same number of rows. print() is
# required because a bare expression is only rendered inside a notebook cell.
print(loan_df['not.fully.paid'].value_counts())