# Import the required libraries and functions
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Load datasets
telco_demo = pd.read_csv("telecom_demographics.csv")
telco_usage = pd.read_csv("telecom_usage.csv")
# Quick look
print(telco_demo.head())
print(telco_usage.head())
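# Optional check (sketch, not part of the original task): confirm the two tables
# have the expected number of rows and columns before merging.
print("Demographics shape:", telco_demo.shape)
print("Usage shape:", telco_usage.shape)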
# Merge on customer_id
churn_df = pd.merge(telco_demo, telco_usage, on="customer_id", how="inner")
print(churn_df.head())
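# Optional sanity check (sketch): an inner join on a non-unique key can duplicate
# rows, so compare unique customer IDs against the merged row count.
print("Merged rows:", len(churn_df), "| unique customers:", churn_df['customer_id'].nunique())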
# Calculate churn proportion
churn_counts = churn_df['churn'].value_counts()
churn_rate = churn_counts[1] / len(churn_df)
print("proportion of churn:", churn_rate)
# Inspect column info
churn_df.info()
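# Optional check (sketch): a quick null count complements .info() when looking
# for missing values.
print("Total missing values:", churn_df.isna().sum().sum())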
# One-hot encode the categorical variables
churn_df = pd.get_dummies(churn_df, columns=['telecom_partner', 'gender', 'state', 'city', 'registration_event'])
print(churn_df.head())
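# Optional check (sketch): one-hot encoding high-cardinality columns such as
# 'city' and 'registration_event' can add many features, so inspect the new width.
print("Columns after encoding:", churn_df.shape[1])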
# Initialize scaler
scaler = StandardScaler()
# Drop the identifier ('customer_id') and the target ('churn') to build the feature matrix
features = churn_df.drop(['customer_id', 'churn'], axis=1)
features_scaled = scaler.fit_transform(features)
# Target variable
target = churn_df['churn']
# Perform 80-20 split
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)
print("Training set size:", X_train.shape[0])
print("Testing set size:", X_test.shape[0])
# Instantiate and fit the Logistic Regression model
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)
# Logistic Regression predictions
logreg_pred = logreg.predict(X_test)
# Logistic Regression evaluation
print(confusion_matrix(y_test, logreg_pred))
print(classification_report(y_test, logreg_pred))
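# Optional (sketch): a single accuracy number makes the final comparison easier to read.
from sklearn.metrics import accuracy_score
logreg_acc = accuracy_score(y_test, logreg_pred)
print("Logistic Regression accuracy:", logreg_acc)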
# Instantiate and fit the Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
# Random Forest predictions
rf_pred = rf.predict(X_test)
# Random Forest evaluation
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))
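# Optional (sketch): Random Forest accuracy, computed the same way for a direct comparison.
rf_acc = accuracy_score(y_test, rf_pred)
print("Random Forest accuracy:", rf_acc)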
# Which accuracy score is higher: Logistic Regression or Random Forest?
higher_accuracy = "RandomForest"