Skip to content

Objective

In this project, I explore how customer behavior and demographics in the Indian telecom sector can be used to predict customer churn, using two datasets (usage and demographics) that cover four major telecom partners: Airtel, Reliance Jio, Vodafone, and BSNL.

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Churn-prediction script: load the two telecom tables, merge them, one-hot
# encode the categorical columns, then train and compare a logistic regression
# and a random forest on a held-out test split.

# Load the usage and demographics tables (paths are relative to the working directory).
telecom_usage = pd.read_csv('telecom_usage.csv')
telecom_demographics = pd.read_csv('telecom_demographics.csv')
telecom_usage.head()
telecom_demographics.head()

# Merge the two datasets on customer_id; the inner join keeps only customers
# that appear in both tables.
churn_df = telecom_demographics.merge(telecom_usage, on='customer_id', how='inner')
churn_df.head()

# Churn rate = mean of the churn column (assumes churn is coded 0/1 — TODO confirm).
print('Churn Rate:',churn_df['churn'].mean())

# One-hot encode the categorical columns so the models receive numeric input.
churn_dummies = pd.get_dummies(churn_df, columns=['telecom_partner','gender','state','city','registration_event'])

# Separate the features from the target; customer_id is only an identifier
# and is dropped so it is not used as a predictor.
churn_features = churn_dummies.drop(columns=['customer_id','churn'])
target = churn_dummies['churn']

# Split BEFORE scaling so the scaler never sees the test rows. Fitting the
# scaler on the full dataset would leak test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(churn_features, target, test_size=.2, random_state=42)

# Scale the features to balance the influence of each one: learn the mean and
# standard deviation from the training rows only, then apply them to both splits.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Full feature matrix scaled with the training statistics; kept so any later
# code that expects `features_scaled` still finds it.
features_scaled = ss.transform(churn_features)

# Create the predictive models (fixed seeds for reproducibility).
lr = LogisticRegression(random_state=42)
rfc = RandomForestClassifier(random_state=42)

# Fit both models to the training data.
lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)

# Predict the held-out test targets.
logreg_pred = lr.predict(X_test)
rf_pred = rfc.predict(X_test)

# Print per-class precision/recall/F1 for both models for comparison.
print('lr score:', classification_report(y_test, logreg_pred))
print('rfc score:', classification_report(y_test, rf_pred))

# Pick the winner from the measured test accuracy instead of hard-coding it,
# so the conclusion stays correct if the data or the models change.
# Ties go to the random forest.
lr_accuracy = lr.score(X_test, y_test)
rfc_accuracy = rfc.score(X_test, y_test)
higher_accuracy = 'RandomForest' if rfc_accuracy >= lr_accuracy else 'LogisticRegression'