Classification: Telecom Customer Churn
This dataset comes from an Iranian telecom company; each row represents a customer observed over a twelve-month period. Along with a churn label, the data includes information on customer activity, such as call failures and subscription length.
Let's predict customer churn!
Data Dictionary
Column | Explanation |
---|---|
Call Failure | number of call failures |
Complains | binary (0: no complaint, 1: complaint) |
Subscription Length | total months of subscription |
Charge Amount | ordinal attribute (0: lowest amount, 9: highest amount) |
Seconds of Use | total seconds of calls |
Frequency of use | total number of calls |
Frequency of SMS | total number of text messages |
Distinct Called Numbers | total number of distinct phone numbers called |
Age Group | ordinal attribute (1: youngest group, 5: oldest group) |
Tariff Plan | binary (1: pay as you go, 2: contractual) |
Status | binary (1: active, 2: non-active) |
Age | age of the customer |
Customer Value | the calculated value of the customer |
Churn | class label (1: churn, 0: non-churn) |
Data Analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
data = pd.read_csv("data/customer_churn.csv")
data
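A quick look at the shape, column types, and non-null counts helps confirm that the load matches the data dictionary above (a minimal sanity check, nothing model-specific):
# Basic sanity checks: dimensions, dtypes, and non-null counts per column.
print(data.shape)
data.info()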
Exploratory data analysis: the response variable
Let's learn more about Churn: its distribution and its relationship with the other variables in the dataset. I define an eda function that plots the distribution of the response, checks correlations and missing values, and estimates feature importance.
def eda(data, response):
    '''Take a dataframe and the name of its response (y) column; print and plot
    distribution, correlation, missing-value, and feature-importance information.
    Input: dataframe, name of the y-variable.'''
from sklearn.model_selection import KFold, GridSearchCV
sns.set(style='ticks', palette=['#62C370', '#FFD166', '#EF476F'])
plt.rc('xtick', labelsize=9)
plt.rc('xtick.major', width=0.5)
plt.rc('ytick', labelsize=9)
plt.rc('ytick.major', width=0.5)
plt.rc('axes', linewidth=0.5)
columns = data.drop(response, axis=1).columns
x = data.drop(response, axis=1).values
y = data[response].values
# Distribution of response variable and features
print('Distribution of response variable and features')
fig, axes = plt.subplots(1, 2, figsize=(10, len(columns)*0.32), constrained_layout=True)
axes[0].set_title('Histogram, {}\nMean: {:.2f}'.format(response, data[response].mean()), fontsize=10)
    axes[0].set_xlabel(response, fontsize=9)
    axes[0].set_ylabel('Count', fontsize=9)
axes[1].set_title('Features Scale', fontsize=10)
sns.histplot(ax=axes[0], data=data, x=response)
sns.boxplot(ax=axes[1], data=data, orient='h', palette='pastel')
plt.show()
# Correlation and NaNs
    print('.'*75, '\n\nCorrelation and Missing values (NaNs)')
fig, ax = plt.subplots(1, 2, constrained_layout=True, figsize=(10, len(columns)*0.32))
data.drop(response, axis=1).corrwith(data[response]).plot.barh(ax=ax[0])
ax[0].set_title('Correlation, {}'.format(response), fontsize=10)
data.isna().sum().plot.barh(ax=ax[1])
    ax[1].set_title('Missing values (NaNs)', fontsize=10)
ax[1].bar_label(ax[1].containers[0], fontsize=8)
plt.show()
# Feature importance with Lasso Regression
from sklearn.linear_model import Lasso
kf = KFold(n_splits=5, shuffle=True, random_state=100)
param_grid = {'alpha': np.linspace(0.00001, 0.001, 10)}
lasso = Lasso()
lasso_cv = GridSearchCV(lasso, param_grid, cv=kf, n_jobs=-1)
lasso_cv.fit(x, y)
best_params_lasso = lasso_cv.best_params_
lasso_best = lasso_cv.best_estimator_
coefs = lasso_best.coef_
print('.'*75, '\n')
print('Feature importance with Lasso Regression, {}:'.format(response),
'\n\tBest params:', best_params_lasso, '\n\tScore: {:.3f}'.format(lasso_cv.best_score_))
# Feature importance with XGB Classifier
from xgboost import XGBClassifier
xgb = XGBClassifier()
    param_grid = {'booster': ['gbtree', 'gblinear', 'dart'],
                  'alpha': np.linspace(0, 10, 10)}
kf = KFold(n_splits=5, shuffle=True, random_state=100)
xgb_cv = GridSearchCV(xgb, param_grid, cv=kf, n_jobs=-1)
    xgb_cv.fit(x, y)
best_params_xgb = xgb_cv.best_params_
xgb_best = xgb_cv.best_estimator_
fi = xgb_best.feature_importances_
print('\nFeature importance with XGB Classification, {}:'.format(response),
'\n\tBest params:', best_params_xgb, '\n\tScore: {:.3f}\n'.format(xgb_cv.best_score_))
# Feature importance plots
    fig, axes = plt.subplots(1, 2, constrained_layout=True, sharey='row', figsize=(10, len(columns)*0.32))
axes[0].set_title('Feature importance with Lasso Regression, {}'.format(response), fontsize=10)
axes[0].barh(columns, coefs, color='#FFD166', height=0.5)
axes[1].set_title('Feature importance with XGBoost Classification, {}'.format(response), fontsize=10)
axes[1].barh(columns, fi, color='#EF476F', height=0.5)
plt.show()
# Regression, Histogram, Boxplot
print('.'*75, '\n')
print('Regression, Histogram, Boxplot: {} x other variables'.format(response))
fig, axes = plt.subplots(len(columns), 3, figsize=(20, len(columns)*5))
for grid, column in enumerate(columns):
sns.regplot(ax=axes[grid, 0], data=data, x=column, y=response, color='#62C370')
sns.histplot(ax=axes[grid, 1], data=data, x=column, y=response, color='#FFD166')
        sns.boxplot(ax=axes[grid, 2], data=data, x=column, color='#EF476F')
plt.show()
Running the function with Churn as the response variable:
eda(data, 'Churn')
Note that the Churn classes are heavily imbalanced: only about 15% of customers have churned.
Considering both the linear and non-linear relationships with the response, I select the following features as the most promising: Status, Complains, Age Group, Distinct Called Numbers, Seconds of Use, and Subscription Length.
Also, Age Group and Subscription Length contain outliers that could be dropped to improve accuracy.
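As a quick numeric check on these observations, a snippet along the following lines (feature names taken from the list above) confirms the class imbalance and the linear correlations of the short-listed features with Churn:
# Class balance of the response and Pearson correlation of the
# short-listed features with Churn (non-linear effects are already
# reflected in the XGBoost importances from the eda() output above).
selected = ['Status', 'Complains', 'Age Group', 'Distinct Called Numbers',
            'Seconds of Use', 'Subscription Length']
print(data['Churn'].value_counts(normalize=True))
print(data[selected].corrwith(data['Churn']))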
Data preprocessing
The dataset has no NaNs, so there is no need for missing-value imputation. Data scaling will be handled later using pipelines.
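As a preview of that step, a minimal sketch of such a scaling pipeline could look like the following; the LogisticRegression estimator here is only a placeholder for illustration, not the model used later:
# Illustrative sketch only: wrapping the scaler in a Pipeline ensures it is
# fit on the training folds and merely applied to the validation folds.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

scaling_pipe = Pipeline([('scaler', StandardScaler()),
                         ('clf', LogisticRegression(max_iter=1000))])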
Outlier treatment
We drop the outliers from Age Group and Subscription Length. I define the function dropoutliers, which applies two alternative methods, IQR and Z-scores, produces comparison plots, and returns a preprocessed dataframe for each method.
def dropoutliers(data, predictors):
    '''The function uses two methods, IQR and Z-scores, to drop outliers,
    makes comparison plots, and returns two new preprocessed dataframes.
    Input: dataframe, list of predictors.'''
# Before treatment
print('Shape of the Dataframe before treatment:', data.shape)
fig, axes = plt.subplots(1, len(predictors), figsize=(20, 5))
for col, predictor in enumerate(predictors):
sns.boxplot(ax=axes[col], data = data, x=predictor, color='#62C370')
plt.show()
print('.'*75, '\n')
# Treatment with IQR
global data_do_iqr
data_do_iqr = data
for predictor in predictors:
q1 = data_do_iqr[predictor].quantile(0.25)
q3 = data_do_iqr[predictor].quantile(0.75)
iqr = q3 - q1
lower = q1 - iqr * 1.5
upper = q3 + iqr * 1.5
data_do_iqr = data_do_iqr[data_do_iqr[predictor] > lower]
data_do_iqr = data_do_iqr[data_do_iqr[predictor] < upper]
print('Treatment with IQR method \nShape of the Dataframe after treatment:', data_do_iqr.shape)
print('Name of the processed Dataframe: data_do_iqr')
fig, axes = plt.subplots(1, len(predictors), figsize=(20, 5))
for col, predictor in enumerate(predictors):
sns.boxplot(ax=axes[col], data = data_do_iqr, x=predictor, color='#FFD166')
plt.show()
print('.'*75, '\n')
# Treatment with Z score method
global data_do_zs
data_do_zs = data
from scipy import stats
for predictor in predictors:
z = np.abs(stats.zscore(data_do_zs[predictor]))
data_do_zs = data_do_zs[z < 3]
print('Treatment with Z-Scores method \nShape of the Dataframe after treatment:', data_do_zs.shape)
print('Name of the processed Dataframe: data_do_zs')
fig, axes = plt.subplots(1, len(predictors), figsize=(20, 5))
for col, predictor in enumerate(predictors):
sns.boxplot(ax=axes[col], data = data_do_zs, x=predictor, color='#EF476F')
    plt.show()
    return data_do_iqr, data_do_zs
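The function can then be applied to the two columns flagged during EDA; the returned dataframes are the same objects exposed as the globals data_do_iqr and data_do_zs:
# Drop outliers from the two predictors identified above.
data_do_iqr, data_do_zs = dropoutliers(data, ['Age Group', 'Subscription Length'])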