Classification: Telecom Customer Churn

This dataset comes from an Iranian telecom company; each row represents a customer over a one-year period. Along with a churn label, there is information on each customer's activity, such as the number of call failures and the length of the subscription.

Let's predict customer churn!

Data Dictionary

Column                     Explanation
Call Failure               number of call failures
Complains                  binary (0: no complaint, 1: complaint)
Subscription Length        total months of subscription
Charge Amount              ordinal attribute (0: lowest amount, 9: highest amount)
Seconds of Use             total seconds of calls
Frequency of use           total number of calls
Frequency of SMS           total number of text messages
Distinct Called Numbers    total number of distinct phone numbers called
Age Group                  ordinal attribute (1: younger age, 5: older age)
Tariff Plan                binary (1: pay as you go, 2: contractual)
Status                     binary (1: active, 2: non-active)
Age                        age of the customer
Customer Value             the calculated value of the customer
Churn                      class label (1: churn, 0: non-churn)

Data Analysis

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

data = pd.read_csv("data/customer_churn.csv")
data
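To confirm that the loaded frame matches the data dictionary above, a quick look at the columns, dtypes, and non-null counts:

data.info()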

Exploratory data analysis: response variable

Let's learn more about Churn: its statistics and its correlation with the other variables in the dataset. I define an eda function that prints summary statistics and plots the distribution, correlation, and feature importance information.

def eda(data, response):
    '''Take the name of a response (y) variable from a DataFrame and report
    various statistics, correlation, and feature importance information.
    Input: DataFrame, name of the y variable.'''
    
    from sklearn.model_selection import KFold, GridSearchCV
    sns.set(style='ticks', palette=['#62C370', '#FFD166', '#EF476F'])
    plt.rc('xtick', labelsize=9) 
    plt.rc('xtick.major', width=0.5)
    plt.rc('ytick', labelsize=9)
    plt.rc('ytick.major', width=0.5)
    plt.rc('axes', linewidth=0.5)
    
    columns = data.drop(response, axis=1).columns
    x = data.drop(response, axis=1).values
    y = data[response].values

    # Distribution of response variable and features
    print('Distribution of response variable and features')
    fig, axes = plt.subplots(1, 2, figsize=(10, len(columns)*0.32), constrained_layout=True)
    axes[0].set_title('Histogram, {}\nMean: {:.2f}'.format(response, data[response].mean()), fontsize=10)
    axes[0].set_xlabel(response, fontsize=9)
    axes[0].set_ylabel('Count', fontsize=9)
    axes[1].set_title('Features Scale', fontsize=10)
    sns.histplot(ax=axes[0], data=data, x=response)
    sns.boxplot(ax=axes[1], data=data, orient='h', palette='pastel')
    plt.show()
    
    # Correlation and NaNs
    print('.'*75, '\n\nCorrelation and Missing values (NaNs)')
    fig, ax = plt.subplots(1, 2, constrained_layout=True, figsize=(10, len(columns)*0.32))
    data.drop(response, axis=1).corrwith(data[response]).plot.barh(ax=ax[0])
    ax[0].set_title('Correlation, {}'.format(response), fontsize=10)    
    data.isna().sum().plot.barh(ax=ax[1])
    ax[1].set_title('Missing values (NaNs)', fontsize=10)
    ax[1].bar_label(ax[1].containers[0], fontsize=8)
    plt.show()
    
    # Feature importance with Lasso Regression    
    from sklearn.linear_model import Lasso
    kf = KFold(n_splits=5, shuffle=True, random_state=100)
    param_grid = {'alpha': np.linspace(0.00001, 0.001, 10)}    
    lasso = Lasso()
    lasso_cv = GridSearchCV(lasso, param_grid, cv=kf, n_jobs=-1)
    lasso_cv.fit(x, y)
    best_params_lasso = lasso_cv.best_params_
    lasso_best = lasso_cv.best_estimator_
    coefs = lasso_best.coef_
    print('.'*75, '\n')
    print('Feature importance with Lasso Regression, {}:'.format(response),
          '\n\tBest params:', best_params_lasso, '\n\tScore: {:.3f}'.format(lasso_cv.best_score_))
    
    # Feature importance with XGB Classifier    
    from xgboost import XGBClassifier
    xgb = XGBClassifier()
    param_grid = {'booster': ['gbtree', 'gblinear', 'dart'],
                  'alpha': np.linspace(0, 10, 10)}
    kf = KFold(n_splits=5, shuffle=True, random_state=100)
    xgb_cv = GridSearchCV(xgb, param_grid, cv=kf, n_jobs=-1)
    xgb_cv.fit(x, y)
    best_params_xgb = xgb_cv.best_params_
    xgb_best = xgb_cv.best_estimator_
    fi = xgb_best.feature_importances_
    print('\nFeature importance with XGB Classification, {}:'.format(response),
          '\n\tBest params:', best_params_xgb, '\n\tScore: {:.3f}\n'.format(xgb_cv.best_score_))    
    
    # Feature importance plots
    fig, axes = plt.subplots(1, 2, constrained_layout=True, sharey='row', figsize=(10, len(columns)*0.32))
    axes[0].set_title('Feature importance with Lasso Regression, {}'.format(response), fontsize=10)
    axes[0].barh(columns, coefs, color='#FFD166', height=0.5)
    axes[1].set_title('Feature importance with XGBoost Classification, {}'.format(response), fontsize=10)
    axes[1].barh(columns, fi, color='#EF476F', height=0.5)
    plt.show()
    
    # Regression, Histogram, Boxplot
    print('.'*75, '\n')
    print('Regression, Histogram, Boxplot: {} x other variables'.format(response))
    fig, axes = plt.subplots(len(columns), 3, figsize=(20, len(columns)*5)) 
    for grid, column in enumerate(columns):
        sns.regplot(ax=axes[grid, 0], data=data, x=column, y=response, color='#62C370')
        sns.histplot(ax=axes[grid, 1], data=data, x=column, y=response, color='#FFD166')
        sns.boxplot(ax=axes[grid, 2], data=data, x=column, color='#EF476F')
    plt.show()
    

Running the function with Churn as the response variable:

eda(data, 'Churn')

Note that the Churn classes are heavily imbalanced: only about 15% of customers have churned.
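A quick check of the class balance behind that figure:

data['Churn'].value_counts(normalize=True)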

Considering both the linear and non-linear correlations of the features, I select the following as having the most potential: Status, Complains, Age Group, Distinct Called Numbers, Seconds of Use, and Subscription Length.
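For reference, the linear part of that ranking can be reproduced outside the eda function with pandas built-ins:

data.drop('Churn', axis=1).corrwith(data['Churn']).abs().sort_values(ascending=False)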

Also, Age Group and Subscription Length have outliers that could be dropped for better accuracy.

Data preprocessing

The dataset has no NaNs, so there is no need for missing-value imputation. As for data scaling, we'll handle it later inside pipelines.
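As a preview of that step, here is a minimal sketch of the kind of pipeline we'll use; the LogisticRegression here is only a placeholder estimator, not the final model:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Keeping the scaler inside the pipeline means it is fit on the training
# folds only during cross-validation, so no information leaks from the test folds
pipe = Pipeline([('scaler', StandardScaler()),
                 ('model', LogisticRegression(max_iter=1000))])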

Outlier treatment

We drop the outliers from Age Group and Subscription Length. I define the function dropoutliers, which applies two alternative methods, IQR and Z-scores, so that we can compare their results.

def dropoutliers(data, predictors):
    '''Drop outliers using two methods, IQR and Z-scores, make comparison plots,
    and expose the two preprocessed DataFrames as globals (data_do_iqr and data_do_zs).
    Input: DataFrame, list of predictors.'''
    
    # Before treatment
    print('Shape of the Dataframe before treatment:', data.shape)
    fig, axes = plt.subplots(1, len(predictors), figsize=(20, 5))
    for col, predictor in enumerate(predictors):
        sns.boxplot(ax=axes[col], data=data, x=predictor, color='#62C370')
    plt.show()
    print('.'*75, '\n')
    
    # Treatment with IQR
    global data_do_iqr
    data_do_iqr = data.copy()  # work on a copy so the original frame stays intact
    for predictor in predictors:
        q1 = data_do_iqr[predictor].quantile(0.25)
        q3 = data_do_iqr[predictor].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - iqr * 1.5
        upper = q3 + iqr * 1.5
        data_do_iqr = data_do_iqr[(data_do_iqr[predictor] >= lower) &
                                  (data_do_iqr[predictor] <= upper)]
    
    print('Treatment with IQR method \nShape of the Dataframe after treatment:', data_do_iqr.shape)
    print('Name of the processed Dataframe: data_do_iqr')

    fig, axes = plt.subplots(1, len(predictors), figsize=(20, 5))
    for col, predictor in enumerate(predictors):
        sns.boxplot(ax=axes[col], data=data_do_iqr, x=predictor, color='#FFD166')
    plt.show()
    print('.'*75, '\n')
    
    # Treatment with Z score method
    global data_do_zs
    data_do_zs = data.copy()  # again, work on a copy
    from scipy import stats
    for predictor in predictors:
        z = np.abs(stats.zscore(data_do_zs[predictor]))
        data_do_zs = data_do_zs[z < 3]
        
    print('Treatment with Z-Scores method \nShape of the Dataframe after treatment:', data_do_zs.shape)
    print('Name of the processed Dataframe: data_do_zs')
        
    fig, axes = plt.subplots(1, len(predictors), figsize=(20, 5))
    for col, predictor in enumerate(predictors):
        sns.boxplot(ax=axes[col], data=data_do_zs, x=predictor, color='#EF476F')
    plt.show()
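
Running the function on the two predictors flagged earlier produces the two preprocessed frames, data_do_iqr and data_do_zs:

dropoutliers(data, ['Age Group', 'Subscription Length'])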