
AZ Watch is a popular video streaming platform specializing in educational content, where creators publish online video tutorials and lessons on any topic, from speaking a new language to cooking to playing a musical instrument.

Their next goal is to leverage AI-driven solutions to analyze and make predictions about their subscribers, improving their marketing strategy for attracting new subscribers and retaining current ones. This project uses machine learning to predict which subscribers are likely to churn and to discover customer segments. This may help AZ Watch find interesting usage patterns to build subscriber personas for future marketing plans!

The data/AZWatch_subscribers.csv dataset contains information about subscribers and their status over the last year:

Column name: Description
subscriber_id: The unique identifier of each subscriber
age_group: The subscriber's age group
engagement_time: Average time (in minutes) spent by the subscriber per session
engagement_frequency: Average number of weekly logins (sessions) by the subscriber over the one-year period
subscription_status: Whether the user remained subscribed to the platform by the end of the year ("subscribed") or cancelled the service ("churned")

Carefully observe and analyze the features in the dataset, asking yourself whether any categorical attributes require pre-processing.
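As a quick first check (a minimal sketch, assuming the file path and column names listed above), the object-typed columns are the candidates for categorical pre-processing:

import pandas as pd

# Load the data and list the object-typed columns; these are the likely
# categorical attributes (presumably age_group and subscription_status,
# plus subscriber_id if the IDs are stored as strings)
df = pd.read_csv("data/AZWatch_subscribers.csv")
print(df.select_dtypes(include='object').columns.tolist())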

The subscribers dataset from the data/AZWatch_subscribers.csv file is loaded and split into training and test sets below:


# Load the input data matrix and perform initial evaluation and preparation

# Import the necessary modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt

# Specify the file path of your CSV file
file_path = "data/AZWatch_subscribers.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Show the first rows and column information
display(df.head())
print(df.info())

# Check the age group categories
print(df.age_group.unique())

# One-hot encode age_group into 0/1 integer dummy columns
age_group_dum = pd.get_dummies(df['age_group']).astype('int')
print(age_group_dum.dtypes)

# Encode the target the same way; drop_first leaves a single
# 0/1 column (1 = subscribed, 0 = churned)
subscribe_dum = pd.get_dummies(df['subscription_status'], drop_first=True).astype('int')

# Concatenate the dummy columns onto the DataFrame
df = pd.concat([df, age_group_dum, subscribe_dum], axis=1)

#drop non-numeric columns
df = df.drop(['age_group', 'subscriber_id', 'subscription_status'], axis=1)

# Rename the remaining columns (note: get_dummies orders the dummy columns
# alphabetically by category, so verify this order matches your data)
columns = ['Time', 'Frequency', 'Age_18_to_34', 'Under_18', '35_or_over', 'Subscribe']
df.columns = columns

# Drop Subscribe from the predictors (X) and assign it as the target (y)
X = df.drop(['Subscribe'], axis=1)
y = df['Subscribe']

# Split into training and test sets (20% test)
X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=.2, random_state=42)
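Before fitting any classifier, it is worth checking the class balance of the target; churn datasets are often imbalanced, in which case accuracy alone can be misleading. A minimal check, using the y_train produced by the split above:

# Share of subscribed (1) vs. churned (0) users in the training set
print(y_train.value_counts(normalize=True))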



# Fit a logistic regression and check how often it predicts correctly

# Additional metrics (LogisticRegression and confusion_matrix are already imported above)
from sklearn.metrics import accuracy_score, classification_report


logreg = LogisticRegression()
model1 = logreg.fit(X_train, y_train)
model1_pred = model1.predict(X_test)
model1_score = accuracy_score(y_test, model1_pred)
conf_mat = confusion_matrix(y_test, model1_pred)
class_report = classification_report(y_test, model1_pred)
print(f'Logistic regression accuracy score: {model1_score}')
print(f'\nConfusion_matrix:\n {conf_mat}')
print(f'\nClassification report:\n {class_report}')
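StandardScaler is imported above but not used in the classification step. As an optional variant (a sketch, not part of the original workflow), wrapping the scaler and the regression in a Pipeline standardizes the engagement features inside the fit, which can help the solver converge:

from sklearn.pipeline import make_pipeline

# Hypothetical variant: standardize the features before the logistic
# regression; the pipeline fits the scaler on the training data only
scaled_logreg = make_pipeline(StandardScaler(), LogisticRegression())
scaled_logreg.fit(X_train, y_train)
print(f'Scaled logistic regression accuracy: {scaled_logreg.score(X_test, y_test):.3f}')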


# Try a decision tree classifier
dectree = DecisionTreeClassifier(max_depth=3, criterion='gini')
model2 = dectree.fit(X_train, y_train)
model2_pred = model2.predict(X_test)
model2_score = accuracy_score(y_test, model2_pred)
conf_mat = confusion_matrix(y_test, model2_pred)
class_report = classification_report(y_test, model2_pred)
print(f'Decision tree classifier accuracy score: {model2_score}')
print(f'\nConfusion_matrix:\n {conf_mat}')
print(f'\nClassification report:\n {class_report}')

# Finally, try a random forest classifier
rf = RandomForestClassifier(n_estimators=10, max_depth=3)
model3 = rf.fit(X_train, y_train)
model3_pred = model3.predict(X_test)
model3_score = accuracy_score(y_test, model3_pred)
conf_mat = confusion_matrix(y_test, model3_pred)
class_report = classification_report(y_test, model3_pred)
print(f'Random forest classifier accuracy score: {model3_score}')
print(f'\nConfusion_matrix:\n {conf_mat}')
print(f'\nClassification report:\n {class_report}')

# Check the importance of each predictor in the random forest
cols = X_test.columns
importances = pd.DataFrame({'features': cols, 'importances': model3.feature_importances_})

# Show the features with non-negligible importance as a bar plot
topimp = importances[importances['importances'] >= 0.01]
sns.barplot(data=topimp, x='features', y='importances')
plt.title('Importance of features')
plt.show()
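A single train/test split can flatter or penalize a model by chance. As an extra robustness check (not part of the original brief), 5-fold cross-validation on the training set gives a steadier comparison of the three classifiers:

from sklearn.model_selection import cross_val_score

# Compare the three classifiers with 5-fold cross-validation
for name, clf in [('Logistic regression', LogisticRegression()),
                  ('Decision tree', DecisionTreeClassifier(max_depth=3)),
                  ('Random forest', RandomForestClassifier(n_estimators=10, max_depth=3))]:
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    print(f'{name}: mean accuracy {scores.mean():.3f} (std {scores.std():.3f})')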
# Let us experiment with clustering to see whether it recovers the churned and subscribed groups


# Keep the engagement features for segmentation; exclude the age dummies
# and keep the Subscribe label out of the clustering input (it stays in
# the frame only for the crosstabs below)
segmentation = df.drop(['Age_18_to_34', 'Under_18', '35_or_over'], axis=1)
y = df.Subscribe
scale = StandardScaler()
segmentation_normalized = scale.fit_transform(segmentation[['Time', 'Frequency']])

# Clustering with K-means (KMeans is already imported above)


krange = range(1, 20)
inertia = []
for k in krange:
    # n_init pinned for reproducible results across scikit-learn versions
    kmeans = KMeans(n_clusters=k, random_state=1, n_init=10)
    kmeans.fit(segmentation_normalized)
    inertia.append(kmeans.inertia_)

# Alternative way to collect inertia for different values of k
#inertia_by_k = {}
#for k in range(1, 20):
#    kmeans = KMeans(n_clusters=k, random_state=1)
#    kmeans.fit(segmentation_normalized)
#    inertia_by_k[k] = kmeans.inertia_
#sns.pointplot(x=list(inertia_by_k.keys()), y=list(inertia_by_k.values()))
#plt.show()

# Generate an elbow plot to decide the optimum number of clusters
sns.lineplot(x=list(krange), y=inertia, marker='o')
plt.xlabel('Number of clusters (k)')
plt.xticks(krange)
plt.ylabel('Inertia')
plt.title('Inertia vs. number of clusters: elbow plot')
plt.show()
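The elbow is not always clear-cut. As a complementary check (an optional sketch), the silhouette score, which is higher for better-separated clusters, can be computed for each candidate k (it is only defined for k >= 2):

from sklearn.metrics import silhouette_score

# Silhouette score for each candidate number of clusters
for k in range(2, 8):
    km = KMeans(n_clusters=k, random_state=1, n_init=10)
    labels_k = km.fit_predict(segmentation_normalized)
    print(f'k={k}: silhouette {silhouette_score(segmentation_normalized, labels_k):.3f}')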

# Try k = 3 clusters
model = KMeans(n_clusters=3, random_state=1, n_init=10)
model.fit(segmentation_normalized)
labels = model.predict(segmentation_normalized)
segmentation['cluster_id'] = labels

# Cross-tabulate the actual status against the cluster assignment,
# print it, and show it as a heatmap
ct = pd.crosstab(segmentation['Subscribe'], segmentation['cluster_id'])
print(ct)
sns.heatmap(ct)
plt.show()

# Scatter plot of the three clusters to see how well they track subscription status
sns.scatterplot(data=segmentation, x='Time', y='Frequency', hue='cluster_id')
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.xticks(range(0,20))
plt.title('Time-Frequency scatter plot with 3 clusters')
plt.show()

# Show per-cluster averages of the engagement features
time_freq_means = segmentation.groupby('cluster_id')[['Time', 'Frequency']].mean().round(0)
print(time_freq_means)
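The groupby means above use the raw feature values. An equivalent cross-check (a small sketch using the scaler fitted earlier) is to map the model's cluster centres back from standardized to original Time/Frequency units:

# Cluster centres mapped back to the original units
centres = scale.inverse_transform(model.cluster_centers_)
print(pd.DataFrame(centres, columns=['Time', 'Frequency']).round(0))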

# Since there are only two actual classes (subscribed vs. churned), try 2 clusters
model = KMeans(n_clusters=2, random_state=1, n_init=10)
model.fit(segmentation_normalized)
labels = model.predict(segmentation_normalized)
segmentation['cluster_id'] = labels

# Create the crosstab for the 2-cluster case
ct = pd.crosstab(segmentation['Subscribe'], segmentation['cluster_id'])
print(ct)
sns.heatmap(ct)
plt.show()

# Show the 2-cluster solution as a scatter plot
sns.scatterplot(data=segmentation, x='Time', y='Frequency', hue='cluster_id')
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.xticks(range(0,20))
plt.title('Time-Frequency scatter plot with 2 clusters')
plt.show()

# For comparison, construct a scatter plot of the actual churn and subscribe classes
sns.scatterplot(data=segmentation, x='Time', y='Frequency', hue='Subscribe', palette='RdBu')
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.xticks(range(0,20))
plt.title('Time-Frequency scatter plot with actual subscribe-churn clusters')
plt.show()
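Beyond eyeballing the scatter plots, the agreement between the 2-cluster assignment and the actual subscribe/churn labels can be quantified. A minimal sketch using the adjusted Rand index (1.0 means perfect agreement, values near 0.0 mean chance-level agreement):

from sklearn.metrics import adjusted_rand_score

# Agreement between the 2-cluster labels and the actual Subscribe labels
print(f'Adjusted Rand index: {adjusted_rand_score(y, labels):.3f}')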