AZ Watch is a popular video streaming platform specializing in educational content, where creators publish online video tutorials and lessons on any topic, from speaking a new language to cooking to playing a musical instrument.
Their next goal is to leverage AI-driven solutions to analyze and make predictions about their subscribers, improving their marketing strategy for attracting new subscribers and retaining current ones. This project uses machine learning to predict which subscribers are likely to churn and to identify customer segments. These segments may reveal usage patterns AZ Watch can use to build subscriber personas for future marketing plans!
The data/AZWatch_subscribers.csv dataset contains information about subscribers and their status over the last year:
| Column name | Description |
|---|---|
| subscriber_id | The unique identifier of each subscriber |
| age_group | The subscriber's age group |
| engagement_time | Average time (in minutes) the subscriber spent per session |
| engagement_frequency | Average number of weekly sessions (logins to the platform) over the one-year period |
| subscription_status | Whether the user remained subscribed to the platform by the end of the year (subscribed) or unsubscribed and terminated their service (churned) |
Carefully observe and analyze the features in the dataset, asking yourself whether there are any categorical attributes that require pre-processing.
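For instance, a quick look at the column types, the unique values of age_group, and the class balance of subscription_status can reveal which attributes are categorical. This is a minimal sketch, assuming the CSV is read into a DataFrame named df as it is later in the project:

import pandas as pd

df = pd.read_csv("data/AZWatch_subscribers.csv")
print(df.dtypes)                                               # object columns are candidates for encoding
print(df["age_group"].value_counts())                          # how many subscribers fall in each age group
print(df["subscription_status"].value_counts(normalize=True))  # share of subscribed vs. churned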
The subscribers dataset from the data/AZWatch_subscribers.csv file has already been loaded and split into training and test sets for you:
# Import the necessary modules
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.metrics import silhouette_score

# Make a scoring function
def scorer(preds):
    cm = confusion_matrix(y_test, preds)
    right = cm[0][0] + cm[1][1]
    error = cm[0][1] + cm[1][0]
    accuracy = right / (right + error)
    return accuracy

# Specify the file path of your CSV file
file_path = "data/AZWatch_subscribers.csv"
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)
# Inspect the data
df.head(2)

df.age_group.nunique()

# Since age_group is categorical and only has 3 unique values, we'll swap the column for 3 boolean dummy columns
dummies = pd.get_dummies(df.age_group)
dummies = dummies.astype(int)
# Cols to omit at standard scaling step
dummy_cols = list(dummies.columns)
# Concatenate dummies to the original dataframe
fin_df = pd.concat([df, dummies], axis=1)
fin_df.drop(columns=["age_group"], inplace=True) # Now represented by dummies
# Shift dummy columns from [0, 1] to [-1, 1] to match standard scaler
fin_df[dummy_cols] = fin_df[dummy_cols] * 2 - 1
# Separate predictor variables from class label and drop the irrelevant id column
X = fin_df.drop(['subscriber_id','subscription_status'], axis=1)
# Omit booleans from scaling
scale_cols = [col for col in X.columns if col not in dummy_cols]
scaler = StandardScaler()
X[scale_cols] = scaler.fit_transform(X[scale_cols])
y = fin_df.subscription_status.map({"churned":-1, "subscribed":1}).astype(int)
ynu = fin_df.subscription_status.nunique()
seed = 42
# Split into training and test sets (20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed)

fin_df.head(5)

models = [LogisticRegression(random_state=seed),
          DecisionTreeClassifier(random_state=seed),
          RandomForestClassifier(random_state=seed)]
best_accuracy = 0
best_model = ""
for i in range(len(models)):
    model = models[i]
    fit_model = model.fit(X_train, y_train)
    globals()[f"model{i+1}"] = fit_model
    model_preds = fit_model.predict(X_test)
    globals()[f"preds{i+1}"] = model_preds
    accuracy = scorer(model_preds)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model
    print(models[i], scorer(model_preds))
print("best_accuracy:", score := best_accuracy)
print("best_model:", best_model)
print("_" * 42)
# SUBSCRIBER SEGMENTATION
# You can optionally use a method such as the elbow criterion and the silhouette score to choose the number of clusters.
segmentation = fin_df.drop(['subscriber_id','subscription_status'], axis=1)
# Standardize the remaining attributes (engagement features and the age-group dummies)
scaler = StandardScaler()
scaler.fit(segmentation)
segmentation_normalized = scaler.transform(segmentation)
sse = {} # sum of squared errors (distances) to each cluster
silhouette_scores = {}
for k in range(2, 20):  # silhouette score is not defined for k=1
    kmeans = KMeans(n_clusters=k, random_state=1)
    kmeans.fit(segmentation_normalized)
    sse[k] = kmeans.inertia_
    labels = kmeans.labels_
    silhouette_scores[k] = silhouette_score(segmentation_normalized, labels)
# Plot Elbow method
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
plt.title('Elbow method to choose k')
plt.xlabel('k')
plt.ylabel('SSE')
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.tight_layout()
# Plot Silhouette scores
plt.subplot(1,2,2)
plt.title('Silhouette Score for k')
plt.xlabel('k')
plt.ylabel('Silhouette Score')
sns.pointplot(x=list(silhouette_scores.keys()), y=list(silhouette_scores.values()))
plt.tight_layout()
plt.show()

print(sse)

# Algorithmic selection of the elbow
ims = {} # in-slopes
oms = {} # out-slopes
dms = {} # delta-slopes
for k, v in sse.items():
    if k > 2:  # slope from the previous k to this k
        im = v - sse[k - 1]
        ims[k] = im
    if k < max(sse.keys()):  # slope from this k to the next k
        om = sse[k + 1] - v
        oms[k] = om
print(ims)
print("-" * 42)
print(oms)

# Subtract the out-slope from the in-slope to get the change of slope at each k
for k in sse.keys():
    if (k in ims) and (k in oms):  # only for keys with both slopes defined
        dms[k] = ims[k] - oms[k]   # change of slope at k
print(dms)

# The elbow is where the slope change is most extreme (minimum)
elbow_key = min(dms, key=dms.get)
elbow_key
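Alternatively, using the silhouette_scores dictionary computed above, you could pick the k with the highest silhouette score and compare it against the elbow choice. A minimal sketch:

silhouette_key = max(silhouette_scores, key=silhouette_scores.get)
print("elbow k:", elbow_key, "| best-silhouette k:", silhouette_key)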
# Apply k-means clustering with the elbow number of clusters
kmeans = KMeans(n_clusters=elbow_key, random_state=1)
kmeans.fit_predict(segmentation_normalized)
# Add cluster labels as a new attribute to the unscaled dataset
segmentation["cluster_id"] = kmeans.labels_
# Analyze average feature values and subscriber counts per cluster
analysis = segmentation.groupby(['cluster_id']).agg({
    'engagement_time': ['mean', 'count'],
    'engagement_frequency': ['mean']
}).round(0)
analysis
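As a follow-up sketch (assuming dummy_cols still holds the age-group dummy column names and that segmentation keeps those columns on the [-1, 1] scale set earlier), you could also inspect each cluster's age-group mix to start shaping subscriber personas:

age_mix = segmentation.groupby('cluster_id')[dummy_cols].mean().round(2)
print(age_mix)  # values near 1 mean the age group dominates that cluster; values near -1 mean it is rare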