Recipe Site Traffic Model

1 hidden cell

The GridSearch statements in the RFC training section are commented out with a leading # so that the code runs faster. Remove the # markers to run the grid search yourself (this adds roughly 5 minutes).

# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import linalg
from scipy.special import expit
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.utils import resample
from sklearn.metrics import precision_score
from tqdm import tqdm

# DATA VALIDATION AND EXPLORATORY ANALYSIS

# Read the CSV into a dataframe.
# "recipe" is intentionally kept as an ordinary column: the duplicate check and
# the feature-matrix construction later in this script address it by column
# name. A set_index("recipe") call whose result is discarded has no effect, so
# none is made here.
recipes = pd.read_csv("recipe_site_traffic_2212.csv")

# Inspect data: column dtypes / non-null counts, then the first rows
recipes.info()
recipes.head()

# Count the missing values in every column
missing_per_column = recipes.isna().sum()
print(missing_per_column)

# Column "recipe": flag every row whose recipe id occurs more than once
# (keep=False marks all members of a duplicate group, not only the repeats)
duplicates = recipes.duplicated(subset="recipe", keep=False)
duplicated_recipes = recipes.loc[duplicates].sort_values(by="recipe")
print(duplicated_recipes["recipe"])

# Column "servings": keep only the first run of digits from each entry and store it as an integer.
# The pattern is a raw string so that "\d" reaches the regex engine untouched,
# and expand=False returns a Series (rather than a one-column DataFrame) for the assignment.
recipes["servings"] = (
    recipes["servings"].str.extract(r"(\d+)", expand=False).astype(int)
)
print(recipes["servings"].dtype)

# Column "category": the set of labels that are considered valid
categories = [
    'Lunch/Snacks',
    'Beverages',
    'Potato',
    'Vegetable',
    'Meat',
    'Chicken',
    'Pork',
    'Dessert',
    'Breakfast',
    'One Dish Meal',
]

# True for rows whose category is NOT one of the valid labels
cvad = ~recipes["category"].isin(categories)
print(cvad.value_counts())

# Show which unexpected labels occur (98 such rows were found in the original analysis)
subset = recipes.loc[cvad]
print(subset["category"].value_counts())

# All of them were "Chicken Breast": fold that label into "Chicken" ...
recipes["category"] = recipes["category"].replace({"Chicken Breast": "Chicken"})

# ... and re-check. From here on cvad is True for rows with a VALID category.
cvad = recipes["category"].isin(categories)
print(cvad.value_counts())

# Column "high_traffic": look at the label counts, then label every missing value "Low"
recipes["high_traffic"].value_counts()
recipes["high_traffic"] = recipes["high_traffic"].fillna(value="Low")

# Columns "calories", "carbohydrate", "sugar", "protein": 52 values are missing in each.
# Summary statistics first.
recipes.describe()

# Rule of thumb for dropping rows: only acceptable when fewer than 5% of the rows are affected.
treshold = 0.05 * len(recipes)
print(treshold)

# The 5% threshold (about 47 rows) is below the 52 rows with missing values, so the
# rows are imputed instead of dropped. Plot each numeric column as a histogram and
# a boxplot to decide how to impute.
numerical = ["calories", "carbohydrate", "sugar", "protein", "servings"]
recipes_numerical = recipes[numerical]
n = recipes_numerical.shape[1]
fig, axs = plt.subplots(n, 2, figsize=(15, n*5))

# One grid row per column: histogram on the left, boxplot on the right
for (hist_ax, box_ax), column in zip(axs, recipes_numerical.columns):
    hist_ax.hist(recipes[column], bins=30, color='lightblue')
    hist_ax.set_title(f'Histogram of {column}')

    sns.boxplot(x=recipes[column], ax=box_ax)
    box_ax.set_title(f'Boxplot of {column}')

plt.tight_layout()
plt.show()

# Impute missing numeric values with the per-column median. The min/max of each
# variable looked sane for nutrition data, so outliers are kept rather than dropped.
recipes[numerical] = recipes[numerical].fillna(recipes[numerical].median())

# Check again for missing data and data types.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
print(recipes.isna().sum())
recipes.info()

# Check the relationships between numerical data.
# recipes[numerical] is selected afresh so the plot reflects the imputed values;
# recipes_numerical is a snapshot taken before imputation and still holds the NaNs.
sns.pairplot(recipes[numerical])
plt.show()

# Check data for correlation and create a heatmap (same imputed columns as above)
recipes_corr = recipes[numerical].corr()
sns.heatmap(recipes_corr, annot=True)
plt.show()

# Class distribution (as proportions) for category, servings and high_traffic
for column_name in ["category", "servings", "high_traffic"]:
    v = recipes[column_name].value_counts(normalize=True)
    print(v)

# BASELINE MODEL m1 - logistic regression

# Turn the target into 0/1 ("Low" -> 0, "High" -> 1)
target_codes = {'Low': 0, 'High': 1}
recipes["high_traffic"] = recipes["high_traffic"].map(target_codes)

# One-hot encode "category" first, then "servings" (each serving size becomes its own column)
recipes_encoded = (
    recipes
    .pipe(pd.get_dummies, columns=["category"], prefix="category")
    .pipe(pd.get_dummies, columns=["servings"], prefix="serving")
)

# Features for m1: everything except the id, the target, three of the nutrition
# columns and the "serving_6" dummy (presumably left out as the reference level - confirm).
m1_excluded = ["recipe", "high_traffic", "calories", "carbohydrate", "protein", "serving_6"]
X_m1 = recipes_encoded.drop(columns=m1_excluded)
y_m1 = recipes_encoded["high_traffic"]

# Hold out 20% of the rows for testing, keeping the class ratio equal in both parts
X_train_m1, X_test_m1, y_train_m1, y_test_m1 = train_test_split(
    X_m1,
    y_m1,
    test_size=0.2,
    stratify=y_m1,
    random_state=42,
)

# Learn the scaling on the training rows only, then apply it to both parts
scaler_m1 = StandardScaler()
X_train_scaled_m1 = scaler_m1.fit(X_train_m1).transform(X_train_m1)
X_test_scaled_m1 = scaler_m1.transform(X_test_m1)

# Six shuffled, stratified folds for cross-validation
stratified_cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)

# Cross-validate a fresh logistic regression once per metric and report mean / std over the folds
scoring_list = ["precision", "recall", "f1", "roc_auc"]
for metric in scoring_list:
    score_m1 = cross_val_score(
        LogisticRegression(),
        X_train_scaled_m1,
        y_train_m1,
        scoring=metric,
        cv=stratified_cv,
    )
    fold_mean = score_m1.mean()
    fold_std = score_m1.std()
    print(f"m1 {metric}: {fold_mean} (std: {fold_std})")

# Fit the final logistic regression on the whole (scaled) training set
m1_final = LogisticRegression()
m1_final.fit(X=X_train_scaled_m1, y=y_train_m1)

# Predict the held-out rows
y_pred_test_m1 = m1_final.predict(X_test_scaled_m1)

# Accuracy and precision on the test set
m1_accuracy = accuracy_score(y_true=y_test_m1, y_pred=y_pred_test_m1)
m1_prec = precision_score(y_true=y_test_m1, y_pred=y_pred_test_m1)
print(f"Final Model m1 Accuracy on Test Set: {m1_accuracy}")
print(f"Final Model m1 Precision on Test Set: {m1_prec}")

# Table of the fitted coefficients, one row per feature, with two derived columns:
#   Odds          = exp(coefficient)
#   Probabilities = expit(coefficient), i.e. the logistic sigmoid of the coefficient
# NOTE(review): expit(coefficient) ignores the intercept and the features were
# standardized, so this reads more like a ranking aid than a literal probability -
# confirm the intended interpretation.
coefficients_m1 = pd.DataFrame(
    {'Feature': X_m1.columns, 'Coefficient': m1_final.coef_[0]}
).assign(
    Odds=lambda table: np.exp(table["Coefficient"]),
    Probabilities=lambda table: expit(table["Coefficient"]),
)
coefficients_sorted = coefficients_m1.sort_values("Probabilities")
print(coefficients_sorted)

# Horizontal bar chart of the sorted values, with a reference line at 0.5
plt.figure(figsize=(10, 6))
sns.barplot(data=coefficients_sorted, x='Probabilities', y='Feature')
plt.title('Model m1 - probability of high traffic given a feature')
plt.axvline(x=0.5, linewidth=1, color='k')
plt.show()

recipes.head()

# COMPARISON MODEL m2 - RFC

# Features for m2: every encoded column except the id and the target
X_m2 = recipes_encoded.drop(columns=["recipe", "high_traffic"])
y_m2 = recipes_encoded["high_traffic"]

# Stratified 80/20 split into training and testing sets
X_train_m2_cv, X_test_m2_cv, y_train_m2_cv, y_test_m2_cv = train_test_split(
    X_m2,
    y_m2,
    test_size=0.2,
    stratify=y_m2,
    random_state=42,
)

# Fit the scaler on the training rows only, then transform both parts
scaler_m2cv = StandardScaler()
X_train_scaled_m2_cv = scaler_m2cv.fit(X_train_m2_cv).transform(X_train_m2_cv)
X_test_scaled_m2_cv = scaler_m2cv.transform(X_test_m2_cv)

# Class weights inversely proportional to class frequency:
# weight = n_samples / (2 * n_samples_in_class)
total_samples_cv_m2 = y_train_m2_cv.shape[0]
class_0_weight = total_samples_cv_m2 / (2 * (y_train_m2_cv == 0).sum())
class_1_weight = total_samples_cv_m2 / (2 * (y_train_m2_cv == 1).sum())
class_weights_cv_m2 = dict(zip((0, 1), (class_0_weight, class_1_weight)))

# Define the parameter grid to search the best parameters.
# min_samples_split starts at 2: scikit-learn requires an integer value of at
# least 2 for this parameter, so a candidate of 1 would only produce failed fits.
param_grid = {
    'n_estimators': [10, 25, 50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8],
}

# The grid search below is commented out to keep the run time short;
# remove the leading "# " from the statements to execute it.

# Instantiate the RandomForestClassifier for the grid search
# rf_classifier_0 = RandomForestClassifier(random_state=42, class_weight=class_weights_cv_m2)

# Create the GridSearchCV object
# grid_search = GridSearchCV(estimator=rf_classifier_0, param_grid=param_grid, scoring='precision', cv=7)

# Fit the GridSearchCV object to the data
# grid_search.fit(X_train_scaled_m2_cv, y_train_m2_cv)

# Print the best parameters and corresponding precision
# print("Best Parameters:", grid_search.best_params_)
# print("Best Precision:", grid_search.best_score_)

# Instantiate a Random Forest Classifier with the chosen hyper-parameters.
# NOTE(review): this cross-validated model uses n_estimators=10 while the final
# model fitted later in this script uses n_estimators=150 - confirm which is intended.
m2cv = RandomForestClassifier(
    n_estimators=10,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=8,
    class_weight=class_weights_cv_m2,
    random_state=42
)

# StratifiedKFold for stratified cross-validation (shuffled, reproducible folds)
stratified_cv_2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and print the mean / std of each metric.
# The splitter defined above is passed explicitly; a bare integer for cv would
# ignore it and fall back to unshuffled folds.
for metric in scoring_list:
    cv_scores_m2 = cross_val_score(
        m2cv,
        X_train_scaled_m2_cv,
        y_train_m2_cv,
        cv=stratified_cv_2,
        scoring=metric,
    )
    print(f"m2 cross-validation {metric}: {cv_scores_m2.mean()} (std: {cv_scores_m2.std()})")



# Final RFC model: stratified 80/20 split for training the random forest classifier
X_train_m2, X_test_m2, y_train_m2, y_test_m2 = train_test_split(
    X_m2,
    y_m2,
    test_size=0.2,
    stratify=y_m2,
    random_state=42,
)

# Final RFC model: fit the scaler on the training rows and transform both parts
scaler_m2 = StandardScaler()
X_train_scaled_m2 = scaler_m2.fit(X_train_m2).transform(X_train_m2)
X_test_scaled_m2 = scaler_m2.transform(X_test_m2)

# Final RFC model: class weights inversely proportional to class frequency,
# weight = n_samples / (2 * n_samples_in_class)
total_samples_m2 = y_train_m2.shape[0]
class_0_weight_m2 = total_samples_m2 / (2 * (y_train_m2 == 0).sum())
class_1_weight_m2 = total_samples_m2 / (2 * (y_train_m2 == 1).sum())
class_weights_m2 = dict(zip((0, 1), (class_0_weight_m2, class_1_weight_m2)))

# Final RFC model: hyper-parameters collected in one place, then passed to the classifier
m2_final_params = {
    "n_estimators": 150,
    "max_depth": 10,
    "min_samples_split": 2,
    "min_samples_leaf": 8,
    "class_weight": class_weights_m2,
    "random_state": 42,
}
m2_final = RandomForestClassifier(**m2_final_params)

# Final RFC model: train on the scaled training set
m2_final.fit(X=X_train_scaled_m2, y=y_train_m2)

# Final RFC model: predict the held-out rows
y_pred_m2 = m2_final.predict(X_test_scaled_m2)

# Final RFC model: test-set metrics
accuracy_m2 = accuracy_score(y_true=y_test_m2, y_pred=y_pred_m2)
precision_val = precision_score(y_true=y_test_m2, y_pred=y_pred_m2)
recall_m2 = recall_score(y_true=y_test_m2, y_pred=y_pred_m2)
f1_m2 = f1_score(y_true=y_test_m2, y_pred=y_pred_m2)

# Final RFC model: report the metrics rounded to four decimals
print(f'Accuracy of m2 rf: {accuracy_m2:.4f}')
print(f'Precision of m2 rf: {precision_val:.4f}')
print(f'Recall of m2 rf: {recall_m2:.4f}')
print(f'F1-score of m2 rf: {f1_m2:.4f}')

# Which features does the random forest rely on most?
# Pair every feature name with its importance and order the table from most to least important.
feature_importance = m2_final.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_m2.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)
print(feature_importance_df)

# Horizontal bar chart of the sorted importances
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
plt.title('RFC Feature Importance')
plt.show()

# MODEL m1,m2 TEST

# Hand-made sample of 15 recipes used to try out both models.
# Every list has one entry per recipe, in the same order.
test_data = {
    # 'TestRecipe1' ... 'TestRecipe15'
    'recipe': [f'TestRecipe{number}' for number in range(1, 16)],
    'calories': [300, 450, 200, 350, 500, 250, 400, 300, 450, 350, 150, 400, 300, 500, 250],
    'carbohydrate': [25, 30, 15, 20, 35, 18, 28, 24, 32, 22, 10, 28, 20, 40, 15],
    'sugar': [10, 15, 5, 8, 12, 6, 10, 9, 14, 11, 3, 15, 12, 20, 8],
    'protein': [20, 25, 15, 18, 30, 16, 24, 22, 28, 20, 8, 30, 18, 25, 14],
    'category': ['Dessert', 'Breakfast', 'Lunch/Snacks', 'Chicken', 'Vegetable',
                 'Breakfast', 'Dessert', 'Chicken', 'Lunch/Snacks', 'Vegetable',
                 'Beverages', 'Meat', 'One Dish Meal', 'Pork', 'Potato'],
    'servings': [2, 4, 1, 6, 2, 2, 4, 2, 4, 6, 1, 4, 2, 6, 2],
}

# Creating the test dataframe.
# "recipe" stays a regular column because the encoding step that follows drops
# it by name; a set_index("recipe") call whose result is discarded would do
# nothing, so none is made.
test_df = pd.DataFrame(test_data)

# Encode categorical variables the same way as the training data
test_df_encoded = pd.get_dummies(test_df, columns=["category"], prefix="category")
test_df_encoded = pd.get_dummies(test_df_encoded, columns=["servings"], prefix="serving")

# Align the columns with the matrices each model was trained on: same columns,
# same order. A dummy column that does not occur in the new data is added as
# all zeros, and columns the model never saw (e.g. "recipe") are left out.
test_df_X_m1 = test_df_encoded.reindex(columns=X_m1.columns, fill_value=0)
test_df_X_m2 = test_df_encoded.reindex(columns=X_m2.columns, fill_value=0)

# Scale data with the scalers that were fitted on the TRAINING data.
# Fitting a new scaler on these 15 rows would standardize them with different
# means / variances than the models were trained with, making the predictions
# meaningless.
test_df_X_scaled_m1 = scaler_m1.transform(test_df_X_m1)
test_df_X_scaled_m2 = scaler_m2.transform(test_df_X_m2)

# Predict with m1
test_df["high_traffic_m1"] = m1_final.predict(test_df_X_scaled_m1)

# Predict with m2
test_df["high_traffic_m2"] = m2_final.predict(test_df_X_scaled_m2)

# Print
test_df