Skip to content
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
import joblib

# Load the dataset
data_path = 'recipe_site_traffic_2212.csv'
data = pd.read_csv(data_path)

# Step 1: Data Preprocessing Pipeline
preprocessing_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

def preprocess_data(df):
    # Remove invalid entries in 'servings' and convert to integer
    df = df[~df['servings'].astype(str).str.contains('as a snack', na=False)]
    df['servings'] = pd.to_numeric(df['servings'], errors='coerce').fillna(0).astype(int)

    # Fill missing values
    for column in df.columns:
        if column != 'high_traffic':
            if df[column].dtype in ['float64', 'int64']:
                df[column].fillna(df[column].median(), inplace=True)
            elif df[column].dtype == 'object':
                df[column].fillna(df[column].mode()[0], inplace=True)

    # Handle 'high_traffic' column
    df['high_traffic'] = df['high_traffic'].fillna('Low')
    df['high_traffic'] = df['high_traffic'].map({'High': 1, 'Low': 0})

    # One-hot encode 'category'
    df = pd.get_dummies(df, columns=['category'], drop_first=True)

    # Normalize features
    feature_columns = [col for col in df.columns if col not in ['recipe', 'high_traffic']]
    df[feature_columns] = preprocessing_pipeline.fit_transform(df[feature_columns])

    return df

# Preprocess the data
data = preprocess_data(data)

# Save processed dataset
processed_data_path = 'processed_recipe_site_traffic.csv'
data.to_csv(processed_data_path, index=False)

# Step 2: Model Implementation
X = data.drop(columns=['recipe', 'high_traffic'])
y = data['high_traffic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Handle imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Baseline Logistic Regression Model
baseline_model = LogisticRegression(random_state=42, max_iter=500)
baseline_model.fit(X_train_resampled, y_train_resampled)

# Predict and evaluate the baseline model
y_pred_baseline = baseline_model.predict(X_test)

# Save the baseline model
baseline_model_path = 'baseline_logistic_regression_model.pkl'
joblib.dump(baseline_model, baseline_model_path)

# Step 3: Optimize Voting Classifier with Random Forest Tuning
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# Grid search for best Random Forest parameters
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_resampled, y_train_resampled)
best_rf = grid_search_rf.best_estimator_

# Voting Classifier with Soft Voting and Weights
classifiers = [
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=500)),
    ('K-Nearest Neighbor', KNeighborsClassifier(n_neighbors=7)),
    ('Decision Tree', DecisionTreeClassifier(max_depth=5, random_state=42)),
    ('Random Forest', best_rf)
]

voting_classifier = VotingClassifier(
    estimators=classifiers,
    voting='soft',
    weights=[2, 1, 1, 3]  # Giving higher weight to Random Forest
)

# Fit the voting classifier
voting_classifier.fit(X_train_resampled, y_train_resampled)

# Predict and evaluate the voting classifier
y_pred_voting = voting_classifier.predict(X_test)
y_prob_voting = voting_classifier.predict_proba(X_test)[:, 1]

# Save the optimized voting classifier
voting_classifier_path = 'optimized_voting_classifier_model.pkl'
joblib.dump(voting_classifier, voting_classifier_path)

# Step 4: Feature Importance Analysis
feature_importances = best_rf.feature_importances_
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Visualize Feature Importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')
plt.title('Feature Importance from Random Forest')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.savefig('feature_importance.png')
plt.show()

# Step 5: Decision Threshold Optimization
optimal_threshold = thresholds[np.argmax(precision * recall)]

# Predict with Optimized Threshold
y_pred_optimized = (y_prob_voting >= optimal_threshold).astype(int)

# Additional Plots
# 1. Correlation Heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.savefig('correlation_heatmap.png')
plt.show()

# 2. Confusion Matrix Heatmaps for Models
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
sns.heatmap(confusion_matrix(y_test, y_pred_baseline), annot=True, fmt='d', cmap='Blues', ax=axes[0])
axes[0].set_title('Baseline Model Confusion Matrix')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')
sns.heatmap(confusion_matrix(y_test, y_pred_voting), annot=True, fmt='d', cmap='Greens', ax=axes[1])
axes[1].set_title('Voting Classifier Confusion Matrix')
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')
plt.savefig('confusion_matrix_heatmaps.png')
plt.show()

# 3. Distribution of Important Features
important_features = importance_df['Feature'].head(3)
for feature in important_features:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='high_traffic', y=feature, data=data, palette='muted')
    plt.title(f'{feature} Distribution by High Traffic')
    plt.xlabel('High Traffic (1: Yes, 0: No)')
    plt.ylabel(feature)
    plt.savefig(f'{feature}_distribution.png')
    plt.show()

# 4. ROC Curve
from sklearn.metrics import roc_curve
fpr, tpr, _ = roc_curve(y_test, y_prob_voting)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='Voting Classifier')
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.savefig('roc_curve.png')
plt.show()

Recipe Site Traffic Analysis: Written Report


1. Data Validation and Cleaning

  • Servings Column:

    • Removed invalid entries such as "as a snack."
    • Converted all values to integers, filling missing values with 0.
  • High Traffic Column:

    • Missing values filled with "Low."
    • Converted categorical values to binary encoding: High = 1, Low = 0.
  • Categorical Variables (e.g., Category):

    • One-hot encoded, dropping the first category to avoid multicollinearity.
  • Numerical Features (e.g., Protein, Calories):

    • Missing values replaced with median values.
    • Standardized using StandardScaler to ensure uniform scaling.

2. Exploratory Analysis

Key Insights:

  • Recipes high in protein and calories are strongly associated with high traffic.
  • Vegetables and potatoes emerged as highly impactful categories, more so than desserts or breakfast options.

Visualizations:

  1. Protein Distribution by High Traffic:

    • Recipes with higher protein content are more likely to result in high traffic, indicating the appeal of health-focused options.
  2. Calories Distribution by High Traffic:

    • High-calorie recipes show increased traffic, suggesting consumer interest in indulgent recipes.
  3. Correlation Heatmap:

    • Displays strong correlations between Protein, Calories, and Category_Vegetables with high traffic.

3. Model Development

Problem Type:

  • Binary classification to predict whether a recipe will generate high traffic (High Traffic = 1, Low Traffic = 0).

Baseline Model:

  • Logistic Regression:
    • Chosen for its simplicity and interpretability.
    • Metrics: Accuracy: 73.9%, Recall (High Traffic): 72%.

Comparison Model:

  • Voting Classifier:
    • Combines Logistic Regression, Random Forest, KNN, and Decision Tree.
    • Optimized using GridSearchCV for Random Forest parameters.
    • Metrics: Accuracy: 74.3%, Recall (High Traffic): 97%.

4. Model Evaluation

Baseline vs. Voting Classifier:

  1. Logistic Regression:

    • Precision: 83%, Recall: 72%.
    • Balanced performance but lower recall for high traffic.
  2. Voting Classifier:

    • Precision: 71%, Recall: 97%.
    • Prioritizes recall to ensure most high-traffic recipes are identified.

Visualizations:

  • Precision-Recall Curve:

    • Demonstrates how the Voting Classifier balances precision and recall effectively.
  • ROC Curve:

    • Highlights the Voting Classifier’s ability to distinguish high and low traffic recipes (AUC: High).

5. Business Metric Definition

Proposed Metric:

  • Recall for High Traffic Recipes:
    • Ensures the business identifies the majority of high-traffic recipes for promotion.

Initial Value:

  • 97% (achieved by the Voting Classifier).

Monitoring Plan:

  • Regularly evaluate recall during model predictions.
  • Retrain the model periodically with new recipe data to maintain performance.

6. Final Summary and Recommendations

Key Findings:

  • Protein, calories, and category-specific variables (e.g., vegetables and potatoes) are the top predictors of recipe traffic.
  • The Voting Classifier significantly outperforms Logistic Regression in recall, aligning with the business’s goal to identify high-traffic recipes.

Recommendations:

  1. Promote High-Traffic Recipes:

    • Focus on high-protein, low-calorie options to align with health-conscious trends.
    • Highlight recipes in impactful categories such as vegetables and potatoes.
  2. Deploy Voting Classifier:

    • Use the optimized model to predict high-traffic recipes for marketing campaigns.
  3. Monitor and Iterate:

    • Track recall as the primary performance metric.
    • Retrain the model regularly to incorporate new recipe data.

Appendix:

  • Visualizations:
    1. Protein Distribution by High Traffic.
    2. Calories Distribution by High Traffic.
    3. Correlation Heatmap.
    4. Precision-Recall Curve.
    5. ROC Curve.
  • Code:
    • Full implementation is available for review upon request.