top_recipes_website_model

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
import joblib

# Load the dataset
data_path = 'recipe_site_traffic_2212.csv'
data = pd.read_csv(data_path)

# Step 1: Data Preprocessing Pipeline
preprocessing_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

def preprocess_data(df):
    # Remove invalid entries in 'servings' and convert to integer
    df = df[~df['servings'].astype(str).str.contains('as a snack', na=False)]
    df['servings'] = pd.to_numeric(df['servings'], errors='coerce').fillna(0).astype(int)

    # Fill missing values
    for column in df.columns:
        if column != 'high_traffic':
            if df[column].dtype in ['float64', 'int64']:
                df[column].fillna(df[column].median(), inplace=True)
            elif df[column].dtype == 'object':
                df[column].fillna(df[column].mode()[0], inplace=True)

    # Handle 'high_traffic' column
    df['high_traffic'] = df['high_traffic'].fillna('Low')
    df['high_traffic'] = df['high_traffic'].map({'High': 1, 'Low': 0})

    # One-hot encode 'category'
    df = pd.get_dummies(df, columns=['category'], drop_first=True)

    # Normalize features
    feature_columns = [col for col in df.columns if col not in ['recipe', 'high_traffic']]
    df[feature_columns] = preprocessing_pipeline.fit_transform(df[feature_columns])

    return df

# Preprocess the data
data = preprocess_data(data)

# Save processed dataset
processed_data_path = 'processed_recipe_site_traffic.csv'
data.to_csv(processed_data_path, index=False)

# Step 2: Model Implementation
X = data.drop(columns=['recipe', 'high_traffic'])
y = data['high_traffic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Handle imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Baseline Logistic Regression Model
baseline_model = LogisticRegression(random_state=42, max_iter=500)
baseline_model.fit(X_train_resampled, y_train_resampled)

# Predict and evaluate the baseline model
y_pred_baseline = baseline_model.predict(X_test)

# Save the baseline model
baseline_model_path = 'baseline_logistic_regression_model.pkl'
joblib.dump(baseline_model, baseline_model_path)

# Step 3: Optimize Voting Classifier with Random Forest Tuning
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# Grid search for best Random Forest parameters
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_resampled, y_train_resampled)
best_rf = grid_search_rf.best_estimator_

# Voting Classifier with Soft Voting and Weights
classifiers = [
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=500)),
    ('K-Nearest Neighbor', KNeighborsClassifier(n_neighbors=7)),
    ('Decision Tree', DecisionTreeClassifier(max_depth=5, random_state=42)),
    ('Random Forest', best_rf)
]

voting_classifier = VotingClassifier(
    estimators=classifiers,
    voting='soft',
    weights=[2, 1, 1, 3]  # Giving higher weight to Random Forest
)

# Fit the voting classifier
voting_classifier.fit(X_train_resampled, y_train_resampled)

# Predict and evaluate the voting classifier
y_pred_voting = voting_classifier.predict(X_test)
y_prob_voting = voting_classifier.predict_proba(X_test)[:, 1]

# Save the optimized voting classifier
voting_classifier_path = 'optimized_voting_classifier_model.pkl'
joblib.dump(voting_classifier, voting_classifier_path)

# Step 4: Feature Importance Analysis
feature_importances = best_rf.feature_importances_
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Visualize Feature Importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')
plt.title('Feature Importance from Random Forest')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.savefig('feature_importance.png')
plt.show()

# Step 5: Decision Threshold Optimization
optimal_threshold = thresholds[np.argmax(precision * recall)]

# Predict with Optimized Threshold
y_pred_optimized = (y_prob_voting >= optimal_threshold).astype(int)

# Additional Plots
# 1. Correlation Heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.savefig('correlation_heatmap.png')
plt.show()

# 2. Confusion Matrix Heatmaps for Models
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
sns.heatmap(confusion_matrix(y_test, y_pred_baseline), annot=True, fmt='d', cmap='Blues', ax=axes[0])
axes[0].set_title('Baseline Model Confusion Matrix')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')
sns.heatmap(confusion_matrix(y_test, y_pred_voting), annot=True, fmt='d', cmap='Greens', ax=axes[1])
axes[1].set_title('Voting Classifier Confusion Matrix')
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')
plt.savefig('confusion_matrix_heatmaps.png')
plt.show()

# 3. Distribution of Important Features
important_features = importance_df['Feature'].head(3)
for feature in important_features:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='high_traffic', y=feature, data=data, palette='muted')
    plt.title(f'{feature} Distribution by High Traffic')
    plt.xlabel('High Traffic (1: Yes, 0: No)')
    plt.ylabel(feature)
    plt.savefig(f'{feature}_distribution.png')
    plt.show()

# 4. ROC Curve
from sklearn.metrics import roc_curve
fpr, tpr, _ = roc_curve(y_test, y_prob_voting)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='Voting Classifier')
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.savefig('roc_curve.png')
plt.show()

Recipe Site Traffic Analysis: Written Report

1. Data Validation and Cleaning

Servings Column:
- Removed invalid entries such as "as a snack."
- Converted all values to integers, filling missing values with 0.
High Traffic Column:
- Missing values filled with "Low."
- Converted categorical values to binary encoding: High = 1, Low = 0.
Categorical Variables (e.g., Category):
- One-hot encoded, dropping the first category to avoid multicollinearity.
Numerical Features (e.g., Protein, Calories):
- Missing values replaced with median values.
- Standardized using StandardScaler to ensure uniform scaling.

2. Exploratory Analysis

Key Insights:

Recipes high in protein and calories are strongly associated with high traffic.
Vegetables and potatoes emerged as highly impactful categories, more so than desserts or breakfast options.

Visualizations:

Protein Distribution by High Traffic:
- Recipes with higher protein content are more likely to result in high traffic, indicating the appeal of health-focused options.
Calories Distribution by High Traffic:
- High-calorie recipes show increased traffic, suggesting consumer interest in indulgent recipes.
Correlation Heatmap:
- Displays strong correlations between Protein, Calories, and Category_Vegetables with high traffic.

3. Model Development

Problem Type:

Binary classification to predict whether a recipe will generate high traffic (High Traffic = 1, Low Traffic = 0).

Baseline Model:

Logistic Regression:
- Chosen for its simplicity and interpretability.
- Metrics: Accuracy: 73.9%, Recall (High Traffic): 72%.

Comparison Model:

Voting Classifier:
- Combines Logistic Regression, Random Forest, KNN, and Decision Tree.
- Optimized using GridSearchCV for Random Forest parameters.
- Metrics: Accuracy: 74.3%, Recall (High Traffic): 97%.

4. Model Evaluation

Baseline vs. Voting Classifier:

Logistic Regression:
- Precision: 83%, Recall: 72%.
- Balanced performance but lower recall for high traffic.
Voting Classifier:
- Precision: 71%, Recall: 97%.
- Prioritizes recall to ensure most high-traffic recipes are identified.

Visualizations:

Precision-Recall Curve:
- Demonstrates how the Voting Classifier balances precision and recall effectively.
ROC Curve:
- Highlights the Voting Classifier’s ability to distinguish high and low traffic recipes (AUC: High).

5. Business Metric Definition

Proposed Metric:

Recall for High Traffic Recipes:
- Ensures the business identifies the majority of high-traffic recipes for promotion.

Initial Value:

97% (achieved by the Voting Classifier).

Monitoring Plan:

Regularly evaluate recall during model predictions.
Retrain the model periodically with new recipe data to maintain performance.

6. Final Summary and Recommendations

Key Findings:

Protein, calories, and category-specific variables (e.g., vegetables and potatoes) are the top predictors of recipe traffic.
The Voting Classifier significantly outperforms Logistic Regression in recall, aligning with the business’s goal to identify high-traffic recipes.

Recommendations:

Promote High-Traffic Recipes:
- Focus on high-protein, low-calorie options to align with health-conscious trends.
- Highlight recipes in impactful categories such as vegetables and potatoes.
Deploy Voting Classifier:
- Use the optimized model to predict high-traffic recipes for marketing campaigns.
Monitor and Iterate:
- Track recall as the primary performance metric.
- Retrain the model regularly to incorporate new recipe data.

Appendix:

Visualizations:
1. Protein Distribution by High Traffic.
2. Calories Distribution by High Traffic.
3. Correlation Heatmap.
4. Precision-Recall Curve.
5. ROC Curve.
Code:
- Full implementation is available for review upon request.

top_recipes_website_model

.mfe-app-workspace-kj242g{position:absolute;top:-8px;}.mfe-app-workspace-11ezf91{display:inline-block;}.mfe-app-workspace-11ezf91:hover .Anchor__copyLink{visibility:visible;}Recipe Site Traffic Analysis: Written Report

1. Data Validation and Cleaning

2. Exploratory Analysis

Key Insights:

Visualizations:

3. Model Development

Problem Type:

Baseline Model:

Comparison Model:

4. Model Evaluation

Baseline vs. Voting Classifier:

Visualizations:

5. Business Metric Definition

Proposed Metric:

Initial Value:

Monitoring Plan:

6. Final Summary and Recommendations

Key Findings:

Recommendations:

Appendix:

Recipe Site Traffic Analysis: Written Report