Skip to content
top_recipes_website_model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
import joblib
# Load the dataset
data_path = 'recipe_site_traffic_2212.csv'
data = pd.read_csv(data_path)
# Step 1: Data Preprocessing Pipeline
preprocessing_pipeline = Pipeline([
('scaler', StandardScaler())
])
def preprocess_data(df):
# Remove invalid entries in 'servings' and convert to integer
df = df[~df['servings'].astype(str).str.contains('as a snack', na=False)]
df['servings'] = pd.to_numeric(df['servings'], errors='coerce').fillna(0).astype(int)
# Fill missing values
for column in df.columns:
if column != 'high_traffic':
if df[column].dtype in ['float64', 'int64']:
df[column].fillna(df[column].median(), inplace=True)
elif df[column].dtype == 'object':
df[column].fillna(df[column].mode()[0], inplace=True)
# Handle 'high_traffic' column
df['high_traffic'] = df['high_traffic'].fillna('Low')
df['high_traffic'] = df['high_traffic'].map({'High': 1, 'Low': 0})
# One-hot encode 'category'
df = pd.get_dummies(df, columns=['category'], drop_first=True)
# Normalize features
feature_columns = [col for col in df.columns if col not in ['recipe', 'high_traffic']]
df[feature_columns] = preprocessing_pipeline.fit_transform(df[feature_columns])
return df
# Preprocess the data
data = preprocess_data(data)
# Save processed dataset
processed_data_path = 'processed_recipe_site_traffic.csv'
data.to_csv(processed_data_path, index=False)
# Step 2: Model Implementation
X = data.drop(columns=['recipe', 'high_traffic'])
y = data['high_traffic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Handle imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Baseline Logistic Regression Model
baseline_model = LogisticRegression(random_state=42, max_iter=500)
baseline_model.fit(X_train_resampled, y_train_resampled)
# Predict and evaluate the baseline model
y_pred_baseline = baseline_model.predict(X_test)
# Save the baseline model
baseline_model_path = 'baseline_logistic_regression_model.pkl'
joblib.dump(baseline_model, baseline_model_path)
# Step 3: Optimize Voting Classifier with Random Forest Tuning
rf_params = {
'n_estimators': [50, 100, 200],
'max_depth': [None, 10, 20],
'min_samples_split': [2, 5]
}
# Grid search for best Random Forest parameters
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_resampled, y_train_resampled)
best_rf = grid_search_rf.best_estimator_
# Voting Classifier with Soft Voting and Weights
classifiers = [
('Logistic Regression', LogisticRegression(random_state=42, max_iter=500)),
('K-Nearest Neighbor', KNeighborsClassifier(n_neighbors=7)),
('Decision Tree', DecisionTreeClassifier(max_depth=5, random_state=42)),
('Random Forest', best_rf)
]
voting_classifier = VotingClassifier(
estimators=classifiers,
voting='soft',
weights=[2, 1, 1, 3] # Giving higher weight to Random Forest
)
# Fit the voting classifier
voting_classifier.fit(X_train_resampled, y_train_resampled)
# Predict and evaluate the voting classifier
y_pred_voting = voting_classifier.predict(X_test)
y_prob_voting = voting_classifier.predict_proba(X_test)[:, 1]
# Save the optimized voting classifier
voting_classifier_path = 'optimized_voting_classifier_model.pkl'
joblib.dump(voting_classifier, voting_classifier_path)
# Step 4: Feature Importance Analysis
feature_importances = best_rf.feature_importances_
importance_df = pd.DataFrame({
'Feature': X_train.columns,
'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)
# Visualize Feature Importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')
plt.title('Feature Importance from Random Forest')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.savefig('feature_importance.png')
plt.show()
# Step 5: Decision Threshold Optimization
optimal_threshold = thresholds[np.argmax(precision * recall)]
# Predict with Optimized Threshold
y_pred_optimized = (y_prob_voting >= optimal_threshold).astype(int)
# Additional Plots
# 1. Correlation Heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.savefig('correlation_heatmap.png')
plt.show()
# 2. Confusion Matrix Heatmaps for Models
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
sns.heatmap(confusion_matrix(y_test, y_pred_baseline), annot=True, fmt='d', cmap='Blues', ax=axes[0])
axes[0].set_title('Baseline Model Confusion Matrix')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')
sns.heatmap(confusion_matrix(y_test, y_pred_voting), annot=True, fmt='d', cmap='Greens', ax=axes[1])
axes[1].set_title('Voting Classifier Confusion Matrix')
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')
plt.savefig('confusion_matrix_heatmaps.png')
plt.show()
# 3. Distribution of Important Features
important_features = importance_df['Feature'].head(3)
for feature in important_features:
plt.figure(figsize=(10, 6))
sns.boxplot(x='high_traffic', y=feature, data=data, palette='muted')
plt.title(f'{feature} Distribution by High Traffic')
plt.xlabel('High Traffic (1: Yes, 0: No)')
plt.ylabel(feature)
plt.savefig(f'{feature}_distribution.png')
plt.show()
# 4. ROC Curve
from sklearn.metrics import roc_curve
fpr, tpr, _ = roc_curve(y_test, y_prob_voting)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='Voting Classifier')
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.savefig('roc_curve.png')
plt.show()Recipe Site Traffic Analysis: Written Report
1. Data Validation and Cleaning
-
Servings Column:
- Removed invalid entries such as "as a snack."
- Converted all values to integers, filling missing values with
0.
-
High Traffic Column:
- Missing values filled with "Low."
- Converted categorical values to binary encoding:
High = 1,Low = 0.
-
Categorical Variables (e.g., Category):
- One-hot encoded, dropping the first category to avoid multicollinearity.
-
Numerical Features (e.g., Protein, Calories):
- Missing values replaced with median values.
- Standardized using
StandardScalerto ensure uniform scaling.
2. Exploratory Analysis
Key Insights:
- Recipes high in protein and calories are strongly associated with high traffic.
- Vegetables and potatoes emerged as highly impactful categories, more so than desserts or breakfast options.
Visualizations:
-
Protein Distribution by High Traffic:
- Recipes with higher protein content are more likely to result in high traffic, indicating the appeal of health-focused options.
-
Calories Distribution by High Traffic:
- High-calorie recipes show increased traffic, suggesting consumer interest in indulgent recipes.
-
Correlation Heatmap:
- Displays strong correlations between
Protein,Calories, andCategory_Vegetableswith high traffic.
- Displays strong correlations between
3. Model Development
Problem Type:
- Binary classification to predict whether a recipe will generate high traffic (
High Traffic = 1,Low Traffic = 0).
Baseline Model:
- Logistic Regression:
- Chosen for its simplicity and interpretability.
- Metrics: Accuracy: 73.9%, Recall (High Traffic): 72%.
Comparison Model:
- Voting Classifier:
- Combines Logistic Regression, Random Forest, KNN, and Decision Tree.
- Optimized using
GridSearchCVfor Random Forest parameters. - Metrics: Accuracy: 74.3%, Recall (High Traffic): 97%.
4. Model Evaluation
Baseline vs. Voting Classifier:
-
Logistic Regression:
- Precision: 83%, Recall: 72%.
- Balanced performance but lower recall for high traffic.
-
Voting Classifier:
- Precision: 71%, Recall: 97%.
- Prioritizes recall to ensure most high-traffic recipes are identified.
Visualizations:
-
Precision-Recall Curve:
- Demonstrates how the Voting Classifier balances precision and recall effectively.
-
ROC Curve:
- Highlights the Voting Classifier’s ability to distinguish high and low traffic recipes (AUC: High).
5. Business Metric Definition
Proposed Metric:
- Recall for High Traffic Recipes:
- Ensures the business identifies the majority of high-traffic recipes for promotion.
Initial Value:
- 97% (achieved by the Voting Classifier).
Monitoring Plan:
- Regularly evaluate recall during model predictions.
- Retrain the model periodically with new recipe data to maintain performance.
6. Final Summary and Recommendations
Key Findings:
- Protein, calories, and category-specific variables (e.g., vegetables and potatoes) are the top predictors of recipe traffic.
- The Voting Classifier significantly outperforms Logistic Regression in recall, aligning with the business’s goal to identify high-traffic recipes.
Recommendations:
-
Promote High-Traffic Recipes:
- Focus on high-protein, low-calorie options to align with health-conscious trends.
- Highlight recipes in impactful categories such as vegetables and potatoes.
-
Deploy Voting Classifier:
- Use the optimized model to predict high-traffic recipes for marketing campaigns.
-
Monitor and Iterate:
- Track recall as the primary performance metric.
- Retrain the model regularly to incorporate new recipe data.
Appendix:
- Visualizations:
- Protein Distribution by High Traffic.
- Calories Distribution by High Traffic.
- Correlation Heatmap.
- Precision-Recall Curve.
- ROC Curve.
- Code:
- Full implementation is available for review upon request.