Project: Predicting Temperature in London
As the climate changes, predicting the weather becomes ever more important for businesses. Because the weather depends on many interacting factors, finding the best predictive approach calls for running many experiments. In this project, you will run experiments with different regression models to predict the mean temperature in London, using a combination of scikit-learn, Optuna, and MLflow.
You will be working with data stored in london_weather.csv, which contains the following columns:
- date - recorded date of measurement, in YYYYMMDD format - (int)
- cloud_cover - cloud cover measurement in oktas - (float)
- sunshine - sunshine measurement in hours (hrs) - (float)
- global_radiation - irradiance measurement in Watt per square meter (W/m2) - (float)
- max_temp - maximum temperature recorded in degrees Celsius (°C) - (float)
- mean_temp - mean temperature in degrees Celsius (°C) - (float)
- min_temp - minimum temperature recorded in degrees Celsius (°C) - (float)
- precipitation - precipitation measurement in millimeters (mm) - (float)
- pressure - pressure measurement in Pascals (Pa) - (float)
- snow_depth - snow depth measurement in centimeters (cm) - (float)
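
Before working with the data, it can help to confirm the file actually matches this schema. A minimal sanity check (a sketch; expected_cols simply restates the column list above):

# Sanity-check the schema against the column list above (a sketch)
import pandas as pd
expected_cols = [
    "date", "cloud_cover", "sunshine", "global_radiation", "max_temp",
    "mean_temp", "min_temp", "precipitation", "pressure", "snow_depth",
]
check = pd.read_csv("london_weather.csv", nrows=5)
missing = set(expected_cols) - set(check.columns)
assert not missing, f"Missing columns: {missing}"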
# Run this cell to import the modules you require
import pandas as pd
import numpy as np
import optuna
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
# Read in the data
weather = pd.read_csv("london_weather.csv")
weather.info()

# Column we want to predict
target = "mean_temp"
# Expand date to y, m, d as integers
weather["date"] = weather["date"].astype(str)
weather["year"] = weather["date"].str[:4].astype(int)
weather["month"] = weather["date"].str[4:6].astype(int)
weather["day"] = weather["date"].str[6:].astype(int)
weather = weather.drop(columns="date")
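
As an aside, pandas can parse these integer dates directly; an equivalent sketch (re-reading the raw column, since it has just been dropped from weather):

# Alternative sketch: parse the YYYYMMDD integers with to_datetime
date_raw = pd.read_csv("london_weather.csv", usecols=["date"])["date"]
parsed = pd.to_datetime(date_raw.astype(str), format="%Y%m%d")
year, month, day = parsed.dt.year, parsed.dt.month, parsed.dt.day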
weather.head()

# Alternate forward and backward fills while any NAs remain
# (fillna(method=...) is deprecated, so call ffill()/bfill() directly)
fill_forward = True
while weather.isna().any().any():
    if fill_forward:
        weather = weather.ffill()
    else:
        weather = weather.bfill()
    fill_forward = not fill_forward

# Confirm no missing values remain
weather.isna().mean()
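
Because a forward fill followed by a backward fill clears every NA (the backward fill catches the leading gaps a forward fill cannot reach), the loop above finishes in at most two passes and is equivalent to one chained call. Note that neither version terminates if an entire column is NA.

# Equivalent one-liner: forward fill, then backfill any leading NAs
weather = weather.ffill().bfill()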
# Separate the features and target, split before scaling so the scaler
# only ever sees training data, then standardize both splits
X = weather.drop(columns=target)
y = weather[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
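
An alternative that keeps this guarantee during cross-validation is to bundle the scaler and model into a Pipeline, which refits the scaler inside every fold; a sketch using Ridge purely as an example:

# Optional sketch: a Pipeline refits the scaler per CV fold, so
# preprocessing never sees validation data (Ridge is just an example)
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
pipe = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
cv_rmse = -cross_val_score(pipe, X, y, cv=5, scoring="neg_root_mean_squared_error")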
# Helper to run Optuna for a given model and param space
def optuna_tune(model_name, X_train, y_train, X_test, y_test, n_trials=10):
    def objective(trial):
        if model_name == "lr":
            # Try Ridge and Lasso, tuning the regularization strength alpha
            reg_type = trial.suggest_categorical("reg_type", ["ridge", "lasso"])
            alpha = trial.suggest_float("alpha", 1e-4, 10.0, log=True)
            if reg_type == "ridge":
                model = Ridge(alpha=alpha, random_state=42)
            else:
                model = Lasso(alpha=alpha, random_state=42, max_iter=10000)
        elif model_name == "rf":
            n_estimators = trial.suggest_int("n_estimators", 10, 100)
            max_depth = trial.suggest_int("max_depth", 2, 10)
            min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
            model = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                random_state=42,
                n_jobs=-1,
            )
        elif model_name == "dt":
            max_depth = trial.suggest_int("max_depth", 2, 20)
            min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
            model = DecisionTreeRegressor(
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                random_state=42,
            )
        elif model_name == "xgb":
            n_estimators = trial.suggest_int("n_estimators", 10, 100)
            max_depth = trial.suggest_int("max_depth", 2, 10)
            learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
            subsample = trial.suggest_float("subsample", 0.5, 1.0)
            model = XGBRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=learning_rate,
                subsample=subsample,
                random_state=42,
                n_jobs=-1,
                verbosity=0,
            )
        else:
            raise ValueError(f"Unknown model_name: {model_name}")
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        return mean_squared_error(y_test, preds)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)
    return study
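
One caveat: the objective scores every trial on the test set, so the chosen hyperparameters are implicitly tuned to it. A hedged variant scores trials by cross-validation on the training data instead; a sketch for the ridge case only (cv_objective_example is a hypothetical name):

# Sketch: score trials by CV on training data, keeping the test set unseen
from sklearn.model_selection import cross_val_score

def cv_objective_example(trial):
    alpha = trial.suggest_float("alpha", 1e-4, 10.0, log=True)
    model = Ridge(alpha=alpha, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=5,
                             scoring="neg_mean_squared_error")
    return -scores.mean()  # Optuna still minimizes MSE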
# Run small Optuna studies for each model
studies = {}
for name in ["lr", "rf", "dt", "xgb"]:
    studies[name] = optuna_tune(name, X_train, y_train, X_test, y_test, n_trials=20)
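
Optuna prints a log line per trial, which gets noisy across 80 trials; if you prefer, lower its verbosity before running the loop above:

# Optional: quiet Optuna's per-trial log output (run before the loop above)
optuna.logging.set_verbosity(optuna.logging.WARNING)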
# Show best params and scores
{model: (studies[model].best_params, studies[model].best_value) for model in studies}

# Start MLflow experiment
experiment_name = "weather_regression"
mlflow.set_experiment(experiment_name)
def filter_params(model_class, params):
    """Filter params to only those accepted by the model's __init__."""
    import inspect
    valid_params = inspect.signature(model_class.__init__).parameters
    return {k: v for k, v in params.items() if k in valid_params}
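
For example, the "lr" study's best_params contain the Optuna-only key reg_type, which no scikit-learn estimator accepts, so filter_params drops it:

# Example: "reg_type" is an Optuna-only key, so it is filtered out
filter_params(Ridge, {"reg_type": "ridge", "alpha": 0.5})  # -> {'alpha': 0.5}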
with mlflow.start_run(run_name="LinearRegression") as run_lr:
    # Rebuild the best linear model found by the "lr" study: the study tuned
    # Ridge and Lasso, so reconstruct that model here rather than fitting an
    # untuned LinearRegression and discarding the search results
    best_lr = studies["lr"].best_params
    if best_lr["reg_type"] == "ridge":
        lr_model = Ridge(alpha=best_lr["alpha"], random_state=42)
    else:
        lr_model = Lasso(alpha=best_lr["alpha"], random_state=42, max_iter=10000)
    lr_model.fit(X_train, y_train)
    y_pred_lr = lr_model.predict(X_test)
    # np.sqrt(MSE) instead of squared=False, which newer scikit-learn removes
    rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_params(lr_model.get_params())
    mlflow.log_metric("rmse", rmse_lr)
    mlflow.sklearn.log_model(lr_model, "model")
with mlflow.start_run(run_name="DecisionTreeRegressor") as run_dt:
    dt_params = filter_params(DecisionTreeRegressor, studies["dt"].best_params)
    # random_state=42 matches the setting used during tuning
    dt_model = DecisionTreeRegressor(**dt_params, random_state=42)
    dt_model.fit(X_train, y_train)
    y_pred_dt = dt_model.predict(X_test)
    rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))
    mlflow.log_param("model_type", "DecisionTreeRegressor")
    mlflow.log_params(dt_model.get_params())
    mlflow.log_metric("rmse", rmse_dt)
    mlflow.sklearn.log_model(dt_model, "model")
with mlflow.start_run(run_name="RandomForestRegressor") as run_rf:
    rf_params = filter_params(RandomForestRegressor, studies["rf"].best_params)
    rf_model = RandomForestRegressor(**rf_params, random_state=42, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)
    rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
    mlflow.log_param("model_type", "RandomForestRegressor")
    mlflow.log_params(rf_model.get_params())
    mlflow.log_metric("rmse", rmse_rf)
    mlflow.sklearn.log_model(rf_model, "model")
with mlflow.start_run(run_name="XGBRegressor") as run_xgb:
    xgb_params = filter_params(XGBRegressor, studies["xgb"].best_params)
    xgb_model = XGBRegressor(**xgb_params, random_state=42, n_jobs=-1, verbosity=0)
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_test)
    rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
    mlflow.log_param("model_type", "XGBRegressor")
    mlflow.log_params(xgb_model.get_params())
    mlflow.log_metric("rmse", rmse_xgb)
    mlflow.sklearn.log_model(xgb_model, "model")
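
Optionally, each log_model call can also record an input example so MLflow infers a model signature; input_example is a standard mlflow.sklearn.log_model argument. A sketch with a hypothetical run name, though in practice you would use this form inside the runs above:

# Sketch: attach an input example so MLflow infers the model signature
with mlflow.start_run(run_name="XGBRegressor_with_signature"):
    mlflow.sklearn.log_model(xgb_model, "model", input_example=X_test[:5])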
with mlflow.start_run(run_name="Ensemble") as run_ensemble:
    # Simple ensemble: average the four models' test-set predictions
    y_pred_en = np.mean([y_pred_lr, y_pred_dt, y_pred_rf, y_pred_xgb], axis=0)
    rmse_en = np.sqrt(mean_squared_error(y_test, y_pred_en))
    mlflow.log_param("model_type", "Ensemble")
    mlflow.log_metric("rmse", rmse_en)
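
For reference, this manual prediction averaging is what scikit-learn's VotingRegressor implements, which would let the ensemble be fit and logged like any other estimator; a sketch (note it refits each estimator):

# Sketch: sklearn's VotingRegressor averages predictions the same way
from sklearn.ensemble import VotingRegressor
voter = VotingRegressor([
    ("lr", lr_model), ("dt", dt_model), ("rf", rf_model), ("xgb", xgb_model),
])
voter.fit(X_train, y_train)  # refits clones of each estimator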
# Search all MLflow runs for this experiment
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id
experiment_results = mlflow.search_runs(experiment_ids=[experiment_id])
experiment_results
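
The runs can also be ranked directly by their logged metric; column names follow mlflow.search_runs conventions:

# Rank all runs by RMSE
experiment_results.sort_values("metrics.rmse")[
    ["tags.mlflow.runName", "params.model_type", "metrics.rmse"]
]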
# Normalized RMSE: the ensemble's RMSE as a fraction of the observed
# range of mean daily temperatures
temp_range = weather["mean_temp"].max() - weather["mean_temp"].min()
ensemble_rmse = experiment_results.loc[
    experiment_results["params.model_type"] == "Ensemble", "metrics.rmse"
].mean()
nrmse = ensemble_rmse / temp_range
print(
    "Normalized RMSE for the ensemble of Optuna-tuned models "
    f"(linear, decision tree, random forest, XGBoost): {nrmse:.2%} "
    "of the observed mean daily temperature range"
)