Predicting London Mean Temperature with scikit-learn, Optuna, and MLflow

As the climate changes, predicting the weather becomes ever more important for businesses. Since the weather depends on a lot of different factors, you will want to run a lot of experiments to determine what the best approach is to predict the weather. In this project, you will run experiments for different regression models predicting the mean temperature, using a combination of sklearn and MLflow.

You will be working with data stored in london_weather.csv, which contains the following columns:

  • date - recorded date of measurement - (int)
  • cloud_cover - cloud cover measurement in oktas - (float)
  • sunshine - sunshine measurement in hours (hrs) - (float)
  • global_radiation - irradiance measurement in Watt per square meter (W/m2) - (float)
  • max_temp - maximum temperature recorded in degrees Celsius (°C) - (float)
  • mean_temp - mean temperature in degrees Celsius (°C) - (float)
  • min_temp - minimum temperature recorded in degrees Celsius (°C) - (float)
  • precipitation - precipitation measurement in millimeters (mm) - (float)
  • pressure - pressure measurement in Pascals (Pa) - (float)
  • snow_depth - snow depth measurement in centimeters (cm) - (float)
# Run this cell to import the modules you require
import pandas as pd
import numpy as np

import optuna

import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor


# Read in the data
weather = pd.read_csv("london_weather.csv")
weather.info()
target = "mean_temp"

# Expand the YYYYMMDD integer date into year / month / day integer columns.
# Parsing with an explicit format validates every date (a malformed value
# raises) instead of blindly slicing the string representation.
parsed_dates = pd.to_datetime(weather["date"], format="%Y%m%d")
weather["year"] = parsed_dates.dt.year
weather["month"] = parsed_dates.dt.month
weather["day"] = parsed_dates.dt.day
weather = weather.drop(columns="date")
weather.head()
# Fill missing values: forward-fill propagates the last observation, then a
# single backward-fill handles any NaNs remaining at the start of a column.
# This replaces the original alternating ffill/bfill while-loop, which
# (a) used fillna(method=...), deprecated since pandas 2.1, and
# (b) never terminates if a column is entirely NaN.
weather = weather.ffill().bfill()

weather.isna().mean()
# Fixed code

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import optuna

# Split data correctly: train_test_split returns X_train, X_test, y_train, y_test
X = weather.drop(columns=target)
y = weather[target]

# Split BEFORE scaling: fitting the scaler on the full dataset leaks test-set
# statistics (mean/variance) into training.  Fit on the training split only,
# then apply the same fitted transform to the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Helper to run Optuna for a given model and param space
def optuna_tune(model_name, X_train, y_train, X_test, y_test, n_trials=10):
    """Run an Optuna study for one model family and return the study.

    Trials are scored on a validation split carved out of the training data,
    so the held-out test set is never used for hyperparameter selection
    (the original scored every trial on X_test/y_test, leaking the test set
    into model selection).  X_test/y_test remain in the signature for
    backward compatibility with existing callers.

    Parameters
    ----------
    model_name : one of "lr", "rf", "dt", "xgb"
    X_train, y_train : training features/target (the split is taken here)
    X_test, y_test : unused for scoring; kept for interface compatibility
    n_trials : number of Optuna trials

    Returns
    -------
    optuna.study.Study with best_params / best_value populated.
    """
    # One fixed validation split so every trial is scored on the same data.
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=42
    )

    def objective(trial):
        if model_name == "lr":
            # Try Ridge and Lasso, tune alpha
            reg_type = trial.suggest_categorical("reg_type", ["ridge", "lasso"])
            alpha = trial.suggest_float("alpha", 1e-4, 10.0, log=True)
            if reg_type == "ridge":
                model = Ridge(alpha=alpha, random_state=42)
            else:
                model = Lasso(alpha=alpha, random_state=42, max_iter=10000)
        elif model_name == "rf":
            n_estimators = trial.suggest_int("n_estimators", 10, 100)
            max_depth = trial.suggest_int("max_depth", 2, 10)
            min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
            model = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                random_state=42,
                n_jobs=-1
            )
        elif model_name == "dt":
            max_depth = trial.suggest_int("max_depth", 2, 20)
            min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
            model = DecisionTreeRegressor(
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                random_state=42
            )
        elif model_name == "xgb":
            n_estimators = trial.suggest_int("n_estimators", 10, 100)
            max_depth = trial.suggest_int("max_depth", 2, 10)
            learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
            subsample = trial.suggest_float("subsample", 0.5, 1.0)
            model = XGBRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=learning_rate,
                subsample=subsample,
                random_state=42,
                n_jobs=-1,
                verbosity=0
            )
        else:
            raise ValueError("Unknown model_name")
        # Score on the validation split, never the test set.
        model.fit(X_tr, y_tr)
        preds = model.predict(X_val)
        return mean_squared_error(y_val, preds)

    # A seeded sampler makes the search reproducible run-to-run.
    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials)
    return study

# Run a small Optuna study per model family.
studies = {
    model_name: optuna_tune(model_name, X_train, y_train, X_test, y_test, n_trials=20)
    for model_name in ["lr", "rf", "dt", "xgb"]
}

# Display each study's best hyperparameters and best (lowest) MSE.
{model: (studies[model].best_params, studies[model].best_value) for model in studies}
# Start MLflow experiment
experiment_name = "weather_regression"
# Creates the experiment if it does not already exist and makes it active,
# so subsequent mlflow.start_run() calls log under this experiment.
mlflow.set_experiment(experiment_name)

def filter_params(model_class, params):
    """Return the subset of *params* accepted by ``model_class.__init__``.

    If the constructor declares ``**kwargs`` (as ``XGBRegressor`` does), the
    signature alone cannot reject any keyword, so all params are passed
    through — the original filter silently dropped every tuned value for
    such classes.

    Parameters
    ----------
    model_class : class whose constructor signature is inspected
    params : dict of candidate keyword arguments

    Returns
    -------
    dict containing only keyword arguments the constructor can accept.
    """
    import inspect
    sig_params = inspect.signature(model_class.__init__).parameters
    # A VAR_KEYWORD parameter means the constructor takes arbitrary kwargs;
    # filtering by name would discard legitimate tuned hyperparameters.
    if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in sig_params.values()):
        return dict(params)
    return {k: v for k, v in params.items() if k in sig_params}

with mlflow.start_run(run_name="LinearRegression") as run_lr:
    # The "lr" study tuned regularized linear models (Ridge/Lasso with an
    # alpha), but the original code filtered those params away and fit a
    # plain LinearRegression, discarding the tuning.  Rebuild the actual
    # best model the study found.
    best_lr = studies["lr"].best_params
    if best_lr["reg_type"] == "lasso":
        lr_model = Lasso(alpha=best_lr["alpha"], random_state=42, max_iter=10000)
    else:
        lr_model = Ridge(alpha=best_lr["alpha"], random_state=42)
    lr_model.fit(X_train, y_train)
    y_pred_lr = lr_model.predict(X_test)
    # np.sqrt(MSE): the `squared=False` kwarg was removed in scikit-learn 1.6.
    rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
    # model_type string kept for compatibility with the results search below.
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_params(lr_model.get_params())
    mlflow.log_metric("rmse", rmse_lr)
    mlflow.sklearn.log_model(lr_model, "model")

with mlflow.start_run(run_name="DecisionTreeRegressor") as run_dt:
    dt_params = filter_params(DecisionTreeRegressor, studies["dt"].best_params)
    # random_state matches the value used during tuning, so the logged model
    # reproduces the trial that produced these params (the original omitted it).
    dt_model = DecisionTreeRegressor(**dt_params, random_state=42)
    dt_model.fit(X_train, y_train)
    y_pred_dt = dt_model.predict(X_test)
    # np.sqrt(MSE): the `squared=False` kwarg was removed in scikit-learn 1.6.
    rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))
    mlflow.log_param("model_type", "DecisionTreeRegressor")
    mlflow.log_params(dt_model.get_params())
    mlflow.log_metric("rmse", rmse_dt)
    mlflow.sklearn.log_model(dt_model, "model")

with mlflow.start_run(run_name="RandomForestRegressor") as run_rf:
    rf_params = filter_params(RandomForestRegressor, studies["rf"].best_params)
    # random_state/n_jobs match the settings used during tuning, so the
    # logged model reproduces the winning trial (the original omitted both).
    rf_model = RandomForestRegressor(**rf_params, random_state=42, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)
    # np.sqrt(MSE): the `squared=False` kwarg was removed in scikit-learn 1.6.
    rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
    mlflow.log_param("model_type", "RandomForestRegressor")
    mlflow.log_params(rf_model.get_params())
    mlflow.log_metric("rmse", rmse_rf)
    mlflow.sklearn.log_model(rf_model, "model")

with mlflow.start_run(run_name="XGBRegressor") as run_xgb:
    # XGBRegressor's constructor accepts most hyperparameters via **kwargs,
    # so inspect-based signature filtering silently drops the tuned values.
    # Pass the study's best params directly, plus the fixed settings used
    # during tuning so the logged model reproduces the winning trial.
    xgb_model = XGBRegressor(
        **studies["xgb"].best_params,
        random_state=42,
        n_jobs=-1,
        verbosity=0,
    )
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_test)
    # np.sqrt(MSE): the `squared=False` kwarg was removed in scikit-learn 1.6.
    rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
    mlflow.log_param("model_type", "XGBRegressor")
    mlflow.log_params(xgb_model.get_params())
    mlflow.log_metric("rmse", rmse_xgb)
    mlflow.sklearn.log_model(xgb_model, "model")

with mlflow.start_run(run_name="Ensemble") as run_ensemble:
    # Simple ensemble: element-wise mean of the four models' test predictions.
    y_pred_en = np.mean([y_pred_lr, y_pred_dt, y_pred_rf, y_pred_xgb], axis=0)
    # np.sqrt(MSE): the `squared=False` kwarg was removed in scikit-learn 1.6.
    rmse_en = np.sqrt(mean_squared_error(y_test, y_pred_en))
    mlflow.log_param("model_type", "Ensemble")
    mlflow.log_metric("rmse", rmse_en)

# Search all MLflow runs for this experiment
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id
# mlflow.search_runs expects a *list* of experiment ids, not a bare id.
experiment_results = mlflow.search_runs(experiment_ids=[experiment_id])
experiment_results

# Normalized RMSE: the ensemble's RMSE expressed as a fraction of the full
# observed range of the target, so it is comparable across datasets.
# .mean() averages over all "Ensemble" runs if the experiment was re-run.
ensemble_rmse = experiment_results[experiment_results["params.model_type"] == "Ensemble"]["metrics.rmse"].mean()
temp_range = weather[target].max() - weather[target].min()
nrmse = ensemble_rmse / temp_range

print(f"Normalized RMSE for an ensemble of optuna-optimized models (Linear Regression, Random Forest, Decision Tree, eXtreme Gradient Boosting XGB); root mean squared error of the predicted mean daily temperature as a percentage of the range of observed mean daily temperatures: {nrmse:.2%}")