Project: Predicting Temperature in London

As the climate changes, predicting the weather becomes ever more important for businesses. Since the weather depends on a lot of different factors, you will want to run a lot of experiments to determine what the best approach is to predict the weather. In this project, you will run experiments for different regression models predicting the mean temperature, using a combination of sklearn and MLflow.

You will be working with data stored in london_weather.csv, which contains the following columns:

date - recorded date of measurement - (int)
cloud_cover - cloud cover measurement in oktas - (float)
sunshine - sunshine measurement in hours (hrs) - (float)
global_radiation - irradiance measurement in watts per square meter (W/m²) - (float)
max_temp - maximum temperature recorded in degrees Celsius (°C) - (float)
mean_temp - mean temperature in degrees Celsius (°C) - (float)
min_temp - minimum temperature recorded in degrees Celsius (°C) - (float)
precipitation - precipitation measurement in millimeters (mm) - (float)
pressure - pressure measurement in Pascals (Pa) - (float)
snow_depth - snow depth measurement in centimeters (cm) - (float)

# Run this cell to import the modules you require
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Load the London weather measurements into a DataFrame.
weather = pd.read_csv("london_weather.csv")

# Preview the first rows before running any checks.
display(weather.head())

# Basic data-quality report: each entry pairs the heading shown to the
# reader with the summary displayed right after it (column dtypes,
# per-column null counts, number of fully duplicated rows, and shape).
sanity_checks = (
    ("Feature type checking !", weather.dtypes),
    ("None value checking!", weather.isnull().sum()),
    ("Dupplication checking !", weather.duplicated().sum()),
    ("Dataset shape", weather.shape),
)

for heading, report in sanity_checks:
    display(heading)
    display(report)

# Dataset exploration: one histogram (with KDE overlay) per numeric column.

display("Dataset exploration")

# Drop the "date" column before plotting; the membership test keeps the
# drop from failing if the column is not present in the DataFrame.
cols_to_drop = [name for name in ("date",) if name in weather.columns]
df = weather.drop(columns=cols_to_drop)

colonnes_numeriques = df.select_dtypes(include=['int64', 'float64']).columns
n_panels = len(colonnes_numeriques)

# A single tall figure with one stacked panel per numeric column
# (4 inches of height per panel).
fig = plt.figure(figsize=(15, n_panels * 4))

for position, column_name in enumerate(colonnes_numeriques, start=1):
    ax = fig.add_subplot(n_panels, 1, position)
    sns.histplot(df[column_name], kde=True, bins=30, color='skyblue', ax=ax)
    ax.set_title(f"Distribution de la colonne : {column_name}")
    ax.set_xlabel(column_name)
    ax.set_ylabel("Fréquence")

fig.tight_layout()
plt.show()


# Pairwise correlation between the numeric columns of the raw dataset.
corr = weather.corr(numeric_only=True)

# Draw the annotated heatmap of the correlation matrix.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Matrix correlation of features")
plt.show()

# Feature sets chosen per model after reading the heatmap. Rationale:
#   - LinearRegression is sensitive to correlated features.
#   - DecisionTreeRegressor and RandomForestRegressor are not affected by
#     correlated features.
#   - RandomForestRegressor is also robust to irrelevant features, so it
#     keeps every feature except the 'date' column.
feature_selection_summary = (
    "Basing on correlation matrix, the features for each models are: \n"
    " LinearRegression: min_temp, global_radiation \n"
    " DecisionTreeRegressor: min_temp, max_temp, global_radiation, sunshine \n"
    " RandomForestRegressor: min_temp, max_temp, global_radiation, sunshine, cloud_cover, precipitation, pressure, snow_depth"
)
display(feature_selection_summary)



# Enable MLflow autologging for the scikit-learn flavor so estimator fits
# performed inside a run are tracked in addition to the explicit log calls
# below.
mlflow.sklearn.autolog()

# To execute this as a local MLflow Project instead, one could call:
#   mlflow.projects.run(uri='./', entry_point='main',
#                       experiment_name='london mean temperature model')

# =========================
# Linear regression: features, target and preprocessing
# =========================

# Rows with no recorded mean_temp are dropped instead of being filled in:
# imputing the *target* with the dataset-wide mean fabricates labels and
# leaks statistics of the future test rows into training.
lr_features = ["min_temp", "global_radiation"]
lr_labelled = weather.dropna(subset=["mean_temp"])
lr_X = lr_labelled[lr_features]
lr_y = lr_labelled["mean_temp"].values

# Feature imputation lives inside the pipeline, so its statistics are
# learned from the training split only: mean for min_temp, median for
# global_radiation. Both selected features are covered, so
# remainder="passthrough" has nothing left to forward here.
lr_preprocessing = ColumnTransformer([
    ("mean_imputer", SimpleImputer(strategy="mean"), ["min_temp"]),
    ("median_imputer", SimpleImputer(strategy="median"), ["global_radiation"])
], remainder="passthrough")

# Impute -> standardize -> fit ordinary least squares.
lr_steps = [
    ("imputation", lr_preprocessing),
    ("scaler", StandardScaler()),
    ("linear_regression", LinearRegression()),
]
lr_pipeline = Pipeline(lr_steps)

display(weather.columns)

# 70/30 split with a fixed seed so the run is repeatable.
lr_X_train, lr_X_test, lr_y_train, lr_y_test = train_test_split(
    lr_X, lr_y, test_size=0.3, random_state=42
)

# =========================
# 4. Define the MLflow experiment
# =========================

mlflow.set_experiment("weather_mean_temp_regression")

with mlflow.start_run(run_name="linear_regression_min_globalRad"):
    # ----- Training -----
    lr_pipeline.fit(lr_X_train, lr_y_train)

    # ----- Prediction & RMSE -----
    lr_y_pred = lr_pipeline.predict(lr_X_test)
    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, whereas this form works on every version.
    lr_rmse = float(np.sqrt(mean_squared_error(lr_y_test, lr_y_pred)))
    print("Score RMSE pour le modèle LinearRegression :", lr_rmse)

    # ----- Log of hyperparameters -----
    lr_model = lr_pipeline.named_steps["linear_regression"]
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_params(lr_model.get_params())

    # ----- Log of metrics -----
    mlflow.log_metric("test_rmse", lr_rmse)

    # ----- Log of the fitted pipeline (preprocessing + model) -----
    mlflow.sklearn.log_model(lr_pipeline, artifact_path="model")

# =========================
# 5. Retrieve all MLflow runs
# =========================

experiment_results = mlflow.search_runs()
# Wrapped in display() so the preview is actually rendered; a bare
# `.head()` expression only shows up when it is the last line of a cell.
display(experiment_results.head())


# Candidate inputs for the random forest: every measurement column except
# the target (mean_temp) and 'date'.
all_possible_features = [
    "min_temp",
    "max_temp",
    "global_radiation",
    "sunshine",
    "cloud_cover",
    "precipitation",
    "pressure",
    "snow_depth",
]

# Keep only the candidates actually present in the DataFrame, preserving
# the order in which they are listed above.
available_columns = set(weather.columns)
rfr_features = [name for name in all_possible_features if name in available_columns]

# Decision tree: default imputation -> standardization -> regressor.
dtr_steps = [
    ("imputation", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("Decision_Tree_Regressor", DecisionTreeRegressor()),
]
dtr_pipeline = Pipeline(steps=dtr_steps)

# Random forest: same preprocessing chain with a forest as final estimator.
rfr_steps = [
    ("imputation", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("Random_Forest_Regressor", RandomForestRegressor()),
]
rfr_pipeline = Pipeline(steps=rfr_steps)