Predicting Temperature in London

As the climate changes, predicting the weather becomes ever more important for businesses. Since the weather depends on a lot of different factors, you will want to run a lot of experiments to determine what the best approach is to predict the weather. In this project, you will run experiments for different regression models predicting the mean temperature, using a combination of sklearn and MLflow.

You will be working with data stored in london_weather.csv, which contains the following columns:

  • date - recorded date of measurement - (int)
  • cloud_cover - cloud cover measurement in oktas - (float)
  • sunshine - sunshine measurement in hours (hrs) - (float)
  • global_radiation - irradiance measurement in Watt per square meter (W/m2) - (float)
  • max_temp - maximum temperature recorded in degrees Celsius (°C) - (float)
  • mean_temp - mean temperature in degrees Celsius (°C) - (float)
  • min_temp - minimum temperature recorded in degrees Celsius (°C) - (float)
  • precipitation - precipitation measurement in millimeters (mm) - (float)
  • pressure - pressure measurement in Pascals (Pa) - (float)
  • snow_depth - snow depth measurement in centimeters (cm) - (float)
The SQL query below runs against london_weather.csv; its result is made available as a DataFrame in the `df` variable.
SELECT AVG(mean_temp) AS avg_temp 
FROM 'london_weather.csv'
WHERE mean_temp IS NOT NULL;
# Run this cell to import the modules you require
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Load the raw measurements from disk.
weather = pd.read_csv("london_weather.csv")

# Start coding here
# Use as many cells as you like
# Understand the data: column dtypes and missing-value counts.
weather.info()

# Data cleaning: the date column is stored as an integer in YYYYMMDD form,
# so parse it into a proper datetime.
weather["date"] = pd.to_datetime(weather["date"], format="%Y%m%d")

# Derive calendar features for later plotting and seasonality modelling.
for calendar_part in ("year", "month"):
    weather[calendar_part] = getattr(weather["date"].dt, calendar_part)

weather.head()
# Visualize how the mean temperature evolves across the years.
fig = plt.figure(figsize=(10, 5))
sns.lineplot(data=weather, x="year", y="mean_temp")
plt.title("Média de Temperatura por Ano")
plt.show()
# Correlation analysis between the numeric measurements.
# numeric_only=True is required: the "date" column was parsed to datetime
# above, and pandas >= 2.0 no longer drops non-numeric columns silently in
# DataFrame.corr() — without the flag this line raises on the datetime column.
corr = weather.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap="coolwarm")
plt.title("Matriz de Correlação")
plt.show()
# Predictors chosen (from the correlation analysis) to model mean temperature.
feature_selection = ["global_radiation", "sunshine", "cloud_cover", "month", "max_temp"]

# Discard rows where the target itself is missing — imputing the label
# would contaminate the evaluation.
df_clean = weather.dropna(subset=["mean_temp"])

# Feature matrix and target vector.
X = df_clean[feature_selection]
y = df_clean["mean_temp"]

print(X.shape, y.shape)
# Preprocess data: hold-out split, then mean imputation and standardization.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fill gaps in the predictors with the per-column mean; the imputer is
# fitted on the training fold only, so no test information leaks in.
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Rescale to zero mean / unit variance — required for the gradient-based
# and linear models trained below.
scaler = StandardScaler()
X_train_processed = scaler.fit_transform(X_train_imputed)
X_test_processed = scaler.transform(X_test_imputed)

print("Shape do X_train após transformação:", X_train_processed.shape)
print("Shape do X_test após transformação:", X_test_processed.shape)
# Training and Evaluation

mlflow.set_experiment("london_weather_experiment")

# Models trained and logged below:
# 1) Linear regression (no depth-style hyperparameters)
# 2) Decision trees at several max_depth values
# 3) Random forests at several max_depth values

# Linear-regression baseline.

with mlflow.start_run(run_name="Linear_Regression"):
    lr_model = LinearRegression().fit(X_train_processed, y_train)

    # Hold-out RMSE on the test fold.
    lr_predictions = lr_model.predict(X_test_processed)
    rmse = np.sqrt(mean_squared_error(y_test, lr_predictions))

    # Plain least-squares regression has no tunable hyperparameters worth
    # logging, so only tag the model type.
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted estimator as a run artifact.
    mlflow.sklearn.log_model(lr_model, artifact_path="model")
    
# Decision trees swept across several depths — one MLflow run per depth.
for tree_depth in (1, 2, 10, 20):
    with mlflow.start_run(run_name=f"DecisionTree_depth_{tree_depth}"):
        dt_model = DecisionTreeRegressor(max_depth=tree_depth, random_state=42)
        dt_model.fit(X_train_processed, y_train)

        # Hold-out RMSE on the test fold.
        dt_predictions = dt_model.predict(X_test_processed)
        rmse = np.sqrt(mean_squared_error(y_test, dt_predictions))

        # Record hyperparameters and the evaluation metric.
        mlflow.log_params({"model_type": "DecisionTreeRegressor", "max_depth": tree_depth})
        mlflow.log_metric("rmse", rmse)

        # Persist the fitted estimator as a run artifact.
        mlflow.sklearn.log_model(dt_model, artifact_path="model")
        
# Random forests swept across several depths — one MLflow run per depth.
for forest_depth in (5, 10, 15):
    with mlflow.start_run(run_name=f"RandomForest_depth_{forest_depth}"):
        rf_model = RandomForestRegressor(
            max_depth=forest_depth,
            n_estimators=100,
            random_state=42,
        )
        rf_model.fit(X_train_processed, y_train)

        # Hold-out RMSE on the test fold.
        rf_predictions = rf_model.predict(X_test_processed)
        rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))

        # Record hyperparameters and the evaluation metric.
        mlflow.log_params(
            {"model_type": "RandomForestRegressor", "max_depth": forest_depth, "n_estimators": 100}
        )
        mlflow.log_metric("rmse", rmse)

        # Persist the fitted estimator as a run artifact.
        mlflow.sklearn.log_model(rf_model, artifact_path="model")

# Linear regression fitted by stochastic gradient descent, for comparison
# with the closed-form LinearRegression baseline.
with mlflow.start_run(run_name="GradientDescent_Regression"):
    sgd_model = SGDRegressor(random_state=42, max_iter=1000, tol=1e-3)
    sgd_model.fit(X_train_processed, y_train)

    # Hold-out RMSE on the test fold.
    sgd_predictions = sgd_model.predict(X_test_processed)
    rmse = np.sqrt(mean_squared_error(y_test, sgd_predictions))

    # Record hyperparameters, the metric, and the fitted model.
    mlflow.log_params({"model_type": "SGDRegressor", "max_iter": 1000, "tol": 1e-3})
    mlflow.log_metric("rmse", rmse)
    mlflow.sklearn.log_model(sgd_model, artifact_path="model")

# Query every run logged under the current experiment so the models can be
# compared side by side.
experiment_results = mlflow.search_runs()
experiment_results.head()