Project — DataLab

As the climate changes, predicting the weather becomes ever more important for businesses. Because the weather depends on many different factors, you will want to run many experiments to determine the best approach to predicting it. In this project, you will run experiments with different regression models that predict the mean temperature, using a combination of sklearn and MLflow.

You will be working with data stored in london_weather.csv, which contains the following columns:

date - recorded date of measurement - (int)
cloud_cover - cloud cover measurement in oktas - (float)
sunshine - sunshine measurement in hours (hrs) - (float)
global_radiation - irradiance measurement in Watt per square meter (W/m2) - (float)
max_temp - maximum temperature recorded in degrees Celsius (°C) - (float)
mean_temp - mean temperature in degrees Celsius (°C) - (float)
min_temp - minimum temperature recorded in degrees Celsius (°C) - (float)
precipitation - precipitation measurement in millimeters (mm) - (float)
pressure - pressure measurement in Pascals (Pa) - (float)
snow_depth - snow depth measurement in centimeters (cm) - (float)

import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime
import matplotlib.pyplot as plt

# Load data and perform exploratory analysis
weather = pd.read_csv('london_weather.csv')
weather.head(5)

# Parse the YYYYMMDD date values so that year and month can be derived from them
weather['date'] = pd.to_datetime(weather['date'], format='%Y%m%d')
weather['year'] = weather['date'].dt.year
weather['month'] = weather['date'].dt.month

# Average every measurement per (year, month) to smooth out daily noise
weather_metrics = ['cloud_cover', 'sunshine', 'global_radiation', 'max_temp', 'mean_temp', 'min_temp', 'precipitation', 'pressure', 'snow_depth']
weather_per_month = weather.groupby(['year', 'month'], as_index=False)[weather_metrics].mean()

# Mean temperature over the years
# NOTE(review): seaborn >= 0.12 prefers errorbar=None over ci=None - confirm installed version
sns.lineplot(x="year", y="mean_temp", data=weather_per_month, ci=None)
plt.show()

# Precipitation per calendar month, computed from the daily rows
sns.barplot(x='month', y='precipitation', data=weather)
plt.show()

# Correlation heatmap. Only numeric columns are correlated: 'date' is now
# datetime64, and pandas >= 2.0 no longer drops non-numeric columns from
# corr() by default, so they are filtered out explicitly here.
plt.figure(figsize=(12, 10))
sns.heatmap(weather.select_dtypes(include='number').corr(), annot=True)
plt.show()


# Choose features, define the target, and drop rows whose target is missing
feature_selection = ['month', 'cloud_cover', 'sunshine', 'precipitation', 'pressure', 'global_radiation']
target_var = 'mean_temp'
# Filter on target_var (not a repeated literal) so the dropped column always
# matches the modelled target; missing feature values are imputed later.
weather = weather.dropna(subset=[target_var])

def preprocess_df(df, feature_selection, target_var, *, test_size=0.2, random_state=42):
    """
    Split a dataframe into train/test features and targets, then impute and scale the features.

    The imputer and the scaler are fitted on the training features only and
    then applied to the test features, so no test-set statistics leak into
    the preprocessing.

    Args:
        df: Dataframe containing the feature columns and the target column.
        feature_selection: List of column names used as model features.
        target_var: Name of the column to predict.
        test_size: Fraction of rows held out for testing (passed to
            train_test_split). Defaults to 0.2.
        random_state: Seed for the train/test shuffle (passed to
            train_test_split). Defaults to 42.

    Returns:
        Tuple (X_train, X_test, y_train, y_test) where the feature sets have
        been mean-imputed and standardized, and the targets are returned as
        produced by train_test_split.
    """
    X = df[feature_selection]
    y = df[target_var]

    # Split first so that every fitted transformer only ever sees training data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fill missing feature values with the per-column mean of the training split
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Standardize features using the training split's mean and variance
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test
# Build the train/test sets used by every model in the experiment below
X_train, X_test, y_train, y_test = preprocess_df(
    df=weather,
    feature_selection=feature_selection,
    target_var=target_var,
)


def predict_and_evaluate(model, x_test, y_test):
    """
    Score a fitted model on the test set and return its root mean squared error.

    Args:
        model: Fitted estimator exposing a ``predict`` method.
        x_test: Test features passed to ``model.predict``.
        y_test: True target values matching ``x_test``.

    Returns:
        The square root of the mean squared error between ``y_test`` and the
        model's predictions.
    """
    predictions = model.predict(x_test)
    mse = mean_squared_error(y_test, predictions)
    return np.sqrt(mse)

EXPERIMENT_NAME = "mohammadEx"

# Reuse the experiment if it already exists, otherwise create it.
# The lookup is done once and its result reused for both the check and the id.
existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if existing_experiment is None:
    EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    EXPERIMENT_ID = existing_experiment.experiment_id

# Tree depths to sweep; one MLflow run is recorded per depth
max_depth_parameters = [1, 2, 5, 10, 20]

# The linear baseline does not depend on max_depth, so it is fitted and scored
# once here and then logged in every run for side-by-side comparison.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg_rmse = predict_and_evaluate(lin_reg, X_test, y_test)

for idx, depth in enumerate(max_depth_parameters):
    # Hyperparameters shared by both tree-based models of this run
    parameters = {
        'max_depth': depth,
        'random_state': 42,
    }
    RUN_NAME = f"run_{idx}"
    with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=RUN_NAME):
        # Create the depth-dependent models
        tree_reg = DecisionTreeRegressor(**parameters).fit(X_train, y_train)
        forest_reg = RandomForestRegressor(**parameters).fit(X_train, y_train)
        # Log models
        mlflow.sklearn.log_model(lin_reg, "lin_reg")
        mlflow.sklearn.log_model(tree_reg, "tree_reg")
        mlflow.sklearn.log_model(forest_reg, "forest_reg")
        # Evaluate performance
        tree_reg_rmse = predict_and_evaluate(tree_reg, X_test, y_test)
        forest_reg_rmse = predict_and_evaluate(forest_reg, X_test, y_test)
        # Log the exact hyperparameters used, then the test RMSE of each model
        mlflow.log_params(parameters)
        mlflow.log_metric("rmse_lr", lin_reg_rmse)
        mlflow.log_metric("rmse_tr", tree_reg_rmse)
        mlflow.log_metric("rmse_fr", forest_reg_rmse)

# Fetch every run recorded under this experiment and display the result
experiment_results = mlflow.search_runs(
    experiment_names=[EXPERIMENT_NAME],
)
experiment_results