Project: Predicting Temperature in London

As the climate changes, predicting the weather becomes ever more important for businesses. You have been asked to support a machine learning project that aims to build a pipeline to predict the climate in London, England. Specifically, the model should predict the mean temperature in degrees Celsius (°C).

Since the weather depends on a lot of different factors, you will want to run a lot of experiments to determine what the best approach is to predict the weather. In this project, you will run experiments for different regression models predicting the mean temperature, using a combination of sklearn and mlflow.

You will be working with data stored in london_weather.csv, which contains the following columns:

date - recorded date of measurement - (int)
cloud_cover - cloud cover measurement in oktas - (float)
sunshine - sunshine measurement in hours (hrs) - (float)
global_radiation - irradiance measurement in watts per square meter (W/m²) - (float)
max_temp - maximum temperature recorded in degrees Celsius (°C) - (float)
mean_temp - target mean temperature in degrees Celsius (°C) - (float)
min_temp - minimum temperature recorded in degrees Celsius (°C) - (float)
precipitation - precipitation measurement in millimeters (mm) - (float)
pressure - pressure measurement in Pascals (Pa) - (float)
snow_depth - snow depth measurement in centimeters (cm) - (float)

# Run this cell to install mlflow.
# The leading "!" is a notebook shell escape, so this line only works inside
# an IPython/Jupyter kernel and is not valid in a plain Python script.
!pip install mlflow

# Run this cell to import the modules you require
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Read in the data
weather = pd.read_csv("london_weather.csv")

# Start coding here
# Convert the integer date column into real datetimes.
# An explicit format is required: without it pandas interprets integers as
# nanoseconds since the Unix epoch, which would silently place every row in
# January 1970.
# NOTE(review): assumes dates are stored as YYYYMMDD integers (e.g. 19790101);
# if the CSV uses another layout this raises instead of producing bad dates.
weather['date'] = pd.to_datetime(weather['date'].astype(str), format='%Y%m%d')

# check for missing values (printed so the result is visible mid-cell)
print(weather.isnull().sum().sort_values())

# Drop rows that are missing the target or one of the rarely-missing features.
# The target (mean_temp) is dropped rather than imputed: training and scoring
# on fabricated labels would distort both the fit and the reported RMSE.
weather = weather.dropna(
    subset=['mean_temp', 'min_temp', 'pressure', 'max_temp', 'precipitation']
)

# Fill the remaining feature gaps with the KNNImputer, using all feature
# columns together. Fitting the imputer on one column at a time gives it
# nothing to measure neighbour distance with, so it degenerates to a plain
# column-mean fill.
feature_cols = [col for col in weather.columns if col not in ('date', 'mean_temp')]

# Standardise first so that large-magnitude columns (e.g. pressure in Pa) do
# not dominate the neighbour distances; StandardScaler ignores NaNs when
# fitting and keeps them in place when transforming.
scaler = StandardScaler()
scaled = scaler.fit_transform(weather[feature_cols])

imputer = KNNImputer(n_neighbors=2, weights='uniform')
imputed = pd.DataFrame(
    scaler.inverse_transform(imputer.fit_transform(scaled)),
    columns=feature_cols,
    index=weather.index,
)

# Only fill the gaps, so observed values are not touched by the scaling
# round-trip.
# NOTE: imputing before the train/test split lets test rows influence the
# filled values; move the imputer into the model pipeline if strict hold-out
# isolation is needed.
weather[feature_cols] = weather[feature_cols].fillna(imputed)
print(weather.isnull().sum().sort_values())

# Target vector: the mean temperature the models are trained to predict
y = weather['mean_temp'].values
# Feature matrix: every column except the target itself and the date
X = weather.drop(columns=['mean_temp', 'date']).values

# start and set experiment for Linear Regression
# set_experiment creates the experiment when it does not exist yet, so no
# separate existence check / create_experiment call is needed.
experiment_name = 'Linear-Regression-Experiment'
mlflow.set_experiment(experiment_name)

# Ensure any active run is ended before starting a new one
if mlflow.active_run():
    mlflow.end_run()

# split the data: 20% hold-out with a fixed seed; the resulting
# X_train/X_test/y_train/y_test are reused by the later experiments
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# start auto-logging (parameters and fitted models are logged to the active run)
mlflow.sklearn.autolog()

# Run inside a context manager so the run is always closed, even if fitting
# or scoring raises; a bare start_run()/end_run() pair would leave the run
# active after an exception.
with mlflow.start_run():
    # scale the features, then fit a linear regression, in a single pipeline
    steps_1 = [('scaler', StandardScaler()), ('linear regression', LinearRegression())]
    pipeline = Pipeline(steps_1)
    pipeline.fit(X_train, y_train)

    # get the prediction of the linear regression model on the hold-out set
    y_pred = pipeline.predict(X_test)

    # calculate the root mean squared error
    rmse_li = np.sqrt(MSE(y_test, y_pred))

    # log the rmse metric
    mlflow.log_metric('rmse', rmse_li)

print(rmse_li)



# start a new experiment for the decision tree regressor
# set_experiment creates the experiment when it does not exist yet, so no
# separate existence check / create_experiment call is needed.
experiment_name = 'Decision-Tree-Regressor-Experiment'
mlflow.set_experiment(experiment_name)

# Ensure any active run is ended before starting a new one
if mlflow.active_run():
    mlflow.end_run()

# start auto-logging (parameters and fitted models are logged to the active run)
mlflow.sklearn.autolog()

# Run inside a context manager so the run is always closed, even if fitting
# or scoring raises.
with mlflow.start_run():
    # fit a shallow tree; min_samples_leaf=0.1 requires each leaf to hold at
    # least 10% of the training samples
    dt = DecisionTreeRegressor(max_depth=2, min_samples_leaf=0.1, random_state=1, criterion='squared_error')
    dt.fit(X_train, y_train)

    # get the model's prediction on the hold-out set
    y_pred1 = dt.predict(X_test)

    # get the model's rmse
    rmse_dt = np.sqrt(MSE(y_test, y_pred1))

    # log the model's rmse
    mlflow.log_metric('rmse', rmse_dt)

print(rmse_dt)

# start a new experiment for the random forest regressor
# set_experiment creates the experiment when it does not exist yet, so no
# separate existence check / create_experiment call is needed.
experiment_name = 'Random-Forest-Regressor-Experiment'
mlflow.set_experiment(experiment_name)

# Ensure any active run is ended before starting a new one
if mlflow.active_run():
    mlflow.end_run()

# start auto-logging (parameters and fitted models are logged to the active run)
mlflow.sklearn.autolog()

# Run inside a context manager so the run is always closed, even if fitting
# or scoring raises.
with mlflow.start_run():
    # fit a 400-tree forest; min_samples_leaf=0.12 requires each leaf to hold
    # at least 12% of the training samples
    rf = RandomForestRegressor(n_estimators=400, min_samples_leaf=0.12, random_state=1)
    rf.fit(X_train, y_train)

    # get the model's prediction on the hold-out set
    y_pred_rf = rf.predict(X_test)

    # get the rmse of the model and log it
    rmse_rf = np.sqrt(MSE(y_test, y_pred_rf))
    mlflow.log_metric('rmse', rmse_rf)

print(rmse_rf)

# get all mlflow runs and store their metrics in a variable called
# experiment_results (one row per run, one column per logged metric)
from mlflow.tracking import MlflowClient
client = MlflowClient()

# Get all experiments
experiments = client.search_experiments()

# Walk every run of every experiment, keeping the metrics together with the
# experiment name and run id. Without those labels the rows cannot be matched
# back to the model that produced them.
run_keys = []
run_metrics = []
for experiment in experiments:
    for run in client.search_runs(experiment_ids=[experiment.experiment_id]):
        run_keys.append((experiment.name, run.info.run_id))
        run_metrics.append(run.data.metrics)

# Convert to DataFrame. The metric columns are unchanged; the identifying
# labels live in the index so existing column-based access keeps working.
experiment_results = pd.DataFrame(
    run_metrics,
    index=pd.MultiIndex.from_tuples(run_keys, names=['experiment', 'run_id']),
)

experiment_results