Project: Predicting Temperature in London
As the climate changes, predicting the weather becomes ever more important for businesses. You have been asked to support a machine learning project that aims to build a pipeline predicting the climate in London, England. Specifically, the model should predict the mean temperature in degrees Celsius (°C).
Since the weather depends on many different factors, you will want to run a number of experiments to determine the best approach to predicting it. In this project, you will run experiments for several regression models that predict the mean temperature, using a combination of sklearn and MLflow.
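As a minimal sketch of that workflow (assuming an already-fitted scikit-learn estimator `model` and held-out arrays `X_test`, `y_test`, which are placeholders here), each experiment logs its settings and an RMSE metric to MLflow roughly like this:

import mlflow
from sklearn.metrics import mean_squared_error

mlflow.set_experiment("example_experiment")  # placeholder experiment name
with mlflow.start_run(run_name="example_run"):
    # Root mean squared error on the held-out test set
    rmse = mean_squared_error(y_test, model.predict(X_test), squared=False)
    mlflow.log_param("model_type", type(model).__name__)
    mlflow.log_metric("rmse", rmse)

The full version of this pattern, looping over several models and hyperparameter values, appears at the end of the notebook.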
You will be working with data stored in london_weather.csv, which contains the following columns (a minimal loading check follows the list):
- date - recorded date of measurement - (int)
- cloud_cover - cloud cover measurement in oktas - (float)
- sunshine - sunshine measurement in hours (hrs) - (float)
- global_radiation - irradiance measurement in Watt per square meter (W/m2) - (float)
- max_temp - maximum temperature recorded in degrees Celsius (°C) - (float)
- mean_temp - target mean temperature in degrees Celsius (°C) - (float)
- min_temp - minimum temperature recorded in degrees Celsius (°C) - (float)
- precipitation - precipitation measurement in millimeters (mm) - (float)
- pressure - pressure measurement in Pascals (Pa) - (float)
- snow_depth - snow depth measurement in centimeters (cm) - (float)
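Before modelling, it is worth confirming that the file matches this data dictionary. A minimal sketch, assuming london_weather.csv sits in the working directory as in the cells below:

import pandas as pd

df = pd.read_csv("london_weather.csv")
print(df.dtypes)          # date is stored as an integer in YYYYMMDD form
print(df.isnull().sum())  # several columns contain missing values, handled later with dropna and an imputer
print(df["mean_temp"].describe())  # the target, in degrees Celsius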
# Run this cell to install mlflow
!pip install mlflow
# Run this cell to import the modules you require
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# Read in the data
weather = pd.read_csv("london_weather.csv")
# Start coding here
# Use as many cells as you like
weather.info()
weather['date'] = pd.to_datetime(weather['date'], format="%Y%m%d")
weather['month'] = weather['date'].dt.month
weather['year'] = weather['date'].dt.year
weather.info()
weather.head()Hidden output
sns.lineplot(x='year', y='mean_temp', data=weather)
plt.show()
# Correlation between the numeric columns (the datetime 'date' column is excluded)
correlation_matrix = weather.corr(numeric_only=True)
sns.heatmap(correlation_matrix, cmap='coolwarm')
plt.show()
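By default, sns.lineplot aggregates the daily values at each year and bootstraps a confidence band, which can be slow on thousands of rows. An explicit yearly aggregation is a lightweight alternative and also gives the trend as a plain table; a minimal sketch reusing the weather DataFrame and the 'year' column added above:

# Average mean_temp per year to smooth out daily variation
yearly_mean = weather.groupby('year', as_index=False)['mean_temp'].mean()
sns.lineplot(x='year', y='mean_temp', data=yearly_mean)
plt.show()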
# Select candidate features plus the target (mean_temp) so rows can be filtered together
feature_selection = ['min_temp', 'max_temp', 'sunshine', 'global_radiation', 'mean_temp']
X = weather[feature_selection].copy()
print(X.isnull().sum())
print(X.shape)
# Only drop rows where the target 'mean_temp' is missing
X = X.dropna(subset=['mean_temp'])
# global_radiation still has nulls; those will be handled with an imputer below
print(X[X.isnull().any(axis=1)])
print(X.isnull().sum())
print(X.shape)
# Separate the target (last column, mean_temp) from the features
y = X.iloc[:, -1]
X = X.iloc[:, :-1]

# Impute missing values in the remaining feature columns (e.g. global_radiation)
# Using the mean strategy for now; most-frequent or a KNN imputer are alternatives
# (see the sketch after this cell), and MissingIndicator could flag which values were imputed
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

# Split the data, then scale the features (fit the scaler on the training set only)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.7)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
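The comment above mentions a KNN imputer as an alternative to the mean strategy. A minimal sketch of what that swap might look like, applied to the raw feature columns rather than the already-imputed array (KNNImputer with n_neighbors=5 is an illustrative assumption, not part of the pipeline above):

from sklearn.impute import KNNImputer

# Fill each missing value from the 5 most similar rows instead of the column mean
X_raw = weather[['min_temp', 'max_temp', 'sunshine', 'global_radiation']].copy()
knn_imputer = KNNImputer(n_neighbors=5)
X_knn = knn_imputer.fit_transform(X_raw)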
# Set the experiment; each run trains all three models for one hyperparameter value
mlflow.set_experiment("PredictTemperature_Experiment")

for idx, depth in enumerate([1, 5, 10, 20]):
    run_name = f"run_{idx}"
    with mlflow.start_run(run_name=run_name):
        mlflow.log_param("depth", depth)

        # Linear regression baseline (has no depth hyperparameter)
        lr = LinearRegression()
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)
        lr_rmse = mean_squared_error(y_true=y_test, y_pred=y_pred, squared=False)
        mlflow.sklearn.log_model(lr, "LinearRegression")
        mlflow.log_metric("rmse_lr", lr_rmse)

        # Decision tree regressor with max_depth = depth
        dtr = DecisionTreeRegressor(random_state=42, max_depth=depth)
        dtr.fit(X_train, y_train)
        y_pred = dtr.predict(X_test)
        dtr_rmse = mean_squared_error(y_true=y_test, y_pred=y_pred, squared=False)
        mlflow.sklearn.log_model(dtr, "DecisionTreeRegressor")
        mlflow.log_metric("rmse_dtr", dtr_rmse)

        # Random forest regressor with n_estimators = depth
        rfr = RandomForestRegressor(random_state=42, n_estimators=depth)
        rfr.fit(X_train, y_train)
        y_pred = rfr.predict(X_test)
        rfr_rmse = mean_squared_error(y_true=y_test, y_pred=y_pred, squared=False)
        mlflow.sklearn.log_model(rfr, "RandomForestRegressor")
        mlflow.log_metric("rmse_rfr", rfr_rmse)
    # The `with` block ends each run automatically, so no explicit mlflow.end_run() is needed

experiment_results = mlflow.search_runs()
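mlflow.search_runs() returns a pandas DataFrame, so the logged metrics can be compared directly. A minimal sketch of how the results might be inspected, assuming the metric keys logged above appear as 'metrics.<name>' columns:

# Compare RMSE across runs and pick the best random forest run
rmse_cols = ['run_id', 'metrics.rmse_lr', 'metrics.rmse_dtr', 'metrics.rmse_rfr']
print(experiment_results[rmse_cols])
best_run = experiment_results.sort_values('metrics.rmse_rfr').iloc[0]
print(best_run['run_id'], best_run['metrics.rmse_rfr'])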