Skip to content
As the climate changes, predicting the weather becomes ever more important for businesses. Because the weather depends on many different factors, you will want to run many experiments to determine the best approach to predicting it. In this project, you will run experiments for different regression models that predict the mean temperature, using a combination of `sklearn` and `MLflow`.
You will be working with data stored in `london_weather.csv`, which contains the following columns:
- date - recorded date of measurement - (int)
- cloud_cover - cloud cover measurement in oktas - (float)
- sunshine - sunshine measurement in hours (hrs) - (float)
- global_radiation - irradiance measurement in Watt per square meter (W/m2) - (float)
- max_temp - maximum temperature recorded in degrees Celsius (°C) - (float)
- mean_temp - mean temperature in degrees Celsius (°C) - (float)
- min_temp - minimum temperature recorded in degrees Celsius (°C) - (float)
- precipitation - precipitation measurement in millimeters (mm) - (float)
- pressure - pressure measurement in Pascals (Pa) - (float)
- snow_depth - snow depth measurement in centimeters (cm) - (float)
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime
import matplotlib.pyplot as plt
# Load data and perform exploratory analysis
weather = pd.read_csv('london_weather.csv')
weather.head(5)

# The date column is parsed with a YYYYMMDD format so that the year and month
# can be derived from it.
weather['date'] = pd.to_datetime(weather['date'], format='%Y%m%d')
weather['year'] = weather['date'].dt.year
weather['month'] = weather['date'].dt.month

# Average every measurement per calendar month of each year
weather_metrics = [
    'cloud_cover', 'sunshine', 'global_radiation', 'max_temp', 'mean_temp',
    'min_temp', 'precipitation', 'pressure', 'snow_depth',
]
weather_per_month = weather.groupby(['year', 'month'], as_index=False)[weather_metrics].mean()

# Mean temperature over the years
sns.lineplot(x="year", y="mean_temp", data=weather_per_month, ci=None)
plt.show()

# Precipitation by month
sns.barplot(x='month', y='precipitation', data=weather)
plt.show()

# Correlation heatmap. Only numeric columns are correlated: the parsed 'date'
# column is a datetime, and recent pandas versions no longer silently drop
# non-numeric columns inside DataFrame.corr().
plt.figure(figsize=(12, 10))
sns.heatmap(weather.select_dtypes(include='number').corr(), annot=True)
plt.show()

# Choose features, define the target, and drop null values
feature_selection = ['month', 'cloud_cover', 'sunshine', 'precipitation', 'pressure', 'global_radiation']
target_var = 'mean_temp'
# Rows without a target value cannot be used for training or evaluation
weather = weather.dropna(subset=['mean_temp'])
def preprocess_df(df, feature_selection, target_var, test_size=0.2, random_state=42):
    """
    Split a dataframe into train and test sets, then impute and scale the features.

    The split happens first; the imputer and the scaler are fitted on the
    training features only and then applied to the test features, so no
    statistics from the test set are used during fitting.

    Args:
        df: Dataframe containing the feature columns and the target column.
        feature_selection: List of column names to use as features.
        target_var: Name of the target column.
        test_size: Proportion of rows held out for testing, passed to
            ``train_test_split``. Defaults to 0.2.
        random_state: Seed for the split, passed to ``train_test_split``.
            Defaults to 42.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The feature sets are the
        imputed and scaled outputs of the transformers; the targets are the
        corresponding slices of ``df[target_var]``.
    """
    X = df[feature_selection]
    y = df[target_var]

    # Split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fill missing feature values with the per-column mean of the training set
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Standardise the features using statistics of the training set
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test
# Build the train/test feature and target sets from the cleaned weather data
X_train, X_test, y_train, y_test = preprocess_df(
    weather,
    feature_selection,
    target_var,
)
def predict_and_evaluate(model, x_test, y_test):
    """
    Score a fitted model on held-out data.

    Calls ``model.predict`` on ``x_test`` and returns the root mean squared
    error of those predictions against ``y_test``.
    """
    predictions = model.predict(x_test)
    mse = mean_squared_error(y_test, predictions)
    return np.sqrt(mse)
EXPERIMENT_NAME = "mohammadEx"

# Reuse the experiment if it already exists, otherwise create a new one.
# The lookup result is kept so the same experiment is not fetched twice.
existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if existing_experiment is None:
    EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    EXPERIMENT_ID = existing_experiment.experiment_id
# Tree depths to sweep; one MLflow run is recorded per depth
max_depth_parameters = [1, 2, 5, 10, 20]

for idx, depth in enumerate(max_depth_parameters):
    # Hyperparameters shared by the tree-based models of this run
    parameters = {
        'max_depth': depth,
        'random_state': 42,
    }
    RUN_NAME = f"run_{idx}"

    with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=RUN_NAME):
        # Create models (the linear model does not depend on depth; it is
        # refitted so that every run logs all three models)
        lin_reg = LinearRegression().fit(X_train, y_train)
        tree_reg = DecisionTreeRegressor(**parameters).fit(X_train, y_train)
        forest_reg = RandomForestRegressor(**parameters).fit(X_train, y_train)

        # Log models
        mlflow.sklearn.log_model(lin_reg, "lin_reg")
        mlflow.sklearn.log_model(tree_reg, "tree_reg")
        mlflow.sklearn.log_model(forest_reg, "forest_reg")

        # Evaluate performance
        lin_reg_rmse = predict_and_evaluate(lin_reg, X_test, y_test)
        tree_reg_rmse = predict_and_evaluate(tree_reg, X_test, y_test)
        forest_reg_rmse = predict_and_evaluate(forest_reg, X_test, y_test)

        # Log performance
        mlflow.log_param("max_depth", depth)
        mlflow.log_metric("rmse_lr", lin_reg_rmse)
        mlflow.log_metric("rmse_tr", tree_reg_rmse)
        mlflow.log_metric("rmse_fr", forest_reg_rmse)

# Collect every run recorded under this experiment
experiment_results = mlflow.search_runs(experiment_names=[EXPERIMENT_NAME])
experiment_results