Skip to content
Project
As the climate changes, predicting the weather becomes ever more important for businesses. Since the weather depends on a lot of different factors, you will want to run a lot of experiments to determine what the best approach is to predict the weather. In this project, you will run experiments for different regression models predicting the mean temperature, using a combination of sklearn and MLflow.
You will be working with data stored in london_weather.csv, which contains the following columns:
- date - recorded date of measurement - (int)
- cloud_cover - cloud cover measurement in oktas - (float)
- sunshine - sunshine measurement in hours (hrs) - (float)
- global_radiation - irradiance measurement in Watt per square meter (W/m2) - (float)
- max_temp - maximum temperature recorded in degrees Celsius (°C) - (float)
- mean_temp - mean temperature in degrees Celsius (°C) - (float)
- min_temp - minimum temperature recorded in degrees Celsius (°C) - (float)
- precipitation - precipitation measurement in millimeters (mm) - (float)
- pressure - pressure measurement in Pascals (Pa) - (float)
- snow_depth - snow depth measurement in centimeters (cm) - (float)
import pandas as pd
# Load the London weather measurements used throughout the analysis.
df = pd.read_csv("london_weather.csv")
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
class EDAExecutor:
    """Run quick exploratory checks and plots on a DataFrame.

    The plotting methods use the module-level ``plt`` (matplotlib.pyplot)
    and ``sns`` (seaborn) imports.
    """

    def __init__(self, dataset: pd.DataFrame):
        """Store the DataFrame that every report and plot reads from."""
        self.dataset = dataset

    def show_resume(self):
        """Print missing-value counts, column dtypes and summary statistics."""
        missing = self.dataset.isnull()
        if missing.any().any():
            print("Yess! there is\n")
            print(missing.sum())
        else:
            print("NOOO! there isn't")
        print('\n')
        print("What are the types of the columns?:")
        print(self.dataset.dtypes)
        print("\n")
        print("Statistic information:")
        # describe has to be *called*: printing the bare attribute only shows
        # the bound-method repr, not the statistics table.
        print(self.dataset.describe())

    def show_histogram(self):
        """Plot the distribution of the ``mean_temp`` column with a KDE overlay."""
        plt.figure(figsize=(8, 6))
        sns.histplot(data=self.dataset, x="mean_temp", kde=True)
        plt.title("Distribution of Mean Temperature (°C)")
        plt.xlabel("Mean Temperature (°C)")
        plt.ylabel("Frequency")
        plt.show()

    def show_corr_matrix(self):
        """Plot an annotated heatmap of the pairwise column correlations."""
        plt.figure(figsize=(10, 8))
        sns.heatmap(self.dataset.corr(), annot=True, cmap="coolwarm")
        plt.title("Correlation Heatmap")
        plt.show()
eda_info = EDAExecutor(df)

# Call the show_resume method
eda_info.show_resume()

eda_info.show_histogram()
# The histogram above indicates that the most frequent temperature range is
# between 5 and 10 degrees Celsius.

# Call the show_corr_matrix method
eda_info.show_corr_matrix()

# relplot is a figure-level function that builds its own figure, so no
# explicit plt.figure(...) call is made before it.
sns.relplot(data=df, x="snow_depth", y="max_temp", kind="line")
plt.title("Snow depth vs Maximum temperature")
plt.xlabel("Snow depth")
plt.ylabel("Max Temperature")
plt.show()
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
df = pd.read_csv("london_weather.csv")


def preprocess_df(df, feature_selection, target_var, test_size=0.3, random_state=None):
    """Split a DataFrame into imputed, scaled train and test sets.

    The frame is split into features and target, then into train and test
    partitions. The imputer and the scaler are fitted on the training
    features only and then applied to the test features, so no statistics
    from the test partition leak into the preprocessing.

    Args:
        df: DataFrame containing the feature and target columns.
        feature_selection: List of column names used as features.
        target_var: Name of the target column.
        test_size: Fraction of rows held out for testing (default 0.3,
            the value that was previously hard-coded).
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the split non-deterministic; pass an int to make
            experiment runs reproducible.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The feature sets are
        the outputs of the imputer/scaler transforms; the targets are the
        corresponding slices of ``df[target_var]``.
    """
    X = df[feature_selection]
    y = df[target_var]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Missing feature values are replaced by the training-set column mean.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    scaler = StandardScaler()

    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test
# Columns used as model inputs and the column the models should predict.
feature_selection = [
    'date',
    'sunshine',
    'global_radiation',
    'max_temp',
    'min_temp',
    'snow_depth',
    'pressure',
    'cloud_cover',
    'precipitation',
]
target_var = 'mean_temp'

# Drop rows whose target is missing; missing feature values are left in place
# here and handled by the imputer inside preprocess_df.
df_subset = df.dropna(subset=[target_var])

X_train, X_test, y_train, y_test = preprocess_df(df_subset, feature_selection, target_var)