Project — DataLab

As the climate changes, predicting the weather becomes ever more important for businesses. Since the weather depends on a lot of different factors, you will want to run a lot of experiments to determine what the best approach is to predict the weather. In this project, you will run experiments for different regression models predicting the mean temperature, using a combination of sklearn and MLflow.

You will be working with data stored in london_weather.csv, which contains the following columns:

date - recorded date of measurement - (int)
cloud_cover - cloud cover measurement in oktas - (float)
sunshine - sunshine measurement in hours (hrs) - (float)
global_radiation - irradiance measurement in watts per square meter (W/m²) - (float)
max_temp - maximum temperature recorded in degrees Celsius (°C) - (float)
mean_temp - mean temperature in degrees Celsius (°C) - (float)
min_temp - minimum temperature recorded in degrees Celsius (°C) - (float)
precipitation - precipitation measurement in millimeters (mm) - (float)
pressure - pressure measurement in Pascals (Pa) - (float)
snow_depth - snow depth measurement in centimeters (cm) - (float)

import pandas as pd
# Load the London weather measurements from the CSV file into a DataFrame
df = pd.read_csv("london_weather.csv")

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


class EDAExecutor:
    """Print and plot a quick exploratory summary of a DataFrame.

    The plotting methods rely on the module-level ``plt``
    (matplotlib.pyplot) and ``sns`` (seaborn) imports. ``show_histogram``
    additionally requires a ``mean_temp`` column in the dataset.
    """

    def __init__(self, dataset: pd.DataFrame):
        """Store the DataFrame that every ``show_*`` method reports on."""
        self.dataset = dataset

    def show_resume(self):
        """Print missing-value counts, column dtypes and summary statistics."""
        # Count the nulls once and reuse the result for both the check and
        # the printed per-column report.
        missing_per_column = self.dataset.isnull().sum()
        if missing_per_column.any():
            print("Yess! there is\n")
            print(missing_per_column)
        else:
            print("NOOO! there isn't")
        print('\n')
        print("What are the types of the columns?:")
        print(self.dataset.dtypes)
        print("\n")
        print("Statistic information:")
        # describe must be *called*; printing the attribute only shows the
        # bound-method repr instead of the statistics table.
        print(self.dataset.describe())

    def show_histogram(self):
        """Plot the distribution of ``mean_temp`` with a KDE overlay."""
        plt.figure(figsize=(8, 6))
        sns.histplot(data=self.dataset, x="mean_temp", kde=True)
        plt.title("Distribution of Mean Temperature (°C)")
        plt.xlabel("Mean Temperature (°C)")
        plt.ylabel("Frequency")
        plt.show()

    def show_corr_matrix(self):
        """Plot an annotated heatmap of pairwise column correlations."""
        # Correlation is only defined for numeric data; restricting the frame
        # first keeps this working when the dataset has non-numeric columns.
        numeric_columns = self.dataset.select_dtypes(include="number")
        plt.figure(figsize=(10, 8))
        sns.heatmap(numeric_columns.corr(), annot=True, cmap="coolwarm")
        plt.title("Correlation Heatmap")
        plt.show()

# Wrap the loaded DataFrame in the EDA helper
eda_info = EDAExecutor(df)

# Call the show_resume method
eda_info.show_resume()

# Call the show_histogram method
eda_info.show_histogram()

The histogram above indicates that the most frequent temperature range is between 5 and 10 degrees Celsius.

# Call the show_corr_matrix method
eda_info.show_corr_matrix()

# Line plot of max_temp against snow_depth; title and axis labels are set through pyplot
sns.relplot(data=df, x="snow_depth", y="max_temp",kind="line")
plt.title("Snow depth vs Maximum temperature")
plt.xlabel("Snow depth")
plt.ylabel("Max Temperature")
plt.show()

import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Re-read the raw CSV (the same file loaded earlier) into df
df = pd.read_csv("london_weather.csv")

def preprocess_df(df, feature_selection, target_var, *, test_size=0.3, random_state=None):
    """
    Split a dataframe into scaled, imputed train and test sets.

    The dataframe is first split into features (X) and target (y), then into
    train and test partitions. Missing feature values are mean-imputed and the
    features are standardised.

    Args:
        df: DataFrame containing the feature columns and the target column.
        feature_selection: List of column names to use as features.
        target_var: Name of the target column.
        test_size: Share of rows held out for testing, passed to
            ``train_test_split``. Defaults to 0.3.
        random_state: Seed passed to ``train_test_split``. Pass an int to get
            the same split on every run, so experiment runs are comparable;
            the default ``None`` keeps the split random.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The X values are the
        arrays produced by the imputer/scaler transforms; the y values are the
        corresponding slices of ``df[target_var]``.
    """
    X = df[feature_selection]
    y = df[target_var]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    scaler = StandardScaler()

    # Fit the imputer and scaler on the training rows only, then reuse the
    # fitted statistics on the test rows so no test information leaks in.
    X_train = scaler.fit_transform(imputer.fit_transform(X_train))
    X_test = scaler.transform(imputer.transform(X_test))

    return X_train, X_test, y_train, y_test

# Column to predict, and the predictor columns (order fixes the column order of X)
target_var = 'mean_temp'
feature_selection = [
    'date',
    'sunshine',
    'global_radiation',
    'max_temp',
    'min_temp',
    'snow_depth',
    'pressure',
    'cloud_cover',
    'precipitation',
]

# Drop rows whose target value is missing before splitting
df_subset = df.dropna(subset=[target_var])

X_train, X_test, y_train, y_test = preprocess_df(
    df_subset,
    feature_selection,
    target_var,
)

‌
‌
‌