Skip to content
Project
  • AI Chat
  • Code
  • Report
  • As the climate changes, predicting the weather becomes ever more important for businesses. Since the weather depends on a lot of different factors, you will want to run a lot of experiments to determine what the best approach is to predict the weather. In this project, you will run experiments for different regression models predicting the mean temperature, using a combination of sklearn and MLflow.

    You will be working with data stored in london_weather.csv, which contains the following columns:

    • date - recorded date of measurement - (int)
    • cloud_cover - cloud cover measurement in oktas - (float)
    • sunshine - sunshine measurement in hours (hrs) - (float)
    • global_radiation - irradiance measurement in Watt per square meter (W/m2) - (float)
    • max_temp - maximum temperature recorded in degrees Celsius (°C) - (float)
    • mean_temp - mean temperature in degrees Celsius (°C) - (float)
    • min_temp - minimum temperature recorded in degrees Celsius (°C) - (float)
    • precipitation - precipitation measurement in millimeters (mm) - (float)
    • pressure - pressure measurement in Pascals (Pa) - (float)
    • snow_depth - snow depth measurement in centimeters (cm) - (float)
    import pandas as pd
    import numpy as np
    import mlflow
    import mlflow.sklearn
    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from datetime import datetime
    import matplotlib.pyplot as plt
    # Load the raw measurements and take a first look at them
    weather = pd.read_csv('london_weather.csv')
    weather.head(5)

    # Parse the YYYYMMDD date values into datetimes and derive year/month
    # columns, which the aggregation and the plots below rely on
    weather['date'] = pd.to_datetime(weather['date'], format='%Y%m%d')
    weather['year'] = weather['date'].dt.year
    weather['month'] = weather['date'].dt.month
    weather_metrics = ['cloud_cover', 'sunshine', 'global_radiation', 'max_temp', 'mean_temp', 'min_temp', 'precipitation', 'pressure', 'snow_depth']
    # Average every metric per calendar month of each year
    weather_per_month = weather.groupby(['year', 'month'], as_index=False)[weather_metrics].mean()

    # Mean temperature over the years
    # NOTE(review): newer seaborn releases deprecate `ci` in favour of
    # `errorbar`; left unchanged so the call keeps working on older versions
    sns.lineplot(x="year", y="mean_temp", data=weather_per_month, ci=None)
    plt.show()

    # Precipitation by month of the year
    sns.barplot(x='month', y='precipitation', data=weather)
    plt.show()

    # Correlation heatmap. Only numeric columns are correlated: `weather` now
    # holds a datetime `date` column, which DataFrame.corr() cannot convert to
    # float on pandas versions where numeric_only defaults to False.
    plt.figure(figsize=(12, 10))
    sns.heatmap(weather.select_dtypes(include='number').corr(), annot=True)
    plt.show()
    
    # Choose features, define the target, and drop null values
    feature_selection = ['month', 'cloud_cover', 'sunshine', 'precipitation', 'pressure', 'global_radiation']
    target_var = 'mean_temp'
    # Drop rows whose target is missing. The subset is derived from target_var
    # so the filter cannot drift out of sync if the target is changed.
    # Missing feature values are kept here; preprocess_df imputes them.
    weather = weather.dropna(subset=[target_var])
    
    def preprocess_df(df, feature_selection, target_var, test_size=0.2, random_state=42):
        """
        Split a dataframe into train and test sets, then impute and scale the features.

        The imputer and the scaler are fitted on the training features only and
        then applied to the test features, so no statistics computed from the
        test set influence the transformation.

        Args:
            df: DataFrame holding both the feature columns and the target column.
            feature_selection: List of column names to use as features.
            target_var: Name of the target column.
            test_size: Passed to train_test_split as ``test_size`` (default 0.2).
            random_state: Passed to train_test_split as ``random_state`` so the
                split is reproducible (default 42).

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``. The feature sets are the
            outputs of the scaler; the targets are the untransformed splits of
            ``df[target_var]``.
        """
        X = df[feature_selection]
        y = df[target_var]
        # Split before fitting any transformer so the test rows stay unseen
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        # Fill missing feature values with the column means of the training set
        imputer = SimpleImputer(strategy='mean')
        X_train = imputer.fit_transform(X_train)
        X_test = imputer.transform(X_test)
        # Standardise features using the training set's mean and variance
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        return X_train, X_test, y_train, y_test
    # Build the train/test sets once; the experiment loop further down reuses
    # these same four objects for every run
    X_train, X_test, y_train, y_test = preprocess_df(weather, feature_selection, target_var)
    
    def predict_and_evaluate(model, x_test, y_test):
        """
        Score a fitted model on held-out data.

        Calls ``model.predict`` on ``x_test`` and returns the root mean squared
        error between those predictions and ``y_test``.
        """
        predictions = model.predict(x_test)
        mse = mean_squared_error(y_test, predictions)
        return np.sqrt(mse)
    EXPERIMENT_NAME = "mohammadEx"

    # Reuse the experiment if it already exists, otherwise create it. The
    # lookup is performed once and its result reused, rather than querying the
    # tracking store a second time to read the id.
    existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if existing_experiment is None:
        EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)
    else:
        EXPERIMENT_ID = existing_experiment.experiment_id
    
    # Tree depths to try; one MLflow run is recorded per entry
    max_depth_parameters = [1, 2, 5, 10, 20]

    # Linear regression does not depend on the depth, so it is fitted and
    # scored once; the same fitted model and score are logged in every run
    lin_reg = LinearRegression().fit(X_train, y_train)
    lin_reg_rmse = predict_and_evaluate(lin_reg, X_test, y_test)

    for idx, depth in enumerate(max_depth_parameters):
        RUN_NAME = f"run_{idx}"
        with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=RUN_NAME):
            # Create the depth-dependent models
            tree_reg = DecisionTreeRegressor(random_state=42, max_depth=depth).fit(X_train, y_train)
            forest_reg = RandomForestRegressor(random_state=42, max_depth=depth).fit(X_train, y_train)
            # Log models
            mlflow.sklearn.log_model(lin_reg, "lin_reg")
            mlflow.sklearn.log_model(tree_reg, "tree_reg")
            mlflow.sklearn.log_model(forest_reg, "forest_reg")
            # Evaluate performance on the held-out test set
            tree_reg_rmse = predict_and_evaluate(tree_reg, X_test, y_test)
            forest_reg_rmse = predict_and_evaluate(forest_reg, X_test, y_test)
            # Log the depth used for this run and the RMSE of each model
            mlflow.log_param("max_depth", depth)
            mlflow.log_metric("rmse_lr", lin_reg_rmse)
            mlflow.log_metric("rmse_tr", tree_reg_rmse)
            mlflow.log_metric("rmse_fr", forest_reg_rmse)
    
    # Fetch the runs recorded under the experiment name used above
    experiment_results = mlflow.search_runs(experiment_names=[EXPERIMENT_NAME])
    # Bare expression: shows the result only where the last expression of a
    # cell is displayed (e.g. a notebook); it has no effect in a plain script
    experiment_results