Skip to content

Library import

import joblib

# Data Manipulation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score

Auxiliary Functions

def simple_style(
        s: pd.Series, true_css: str, false_css: str = ''
) -> np.ndarray:
    """
    Build the per-cell CSS used to highlight the best value of a column.

    For the error columns ('MAE', 'MSE', 'RMSE') the best value is the
    minimum; for every other column it is the maximum. Cells equal to the
    best value receive ``true_css`` and all other cells receive
    ``false_css``.
    """
    lower_is_better = s.name in ('MAE', 'MSE', 'RMSE')
    best_value = s.min() if lower_is_better else s.max()
    return np.where(s == best_value, true_css, false_css)
# Idea adapted from:
# https://www.kaggle.com/code/jillanisofttech/flight-price-prediction

def predict(model_lst, X_train, X_test, y_train, y_test):
    """
    Fit every candidate model, then tabulate and plot its test performance.

    This is a first screening step used to pick the models that are worth
    hyperparameter tuning.

    Parameters
    ----------
    model_lst : iterable of (name, model) pairs
        ``name`` labels the model in the table and in the plot titles.
        ``model`` must expose ``fit``, ``predict`` and ``score``; it is
        fitted in place on the training data.
    X_train, y_train
        Data passed to ``model.fit`` and used for the train score.
    X_test, y_test
        Data used for the predictions, the test score and the error metrics.

    Returns
    -------
    None
        Results are produced as side effects: progress lines are printed,
        the metrics table is displayed (one row per model, with columns
        'Train Score', 'Test Score', 'MAE', 'MSE', 'RMSE') and a matplotlib
        figure is created with one row of two plots per model (residual
        histogram on the left, actual-vs-predicted regression on the right).

    Raises
    ------
    ValueError
        If ``model_lst`` is empty.
    """
    # Materialise once so the input can be both iterated and counted, even
    # when the caller passes a one-shot iterable.
    model_lst = list(model_lst)
    if not model_lst:
        raise ValueError('model_lst must contain at least one (name, model) pair')

    metric_names = ['Train Score', 'Test Score', 'MAE', 'MSE', 'RMSE']
    metrics_by_model = {}
    fitted_predictions = []

    for model_name, model in model_lst:
        print('Running: {}'.format(model_name))
        model.fit(X_train, y_train)

        # ``score`` is whatever the estimator defines (R² for scikit-learn
        # regressors); it is rounded to two decimals for the table.
        train_score = round(model.score(X_train, y_train), 2)
        predictions = model.predict(X_test)
        test_score = round(model.score(X_test, y_test), 2)

        # Error metrics are rounded to whole numbers. The RMSE is derived
        # from the unrounded MSE, which is computed only once.
        mae = round(mean_absolute_error(y_test, predictions))
        raw_mse = mean_squared_error(y_test, predictions)
        mse = round(raw_mse)
        rmse = round(np.sqrt(raw_mse))

        metrics_by_model[model_name] = [train_score, test_score, mae, mse, rmse]
        fitted_predictions.append((model_name, predictions))

    # One column per model with the metrics as index, transposed so that
    # every model becomes a row.
    evaluation_df = pd.DataFrame(metrics_by_model, index=metric_names).T

    print('\n\n\n##################################   DataFrame Evaluator    #####################################\n\n\n')
    # ``display`` only exists when IPython/Jupyter has injected it; fall
    # back to a plain-text table so the function also works in a script.
    try:
        notebook_display = display
    except NameError:
        notebook_display = None
    if notebook_display is None:
        print(evaluation_df)
    else:
        notebook_display(
            evaluation_df.style.apply(simple_style, true_css='background-color: blue')
        )

    print('\n\n\n#######################################   Model Plots    #####################################\n\n')

    n_models = len(model_lst)
    # squeeze=False keeps ``axs`` two-dimensional even for a single model,
    # so each row can always be unpacked into its two axes.
    _, axs = plt.subplots(
        n_models, 2, figsize=(12, 6 * n_models), squeeze=False
    )

    for (model_name, prediction), (hist_ax, reg_ax) in zip(fitted_predictions, axs):
        # Left: distribution of the residuals.
        hist = sns.histplot(y_test - prediction, kde=True, ax=hist_ax)
        hist.set_title(model_name)

        # Right: predictions against the actual values with a fitted line.
        reg = sns.regplot(x=y_test, y=prediction,
                          color='skyblue', ax=reg_ax,
                          line_kws={'color': 'blue', 'ls': '--'})
        reg.set_title(model_name)
        reg.set_xlabel('Actual Values')
        reg.set_ylabel('Predictions Values')
            
            
            

Predicting flight prices

The objective of this work is to find the best model for predicting the price of airline tickets.

Final Result:

  • Best Model: RandomForest
  • R²: 0.93
  • MAE: 546

Import Data and First Look

# Train data
# Load the dataset from the Excel workbook into a DataFrame.
df = pd.read_excel('data.xlsx')
# Show five randomly sampled rows as a quick look at the contents.
display(df.sample(5))
print('Data Shape:')
# Tabulate the (rows, columns) shape of df.
# NOTE(review): this trailing expression is presumably rendered as the
# notebook cell output; in a plain script it has no visible effect.
pd.DataFrame(df.shape, columns=['Train'], index=['No. Lines', 'No. Columns'])

Verifying the data types

# One row per column of df, showing its pandas dtype in a 'Type' column.
pd.DataFrame(df.dtypes, columns=['Type'])

Verifying the NAs in the data

# Count the missing values in each column of df.
null_train = pd.DataFrame(df.isnull().sum(), columns=['No. of Nulls Train'])

# NOTE(review): trailing expression, presumably rendered as the notebook
# cell output.
null_train