Skip to content
Predicting flight prices (r2=0.93, MAE=546)
Library import
import joblib
# Data Manipulation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Machine Learning
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
Auxiliary Functions
def simple_style(
    s: pd.Series,
    true_css: str,
    false_css: str = '',
    lower_is_better: tuple = ('MAE', 'MSE', 'RMSE'),
) -> np.ndarray:
    """
    Return per-cell CSS that highlights the best value of one column.

    Meant to be passed to ``DataFrame.style.apply``; the column is
    recognised through ``s.name``.

    Parameters
    ----------
    s : pd.Series
        The column being styled.
    true_css : str
        CSS applied to the cell(s) holding the best value.
    false_css : str, default ''
        CSS applied to every other cell.
    lower_is_better : tuple of str, default ('MAE', 'MSE', 'RMSE')
        Column names for which the minimum is the best value. For any
        other column the maximum is highlighted.

    Returns
    -------
    np.ndarray
        One CSS string per element of ``s``. Ties are all highlighted.
    """
    # Error metrics are best when smallest; scores are best when largest.
    best = s.min() if s.name in lower_is_better else s.max()
    return np.where(s == best, true_css, false_css)


# Idea adapted from:
# https://www.kaggle.com/code/jillanisofttech/flight-price-prediction
def predict(model_lst, X_train, X_test, y_train, y_test):
    """
    Fit each candidate model, tabulate its metrics and plot its test errors.

    Intended as a first screening pass, so that the most promising models
    can be picked for hyperparameter tuning afterwards.

    For every ``(name, model)`` pair the model is fitted on the training
    data and five values are collected: ``model.score`` on the train and
    test sets (rounded to 2 decimals), and MAE, MSE and RMSE on the test
    set (rounded to integers). The results are shown as a styled table
    (one row per model, best value per column highlighted), followed by a
    figure with one row per model: a histogram of the residuals
    ``y_test - prediction`` and a regression plot of predictions against
    actual values.

    Parameters
    ----------
    model_lst : iterable of (name, model) pairs
        ``model`` must expose ``fit``, ``predict`` and ``score``.
    X_train, X_test, y_train, y_test
        Train/test features and targets, passed unchanged to the models
        and to the metric functions.

    Returns
    -------
    None
        The table and the plots are produced as side effects.

    Raises
    ------
    ValueError
        If ``model_lst`` is empty.
    """
    # Materialise once so generators are accepted and the length is known.
    models = list(model_lst)
    if not models:
        raise ValueError('model_lst must contain at least one (name, model) pair')

    metric_names = ['Train Score', 'Test Score', 'MAE', 'MSE', 'RMSE']
    metrics_by_model = {}
    fitted_predictions = []

    for model_name, model in models:
        print('Running: {}'.format(model_name))
        model.fit(X_train, y_train)

        train_score = round(model.score(X_train, y_train), 2)
        predictions = model.predict(X_test)
        test_score = round(model.score(X_test, y_test), 2)

        mae = round(mean_absolute_error(y_test, predictions))
        # Compute the MSE once; the RMSE is derived from the unrounded value.
        raw_mse = mean_squared_error(y_test, predictions)
        mse = round(raw_mse)
        rmse = round(np.sqrt(raw_mse))

        metrics_by_model[model_name] = [train_score, test_score, mae, mse, rmse]
        fitted_predictions.append((model_name, predictions))

    # Metrics are assembled as columns, then transposed: one row per model.
    evalation_df = pd.DataFrame(metrics_by_model, index=metric_names).T

    print('\n\n\n################################## DataFrame Evaluator #####################################\n\n\n')
    # NOTE(review): `display` is not imported in the visible code; presumably
    # it is supplied by the notebook (IPython) environment -- confirm.
    display(evalation_df.style.apply(simple_style, true_css='background-color: blue'))

    print('\n\n\n####################################### Model Plots #####################################\n\n')
    n_models = len(models)
    # squeeze=False keeps `axs` two-dimensional even for a single model, so
    # every row can be unpacked as (histogram axis, regression axis).
    _fig, axs = plt.subplots(
        n_models, 2, figsize=(12, 6 * n_models), squeeze=False
    )
    for (model_name, prediction), (hist_ax, reg_ax) in zip(fitted_predictions, axs):
        # Left column: distribution of the residuals.
        hist = sns.histplot(y_test - prediction, kde=True, ax=hist_ax)
        hist.set_title(model_name)

        # Right column: predictions against actual values.
        reg = sns.regplot(
            x=y_test,
            y=prediction,
            color='skyblue',
            ax=reg_ax,
            line_kws={'color': 'blue', 'ls': '--'},
        )
        reg.set_title(model_name)
        reg.set_xlabel('Actual Values')
        reg.set_ylabel('Predictions Values')
Predicting flight prices
The objective of this work is to find the best model for predicting the price of airline tickets.
Final Result:
- Best Model: Random Forest
- R²: 0.93
- MAE: 546
Import Data and First Look
# Train data
df = pd.read_excel('data.xlsx')

# A bare expression is only rendered when it is the last statement of a
# notebook cell; display() keeps each table visible once cells are merged.
display(df.sample(5))

print('Data Shape:')
display(pd.DataFrame(df.shape, columns=['Train'], index=['No. Lines', 'No. Columns']))

# Verifying the data types
display(pd.DataFrame(df.dtypes, columns=['Type']))

# Verifying the NAs in the data
null_train = pd.DataFrame(df.isnull().sum(), columns=['No. of Nulls Train'])
display(null_train)