Certification Practice Project
# imports
# base packages
import pandas as pd
import numpy as np
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style='darkgrid', palette='colorblind')
# missing-data visualization
import missingno as miss
# imputation -- the data turn out to have no missing values, but SimpleImputer
# stays in the pipelines below as a no-op safety step
from sklearn.impute import SimpleImputer, KNNImputer
# preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
# train/test systems
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_percentage_error as MAPE
# machine learning regressor models to try
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor as KNN
# deep learning regressor models
from tensorflow.keras import Sequential, Input
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import Dense, Dropout
# load and inspect the data
toyota_raw = pd.read_csv('toyota.csv')
toyota_raw
# missingness
print(toyota_raw.isna().sum()) # no obvious missing values
miss.matrix(toyota_raw)
plt.show()
# check column info
print(toyota_raw.info()) # all columns the appropriate data type
toyota_raw.describe() # min/max for numerical columns look appropriate, except year
toyota_raw['yearSince1997'] = toyota_raw['year']-1997
toyota_raw.drop('year',axis=1,inplace=True)
# ^ relationships are easier to reason about when the functional dependence starts at zero
# check for whitespace
for dtype, col_name in zip(toyota_raw.dtypes, toyota_raw.columns):
    if dtype == 'object':
        print(f"Checking {col_name}")
        print(toyota_raw[col_name].unique())
# there is stray whitespace in the 'model' column; trim it
toyota_raw['model'] = toyota_raw['model'].str.strip()
fig, ax = plt.subplots()
sns.histplot(data=toyota_raw,x='price', ax=ax)
mean = toyota_raw['price'].mean()
ax.axvline(x=mean, label=f"Average: {mean:.0F}", linestyle='--')
ax.legend()
ax.set(xlabel='Price ($)', ylabel='Count')
plt.tight_layout()
plt.savefig('price_histo.png')
# count by model
toyota_models = toyota_raw['model'].value_counts() \
    .rename_axis('model') \
    .reset_index(name='count') \
    .sort_values('count', ascending=False)
print(toyota_models)
colors = sns.diverging_palette(h_neg=220, h_pos=25, s=100, l=60, sep=60, n=len(toyota_raw['model'].unique()))
color_dict = { model: color for model, color in zip(reversed(toyota_models['model'].values),colors) }
fig, ax = plt.subplots()
sns.countplot(data=toyota_raw, y='model',ax=ax, order=toyota_raw['model'].value_counts().index, palette=color_dict )
ax.set(ylabel='Model', xlabel='Count')
# ax.set(xscale='log')
# plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('count_by_model.png')
# revenue by model
model_revenue = pd.DataFrame( toyota_raw[['model','price']].groupby('model')['price'].sum() ) \
    .reset_index() \
    .sort_values('price', ascending=False)
model_revenue['revenue'] = model_revenue['price']/1e6
fig, ax = plt.subplots()
sns.barplot(data=model_revenue, x='revenue', y='model', ax=ax, palette=color_dict)
ax.set(ylabel='Model', xlabel='Revenue (Million $)')
plt.tight_layout()
plt.savefig('revenue_by_model.png')
# price distribution by model
toyota_price_by_model = toyota_raw[['price','model']].copy()
toyota_price_by_model['mean'] = toyota_price_by_model.groupby('model')['price'].transform('mean')
toyota_price_by_model = toyota_price_by_model.sort_values('mean',ascending=False).drop('mean',axis=1)
print(toyota_price_by_model)
fig, ax = plt.subplots()
sns.boxplot(data=toyota_price_by_model,x='price',y='model', ax=ax, palette=color_dict)
ax.set(xlabel='Price ($)', ylabel='Model')
plt.tight_layout()
plt.savefig('box_plot_price_by_model.png')
fig, axes = plt.subplots(2,3,sharey=True)
i=0
palette = sns.color_palette('hls',6)
for dtype, col_name in zip(toyota_raw.dtypes, toyota_raw.columns):
    if dtype == 'object' or col_name == 'price':
        continue
    sns.scatterplot(data=toyota_raw, x=col_name, y='price',
                    ax=axes[i//3, i%3], color=palette[i], label=col_name)
    i += 1
    toyota_raw[[col_name,'price']].set_index(col_name).to_csv(f"{col_name}.csv")
plt.legend()
plt.tight_layout()
plt.savefig('linear_plots.png')
# mileage: price falls off like exp(-mileage), so replacing mileage with
# exp(-mileage/20000) should make the relationship linear and easier for
# linear regression to fit
if 'mileage' in toyota_raw.columns:
    toyota_raw['expMileage'] = np.exp(-toyota_raw['mileage']/20000)
    toyota_raw.drop('mileage', axis=1, inplace=True)
# tax: roughly linear (or close to a power law with exponent near 1)
# mpg: probably a sum of two sigmoids
# engine size follows a sigmoid, which is a little harder to correct for:
# F = 1690, H = 42400, w = 0.779, x0 = 2.36
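# A sketch of where fitted constants like the F, H, w, x0 above could come from,
# assuming the logistic form price ~ F + H / (1 + exp(-(x - x0)/w)) and that the
# column is named 'engineSize'; illustrative only, not part of the pipeline.
from scipy.optimize import curve_fit

def _sigmoid(x, F, H, w, x0):
    return F + H / (1.0 + np.exp(-(x - x0) / w))

params, _ = curve_fit(_sigmoid, toyota_raw['engineSize'], toyota_raw['price'],
                      p0=[1690, 42400, 0.779, 2.36])  # comment's values as the starting guess
print(dict(zip(['F', 'H', 'w', 'x0'], params)))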
# the price goes like exp(year)
if 'yearSince1997' in toyota_raw.columns:
    toyota_raw['expYear'] = np.exp(toyota_raw['yearSince1997']/10)
    toyota_raw.drop('yearSince1997', axis=1, inplace=True)
# categorical to numerical through one-hot encoding
toyota_onehot = pd.get_dummies(toyota_raw, drop_first=True)
toyota_shuffled = shuffle(toyota_onehot, random_state=2023)  # seeded for reproducibility
# split into features and labels
X = toyota_shuffled.drop('price', axis=1)
y = toyota_shuffled['price']
# split 20% off for testing
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=2023)
# make sure we haven't leaked label info into feature df
X.head()
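# a programmatic version of the eyeball check above (a sketch)
assert 'price' not in X.columns, "label column leaked into the feature matrix"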
# See how well we can do with a simple linear regression
linreg = LinearRegression()
linreg.fit(X_train,y_train)
y_pred=linreg.predict(X_test)
mse_linreg = MSE(y_test,y_pred)
rms_linreg = np.sqrt(mse_linreg)
print(f"Root mean squared error: {rms_linreg}")
mape_linreg = MAPE(y_test,y_pred)
print(f"Mean absolute percentage error: {mape_linreg}")
# Setting up list of models to try and their hyper-parameter grids
imputer = SimpleImputer()
scaler = MinMaxScaler() # this should really be StandardScaler for numerical data
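# Following the note above: a sketch of how the numeric columns alone could get
# a StandardScaler while the one-hot columns pass through untouched. The column
# names are assumed from the transforms earlier; this is illustrative and not
# wired into the pipelines below.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

numeric_cols = ['tax', 'mpg', 'engineSize', 'expMileage', 'expYear']  # assumed names
column_scaler = ColumnTransformer([('num', StandardScaler(), numeric_cols)],
                                  remainder='passthrough')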
knn = KNN()
knn_grid = {
    'model__n_neighbors': [1, 2, 3, 4, 5]
}
dt = DecisionTreeRegressor(random_state=2023)
dt_grid = {
    'model__max_depth': [10, 11, 12, 13, 14],
    'model__min_samples_leaf': [0.00001, 0.00005, 0.0001, 0.0002]
}
# stump = DecisionTreeRegressor(max_depth=1)
rf = RandomForestRegressor(random_state=2023)
rf_grid = {
    'model__max_features': ['sqrt'],
    'model__max_depth': [13, 14, 15],
    'model__min_samples_leaf': [0.00005, 0.0001, 0.0002, 0.0003]
}
ada = AdaBoostRegressor(random_state=2023)
ada_grid = {
    'model__n_estimators': [20, 25, 30],
    'model__learning_rate': [1, 1.5, 2]
}
grad = HistGradientBoostingRegressor(random_state=2023)
grad_grid = {
    'model__learning_rate': [0.1, 0.12, 0.14],
    'model__max_depth': [6, 7, 8],
    'model__min_samples_leaf': [1, 2, 3]
}
models = [ dt, knn, rf, ada, grad]
model_grids = [dt_grid,knn_grid,rf_grid,ada_grid,grad_grid]
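# Seed the search with the linear-regression baseline. One caveat: rms_linreg was
# measured on the held-out test set, while the grid-search scores below are
# cross-validation scores on the training set, so the comparison is approximate.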
ultimate_score = rms_linreg
ultimate_model = linreg
for model, grid in zip(reversed(models), reversed(model_grids)):
    steps = [('imputer', imputer), ('scaler', scaler), ('model', model)]
    pipeline = Pipeline(steps=steps)
    search_cv = GridSearchCV(estimator=pipeline,
                             param_grid=grid,
                             scoring='neg_mean_squared_error',
                             cv=10)
    search_cv.fit(X_train, y_train)
    best_model = search_cv.best_estimator_
    best_hypers = search_cv.best_params_
    best_score = np.sqrt(-search_cv.best_score_)
    print(f"For {model=} best RMS score is {best_score} with args {best_hypers}")
    if best_score < ultimate_score:
        ultimate_score = best_score
        ultimate_model = best_model
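# A sketch of the final step the loop stops short of: scoring the winning model
# once on the held-out test set. GridSearchCV refits best_estimator_ on the full
# training set by default, so ultimate_model is ready to predict.
y_pred_final = ultimate_model.predict(X_test)
rms_final = np.sqrt(MSE(y_test, y_pred_final))
mape_final = MAPE(y_test, y_pred_final)
print(f"Final model: {ultimate_model}")
print(f"Held-out RMSE: {rms_final:.0f}, MAPE: {mape_final:.3f}")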