Certification Practice Project
# imports
# base packages
import pandas as pd
import numpy as np
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style='darkgrid', palette='colorblind')
# missing-data visualization
import missingno as miss
# imputation -- the data turn out to have no missing values, but SimpleImputer
# stays in the pipelines below as a no-op safety step
from sklearn.impute import SimpleImputer, KNNImputer
# preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
# train/test systems
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_percentage_error as MAPE
# machine learning regressor models to try
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor as KNN
# deep learning regressor models
from tensorflow.keras import Sequential, Input
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import Dense, Dropout
# load and inspect the data
toyota_raw = pd.read_csv('toyota.csv')
toyota_raw
# missingness
print(toyota_raw.isna().sum()) # no obvious missing values
miss.matrix(toyota_raw)
plt.show()
# check column info
print(toyota_raw.info()) # all columns the appropriate data type
toyota_raw.describe() # min/max for numerical columns look appropriate, except year
toyota_raw['yearSince1997'] = toyota_raw['year']-1997
toyota_raw.drop('year',axis=1,inplace=True)
# ^ relationships are easier to reason about when the functional dependence starts at zero
# check for whitespace
for dtype, col_name in zip(toyota_raw.dtypes, toyota_raw.columns):
    if dtype == 'object':
        print(f"Checking {col_name}")
        print(toyota_raw[col_name].unique())
# there is stray whitespace in the 'model' column; trim it
toyota_raw['model'] = toyota_raw['model'].str.strip()
fig, ax = plt.subplots()
sns.histplot(data=toyota_raw,x='price', ax=ax)
mean = toyota_raw['price'].mean()
ax.axvline(x=mean, label=f"Average: {mean:.0F}", linestyle='--')
ax.legend()
ax.set(xlabel='Price ($)', ylabel='Count')
plt.tight_layout()
plt.savefig('price_histo.png')
# count by model
toyota_models = toyota_raw['model'].value_counts() \
    .rename_axis('model') \
    .reset_index(name='count') \
    .sort_values('count', ascending=False)
print(toyota_models)
colors = sns.diverging_palette(h_neg=220, h_pos=25, s=100, l=60, sep=60, n=len(toyota_raw['model'].unique()))
color_dict = { model: color for model, color in zip(reversed(toyota_models['model'].values),colors) }
fig, ax = plt.subplots()
sns.countplot(data=toyota_raw, y='model',ax=ax, order=toyota_raw['model'].value_counts().index, palette=color_dict )
ax.set(ylabel='Model', xlabel='Count')
# ax.set(xscale='log')
# plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('count_by_model.png')
# revenue by model
model_revenue = pd.DataFrame( toyota_raw[['model','price']].groupby('model')['price'].sum() ) \
    .reset_index() \
    .sort_values('price', ascending=False)
model_revenue['revenue'] = model_revenue['price']/1e6
fig, ax = plt.subplots()
sns.barplot(data=model_revenue, x='revenue', y='model', ax=ax, palette=color_dict)
ax.set(ylabel='Model', xlabel='Revenue (Million $)')
plt.tight_layout()
plt.savefig('revenue_by_model.png')
# price distribution by model
toyota_price_by_model = toyota_raw[['price','model']].copy()
toyota_price_by_model['mean'] = toyota_price_by_model.groupby('model')['price'].transform('mean')
toyota_price_by_model = toyota_price_by_model.sort_values('mean',ascending=False).drop('mean',axis=1)
print(toyota_price_by_model)
fig, ax = plt.subplots()
sns.boxplot(data=toyota_price_by_model,x='price',y='model', ax=ax, palette=color_dict)
ax.set(xlabel='Price ($)', ylabel='Model')
plt.tight_layout()
plt.savefig('box_plot_price_by_model.png')
fig, axes = plt.subplots(2,3,sharey=True)
i=0
palette = sns.color_palette('hls',6)
for dtype, col_name in zip(toyota_raw.dtypes, toyota_raw.columns):
    if dtype == 'object' or col_name == 'price':
        continue
    sns.scatterplot(data=toyota_raw, x=col_name, y='price',
                    ax=axes[i//3, i%3], color=palette[i], label=col_name)
    i += 1
    toyota_raw[[col_name,'price']].set_index(col_name).to_csv(f"{col_name}.csv")
plt.legend()
plt.tight_layout()
plt.savefig('linear_plots.png')
# mileage: price falls off like exp(-mileage), so replacing mileage with
# exp(-mileage/20000) should make the relationship linear and easier for
# linear regression to fit
if 'mileage' in toyota_raw.columns:
    toyota_raw['expMileage'] = np.exp(-toyota_raw['mileage']/20000)
    toyota_raw.drop('mileage', axis=1, inplace=True)
# tax: roughly linear (or close to a power law with exponent near 1)
# mpg: probably a sum of two sigmoids
# engine size follows a sigmoid, which is a little harder to correct for:
# F = 1690, H = 42400, w = 0.779, x0 = 2.36
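# A sketch of where fitted constants like the F, H, w, x0 above could come from,
# assuming the logistic form price ~ F + H / (1 + exp(-(x - x0)/w)) and that the
# column is named 'engineSize'; illustrative only, not part of the pipeline.
from scipy.optimize import curve_fit

def _sigmoid(x, F, H, w, x0):
    return F + H / (1.0 + np.exp(-(x - x0) / w))

params, _ = curve_fit(_sigmoid, toyota_raw['engineSize'], toyota_raw['price'],
                      p0=[1690, 42400, 0.779, 2.36])  # comment's values as the starting guess
print(dict(zip(['F', 'H', 'w', 'x0'], params)))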
# the price goes like exp(year)
if 'yearSince1997' in toyota_raw.columns:
    toyota_raw['expYear'] = np.exp(toyota_raw['yearSince1997']/10)
    toyota_raw.drop('yearSince1997', axis=1, inplace=True)
# categorical to numerical through one-hot encoding
toyota_onehot = pd.get_dummies(toyota_raw, drop_first=True)
toyota_shuffled = shuffle(toyota_onehot, random_state=2023)  # seeded for reproducibility
# split into features and labels
X = toyota_shuffled.drop('price', axis=1)
y = toyota_shuffled['price']
# split 20% off for testing
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=2023)
# make sure we haven't leaked label info into feature df
X.head()
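# a programmatic version of the eyeball check above (a sketch)
assert 'price' not in X.columns, "label column leaked into the feature matrix"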
# See how well we can do with a simple linear regression
linreg = LinearRegression()
linreg.fit(X_train,y_train)
y_pred=linreg.predict(X_test)
mse_linreg = MSE(y_test,y_pred)
rms_linreg = np.sqrt(mse_linreg)
print(f"Root mean squared error: {rms_linreg}")
mape_linreg = MAPE(y_test,y_pred)
print(f"Mean absolute percentage error: {mape_linreg}")
# Setting up list of models to try and their hyper-parameter grids
imputer = SimpleImputer()
scaler = MinMaxScaler() # this should really be StandardScaler for numerical data
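# Following the note above: a sketch of how the numeric columns alone could get
# a StandardScaler while the one-hot columns pass through untouched. The column
# names are assumed from the transforms earlier; this is illustrative and not
# wired into the pipelines below.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

numeric_cols = ['tax', 'mpg', 'engineSize', 'expMileage', 'expYear']  # assumed names
column_scaler = ColumnTransformer([('num', StandardScaler(), numeric_cols)],
                                  remainder='passthrough')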
knn = KNN()
knn_grid = {
    'model__n_neighbors': [1, 2, 3, 4, 5]
}
dt = DecisionTreeRegressor(random_state=2023)
dt_grid = {
    'model__max_depth': [10, 11, 12, 13, 14],
    'model__min_samples_leaf': [0.00001, 0.00005, 0.0001, 0.0002]
}
# stump = DecisionTreeRegressor(max_depth=1)
rf = RandomForestRegressor(random_state=2023)
rf_grid = {
    'model__max_features': ['sqrt'],
    'model__max_depth': [13, 14, 15],
    'model__min_samples_leaf': [0.00005, 0.0001, 0.0002, 0.0003]
}
ada = AdaBoostRegressor(random_state=2023)
ada_grid = {
    'model__n_estimators': [20, 25, 30],
    'model__learning_rate': [1, 1.5, 2]
}
grad = HistGradientBoostingRegressor(random_state=2023)
grad_grid = {
    'model__learning_rate': [0.1, 0.12, 0.14],
    'model__max_depth': [6, 7, 8],
    'model__min_samples_leaf': [1, 2, 3]
}
models = [ dt, knn, rf, ada, grad]
model_grids = [dt_grid,knn_grid,rf_grid,ada_grid,grad_grid]
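# Seed the search with the linear-regression baseline. One caveat: rms_linreg was
# measured on the held-out test set, while the grid-search scores below are
# cross-validation scores on the training set, so the comparison is approximate.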
ultimate_score = rms_linreg
ultimate_model = linreg
for model, grid in zip(reversed(models), reversed(model_grids)):
    steps = [('imputer', imputer), ('scaler', scaler), ('model', model)]
    pipeline = Pipeline(steps=steps)
    search_cv = GridSearchCV(estimator=pipeline,
                             param_grid=grid,
                             scoring='neg_mean_squared_error',
                             cv=10)
    search_cv.fit(X_train, y_train)
    best_model = search_cv.best_estimator_
    best_hypers = search_cv.best_params_
    best_score = np.sqrt(-search_cv.best_score_)
    print(f"For {model=} best RMS score is {best_score} with args {best_hypers}")
    if best_score < ultimate_score:
        ultimate_score = best_score
        ultimate_model = best_model
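# A sketch of the final step the loop stops short of: scoring the winning model
# once on the held-out test set. GridSearchCV refits best_estimator_ on the full
# training set by default, so ultimate_model is ready to predict.
y_pred_final = ultimate_model.predict(X_test)
rms_final = np.sqrt(MSE(y_test, y_pred_final))
mape_final = MAPE(y_test, y_pred_final)
print(f"Final model: {ultimate_model}")
print(f"Held-out RMSE: {rms_final:.0f}, MAPE: {mape_final:.3f}")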