Project: Predicting Movie Rental Durations
A DVD rental company needs your help! They want to figure out how many days a customer will rent a DVD for, based on some features, and have approached you for help. They want you to try out some regression models to predict the number of days a customer will rent a DVD for. The company wants a model that yields an MSE of 3 or less on a test set. The model you build will help the company plan its inventory more efficiently.
The data they provided is in the csv file rental_info.csv. It has the following features:

- "rental_date": The date (and time) the customer rents the DVD.
- "return_date": The date (and time) the customer returns the DVD.
- "amount": The amount paid by the customer for renting the DVD.
- "amount_2": The square of "amount".
- "rental_rate": The rate at which the DVD is rented out.
- "rental_rate_2": The square of "rental_rate".
- "release_year": The year the movie being rented was released.
- "length": Length of the movie being rented, in minutes.
- "length_2": The square of "length".
- "replacement_cost": The amount it will cost the company to replace the DVD.
- "special_features": Any special features, for example trailers/deleted scenes, that the DVD also has.
- "NC-17", "PG", "PG-13", "R": These columns are dummy variables of the rating of the movie. Each takes the value 1 if the movie is rated as the column name and 0 otherwise. For your convenience, the reference dummy has already been dropped.
# Importing the necessary libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score, KFold
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
# Reading rental_info.csv while parsing the date columns
rentals_df = pd.read_csv('rental_info.csv', parse_dates = ['rental_date', 'return_date'])
rentals_df.head()
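# A quick (optional) structural check that the loaded columns match the brief's
# description: dtypes, non-null counts, and the two parsed datetime columns
rentals_df.info()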
# Checking the dataset for nulls
rentals_df.isna().sum().sort_values() # All clear to go forward!
# Creating rental_length_days column
# Subtracting the two datetime columns gives a timedelta series; .dt.days extracts whole days
rentals_df['rental_length_days'] = (rentals_df['return_date'] - rentals_df['rental_date']).dt.days
rentals_df.head()
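# Note: .dt.days truncates partial days, so a rental of 2 days and 23 hours counts
# as 2 days. A minimal sketch (assumption: fractional days might matter for the
# business) of an alternative that keeps the fraction, shown here for comparison only:
rental_length_exact = (
    (rentals_df['return_date'] - rentals_df['rental_date']).dt.total_seconds() / 86400
)
print(rental_length_exact.head())  # Fractional rental lengths in days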
# Creating dummy variables from the special_features column
# Assigning 1 or 0 based on a substring search
rentals_df["deleted_scenes"] = np.where(rentals_df["special_features"].str.contains("Deleted Scenes"), 1, 0)
rentals_df["behind_the_scenes"] = np.where(rentals_df["special_features"].str.contains("Behind the Scenes"), 1, 0)
rentals_df.head()
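# Optional sanity check on the new dummies: the column means give the share of
# rentals carrying each special feature (exact values depend on the dataset)
print(rentals_df[['deleted_scenes', 'behind_the_scenes']].mean())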
# Dataframe for regression models
# A list of columns from rentals that will interfere with the modelling
cols_to_drop = ['rental_date', 'return_date',
                'special_features', 'rental_length_days']
# Defining the regression dataframe: X
X_df = rentals_df.drop(cols_to_drop,
                       axis = 1)
X = X_df.values
# Target variable: y (kept 1-D; scikit-learn estimators expect a flat target, and a
# column vector triggers a DataConversionWarning)
y = rentals_df['rental_length_days'].values
print(np.shape(X))
print(np.shape(y))
# Setting a seed for reproducibility
seed = 9
# Splitting the data for training and testing models
# (stratifying on y is possible here because rental_length_days is a discrete integer count)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    random_state = seed,
                                                    stratify = y)
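# A quick (optional) check that the stratified split preserved the distribution of
# rental lengths across the train and test sets
train_vals, train_counts = np.unique(y_train, return_counts = True)
test_vals, test_counts = np.unique(y_test, return_counts = True)
print(np.round(train_counts / len(y_train), 3))  # Class proportions in train
print(np.round(test_counts / len(y_test), 3))    # Class proportions in test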
# Experimenting with scaling the dataset; the scaler is fit on the training set
# only, so no test-set statistics leak into training
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
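# Caveat: because scaling happens before cross-validation, every CV fold below sees
# statistics computed from the whole training set. A minimal sketch (assumption:
# this refactor was not part of the original workflow) of a leakage-free setup
# using a Pipeline, where the scaler is refit inside each fold:
from sklearn.pipeline import Pipeline
lasso_pipe = Pipeline([('scaler', StandardScaler()),
                       ('lasso', Lasso(random_state = seed))])
# Pipeline hyperparameters are addressed as '<step_name>__<param_name>'
params_pipe = {'lasso__alpha': np.linspace(0.00001, 1, 10)}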
# First, I want to simplify my dataset by utilising only the most impactful features
# I do this by using a Lasso regression
lasso_initial = Lasso(random_state = seed)
# (GridSearchCV clones the estimator, so this initial fit is not strictly required)
lasso_initial_fit = lasso_initial.fit(X_train, y_train)
# Calculating the best alpha value to use for lasso model
# KFold object
kf = KFold(n_splits = 6,
           shuffle = True,
           random_state = seed)
# Set up the parameter grid
# Testing was done with up to 20 candidate alphas, but the grid is smaller here to
# save on computation time
params_lasso = {"alpha": np.linspace(0.00001, 1, 10)}
# Instantiate lasso_cv
lasso_cv = GridSearchCV(lasso_initial_fit,          # Model to tune
                        param_grid = params_lasso,  # Candidate alpha values
                        cv = kf,                    # 6-fold cross-validation splitter
                        n_jobs = -1)                # Use all available processors
# Fit to the training data
lasso_cv_fit = lasso_cv.fit(X_train, y_train)
# Optimized alpha for lasso regression
# In testing, neither scaling the data nor setting cv=3 changed the selected
# hyperparameter value
alpha_lasso = lasso_cv_fit.best_params_['alpha']
# Tuned lasso model
lasso_tuned = Lasso(alpha = alpha_lasso,
                    random_state = seed)
lasso_tuned_fit = lasso_tuned.fit(X_train, y_train)
# Extract optimal lasso coefficients
lasso_coef = lasso_tuned_fit.coef_
# Perform feature selection by keeping the columns with non-zero lasso coefficients
# (negative coefficients are informative too, so the test is != 0 rather than > 0)
X_train_lasso = X_train[:, lasso_coef != 0]
X_test_lasso = X_test[:, lasso_coef != 0]
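# Inspecting which features survived the selection, by mapping the coefficients
# back to the column names of X_df (.values preserves the column order)
selected_features = X_df.columns[lasso_coef != 0]
print(selected_features)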
# Random search for tuning the random forest regressor - GridSearchCV took >30 mins
# Create a random forest regressor
rf = RandomForestRegressor(random_state = seed)
# Random forest hyperparameter space
# ('auto' was removed as a max_features option in recent scikit-learn releases,
# so only 'log2' and 'sqrt' are kept)
param_dist_rf = {'n_estimators': np.arange(1, 101, 1),
                 'max_depth': np.arange(1, 21, 1),
                 'max_features': ['log2', 'sqrt'],
                 'min_samples_leaf': np.arange(1, 20, 1)}
# Use random search to find the best hyperparameters
rs_rf = RandomizedSearchCV(estimator = rf,
                           param_distributions = param_dist_rf,
                           cv = kf,
                           random_state = seed,
                           n_jobs = -1)
# Fit the random search object to the data
rs_rf_fit = rs_rf.fit(X_train, y_train)
# Store the best hyperparameters
# In testing, swapping in the lasso-reduced X_train did not change the selected
# hyperparameters
rs_rf_hyper_params = rs_rf_fit.best_params_
print(rs_rf_hyper_params)
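# The search above optimizes the regressor's default score (R^2). Since the brief
# targets MSE, a variant (assumption: not part of the original run) would pass
# scoring='neg_mean_squared_error' to RandomizedSearchCV; either way, the best
# cross-validated score can be read off directly:
print(rs_rf_fit.best_score_)  # Mean cross-validated R^2 of the best candidate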
# Random search for the best hyperparameters of a decision tree regressor
dt = DecisionTreeRegressor(random_state = seed)
# Decision tree hyperparameter space (again without the removed 'auto' option)
param_dist_dt = {'max_depth': np.arange(1, 101, 1),
                 'max_features': ['log2', 'sqrt'],
                 'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
                 'min_samples_leaf': np.arange(1, 20, 1)}
# Use random search to find the best hyperparameters
rs_dt = RandomizedSearchCV(estimator = dt,
                           param_distributions = param_dist_dt,
                           cv = kf,
                           random_state = seed,
                           n_jobs = -1)
# Fit the random search object to the data
rs_dt_fit = rs_dt.fit(X_train, y_train)
# Store the best hyperparameters
# As with the random forest, I assume that swapping in the lasso-reduced X_train
# would not change the selected hyperparameters
rs_dt_hyper_params = rs_dt_fit.best_params_
print(rs_dt_hyper_params)
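# Comparing the two searches' cross-validated scores (both R^2 by default) gives a
# quick, test-set-free preview of which tree-based model is stronger
print('RF best CV score: ', rs_rf_fit.best_score_)
print('DT best CV score: ', rs_dt_fit.best_score_)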
# Linear regression model: OLS
ols = LinearRegression()
# Logistic regression model: logreg
# (kept as an experiment; it treats the integer day counts as discrete classes
# rather than as a continuous target)
logreg = LogisticRegression(random_state = seed)
# Random forest with the tuned hyperparameters (unpacked from the search results)
rf_tuned = RandomForestRegressor(**rs_rf_hyper_params,
                                 random_state = seed)
# Decision tree regressor with the tuned hyperparameters: dt_tuned
dt_tuned = DecisionTreeRegressor(**rs_dt_hyper_params,
                                 random_state = seed)
# Define the list of regressors
regressors = [('Linear Regression', ols),
              ('Logistic Regression', logreg),
              ('Random Forest', rf_tuned),
              ('Decision Tree', dt_tuned)]
results_list = []
# Iterate over the pre-defined list of regressors
for regressor_name, model in regressors:
    # Fit the model to the scaled training set
    model.fit(X_train, y_train)
    # Predict on the test set
    y_pred = model.predict(X_test)
    # Calculate the mean squared error
    mse = mean_squared_error(y_test, y_pred)
    # Storing the result
    results_list.append(mse)
    # Repeating with the lasso-selected features
    model.fit(X_train_lasso, y_train)
    y_pred_lasso = model.predict(X_test_lasso)
    mse_lasso = mean_squared_error(y_test, y_pred_lasso)
    results_list.append(mse_lasso)
    # Report each model's MSE on the test set
    print('Scaled {:s} : {:.3f}'.format(regressor_name, mse))
    print('Lasso {:s} : {:.3f}'.format(regressor_name, mse_lasso))
# Results dataframe (built directly from the list, so the MSE values stay numeric
# instead of being coerced to strings via a NumPy object array)
results_df = pd.DataFrame({'MSE of Scaled': results_list[0::2],
                           'MSE of Lasso': results_list[1::2]},
                          index = ['OLS', 'Logreg',
                                   'Random Forest Regressor',
                                   'Decision Tree Regressor'])
# The results table shows the tuned random forest has the best (lowest) MSE,
# so it is refit and evaluated outside of the loop
rf_fit = rf_tuned.fit(X_train, y_train)
rf_y_pred = rf_fit.predict(X_test)
mse_rf = mean_squared_error(y_test, rf_y_pred)
results_df
# Best model recommendation
best_model = rf_tuned
best_mse = mse_rf
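# Final check against the brief: the company asked for an MSE of 3 or less on the
# test set
print('Best model MSE: {:.3f} (target: <= 3)'.format(best_mse))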