
A DVD rental company needs your help! They want to predict how many days a customer will rent a DVD for, based on a handful of features, and have approached you for help. They want you to try out some regression models to predict the rental length in days. The company wants a model that yields a mean squared error (MSE) of 3 or less on a test set; such a model will help them plan their inventory more efficiently.

The data they provided is in the csv file rental_info.csv. It has the following features:

  • "rental_date": The date (and time) the customer rents the DVD.
  • "return_date": The date (and time) the customer returns the DVD.
  • "amount": The amount paid by the customer for renting the DVD.
  • "amount_2": The square of "amount".
  • "rental_rate": The rate at which the DVD is rented for.
  • "rental_rate_2": The square of "rental_rate".
  • "release_year": The year the movie being rented was released.
  • "length": Lenght of the movie being rented, in minuites.
  • "length_2": The square of "length".
  • "replacement_cost": The amount it will cost the company to replace the DVD.
  • "special_features": Any special features, for example trailers/deleted scenes that the DVD also has.
  • "NC-17", "PG", "PG-13", "R": These columns are dummy variables of the rating of the movie. It takes the value 1 if the move is rated as the column name and 0 otherwise. For your convinience, the reference dummy has already been dropped.
# Importing the necessary libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score, KFold
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# Reading rental_info.csv while parsing the date columns
rentals_df = pd.read_csv('rental_info.csv', parse_dates = ['rental_date', 'return_date'])
rentals_df.head()
# Checking the dataset for nulls
rentals_df.isna().sum().sort_values()   # No missing values, all clear to go forward!
# Creating the rental_length_days column
# Subtracting the timestamps gives a timedelta; .dt.days extracts the whole days
rentals_df['rental_length_days'] = (rentals_df['return_date'] - rentals_df['rental_date']).dt.days
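# Worked example: .dt.days keeps only whole days, so a DVD returned
# 2 days and 20 hours after rental counts as 2 rental days:
# pd.Timedelta("2 days 20:00:00").days  -> 2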
rentals_df.head()
# Creating dummy variables from the special_features column
# Assigning 1 or 0 based on a substring search
rentals_df["deleted_scenes"] = np.where(rentals_df["special_features"].str.contains("Deleted Scenes"), 1, 0)
rentals_df["behind_the_scenes"] = np.where(rentals_df["special_features"].str.contains("Behind the Scenes"), 1, 0)
rentals_df.head()
# Dataframe for the regression models
# Columns to drop: the raw dates and special_features can't be used as-is,
# and rental_length_days is the target, so keeping it would leak the answer
cols_to_drop = ['rental_date', 'return_date',
                'special_features', 'rental_length_days']

# Defining the regression dataframe: X
X_df = rentals_df.drop(cols_to_drop, axis = 1)

X = X_df.values

# Target variable: y (kept one-dimensional, as scikit-learn estimators expect)
y = rentals_df['rental_length_days'].values

print(np.shape(X))
print(np.shape(y))

# Setting a seed for reproducibility
seed = 9

# Splitting the data for training and testing models
# rental_length_days takes a small set of integer values, so stratifying on it
# keeps the distribution of rental lengths similar in both splits
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    random_state = seed,
                                                    stratify = y)

# Experimenting with scaling the dataset
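# Note: the scaler is fit on the training set only; the test set is then
# transformed with the training-set statistics to avoid data leakage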
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# First, I want to simplify my dataset by utilising only the most impactful features
# I do this with a Lasso regression, whose L1 penalty shrinks the coefficients of weak features to zero
lasso_initial = Lasso(random_state = seed)
# Calculating the best alpha value to use for lasso model
# KFold object 
kf = KFold(n_splits = 6,
           shuffle = True,
           random_state = seed)

# Set up the parameter grid
# Testing was done with up to 20 candidate alphas, lowered to 10 here to save computation time
params_lasso = {"alpha": np.linspace(0.00001, 1, 10)}

# Instantiate lasso_cv
lasso_cv = GridSearchCV(lasso_initial,             # Model to tune
                        param_grid = params_lasso, # Candidate alpha values
                        cv = kf,                   # 6-fold cross-validation splitter
                        n_jobs = -1)               # Use all available processors

# Fit to the training data
lasso_cv_fit = lasso_cv.fit(X_train, y_train)

# Optimized alpha for lasso regression
# In testing, neither the scaling of the data nor setting cv=3 changed the selected alpha
alpha_lasso = lasso_cv_fit.best_params_['alpha']
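# Inspecting the chosen regularisation strength
print(f"Optimal alpha: {alpha_lasso}")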
# Tuned lasso model
lasso_tuned = Lasso(alpha = alpha_lasso,
                    random_state = seed)

lasso_tuned_fit = lasso_tuned.fit(X_train, y_train)

# Extract optimal lasso coefficients
lasso_coef = lasso_tuned_fit.coef_

# Perform feature selection by keeping the columns whose lasso coefficients are non-zero
# (a coefficient shrunk to exactly zero means lasso dropped that feature)
X_train_lasso = X_train[:, lasso_coef != 0]
X_test_lasso = X_test[:, lasso_coef != 0]
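# Sanity check: how many features survived the lasso selection
print(f"{X_train_lasso.shape[1]} of {X_train.shape[1]} features retained")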
# Random search for tuning random forest regressor - GridSearchCV took >30 mins
# Create a random forest regressor
rf = RandomForestRegressor(random_state = seed)

# Random forest hyperparameter space
# ('auto' was removed as a max_features option in newer scikit-learn,
# so None is used instead to consider all features)
param_dist_rf = {'n_estimators': np.arange(1, 101, 1),
                 'max_depth': np.arange(1, 21, 1),
                 'max_features': ['log2', 'sqrt', None],
                 'min_samples_leaf': np.arange(1, 20, 1)}

# Use random search to find the best hyperparameters
rs_rf = RandomizedSearchCV(estimator = rf, 
                           param_distributions = param_dist_rf, 
                           cv = kf, 
                           random_state = seed,
                           n_jobs = -1)
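# RandomizedSearchCV samples n_iter=10 hyperparameter combinations by default,
# which is why it finishes far faster than the exhaustive grid search would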

# Fit the random search object to the data
rs_rf_fit = rs_rf.fit(X_train, y_train)

# Create a variable for the best hyperparameters
# In testing, swapping in the lasso-selected features did not change the chosen hyperparameters
rs_rf_hyper_params = rs_rf_fit.best_params_

print(rs_rf_hyper_params)
# Random search for best hyperparameters of decision tree regressor
dt = DecisionTreeRegressor(random_state = seed)

# Decision tree hyperparameter space
param_dist_dt = {'max_depth': np.arange(1, 101, 1),
                 'max_features': ['log2', 'sqrt', None],
                 'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
                 'min_samples_leaf': np.arange(1, 20, 1)}

# Use random search to find the best hyperparameters
rs_dt = RandomizedSearchCV(estimator = dt, 
                           param_distributions = param_dist_dt, 
                           cv = kf, 
                           random_state = seed,
                           n_jobs = -1)

# Fit the random search object to the data
rs_dt_fit = rs_dt.fit(X_train, y_train)

# Create a variable for the best hyperparameters
# As with the random forest, I've assumed that swapping in the
# lasso-selected features would not change the chosen hyperparameters
rs_dt_hyper_params = rs_dt_fit.best_params_

print(rs_dt_hyper_params)
# Linear regression models: OLS
ols = LinearRegression()

# Logistic regression models: logreg
logreg = LogisticRegression(random_state = seed)
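# Note: LogisticRegression is a classifier, so it treats the integer day
# counts as discrete classes; it is included here purely for comparison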

# Rf with tuned hyper parameters
rf_tuned = RandomForestRegressor(n_estimators = rs_rf_hyper_params['n_estimators'], 
                                 max_depth = rs_rf_hyper_params['max_depth'],
                                 max_features = rs_rf_hyper_params['max_features'],
                                 min_samples_leaf = rs_rf_hyper_params['min_samples_leaf'],
                                 random_state = seed)

# Decision tree regressor model: dt
dt_tuned = DecisionTreeRegressor(max_depth = rs_dt_hyper_params['max_depth'],
                                 max_features = rs_dt_hyper_params['max_features'],
                                 min_samples_leaf = rs_dt_hyper_params['min_samples_leaf'],
                                 criterion = rs_dt_hyper_params['criterion'],
                                 random_state = seed)

# Define the list of regressors
regressors = [('Linear Regression', ols),
              ('Logistic Regression', logreg),
              ('Random Forest', rf_tuned),
              ('Decision Tree', dt_tuned)]

results_list = []

# Iterate over the pre-defined list of models
for regressor_name, model in regressors:

    # Fit the model to the scaled training set
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Calculate the test-set MSE
    mse = mean_squared_error(y_test, y_pred)

    # Storing the result
    results_list.append(mse)

    # Repeating with the lasso-selected datasets
    model.fit(X_train_lasso, y_train)
    y_pred_lasso = model.predict(X_test_lasso)
    mse_lasso = mean_squared_error(y_test, y_pred_lasso)
    results_list.append(mse_lasso)

    # Report each model's test-set MSE
    print('Scaled {:s} : {:.3f}'.format(regressor_name, mse))
    print('Lasso {:s} : {:.3f}'.format(regressor_name, mse_lasso))
# Results dataframe (results_list holds a scaled and a lasso MSE per model,
# so reshaping to 4 rows by 2 columns lines each model up with its two scores)
results_df = pd.DataFrame(data = np.array(results_list).reshape(4, 2),
                          index = ['OLS', 'Logreg', 'Random Forest Regressor', 'Decision Tree Regressor'],
                          columns = ['MSE of Scaled', 'MSE of Lasso'])

# Refitting the best performer from the results table (the tuned random forest) to save its test MSE outside the loop
rf_fit = rf_tuned.fit(X_train, y_train)
rf_y_pred = rf_fit.predict(X_test)
mse_rf = mean_squared_error(y_test, rf_y_pred)

results_df
# Best model recommendation
best_model = rf_tuned
best_mse = mse_rf
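# Confirming the recommendation against the company's success criterion
print(f"Recommended model: tuned random forest, test MSE = {best_mse:.3f} (target: 3 or less)")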