Project: Predicting Movie Rental Durations
A DVD rental company needs your help! They want to figure out how many days a customer will rent a DVD for based on some features and have approached you for help. They want you to try out some regression models to predict the number of days a customer will rent a DVD for. The company wants a model that yields an MSE of 3 or less on a test set. The model you build will help the company plan its inventory more efficiently.
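Concretely, the acceptance criterion is the mean squared error on a held-out test set, MSE = mean((y_true - y_pred)**2), computed with scikit-learn's mean_squared_error. As a sketch only (y_test and y_pred stand in for the eventual test labels and model predictions):
# from sklearn.metrics import mean_squared_error
# mean_squared_error(y_test, y_pred) <= 3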
The data they provided is in the CSV file rental_info.csv. It has the following features:
"rental_date": The date (and time) the customer rents the DVD."return_date": The date (and time) the customer returns the DVD."amount": The amount paid by the customer for renting the DVD."amount_2": The square of"amount"."rental_rate": The rate at which the DVD is rented for."rental_rate_2": The square of"rental_rate"."release_year": The year the movie being rented was released."length": Lenght of the movie being rented, in minuites."length_2": The square of"length"."replacement_cost": The amount it will cost the company to replace the DVD."special_features": Any special features, for example trailers/deleted scenes that the DVD also has."NC-17","PG","PG-13","R": These columns are dummy variables of the rating of the movie. It takes the value 1 if the move is rated as the column name and 0 otherwise. For your convinience, the reference dummy has already been dropped.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
rentals = pd.read_csv("rental_info.csv")
rentals.head(2)
rentals.rental_date = pd.to_datetime(rentals.rental_date)
rentals.return_date = pd.to_datetime(rentals.return_date)
rentals.rental_date.dtype, rentals.return_date.dtype
rentals["rental_length_days"] = (rentals.return_date - rentals.rental_date).dt.days
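# Note: .dt.days keeps only whole days, so any fractional part of the rental period is truncated.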
rentals.head(2)
rentals.special_features.unique()
rentals["deleted_scenes"] = rentals["special_features"].apply(
lambda x: "Deleted Scenes" in str(x)
).astype(int)
rentals["behind_the_scenes"] = rentals["special_features"].apply(
lambda x: "Behind the Scenes" in str(x)
).astype(int)
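# An equivalent, more idiomatic pandas form (shown for reference, not used above):
# rentals["deleted_scenes"] = rentals["special_features"].str.contains("Deleted Scenes", na=False).astype(int)
# rentals["behind_the_scenes"] = rentals["special_features"].str.contains("Behind the Scenes", na=False).astype(int)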
# rentals["trailers"] = rentals["special_features"].apply(
# lambda x: "Trailers" in str(x)
# ).astype(int)
# rentals["commentaries"] = rentals["special_features"].apply(
# lambda x: "Commentaries" in str(x)
# ).astype(int)
rentals.head()
X = rentals.drop(
columns=[
"special_features",
"rental_date", "return_date",
"rental_length_days"
]
).copy()
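# Note: rental_date and return_date are excluded as predictors because their difference
# defines the target; keeping them would leak the answer into the features.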
y = rentals["rental_length_days"]
X.shape
import xgboost as xgb
import optuna
from sklearn.model_selection import cross_val_score
# Hold out 20% of the data as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=9)
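# Optional baseline for context (not part of the original workflow): a regressor that always
# predicts the mean training rental length. Any useful model should beat this comfortably
# on its way to the MSE <= 3 target.
from sklearn.dummy import DummyRegressor
baseline = DummyRegressor(strategy="mean").fit(X_train, y_train)
print("Mean-predictor baseline MSE:", mean_squared_error(y_test, baseline.predict(X_test)))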
optimize = False
if optimize:
def objective(trial):
params = {
"objective": "reg:squarederror",
"eval_metric": "rmse",
"tree_method": "hist",
"booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
"lambda": trial.suggest_loguniform("lambda", 1e-3, 10.0),
"alpha": trial.suggest_loguniform("alpha", 1e-3, 10.0),
"colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
"subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
"learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 0.3),
"max_depth": trial.suggest_int("max_depth", 3, 10),
"min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
"n_estimators": trial.suggest_int("n_estimators", 50, 300),
}
model = xgb.XGBRegressor(**params, random_state=9, n_jobs=-1)
# Use negative RMSE, so maximize
score = cross_val_score(model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=3)
return score.mean() # maximize negative RMSE
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30, show_progress_bar=True)
best_params = study.best_params
best_params["objective"] = "reg:squarederror"
best_params["eval_metric"] = "rmse"
best_params["random_state"] = 9
best_params["n_jobs"] = -1
best_model = xgb.XGBRegressor(**best_params)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
best_mse = mean_squared_error(y_test, y_pred)
print(best_mse, best_params)
else:
best_params = {
'booster': 'dart',
'lambda': 0.011975588741440897,
'alpha': 0.03682333289697279,
'colsample_bytree': 0.8094520386201829,
'subsample': 0.9500676752924467,
'learning_rate': 0.2978479368357079,
'max_depth': 5,
'min_child_weight': 8,
'n_estimators': 238,
'objective': 'reg:squarederror',
'eval_metric': 'rmse',
'random_state': 9,
'n_jobs': -1
}
best_model = xgb.XGBRegressor(**best_params)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
best_mse = mean_squared_error(y_test, y_pred)
print(best_mse)
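# Quick check against the business requirement: test-set MSE of 3 or less.
print("Meets the MSE <= 3 target:", best_mse <= 3)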