Skip to content

A DVD rental company needs your help! They want to figure out how many days a customer will rent a DVD for based on some features and has approached you for help. They want you to try out some regression models which will help predict the number of days a customer will rent a DVD for. The company wants a model which yields an MSE of 3 or less on a test set. The model you make will help the company become more efficient at inventory planning.

The data they provided is in the csv file rental_info.csv. It has the following features:

  • "rental_date": The date (and time) the customer rents the DVD.
  • "return_date": The date (and time) the customer returns the DVD.
  • "amount": The amount paid by the customer for renting the DVD.
  • "amount_2": The square of "amount".
  • "rental_rate": The rate at which the DVD is rented for.
  • "rental_rate_2": The square of "rental_rate".
  • "release_year": The year the movie being rented was released.
  • "length": Length of the movie being rented, in minutes.
  • "length_2": The square of "length".
  • "replacement_cost": The amount it will cost the company to replace the DVD.
  • "special_features": Any special features, for example trailers/deleted scenes that the DVD also has.
  • "NC-17", "PG", "PG-13", "R": These columns are dummy variables of the rating of the movie. It takes the value 1 if the movie is rated as the column name and 0 otherwise. For your convenience, the reference dummy has already been dropped.
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the rental data and engineer the target plus special-feature dummies.
rentals = pd.read_csv("rental_info.csv")

# Parse the timestamp columns so the rental duration can be computed.
rentals["rental_date"] = pd.to_datetime(rentals["rental_date"])
rentals["return_date"] = pd.to_datetime(rentals["return_date"])

# Target: number of whole days between rental and return.
rentals["rental_length_days"] = (
    rentals["return_date"] - rentals["rental_date"]
).dt.days

# Dummy flags for selected special features. Vectorized str.contains replaces
# the slower row-wise apply; astype(str) mirrors the original str(x) coercion
# (NaN becomes "nan"), and regex=False because the needles are plain substrings.
rentals["deleted_scenes"] = (
    rentals["special_features"]
    .astype(str)
    .str.contains("Deleted Scenes", regex=False)
    .astype(int)
)
rentals["behind_the_scenes"] = (
    rentals["special_features"]
    .astype(str)
    .str.contains("Behind the Scenes", regex=False)
    .astype(int)
)

# Feature matrix: drop the raw text column, the timestamps (their information
# is captured by the engineered target), and the target itself.
X = rentals.drop(
    columns=[
        "special_features",
        "rental_date", "return_date",
        "rental_length_days",
    ]
).copy()
y = rentals["rental_length_days"]
import xgboost as xgb
import optuna
from sklearn.model_selection import cross_val_score

# Hold out 20% of the data for the final evaluation; random_state is pinned
# for reproducibility (X and y are defined above).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=9)


def _fit_and_score(params):
    """Fit an XGBRegressor with *params* on the training split.

    Returns a tuple ``(fitted_model, test_mse, test_predictions)`` so both
    the tuning and the cached-parameters branches share one code path.
    """
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return model, mean_squared_error(y_test, preds), preds


# Flip to True to re-run the Optuna search instead of using the cached params.
optimize = False
if optimize:
    def objective(trial):
        """Optuna objective: mean 3-fold CV negative RMSE (maximized)."""
        params = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "tree_method": "hist",
            "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
            # suggest_float(..., log=True) replaces the deprecated
            # suggest_loguniform / suggest_uniform APIs (removed in Optuna 3+)
            # while sampling from the same distributions.
            "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        }
        model = xgb.XGBRegressor(**params, random_state=9, n_jobs=-1)
        score = cross_val_score(
            model, X_train, y_train,
            scoring="neg_root_mean_squared_error", cv=3,
        )
        return score.mean()  # negative RMSE, hence direction="maximize"

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30, show_progress_bar=True)

    # study.best_params only holds the searched keys; re-attach the fixed ones
    # before refitting on the full training split.
    best_params = study.best_params
    best_params["objective"] = "reg:squarederror"
    best_params["eval_metric"] = "rmse"
    best_params["random_state"] = 9
    best_params["n_jobs"] = -1

    best_model, best_mse, y_pred = _fit_and_score(best_params)
    print(best_mse, best_params)

else:
    # Parameters found by a previous Optuna run (n_trials=30); kept verbatim
    # so results are reproducible without re-running the search.
    best_params = {
        'booster': 'dart',
        'lambda': 0.011975588741440897,
        'alpha': 0.03682333289697279,
        'colsample_bytree': 0.8094520386201829,
        'subsample': 0.9500676752924467,
        'learning_rate': 0.2978479368357079,
        'max_depth': 5,
        'min_child_weight': 8,
        'n_estimators': 238,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'random_state': 9,
        'n_jobs': -1
    }

    best_model, best_mse, y_pred = _fit_and_score(best_params)
    print(best_mse)