
    Bayesian Optimization tutorial

    Libraries Import

    In this notebook, we will be using the following libraries:

    • numpy: a library for numerical computing in Python.
    • scipy: a library for scientific computing in Python.
    • sklearn: a library for machine learning in Python.
    • GPyOpt: a library for Bayesian optimization in Python.

    To use these libraries, we import them at the beginning of our code with the import statement; the as keyword assigns each one a shorter alias.

    import numpy as np
    import scipy as sp
    import sklearn as skl
    import GPyOpt as gpo

    Prerequisites

    Ensure you have the following Python libraries installed:

    • xgboost: for the XGBoost model
    • scikit-learn: for data handling and evaluation metrics
    • GPyOpt: for Bayesian Optimization
    • numpy: for numerical operations
    You can install these packages using pip:

    !pip -q install xgboost scikit-learn GPyOpt numpy
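
    To confirm the installation, an optional sanity check is to import each library and print its version (the exact version numbers will depend on your environment):

    # Optional: verify that each library imports cleanly and report its version
    import numpy, sklearn, xgboost, GPyOpt
    for lib in (numpy, sklearn, xgboost, GPyOpt):
        print(lib.__name__, getattr(lib, '__version__', 'unknown'))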

    Import the relevant libraries

    import numpy as np
    import xgboost as xgb
    from sklearn.datasets import fetch_california_housing as fch
    from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
    from sklearn.metrics import mean_squared_error
    from GPyOpt.methods import BayesianOptimization

    Dataset preparation

    # Load dataset
    california_housing = fch()
    
    X = california_housing.data
    y = california_housing.target
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=0.3, 
                                                        random_state=2024)
    print(f"Training Shape: {X_train.shape}")
    print(f"Testing Shape: {X_test.shape}")

    Hyperparameter tuning using Random Search

    # Parameter grid
    param_dist = {
        'max_depth': [3, 5, 10, 15],
        'min_child_weight': [1, 5, 10],
        'subsample': [0.5, 0.7, 1.0],
        'colsample_bytree': [0.5, 0.7, 1.0],
        'n_estimators': [100, 200, 300, 400],
        'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2]
    }
    
    # Initialize XGBoost regressor
    xgb_reg = xgb.XGBRegressor()
    
    # Set up RandomizedSearchCV
    random_search = RandomizedSearchCV(
        xgb_reg, param_distributions=param_dist, n_iter=25,
        scoring='neg_mean_squared_error', cv=3, verbose=1, random_state=42
    )
    
    # Fit RandomizedSearchCV
    random_search.fit(X_train, y_train)
    
    # Best parameters
    print("Random Search Best Parameters:", random_search.best_params_)
    

    Hyperparameter tuning using Bayesian Optimization

    # Parameter bounds
    bayesian_opt_bounds = [
        {'name': 'max_depth', 'type': 'discrete', 'domain': (3, 5, 10, 15)},
        {'name': 'min_child_weight', 'type': 'discrete', 'domain': (1, 5, 10)},
        {'name': 'subsample', 'type': 'continuous', 'domain': (0.5, 1.0)},
        {'name': 'colsample_bytree', 'type': 'continuous', 'domain': (0.5, 1.0)},
        {'name': 'n_estimators', 'type': 'discrete', 'domain': (100, 200, 300, 400)},
        {'name': 'learning_rate', 'type': 'continuous', 'domain': (0.01, 0.2)}
    ]
    
    # Objective function: GPyOpt calls this with a 2D array of shape (1, n_params),
    # so we take the first row and map each entry to its hyperparameter.
    # cross_val_score returns negative MSE, so negating it yields the MSE,
    # which GPyOpt minimizes.
    def xgb_cv_score(parameters):
        parameters = parameters[0]
        score = -cross_val_score(
                    xgb.XGBRegressor(
                        max_depth=int(parameters[0]),
                        min_child_weight=int(parameters[1]),
                        subsample=parameters[2],
                        colsample_bytree=parameters[3],
                        n_estimators=int(parameters[4]),
                        learning_rate=parameters[5]),
                    X_train, y_train, scoring='neg_mean_squared_error', cv=3).mean()
        return score
    
    # Bayesian Optimization
    optimizer = BayesianOptimization(
        f=xgb_cv_score, domain=bayesian_opt_bounds, model_type='GP',
        acquisition_type='EI'
    )
    # Note: max_iter is an argument of run_optimization, not the constructor;
    # passed to BayesianOptimization it would be silently ignored and only
    # the initial design points would be evaluated.
    optimizer.run_optimization(max_iter=25)
    
    # Convert the optimizer's best point back into a named parameter dict,
    # casting the discrete hyperparameters to int
    param_names = ['max_depth', 'min_child_weight', 'subsample',
                   'colsample_bytree', 'n_estimators', 'learning_rate']
    best_params_bayesian = {
        k: int(v) if k in ['max_depth', 'min_child_weight', 'n_estimators'] else v
        for k, v in zip(param_names, optimizer.x_opt)
    }
    
    print("Bayesian Optimization Best Parameters:", best_params_bayesian)