Skip to content
Bagging in Machine Learning
  • AI Chat
  • Code
  • Report
  • Spinner

    Dataset

    The telecom customer churn dataset comes from an Iranian telecom company, with each row representing one customer over a one-year period. Along with a churn label, it contains information on each customer's activity, such as call failures and subscription length.

    import pandas as pd
    
    # Load the churn records; the path is relative to the working directory.
    customer = pd.read_csv("data/customer_churn.csv")
    
    # Peek at the first rows and at how many rows carry each Churn label.
    customer.head()
    customer["Churn"].value_counts()
    
    # Separate the target column from the predictor columns.
    y = customer["Churn"]  # Dependent variable
    X = customer.drop(columns="Churn")  # Independent variables

    Training a machine learning model

    # Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
    from sklearn.model_selection import train_test_split, cross_val_score
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=1,
    )
    
    # Model: standardize the features, then fit a decision tree on them
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    
    # Pipeline.fit returns the fitted pipeline itself, so build and fit in one go
    pipeline = Pipeline(
        steps=[
            ('scaler', StandardScaler()),
            ('classifier', DecisionTreeClassifier(random_state=42)),
        ]
    ).fit(X_train, y_train)

    Model Evaluation

    from sklearn.metrics import classification_report
    
    # Make prediction on the testing data
    y_pred = pipeline.predict(X_test)
    
    # Classification report. The argument order is (y_true, y_pred); passing them
    # the other way round swaps precision and recall in the printed table.
    print(classification_report(y_test, y_pred))
    
    # Evaluate the classifier using 5-fold cross-validation on the full dataset
    cv_scores = cross_val_score(pipeline, X, y, cv=5)
    
    print(f"Cross-validation scores: {cv_scores}")
    # cross_val_score returns a NumPy array, so its own .mean() is used here and
    # no separate numpy import is required.
    print(f"Mean CV accuracy: {cv_scores.mean():.2f}")

    Bagging

    from sklearn.ensemble import BaggingClassifier
    
    # Create a bagging ensemble of 50 copies of the scaler + decision-tree pipeline.
    # NOTE: the keyword is `estimator` (scikit-learn >= 1.2); the older
    # `base_estimator` spelling was deprecated in 1.2 and removed in 1.4.
    bagging_classifier = BaggingClassifier(
        estimator=pipeline,
        n_estimators=50,
        random_state=42,
    )
    
    # Train the bagging classifier on the training data
    bagging_classifier.fit(X_train, y_train)

    Evaluating Ensemble Model

    # Make prediction on the testing data
    y_pred = bagging_classifier.predict(X_test)
    
    # Classification report. The argument order is (y_true, y_pred); passing them
    # the other way round swaps precision and recall in the printed table.
    print(classification_report(y_test, y_pred))
    
    # Evaluate the ensemble using 5-fold cross-validation on the full dataset
    cv_scores = cross_val_score(bagging_classifier, X, y, cv=5)
    
    print(f"Cross-validation scores: {cv_scores}")
    # cross_val_score returns a NumPy array, so its own .mean() is used here and
    # no separate numpy import is required.
    print(f"Mean CV accuracy: {cv_scores.mean():.2f}")