Skip to content
Supervised Learning with scikit-learn
Supervised Learning with scikit-learn
Run the hidden code cell below to import the data used in this course.
# Importing pandas
import pandas as pd
# Importing the course datasets
diabetes = pd.read_csv('datasets/diabetes_clean.csv')
music = pd.read_csv('datasets/music_clean.csv')
advertising = pd.read_csv('datasets/advertising_and_sales_clean.csv')
telecom = pd.read_csv("datasets/telecom_churn_clean.csv")
Take Notes
Add notes about the concepts you've learned and code cells with code you want to keep.
Add your notes here
# Add your code snippets here
# Create neighbors
neighbors = np.arange(1, 26)
train_accuracies = {}
test_accuracies = {}
for neighbor in neighbors:
# Set up a KNN Classifier
knn = KNeighborsClassifier(n_neighbors=neighbor)
# Fit the model
knn.fit(X_train, y_train)
# Compute accuracy
train_accuracies[neighbor] = knn.score(X_train, y_train)
test_accuracies[neighbor] = knn.score(X_test, y_test)
print(neighbors, '\n', train_accuracies, '\n', test_accuracies)
# ASSESSING ACCURACY WITH CROSS-VALIDATION FRO R-SQUARED
# Import the necessary modules
from sklearn.model_selection import cross_val_score, KFold
# Create a KFold object
kf = KFold(n_splits=6, shuffle=True, random_state=5)
reg = LinearRegression()
# Compute 6-fold cross-validation scores
cv_scores = cross_val_score(reg, X, y, cv=kf)
# Print scores
print(cv_scores)
# VISUALISING ACCURACY
# Add a title
plt.title("KNN: Varying Number of Neighbors")
# Plot training accuracies
plt.plot(neighbors, train_accuracies.values(), label="Training Accuracy")
# Plot test accuracies
plt.plot(neighbors, test_accuracies.values(), label="Testing Accuracy")
plt.legend()
plt.xlabel("Number of Neighbors")
plt.ylabel("Accuracy")
# Display the plot
plt.show()
# Predict the probabilities of each individual in the test set having a diabetes diagnosis, storing the array of positive probabilities as y_pred_probs
from sklearn.linear_model import LogisticRegression
# Instantiate the model
logreg = LogisticRegression()
# Fit the model
logreg.fit(X_train, y_train)
# Predict probabilities
y_pred_probs = logreg.predict_proba(X_test)[:, 1]
print(y_pred_probs[:10])
# Calculate the ROC curve valueas and plot true positive rate against false positive rate.
from sklearn.metrics import roc_curve
# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)
plt.plot([0, 1], [0, 1], 'k--')
# Plot tpr against fpr
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Diabetes Prediction')
plt.show()
# Calculate ROC AUC score
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
# Calculate roc_auc_score
print(roc_auc_score(y_test, y_pred_probs))
# Calculate the confusion matrix
print(confusion_matrix(y_test, y_pred))
# Calculate the classification report
print(classification_report(y_test, y_pred))
Hidden output
# HYPERPARAMETER TUNING USING GRIDSEARCHCV
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV
# Set up the parameter grid
param_grid = {"alpha": np.linspace(0.00001, 1, num=20)}
# Instantiate lasso_cv
lasso_cv = GridSearchCV(lasso, param_grid, cv=kf)
# Fit to the training data
lasso_cv.fit(X_train, y_train)
print("Tuned lasso paramaters: {}".format(lasso_cv.best_params_))
print("Tuned lasso score: {}".format(lasso_cv.best_score_))
# HYPERPARAMETER TUNING USING RANDOMIZEDSEARCHCV
# Create the parameter space
params = {"penalty": ["l1", "l2"],
"tol": np.linspace(0.0001, 1.0, 50),
"C": np.linspace(.1, 1, num=50),
"class_weight": ["balanced", {0:.8, 1:.2}]}
# Instantiate the RandomizedSearchCV object
logreg_cv = RandomizedSearchCV(logreg, params, cv=kf)
# Fit the data to the model
logreg_cv.fit(X_train, y_train)
# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Best Accuracy Score: {}".format(logreg_cv.best_score_))
# REGRESSION WITH CATEGORICAL FEATURES
# Create X and y
X = music_dummies.drop('popularity', axis=1).values
y = music_dummies['popularity'].values
# Instantiate a ridge model
ridge = Ridge(alpha=.2)
# Perform cross-validation
scores = cross_val_score(ridge, X, y, cv=kf, scoring="neg_mean_squared_error")
# Calculate RMSE
rmse = np.sqrt(-scores)
print("Average RMSE: {}".format(np.mean(rmse)))
print("Standard Deviation of the target array: {}".format(np.std(y)))
# HANDLING MISSING DATA
# Print missing values for each column
print(music_df.isna().sum().sort_values())
# Remove values where less than 5% are missing
music_df = music_df.dropna(subset=["genre", "popularity", "loudness", "liveness", "tempo"])
# Convert genre to a binary feature
music_df["genre"] = np.where(music_df["genre"] == "Rock", 1, 0)
print(music_df.isna().sum().sort_values())
print("Shape of the `music_df`: {}".format(music_df.shape))
# PIPELINE FOR IMPUTATION AND PREDICTION
# Import modules
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
# Instantiate an imputer
imputer = SimpleImputer()
# Instantiate a knn model
knn = KNeighborsClassifier(n_neighbors=3)
# Build steps for the pipeline
steps = [("imputer", imputer),
("knn", knn)]
# Create the pipeline
pipeline = Pipeline(steps)
# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)
# Make predictions on the test set
y_pred = pipeline.predict(X_test)
# Print the confusion matrix
print(confusion_matrix(y_test, y_pred))
# SCALING IN A PIPELINE
# Import StandardScaler
from sklearn.preprocessing import StandardScaler
# Create pipeline steps
steps = [("scaler", StandardScaler()),
("lasso", Lasso(alpha=.5))]
# Instantiate the pipeline
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
# Calculate and print R-squared
print(pipeline.score(X_test, y_test))
# CENTERING AND SCALING FOR CLASSIFICATION
# Build the steps
steps = [("scaler", StandardScaler()),
("logreg", LogisticRegression())]
pipeline = Pipeline(steps)
# Create the parameter space
parameters = {"logreg__C": np.linspace(.001, 1.0, 20)}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=21)
# Instantiate the grid search object
cv = GridSearchCV(pipeline, param_grid=parameters)
# Fit to the training data
cv.fit(X_train, y_train)
print(cv.best_score_, "\n", cv.best_params_)
# EVALUATING MULTIPLE MODEL PERFORMANCES (REGRESSION)
models = {"Linear Regression": LinearRegression(), "Ridge": Ridge(alpha=0.1), "Lasso": Lasso(alpha=0.1)}
results = []
# Loop through the models' values
for model in models.values():
kf = KFold(n_splits=6, random_state=42, shuffle=True)
# Perform cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=kf)
# Append the results
results.append(cv_scores)
# Create a box plot of the results
plt.boxplot(results, labels=models.keys())
plt.show()
# PREDICTING ON TEST DATA
# Import mean_squared_error
from sklearn.metrics import mean_squared_error
for name, model in models.items():
# Fit the model to the training data
model.fit(X_train_scaled, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test_scaled)
# Calculate the test_rmse
test_rmse = mean_squared_error(y_test, y_pred, squared=Faslse)
print("{} Test Set RMSE: {}".format(name, test_rmse))
# EVALUATING MULTIPLE MODEL PERFORMANCES (CLASSIFICATION)
# Create models dictionary
models = {"KNN": KNeighborsClassifier(), "Logistic Regression": LogisticRegression(), "Decision Tree Classifier": DecisionTreeClassifier()}
results = []
# Loop through the models' values
for model in models.values():
# Instantiate a KFold object
kf = KFold(n_splits=6, random_state=12, shuffle=True)
# Perform cross-validation
cv_results = cross_val_score(model, X_train_scaled, y_train, cv=kf)
results.append(cv_results)
plt.boxplot(results, labels=models.keys())
plt.show()
# SUPERVISED LEARNING WORKFLOW IN A PIPELINE - Impute missing values, scale features, tune hyperparameters
# Create steps
steps = [("imp_mean", SimpleImputer()),
("scaler", StandardScaler()),
("logreg", LogisticRegression())]
# Set up pipeline
pipeline = Pipeline(steps)
params = {"logreg__solver": ["newton-cg", "saga", "lbfgs"],
"logreg__C": np.linspace(0.001, 1.0, 10)}
# Create the GridSearchCV object
tuning = GridSearchCV(pipeline, param_grid=params)
tuning.fit(X_train, y_train)
y_pred = tuning.predict(X_test)
# Compute and print performance
print("Tuned Logistic Regression Parameters: {}, Accuracy: {}".format(tuning.best_params_, tuning.score(X_test, y_test)))