Skip to content
1 hidden cell
Supervised Learning with scikit-learn
Supervised Learning with scikit-learn
Run the hidden code cell below to import the data used in this course.
1 hidden cell
Take Notes
Add notes about the concepts you've learned and code cells with code you want to keep.
Add your notes here
# Add your code snippets hereimport pandas as pd
df = pd.read_csv('datasets/telecom_churn_clean.csv')
print(df['churn'].value_counts())Supervised Learning: KNN for Classification
# Import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
# Create arrays for the features and the target variable
y = churn_df["churn"].values
X = churn_df[["account_length", "customer_service_calls"]].values
# Create a KNN classifier with 6 neighbors
knn = KNeighborsClassifier(n_neighbors = 6)
# Fit the classifier to the data
knn.fit(X, y)
# Predict the labels for the X_new
X_new = np.array([[30.0, 17.5],
[107.0, 24.1],
[213.0, 10.9]])
y_pred = knn.predict(X_new)
# Print the predictions for X_new
print("Predictions: {}".format(y_pred))# Import the module
from sklearn.model_selection import train_test_split
X = churn_df.drop("churn", axis=1).values
y = churn_df["churn"].values
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42, stratify= y)
knn = KNeighborsClassifier(n_neighbors=5)
# Fit the classifier to the training data
knn.fit(X_train, y_train)
# Print the accuracy
print(knn.score(X_test, y_test))
# Create neighbors
neighbors = np.arange(1, 13)
train_accuracies = {}
test_accuracies = {}
for neighbor in neighbors:
# Set up a KNN Classifier
knn = KNeighborsClassifier(n_neighbors = neighbor)
# Fit the model
knn.fit(X_train, y_train)
# Compute accuracy
train_accuracies[neighbor] = knn.score(X_train, y_train)
test_accuracies[neighbor] = knn.score(X_test, y_test)
print(neighbors, '\n', train_accuracies, '\n', test_accuracies)
# Add a title
plt.title("KNN: Varying Number of Neighbors")
# Plot training accuracies
plt.plot(neighbors, train_accuracies.values(), label="Training Accuracy")
# Plot test accuracies
plt.plot(neighbors, test_accuracies.values(), label="Testing Accuracy")
plt.legend()
plt.xlabel("Number of Neighbors")
plt.ylabel("Accuracy")
# Display the plot
plt.show()import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('datasets/diabetes_clean.csv')
Xdf = df.drop("glucose", axis=1)
X = Xdf.values
Xdrop = X[:, 4]
y = df["glucose"].values
print(Xdrop.shape)
Xdrop = Xdrop.reshape(-1, 1)
print(y.shape)
# y = y.reshape(-1, 1)
print(Xdrop.shape)
print(Xdf.info())
plt.scatter (Xdrop, y)
plt.show()import numpy as np
# Create X from the radio column's values
X = sales_df["radio"].values
# Create y from the sales column's values
y = sales_df["sales"].values
# Reshape X
X = X.reshape(-1,1)
# Check the shape of the features and targets
print(X.shape, y.shape)
# Import LinearRegression
from sklearn.linear_model import LinearRegression
# Create the model
reg = LinearRegression()
# Fit the model to the data
reg.fit(X, y)
# Make predictions
predictions = reg.predict(X)
print(predictions[:5])
# Import matplotlib.pyplot
import matplotlib.pyplot as plt
# Create scatter plot
plt.scatter(X, y, color="blue")
# Create line plot
plt.plot(X, predictions, color="red")
plt.xlabel("Radio Expenditure ($)")
plt.ylabel("Sales ($)")
# Display the plot
plt.show()# Create X and y arrays
X = sales_df.drop("sales", axis=1).values
y = sales_df["sales"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Instantiate the model
reg = LinearRegression()
# Fit the model to the data
reg.fit(X_train, y_train)
# Make predictions
y_pred = reg.predict(X_test)
print("Predictions: {}, Actual Values: {}".format(y_pred[:2], y_test[:2]))
# Import mean_squared_error
from sklearn.metrics import mean_squared_error
# Compute R-squared
r_squared = reg.score(X_test, y_test)
# Compute RMSE
rmse = mean_squared_error(y_test, y_pred, squared = False)
# Print the metrics
print("R^2: {}".format(r_squared))
print("RMSE: {}".format(rmse))# Import the necessary modules
from sklearn.model_selection import cross_val_score, KFold
# Create a KFold object
kf = KFold(n_splits = 6, shuffle=True, random_state = 5)
reg = LinearRegression()
# Compute 6-fold cross-validation scores
cv_scores = cross_val_score(reg, X, y, cv=kf)
# Print scores
print(cv_scores)
# Print the mean
print(np.mean(cv_results))
# Print the standard deviation
print(np.std(cv_results))
# Print the 95% confidence interval
print(np.quantile(cv_results, [0.025, 0.975]))# Import Ridge
from sklearn.linear_model import Ridge
alphas = [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
ridge_scores = []
for alpha in alphas:
# Create a Ridge regression model
ridge = Ridge(alpha=alpha)
# Fit the data
ridge.fit(X_train, y_train)
# Obtain R-squared
score = ridge.score(X_test, y_test)
ridge_scores.append(score)
print(ridge_scores)# Import Lasso
from sklearn.linear_model import Lasso
# Instantiate a lasso regression model
lasso = Lasso(alpha = 0.3)
# Fit the model to the data
lassom = lasso.fit(X, y)
# Compute and print the coefficients
lasso_coef = lassom.coef_
print(lasso_coef)
plt.bar(sales_columns, lasso_coef)
plt.xticks(rotation=45)
plt.show()