Skip to content

Supervised Learning with scikit-learn

Run the hidden code cell below to import the data used in this course.

# Importing pandas
import pandas as pd

# Load each cleaned course dataset into its own DataFrame, in a fixed order.
diabetes, music, advertising, telecom = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/diabetes_clean.csv',
        'datasets/music_clean.csv',
        'datasets/advertising_and_sales_clean.csv',
        "datasets/telecom_churn_clean.csv",
    )
)

## Regression

  1. View the Diabetes DataFrame
# Preview the first rows of the diabetes data.
print(diabetes.head())
# NOTE(review): rows with a missing "bmi" are not dropped here (a dropna on
# "bmi" was previously disabled) - presumably the *_clean file has no gaps;
# confirm before relying on it.
# Features are every column except the regression target, "glucose".
features = diabetes.drop("glucose", axis=1)
x = features.values
y = diabetes["glucose"].values
# Locate the BMI column by name instead of a hard-coded position, so the
# selection stays correct if the column order of the CSV ever changes.
x_bmi = x[:, features.columns.get_loc("bmi")]
print('x_bmi old shape : ', x_bmi.shape)
# Turn the 1-D BMI vector into a single-column 2-D array, the layout the
# estimator's fit/predict calls below are given.
x_bmi = x_bmi.reshape(-1, 1)
print('y Shape : ', y.shape)
print('x_bmi new shape : ', x_bmi.shape)
  2. Plotting Glucose vs. Body Mass Index
import matplotlib.pyplot as plt 
# Scatter blood glucose against BMI on the current Axes.
ax = plt.gca()
ax.scatter(x_bmi, y)
ax.set_xlabel("Body Mass Index")
ax.set_ylabel("Blood Glucose (mg/dl)")
plt.show()

3. Fitting a regression model

from sklearn.linear_model import LinearRegression

# Fit a linear model on BMI alone; fit() returns the estimator itself.
reg = LinearRegression().fit(x_bmi, y)
# Predict on the same points the model was fitted on, to draw the fitted line.
prediction = reg.predict(x_bmi)

# Overlay the fitted line on the raw observations.
ax = plt.gca()
ax.scatter(x_bmi, y)
ax.plot(x_bmi, prediction)
ax.set_xlabel("Body Mass Index")
ax.set_ylabel("Blood Glucose (mg/dl)")
plt.show()

4. Linear Regression with Training Data & Root Mean Squared Error

from sklearn.model_selection import train_test_split 

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_bmi, y, test_size=0.3, random_state=42
)
reg_all = LinearRegression()
reg_all.fit(x_train, y_train)
y_predict = reg_all.predict(x_test)

from sklearn.metrics import mean_squared_error

# Root mean squared error on the held-out set. The `squared=False` keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, so take the square
# root of the MSE explicitly; this works on every scikit-learn version and
# gives the same value for a single-output target.
rmse = mean_squared_error(y_test, y_predict) ** 0.5
print(rmse)

5. Cross Validation

  • Model performance depends on the way we split the data, so we evaluate the model across several different splits
from sklearn.model_selection import cross_val_score, KFold 
import numpy as np

# Six shuffled folds with a fixed seed so the fold assignment is repeatable.
kf = KFold(n_splits=6, shuffle=True, random_state=5)
reg = LinearRegression()
# One score per fold, computed on all features in x with the estimator's
# default score method.
cv_result = cross_val_score(estimator=reg, X=x, y=y, cv=kf)
print(cv_result)

# Mean and standard deviation of the per-fold scores.
print(cv_result.mean(), cv_result.std())

# Display the 95% interval of the results using the empirical 2.5% and 97.5%
# quantiles of the fold scores.
print(np.quantile(cv_result, [0.025, 0.975]))

Confusion Matrix