Skip to content
Supervised Learning with scikit-learn
Supervised Learning with scikit-learn
Run the hidden code cell below to import the data used in this course.
# Importing pandas
import pandas as pd
# Importing the course datasets.
# Each CSV is read into its own DataFrame; the paths are relative to the
# current working directory.
diabetes = pd.read_csv('datasets/diabetes_clean.csv')
music = pd.read_csv('datasets/music_clean.csv')
advertising = pd.read_csv('datasets/advertising_and_sales_clean.csv')
telecom = pd.read_csv("datasets/telecom_churn_clean.csv")

# ## Regression
- View the Diabetes DataFrame
# Preview the first rows of the diabetes data.
print(diabetes.head())
# diabetes = diabetes.dropna(subset = ["bmi"])

# Feature matrix: every column except the target. Target: blood glucose.
x = diabetes.drop("glucose", axis=1).values
y = diabetes["glucose"].values

# Use BMI as the single predictor. It is selected by column name rather than
# by a hard-coded position in `x`, so the choice survives any reordering of
# the CSV columns.
x_bmi = diabetes["bmi"].values
print('x_bmi old shape : ', x_bmi.shape)

# scikit-learn estimators expect a 2-D feature array, so turn the 1-D vector
# into a single-column matrix.
x_bmi = x_bmi.reshape(-1, 1)
print('y Shape : ', y.shape)
print('x_bmi new shape : ', x_bmi.shape)

# - Plotting Glucose vs Body mass
import matplotlib.pyplot as plt

# Scatter plot of the target (blood glucose) against the single BMI feature.
plt.scatter(x_bmi, y)
plt.xlabel("Body Mass Index")
plt.ylabel("Blood Glucose (mg/dl)")
plt.show()

# 3. Fitting a regression model
from sklearn.linear_model import LinearRegression

# Fit a linear regression of glucose on BMI using all observations, then
# predict on the same inputs so the fitted line can be drawn over the data.
reg = LinearRegression()
reg.fit(x_bmi, y)
prediction = reg.predict(x_bmi)

# Observations as points, model predictions as a line.
plt.scatter(x_bmi, y)
plt.plot(x_bmi, prediction)
plt.xlabel("Body Mass Index")
plt.ylabel("Blood Glucose (mg/dl)")
plt.show()

# 4. Linear Regression with Train data & Mean Squared Error
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_bmi, y, test_size=0.3, random_state=42
)

# Fit on the training rows only, then predict the held-out rows.
reg_all = LinearRegression()
reg_all.fit(x_train, y_train)
y_predict = reg_all.predict(x_test)

# Root mean squared error on the test set. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error is
# deprecated in recent scikit-learn releases and later removed.
print(np.sqrt(mean_squared_error(y_test, y_predict)))

# 5. Cross Validation
- Model performance depends on the way we split the data
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Six shuffled folds; the fixed seed makes the fold assignment reproducible.
kf = KFold(n_splits=6, shuffle=True, random_state=5)

# Note: this rebinds `reg` to a fresh, unfitted model and evaluates it on the
# full feature matrix `x` (all columns except glucose), not just `x_bmi`.
reg = LinearRegression()

# One score per fold. With no `scoring` argument, cross_val_score uses the
# estimator's own `score` method (R^2 for LinearRegression).
cv_result = cross_val_score(reg, x, y, cv=kf)
print(cv_result)

# Calculate and print the mean & standard deviation of the results.
print(np.mean(cv_result), np.std(cv_result))

# Display the 95% interval of the results, using the 2.5th and 97.5th
# percentiles of the fold scores.
print(np.quantile(cv_result, [0.025, 0.975]))

# Confusion Matrix