# Wine Quality Prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
import time
# Read the dataset; the file uses ';' as the field separator.
data = pd.read_csv('winequality-white.csv', sep=';')
# Quick sanity checks: first rows, summary statistics, column dtypes / non-null counts.
display(data.head())
display(data.describe())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
data.info()
# Separate the label column ('quality') from the predictor columns.
y = data['quality']
X = data.drop(columns=['quality'])
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train_data, test_data, train_targets, test_targets = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=55,
)
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scaler = StandardScaler()
scaler.fit(train_data)
train_data = scaler.transform(train_data)
test_data = scaler.transform(test_data)
# Start the timer: the measured span covers the grid search, the prediction on
# the test split and the metric computations below.
start_time = time.perf_counter()

# Base estimator whose hyper-parameters are tuned by the grid search.
kNC = KNeighborsClassifier(n_jobs=-1)

# 4-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=4, shuffle=True, random_state=55)

# Search space: k = 1..30, uniform vs. distance weighting, and the Minkowski
# power parameter p (1 or 2).
param_grid = dict(
    n_neighbors=np.arange(1, 31),
    weights=["uniform", "distance"],
    p=[1, 2],
)
grid = GridSearchCV(estimator=kNC, param_grid=param_grid, cv=kf, n_jobs=-1)

# Run the search on the training split.
grid.fit(train_data, train_targets)
best_params = grid.best_params_
# NOTE: best_score_ is the mean cross-validated score of the best parameter
# combination, not the accuracy measured on the full training set.
train_accuracy = grid.best_score_

# Evaluate the best estimator found by the search on the held-out split.
test_predictions = grid.best_estimator_.predict(test_data)
cm = confusion_matrix(test_targets, test_predictions)
classification_rep = classification_report(test_targets, test_predictions)
mse = mean_squared_error(test_targets, test_predictions)
test_accuracy = accuracy_score(test_targets, test_predictions)

# Stop the timer.
end_time = time.perf_counter()
execution_time = end_time - start_time
# Raise the rendering / saving resolution so the figures stay legible.
plt.rcParams['savefig.dpi'] = 500
plt.rcParams['figure.dpi'] = 500

# Class labels for the axes, taken from the data instead of assuming the
# classes are the contiguous range starting at 3. The sorted union of the true
# and predicted labels is the row/column order confusion_matrix uses when no
# explicit `labels` argument is given, so the ticks stay aligned with `cm`
# even when a rare quality class is absent from the test split.
class_labels = np.union1d(test_targets, test_predictions)

# Confusion-matrix heatmap. fmt='d' prints the integer counts in full; the
# default '.2g' format would show counts >= 100 in scientific notation.
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=class_labels,
    yticklabels=class_labels,
    linewidths=1,
    linecolor='white',
    cmap="coolwarm",
)
# Rows of the confusion matrix are the true classes, columns the predictions.
plt.xlabel('predicted quality', size=15)
plt.ylabel('true quality', size=15)
plt.title('Confusion Matrix (KNN)', size=20)
plt.show()

# Per-class sample counts: column sums are the number of predictions per
# class, row sums the number of true samples per class.
plt.bar(x=class_labels, height=cm.sum(axis=0), color='blue', alpha=0.5, label='prediction')
plt.bar(x=class_labels, height=cm.sum(axis=1), color='green', alpha=0.5, label='targets')
plt.xlabel('wine quality class', size=15)
plt.ylabel('samples', size=15)
plt.legend(fontsize='x-large')
plt.title('Prediction Result (KNN)', size=20)
plt.show()

# Display results (as the last expression of a notebook cell, this tuple is echoed).
best_params, train_accuracy, test_accuracy, mse, classification_rep, cm, execution_time