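# k-nearest-neighbours classification of white-wine quality (winequality-white.csv), tuned with a cross-validated grid search.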
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
import time

# Load the white-wine quality dataset; the file uses ';' as its field separator.
data = pd.read_csv('winequality-white.csv', sep=';')

# Quick sanity check of the loaded frame.
# NOTE: `display` is supplied by IPython/Jupyter; it is not defined in a plain
# Python interpreter.
display(data.head())
display(data.describe())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in display() (which would emit a stray
# "None" after the summary).
data.info()

# Separate the feature columns from the target column ('quality').
X = data.drop(['quality'], axis=1)
y = data['quality']

# Hold out 30% of the samples for testing; the seed is fixed for repeatability.
train_data, test_data, train_targets, test_targets = train_test_split(
    X, y, test_size=0.3, random_state=55
)

# Standardize the features: the scaler is fitted on the training split only
# and the same transformation is then applied to the test split.
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

# Everything from here up to the confusion matrix is included in the timing.
start_time = time.perf_counter()

# Hyperparameter search space: neighbour count 1..30, both weighting schemes,
# and Minkowski power p in {1, 2}.
param_grid = dict(
    n_neighbors=np.arange(1, 31),
    weights=["uniform", "distance"],
    p=[1, 2],
)

# 4-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=4, shuffle=True, random_state=55)

# Base estimator and the grid search wrapped around it.
kNC = KNeighborsClassifier(n_jobs=-1)
grid = GridSearchCV(estimator=kNC, param_grid=param_grid, cv=kf, n_jobs=-1)

# Run the search on the training split and keep the winning configuration.
# `train_accuracy` stores grid.best_score_.
grid.fit(train_data, train_targets)
best_params, train_accuracy = grid.best_params_, grid.best_score_

# Score the best estimator on the held-out test split.
test_predictions = grid.best_estimator_.predict(test_data)
cm = confusion_matrix(y_true=test_targets, y_pred=test_predictions)
test_accuracy = accuracy_score(y_true=test_targets, y_pred=test_predictions)
mse = mean_squared_error(y_true=test_targets, y_pred=test_predictions)
classification_rep = classification_report(
    y_true=test_targets, y_pred=test_predictions
)

# Stop the timer.
end_time = time.perf_counter()
execution_time = end_time - start_time

# Raise the figure resolution so the plots stay legible on screen and on disk.
plt.rcParams['savefig.dpi'] = 500
plt.rcParams['figure.dpi'] = 500

# Class labels for the plot axes. confusion_matrix() (called without `labels`)
# orders its rows/columns by the sorted union of the true and predicted
# labels, so the same union is used here instead of assuming the classes are
# the contiguous range starting at 3.
class_labels = np.union1d(test_targets, test_predictions)

# Confusion-matrix heatmap. fmt='d' prints the integer counts as-is; the
# default annotation format would show counts >= 100 in scientific notation.
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=class_labels,
    yticklabels=class_labels,
    linewidths=1,
    linecolor='white',
    cmap="coolwarm",
)
plt.xlabel('predicted class', size=15)
plt.ylabel('true class', size=15)
plt.title('Confusion Matrix (KNN)', size=20)
plt.show()

# Per-class sample counts: column sums of the confusion matrix are the number
# of predictions per class, row sums are the number of true targets per class.
plt.bar(x=class_labels, height=cm.sum(axis=0), color='blue', alpha=0.5, label='prediction')
plt.bar(x=class_labels, height=cm.sum(axis=1), color='green', alpha=0.5, label='targets')
plt.xlabel('wine quality class', size=15)
plt.ylabel('samples', size=15)
plt.legend(fontsize='x-large')
plt.title('Prediction Result (KNN)', size=20)
plt.show()

# Report the results explicitly. A bare expression at the end of a script has
# no visible effect, and the multi-line classification report is unreadable
# when shown as part of a tuple repr.
print(f"Best parameters: {best_params}")
print(f"Best grid-search score (grid.best_score_): {train_accuracy:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")
print(f"Test mean squared error: {mse:.4f}")
print(f"Execution time: {execution_time:.2f} s")
print("Classification report:")
print(classification_rep)
print("Confusion matrix:")
print(cm)