Skip to content
Find and Visualize clusters with K-Means
Find and Visualize clusters with K-Means
This template helps you identify and visualize clusters in your data. The K-Means algorithm aims to assign the data points in your dataset to K distinct clusters. After running the algorithm, each observation (data point) will belong to the cluster whose center it is closest to.
# Load packages
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
# Apply seaborn's 'darkgrid' plot style
sns.set_style('darkgrid')
# IPython/Jupyter magic (not plain Python syntax): sets the inline backend's
# figure format to 'retina'. This line only runs inside a notebook/IPython session.
%config InlineBackend.figure_format = 'retina'
# Upload your data as a csv file and read it into a data frame
df = pd.read_csv('penguins.csv')
# Discard every row that contains a missing value
df = df.dropna()
# Preview the first rows of the cleaned data
df.head()
# Visualize the pairwise relationships in your data and identify variables for further analysis
g = sns.PairGrid(data=df)
# Draw a scatter plot in every cell of the grid; map() hands the grid back
g = g.map(sns.scatterplot)
# Choose your variable names (columns of df to cluster on)
feature_columns = ['bill_length_mm',
                   'bill_depth_mm']
# Build the feature matrix with one row per observation and one column per
# chosen variable. The column selection is already two-dimensional, so no
# reshape is required; a hard-coded reshape(-1, 2) would silently scramble the
# rows as soon as a different number of variables is selected.
X = df[feature_columns].to_numpy()
# Determine optimal cluster number with elbow method
# Candidate cluster counts: 1..10, capped at the number of observations
# because K-Means cannot form more clusters than there are samples.
# The same sequence drives both the fits and the x axis of the plot.
k_values = range(1, min(10, len(X)) + 1)
wcss = []  # One within-cluster sum of squares per candidate cluster count
for i in k_values:
    model = KMeans(n_clusters=i,
                   init='k-means++',  # Initialization method for kmeans
                   max_iter=300,      # Maximum number of iterations
                   n_init=10,         # Choose how often algorithm will run with different centroid seeds
                   random_state=0)    # Choose random state for reproducibility
    model.fit(X)
    # inertia_ holds the fitted model's within-cluster sum of squares
    wcss.append(model.inertia_)
# Show Elbow plot
plt.plot(k_values, wcss)
plt.title('Elbow Method')  # Set plot title
plt.xlabel('Number of clusters')  # Set x axis name
plt.ylabel('Within Cluster Sum of Squares (WCSS)')  # Set y axis name
plt.show()
# Configure the final K-Means model
kmeans = KMeans(
    n_clusters=3,        # Set amount of clusters
    init='k-means++',    # Initialization method for kmeans
    max_iter=300,        # Maximum number of iterations
    n_init=10,           # Choose how often algorithm will run with different centroid seeds
    random_state=0,      # Choose random state for reproducibility
)
# Fit the model, then read the cluster index assigned to each observation
kmeans.fit(X)
pred_y = kmeans.labels_
# Plot the data, coloring every observation by its assigned cluster so the
# grouping found by K-Means is actually visible (the first two columns of X
# are used as the x and y coordinates)
plt.scatter(X[:, 0],
            X[:, 1],
            c=pred_y,        # Cluster label of each observation
            cmap='viridis')  # Colormap used to tell the clusters apart
# Plot the cluster centers on top of the data
plt.scatter(kmeans.cluster_centers_[:, 0],
            kmeans.cluster_centers_[:, 1],
            s=200,     # Set centroid size
            c='red')   # Set centroid color
plt.show()