Skip to content
Course Notes: Cluster Analysis in Python
Cluster Analysis
Hierarchical Clustering
Linkage method computes distances between intermediate clusters.
Fcluster method generates clusters and assigns associated cluster labels to a new column in the DataFrame
Dendrograms are branching diagrams that show the merging of clusters as we move through the distance matrix.
# Import the fcluster and linkage functions
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
import timeit
# Use the linkage() function
%timeit distance_matrix = linkage(comic_con[['x_scaled', 'y_scaled']], method = 'ward', metric = 'euclidean')
# Assign cluster labels
comic_con['cluster_labels'] = fcluster(distance_matrix, 2, criterion='maxclust')
# Plot clusters
sns.scatterplot(x='x_scaled', y='y_scaled',
hue='cluster_labels', data = comic_con)
plt.show()
# Create a dendrogram
dn = dendrogram(distance_matrix)
# Display the dendogram
plt.show()
KMeans Clustering
The centroids of the clusters are computed using kmeans and cluster assignments for each point are done through vq.
Use elbow method to find the right number of clusters.
# Import random class
from numpy import random
# Initialize seed
random.seed(0)
# Import the kmeans and vq functions
from scipy.cluster.vq import kmeans, vq
# Generate cluster centers
cluster_centers, distortion = kmeans(comic_con[['x_scaled','y_scaled']], 2, iter=20, thres=0.00001, check_finite=True)
# Assign cluster labels
comic_con['cluster_labels'], distortion_list = vq(comic_con[['x_scaled','y_scaled']], cluster_centers, check_finite=True)
# Plot clusters
sns.scatterplot(x='x_scaled', y='y_scaled',
hue='cluster_labels', data = comic_con)
plt.show()
distortions = []
num_clusters = range(1, 7)
# Create a list of distortions from the kmeans function
for i in num_clusters:
cluster_centers, distortion = kmeans(comic_con[['x_scaled','y_scaled']], i, check_finite=True)
distortions.append(distortion)
# Create a DataFrame with two lists - num_clusters, distortions
elbow_plot = pd.DataFrame({'num_clusters': num_clusters, 'distortions': distortions})
# Creat a line plot of num_clusters and distortions
sns.lineplot(x='num_clusters', y='distortions', data = elbow_plot)
plt.xticks(num_clusters)
plt.show()
Normalization
Setting the data into having standard deviation of 1
# Import the whiten function
from scipy.cluster.vq import whiten
goals_for = [4,3,2,3,1,1,2,0,1,4]
# Use the whiten() function to standardize the data
scaled_data = whiten(goals_for)
print(scaled_data)
Finding Dominant Colors
# Import image class of matplotlib
import matplotlib.image as img
# Read batman image and print dimensions
batman_image = img.imread('batman.jpg')
print(batman_image.shape)
# Store RGB values of all pixels in lists r, g and b
for row in batman_image:
for temp_r, temp_g, temp_b in row:
r.append(temp_r)
g.append(temp_g)
b.append(temp_b)
distortions = []
num_clusters = range(1, 7)
# Create a list of distortions from the kmeans function
for i in num_clusters:
cluster_centers, distortion = kmeans(batman_df[['scaled_red','scaled_blue','scaled_green']],i)
distortions.append(distortion)
# Create a DataFrame with two lists, num_clusters and distortions
elbow_plot = pd.DataFrame({"num_clusters":num_clusters, "distortions":distortions})
# Create a line plot of num_clusters and distortions
sns.lineplot(x='num_clusters', y='distortions', data = elbow_plot)
plt.xticks(num_clusters)
plt.show()
# Get standard deviations of each color
r_std, g_std, b_std = batman_df[['red', 'green', 'blue']].std()
for cluster_center in cluster_centers:
scaled_r, scaled_g, scaled_b = cluster_center
# Convert each standardized value to scaled value
colors.append((
scaled_r * r_std / 255,
scaled_g * g_std / 255,
scaled_b * b_std / 255
))
# Display colors of cluster centers
plt.imshow([colors])
plt.show()
Document Clustering
# Import TfidfVectorizer class from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.75, min_df=0.1, max_features=50, tokenizer=remove_noise)
# Use the .fit_transform() method on the list plots
tfidf_matrix = tfidf_vectorizer.fit_transform(plots)
num_clusters = 2
# Generate cluster centers through the kmeans function
cluster_centers, distortion = kmeans(tfidf_matrix.todense(), num_clusters)
# Generate terms from the tfidf_vectorizer object
terms = tfidf_vectorizer.get_feature_names_out()
for i in range(num_clusters):
# Sort the terms and print top 3 terms
center_terms = dict(zip(terms, list(cluster_centers[i])))
sorted_terms = sorted(center_terms, key=center_terms.get, reverse=True)
print(sorted_terms[:3])