Skip to content

Basketball Analytics: Clustering Players by Performance Metrics

In the realm of professional basketball analysis, particularly focusing on the National Basketball Association (NBA), there's a keen interest in dissecting and comprehending the nuances of player performances. As part of a sports analytics squad, your mission is to delve into a detailed clustering study, leveraging hierarchical clustering techniques, tidyverse function writing skills, and data visualization to unearth patterns among players based on key performance metrics.

Basketball, a global sport teeming with rich and varied player data, offers a goldmine of insights into individual skills and overall team dynamics. To channel your analysis effectively, you've chosen to narrow down on a dataset created using overall season statistics from Sports Reference for 50 NBA players encapsulating a range of performance indicators:

nba_players_2023.csv encompasses specific performance metrics for each player, outlined as follows:

VariableDescription
nameThe player's full name
ageThe player's age during the season
minutes_per_gameAverage duration of play per game in minutes
rebounds_per_gameAverage number of rebounds per game
assists_per_gameAverage number of assists per game
points_per_gameAverage number of points scored per game
# Import packages
library(readr)
library(dendextend)
library(tibble)
library(dplyr)
library(purrr)
library(cluster)
library(ggplot2)
library(tidyverse)
library(janitor)
# Read data 
nba <- read_csv("nba_players_2023.csv", show_col_types = FALSE)
glimpse(nba)
# Normalise the data and create a new col for combined points, rebounds and assists per player
colnames(nba)

nba_scaled <- nba %>%
  column_to_rownames("name") %>%
  mutate(pra_per_game = points_per_game + rebounds_per_game + assists_per_game) %>%
  select(age, minutes_played_per_game, rebounds_per_game, assists_per_game, points_per_game, pra_per_game) %>%
  scale()
# Determining number of optimal clusters with elbow plot
# Compute WSS for k = 1 to 10
wss_df <- tibble(k = 1:10) %>%
  mutate(wss = map_dbl(k, ~kmeans(nba_scaled, centers = .x, nstart = 25)$tot.withinss))
# Plotting elbow plot
ggplot(wss_df, aes(x = k, y = wss)) +
  geom_line(color = "steelblue", linewidth = 1) +  # use linewidth instead of size
  geom_point(color = "darkred", size = 2) +
  labs(title = "Elbow Method for Optimal K",
       x = "Number of Clusters (k)",
       y = "Total Within-Cluster Sum of Squares (WSS)") +
  theme_minimal()

# Identify the optimal cluster number 
num_clusters <- 2 #look at the elbow plot above. Curve bends at k =2 and after that the curve flattens

# Calculating distance and clustering hierarchically
# Finding euclidean distance between players 
dist_matrix <- dist(nba_scaled, method = "euclidean")

# Apply hierarchical clustering with average linkage
hc_avg <- hclust(dist_matrix, method = "average")

# Creating and visualizing dendrograms
library(dendextend)

# Convert hclust result to a dendrogram object
dend <- as.dendrogram(hc_avg)

# Color the branches by cluster
dend_colored <- color_branches(dend, k = num_clusters)  # num_clusters = 2 

# Plot the colored dendrogram
plot(dend_colored,
     main = "NBA player Dendrogram (Average linkage)",
     ylab = "Height",
     cex = 0.6)



# Assigning clusters and segmenting data
num_clusters <- 2  # confirmed from dendrogram
hc_clusters <- cutree(hc_avg, k = num_clusters)

# Segmenting players based on clusters
nba_segmented <- nba %>%
  mutate(cluster = factor(hc_clusters[match(name, rownames(nba_scaled))]))
# Analyzing clusters and specifying the features of strongest influence
library(dplyr)

colnames(nba_segmented) # pra_per_game is missing. So, I'm adding it

nba_segmented <- nba_segmented %>%
  mutate(pra_per_game = points_per_game + rebounds_per_game + assists_per_game)

# Summary stats for each cluster and each variable
summary_stats <- nba_segmented %>%
  group_by(cluster) %>%
  summarise (
    age_mean = mean(age), age_sd = sd(age), age_median = median(age), age_min = min(age), age_max = max(age),
    min_mean = mean(minutes_played_per_game), min_sd = sd(minutes_played_per_game), min_median = median(minutes_played_per_game), min_min = min(minutes_played_per_game), min_max = max(minutes_played_per_game),
    reb_mean = mean(rebounds_per_game), reb_sd = sd(rebounds_per_game), reb_median = median(rebounds_per_game), reb_min = min(rebounds_per_game), reb_max = max(rebounds_per_game),
    ast_mean = mean(assists_per_game), ast_sd = sd(assists_per_game), ast_median = median(assists_per_game), ast_min = min(assists_per_game), ast_max = max(assists_per_game),
    pts_mean = mean(points_per_game), pts_sd = sd(points_per_game), pts_median = median(points_per_game), pts_min = min(points_per_game), pts_max = max(points_per_game),
    pra_mean = mean(pra_per_game), pra_sd = sd(pra_per_game), pra_median = median(pra_per_game), pra_min = min(pra_per_game), pra_max = max(pra_per_game),
    .groups = "drop"
  )

summary_stats

# Identifying the most influential variables
# Check min/max for each cluster
range_summary <- nba_segmented %>%
  group_by(cluster) %>%
  summarise(
    age_min = min(age), age_max = max(age),
    min_min = min(minutes_played_per_game), min_max = max(minutes_played_per_game),
    reb_min = min(rebounds_per_game), reb_max = max(rebounds_per_game),
    ast_min = min(assists_per_game), ast_max = max(assists_per_game),
    pts_min = min(points_per_game), pts_max = max(points_per_game),
    pra_min = min(pra_per_game), pra_max = max(pra_per_game),
    .groups = "drop"
  )

range_summary

strongest_influence <- c("minutes_played_per_game", "points_per_game", "pra_per_game")