Basketball Analytics: Clustering Players by Performance Metrics
In the realm of professional basketball analysis, particularly focusing on the National Basketball Association (NBA), there's a keen interest in dissecting and comprehending the nuances of player performances. As part of a sports analytics squad, your mission is to delve into a detailed clustering study, leveraging hierarchical clustering techniques, tidyverse function writing skills, and data visualization to unearth patterns among players based on key performance metrics.
Basketball, a global sport teeming with rich and varied player data, offers a goldmine of insights into individual skills and overall team dynamics. To channel your analysis effectively, you've chosen to narrow down on a dataset created using overall season statistics from Sports Reference for 50 NBA players encapsulating a range of performance indicators:
nba_players_2023.csv encompasses specific performance metrics for each player, outlined as follows:
| Variable | Description |
|---|---|
name | The player's full name |
age | The player's age during the season |
minutes_per_game | Average duration of play per game in minutes |
rebounds_per_game | Average number of rebounds per game |
assists_per_game | Average number of assists per game |
points_per_game | Average number of points scored per game |
# Import packages
library(readr)
library(dendextend)
library(tibble)
library(dplyr)
library(purrr)
library(cluster)
library(ggplot2)
library(tidyverse)
library(janitor)# Read data
nba <- read_csv("nba_players_2023.csv", show_col_types = FALSE)
glimpse(nba)# Normalise the data and create a new col for combined points, rebounds and assists per player
colnames(nba)
nba_scaled <- nba %>%
column_to_rownames("name") %>%
mutate(pra_per_game = points_per_game + rebounds_per_game + assists_per_game) %>%
select(age, minutes_played_per_game, rebounds_per_game, assists_per_game, points_per_game, pra_per_game) %>%
scale()
# Determining number of optimal clusters with elbow plot
# Compute WSS for k = 1 to 10
wss_df <- tibble(k = 1:10) %>%
mutate(wss = map_dbl(k, ~kmeans(nba_scaled, centers = .x, nstart = 25)$tot.withinss))# Plotting elbow plot
ggplot(wss_df, aes(x = k, y = wss)) +
geom_line(color = "steelblue", linewidth = 1) + # use linewidth instead of size
geom_point(color = "darkred", size = 2) +
labs(title = "Elbow Method for Optimal K",
x = "Number of Clusters (k)",
y = "Total Within-Cluster Sum of Squares (WSS)") +
theme_minimal()
# Identify the optimal cluster number
num_clusters <- 2 #look at the elbow plot above. Curve bends at k =2 and after that the curve flattens
# Calculating distance and clustering hierarchically
# Finding euclidean distance between players
dist_matrix <- dist(nba_scaled, method = "euclidean")
# Apply hierarchical clustering with average linkage
hc_avg <- hclust(dist_matrix, method = "average")
# Creating and visualizing dendrograms
library(dendextend)
# Convert hclust result to a dendrogram object
dend <- as.dendrogram(hc_avg)
# Color the branches by cluster
dend_colored <- color_branches(dend, k = num_clusters) # num_clusters = 2
# Plot the colored dendrogram
plot(dend_colored,
main = "NBA player Dendrogram (Average linkage)",
ylab = "Height",
cex = 0.6)
# Assigning clusters and segmenting data
num_clusters <- 2 # confirmed from dendrogram
hc_clusters <- cutree(hc_avg, k = num_clusters)
# Segmenting players based on clusters
nba_segmented <- nba %>%
mutate(cluster = factor(hc_clusters[match(name, rownames(nba_scaled))]))# Analyzing clusters and specifying the features of strongest influence
library(dplyr)
colnames(nba_segmented) # pra_per_game is missing. So, I'm adding it
nba_segmented <- nba_segmented %>%
mutate(pra_per_game = points_per_game + rebounds_per_game + assists_per_game)
# Summary stats for each cluster and each variable
summary_stats <- nba_segmented %>%
group_by(cluster) %>%
summarise (
age_mean = mean(age), age_sd = sd(age), age_median = median(age), age_min = min(age), age_max = max(age),
min_mean = mean(minutes_played_per_game), min_sd = sd(minutes_played_per_game), min_median = median(minutes_played_per_game), min_min = min(minutes_played_per_game), min_max = max(minutes_played_per_game),
reb_mean = mean(rebounds_per_game), reb_sd = sd(rebounds_per_game), reb_median = median(rebounds_per_game), reb_min = min(rebounds_per_game), reb_max = max(rebounds_per_game),
ast_mean = mean(assists_per_game), ast_sd = sd(assists_per_game), ast_median = median(assists_per_game), ast_min = min(assists_per_game), ast_max = max(assists_per_game),
pts_mean = mean(points_per_game), pts_sd = sd(points_per_game), pts_median = median(points_per_game), pts_min = min(points_per_game), pts_max = max(points_per_game),
pra_mean = mean(pra_per_game), pra_sd = sd(pra_per_game), pra_median = median(pra_per_game), pra_min = min(pra_per_game), pra_max = max(pra_per_game),
.groups = "drop"
)
summary_stats
# Identifying the most influential variables
# Check min/max for each cluster
range_summary <- nba_segmented %>%
group_by(cluster) %>%
summarise(
age_min = min(age), age_max = max(age),
min_min = min(minutes_played_per_game), min_max = max(minutes_played_per_game),
reb_min = min(rebounds_per_game), reb_max = max(rebounds_per_game),
ast_min = min(assists_per_game), ast_max = max(assists_per_game),
pts_min = min(points_per_game), pts_max = max(points_per_game),
pra_min = min(pra_per_game), pra_max = max(pra_per_game),
.groups = "drop"
)
range_summary
strongest_influence <- c("minutes_played_per_game", "points_per_game", "pra_per_game")