Skip to content

Compare Baseball Player Statistics using Visualizations

This is Aaron Judge. Judge is one of the physically largest players in Major League Baseball, standing 6 feet 7 inches (2.01 m) tall and weighing 282 pounds (128 kg). He also hit one of the hardest home runs ever recorded. How do we know this? Statcast.

Statcast is a state-of-the-art tracking system that uses high-resolution cameras and radar equipment to measure the precise location and movement of baseballs and baseball players. Introduced in 2015 to all 30 major league ballparks, Statcast data is revolutionizing the game. Teams are engaging in an "arms race" of data analysis, hiring analysts left and right in an attempt to gain an edge over their competition.

In this project, you're going to wrangle, analyze, and visualize Statcast historical data to compare Mr. Judge and another (extremely large) teammate of his, Giancarlo Stanton. They are similar in a lot of ways, one being that they hit a lot of home runs. Stanton and Judge led baseball in home runs in 2017, with 59 and 52, respectively. These are exceptional totals - the player in third "only" had 45 home runs.

Stanton and Judge are also different in many ways. Let's find out how they compare!

The Data

There are two CSV files, judge.csv and stanton.csv, both of which contain Statcast data for 2015-2017. Each row represents one pitch thrown to a batter.

Custom Functions

Two functions have also been provided for you to visualize home run zones:

  • assign_x_coord: Assigns an x-coordinate to Statcast's strike zone numbers.
  • assign_y_coord: Assigns a y-coordinate to Statcast's strike zone numbers.

# Run this cell to begin
# Import the necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read the Statcast CSV for each batter: Aaron Judge first, then Giancarlo Stanton
judge, stanton = (pd.read_csv(path) for path in ('judge.csv', 'stanton.csv'))

# Lift the column display limit so pandas shows every column instead of collapsing some
pd.options.display.max_columns = None

# Custom Functions
def assign_x_coord(row):
    """
    Map a Statcast strike zone number to a horizontal plotting position.

    Reads ``row.zone`` and returns 1 for the left third of the strike zone
    (zones 1, 4, 7), 2 for the middle third (zones 2, 5, 8) and 3 for the
    right third (zones 3, 6, 9). Any other zone value, such as 11, 12, 13
    or 14, is ignored for plotting simplicity and yields ``None``.
    """
    # (x-coordinate, zones in that vertical strip), checked left to right
    strips = (
        (1, (1, 4, 7)),
        (2, (2, 5, 8)),
        (3, (3, 6, 9)),
    )
    for x_coord, zones in strips:
        if row.zone in zones:
            return x_coord
    # Zone is outside the 3x3 grid
    return None
    
def assign_y_coord(row):
    """
    Map a Statcast strike zone number to a vertical plotting position.

    Reads ``row.zone`` and returns 3 for the upper third of the strike zone
    (zones 1, 2, 3), 2 for the middle third (zones 4, 5, 6) and 1 for the
    lower third (zones 7, 8, 9). Any other zone value, such as 11, 12, 13
    or 14, is ignored for plotting simplicity and yields ``None``.
    """
    # (y-coordinate, zones in that horizontal band), checked top to bottom
    bands = (
        (3, (1, 2, 3)),
        (2, (4, 5, 6)),
        (1, (7, 8, 9)),
    )
    for y_coord, zones in bands:
        if row.zone in zones:
            return y_coord
    # Zone is outside the 3x3 grid
    return None
    
# Peek at the final five rows of Aaron Judge's data
judge.tail(n=5)

How many of each event did Judge and Stanton have in 2017?

# Restrict each player's pitches to the 2017 season
judge_2017 = judge.loc[judge['game_year'].eq(2017)]
stanton_2017 = stanton.loc[stanton['game_year'].eq(2017)]

# Tally how often each event occurred; value_counts() orders the
# result from most frequent to least frequent
judge_events_2017 = judge_2017['events'].value_counts()
stanton_events_2017 = stanton_2017['events'].value_counts()

Judge's events in 2017

# Horizontal bar chart of Judge's 2017 events:
# counts run along the x-axis, event names along the y-axis
plt.figure(figsize=(10, 6))
sns.barplot(
    y=judge_events_2017.index,
    x=judge_events_2017.values,
    palette='Blues_d',
)
# Label the axes that barplot just drew on
plt.gca().set(title="Aaron Judge's Events in 2017", xlabel='Count', ylabel='')
plt.xticks(rotation=45)
plt.show()

Stanton's events in 2017

# Horizontal bar chart of Stanton's 2017 events:
# counts run along the x-axis, event names along the y-axis
plt.figure(figsize=(10, 6))
sns.barplot(
    y=stanton_events_2017.index,
    x=stanton_events_2017.values,
    palette='Reds_d',
)
# Label the axes that barplot just drew on
plt.gca().set(title="Giancarlo Stanton's Events in 2017", xlabel='Count', ylabel='')
plt.xticks(rotation=45)
plt.show()

Which player hit home runs slightly lower and harder?

import seaborn as sns

# Keep only the pitches that ended in a home run
judge_hr = judge[judge['events'] == 'home_run']
stanton_hr = stanton[stanton['events'] == 'home_run']

# Two side-by-side panels sharing both axes, so the two players'
# densities are drawn on the same scale
fig1, ax1 = plt.subplots(ncols=2, sharex=True, sharey=True)

# Filled 2D KDE of launch angle vs. launch speed for Judge.
# `fill=True` supersedes the deprecated `shade=True`, and `thresh=0.05`
# supersedes the deprecated `shade_lowest=False` (the lowest density level
# stays unfilled), so the deprecated keywords are no longer passed.
sns.kdeplot(
    x=judge_hr['launch_angle'],
    y=judge_hr['launch_speed'],
    ax=ax1[0],
    fill=True,
    cmap="Blues",
    thresh=0.05,
)
ax1[0].set_title("Aaron Judge\n Home Runs 2015 - 2017")
ax1[0].set_xlabel("Launch Angle (degrees)")
ax1[0].set_ylabel("Launch Speed (mph)")

# Same plot for Stanton on the second panel
sns.kdeplot(
    x=stanton_hr['launch_angle'],
    y=stanton_hr['launch_speed'],
    ax=ax1[1],
    fill=True,
    cmap="Reds",
    thresh=0.05,
)
ax1[1].set_title("Giancarlo Stanton\n Home Runs 2015 - 2017")
ax1[1].set_xlabel("Launch Angle (degrees)")
ax1[1].set_ylabel("Launch Speed (mph)")

plt.tight_layout()
plt.show()

# Mean launch angle and launch speed of each player's home runs
judge_avg_angle = judge_hr['launch_angle'].mean()
judge_avg_speed = judge_hr['launch_speed'].mean()
stanton_avg_angle = stanton_hr['launch_angle'].mean()
stanton_avg_speed = stanton_hr['launch_speed'].mean()

# Decide which player hit home runs both lower (smaller angle) AND harder
# (higher speed). Each player is checked explicitly so that a mixed result,
# a tie, or a NaN mean is not silently credited to Judge.
if stanton_avg_angle < judge_avg_angle and stanton_avg_speed > judge_avg_speed:
    player_hr = "Stanton"
elif judge_avg_angle < stanton_avg_angle and judge_avg_speed > stanton_avg_speed:
    player_hr = "Judge"
else:
    player_hr = "Neither player"

print(f"Average Launch Angle - Judge: {judge_avg_angle:.2f}, Stanton: {stanton_avg_angle:.2f}")
print(f"Average Launch Speed - Judge: {judge_avg_speed:.2f}, Stanton: {stanton_avg_speed:.2f}")
print(f"{player_hr} hit home runs slightly lower and harder.")

Which player hit their home runs off of faster pitches (has the highest median)?

# Stack both players' home-run pitches into one frame so a single
# box plot call can group them by player
judge_stanton_hr = pd.concat((judge_hr, stanton_hr))

fig2, ax2 = plt.subplots(figsize=(10, 6))

# One fixed colour per player name
# NOTE(review): keys are assumed to match the values in `player_name` — confirm against the CSVs
palette = {'Aaron Judge': 'blue', 'Giancarlo Stanton': 'red'}

# Box plot of pitch release speed per player; the median line is drawn
# in white so it stands out against the coloured boxes
sns.boxplot(
    data=judge_stanton_hr,
    x='player_name',
    y='release_speed',
    palette=palette,
    medianprops={"color": "white"},
    ax=ax2,
)
ax2.set(
    title='Pitch Velocity for Home Runs (2015 - 2017)',
    xlabel='Player',
    ylabel='Release Speed (mph)',
)

plt.show()

# Median release speed of the pitches each player hit for home runs
judge_median = judge_hr['release_speed'].median()
stanton_median = stanton_hr['release_speed'].median()

# Decide which player's home runs came off faster pitches. Each player is
# checked explicitly so that equal medians (or a NaN median) are not
# silently credited to Judge.
if stanton_median > judge_median:
    player_fast = "Stanton"
elif judge_median > stanton_median:
    player_fast = "Judge"
else:
    player_fast = "Neither player"

print(f"Median Release Speed - Judge: {judge_median:.2f}, Stanton: {stanton_median:.2f}")
print(f"{player_fast} hit home runs off faster pitches on average.")

Construct a 2D histogram for each player that visualizes the home run strike zones