Skip to content
Whoop Project
DataFrameas
df
variable
-- High-level data points for the dataset
SELECT
STRFTIME('%d/%m/%Y', MIN(CAST(cycle_start_time AS DATE))) AS first_workout_date,
STRFTIME('%d/%m/%Y', MAX(CAST(cycle_end_time AS DATE))) AS last_workout_date,
DATEDIFF('day', MIN(CAST(cycle_start_time AS DATE)), MAX(CAST(cycle_end_time AS DATE))) AS total_days,
COUNT(*) AS total_workouts
FROM whoop_workouts.csvDataFrameas
df5
variable
--
SELECT
activity_name,
COUNT(*) AS count_activities
FROM whoop_workouts.csv
GROUP BY activity_name
ORDER BY count_activities DESC;DataFrameas
df1
variable
SELECT
activity_name,
COUNT(*) AS count_activities
FROM whoop_workouts.csv
GROUP BY activity_name
ORDER BY count_activities DESC
LIMIT 5;import seaborn as sns
sns.barplot(data=df1, y='activity_name', x='count_activities').set_title('Top 5 Activities');import matplotlib.pyplot as plt
activities = df1['activity_name']
count_activities = df1['count_activities']
plt.figure(figsize=(8, 8))
plt.pie(count_activities, labels=activities, autopct='%1.1f%%', startangle=90)
plt.title('Top 5 Activities')
plt.axis('equal')
plt.show()DataFrameas
df2
variable
SELECT
DISTINCT STRFTIME('%m/%Y', (CAST(cycle_start_time AS DATE))) AS month,
COUNT(*) AS count_activities
FROM whoop_workouts.csv
WHERE YEAR(cycle_start_time) = 2024 AND MONTH(cycle_start_time) < 10
GROUP BY 1
ORDER BY month ASC;
NEXT: Look at the total time spent in each zone per activity
DataFrameas
df3
variable
-- Calorie burn rate calculated as calories burned during activity divided by activity duration
SELECT
CASE
WHEN activity_name = 'Pilates' THEN 'Hot Pilates'
ELSE activity_name
END AS activity_name,
COUNT(*) AS count_activities,
SUM(calories_burned) AS total_calories_burned,
SUM(duration_min) AS total_minutes,
ROUND((total_calories_burned / total_minutes), 0) AS calories_per_min
FROM
'whoop_workouts.csv'
GROUP BY
activity_name
HAVING
COUNT(*) > 5
ORDER BY
calories_per_min DESC;DataFrameas
df4
variable
/* SELECT
t1.date,
COUNT(t1.*)
FROM (*/
SELECT
CAST(pc.sleep_end_time AS DATE) AS date,
sl.light_sleep_duration_min AS light_sleep,
sl.deep_sleep_duration_min AS deep_sleep,
sl.rem_sleep_duration_min AS REM_sleep,
sl.awake_duration_min AS time_awake,
(light_sleep + deep_sleep + REM_sleep + time_awake) AS time_in_bed,
(light_sleep + deep_sleep + REM_sleep) AS total_sleep,
sl.Nap,
pc.recovery_score
FROM 'whoop_physiological_cycles.csv' pc
JOIN 'whoop_sleeps.csv' sl
ON CAST(pc.sleep_end_time AS DATE) = CAST(sl.sleep_end_time AS DATE)
AND sl.Nap = 'False'
-- AND sl.sleep_end_time <> '2023-12-19T00:00:00.000'
-- WHERE sl.Nap = 'False'
-- I'm getting 4 rows on the 19th of December 2023
-- AND pc.recovery_score IS NOT NULL
GROUP BY 1, 2, 3, 4, 5, 8, 9
ORDER BY date ASC;
/* ) t1
GROUP BY 1
ORDER BY COUNT(t1.*) DESC */DataFrameas
df6
variable
-- AVG. active & base calories burned per day of the week
-- NEED TO TIDY THIS UP!!!
WITH total_calories AS (
SELECT
HOUR(CAST(cycle_end_time AS TIMESTAMP)) AS hour,
CASE
WHEN HOUR(CAST(cycle_end_time AS TIMESTAMP)) BETWEEN 0 AND 8
THEN CAST(cycle_end_time AS TIMESTAMP) - INTERVAL 9 HOUR
ELSE CAST(cycle_end_time AS TIMESTAMP)
END AS adjusted_cycle_end_time,
CAST(cycle_end_time AS TIMESTAMP) AS date,
calories_burned AS total_cal_burned
FROM
whoop_physiological_cycles.csv
),
active_calories AS (
SELECT
CAST(workout_start_time AS TIMESTAMP) AS date,
SUM(calories_burned) AS active_cal_burned
FROM
'whoop_workouts.csv'
GROUP BY
date
ORDER BY
date ASC
)
SELECT
CAST(tc.adjusted_cycle_end_time AS DATE) AS date,
tc.total_cal_burned,
COALESCE(SUM(ac.active_cal_burned), 0) AS active_calories_burned,
tc.total_cal_burned - COALESCE(SUM(ac.active_cal_burned), 0) AS base_calories
FROM
total_calories tc
LEFT JOIN
active_calories ac
ON CAST(tc.adjusted_cycle_end_time AS DATE) = CAST(ac.date AS DATE)
GROUP BY
tc.adjusted_cycle_end_time,
tc.total_cal_burned
ORDER BY
date ASC;
DataFrameas
df7
variable
-- Day strain versus next day recovery score
-- Create buckets for strain 0-2, 2-4, 4-6, etc.
-- CREATE STRAIN BINS
WITH strain_bins AS (
SELECT
CASE
WHEN HOUR(CAST(cycle_end_time AS TIMESTAMP)) BETWEEN 0 AND 8
THEN CAST(cycle_end_time AS TIMESTAMP) - INTERVAL 9 HOUR
ELSE
CAST(cycle_end_time AS TIMESTAMP)
END AS adjusted_strain_date, -- need to do this as raw data skips some dates where I was still awake past midnight
day_strain,
CASE
WHEN day_strain >= 0 AND day_strain < 2 THEN '0 - 2'
WHEN day_strain >= 2 AND day_strain < 4 THEN '02 - 4'
WHEN day_strain >= 4 AND day_strain < 6 THEN '04 - 6'
WHEN day_strain >= 6 AND day_strain < 8 THEN '06 - 8'
WHEN day_strain >= 8 AND day_strain < 10 THEN '08 - 10'
WHEN day_strain >= 10 AND day_strain < 12 THEN '10 - 12'
WHEN day_strain >= 12 AND day_strain < 14 THEN '12 - 14'
WHEN day_strain >= 14 AND day_strain < 16 THEN '14 - 16'
WHEN day_strain >= 16 AND day_strain < 18 THEN '16 - 18'
WHEN day_strain >= 18 AND day_strain < 20 THEN '18 - 20'
WHEN day_strain >= 20 AND day_strain < 22 THEN '20 - 22'
END AS day_strain_bin
FROM
'whoop_physiological_cycles.csv'
GROUP BY
1, 2
ORDER BY
day_strain_bin
),
recovery_categories AS (
SELECT
CASE
WHEN HOUR(CAST(cycle_end_time AS TIMESTAMP)) BETWEEN 0 AND 8
THEN CAST(cycle_end_time AS TIMESTAMP) - INTERVAL 9 HOUR
ELSE
CAST(cycle_end_time AS TIMESTAMP)
END AS adjusted_recovery_score_date, -- need to do this as raw data skips some dates where I was still awake past midnight
recovery_score,
CASE
WHEN recovery_score >= 0 AND recovery_score < 34 THEN 'RED'
WHEN recovery_score >= 34 AND recovery_score < 67 THEN 'YELLOW'
WHEN recovery_score >= 67 AND recovery_score <= 100 THEN 'GREEN'
END AS recovery_bucket
FROM
'whoop_physiological_cycles.csv'
),
recovery_score_previous_day_strain AS (
SELECT
CAST(rc.adjusted_recovery_score_date AS DATE) AS recovery_date,
rc.recovery_score AS recovery_score,
rc.recovery_bucket AS recovery_group,
CAST(sb.adjusted_strain_date AS DATE) AS previous_day_strain_date,
sb.day_strain AS previous_day_strain,
sb.day_strain_bin AS previous_day_strain_bin
FROM
recovery_categories rc
LEFT JOIN
strain_bins sb ON CAST(rc.adjusted_recovery_score_date AS DATE) = (CAST(sb.adjusted_strain_date AS DATE) + INTERVAL 1 DAY)
)
SELECT
previous_day_strain_bin,
recovery_group,
COUNT(*) AS count_recoveries,
SUM(COUNT(*)) OVER (PARTITION BY previous_day_strain_bin) AS total_recoveries_by_strain_bin
FROM
recovery_score_previous_day_strain
WHERE
recovery_group IS NOT NULL
AND previous_day_strain_bin IS NOT NULL
GROUP BY
1, 2
ORDER BY
previous_day_strain_bin ASC;
DataFrameas
df8
variable
-- CREATE RECOVERY GROUPS
-- https://www.whoop.com/eu/en/thelocker/how-does-whoop-recovery-work-101/
SELECT
CASE
WHEN HOUR(CAST(cycle_end_time AS TIMESTAMP)) BETWEEN 0 AND 8 THEN CAST(cycle_end_time AS TIMESTAMP) - INTERVAL 9 HOUR
ELSE CAST(cycle_end_time AS TIMESTAMP)
END AS adjusted_recovery_score_date, -- need to do this as raw data skips some dates where I was still awake past midnight
recovery_score,
CASE WHEN recovery_score >= 0 AND recovery_score < 34 THEN 'RED'
WHEN recovery_score >= 34 AND recovery_score < 67 THEN 'YELLOW'
WHEN recovery_score >= 67 AND recovery_score <= 100 THEN 'GREEN'
END AS recovery_bucket
FROM 'whoop_physiological_cycles.csv'DataFrameas
df9
variable
SELECT
CASE
WHEN HOUR(CAST(cycle_end_time AS TIMESTAMP)) BETWEEN 0 AND 8 THEN CAST(cycle_end_time AS TIMESTAMP) - INTERVAL 9 HOUR
ELSE CAST(cycle_end_time AS TIMESTAMP)
END AS adjusted_strain_date, -- need to do this as raw data skips some dates where I was still awake past midnight
day_strain,
CASE WHEN day_strain >= 0 AND day_strain < 2 THEN '0 - 2'
WHEN day_strain >= 2 AND day_strain < 4 THEN '02 - 4'
WHEN day_strain >= 4 AND day_strain < 6 THEN '04 - 6'
WHEN day_strain >= 6 AND day_strain < 8 THEN '06 - 8'
WHEN day_strain >= 8 AND day_strain < 10 THEN '08 - 10'
WHEN day_strain >= 10 AND day_strain < 12 THEN '10 - 12'
WHEN day_strain >= 12 AND day_strain < 14 THEN '12 - 14'
WHEN day_strain >= 14 AND day_strain < 16 THEN '14 - 16'
WHEN day_strain >= 16 AND day_strain < 18 THEN '16 - 18'
WHEN day_strain >= 18 AND day_strain < 20 THEN '18 - 20'
WHEN day_strain >= 20 AND day_strain < 22 THEN '20 - 22'
END AS day_strain_bins
FROM 'whoop_physiological_cycles.csv'
GROUP BY 1, 2
ORDER BY adjusted_strain_date DESCDataFrameas
df10
variable
-- High-level data points for the dataset 2
SELECT
COUNT(*) AS total_workouts,
ROUND(AVG(calories_burned), 2) AS avg_calories_burned,
SUM(calories_burned) AS total_calories_burned,
MAX(max_hr) AS max_heartrate
FROM whoop_workouts.csvSELECT
COUNT(*) AS total_workouts,
ROUND(AVG(calories_burned), 2) AS avg_calories_burned,
SUM(calories_burned) AS total_calories_burned,
MAX(max_hr) AS max_heartrate
FROM
'whoop_workouts.csv';