Skip to content
Citibike Capstone
Citibike Analysis
Concat months 4 - 7
DataFrameas
df1
variable
SELECT * FROM '202004_trip_data.csv'
UNION ALL
SELECT * FROM '202005_trip_data.csv'
UNION ALL
SELECT * FROM '202005_trip_data.csv'
UNION ALL
SELECT * FROM '202006_trip_data.csv'
UNION ALL
SELECT * FROM '202007_trip_data.csv';
Concat Months 8 - 11
DataFrameas
df2
variable
SELECT * FROM '202008_trip_data.csv'
UNION ALL
SELECT * FROM '202009_trip_data.csv'
UNION ALL
SELECT * FROM '202010_trip_data.csv'
UNION ALL
SELECT * FROM '202011_trip_data.csv';
Concat Months 12 - 3
DataFrameas
df3
variable
SELECT * FROM '202012_trip_data.csv'
UNION ALL
SELECT * FROM '202101_trip_data.csv'
UNION ALL
SELECT * FROM '202102_trip_data.csv'
UNION ALL
SELECT * FROM '202103_trip_data.csv';
Concat all months
DataFrameas
df
variable
SELECT * FROM df1
UNION
SELECT * FROM df2
UNION
SELECT * FROM df3;
Format row names and order. Create column for ride time and day of the week.
DataFrameas
df4
variable
SELECT
ride_id AS RIDE_ID
,rideable_type AS BIKE_TYPE
,member_casual AS SUB_TYPE
,DATEDIFF('second', started_at, ended_at) AS RIDE_TIME -- get ride time in seconds
,DAYOFWEEK(started_at) AS DAY_OF_WEEK -- 1=sunday ... 7=monday
-- time
,started_at AS START_TIME
,ended_at AS END_TIME
-- station
-- start
,start_station_name AS STATION_START_NAME
,start_station_id AS STATION_START_ID
-- end
,end_station_name AS STATION_END_NAME
,end_station_id AS STATION_END_ID
-- coordinates
-- start
,start_lat AS START_LAT
,start_lng AS START_LNG
-- end
,end_lat AS END_LAT
,end_lng AS END_LNG
FROM df;
Create a map for stations and their ids
DataFrameas
df5
variable
SELECT
STATION_START_ID
,STATION_START_NAME
,COUNT(*)
FROM df4
WHERE STATION_START_ID IS NOT NULL
GROUP BY 1,2
ORDER BY 2;
DataFrameas
df16
variable
SELECT
START_TIME
,STATION_START_ID
,STATION_START_NAME
,START_LAT
,START_LNG
FROM df4
WHERE STATION_START_ID IN ('101.0','15491')
ORDER BY 1 DESC;
It appears that a station can have multiple id's so to be certain which station we aggregate by later we will aggragate by station id and not by name. We will also need to standardize the latitudes and longitudes columns to accurately map the stations.
Exploratory Analysis
Here we will dive in quickly to see what the current state of our data is and what cleaning will need to be done.