Skip to content

Citibike Analysis

Concat months 4 - 7

Spinner
DataFrameas
df1
variable
SELECT * FROM '202004_trip_data.csv'
UNION ALL
SELECT * FROM '202005_trip_data.csv'
UNION ALL
SELECT * FROM '202005_trip_data.csv'
UNION ALL
SELECT * FROM '202006_trip_data.csv'
UNION ALL
SELECT * FROM '202007_trip_data.csv';

Concat Months 8 - 11

Spinner
DataFrameas
df2
variable
SELECT * FROM '202008_trip_data.csv'
UNION ALL
SELECT * FROM '202009_trip_data.csv'
UNION ALL
SELECT * FROM '202010_trip_data.csv'
UNION ALL
SELECT * FROM '202011_trip_data.csv';

Concat Months 12 - 3

Spinner
DataFrameas
df3
variable
SELECT * FROM '202012_trip_data.csv'
UNION ALL
SELECT * FROM '202101_trip_data.csv'
UNION ALL
SELECT * FROM '202102_trip_data.csv'
UNION ALL
SELECT * FROM '202103_trip_data.csv';

Concat all months

Spinner
DataFrameas
df
variable
SELECT * FROM df1
UNION
SELECT * FROM df2
UNION
SELECT * FROM df3;

Format row names and order. Create column for ride time and day of the week.

Spinner
DataFrameas
df4
variable
SELECT
	 ride_id AS RIDE_ID
	,rideable_type AS BIKE_TYPE
	,member_casual AS SUB_TYPE
	,DATEDIFF('second', started_at, ended_at) AS RIDE_TIME -- get ride time in seconds
	,DAYOFWEEK(started_at) AS DAY_OF_WEEK -- 1=sunday ... 7=monday
	-- time
	,started_at AS START_TIME
	,ended_at AS END_TIME
	-- station
		-- start
	,start_station_name AS STATION_START_NAME
	,start_station_id AS STATION_START_ID
		-- end
	,end_station_name AS STATION_END_NAME
	,end_station_id AS STATION_END_ID
	-- coordinates
		-- start
	,start_lat AS START_LAT
	,start_lng AS START_LNG
		-- end
	,end_lat AS END_LAT
	,end_lng AS END_LNG
FROM df;

Create a map for stations and their ids

Spinner
DataFrameas
df5
variable
SELECT 
     STATION_START_ID
	,STATION_START_NAME
	,COUNT(*)
FROM df4
WHERE STATION_START_ID IS NOT NULL
GROUP BY 1,2
ORDER BY 2;
Spinner
DataFrameas
df16
variable
SELECT
	START_TIME
	,STATION_START_ID
	,STATION_START_NAME
	,START_LAT
	,START_LNG
FROM df4
WHERE STATION_START_ID IN ('101.0','15491')
ORDER BY 1 DESC;

It appears that a station can have multiple id's so to be certain which station we aggregate by later we will aggragate by station id and not by name. We will also need to standardize the latitudes and longitudes columns to accurately map the stations.

Exploratory Analysis

Here we will dive in quickly to see what the current state of our data is and what cleaning will need to be done.