Skip to content
UBER
DataFrameas
df
variable
SELECT * FROM "fact_trips.csv"
import pandas as pd
import numpy as np
# Clean columns & dtypes
df = df.copy()
df.columns = (
df.columns.str.strip()
.str.lower()
.str.replace(' ', '_')
.str.replace('#', 'num', regex=False)
)
# Parse datetimes
for c in ['start_date', 'end_date']:
if c in df.columns:
df[c] = pd.to_datetime(df[c], errors='coerce')
# Keep ZIPCODES as strings
if 'zip_code' in df.columns:
df['zip_code'] = df['zip_code'].fillna('Unknown')
df['month'] = df['start_date'].dt.to_period('M').dt.to_timestamp()
top_zips = (df['zip_code']
.value_counts()
.head(8)
.index)
trips_by_zip = (df[df['zip_code'].isin(top_zips)]
.groupby(['month','zip_code'])
.size()
.reset_index(name='num_trips'))
top_stations = (df['start_station']
.value_counts()
.head(8)
.index)
trips_by_station = (df[df['start_station'].isin(top_stations)]
.groupby(['month', 'start_station'])
.size()
.reset_index(name='num_trips'))
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
base = df.copy()
clean_map = {c: re.sub(r'\s+', ' ',c).strip() for c in base.columns}
base.rename(columns=clean_map, inplace=True)
base.columns = (base.columns
.str.lower()
.str.replace(r'[^0-9a-z]+', '_', regex=True)
.str.strip('_'))
print("Columns:", base.columns.tolist())
base['start_date'] = pd.to_datetime(base['start_date'], errors='coerce')
subs = base[base['subscriber_type'].astype('string').str.strip().str.lower()==
'subscriber'].copy()
subs['month'] = subs['start_date'].dt.to_period('M').dt.to_timestamp()
trips_by_zip = (subs.groupby(['month','zip_code']).size().reset_index(name='num_trips'))
top_zips = subs['zip_code'].value_counts().head(8).index
trips_by_zip = trips_by_zip[trips_by_zip['zip_code'].isin(top_zips)]
plt.figure(figsize=(12,6))
sns.lineplot(data=trips_by_zip, x='month', y='num_trips', hue='zip_code', marker='o')
plt.title("Subscriber Trips per Month by Zip Code")
plt.xlabel("Month"); plt.ylabel("Number of Trips")
plt.xticks(rotation=45); plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
top_start = df['start_station'].value_counts().head(10)
plt.figure(figsize=(8,6))
ax = top_start.plot(kind='barh', title='Top 10 Start Stations', color= "steelblue")
for i, value in enumerate(top_start.values):
ax.text(value + 50, i, str(value), va='center')
plt.xlabel("Number of Trips")
plt.ylabel("Start Station")
plt.tight_layout()
plt.show()
top_end = df['end_station'].value_counts().head(10)
plt.figure(figsize=(8,6))
ax = top_end.plot(kind='barh', title='Top 10 End Stations', color= "darkorange")
for i, value in enumerate(top_end.values):
ax.text(value + 50, i, str(value), va='center')
plt.xlabel("Number of Trips")
plt.ylabel("End Station")
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import pandas as pd
fact_trips = fact_trips.rename(columns=lambda x: x.strip().lower().replace(" ", "_"))
subs = df[df['subscriber_type'] == 'Subscriber']
zip_counts = subs['zip_code'].value_counts()
plt.figure(figsize=(8,6))
ax = zip_counts.head(10).plot(kind='barh', color="steelblue", title="Most Popular Subscriber Zip Codes")
for i, value in enumerate(zip_counts.head(10).values):
ax.text(value + 5, i, str(value), va='center')
plt.xlabel("Number of Trips")
plt.ylabel("Zip Code")
plt.tight_layout()
plt.show()
plt.figure(figsize=(8,6))
ax = zip_counts.tail(10).plot(kind='barh', color="darkorange", title="Least Common Subscriber Zip Codes")
for i, value in enumerate(zip_counts.tail(10).values):
ax.text(value + 5, i,str(value), va='center')
plt.xlabel("Number of Trips")
plt.ylabel("Zip Code")
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
subs = base[base['subscriber_type'].astype('string').str.strip().str.lower()==
'subscriber'].copy()
zip_counts = subs['zip_code'].value_counts(dropna=True)
low = zip_counts.quantile(0.10)
high = zip_counts.quantile(0.40)
mid_to_bottom = zip_counts[(zip_counts > low) & (zip_counts <= high)]
N = 20
mid_to_bottom_sorted = mid_to_bottom.sort_values(ascending=True).head(N)
plt.figure(figsize=(10,8))
ax = mid_to_bottom_sorted.plot(kind='barh', color='teal')
plt.title("Subscriber Zip Codes (10th-40th Percentile Usage), Sample of 20)", fontsize=14)
plt.xlabel("Number of Trips", fontsize=12)
plt.ylabel("Zip Code", fontsize=12)
for i, v in enumerate(mid_to_bottom_sorted):
ax.text(v + 0.5, i, str(int(v)), va='center', fontsize=10)
plt.tight_layout()
plt.show()
sns.countplot(data=df, x='subscriber_type')
plt.title("Trips by Subscriber Type vs Customers")
plt.show()
sub_by_month = df.groupby(['month', 'subscriber_type']).size().reset_index(name='num_trips')
sns.lineplot(data=sub_by_month, x='month', y='num_trips', hue='subscriber_type')
plt.title("Subscriber vs Customer Trips Over Time")
plt.show()
bike_usage = ( df.groupby(['bike_num', 'start_station', 'zip_code']).size().reset_index(name='num_trips'))
q1 = bike_usage['num_trips'].quantile(0.25)
q2 = bike_usage['num_trips'].quantile(0.50)
q3 = bike_usage['num_trips'].quantile(0.75)
least_used = bike_usage[bike_usage['num_trips'] <= q1]
mid_used = bike_usage[(bike_usage['num_trips'] > q1) & (bike_usage['num_trips'] <= q3)]
most_used = bike_usage[bike_usage['num_trips']> q3]
print("Least used bikes (bottom 25%):")
print(least_used.sort_values('num_trips').head(10))
print("\nMost used bikes (top 25%):")
print(most_used.sort_values('num_trips', ascending=False).head(10))
import matplotlib.pyplot as plt
import seaborn as sns
bike_usage = df.groupby('bike_num').size().reset_index(name='num_trips')
q1 = bike_usage['num_trips'].quantile(0.25)
q2 = bike_usage['num_trips'].quantile(0.50)
q3 = bike_usage['num_trips'].quantile(0.75)
def label_quartile(x):
if x <= q1: return "Q1(Least Used)"
elif x <= q2: return "Q2"
elif x <= q3: return "Q3"
else: return "Q4(Most Used)"
bike_usage['quartile'] = bike_usage['num_trips'].apply(label_quartile)
quartile_share = bike_usage.groupby('quartile')['num_trips'].sum()/bike_usage['num_trips'].sum()
quartile_order = ["Q1(Least Used)", "Q2", "Q3", "Q4(Most Used)"]
plt.figure(figsize=(10,6))
ax=sns.boxplot(data=bike_usage, x='quartile', y='num_trips',
order=quartile_order,
palette="Set2")
for i, quart in enumerate(quartile_order):
pct=quartile_share.loc[quart]*100
y_max=bike_usage.loc[bike_usage['quartile']==quart,'num_trips'].max()
ax.text(i,y_max + 10, f"{pct:.1f}%",
ha='center', va='bottom', fontsize=12, fontweight='bold')
plt.title("Bike Usage Distribution by Quartile", fontsize=14)
plt.xlabel("Quartile"); plt.ylabel("Number of Trips")
plt.tight_layout()
plt.show()
quartile_share = bike_usage.groupby('quartile')['num_trips'].sum() / bike_usage['num_trips'].sum()
print(quartile_share)
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df0 = df.copy()
df0.columns = (df0.columns
.str.strip()
.str.lower()
.str.replace(r'[^0-9a-z]+', '_', regex=True)
.str.strip('_'))
# FIX 1: Corrected regex pattern (added missing closing parenthesis)
z = (df0['zip_code'].astype('string')
.str.extract(r'(\d{3,5})', expand=False)
.str.zfill(5))
df0['zip_clean'] = z.where(z.notna(), np.nan)
subs = df0[df0['subscriber_type'].astype('string').str.strip().str.lower() == 'subscriber'].copy()
subs['area_group'] = np.where(subs['zip_clean'].str.startswith('941', na=False),
'SF Core', 'Suburban/Out-of-area')
# FIX 2: Typo in 'reindex'
area_counts = subs['area_group'].value_counts().reindex(['SF Core', 'Suburban/Out-of-area']).fillna(0)
plt.figure(figsize=(6, 5))
bars = plt.bar(area_counts.index, area_counts.values)
plt.title('Trips by Area Group (Subscribers)')
for b in bars:
y = b.get_height()
plt.text(b.get_x() + b.get_width() / 2, y + max(area_counts.values) * 0.01,
f'{int(y):,}', ha='center', va='bottom')
plt.tight_layout()
plt.show()
zip_counts = (subs['zip_clean']
.value_counts(dropna=True)
.sort_values(ascending=False))
TOP_N = 25
zip_counts_top = zip_counts.head(TOP_N)
colors = ['C0' if str(z).startswith('941') else 'C1' for z in zip_counts_top.index]
plt.figure(figsize=(10, 8))
# FIX 3: Typo in 'arange'
ypos = np.arange(len(zip_counts_top))[::-1]
plt.barh(zip_counts_top.index[::-1], zip_counts_top.values[::-1], color=colors[::-1])
plt.title(f'Top {TOP_N} Subscriber ZIP Codes\n(SF Core highlighted)')
plt.xlabel('Number of Trips')
plt.ylabel('ZIP Code')
vals = zip_counts_top.values[::-1]
labels = zip_counts_top.index[::-1]
for i, v in enumerate(vals):
plt.text(v + max(vals) * 0.01, i, f'{int(v):,}', va='center')
from matplotlib.patches import Patch
legend_elems = [Patch(facecolor='C0', label='SF Core (941xx)'),
Patch(facecolor='C1', label='Suburban/Out-of-area')]
plt.legend(handles=legend_elems, loc='lower right')
plt.tight_layout()
plt.show()
subs['start_date'] = pd.to_datetime(subs['start_date'], errors='coerce')
subs['month'] = subs['start_date'].dt.to_period('M').dt.to_timestamp()
trend = (subs.groupby(['month', 'area_group'])
.size()
.reset_index(name='num_trips'))
plt.figure(figsize=(12, 6))
for grp, gdf in trend.groupby('area_group'):
plt.plot(gdf['month'], gdf['num_trips'], marker='o', label=grp)
plt.title('Monthly Trips by Area Group (Subscribers)')
plt.xlabel('Month')
plt.ylabel('Number of Trips')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
# Bike usage counts
bike_usage = ( df.groupby(['bike_num', 'start_station', 'end_station']).size().reset_index(name='num_trips'))
q3 = bike_usage['num_trips'].quantile(0.75)
top_bikes = bike_usage[bike_usage['num_trips']>= q3]
top_start_stations = (df['start_station'].value_counts().head(10).index)
top_end_stations = (df['end_station'].value_counts().head(10).index)
top_bike_starts = top_bikes[top_bikes['start_station'].isin(top_start_stations)]
top_bike_ends = top_bikes[top_bikes['end_station'].isin(top_end_stations)]
start_summary = (top_bike_starts.groupby('start_station')['num_trips']
.sum()
.sort_values(ascending=False)
)
end_summary = (top_bike_ends.groupby('end_station')['num_trips']
.sum()
.sort_values(ascending=False)
)
fig,axes = plt.subplots(1,2, figsize=(16,6), sharey=True)
sns.barplot(x=start_summary.values, y=start_summary.index, ax=axes[0],color='steelblue')
axes[0].set_title("Top 25% Bikes at Top 10 Start Stations")
axes[0].set_xlabel("Number of Trips"); axes[0].set_ylabel("Start Station")
sns.barplot(x=end_summary.values, y=end_summary.index, ax=axes[1], color='darkorange')
axes[1].set_title("Top 25% Bikes at Top 10 End Stations")
axes[1].set_xlabel("Bumber of Trips");axes[1].set_ylabel("End Station")
plt.tight_layout()
plt.show()
duration_summary = df.groupby('Subscriber Type')['Duration'].describe()
print(duration_summary)
import seaborn as sns
import matplotlib.pyplot as plt # Fixed typo here
plt.figure(figsize=(8,6))
sns.boxplot(data=df, x='subscriber_type', y='duration')
plt.ylim(0,2000)
plt.title("Trip Duration by Subscriber Type")
plt.show()
plt.figure(figsize=(8,6))
sns.violinplot(data=df, x='subscriber_type', y='duration', cut=0, inner="quartile") # Fixed syntax here
plt.ylim(0,2000)
plt.title("Trip Duration Distribution by Subscriber Type")
plt.show()
from scipy.stats import ttest_ind
subs = df[df['subscriber_type'] == 'Subscriber']['duration']
cust = df[df['subscriber_type'] == 'Customer']['duration']
t_stat, p_val = ttest_ind(subs, cust, equal_var=False)
print(f"T-stat: {t_stat:.2f}, p-value: {p_val:.4f}")