Skip to content
Joining Data with pandas
Joining Data with pandas
Run the hidden code cell below to import a few of the datasets used in this course.
Note: There are a large number of datasets in the datasets/ folder. Many of these are Pickle files, which you can read using pd.read_pickle(path_to_file). An example is included in the cell below.
# Import pandas
import pandas as pd
# Import some of the course datasets
actors_movies = pd.read_csv("datasets/actors_movies.csv")
business_owners = pd.read_pickle("datasets/business_owners.p")
casts = pd.read_pickle("datasets/casts.p")
# Preview one of the DataFrames
caststaxi_owners = pd.read_pickle('datasets/taxi_owners.p')
taxi_veh = pd.read_pickle("datasets/taxi_vehicles.p")
# Merge the taxi_owners and taxi_veh tables setting a suffix
taxi_own_veh = taxi_owners.merge(taxi_veh, on='vid', suffixes=('_own','_veh'))
taxi_own_veh.head(5)
taxi_own_veh = taxi_owners.merge(taxi_veh, on='vid', suffixes=('_own','_veh'))
# Print the value_counts to find the most popular fuel_type
print(taxi_own_veh['fuel_type'].value_counts())licenses = pd.read_pickle("datasets/licenses.p")
biz_owners = pd.read_pickle("datasets/business_owners.p")
licenses.head(10)biz_owners.head(10)# Merge the licenses and biz_owners table on account
licenses_owners = licenses.merge(biz_owners, on="account")
# Group the results by title then count the number of accounts
counted_df = licenses_owners.groupby("title").agg({'account':'count'})
# Sort the counted_df in desending order
sorted_df = counted_df.sort_values("account",ascending=False)
# Use .head() method to print the first few rows of sorted_df
print(sorted_df.head())cal = pd.read_pickle("datasets/cta_calendar.p")
ridership = pd.read_pickle("datasets/cta_ridership.p")
stations =pd.read_pickle("datasets/stations.p")cal.head()ridership.head()stations.head()# Merge the ridership, cal, and stations tables
ridership_cal_stations = ridership.merge(cal, on=['year','month','day']) \
.merge(stations, on='station_id')
# Create a filter to filter ridership_cal_stations
filter_criteria = ((ridership_cal_stations['month'] == 7)
& (ridership_cal_stations['day_type'] == "Weekday")
& (ridership_cal_stations['station_name'] == "Wilson"))
# Use .loc and the filter to select for rides
print(ridership_cal_stations.loc[filter_criteria, 'rides'].sum())
ridership_cal_stations.head()zip_demo = pd.read_pickle("datasets/zip_demo.p")
wards =pd.read_pickle("datasets/ward.p")
# Merge licenses and zip_demo, on zip; and merge the wards on ward
licenses_zip_ward = licenses.merge(zip_demo, on="zip") \
.merge(wards, on = "ward")
# Print the results by alderman and show median income
licenses_zip_ward.groupby("alderman").agg({'income':'median'}).head()land_use = pd.read_pickle("datasets/land_use.p")
census = pd.read_pickle("datasets/census.p")
# Merge land_use and census and merge result with licenses including suffixes
land_cen_lic = land_use.merge(census, on='ward') \
.merge(licenses, on='ward', suffixes=('_cen','_lic'))
# Group by ward, pop_2010, and vacant, then count the # of accounts
pop_vac_lic = land_cen_lic.groupby(['ward','pop_2010','vacant'],
as_index=False).agg({'account':'count'})
# Sort pop_vac_lic and print the results
sorted_pop_vac_lic = pop_vac_lic.sort_values(["vacant","account","pop_2010"],
ascending=[False,True,True])
# Print the top few rows of sorted_pop_vac_lic
print(sorted_pop_vac_lic.head())
movies = pd.read_pickle("datasets/movies.p")
financials = pd.read_pickle("datasets/financials.p")
# Merge the movies table with the financials table with a left join
movies_financials = movies.merge(financials, on='id', how='left')
# Count the number of rows in the budget column that are missing
number_of_missing_fin = movies_financials['budget'].isnull().sum()#missings
# Print the number of movies missing financials
print(number_of_missing_fin)