Skip to content

Joining Data with pandas

Run the hidden code cell below to import a few of the datasets used in this course.

Note: There are a large number of datasets in the datasets/ folder. Many of these are Pickle files, which you can read using pd.read_pickle(path_to_file). An example is included in the cell below.

# Import pandas
import pandas as pd

# Import some of the course datasets 
actors_movies = pd.read_csv("datasets/actors_movies.csv")
business_owners = pd.read_pickle("datasets/business_owners.p")
casts = pd.read_pickle("datasets/casts.p")

# Preview one of the DataFrames
casts
taxi_owners = pd.read_pickle('datasets/taxi_owners.p')
taxi_veh = pd.read_pickle("datasets/taxi_vehicles.p")

# Merge the taxi_owners and taxi_veh tables setting a suffix
taxi_own_veh = taxi_owners.merge(taxi_veh, on='vid', suffixes=('_own','_veh'))

taxi_own_veh.head(5)
taxi_own_veh = taxi_owners.merge(taxi_veh, on='vid', suffixes=('_own','_veh'))
# Print the value_counts to find the most popular fuel_type
print(taxi_own_veh['fuel_type'].value_counts())
licenses = pd.read_pickle("datasets/licenses.p")
biz_owners = pd.read_pickle("datasets/business_owners.p")

licenses.head(10)
biz_owners.head(10)
# Merge the licenses and biz_owners table on account
licenses_owners = licenses.merge(biz_owners, on="account")

# Group the results by title then count the number of accounts
counted_df = licenses_owners.groupby("title").agg({'account':'count'})

# Sort the counted_df in desending order
sorted_df = counted_df.sort_values("account",ascending=False)

# Use .head() method to print the first few rows of sorted_df
print(sorted_df.head())
cal = pd.read_pickle("datasets/cta_calendar.p")
ridership = pd.read_pickle("datasets/cta_ridership.p")
stations =pd.read_pickle("datasets/stations.p")
cal.head()
ridership.head()
stations.head()
# Merge the ridership, cal, and stations tables
ridership_cal_stations = ridership.merge(cal, on=['year','month','day']) \
							.merge(stations, on='station_id')

# Create a filter to filter ridership_cal_stations
filter_criteria = ((ridership_cal_stations['month'] == 7) 
                   & (ridership_cal_stations['day_type'] == "Weekday") 
                   & (ridership_cal_stations['station_name'] == "Wilson"))

# Use .loc and the filter to select for rides
print(ridership_cal_stations.loc[filter_criteria, 'rides'].sum())

ridership_cal_stations.head()
zip_demo = pd.read_pickle("datasets/zip_demo.p")
wards =pd.read_pickle("datasets/ward.p")

# Merge licenses and zip_demo, on zip; and merge the wards on ward
licenses_zip_ward = licenses.merge(zip_demo, on="zip") \
            			.merge(wards, on = "ward")

# Print the results by alderman and show median income
licenses_zip_ward.groupby("alderman").agg({'income':'median'}).head()
land_use = pd.read_pickle("datasets/land_use.p")
census = pd.read_pickle("datasets/census.p")
# Merge land_use and census and merge result with licenses including suffixes
land_cen_lic = land_use.merge(census, on='ward') \
                    .merge(licenses, on='ward', suffixes=('_cen','_lic'))

# Group by ward, pop_2010, and vacant, then count the # of accounts
pop_vac_lic = land_cen_lic.groupby(['ward','pop_2010','vacant'], 
                                   as_index=False).agg({'account':'count'})

# Sort pop_vac_lic and print the results
sorted_pop_vac_lic = pop_vac_lic.sort_values(["vacant","account","pop_2010"], 
                                             ascending=[False,True,True])

# Print the top few rows of sorted_pop_vac_lic
print(sorted_pop_vac_lic.head())
movies = pd.read_pickle("datasets/movies.p")
financials = pd.read_pickle("datasets/financials.p")

# Merge the movies table with the financials table with a left join
movies_financials = movies.merge(financials, on='id', how='left')

# Count the number of rows in the budget column that are missing
number_of_missing_fin = movies_financials['budget'].isnull().sum()#missings

# Print the number of movies missing financials
print(number_of_missing_fin)