Skip to content
Joining Data with pandas
Joining Data with pandas
Run the hidden code cell below to import a few of the datasets used in this course.
Note: There are a large number of datasets in the datasets/
folder. Many of these are Pickle files, which you can read using pd.read_pickle(path_to_file)
. An example is included in the cell below.
# Import pandas
import pandas as pd
# Import some of the course datasets
actors_movies = pd.read_csv("datasets/actors_movies.csv")
business_owners = pd.read_pickle("datasets/business_owners.p")
casts = pd.read_pickle("datasets/casts.p")
# Preview one of the DataFrames
casts
Take Notes
Add notes here about the concepts you've learned and code cells with code you want to keep.
Add your notes here
# Add your code snippets here
# Merge the taxi_owners and taxi_veh tables setting a suffix
taxi_own_veh = taxi_owners.merge(taxi_veh, on='vid', suffixes=('_own','_veh'))
# Print the value_counts to find the most popular fuel_type
print(taxi_own_veh['fuel_type'].value_counts())
# Use right join to merge the movie_to_genres and pop_movies tables
genres_movies = movie_to_genres.merge(pop_movies, how='right',
left_on='movie_id',
right_on='id')
# Count the number of genres
genre_count = genres_movies.groupby('genre').agg({'id':'count'})
# Plot a bar chart of the genre_count
genre_count.plot(kind='bar')
plt.show()
# Concatenate the tracks, show only columns names that are in all tables
tracks_from_albums = pd.concat([tracks_master, tracks_ride, tracks_st],
join='inner',
sort=True)
print(tracks_from_albums)
# Concatenate the tables and add keys
inv_jul_thr_sep = pd.concat([inv_jul, inv_aug, inv_sep],
keys=['7Jul', '8Aug', '9sep'])
# Group the invoices by the index keys and find avg of the total column
avg_inv_by_month = inv_jul_thr_sep.groupby(level=0).agg({'total':'mean'})
# Bar plot of avg_inv_by_month
avg_inv_by_month.plot(kind='bar')
plt.show()
# Use merge_ordered() to merge gdp and sp500, interpolate missing value
gdp_sp500 = pd.merge_ordered(gdp, sp500, left_on='year', right_on='date',
how='left', fill_method='ffill')
# Subset the gdp and returns columns
gdp_returns = gdp_sp500.loc[:,['gdp', 'returns']]
# Print gdp_returns correlation
print (gdp_returns.corr())
# Use merge_ordered() to merge inflation, unemployment with inner join
inflation_unemploy = pd.merge_ordered(inflation, unemployment, on='date', how='inner')
# Print inflation_unemploy
print(inflation_unemploy)
# Plot a scatter plot of unemployment_rate vs cpi of inflation_unemploy
inflation_unemploy.plot(x='unemployment_rate', y='cpi', kind='scatter')
plt.show()
# Use merge_asof() to merge jpm and wells
jpm_wells = pd.merge_asof(jpm,wells,on='date_time', suffixes=('','_wells'), direction='nearest' )
# Use merge_asof() to merge jpm_wells and bac
jpm_wells_bac = pd.merge_asof(jpm_wells, bac, on='date_time', suffixes=('_jpm', '_bac'), direction='nearest')
# Compute price diff
price_diffs = jpm_wells_bac.diff()
print(price_diffs)
# Plot the price diff of the close of jpm, wells and bac only
price_diffs.plot(y=['close_jpm', 'close_wells', 'close_bac'], kind='line')
plt.show()
# Merge gdp and recession on date using merge_asof()
gdp_recession = pd.merge_asof(gdp, recession, on='date')
# Create a list based on the row value of gdp_recession['econ_status']
is_recession = ['r' if s=='recession' else 'g' for s in gdp_recession['econ_status']]
# Plot a bar chart of gdp_recession
gdp_recession.plot(kind='bar', y='gdp', x='date', color=is_recession, rot=90)
plt.show()
# unpivot everything besides the year column
ur_tall = ur_wide.melt(id_vars=['year'], var_name='month', value_name='unempl_rate')
# Create a date column using the month and year columns of ur_tall
ur_tall['date'] = pd.to_datetime(ur_tall['year'] + '-' + ur_tall['month'])
# Sort ur_tall by date in ascending order
ur_sorted = ur_tall.sort_values('date')
# Plot the unempl_rate by date
ur_sorted.plot( x='date', y='unempl_rate',kind='line' )
plt.show()