Skip to content
Joining Data with pandas
Run the hidden code cell below to import a few of the datasets used in this course.
Note: There are a large number of datasets in the datasets/
folder. Many of these are Pickle files, which you can read using pd.read_pickle(path_to_file)
. An example is included in the cell below.
# Import pandas
import pandas as pd
# Import some of the course datasets
actors_movies = pd.read_csv("datasets/actors_movies.csv")
business_owners = pd.read_pickle("datasets/business_owners.p")
casts = pd.read_pickle("datasets/casts.p")
# Preview one of the DataFrames
casts
Take Notes
Add notes here about the concepts you've learned and code cells with code you want to keep.
Add your notes here
# Add your code snippets here
# Import pandas
import pandas as pd
import matplotlib.pyplot as plt
# Import some datasets
gdp = pd.read_csv("datasets/WorldBank_GDP.csv")
pop = pd.read_csv("datasets/WorldBank_POP.csv")
print(gdp)
print(pop)
# Merge gdp and pop on date and country with fill
gdp_pop = pd.merge_ordered(gdp, pop, on=['Country Name','Year'], fill_method='ffill')
# Add a column named gdp_per_capita to gdp_pop that divides the gdp by pop
gdp_pop['gdp_per_capita'] = gdp_pop['GDP'] / gdp_pop['Pop']
# Pivot data so gdp_per_capita, where index is date and columns is country
gdp_pivot = gdp_pop.pivot_table('gdp_per_capita', 'Year', 'Country Name')
# Select dates equal to or greater than 1991-01-01
recent_gdp_pop = gdp_pivot.query('Year>=2010')
# Plot recent_gdp_pop
recent_gdp_pop.plot(rot=90)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# unpivot everything besides the year column
print(ur_wide)
ur_tall = ur_wide.melt(id_vars='year',value_vars=['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec'],var_name='month',value_name='unempl_rate')
print(ur_tall)
# Create a date column using the month and year columns of ur_tall
ur_tall['date'] = pd.to_datetime(ur_tall['year'] + '-' + ur_tall['month'])
print(ur_tall)
# Sort ur_tall by date in ascending order
ur_sorted = ur_tall.sort_values(by='date',ascending=True)
print(ur_sorted)
# Plot the unempl_rate by date
ur_sorted.plot(x='date',y='unempl_rate')
plt.show()
Run cancelled
import pandas as pd
import matplotlib.pyplot as plt
ten_yr=pd.read_csv('')
# Use melt on ten_yr, unpivot everything besides the metric column
print(ten_yr)
bond_perc = ten_yr.melt(id_vars='metric' ,var_name='date',value_name='close')
print(bond_perc)
# Use query on bond_perc to select only the rows where metric=close
bond_perc_close = bond_perc.query('metric=="close"')
# Merge (ordered) dji and bond_perc_close on date with an inner join
dow_bond = pd.merge_ordered(dji,bond_perc_close,on='date',how='inner',suffixes=('_dow','_bond'))
# Plot only the close_dow and close_bond columns
dow_bond.plot(y=['close_dow','close_bond'], x='date', rot=90)
plt.show()