Skip to content
Joining Data with pandas
  • AI Chat
  • Code
  • Report
  • Joining Data with pandas

    Run the hidden code cell below to import a few of the datasets used in this course.

    Note: There are a large number of datasets in the datasets/ folder. Many of these are Pickle files, which you can read using pd.read_pickle(path_to_file). An example is included in the cell below.

    # Import pandas
    import pandas as pd
    
    # Import some of the course datasets 
    actors_movies = pd.read_csv("datasets/actors_movies.csv")
    business_owners = pd.read_pickle("datasets/business_owners.p")
    casts = pd.read_pickle("datasets/casts.p")
    
    # Preview one of the DataFrames
    casts

    Take Notes

    Add notes here about the concepts you've learned and code cells with code you want to keep.

    Add your notes here

    # Add your code snippets here
    
    # Import pandas
    import pandas as pd
    import matplotlib.pyplot as plt
    
    # Import some datasets 
    gdp = pd.read_csv("datasets/WorldBank_GDP.csv")
    pop = pd.read_csv("datasets/WorldBank_POP.csv")
    
    print(gdp)
    print(pop)
    
    # Merge gdp and pop on date and country with fill
    gdp_pop = pd.merge_ordered(gdp, pop, on=['Country Name','Year'], fill_method='ffill')
    
    # Add a column named gdp_per_capita to gdp_pop that divides the gdp by pop
    gdp_pop['gdp_per_capita'] = gdp_pop['GDP'] / gdp_pop['Pop']
    
    # Pivot data so gdp_per_capita, where index is date and columns is country
    gdp_pivot = gdp_pop.pivot_table('gdp_per_capita', 'Year', 'Country Name')
    
    # Select dates equal to or greater than 1991-01-01
    recent_gdp_pop = gdp_pivot.query('Year>=2010')
    
    # Plot recent_gdp_pop
    recent_gdp_pop.plot(rot=90)
    plt.show()
    import pandas as pd
    import matplotlib.pyplot as plt
    
    # unpivot everything besides the year column
    print(ur_wide)
    ur_tall = ur_wide.melt(id_vars='year',value_vars=['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec'],var_name='month',value_name='unempl_rate')
    
    print(ur_tall)
    # Create a date column using the month and year columns of ur_tall
    ur_tall['date'] = pd.to_datetime(ur_tall['year'] + '-' + ur_tall['month'])
    
    print(ur_tall)
    
    # Sort ur_tall by date in ascending order
    ur_sorted = ur_tall.sort_values(by='date',ascending=True)
    
    print(ur_sorted)
    
    # Plot the unempl_rate by date
    ur_sorted.plot(x='date',y='unempl_rate')
    plt.show()
    Run cancelled
    import pandas as pd
    import matplotlib.pyplot as plt
    
    ten_yr=pd.read_csv('')
    
    # Use melt on ten_yr, unpivot everything besides the metric column
    print(ten_yr)
    bond_perc = ten_yr.melt(id_vars='metric' ,var_name='date',value_name='close')
    print(bond_perc)
    # Use query on bond_perc to select only the rows where metric=close
    bond_perc_close = bond_perc.query('metric=="close"')
    
    # Merge (ordered) dji and bond_perc_close on date with an inner join
    dow_bond = pd.merge_ordered(dji,bond_perc_close,on='date',how='inner',suffixes=('_dow','_bond'))
    
    
    # Plot only the close_dow and close_bond columns
    dow_bond.plot(y=['close_dow','close_bond'], x='date', rot=90)
    plt.show()