Skip to content
Course Notes: Reshaping Data with pandas
  • AI Chat
  • Code
  • Report
  • Spinner

    Course Notes

    Use this workspace to take notes, store code snippets, and build your own interactive cheatsheet!

    Note that the data from the course is not yet added to this workspace. You will need to navigate to the course overview page, download any data you wish to use, and add it to the file browser.

    # Import any packages you want to use here
    

    Take Notes

    Add notes here about the concepts you've learned and code cells with code you want to keep.

    Add your notes here

    # Add your code snippets here
    # Summarize fifa_players: one row per (nationality, club) pair and one
    # column group per year, keeping the maximum of each numeric column.
    # margins=True appends an "All" aggregate row and column.
    fifa_mean = fifa_players.pivot_table(
        margins=True,
        aggfunc='max',
        columns='year',
        index=['nationality', 'club'],
    )
    
    # Reshape books_brown from wide to long: wide columns are named like
    # 'language_<word>' / 'publisher_<word>', so each stub is separated from
    # a word suffix by an underscore.
    # FIX: suffix is a regular expression — use a raw string so '\w' is not
    # parsed as an (invalid) string escape, which raises SyntaxWarning on
    # Python 3.12+.
    the_code_long = pd.wide_to_long(books_brown,
                                    stubnames=['language', 'publisher'],
                                    i=['author', 'title'],
                                    j='code',
                                    sep='_',
                                    suffix=r'\w+')

    # Print the_code_long
    print(the_code_long)
    
    # Reset the index of books_hunger in place; drop=False keeps the old
    # index values as a regular column.
    books_hunger.reset_index(drop=False, inplace=True)

    # Reshape to long format: title/language form the index, 'feature' is the
    # new variable name, and wide columns look like 'publication <word>' /
    # 'page <word>' (stub and suffix separated by a space).
    # FIX: suffix is a regular expression — use a raw string so '\w' is not
    # parsed as an (invalid) string escape (SyntaxWarning on Python 3.12+).
    publication_features = pd.wide_to_long(books_hunger,
                                           stubnames=["publication", "page"],
                                           i=["title", "language"],
                                           j="feature",
                                           sep=" ",
                                           suffix=r"\w+")

    # Print publication_features
    print(publication_features)
    # Build a combined title of the form "<title> and <subtitle>".
    hp_books['full_title'] = hp_books['title'].str.cat(
        hp_books['subtitle'], sep=" and ")

    # The 'authors' column holds "writer/illustrator"; split it into two
    # separate columns.
    split_authors = hp_books['authors'].str.split('/', expand=True)
    hp_books[['writer', 'illustrator']] = split_authors

    # Gather the goodreads and amazon rating columns into long format: a
    # 'source' column names the site, a 'rating' column carries the value.
    hp_melt = hp_books.melt(
        value_vars=["goodreads", "amazon"],
        id_vars=["full_title", "writer"],
        var_name="source",
        value_name="rating",
    )

    # Show the reshaped DataFrame.
    print(hp_melt)
    # Split 'main_title' at the colon into separate title/subtitle columns.
    books_sh[['title', 'subtitle']] = books_sh['main_title'].str.split(':', expand=True)

    # 'version' looks like "<word> <volume>"; keep only the second token.
    books_sh['volume'] = books_sh['version'].str.split(' ').str.get(1)

    # Drop the now-redundant source columns, modifying books_sh in place.
    books_sh.drop(['main_title', 'version'], axis=1, inplace=True)

    # Reshape to long format: title/subtitle/volume form the index, 'feature'
    # is the new variable, and wide columns look like 'number_<word>' (stub
    # and word suffix separated by an underscore).
    # FIX: suffix is a regular expression — use a raw string so '\w' is not
    # parsed as an (invalid) string escape (SyntaxWarning on Python 3.12+).
    sh_long = pd.wide_to_long(books_sh, stubnames="number",
                              i=["title", "subtitle", "volume"],
                              j="feature", sep="_", suffix=r"\w+")

    # Print sh_long
    print(sh_long)

    Stacking/Unstacking DataFrames

    # Parallel state/city arrays that will become a two-level row index.
    new_index = [['California', 'California', 'New York', 'Ohio'], 
                 ['Los Angeles', 'San Francisco', 'New York', 'Cleveland']]

    # Build a hierarchical index from the arrays and label its two levels.
    level_names = ['state', 'city']
    churn_new = pd.MultiIndex.from_arrays(new_index, names=level_names)

    # Attach the new MultiIndex to churn.
    churn.index = churn_new

    # Pivot the innermost column level down into the row index.
    churn_stack = churn.stack()

    # Show the stacked result.
    print(churn_stack)
    
    # Move the state and city columns into the row index, in place.
    churn.set_index(['state', 'city'], inplace=True)

    # Stack by position: push the second column level (level=1) into rows.
    churn_stack = churn.stack(level=1)
    # Show the positional stack.
    print(churn_stack)

    # Stack by name instead of position: the 'feature' column level.
    churn_feature = churn.stack(level="feature")
    # Show the name-based stack.
    print(churn_feature)
    
    # Move the 'time' row level up into the columns, then order the
    # remaining row index in descending order.
    churn_time = churn.unstack(level='time')
    churn_time = churn_time.sort_index(ascending=False)
    # Show the unstacked, sorted frame.
    print(churn_time)

    # Unstack the 'type' row level into the columns...
    churn_type = churn.unstack(level="type")
    # ...then push the first (outermost) column level back into the rows.
    churn_final = churn_type.stack(level=0)
    # Show the round-tripped frame.
    print(churn_final)
    
    # Swap the first (0) and third (2) row index levels of churn.
    churn_swap = churn.swaplevel(0, 2)
    # Then move the innermost row level up into the columns.
    churn_unstack = churn_swap.unstack(level=-1)
    # Show the swapped/unstacked frame.
    print(churn_unstack)

    # Move the first two row levels into the columns in a single call.
    churn_unstack = churn.unstack(level=[0, 1])
    # Bring the 'plan' and 'year' column levels back down into the rows.
    churn_py = churn_unstack.stack(['plan', 'year'])
    # Swap the first two column levels (axis=1 selects columns).
    churn_switch = churn_py.swaplevel(0, 1, axis=1)
    # Show the final layout.
    print(churn_switch)
    # Pivot the 'churn' row level into the columns, substituting 0 for any
    # combination that has no data.
    churn = churn.unstack(level="churn", fill_value=0)

    # Order rows by voice_mail_plan (descending), then by
    # international_plan (ascending).
    churn_sorted = churn.sort_index(
        ascending=[False, True],
        level=["voice_mail_plan", "international_plan"],
    )
    # Show the sorted frame and observe the pattern.
    print(churn_sorted)

    # Stack the 'scope' column level, keeping rows that would otherwise be
    # dropped for being all-missing...
    churn_stack = churn.stack(level="scope", dropna=False)
    # ...and replace the resulting NaNs with zero.
    churn_fill = churn_stack.fillna(0)
    # Show the filled frame.
    print(churn_fill)

    Statistics

    # Stack the outermost column level, sum across the remaining columns,
    # then unstack the second row level.
    stacked = obesity.stack(level=0)
    obesity_sum = stacked.sum(axis=1).unstack(level=1)
    # Show the per-group sums.
    print(obesity_sum)

    # Per-country mean: stack 'country' into the rows, group by it, average.
    obesity_mean = obesity.stack(level="country").groupby("country").mean()
    # Show the means.
    print(obesity_mean)

    # Same grouping, but take the median instead of the mean.
    obesity_median = obesity.stack(level="country").groupby("country").median()
    # Show the medians.
    print(obesity_median)

    Transforming a list-like column with explode()

    # Expand each list in 'bounds' so every element gets its own row; the
    # original row index is repeated for each element.
    obesity_bounds = obesity['bounds'].explode()

    # Join the exploded bounds back onto country/perc_obesity via the index.
    obesity_final = obesity[["country", "perc_obesity"]].merge(
        obesity_bounds, left_index=True, right_index=True)
    # Show the merged result.
    print(obesity_final)

    # Faster method: explode the list-like column on the whole DataFrame.
    obesity_explode = obesity.explode("bounds")

    # Renumber the rows 0..n-1, discarding the duplicated index, in place.
    obesity_explode.reset_index(drop=True, inplace=True)

    # Show the exploded frame.
    print(obesity_explode)

    #--
    # 'bounds' holds strings like "a-b": split on the dash so each cell
    # becomes a two-element list, assigned back under the same column name.
    obesity_split = obesity.assign(bounds=obesity['bounds'].str.split('-'))

    # Show the frame with list-valued bounds.
    print(obesity_split)

    # Split and explode in a single chained expression.
    obesity_split = (obesity
                     .assign(bounds=obesity['bounds'].str.split('-'))
                     .explode("bounds"))

    # Show one row per bound value.
    print(obesity_split)

    Reading nested data into a DataFrame (i.e. JSON)