Skip to content
<Python> Data Wrangling (Demo)
Clean multilevel table
import sys
import json
import ast # handle single/double quote in dict/json string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', None)
%matplotlib inline
%config InlineBackend.figure_format='retina'df = pd.read_excel('https://github.com/prasertcbs/basic-dataset/raw/master/starbucks_bakery.xlsx')
dfdf = df.dropna(subset=['Product Name']).reset_index(drop=True) # drop Null-values in all columns by rows
dfdf.info()df.loc[0] # this is what we don't likedf.loc[1] # this is what we wantnp.isnan(df.loc[0, 'Calories'])np.isnan(df.loc[1, 'Calories'])df['Category'] = df.apply(lambda r: r['Product Name'] if np.isnan(r['Calories']) else np.nan, axis=1)
dfdf['Category'] = df['Category'].ffill()
dfdf = df.dropna(subset=['Calories']).reset_index(drop=True)
dfdf.columnsdf = df[['Category', 'Product Name', 'Label Wt (g)', 'Calories', 'Total fat (g)',
'Saturated Fat (g)', 'Trans Fat (g)', 'Cholesterol (mg)',
'Sodium (mg)', 'Carbohydrates (g)', 'Fiber (g)', 'Sugar (g)',
'Protein (g)', 'Vitamin A (%DV)', 'Vitamin C (%DV)', 'Calcium (%DV)',
'Iron (%DV)']]
df
# df.to_excel('bakery_clean.xlsx', index=False)df.groupby('Category').describe()