<Python> Data Wrangling (Demo)

Clean multilevel table

import sys
import json
import ast # handle single/double quote in dict/json string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', None)
%matplotlib inline
%config InlineBackend.figure_format='retina'

df = pd.read_excel('https://github.com/prasertcbs/basic-dataset/raw/master/starbucks_bakery.xlsx')
df

df = df.dropna(subset=['Product Name']).reset_index(drop=True) # drop Null-values in all columns by rows
df

df.info()

df.loc[0] # this is what we don't like

df.loc[1] # this is what we want

np.isnan(df.loc[0, 'Calories'])

np.isnan(df.loc[1, 'Calories'])

df['Category'] = df.apply(lambda r: r['Product Name'] if np.isnan(r['Calories']) else np.nan, axis=1)
df

df['Category'] = df['Category'].ffill()
df

df = df.dropna(subset=['Calories']).reset_index(drop=True)
df

df.columns

df = df[['Category', 'Product Name', 'Label Wt (g)', 'Calories', 'Total fat (g)',
       'Saturated  Fat (g)', 'Trans Fat (g)', 'Cholesterol (mg)',
       'Sodium (mg)', 'Carbohydrates (g)', 'Fiber (g)', 'Sugar (g)',
       'Protein (g)', 'Vitamin A (%DV)', 'Vitamin C (%DV)', 'Calcium (%DV)',
       'Iron (%DV)']]
df

# df.to_excel('bakery_clean.xlsx', index=False)

df.groupby('Category').describe()

‌
‌
‌

<Python> Data Wrangling (Demo)

.mfe-app-workspace-kj242g{position:absolute;top:-8px;}.mfe-app-workspace-11ezf91{display:inline-block;}.mfe-app-workspace-11ezf91:hover .Anchor__copyLink{visibility:visible;}Clean multilevel table

Clean multilevel table