Skip to content
DA Certification Case Study
Data Analyst Certification Case Study
Office Supplies
- Pens & Printers is a national office supplies chain. At the moment, they send office supplies out of warehouses in four regions: East, West, South, and Central, and all four warehouses stock the same products. The Head of Sales thinks this leads to large amounts of unsold products in some locations. The management at Pens & Printers would like you to look at the data and present to them the popularity of products in each region. Are there products that do not sell in some locations? Are there any other patterns over time in each region that you can find in the data?
1. Data Overview & Wrangling
# Import needed libraries for analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load data from csv
df = pd.read_csv('data/office_supplies.csv')
# Printed explicitly so the preview is shown even when this is not the last
# expression of a notebook cell.
print(df.head())

# explore data & get overview
df.info()
print(df.describe())

# modify data types of different columns
# pd.to_datetime is the documented way to parse a text column into datetime64.
df['Order Date'] = pd.to_datetime(df['Order Date'])
# Every remaining object column is converted to pandas' dedicated string dtype
# in a single astype call instead of a per-column loop.
text_columns = df.select_dtypes(include='object').columns
df = df.astype({column: 'string' for column in text_columns})
df.info()

# update all data columns title to be lower case and without spaces
# regex=False: ' ' and '-' are literal characters, not patterns.
df.columns = (
    df.columns.str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('-', '_', regex=False)
)
df.info()

# fill null values with zero
# Only numeric columns are zero-filled. An integer 0 is not a meaningful value
# for the string and datetime columns created above; filling it there can
# raise or silently change the column's dtype, which would break the `.dt`
# access below.
numeric_columns = df.select_dtypes(include='number').columns
df = df.fillna({column: 0 for column in numeric_columns})

# add new column year
df['year'] = df['order_date'].dt.year
df.info()
2. Exploratory Data Analysis
2.1 Univariate EDA
# view distribution of sales in different regions
# One box plot per region, side by side, so the spread and outliers of the
# `sales` column can be compared across regions.
regions = df['region'].unique()
# The grid is sized from the data rather than hard-coded to four panels, so
# zip() below cannot silently drop a region. max(..., 1) keeps subplots() valid
# for an empty frame; squeeze=False keeps `axes` 2-D even with a single panel.
fig, axes = plt.subplots(
    nrows=1, ncols=max(len(regions), 1), figsize=(30, 10), squeeze=False
)
for ax, region in zip(axes[0], regions):
    sns.boxplot(data=df[df['region'] == region], x='sales', ax=ax)
    ax.set_title(str(region))
# view distribution of profits in different regions
# One box plot per region, side by side, so the spread and outliers of the
# `profit` column can be compared across regions.
regions = df['region'].unique()
# The grid is sized from the data rather than hard-coded to four panels, so
# zip() below cannot silently drop a region. max(..., 1) keeps subplots() valid
# for an empty frame; squeeze=False keeps `axes` 2-D even with a single panel.
fig, axes = plt.subplots(
    nrows=1, ncols=max(len(regions), 1), figsize=(30, 10), squeeze=False
)
for ax, region in zip(axes[0], regions):
    sns.boxplot(data=df[df['region'] == region], x='profit', ax=ax)
    ax.set_title(str(region))
# View count of sales transactions for each category in each region
# One count plot per region, side by side: each bar is the number of rows
# (transactions) recorded for a `category` value in that region.
regions = df['region'].unique()
# The grid is sized from the data rather than hard-coded to four panels, so
# zip() below cannot silently drop a region. max(..., 1) keeps subplots() valid
# for an empty frame; squeeze=False keeps `axes` 2-D even with a single panel.
fig, axes = plt.subplots(
    nrows=1, ncols=max(len(regions), 1), figsize=(30, 10), squeeze=False
)
for ax, region in zip(axes[0], regions):
    sns.countplot(data=df[df['region'] == region], x='category', ax=ax)
    ax.set_title(str(region))
# View count of sales transactions for each sub-category in each region
# One count plot per region, side by side: each bar is the number of rows
# (transactions) recorded for a `sub_category` value in that region.
regions = df['region'].unique()
# The grid is sized from the data rather than hard-coded to four panels, so
# zip() below cannot silently drop a region. max(..., 1) keeps subplots() valid
# for an empty frame; squeeze=False keeps `axes` 2-D even with a single panel.
fig, axes = plt.subplots(
    nrows=1, ncols=max(len(regions), 1), figsize=(30, 10), squeeze=False
)
for ax, region in zip(axes[0], regions):
    sns.countplot(data=df[df['region'] == region], x='sub_category', ax=ax)
    ax.set_title(str(region))
    # Rotate the x tick labels on every panel (inside the loop, not just on
    # the last axis) so the sub-category names do not overlap.
    ax.tick_params(axis='x', rotation=60)