Skip to content
import pandas as pd
import matplotlib as plt 
import seaborn as sns
import numpy as np
%matplotlib inline

df = pd.read_csv('Sales Dataset.csv', engine='python', on_bad_lines='skip')
df.head()
#Removing rows with null values 
df_clean=df.dropna()
print(df_clean.head())
#Adding a cost column - Profit = Revenue - Cost therefore Cost = Revenue - Profit
df_clean['Cost']=df_clean['Sales'] - df_clean['Profit']
print(df_clean['Cost'].head())
#Counting the number of unique products by region
unique_products=df_clean['Product Name'].unique()
unique_products_count=df_clean['Product Name'].nunique() 
# there are 1364 distinct products in this dataFrame
# Extract month from Order Date Column - This will help visualize seasonality
df_clean['Order Date']= pd.to_datetime(df_clean['Order Date'])
df_clean['Month'] = df_clean['Order Date'].dt.month
df_clean['Year'] = df_clean['Order Date'].dt.year
df_clean['Month'].head()
df_clean['Year'].head()
# Visualizing Sales by Category by Region
title = 'Sales by Product Category by Region'
barplot = sns.barplot(x= df_clean['Sales'], y=df_clean['Category'], hue= df_clean["Region"], data=df_clean)
barplot.set_title(title)
# Sales by product categories by region
sales_by_cat_region = pd.pivot_table(data=df_clean, values=['Sales'], index=['Category'], columns=['Region'], aggfunc=np.sum, sort=True)
print(sales_by_cat_region)
# Sales by Category by Region
title = 'Sales by Product Category by Region'
e=sns.heatmap(data=sales_by_cat_region, linewidths=2)
e.set_title(title)
#Conclusion : technology products don't sell well inEast and in West regions - Furniture don't sell as well in the West , office supplies as well in the west
# drilling down to Sales by product sub categories by region
sales_by_subcat_region = pd.pivot_table(data=df_clean, values=['Sales'], index=['Sub-Category'], columns=['Region'], aggfunc=np.sum, sort=True)
print(sales_by_subcat_region)
heatmap=sns.heatmap(data=sales_by_subcat_region, linewidths=2)
heatmap.set_title('Comparison of Products Sales Performance by Region')
#Conclusion : technology products don't sell well in East and in West regions - Furniture don't sell as well in the West , office supplies as well in the west
# Average quantity ordered by subcategory
quant_per_month_subcat = pd.pivot_table(data=df_clean, values=['Quantity'], index=['Month'], columns=['Sub-Category','Region'], aggfunc=np.mean, sort=True) 
quant_per_month_subcat
# Correlation between cost, sales and profit 
corr_df=df_clean[['Cost','Sales','Profit']]
matrix=corr_df.corr()
plot = sns.heatmap(data=matrix, annot=True, linewidths=2)
plot.set_title('Correlation heatmap between cost, sales and profit')
# Cost increases profit naturally always decreases, when sales increases costs increase, when sales increase profit increases mildly because sales generate costs
# maybe the costs of running the business are high
# Visualizing Cost by Category 
title = 'Total Cost by Product Category by Region'
sns.barplot(x= df_clean['Cost'], y=df_clean['Category'], hue= df_clean["Region"], data=df_clean)
# Conclusion : Costs are pretty much equivalent accross the Furniture and the office Supplies categories 
# For Technology category costs tend to be much higher in the South and in the East 
# Visualizing total profit by Category by Region
sns.barplot(x= df_clean['Profit'], y=df_clean['Sub-Category'], hue= df_clean["Region"], data=df_clean)
# t.fig.suptitle('Profit by Product Category by Region',y=1.03)
# identifying negative profit 
negative_profit = df_clean[df_clean['Profit']<0]
Selected_cols = ['Category', 'Region', 'Profit']
negative_profit_select = negative_profit[Selected_cols]
print(negative_profit_select.head())