This is a continuation of Cleaning & transforming data in R.
Reminder: The context for this project is that we're working for a company that sales bicycles and accessories and they need a general dashboard with key insights about their business. (AdventureWorks' dataset)
In this case we are going to take the data that was cleaned and transformed to explore the numbers and see what interesting initial findings can we provide.
Let's load the packages and get the data
library(tidyverse)
library(lubridate)
library(readxl)
products <- read_excel("Data.xlsx", sheet = "products", col_names = TRUE)
customers <- read_excel("Data.xlsx", sheet = "customers", col_names = TRUE)
territory <- read_excel("Data.xlsx", sheet = "territory", col_names = TRUE)
sales <- read_excel("Data.xlsx", sheet = "sales", col_names = TRUE)
categories <- read_excel("Data.xlsx", sheet = "categories", col_names = TRUE)
subcategories <- read_excel("Data.xlsx", sheet = "subcateogries", col_names = TRUE)
Because this is an initial EDA I will formulate a set of general questions about the data that I believe can orientate our analysis and put us on the right path of finding interesting insights.
Let's ask the following:
What are the overall results? (Revenue, Profit, Items sold, Sales by category)
What is the sales performance over time? are there any discernible patterns?
What is the sales performance by territory?
How much each customer spends on average? are there any differences based on territory?
What is the average number of items bought by customers for each territory?
What is the age average and what's the age distribution of customers?
What percentage of customers can be considered regular (more than 1 purchase) and what is the difference in spending between regular and non-regular customers?
What other customers' characteristics are relevant for understanding sales?
Overview of the main KPIs or results:
sales %>%
summarize(items_sold = n(),
avg_revenue_item = floor(mean(SalesAmount)),
total_revenue = floor(sum(SalesAmount)),
gross_profit = floor(sum(GrossProfit)),
net_profit = floor(sum(NetProfit))
)
## Total sales by category
category_key <- products %>% select(ProductKey, ProductCategoryKey)
category_name <- categories %>% select(ProductCategoryKey, CategoryName)
sales %>%
select(ProductKey, SalesAmount) %>%
left_join(category_key, by = "ProductKey") %>%
left_join(category_name, by = "ProductCategoryKey") %>%
select(CategoryName, SalesAmount) %>%
group_by(CategoryName) %>%
summarize(total_sales = floor(sum(SalesAmount)) ) %>%
arrange(-total_sales) %>%
mutate(percentage_of_total = round((total_sales / sum(total_sales) * 100), digits = 1) )
Patterns in sales over time
## Date & Sales
Date_sales <- select(sales, OrderDate, SalesAmount)
## New column with only year & month
Date_sales$Month_OrderDate <- as_date(cut(Date_sales$OrderDate, breaks = 'month'))
## Grouping by year-month and creating fields for % differences between months
sales_month_year <- Date_sales %>%
group_by(Month_OrderDate) %>%
summarize(TotalSales = floor(sum(SalesAmount))) %>%
ungroup() %>%
mutate(nominal_difference = TotalSales - lag(TotalSales),
percentage_difference = round((nominal_difference / lag(TotalSales)) * 100, digits = 1)
)
##Visualizing the evolution of sales over time
ggplot(sales_month_year, aes(Month_OrderDate, TotalSales)) +
geom_line() +
scale_x_date(date_labels = "%b-%y", date_breaks = "3 month") +
labs(x = "Date", y = "Sales (USD)")
##Visualizing the differences in sales between months
ggplot(sales_month_year, aes(Month_OrderDate, percentage_difference)) +
geom_col(color = "grey", fill = "#e74c3c") +
scale_x_date(date_labels = "%b-%y", date_breaks = "3 month") +
labs(tittle = "Difference in sales between months", x = "Date")
Sales by territory
##Sales by country
select(sales, SalesTerritoryKey, SalesAmount) %>%
left_join(select(territory, TerritoryKey, Country), by = c("SalesTerritoryKey" = "TerritoryKey")) %>%
select(-SalesTerritoryKey, Country, SalesAmount) %>%
group_by(Country) %>%
summarize(total_sales_USD = floor(sum(SalesAmount))) %>%
arrange(-total_sales_USD)
## Sales within the United States
select(sales, SalesTerritoryKey, SalesAmount) %>%
left_join(select(territory, TerritoryKey, Region), by = c("SalesTerritoryKey" = "TerritoryKey")) %>%
filter(SalesTerritoryKey < 6) %>% #here we are not considering other territories
select(US_regions = Region, SalesAmount) %>%
group_by(US_regions) %>%
summarize(total_sales_USD = floor(sum(SalesAmount))) %>%
arrange(-total_sales_USD)
Customer spending
## Average spending by customer in each country
spending <- sales %>%
select(SalesTerritoryKey, CustomerKey, SalesAmount) %>%
group_by(SalesTerritoryKey, CustomerKey) %>%
summarize(total_spending = sum(SalesAmount)) %>%
ungroup() %>%
left_join(select(territory, 1, 3), by = c("SalesTerritoryKey" = "TerritoryKey") )
spending %>%
group_by(Country) %>%
summarize(avg_spending_by_customer = floor(mean(total_spending))) %>%
arrange(-avg_spending_by_customer)
Items bought by customers