Skip to content

This is a continuation of Cleaning & transforming data in R.

Reminder: The context for this project is that we're working for a company that sales bicycles and accessories and they need a general dashboard with key insights about their business. (AdventureWorks' dataset)

In this case we are going to take the data that was cleaned and transformed to explore the numbers and see what interesting initial findings can we provide.

Let's load the packages and get the data

library(tidyverse)
library(lubridate)
library(readxl)
Hidden output
products <- read_excel("Data.xlsx", sheet = "products", col_names = TRUE)
customers <- read_excel("Data.xlsx", sheet = "customers", col_names = TRUE)
territory <- read_excel("Data.xlsx", sheet = "territory", col_names = TRUE)
sales <- read_excel("Data.xlsx", sheet = "sales", col_names = TRUE)
categories <- read_excel("Data.xlsx", sheet = "categories", col_names = TRUE)
subcategories <- read_excel("Data.xlsx", sheet = "subcateogries", col_names = TRUE)

Because this is an initial EDA I will formulate a set of general questions about the data that I believe can orientate our analysis and put us on the right path of finding interesting insights.

Let's ask the following:

What are the overall results? (Revenue, Profit, Items sold, Sales by category)

What is the sales performance over time? are there any discernible patterns?

What is the sales performance by territory?

How much each customer spends on average? are there any differences based on territory?

What is the average number of items bought by customers for each territory?

What is the age average and what's the age distribution of customers?

What percentage of customers can be considered regular (more than 1 purchase) and what is the difference in spending between regular and non-regular customers?

What other customers' characteristics are relevant for understanding sales?

Overview of the main KPIs or results:

sales %>%
	summarize(items_sold = n(),
              avg_revenue_item = floor(mean(SalesAmount)),
              total_revenue = floor(sum(SalesAmount)),
              gross_profit = floor(sum(GrossProfit)),
              net_profit = floor(sum(NetProfit))
             )

## Total sales by category

category_key <- products %>% select(ProductKey, ProductCategoryKey)

category_name <- categories %>% select(ProductCategoryKey, CategoryName)

sales %>%
	select(ProductKey, SalesAmount) %>%
	left_join(category_key, by = "ProductKey") %>%
	left_join(category_name, by = "ProductCategoryKey") %>%
	select(CategoryName, SalesAmount) %>%
	group_by(CategoryName) %>%
	summarize(total_sales = floor(sum(SalesAmount)) ) %>%
	arrange(-total_sales) %>%
	mutate(percentage_of_total = round((total_sales / sum(total_sales) * 100), digits = 1) )
	
	

Patterns in sales over time

## Date & Sales

Date_sales <- select(sales, OrderDate, SalesAmount)

## New column with only year & month

Date_sales$Month_OrderDate <- as_date(cut(Date_sales$OrderDate, breaks = 'month'))

## Grouping by year-month and creating fields for % differences between months

sales_month_year <- Date_sales %>%
	group_by(Month_OrderDate) %>%
 	summarize(TotalSales = floor(sum(SalesAmount))) %>%
	ungroup() %>%
	mutate(nominal_difference = TotalSales - lag(TotalSales),
           percentage_difference = round((nominal_difference / lag(TotalSales)) * 100, digits = 1)
           )

##Visualizing the evolution of sales over time

ggplot(sales_month_year, aes(Month_OrderDate, TotalSales)) +
	geom_line() +
	scale_x_date(date_labels = "%b-%y", date_breaks = "3 month") +
	labs(x = "Date", y = "Sales (USD)")

##Visualizing the differences in sales between months

ggplot(sales_month_year, aes(Month_OrderDate, percentage_difference)) +
	geom_col(color = "grey", fill = "#e74c3c") +
	scale_x_date(date_labels = "%b-%y", date_breaks = "3 month") +
	labs(tittle = "Difference in sales between months", x = "Date")
	

Sales by territory

##Sales by country

select(sales, SalesTerritoryKey, SalesAmount) %>%
	left_join(select(territory, TerritoryKey, Country), by = c("SalesTerritoryKey" = "TerritoryKey")) %>%
	select(-SalesTerritoryKey, Country, SalesAmount) %>%
	group_by(Country) %>%
	summarize(total_sales_USD = floor(sum(SalesAmount))) %>%
	arrange(-total_sales_USD)

## Sales within the United States

select(sales, SalesTerritoryKey, SalesAmount) %>%
	left_join(select(territory, TerritoryKey, Region), by = c("SalesTerritoryKey" = "TerritoryKey")) %>%
	filter(SalesTerritoryKey < 6) %>% #here we are not considering other territories
	select(US_regions = Region, SalesAmount) %>%
	group_by(US_regions) %>%
	summarize(total_sales_USD = floor(sum(SalesAmount))) %>%
	arrange(-total_sales_USD)
	
	

Customer spending

## Average spending by customer in each country

spending <- sales %>%
	select(SalesTerritoryKey, CustomerKey, SalesAmount) %>%
	group_by(SalesTerritoryKey, CustomerKey) %>%
	summarize(total_spending = sum(SalesAmount)) %>%
	ungroup() %>%
	left_join(select(territory, 1, 3), by = c("SalesTerritoryKey" = "TerritoryKey") )

spending %>%
	group_by(Country) %>%
 	summarize(avg_spending_by_customer = floor(mean(total_spending))) %>%
	arrange(-avg_spending_by_customer)



Items bought by customers