sql-e commerce analysis

Unknown table

DataFrameas

df

variable

-- How many orders are there overall?
SELECT COUNT(*)
FROM shipping_ecommerce.csv;

DataFrameas

df2

variable

-- Which shipment modes are used the most?
SELECT Mode_of_Shipment, COUNT (*)
FROM shipping_ecommerce.csv
GROUP BY Mode_of_Shipment
ORDER BY Mode_of_Shipment DESC;

DataFrameas

df3

variable

-- What’s the average discount offered per shipment mode?
SELECT Mode_of_Shipment, ROUND(AVG(Discount_offered), 2) AS avg_discount
FROM shipping_ecommerce.csv
GROUP BY Mode_of_Shipment;

DataFrameas

df4

variable

-- Which warehouse blocks ship the most items?
SELECT Warehouse_block, count(*) AS orders_shipped
FROM shipping_ecommerce.csv
GROUP BY Warehouse_block
ORDER BY orders_shipped DESC
LIMIT 1;

DataFrameas

df5

variable

-- Is there any pattern between customer care calls and product return (Class = 1)?
SELECT Customer_care_calls, count(*) AS orders,
       SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END) AS returns,
       SUM(CASE WHEN Class = 0 THEN 1 ELSE 0 END) AS no_return,
       ROUND(100.0 * SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END)/COUNT(*), 2) AS return_rate
FROM shipping_ecommerce.csv
GROUP BY Customer_care_calls
ORDER BY Customer_care_calls;

DataFrameas

df6

variable

-- Which customer ratings are associated with returns vs. no returns?
SELECT Customer_rating, 
       count(*) AS orders,
       SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END) AS returns,
       ROUND(100.0 * SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END)/COUNT(*), 2) AS return_rate
FROM shipping_ecommerce.csv
GROUP BY Customer_rating
ORDER BY Customer_rating;

DataFrameas

df7

variable

-- Do different genders show different return behavior?
SELECT Gender, 
		count(*) AS orders,
       SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END) AS returns,
       ROUND(100.0 * SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END)/COUNT(*), 2) AS return_rate
FROM shipping_ecommerce.csv
GROUP BY Gender
ORDER BY Gender;

DataFrameas

df8

variable

-- Is there a connection between prior purchases and returns?
SELECT Prior_purchases,
		count(*) AS orders,
       SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END) AS returns,
       ROUND(100.0 * SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END)/COUNT(*), 2) AS return_rate
FROM shipping_ecommerce.csv
GROUP BY Prior_purchases
ORDER BY Prior_purchases;

DataFrameas

df9

variable

SELECT
  CASE
    WHEN Weight_in_gms < 1000 THEN '< 1kg'
    WHEN Weight_in_gms BETWEEN 1000 AND 1999 THEN '1kg - 2kg'
    WHEN Weight_in_gms BETWEEN 2000 AND 2999 THEN '2kg - 3kg'
    WHEN Weight_in_gms BETWEEN 3000 AND 3999 THEN '3kg - 4kg'
	WHEN Weight_in_gms BETWEEN 4000 AND 4999 THEN '4kg - 5kg'
	WHEN Weight_in_gms BETWEEN 5000 AND 5999 THEN '5kg - 6kg'
	WHEN Weight_in_gms BETWEEN 6000 AND 6999 THEN '6kg - 7kg'
    ELSE '7kg - 8kg'
  END AS weight,
  COUNT(*) AS orders,
  SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END) AS returns,
  ROUND(100.0 * SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END) / COUNT(*), 2) AS return_rate
FROM shipping_ecommerce.csv
GROUP BY weight
ORDER BY weight;

DataFrameas

df10

variable

-- Which product importance level (low, medium, high) gets returned the most?
SELECT Product_importance,
		count(*) AS orders,
       SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END) AS returns,
       ROUND(100.0 * SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END)/COUNT(*), 2) AS return_rate
FROM shipping_ecommerce.csv
GROUP BY Product_importance
ORDER BY return_rate DESC;

DataFrameas

df11

variable

-- Which combination of Warehouse Block and Product Importance has the highest return rate?
-- (use cube)
SELECT 
	Warehouse_block,
	Product_importance,
	ROUND(100.0 * SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END)/COUNT(*), 2) AS return_rate
FROM shipping_ecommerce.csv
GROUP BY CUBE (Warehouse_block, Product_importance);

DataFrameas

df12

variable

-- What is the average weight of returned vs non-returned products, and does product weight significantly affect returns?
SELECT 
  Class,
  COUNT(*) AS total_orders,
  ROUND(AVG(Weight_in_gms),2) AS avg_weight
FROM shipping_ecommerce.csv
GROUP BY Class;

DataFrameas

df13

variable

-- Are higher discounts associated with higher return rates?
SELECT
  CASE
    WHEN Discount_offered BETWEEN 1 AND 10 THEN 'low (1–10%)'
    WHEN Discount_offered BETWEEN 11 AND 15 THEN 'mid (11–15%)'
    WHEN Discount_offered BETWEEN 16 AND 30 THEN 'high (16–30%)'
    ELSE 'extreme (31%+)' 
  END AS discount_band,
  COUNT(*) AS orders,
  SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END) AS returns,
  ROUND(100.0 * SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END) / COUNT(*),2) AS return_rate
FROM shipping_ecommerce.csv
GROUP BY discount_band
ORDER BY 
  CASE 
    WHEN discount_band = 'low (1–10%)' THEN 1
    WHEN discount_band = 'mid (11–15%)' THEN 2
    WHEN discount_band = 'high (16–30%)' THEN 3
    ELSE 4
  END;

-- a basic query showing just return rate w individual discounts ordered by discounts showed that till 10% discount the return rate varied and post 10% discount it was always 100%, this could be either due to a problem in the dataset or because only defected products are shipped out on a high discount.

DataFrameas

df14

variable

-- Do repeat customers (with prior purchases > 0) return products more or less than first-time buyers?
SELECT 
	CASE 
		WHEN Prior_purchases >= 1 THEN 'return customer'
		ELSE 'first time customer'
	END AS customer_type,
	ROUND(100.0 * SUM(CASE WHEN Class = 1 THEN 1 ELSE 0 END) / COUNT(*), 2) AS return_rate,
	COUNT(*) AS orders
FROM shipping_ecommerce.csv
GROUP BY customer_type;

-- no first time customers

‌
‌
‌