Skip to content
Online Shop Customer Sales Data - SQL+Python
The dataset analyze some data about the customers of an online store in the year 2021. Data covers items such as total purchases, time on site, etc.
#Import libraries and dataset in Python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
customers = pd.read_csv ('Online Shop Customer Sales Data.csv')customers.info()
#No null values, most of the data are integersduplicates = customers.duplicated()
print (duplicates)
#No duplicatescustomers.shape
#Our dataset is composed of 65796 rows and 12 columnscustomers.head()#As we see above some columns contain numbers instead of the effective value (for example the Gender column contains 0 and 1 instead of Male and Female), let's replace this kind of values with their effective value
customers['Gender'].replace(0,'Male', inplace = True)
customers['Gender'].replace(1,'Female', inplace = True)
customers['Pay_Method'].replace(0,'Digital Wallets', inplace = True)
customers['Pay_Method'].replace(1,'Card', inplace = True)
customers['Pay_Method'].replace(2,'Paypal', inplace = True)
customers['Pay_Method'].replace(3,'Other', inplace = True)
customers['Browser'].replace(0,'Chrome', inplace = True)
customers['Browser'].replace(1,'Safari', inplace = True)
customers['Browser'].replace(2,'Edge', inplace = True)
customers['Browser'].replace(3,'Other', inplace = True)
customers.head()
#Let's see in percentage how many male and female customer we have in our dataset
# Set the background color and figsize
plt.style.use('dark_background')
plt.figure(figsize = (6, 6))
# Gender value count
gender_counts = customers['Gender'].value_counts()
# Create the pie chart
fig, ax = plt.subplots()
ax.pie(gender_counts, labels = gender_counts.index, autopct='%1.1f%%', startangle=90, colors=['#007fff', '#FF40B4'])
# Add title and show
ax.set_title('Gender Distribution')
plt.show()
#As we see below 67% of the customers are Female, 33% are male#Let's see which browsers are most commonly used
# Set the background color and figsize
plt.style.use('dark_background')
plt.figure(figsize = (6, 6))
# Browser value count
browser_counts = customers['Browser'].value_counts()
# Create the pie chart
fig, ax = plt.subplots()
ax.pie(browser_counts, labels = browser_counts.index, autopct='%1.1f%%', startangle=90,colors=['#007fff', '#FF40B4','#A2CD5A','#FF7F00'])
# Add title and show
ax.set_title('Browser Distribution')
plt.show()
#Most customers use Chrome for their purchases (64%), followed by Safari (20%), the rest of the customers use the Edge browser or other browsers (Mozilla, Opera ecc) and only 5% of the customers use the browser Edge#Let's see, in percentage, the use of various payment methods
plt.style.use('dark_background')
plt.figure(figsize = (6, 6))
# Payment value count
payment_counts = customers['Pay_Method'].value_counts()
# Create the pie chart
fig, ax = plt.subplots()
ax.pie(payment_counts, labels = payment_counts.index, autopct='%1.1f%%', startangle=90,colors=['#007fff', '#FF40B4','#A2CD5A','#FF7F00'])
# Add title and show
ax.set_title('Payment Distribution')
plt.show()
#As we see Card and Digital Wallets are the cost common methods of payment on the website (togheter they represents almost 60% of the used pay methods)
DataFrameas
df
variable
--Let's see the revenue by month and the total number of purchases
SELECT MONTHNAME(Purchase_DATE) AS Month, ROUND(SUM(Revenue_Total),0) AS Month_Revenue, SUM(N_Purchases) AS Month_Number_Purchases
FROM 'Online Shop Customer Sales Data.csv'
GROUP BY Month
ORDER BY Month_Revenue DESC
/*The month with most sales is january (almost 160 thousand $ sales) while the month with less sales is february (138 thousand $ sales).
The column "Month_Purchases" shows the total number of purchases by month, here we have an interesting insight: for example we notice that in September there was 22229 units sold (with 151 thousand $ of revenue) while in March the units sold was 21620 (with 153 thousand $ of revenue): this means that products with a higher price were purchased in March than in September, which closed the gap in sales volume*/DataFrameas
df1
variable
--Let's see now the time spent (in hours) on the site by month
SELECT MONTHNAME(Purchase_DATE) AS Month, SUM(Time_Spent)/60 AS Time_Spent_In_Hours
FROM 'Online Shop Customer Sales Data.csv'
GROUP BY Month
ORDER BY Time_Spent_In_Hours DESC
--By confronting the results below with the results above we see that, in general, there's a correlation between the time spent on the site and the revenue of the month (more time spent on the site brings more sales)#Let's now see the percentage of customers who did or did not purchase with a voucher
plt.style.use('dark_background')
plt.figure(figsize = (6, 6))
# Voucher value count
voucher_counts = customers['Voucher'].value_counts()
labels = ["voucher" if val == 1 else "no voucher" for val in voucher_counts.index]
# Create the pie chart
fig, ax = plt.subplots()
ax.pie(voucher_counts, labels=labels, autopct='%1.1f%%', startangle=90,colors=['#007fff', '#FF40B4'])
# Add title and show
ax.set_title('Voucher Distribution')
plt.show()
#Three-quarters of purchases were made without vouchers, while the remainder were made through vouchers.A good strategy to increase the number of consumers who buy through vouchers could be to offer vouchers with larger discounts or offer more specific vouchers based on customers' shopping habits#Let's now see, in percentage, how many customers are subscribed to the site's newsletter and how many are not
plt.style.use('dark_background')
plt.figure (figsize= (6,6))
# Newsletter value count
newsletter_counts = customers['Newsletter'].value_counts()
labels = ["subscribed" if val == 1 else "no subscribed" for val in newsletter_counts.index]
# Create the pie chart
fig, ax = plt.subplots()
ax.pie(newsletter_counts, labels=labels, autopct='%1.1f%%', startangle=90,colors=['#007fff', '#FF40B4'])
# Add title and show
ax.set_title('Newsletter Subscription Distribution')
plt.show()
#The 85% of the customers were not subscribed to the newsletter at the time of their purchases on the site, this indicates that in 2021 most of the purchases on the site were made by new customers or existing customers not subscribed to the newsletter