Skip to content
Big Retail Company - 50k Rows
Five Data Analysis Tasks
Customer Demographics Analysis: (done)
- Analyze the age distribution of customers.
- Identify patterns in purchases by gender and age groups.
Product Performance: (done)
- Determine which product categories have the highest revenue.
- Identify the most frequently purchased products.
Geographical Insights: (done)
- Analyze sales distribution across different store locations.
- Identify which cities have the highest average purchase amounts.
Sales Trend Analysis: (done)
- Analyze monthly sales trends over time.
- Identify the top-performing months and stores for sales.
Customer Segmentation: (done)
- Segment customers based on their purchase behavior (e.g., frequent buyers, high spenders).
- Show similar group customers.
import pandas as pd
import matplotlib.pyplot as plt retail = pd.read_csv("retail_sales_data.csv")
retail#Analyzing the age distribution of customers
age = retail[["Age"]]
age.hist(bins=100)
plt.ylim(850, None)
plt.ylabel("Numbers")
plt.xlabel("Age")
plt.title("Age distribution of customers")
plt.show()
#Identify patterns in purchases by gender
age_gender = retail[["Age","Gender"]]
age_gender.groupby("Gender")["Age"].count().plot(kind="bar")
plt.ylim(16400, None)
plt.ylabel("Numbers")
plt.xlabel("Genders")
plt.title("Patterns in purchases by gender")
plt.show()
#Identify patterns in purchases by gender and age groups
age_pov = age_gender.pivot_table(index="Age", columns="Gender", aggfunc="size")
age_pov.plot()
plt.ylabel("Numbers")
plt.xlabel("Gender")
plt.title("Patterns in purchases by gender and age")
plt.grid()
plt.show()
#Determine which product categories have the highest revenue
product = retail[["Product_Category","Purchase_Amount"]]
rev_pro= product.groupby("Product_Category")["Purchase_Amount"].sum().reset_index(name='Revenue')
rev_pro["Revenue"] = round(rev_pro["Revenue"]/1000000, 2)
rev_pro.plot(x="Product_Category",kind="bar", legend=False)
plt.ylim(9.8, None)
plt.ylabel("Revenue in Millions")
plt.xlabel("Product Category")
plt.title("Product Categories vs Revenue")
plt.show()
#Identify the most frequently purchased products.
cat_sold = product.groupby("Product_Category")["Purchase_Amount"].count().reset_index(name='Numbers_sold')
cat_sold.plot(x="Product_Category",kind="bar", legend=False)
plt.ylim(9800, None)
plt.ylabel("Numbers_sold")
plt.xlabel("Product Category")
plt.title("Product Categories vs items sold")
plt.show()
#Analyze sales distribution across different store locations
store = retail.pivot_table(index="Store_Location", aggfunc="size").reset_index(name="number of sales")
store.plot(kind="bar", x="Store_Location", legend=False)
plt.ylim(9800, None)
plt.ylabel("Number of Sales")
plt.xlabel("Store Locations")
plt.title("Sales distribution across different store locations")
plt.show()
#Identify which cities have the highest average purchase amounts.
st_rev = retail.pivot_table(values="Purchase_Amount",index="Store_Location", aggfunc="mean").round(2).reset_index()
st_rev.plot(kind="bar", x="Store_Location", legend=False)
plt.ylim(900, None)
plt.ylabel("Average purchase amounts")
plt.xlabel("Store Locations")
plt.title("Cities have the highest average purchase amounts")
plt.show()
#Analyze monthly sales trends over time.
retail["Purchase_Date"] = pd.to_datetime(retail["Purchase_Date"])
retail["Purchase_Date"] = retail["Purchase_Date"] .dt.to_period('M').dt.to_timestamp()
sales_m = retail.groupby('Purchase_Date').size().reset_index(name="numbers of sales")
sales_m["Purchase_Date"] = sales_m["Purchase_Date"].dt.to_period('M')
sales_m.plot(kind="line", x="Purchase_Date", rot=45, legend=False)
plt.ylim(650, None)
plt.ylabel("Number of sales")
plt.xlabel("Months")
plt.title("Monthly sales trends over time")
plt.show()
#Identify the top-performing months and stores for sales
#Store with the highest sales
top_ms = retail.pivot_table(index= "Store_ID", columns="Purchase_Date", aggfunc="size").reset_index()
top_ms["total_store_sales"] = top_ms.sum(axis=1)
top_ms.loc["total_month_sales"] = top_ms.sum()
top_ms.sort_values("total_store_sales", ascending = False).head()
store_id = 47
#top-performing months
top_mss = top_ms.iloc[-1].sort_values(ascending = False).reset_index()
top_performing_months = top_mss[top_mss['total_month_sales'] == 744].sort_values("Purchase_Date")
print(top_performing_months)
#Segment customers based on their purchase behavior
beh = retail[["Gender","Product_Category", "Purchase_Amount"]]
beh_s = beh.pivot_table(values="Purchase_Amount", index="Gender", columns="Product_Category", aggfunc="sum")
beh_spend = round(beh_s / 1000000, 2)
#showing similar customers behavior
new_columns= ["Groceries", "Clothing", "Books", "Electronics","Furniture"]
new_rows = ["Female", 'Non-Binary',"Male"]
beh_spendd = beh_spend[new_columns].loc[new_rows]
beh_spendd.plot(kind="bar")
plt.ylim(3, None)
plt.ylabel("Number of sales in Millions")
plt.xlabel("Gender")
plt.title("Genders based on their purchasing behavior")
plt.show()