Big Retail Company - 50k Rows

Five Data Analysis Tasks

Customer Demographics Analysis: (done)

Analyze the age distribution of customers.
Identify patterns in purchases by gender and age groups.

Product Performance: (done)

Determine which product categories have the highest revenue.
Identify the most frequently purchased products.

Geographical Insights: (done)

Analyze sales distribution across different store locations.
Identify which cities have the highest average purchase amounts.

Sales Trend Analysis: (done)

Analyze monthly sales trends over time.
Identify the top-performing months and stores for sales.

Customer Segmentation: (done)

Segment customers based on their purchase behavior (e.g., frequent buyers, high spenders).
Show similar group customers.

import pandas as pd
import matplotlib.pyplot as plt

retail = pd.read_csv("retail_sales_data.csv")
retail

#Analyzing the age distribution of customers
age = retail[["Age"]]
age.hist(bins=100)
plt.ylim(850, None)
plt.ylabel("Numbers")
plt.xlabel("Age")
plt.title("Age distribution of customers")
plt.show()

#Identify patterns in purchases by gender

age_gender = retail[["Age","Gender"]]

age_gender.groupby("Gender")["Age"].count().plot(kind="bar")
plt.ylim(16400, None)
plt.ylabel("Numbers")
plt.xlabel("Genders")
plt.title("Patterns in purchases by gender")
plt.show()

#Identify patterns in purchases by gender and age groups
age_pov = age_gender.pivot_table(index="Age", columns="Gender", aggfunc="size")
age_pov.plot()
plt.ylabel("Numbers")
plt.xlabel("Gender")
plt.title("Patterns in purchases by gender and age")
plt.grid()
plt.show()

#Determine which product categories have the highest revenue

product = retail[["Product_Category","Purchase_Amount"]]

rev_pro= product.groupby("Product_Category")["Purchase_Amount"].sum().reset_index(name='Revenue')

rev_pro["Revenue"] = round(rev_pro["Revenue"]/1000000, 2)

rev_pro.plot(x="Product_Category",kind="bar", legend=False)
plt.ylim(9.8, None)
plt.ylabel("Revenue in Millions")
plt.xlabel("Product Category")
plt.title("Product Categories vs Revenue")
plt.show()

#Identify the most frequently purchased products.
cat_sold = product.groupby("Product_Category")["Purchase_Amount"].count().reset_index(name='Numbers_sold')

cat_sold.plot(x="Product_Category",kind="bar", legend=False)
plt.ylim(9800, None)
plt.ylabel("Numbers_sold")
plt.xlabel("Product Category")
plt.title("Product Categories vs items sold")
plt.show()

#Analyze sales distribution across different store locations

store = retail.pivot_table(index="Store_Location", aggfunc="size").reset_index(name="number of sales")

store.plot(kind="bar", x="Store_Location", legend=False)
plt.ylim(9800, None)
plt.ylabel("Number of Sales")
plt.xlabel("Store Locations")
plt.title("Sales distribution across different store locations")
plt.show()

#Identify which cities have the highest average purchase amounts.

st_rev = retail.pivot_table(values="Purchase_Amount",index="Store_Location", aggfunc="mean").round(2).reset_index()

st_rev.plot(kind="bar", x="Store_Location", legend=False)
plt.ylim(900, None)
plt.ylabel("Average purchase amounts")
plt.xlabel("Store Locations")
plt.title("Cities have the highest average purchase amounts")
plt.show()

#Analyze monthly sales trends over time.

retail["Purchase_Date"] = pd.to_datetime(retail["Purchase_Date"])
retail["Purchase_Date"]  = retail["Purchase_Date"] .dt.to_period('M').dt.to_timestamp()

sales_m = retail.groupby('Purchase_Date').size().reset_index(name="numbers of sales")

sales_m["Purchase_Date"]  = sales_m["Purchase_Date"].dt.to_period('M')

sales_m.plot(kind="line", x="Purchase_Date", rot=45, legend=False)
plt.ylim(650, None)
plt.ylabel("Number of sales")
plt.xlabel("Months")
plt.title("Monthly sales trends over time")
plt.show()

#Identify the top-performing months and stores for sales

#Store with the highest sales 
top_ms = retail.pivot_table(index= "Store_ID", columns="Purchase_Date", aggfunc="size").reset_index()

top_ms["total_store_sales"] = top_ms.sum(axis=1)

top_ms.loc["total_month_sales"] = top_ms.sum()

top_ms.sort_values("total_store_sales", ascending = False).head()

store_id = 47

#top-performing months
top_mss = top_ms.iloc[-1].sort_values(ascending = False).reset_index()

top_performing_months = top_mss[top_mss['total_month_sales'] == 744].sort_values("Purchase_Date")
print(top_performing_months)

#Segment customers based on their purchase behavior

beh = retail[["Gender","Product_Category", "Purchase_Amount"]]

beh_s = beh.pivot_table(values="Purchase_Amount", index="Gender", columns="Product_Category", aggfunc="sum")

beh_spend = round(beh_s / 1000000, 2)


#showing similar customers behavior
new_columns= ["Groceries", "Clothing", "Books", "Electronics","Furniture"]
new_rows = ["Female", 'Non-Binary',"Male"]

beh_spendd = beh_spend[new_columns].loc[new_rows]

beh_spendd.plot(kind="bar")
plt.ylim(3, None)
plt.ylabel("Number of sales in Millions")
plt.xlabel("Gender")
plt.title("Genders based on their purchasing behavior")
plt.show()