Skip to content
My workspace
DataFrameas
df
variable
SELECT * FROM product_sales.csvimport pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the data
data = pd.read_csv("product_sales.csv")
#View the data
print(data.info)
print(data.describe())# Data validation and cleaning
# Validate each column
print("Week")
print(" - Unique values: ", data["week"].nunique())
print(" - Missing values: ", data["week"].isnull().sum())
print("Sales Method")
print(" - Unique values: ", data["sales_method"].unique())
print(" - Missing values: ", data["sales_method"].isnull().sum())
print("Customer ID")
print(" - Unique values: ", data["customer_id"].nunique())
print(" - Missing values: ", data["customer_id"].isnull().sum())
print("Number of Products Sold")
print(" - Unique values: ", data["nb_sold"].nunique())
print(" - Missing values: ", data["nb_sold"].isnull().sum())
print("Number of Products Sold")
print(" - Unique values: ", data["nb_sold"].nunique())
print(" - Missing values: ", data["nb_sold"].isnull().sum())
print("Revenue")
print(" - Unique values: ", data["revenue"].nunique())
print(" - Missing values: ", data["revenue"].isnull().sum())
print("Years as Customer")
print(" - Unique values: ", data["years_as_customer"].nunique())
print(" - Missing values: ", data["years_as_customer"].isnull().sum())
print("Number of Site Visits")
print(" - Unique values: ", data["nb_site_visits"].nunique())
print(" - Missing values: ", data["nb_site_visits"].isnull().sum())
print("State")
print(" - Unique values: ", data["state"].nunique())
print(" - Missing values: ", data["state"].isnull().sum())# Check data types of each column
print(data.dtypes)
# Transforming the data column
# Rename the data in 'sales_method' column
data["sales_method"] = data["sales_method"].replace("em + call", "Email + Call")
data["sales_method"] = data["sales_method"].replace("email", "Email")
print(data["sales_method"].unique())
# Validate 'customer_id'
df['customer_id'] = df['customer_id'].astype(str)
df = df.drop_duplicates(subset=['customer_id'])
# Calculate the median of the "revenue" column
revenue_median = data['revenue'].median()
# Replace the missing (NA) values in the "revenue" column with the calculated median
data['revenue'].fillna(revenue_median, inplace=True)
# Remove duplicate rows using the drop_duplicates() method
data = data.drop_duplicates()
# View data
print(data)
# Save the updated data to CSV file
data.to_csv('product_sales.csv', index=False)#Single Variable Analysis
# Revenue Distribution
plt.figure(figsize=(10, 6))
sns.histplot(data['revenue'], bins=10, kde=True)
plt.title('Revenue Distribution')
plt.xlabel('Revenue')
plt.ylabel('Frequency')
plt.show()
# Create a box plot for revenue values
plt.figure(figsize=(10, 6))
sns.boxplot(x=data['revenue'])
plt.title('Box Plot of Revenue Values')
plt.xlabel('Revenue')
plt.show()
# Calculate summary statistics
min_revenue = data['revenue'].min()
max_revenue = data['revenue'].max()
mean_revenue = data['revenue'].mean()# Number Of Products Sold
plt.figure(figsize=(10, 6))
sns.countplot(x='sales_method', data= data)
plt.title('Number of Sales Method To Customers')
plt.xlabel('Number of Sales Method')
plt.ylabel('Count')
plt.show()# Counting unique customers for each sales method
customer_counts = data.groupby('sales_method')['customer_id'].nunique().sort_values(ascending=False)
# Define the colors for each sales method
colors = ['#4472C4', '#ED7D31', '#A5A5A5']
# Plotting the pie chart
plt.figure(figsize=(8, 8))
plt.pie(customer_counts, labels=customer_counts.index, colors=colors, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Unique Customers by Sales Method', fontsize=16)
plt.legend(title='Sales Method', loc='upper right', fontsize='large')
plt.show()
print("Number of customers for each approach:")
print(customer_counts)# Revenue by Sales Method
plt.figure(figsize=(10, 6))
sns.boxplot(x='sales_method', y='revenue', data=df)
plt.title('Revenue by Sales Method')
plt.xlabel('Sales Method')
plt.ylabel('Revenue')
plt.show()
# Using bar chart
import matplotlib.pyplot as plt
# Total revenue by sales method
total_revenue_by_method = data.groupby('sales_method')['revenue'].sum().sort_values(ascending=False)
# Define the colors for each sales method
colors = ['#4472C4', '#A5A5A5', '#ED7D31']
# Create bar chart
plt.figure(figsize=(10, 6))
plt.bar(total_revenue_by_method.index, total_revenue_by_method.values, color=colors)
plt.title('Total Revenue by Sales Method')
plt.xlabel('Sales Method')
plt.ylabel('Total Revenue')
plt.show()
# Describe overall revenue
overall_revenue = data['revenue'].describe()
print('Overall Revenue:')
print(overall_revenue)
# Describe revenue by method
revenue_by_method = data.groupby('sales_method')['revenue'].describe()
print('\nRevenue by Method:')
print(revenue_by_method)
# Group data by sales method and week, and calculate the median number of products sold
sales_by_week = data.groupby(['sales_method', 'week'])['nb_sold'].median().reset_index()
# Define alternative color palette
colors = {'Email': '#4472C4', 'Call': '#ED7D31', 'Email + Call': '#A5A5A5'}
# Pivot the data to create a table with sales method as columns and week as rows
sales_pivot = sales_by_week.pivot(index='week', columns='sales_method', values='nb_sold')
# Create an area plot
sales_pivot.plot.area(color=colors)
plt.title('Trend of Number of Products Sold by Sales Method over Time')
plt.xlabel('Week')
plt.ylabel('Number of Products Sold')
plt.show()
print(sales_by_week)
## Was there any difference in revenue over time for each of the methods?
revenue_over_time = data.groupby(['week', 'sales_method'])['revenue'].median().unstack()
#Define color palette
colors = {'Email': '#4472C4', 'Call': '#ED7D31', 'Email + Call': '#A5A5A5'}
revenue_over_time.plot(kind='line', color=colors)
plt.title('Revenue over time for each of the methods')
plt.show()
print (revenue_over_time)4 Business Metric
# Group the data by year, sales method, and calculate the median revenue and number of site visits
grouped_data = data.groupby(['week', 'sales_method']).agg({'revenue': 'median', 'nb_site_visits': 'median'}).reset_index()
# Define the colors for each sales method
colors = ['#ED7D31', '#4472C4', '#A5A5A5']
# Create bubble plot for Call sales method
plt.scatter(x=grouped_data.loc[grouped_data['sales_method'] == 'Call', 'week'],
y=grouped_data.loc[grouped_data['sales_method'] == 'Call', 'revenue'],
s=grouped_data.loc[grouped_data['sales_method'] == 'Call', 'nb_site_visits']*10,
c=colors[0],
alpha=0.5,
label='Call')
# Create bubble plot for Email sales method
plt.scatter(x=grouped_data.loc[grouped_data['sales_method'] == 'Email', 'week'],
y=grouped_data.loc[grouped_data['sales_method'] == 'Email', 'revenue'],
s=grouped_data.loc[grouped_data['sales_method'] == 'Email', 'nb_site_visits']*10,
c=colors[1],
alpha=0.5,
label='Email')
# Create bubble plot for Email + Call sales method
plt.scatter(x=grouped_data.loc[grouped_data['sales_method'] == 'Email + Call', 'week'],
y=grouped_data.loc[grouped_data['sales_method'] == 'Email + Call', 'revenue'],
s=grouped_data.loc[grouped_data['sales_method'] == 'Email + Call', 'nb_site_visits']*10,
c=colors[2],
alpha=0.5,
label='Email + Call')
# Set plot properties
plt.title('Relationship Between Number of Site Visits and Revenue by Sales Method Over Time')
plt.xlabel('week')
plt.ylabel('Revenue')
plt.legend(title='Sales Method')
# Display the plot
plt.show()
print(grouped_data)# Calculate average revenue per customer by sales method
avg_revenue_per_customer = df.groupby('sales_method')['revenue'].mean()
# Print the average revenue per customer by sales method
print('Average Revenue per Customer by Sales Method:')
print(avg_revenue_per_customer)