My workspace — DataLab

DataFrameas

df

variable

SELECT * FROM product_sales.csv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Load the data
data = pd.read_csv("product_sales.csv")

#View the data
print(data.info)
print(data.describe())

# Data validation and cleaning
# Validate each column
print("Week")
print(" - Unique values: ", data["week"].nunique())
print(" - Missing values: ", data["week"].isnull().sum())
print("Sales Method")
print(" - Unique values: ", data["sales_method"].unique())
print(" - Missing values: ", data["sales_method"].isnull().sum())
print("Customer ID")
print(" - Unique values: ", data["customer_id"].nunique())
print(" - Missing values: ", data["customer_id"].isnull().sum())
print("Number of Products Sold")
print(" - Unique values: ", data["nb_sold"].nunique())
print(" - Missing values: ", data["nb_sold"].isnull().sum())
print("Number of Products Sold")
print(" - Unique values: ", data["nb_sold"].nunique())
print(" - Missing values: ", data["nb_sold"].isnull().sum())
print("Revenue")
print(" - Unique values: ", data["revenue"].nunique())
print(" - Missing values: ", data["revenue"].isnull().sum())
print("Years as Customer")
print(" - Unique values: ", data["years_as_customer"].nunique())
print(" - Missing values: ", data["years_as_customer"].isnull().sum())
print("Number of Site Visits")
print(" - Unique values: ", data["nb_site_visits"].nunique())
print(" - Missing values: ", data["nb_site_visits"].isnull().sum())
print("State")
print(" - Unique values: ", data["state"].nunique())
print(" - Missing values: ", data["state"].isnull().sum())

# Check data types of each column
print(data.dtypes)

# Transforming the data column
# Rename the data in 'sales_method' column
data["sales_method"] = data["sales_method"].replace("em + call", "Email + Call")
data["sales_method"] = data["sales_method"].replace("email", "Email")
print(data["sales_method"].unique())
# Validate 'customer_id'
df['customer_id'] = df['customer_id'].astype(str)
df = df.drop_duplicates(subset=['customer_id'])
# Calculate the median of the "revenue" column
revenue_median = data['revenue'].median()

# Replace the missing (NA) values in the "revenue" column with the calculated median
data['revenue'].fillna(revenue_median, inplace=True)

# Remove duplicate rows using the drop_duplicates() method
data = data.drop_duplicates()
# View data
print(data)

# Save the updated data to CSV file
data.to_csv('product_sales.csv', index=False)

#Single Variable Analysis
# Revenue Distribution
plt.figure(figsize=(10, 6))
sns.histplot(data['revenue'], bins=10, kde=True)
plt.title('Revenue Distribution')
plt.xlabel('Revenue')
plt.ylabel('Frequency')
plt.show()

# Create a box plot for revenue values
plt.figure(figsize=(10, 6))
sns.boxplot(x=data['revenue'])
plt.title('Box Plot of Revenue Values')
plt.xlabel('Revenue')
plt.show()

# Calculate summary statistics
min_revenue = data['revenue'].min()
max_revenue = data['revenue'].max()
mean_revenue = data['revenue'].mean()

# Number Of Products Sold
plt.figure(figsize=(10, 6))
sns.countplot(x='sales_method', data= data)
plt.title('Number of Sales Method To Customers')
plt.xlabel('Number of Sales Method')
plt.ylabel('Count')
plt.show()

# Counting unique customers for each sales method
customer_counts = data.groupby('sales_method')['customer_id'].nunique().sort_values(ascending=False)

# Define the colors for each sales method
colors = ['#4472C4', '#ED7D31', '#A5A5A5']

# Plotting the pie chart
plt.figure(figsize=(8, 8))
plt.pie(customer_counts, labels=customer_counts.index, colors=colors, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Unique Customers by Sales Method', fontsize=16)
plt.legend(title='Sales Method', loc='upper right', fontsize='large')
plt.show()

print("Number of customers for each approach:")
print(customer_counts)

# Revenue by Sales Method
plt.figure(figsize=(10, 6))
sns.boxplot(x='sales_method', y='revenue', data=df)
plt.title('Revenue by Sales Method')
plt.xlabel('Sales Method')
plt.ylabel('Revenue')
plt.show()

# Using bar chart
import matplotlib.pyplot as plt

# Total revenue by sales method
total_revenue_by_method = data.groupby('sales_method')['revenue'].sum().sort_values(ascending=False)

# Define the colors for each sales method
colors = ['#4472C4', '#A5A5A5', '#ED7D31']

# Create bar chart
plt.figure(figsize=(10, 6))
plt.bar(total_revenue_by_method.index, total_revenue_by_method.values, color=colors)
plt.title('Total Revenue by Sales Method')
plt.xlabel('Sales Method')
plt.ylabel('Total Revenue')
plt.show()

# Describe overall revenue 
overall_revenue = data['revenue'].describe()
print('Overall Revenue:')
print(overall_revenue)

# Describe revenue by method
revenue_by_method = data.groupby('sales_method')['revenue'].describe()
print('\nRevenue by Method:')
print(revenue_by_method)

# Group data by sales method and week, and calculate the median number of products sold
sales_by_week = data.groupby(['sales_method', 'week'])['nb_sold'].median().reset_index()

# Define alternative color palette
colors = {'Email': '#4472C4', 'Call': '#ED7D31', 'Email + Call': '#A5A5A5'}

# Pivot the data to create a table with sales method as columns and week as rows
sales_pivot = sales_by_week.pivot(index='week', columns='sales_method', values='nb_sold')

# Create an area plot
sales_pivot.plot.area(color=colors)
plt.title('Trend of Number of Products Sold by Sales Method over Time')
plt.xlabel('Week')
plt.ylabel('Number of Products Sold')
plt.show()

print(sales_by_week)

## Was there any difference in revenue over time for each of the methods?
revenue_over_time = data.groupby(['week', 'sales_method'])['revenue'].median().unstack()
#Define color palette
colors = {'Email': '#4472C4', 'Call': '#ED7D31', 'Email + Call': '#A5A5A5'}

revenue_over_time.plot(kind='line', color=colors)
plt.title('Revenue over time for each of the methods')
plt.show()
print (revenue_over_time)

4 Business Metric

# Group the data by year, sales method, and calculate the median revenue and number of site visits
grouped_data = data.groupby(['week', 'sales_method']).agg({'revenue': 'median', 'nb_site_visits': 'median'}).reset_index()

# Define the colors for each sales method
colors = ['#ED7D31', '#4472C4', '#A5A5A5']

# Create bubble plot for Call sales method
plt.scatter(x=grouped_data.loc[grouped_data['sales_method'] == 'Call', 'week'], 
            y=grouped_data.loc[grouped_data['sales_method'] == 'Call', 'revenue'], 
            s=grouped_data.loc[grouped_data['sales_method'] == 'Call', 'nb_site_visits']*10,
            c=colors[0],
            alpha=0.5,
            label='Call')

# Create bubble plot for Email sales method
plt.scatter(x=grouped_data.loc[grouped_data['sales_method'] == 'Email', 'week'], 
            y=grouped_data.loc[grouped_data['sales_method'] == 'Email', 'revenue'], 
            s=grouped_data.loc[grouped_data['sales_method'] == 'Email', 'nb_site_visits']*10,
            c=colors[1],
            alpha=0.5,
            label='Email')

# Create bubble plot for Email + Call sales method
plt.scatter(x=grouped_data.loc[grouped_data['sales_method'] == 'Email + Call', 'week'], 
            y=grouped_data.loc[grouped_data['sales_method'] == 'Email + Call', 'revenue'], 
            s=grouped_data.loc[grouped_data['sales_method'] == 'Email + Call', 'nb_site_visits']*10,
            c=colors[2],
            alpha=0.5,
            label='Email + Call')

# Set plot properties
plt.title('Relationship Between Number of Site Visits and Revenue by Sales Method Over Time')
plt.xlabel('week')
plt.ylabel('Revenue')
plt.legend(title='Sales Method')

# Display the plot
plt.show()
print(grouped_data)

# Calculate average revenue per customer by sales method
avg_revenue_per_customer = df.groupby('sales_method')['revenue'].mean()

# Print the average revenue per customer by sales method
print('Average Revenue per Customer by Sales Method:')
print(avg_revenue_per_customer)

‌
‌
‌