# [notebook export residue] Skip to content
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the raw sales export. The bare name on the next line echoes the frame
# when this runs as a notebook cell.
pens_printers = pd.read_csv('product_sales.csv')
pens_printers

# Histogram of customer tenure; title and axis labels are applied through the
# Axes object that seaborn returns.
sns.histplot(data=pens_printers, x='years_as_customer').set(
    title='Customer Count by Year',
    xlabel='Years as Customer',
    ylabel='Number of Customers',
)
# Count of non-null `state` values per tenure year; the result is not stored.
pens_printers.groupby('years_as_customer')['state'].count()
plt.show()
# 1 Clean sales_method column
# 2 Check for revenue Missing data
# 3 Outliers in years_as_customer
# Normalise the inconsistent spellings of the sales method.
# `regex=False` is passed explicitly because the default differs by pandas
# version: before 2.0 the pattern is treated as a regular expression, where
# the " +" in 'em + call' means "one or more spaces" and the literal value
# is never matched.
pens_printers['sales_method'] = pens_printers['sales_method'].str.replace(
    'em + call', 'Email + Call', regex=False
)
pens_printers['sales_method'] = pens_printers['sales_method'].str.replace(
    'email', 'Email', regex=False
)
# Echo the category counts after cleaning (displayed when run as a notebook cell).
pens_printers['sales_method'].value_counts()
# Missing values
# Five percent of the current row count.
# NOTE(review): `data` is not referenced again in the visible code — confirm
# whether it was meant to gate the drop below.
data = .05 * len(pens_printers)
# Discard every row that has at least one missing field.
pens_printers = pens_printers.dropna(axis=0, how='any')
# Exclude tenure outliers: keep rows whose `years_as_customer` lies strictly
# between the fences placed 1.5 * IQR beyond the first and third quartiles.
tenure_years = pens_printers['years_as_customer']
twentyfifth = tenure_years.quantile(0.25)
seventyfifth = tenure_years.quantile(0.75)
iqr = seventyfifth - twentyfifth
lower = twentyfifth - (1.5 * iqr)
upper = seventyfifth + (1.5 * iqr)
within_fences = tenure_years.gt(lower) & tenure_years.lt(upper)
pens_printers_clean = pens_printers.loc[within_fences]
# Echo the filtered frame (displayed when run as a notebook cell).
pens_printers_clean
# Number of distinct customer ids left after cleaning, printed together with
# the outlier fences for reference.
customers = pens_printers_clean['customer_id'].nunique()
print(upper, lower, customers)
pens_printers_clean.describe()
# Per-sales-method summary (sales_method stays in the index here).
# The statement must start at column 0: a leading space is tolerated by
# IPython but raises IndentationError when the file runs as a plain script.
pens_printers_clean.groupby('sales_method').agg(
    Customer_count=('customer_id', 'count'),
    revenue=('revenue', 'sum'),
    avg_rev=('revenue', 'mean')
)
# Summary table per sales method: non-null customer_id count, total revenue
# and mean revenue, with `sales_method` restored as an ordinary column so the
# plots below can reference it by name.
df = (
    pens_printers_clean
    .groupby('sales_method')
    .agg(**{
        'Customer_count': pd.NamedAgg(column='customer_id', aggfunc='count'),
        'revenue': pd.NamedAgg(column='revenue', aggfunc='sum'),
        'avg_rev': pd.NamedAgg(column='revenue', aggfunc='mean'),
    })
    .reset_index()
)
df
# Bar chart of `Customer_count` for each sales method, one colour per method.
sns.set_style('darkgrid')
sns.barplot(
    data=df,
    x='sales_method',
    y='Customer_count',
    hue='sales_method',
).set(
    xlabel='Sales Method',
    ylabel='Customer Count',
    title='Customer vs Sale Method',
)
plt.show()

# [notebook export residue] 1 hidden cell
# Total revenue for every (sales_method, week) pair, flattened back into
# columns so seaborn can draw one line per sales method.
week_df = (
    pens_printers_clean
    .groupby(['sales_method', 'week'])['revenue']
    .sum()
    .reset_index()
)
sns.lineplot(data=week_df, x='week', y='revenue', hue='sales_method').set(
    title='Weekly Revenue by Sales Method',
    xlabel='Week',
    ylabel='Revenue',
)
plt.show()
# Bar chart of the mean revenue (`avg_rev`) for each sales method.
sns.set_style('darkgrid')
sns.barplot(
    data=df,
    x='sales_method',
    y='avg_rev',
    hue='sales_method',
).set(
    xlabel='Sales Method',
    ylabel='Avg Rev',
    title='Avg Revenue Per Customer',
)
plt.show()
# Columns whose pairwise correlations are shown in the heatmap below.
numerical_correlations = pens_printers_clean[['nb_sold', 'revenue', 'years_as_customer', 'nb_site_visits']]

sns.set_style('darkgrid')

# annot=True writes each correlation coefficient inside its cell.
sns.heatmap(numerical_correlations.corr(), annot=True)
plt.title('Correlation of Numerical Data')
# Render this figure before anything else is drawn, as every other chart in
# the file does; otherwise, when run as a script, later plotting calls keep
# drawing on the heatmap's axes.
plt.show()
# Revenue aggregated by customer tenure. Built from `pens_printers`, i.e. rows
# with missing values removed but tenure outliers still present.
# NOTE(review): confirm that `pens_printers_clean` was not intended here.
customer_years2 = pens_printers.groupby('years_as_customer').agg(
    Customer_count=('customer_id', 'count'),
    revenue=('revenue', 'sum'),
    avg_rev=('revenue', 'mean')
).reset_index()
# The frame is referenced under the name it was assigned to; the visible code
# never defines `customer_years`, so reading that name would raise NameError.
customer_years2
# Total and mean revenue share one axis; labels feed the legend so the two
# lines can be told apart.
sns.lineplot(data=customer_years2, x='years_as_customer', y='revenue', label='Total revenue')
sns.lineplot(data=customer_years2, x='years_as_customer', y='avg_rev', label='Average revenue')
plt.xlabel('Years as Customer')
plt.ylabel('Revenue')
plt.title('Revenue by Years as Customer')
plt.show()
customer_years2
# [notebook export residue] Run cancelled
# Share of customers at each tenure value, ordered from most to least common
# (value_counts sorts by frequency), plus the running cumulative share.
# The share column is named explicitly: only pandas >= 2.0 calls it
# 'proportion' on its own, so relying on the default breaks on older versions.
percentage_customer_base = (
    pens_printers_clean
    .value_counts('years_as_customer', normalize=True)
    .reset_index(name='proportion')
)
percentage_customer_base['c_sum'] = percentage_customer_base['proportion'].cumsum()
percentage_customer_base