Skip to content
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Validation: Data Cleaning
df = pd.read_csv('product_sales.csv')

df['sales_method'] = df['sales_method'].astype(str).str.replace('em + call', 'Email + Call')
df['sales_method'] = df['sales_method'].replace('email', 'Email')

# check the customer_id is unique
#print(df[df.duplicated(subset='customer_id', keep=False)])

df.head(100)

#df.info()
# Check the relationship between revenue column and nb_sold column by sales_method Email
df_Email = df[df['sales_method'] == 'Email']
print(df_Email[['revenue','nb_sold']].corr())

# Check the relationship between revenue column and nb_sold column by sales_method Email + Call
df_Email_Call = df[df['sales_method'] == 'Email + Call']
print(df_Email_Call[['revenue','nb_sold']].corr())

# Check the relationship between revenue column and nb_sold column by sales_method Call
df_Call = df[df['sales_method'] == 'Call']
print(df_Email_Call[['revenue','nb_sold']].corr())

# Visualize the relationship between revenue column and nb_sold column classified by sales_method.
sns.relplot(x='nb_sold', y='revenue', data=df,hue='sales_method')
import statsmodels.formula.api as smf

# Create df_trainmodels
df_trainmodel = df[~df['revenue'].isna()]

# Create df_results
df_result = df[df['revenue'].isna()]

# Create the model then predict the missing values
x = smf.ols('revenue ~ nb_sold + C(sales_method)', data = df_trainmodel).fit()
result = x.predict(df_result)
print(result)
# Fill missing values with results
df['revenue'] = df['revenue'].fillna(result).round(2)
print(df.head())
df.head(200)
# 1. How many customers were there for each approach?

customer_count = df.groupby('sales_method')['customer_id'].nunique().reset_index()
customer_count.columns = ['sales_method', 'unique_customers']

#There were more sales via email 6,922 (7,466) more than the other methods since it's inexpensive to use.


# Plotting
plt.figure(figsize=(8, 6))
plt.bar(customer_count['sales_method'], customer_count['unique_customers'], color='green')
plt.title('Graph 1: Number of Unique Customers per Sales Method')
plt.xlabel('Sales Method')
plt.ylabel('Number of Unique Customers')
#plt.gca().spines['left'].set_visible(False)  # Corrected this line
plt.xticks(rotation=45)
plt.grid(False)
plt.tight_layout()

# Display values on top of bars
#for i, v in enumerate(customer_count['unique_customers']):
    #plt.text(i, v + 2, str(round(v, 2)), ha='center', va='bottom')
plt.show()

customer_count
# 2. - What does the spread of the revenue look like overall? And for each method?

# Spread of the revenue
overall_stats = df['revenue'].describe()
print("Overall Revenue Statistics:\n", overall_stats)

plt.figure(figsize=(6, 2))
sns.histplot(df['revenue'], kde=True)
plt.title('Graph 2: Overall Revenue Distribution')
plt.xlabel('Revenue')
plt.ylabel('Frequency')
plt.grid(False)
plt.show()


# Revenue Spread by Method
method_stats = df.groupby('sales_method')['revenue'].describe()
print("\nRevenue Statistics by Sales Method:\n", method_stats)

plt.figure(figsize=(6, 2))
sns.boxplot(x='sales_method', y='revenue', data=df)
plt.title('Graph 3: Revenue Distribution by Sales Method')
plt.xlabel('Sales Method')
plt.ylabel('Revenue')
plt.grid(False)
plt.show()
# the IQR of each method's revenue
from scipy.stats import iqr
import pandas as pd

call = df.loc[df.sales_method=='Call', 'revenue'].round(2)
email = df.loc[df.sales_method=='Email', 'revenue'].round(2)
email_call = df.loc[df.sales_method=='Email + Call', 'revenue'].round(2)

iqr_values = {
    'Call IQR': iqr(call),
    'Email IQR': iqr(email),
    'Email + Call IQR': iqr(email_call)
}

iqr_value = pd.DataFrame([iqr_values])
iqr_value.head()

# Plotting
plt.figure(figsize=(8, 6))
plt.bar(iqr_value.columns, iqr_value.values.flatten(), color=['blue', 'green', 'orange'])
plt.title('Graph 4: Interquartile Range (IQR) of Revenue by Sales Method')
plt.ylabel('IQR')
plt.xlabel('Sales Method')
plt.ylim(0, max(iqr_value.values.flatten()) * 1.1)  
plt.grid(False)

# Display values on top of bars
for i, v in enumerate(iqr_value.values.flatten()):
    plt.text(i, v + 2, str(round(v, 2)), ha='center', va='bottom')

plt.show()


# 3. Was there any difference in revenue over time for each of the methods?

# Grouping by week and sales method to get the sum of revenue for each combination
revenue_over_time = df.groupby(['week', 'sales_method'])['revenue'].agg(['mean','median']).unstack()

# Plotting
plt.figure(figsize=(6, 4))

# Plot mean revenue
if ('mean', 'Call') in revenue_over_time.columns:
    plt.plot(revenue_over_time.index, revenue_over_time[('mean', 'Call')], marker='o', linestyle='-', label='Call', color='blue')
if ('mean', 'Email') in revenue_over_time.columns:
    plt.plot(revenue_over_time.index, revenue_over_time[('mean', 'Email')], marker='o', linestyle='-', label='Email', color='green')
if ('mean', 'Email + Call') in revenue_over_time.columns:
    plt.plot(revenue_over_time.index, revenue_over_time[('mean', 'Email + Call')], marker='o', linestyle='-', label='Email + Call', color='orange')


plt.title('Graph 5: Revenue Over Time for Each Sales Method')
plt.xlabel('Week')
plt.ylabel('Revenue ($)')
plt.xticks(revenue_over_time.index)
plt.legend()
plt.grid(False)
plt.tight_layout()
plt.show()

revenue_over_time.head(7).style.background_gradient()
# 4. Based on the data, which method would you recommend we continue to use? 
# Some of these methods take more time from the team so they may not be the best for us to use if the results are similar.

# Calculate the average revenue per customer for each sales method
avg_revenue_per_customer = df.groupby('sales_method')['revenue'].mean().round(2)

# Calculate the total revenue for each sales method
total_revenue = df.groupby('sales_method')['revenue'].sum()

# Calculate the number of customers for each sales method
customer_count = df.groupby('sales_method')['customer_id'].nunique()

# Combine the results into a single DataFrame for comparison
comparison_df = pd.DataFrame({
    'Total Revenue': total_revenue,
    'Average Revenue per Customer': avg_revenue_per_customer,
    'Customer Count': customer_count
})

# Based on the comparison, we can recommend the method with the highest average revenue per customer
recommended_method = comparison_df['Average Revenue per Customer'].idxmax()

# Display the comparison DataFrame
print(comparison_df.head())
print('\n\nRecommended Method:',recommended_method)

# Plotting
plt.figure(figsize=(18, 6))

# Plot Total Revenue
plt.subplot(1, 3, 1)
plt.bar(comparison_df.index, comparison_df['Total Revenue'], color='blue')
plt.title('Total Revenue')
plt.xlabel('Sales Method')
plt.ylabel('Total Revenue')
plt.grid(False)

# Plot Average Revenue per Customer
plt.subplot(1, 3, 2)
plt.bar(comparison_df.index, comparison_df['Average Revenue per Customer'], color='green')
plt.title('Average Revenue per Customer')
plt.xlabel('Sales Method')
plt.ylabel('Average Revenue')
plt.grid(False)

# Plot Customer Count
plt.subplot(1, 3, 3)
plt.bar(comparison_df.index, comparison_df['Customer Count'], color='red')
plt.title('Customer Count')
plt.xlabel('Sales Method')
plt.ylabel('Number of Customers')
plt.grid(False)

# Add a general title for the entire figure
plt.suptitle('\nGraph 6: Sales Performance by Method\n\n')

plt.tight_layout()
plt.show()
# the total observations of each sales method
observation = df.sales_method.value_counts()

# Plotting as a horizontal bar chart with custom color
plt.figure(figsize=(8, 5))
observation.plot.barh(color='purple')
plt.title('Graph 7: Sales Method Counts')
plt.xlabel('Counts')
plt.ylabel('Sales Method')
#plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Save the plot as a PNG file
plt.savefig('Graph_7.png', bbox_inches='tight')

# Show the plot
plt.show()

# Display the total observations
observation
 #plot distribution of number products sold

plt.figure(figsize=(6, 4))
ax = df.nb_sold.plot.hist(color='pink')
ax.set_title('Graph 8: Distribution of New Products Sold')
ax.set_xlabel('New Product Sold')
ax.set_ylabel('Distribution')
plt.grid(False)
plt.tight_layout()

# Save the plot as a PNG file
plt.savefig('Graph 8 Distribution of New Products Sold.png')

# Show the plot
plt.show()
# plot count of years_as_customer
plt.figure(figsize=(9, 7))
ax = df.years_as_customer.value_counts().plot(kind='barh', color='magenta')
ax.set_title('Graph 9: Customers per Year')
ax.set_ylabel('Years as customers')
ax.set_xlabel('Count')
plt.grid(False)
plt.tight_layout()

# Save the plot as a PNG file
plt.savefig('Graph 9 Customers per Year.png')

# Show the plot
plt.show()
import matplotlib.pyplot as plt

# plot count of weeks since product was launched
plt.figure(figsize=(6, 4))
Week = [1, 2, 3, 4, 5, 6] 
ax = df.week.value_counts().reindex(Week)[::-1].plot(kind='bar', color='brown')
ax.set_title('Graph 10: Sales in Weeks since product was launched')
ax.set_xlabel('Weeks')
ax.set_ylabel('Count')
plt.xticks(rotation=0)
plt.grid(False)
plt.tight_layout()

# Save the plot as a PNG file
plt.savefig('Graph 10 Sales in Weeks since product was launched.png')

# Show the plot
plt.show()
# plot a scatter plot with regression line for website visits and products sold

import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid") 

# Customize colors
scatter_color = 'red'
line_color = 'blue'

# Plot
ax = sns.lmplot(data=df, x='nb_site_visits', y='nb_sold', scatter_kws={'color': scatter_color}, line_kws={'color': line_color})
plt.grid(False)

# Set the title of the plot
plt.title('Graph 11: The more website visits, the more sales')

# Save the plot as a PNG file
plt.savefig('Graph 11 The more website visits, the more sales.png')

# Show the plot
plt.show()