Skip to content
Exam_Coding
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Data Validation: Data Cleaning
df = pd.read_csv('product_sales.csv')
df['sales_method'] = df['sales_method'].astype(str).str.replace('em + call', 'Email + Call')
df['sales_method'] = df['sales_method'].replace('email', 'Email')
# check the customer_id is unique
#print(df[df.duplicated(subset='customer_id', keep=False)])
df.head(100)
#df.info()# Check the relationship between revenue column and nb_sold column by sales_method Email
df_Email = df[df['sales_method'] == 'Email']
print(df_Email[['revenue','nb_sold']].corr())
# Check the relationship between revenue column and nb_sold column by sales_method Email + Call
df_Email_Call = df[df['sales_method'] == 'Email + Call']
print(df_Email_Call[['revenue','nb_sold']].corr())
# Check the relationship between revenue column and nb_sold column by sales_method Call
df_Call = df[df['sales_method'] == 'Call']
print(df_Email_Call[['revenue','nb_sold']].corr())
# Visualize the relationship between revenue column and nb_sold column classified by sales_method.
sns.relplot(x='nb_sold', y='revenue', data=df,hue='sales_method')import statsmodels.formula.api as smf
# Create df_trainmodels
df_trainmodel = df[~df['revenue'].isna()]
# Create df_results
df_result = df[df['revenue'].isna()]
# Create the model then predict the missing values
x = smf.ols('revenue ~ nb_sold + C(sales_method)', data = df_trainmodel).fit()
result = x.predict(df_result)
print(result)
# Fill missing values with results
df['revenue'] = df['revenue'].fillna(result).round(2)
print(df.head())df.head(200)
# 1. How many customers were there for each approach?
customer_count = df.groupby('sales_method')['customer_id'].nunique().reset_index()
customer_count.columns = ['sales_method', 'unique_customers']
#There were more sales via email 6,922 (7,466) more than the other methods since it's inexpensive to use.
# Plotting
plt.figure(figsize=(8, 6))
plt.bar(customer_count['sales_method'], customer_count['unique_customers'], color='green')
plt.title('Graph 1: Number of Unique Customers per Sales Method')
plt.xlabel('Sales Method')
plt.ylabel('Number of Unique Customers')
#plt.gca().spines['left'].set_visible(False) # Corrected this line
plt.xticks(rotation=45)
plt.grid(False)
plt.tight_layout()
# Display values on top of bars
#for i, v in enumerate(customer_count['unique_customers']):
#plt.text(i, v + 2, str(round(v, 2)), ha='center', va='bottom')
plt.show()
customer_count# 2. - What does the spread of the revenue look like overall? And for each method?
# Spread of the revenue
overall_stats = df['revenue'].describe()
print("Overall Revenue Statistics:\n", overall_stats)
plt.figure(figsize=(6, 2))
sns.histplot(df['revenue'], kde=True)
plt.title('Graph 2: Overall Revenue Distribution')
plt.xlabel('Revenue')
plt.ylabel('Frequency')
plt.grid(False)
plt.show()
# Revenue Spread by Method
method_stats = df.groupby('sales_method')['revenue'].describe()
print("\nRevenue Statistics by Sales Method:\n", method_stats)
plt.figure(figsize=(6, 2))
sns.boxplot(x='sales_method', y='revenue', data=df)
plt.title('Graph 3: Revenue Distribution by Sales Method')
plt.xlabel('Sales Method')
plt.ylabel('Revenue')
plt.grid(False)
plt.show()
# the IQR of each method's revenue
from scipy.stats import iqr
import pandas as pd
call = df.loc[df.sales_method=='Call', 'revenue'].round(2)
email = df.loc[df.sales_method=='Email', 'revenue'].round(2)
email_call = df.loc[df.sales_method=='Email + Call', 'revenue'].round(2)
iqr_values = {
'Call IQR': iqr(call),
'Email IQR': iqr(email),
'Email + Call IQR': iqr(email_call)
}
iqr_value = pd.DataFrame([iqr_values])
iqr_value.head()
# Plotting
plt.figure(figsize=(8, 6))
plt.bar(iqr_value.columns, iqr_value.values.flatten(), color=['blue', 'green', 'orange'])
plt.title('Graph 4: Interquartile Range (IQR) of Revenue by Sales Method')
plt.ylabel('IQR')
plt.xlabel('Sales Method')
plt.ylim(0, max(iqr_value.values.flatten()) * 1.1)
plt.grid(False)
# Display values on top of bars
for i, v in enumerate(iqr_value.values.flatten()):
plt.text(i, v + 2, str(round(v, 2)), ha='center', va='bottom')
plt.show()
# 3. Was there any difference in revenue over time for each of the methods?
# Grouping by week and sales method to get the sum of revenue for each combination
revenue_over_time = df.groupby(['week', 'sales_method'])['revenue'].agg(['mean','median']).unstack()
# Plotting
plt.figure(figsize=(6, 4))
# Plot mean revenue
if ('mean', 'Call') in revenue_over_time.columns:
plt.plot(revenue_over_time.index, revenue_over_time[('mean', 'Call')], marker='o', linestyle='-', label='Call', color='blue')
if ('mean', 'Email') in revenue_over_time.columns:
plt.plot(revenue_over_time.index, revenue_over_time[('mean', 'Email')], marker='o', linestyle='-', label='Email', color='green')
if ('mean', 'Email + Call') in revenue_over_time.columns:
plt.plot(revenue_over_time.index, revenue_over_time[('mean', 'Email + Call')], marker='o', linestyle='-', label='Email + Call', color='orange')
plt.title('Graph 5: Revenue Over Time for Each Sales Method')
plt.xlabel('Week')
plt.ylabel('Revenue ($)')
plt.xticks(revenue_over_time.index)
plt.legend()
plt.grid(False)
plt.tight_layout()
plt.show()
revenue_over_time.head(7).style.background_gradient()# 4. Based on the data, which method would you recommend we continue to use?
# Some of these methods take more time from the team so they may not be the best for us to use if the results are similar.
# Calculate the average revenue per customer for each sales method
avg_revenue_per_customer = df.groupby('sales_method')['revenue'].mean().round(2)
# Calculate the total revenue for each sales method
total_revenue = df.groupby('sales_method')['revenue'].sum()
# Calculate the number of customers for each sales method
customer_count = df.groupby('sales_method')['customer_id'].nunique()
# Combine the results into a single DataFrame for comparison
comparison_df = pd.DataFrame({
'Total Revenue': total_revenue,
'Average Revenue per Customer': avg_revenue_per_customer,
'Customer Count': customer_count
})
# Based on the comparison, we can recommend the method with the highest average revenue per customer
recommended_method = comparison_df['Average Revenue per Customer'].idxmax()
# Display the comparison DataFrame
print(comparison_df.head())
print('\n\nRecommended Method:',recommended_method)
# Plotting
plt.figure(figsize=(18, 6))
# Plot Total Revenue
plt.subplot(1, 3, 1)
plt.bar(comparison_df.index, comparison_df['Total Revenue'], color='blue')
plt.title('Total Revenue')
plt.xlabel('Sales Method')
plt.ylabel('Total Revenue')
plt.grid(False)
# Plot Average Revenue per Customer
plt.subplot(1, 3, 2)
plt.bar(comparison_df.index, comparison_df['Average Revenue per Customer'], color='green')
plt.title('Average Revenue per Customer')
plt.xlabel('Sales Method')
plt.ylabel('Average Revenue')
plt.grid(False)
# Plot Customer Count
plt.subplot(1, 3, 3)
plt.bar(comparison_df.index, comparison_df['Customer Count'], color='red')
plt.title('Customer Count')
plt.xlabel('Sales Method')
plt.ylabel('Number of Customers')
plt.grid(False)
# Add a general title for the entire figure
plt.suptitle('\nGraph 6: Sales Performance by Method\n\n')
plt.tight_layout()
plt.show()# the total observations of each sales method
observation = df.sales_method.value_counts()
# Plotting as a horizontal bar chart with custom color
plt.figure(figsize=(8, 5))
observation.plot.barh(color='purple')
plt.title('Graph 7: Sales Method Counts')
plt.xlabel('Counts')
plt.ylabel('Sales Method')
#plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
# Save the plot as a PNG file
plt.savefig('Graph_7.png', bbox_inches='tight')
# Show the plot
plt.show()
# Display the total observations
observation #plot distribution of number products sold
plt.figure(figsize=(6, 4))
ax = df.nb_sold.plot.hist(color='pink')
ax.set_title('Graph 8: Distribution of New Products Sold')
ax.set_xlabel('New Product Sold')
ax.set_ylabel('Distribution')
plt.grid(False)
plt.tight_layout()
# Save the plot as a PNG file
plt.savefig('Graph 8 Distribution of New Products Sold.png')
# Show the plot
plt.show()# plot count of years_as_customer
plt.figure(figsize=(9, 7))
ax = df.years_as_customer.value_counts().plot(kind='barh', color='magenta')
ax.set_title('Graph 9: Customers per Year')
ax.set_ylabel('Years as customers')
ax.set_xlabel('Count')
plt.grid(False)
plt.tight_layout()
# Save the plot as a PNG file
plt.savefig('Graph 9 Customers per Year.png')
# Show the plot
plt.show()import matplotlib.pyplot as plt
# plot count of weeks since product was launched
plt.figure(figsize=(6, 4))
Week = [1, 2, 3, 4, 5, 6]
ax = df.week.value_counts().reindex(Week)[::-1].plot(kind='bar', color='brown')
ax.set_title('Graph 10: Sales in Weeks since product was launched')
ax.set_xlabel('Weeks')
ax.set_ylabel('Count')
plt.xticks(rotation=0)
plt.grid(False)
plt.tight_layout()
# Save the plot as a PNG file
plt.savefig('Graph 10 Sales in Weeks since product was launched.png')
# Show the plot
plt.show()# plot a scatter plot with regression line for website visits and products sold
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid")
# Customize colors
scatter_color = 'red'
line_color = 'blue'
# Plot
ax = sns.lmplot(data=df, x='nb_site_visits', y='nb_sold', scatter_kws={'color': scatter_color}, line_kws={'color': line_color})
plt.grid(False)
# Set the title of the plot
plt.title('Graph 11: The more website visits, the more sales')
# Save the plot as a PNG file
plt.savefig('Graph 11 The more website visits, the more sales.png')
# Show the plot
plt.show()