import pandas as pd
# Load the dataframe
product_sales = pd.read_csv('product_sales.csv')
# Dropping duplicates
product_sales.drop_duplicates(inplace=True)
# Missing Values
missing_values = product_sales.isnull().sum()
# Dropping Missing values
dropna = product_sales.dropna(inplace=True)
# Capping Outliers
product_sales = product_sales[product_sales['years_as_customer'] <= 39]
# Handling missing values (example: fill with 0 for numerical columns and 'Unknown' for categorical columns)
product_sales['nb_sold'].fillna(0, inplace=True)
product_sales['revenue'].fillna(0.0, inplace=True)
product_sales['years_as_customer'].fillna(0, inplace=True)
product_sales['nb_site_visits'].fillna(0, inplace=True)
product_sales['sales_method'].fillna('Unknown', inplace=True)
product_sales['customer_id'].fillna('Unknown', inplace=True)
product_sales['state'].fillna('Unknown', inplace=True)
# Convert data types if necessary (example: ensure 'week' is int and 'revenue' is float)
product_sales['week'] = product_sales['week'].astype(int)
product_sales['revenue'] = product_sales['revenue'].astype(float)
# Replacing the strings (example: email for Email, call for Call, em + call for Email + call)
product_sales['sales_method'] = product_sales['sales_method'].replace({'email': 'Email', 'call': 'Call', 'em + call': 'Email + Call'})
# Displaying the cleaned dataframe
#print(product_sales)
# Filtering for only the week, method, sold, revenue and state
prod_details = product_sales[['week', 'sales_method', 'nb_sold', 'revenue', 'state']]
#print(prod_details)
counts = prod_details.value_counts()
counts, product_sales, missing_values
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
# Aggregate revenue by state
state_revenue = prod_details.groupby('state')['revenue'].sum().reset_index().sort_values('revenue', ascending=False)
# Plot the map
plt.figure(figsize=(12, 10))
sns.barplot(data=state_revenue, x='revenue', y='state', palette='mako')
plt.title('Total Revenue Generated by State')
plt.xlabel('Revenue')
plt.ylabel('State')
plt.xticks(rotation=45)
plt.show()
# Aggregate average revenue by sales method
sales_method_avg_revenue = prod_details.groupby('sales_method')['revenue'].mean().reset_index()
# Plot the bar chart
plt.figure(figsize=(12, 8))
sns.barplot(data=sales_method_avg_revenue, x='sales_method', y='revenue', palette='viridis')
plt.title('Average Revenue by Sales Method')
plt.xlabel('Sales Method')
plt.ylabel('Average Revenue')
plt.show()
# Number of product sold by each sales method
plt.figure(figsize=(12, 8))
sns.countplot(data=product_sales, x='sales_method', order=product_sales['sales_method'].value_counts().index, palette='autumn')
plt.title('Number of Products Sold by each Sales Method')
plt.xlabel('Sales Method')
plt.ylabel('Number of Products Sold')
plt.show()
# Correlation matrix for number of products sold, revenue, years as customers and site visits columns in the product_sales dataframe
plt.figure(figsize=(12, 8))
correlation_matrix = product_sales[['nb_sold', 'revenue', 'years_as_customer', 'nb_site_visits']].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='winter', linewidths=1.5, vmin=-1, vmax=1)
plt.title('Correlation Matrix of Relevant Columns')
plt.show()
# Correlation between years as customers and number sold
correlation_years_as_customer_nb_sold = product_sales['years_as_customer'].corr(product_sales['nb_sold'])
print(correlation_years_as_customer_nb_sold)
# Plot the correlation between years as customers and number sold
plt.figure(figsize=(10, 6))
sns.scatterplot(data=product_sales, x='years_as_customer', y='nb_sold', hue='sales_method', palette='viridis')
plt.title(f'Correlation between Years as Customers and Number Sold: {correlation_years_as_customer_nb_sold:.2f}')
plt.xlabel('Years as Customers')
plt.ylabel('Number Sold')
plt.show()
# Hypothesis Test to support findings
from scipy.stats import pearsonr
# Significance level
alpha = 0.05
# Null hypothesis: There is no correlation between years as customer and number sold (correlation_coefficient = 0)
# Alternative hypothesis: There is a correlation between years as customer and number sold (correlation_coefficient != 0)
# Extract the relevant columns
years_as_customer = product_sales['years_as_customer']
nb_sold = product_sales['nb_sold']
# Perform Pearson correlation test
correlation_coefficient, p_value = pearsonr(years_as_customer, nb_sold)
# Determine the result of the hypothesis test
if p_value < alpha:
result = "Reject the null hypothesis: There is a significant correlation between years as customer and number sold."
else:
result = "Fail to reject the null hypothesis: There is no significant correlation between years as customer and number sold."
correlation_coefficient, p_value, result
# Alternative Hypothesis Test using Wilcoxon-Mann-Whitney test
from scipy.stats import mannwhitneyu
# Significance level
alpha = 0.05
# Null hypothesis: The distribution of years as customer and number sold are the same
# Alternative hypothesis: The distribution of years as customer and number sold are different
# Perform Wilcoxon-Mann-Whitney test
stat, p_value = mannwhitneyu(years_as_customer, nb_sold, alternative='two-sided')
# Determine the result of the hypothesis test
if p_value < alpha:
result = "Reject the null hypothesis: There is a significant difference in the distribution of years as customer and number sold."
else:
result = "Fail to reject the null hypothesis: There is no significant difference in the distribution of years as customer and number sold."
stat, p_value, result
# Hypothesis Test for Correlation between Number of Site Visits and Number of Products Sold
from scipy.stats import pearsonr
# Significance level
alpha = 0.05
# Null hypothesis: There is no correlation between number of site visits and number of products sold
# Alternative hypothesis: There is a correlation between number of site visits and number of products sold
# Extract the relevant columns
nb_site_visits = product_sales['nb_site_visits']
nb_sold = product_sales['nb_sold']
# Perform Pearson correlation test
corr, p_value = pearsonr(nb_site_visits, nb_sold)
# Determine the result of the hypothesis test
if p_value < alpha:
result = "Reject the null hypothesis: There is a significant correlation between number of site visits and number of products sold."
else:
result = "Fail to reject the null hypothesis: There is no significant correlation between number of site visits and number of products sold."
corr, p_value, result
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Extract the relevant columns
nb_site_visits = product_sales['nb_site_visits']
revenue = product_sales['revenue']
# Save the scatter plot data to CSV
scatter_data = pd.DataFrame({'nb_site_visits': nb_site_visits, 'revenue': revenue})
scatter_data.to_csv('scatter_plot_data.csv', index=False)
# Create a scatter plot with a trend line
plt.figure(figsize=(10, 6))
sns.regplot(x=nb_site_visits, y=revenue, scatter_kws={'s':10}, line_kws={'color':'red'})
plt.title('Correlation between Number of Site Visits and Revenue')
plt.xlabel('Number of Site Visits')
plt.ylabel('Revenue')
plt.grid(True)
plt.show()
Findings from the Scatterplot
The scatterplot visualizes the relationship between the number of site visits and revenue. Here are the key observations:
-
Positive Correlation: The scatterplot shows a positive trend, indicating that as the number of site visits increases, the revenue also tends to increase. This is further supported by the red trend line, which slopes upwards.
-
Data Spread: The data points are somewhat dispersed around the trend line, suggesting that while there is a general positive correlation, there is also variability in revenue for a given number of site visits.
-
Outliers: There may be some outliers present, as a few data points are located far from the trend line. These outliers could be due to various factors such as promotional events, seasonal effects, or errors in data collection.
-
Strength of Correlation: The strength of the correlation can be quantified by the Pearson correlation coefficient calculated in the previous cell. If the coefficient is close to 1, it indicates a strong positive correlation. The exact value of the correlation coefficient and the p-value should be referred to for a precise understanding.
Overall, the scatterplot suggests that increasing the number of site visits is generally associated with higher revenue, but other factors may also influence revenue, as indicated by the spread of the data points.
# Other relevant findings:
# Average number of products sold per week
avg_nb_sold_per_week = product_sales.groupby('week')['nb_sold'].mean().reset_index()
avg_nb_sold_per_week.columns = ['week', 'avg_nb_sold']
# Total revenue per state
total_revenue_per_state = product_sales.groupby('state')['revenue'].sum().sort_values(ascending=False).reset_index()
total_revenue_per_state.columns = ['state', 'total_revenue']
# Average revenue per customer based on years as customer
avg_revenue_per_years_as_customer = product_sales.groupby('years_as_customer')['revenue'].mean().sort_values(ascending=False).reset_index()
avg_revenue_per_years_as_customer.columns = ['years_as_customer', 'avg_revenue']
# Correlation between number of site visits and revenue
correlation_site_visits_revenue = product_sales[['nb_site_visits', 'revenue']].corr().iloc[0, 1]
# Number of products sold by sales method
nb_sold_by_sales_method = product_sales.groupby('sales_method')['nb_sold'].sum().reset_index()
nb_sold_by_sales_method.columns = ['sales_method', 'total_nb_sold']
# Number of products sold per state
nb_sold_by_state = product_sales.groupby('state')['nb_sold'].sum().sort_values(ascending=False).reset_index()
nb_sold_by_state.columns = ['state', 'total_products_sold']
# Results
avg_nb_sold_per_week, total_revenue_per_state, avg_revenue_per_years_as_customer, correlation_site_visits_revenue, nb_sold_by_sales_method, nb_sold_by_state
import matplotlib.pyplot as plt
import seaborn as sns
# Set the aesthetic style of the plots
sns.set_style("darkgrid")
palette = sns.color_palette("viridis")
# Distribution of revenue
plt.figure(figsize=(10, 6))
sns.histplot(product_sales['revenue'], bins=30, kde=True, color='green')
plt.title('Distribution of Revenue')
plt.xlabel('Revenue')
plt.ylabel('Frequency')
plt.show()
# Distribution of number of products sold
plt.figure(figsize=(10, 6))
sns.histplot(product_sales['nb_sold'], bins=30, kde=True, color='blue')
plt.title('Distribution of Number of Products Sold')
plt.xlabel('Number of Products Sold')
plt.ylabel('Frequency')
plt.show()
# Distribution of number of site visits
plt.figure(figsize=(10, 6))
sns.histplot(product_sales['nb_site_visits'], bins=30, kde=True, color=palette[2])
plt.title('Distribution of Number of Site Visits')
plt.xlabel('Number of Site Visits')
plt.ylabel('Frequency')
plt.show()
# Distribution of years as customer
plt.figure(figsize=(10, 6))
sns.histplot(product_sales['years_as_customer'], bins=30, kde=True, color='blue')
plt.title('Distribution of Years as Customer')
plt.xlabel('Years as Customer')
plt.ylabel('Frequency')
plt.show()
import seaborn as sns
# Ensure the 'product_sales' DataFrame is correctly defined
# Group by 'years_as_customer' and sum the 'revenue'
customer_revenue = product_sales.groupby('years_as_customer')['revenue'].sum().reset_index()
# Save the DataFrame to a CSV file
customer_revenue.to_csv('customer_revenue.csv', index=False)
# Plot the data
plt.figure(figsize=(12, 6))
sns.barplot(data=customer_revenue, x='years_as_customer', y='revenue', palette='viridis')
plt.title('Revenue by Years as Customer')
plt.xlabel('Years as Customer')
plt.ylabel('Total Revenue')
plt.xticks(rotation=45)
plt.show()
customer_revenue