Skip to content
Course Notes: A/B Testing in Python
Overview of A/B Testing
What is A/B testing?
import pandas as pd
import numpy as np
# Create mock data: 100 users, each with a randomly drawn experiment group
# and a binary conversion outcome. The seed makes the notes reproducible.
np.random.seed(42)
data = {
    'user_id': np.arange(1, 101),
    'group': np.random.choice(['A', 'B'], size=100),
    'conversion': np.random.choice([0, 1], size=100),
}
# Create DataFrame
checkout = pd.DataFrame(data)
# Display DataFrame info
checkout.info()

# Add mock data for 'gender' column
# The generator is intentionally NOT re-seeded here: calling
# np.random.seed(42) again would replay the exact draws that produced
# 'group', making every 'A' user 'Male' and every 'B' user 'Female'.
checkout['gender'] = np.random.choice(['Male', 'Female'], size=100)
# Sample with replacement to get 3000 samples
sample_df = checkout.sample(n=3000, replace=True)
# Gender shares in the sample; printed so they are visible outside a notebook
print(sample_df['gender'].value_counts(normalize=True))
# Display value counts normalized
print(checkout['gender'].value_counts(normalize=True))
import pandas as pd
import numpy as np
# Create mock data: one seeded stream feeds every column, so the columns
# are drawn independently of each other.
np.random.seed(42)
data = {
    'user_id': np.arange(1, 101),
    'group': np.random.choice(['A', 'B'], size=100),
    'conversion': np.random.choice([0, 1], size=100),
    'gender': np.random.choice(['Male', 'Female'], size=100),
    'checkout_page': np.random.choice(['Page1', 'Page2'], size=100),
}
# Create DataFrame
checkout = pd.DataFrame(data)
# Python example of random assignment: gender shares within each checkout
# page (printed so the result is visible when run as a script).
print(checkout.groupby('checkout_page')['gender'].value_counts(normalize=True))
import pandas as pd
import numpy as np
# Create mock data: 100 users with group, conversion, gender, checkout page
# and browser, all drawn from one seeded stream for reproducibility.
np.random.seed(42)
data = {
    'user_id': np.arange(1, 101),
    'group': np.random.choice(['A', 'B'], size=100),
    'conversion': np.random.choice([0, 1], size=100),
    'gender': np.random.choice(['Male', 'Female'], size=100),
    'checkout_page': np.random.choice(['Page1', 'Page2'], size=100),
    'browser': np.random.choice(['Chrome', 'Firefox', 'Safari', 'Edge'], size=100),
}
# Create DataFrame
checkout = pd.DataFrame(data)
# Determine the normalized distribution of browser counts
print(checkout['browser'].value_counts(normalize=True))
# Draw a random sample of rows
sample_df = checkout.sample(n=2000, replace=True)
# Check the counts distribution of sampled users' browsers
print(sample_df['browser'].value_counts(normalize=True))
# Check the counts distribution of browsers across checkout pages
print(checkout.groupby('checkout_page')['browser'].value_counts(normalize=True))

# Why run experiments?
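The formula for Pearson's correlation coefficient (r) can also be represented as:
$$ r = \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^{n} (x_i - \bar{x})^2}\,\sqrt{\sum_{i=1}^{n} (y_i - \bar{y})^2}} $$
where:
$x_i$ and $y_i$ are the individual data points, $\bar{x}$ is the mean of the $x$ scores, and $\bar{y}$ is the mean of the $y$ scores.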
# Correlations visual inspection
# Import necessary libraries
import seaborn as sns
import pandas as pd
import numpy as np
# Build the mock admissions columns one at a time. The draw order
# (GRE scores first, then admit chances) is kept so the seeded values
# are the same as before.
np.random.seed(42)
admissions_data = {}
admissions_data['Serial No.'] = np.arange(1, 101)
admissions_data['GRE Score'] = np.random.randint(290, 340, size=100)
admissions_data['Chance of Admit'] = np.random.uniform(0.3, 1.0, size=100)

# Assemble the DataFrame from the column mapping
admissions = pd.DataFrame(admissions_data)

# Pairwise scatter plots of the three columns for visual inspection
pairplot_columns = ['Serial No.', 'GRE Score', 'Chance of Admit']
sns.pairplot(admissions.loc[:, pairplot_columns])
# Pearson correlation heatmap
# Import necessary libraries
import seaborn as sns
import pandas as pd
import numpy as np
# Create mock data for admissions (seeded so the values are reproducible)
# NOTE(review): np.random.randint excludes its upper bound, so e.g.
# 'University Rating' spans 1-4 and 'GRE Score' tops out at 339 - confirm
# whether the inclusive ranges were intended.
np.random.seed(42)
admissions_data = {
    'Serial No.': np.arange(1, 101),
    'GRE Score': np.random.randint(290, 340, size=100),
    'TOEFL Score': np.random.randint(90, 120, size=100),
    'University Rating': np.random.randint(1, 5, size=100),
    'SOP': np.random.uniform(1, 5, size=100),
    'LOR': np.random.uniform(1, 5, size=100),
    'CGPA': np.random.uniform(6, 10, size=100),
    'Research': np.random.randint(0, 2, size=100),
    'Chance of Admit': np.random.uniform(0.3, 1.0, size=100),
}
# Create DataFrame
admissions = pd.DataFrame(admissions_data)
# Print Pearson correlation coefficient (previously computed but never
# shown, since a bare expression produces no output)
print(admissions['GRE Score'].corr(admissions['Chance of Admit']))
# Plot correlations heatmap
sns.heatmap(admissions.corr(), annot=True)
# Exercises
# Import necessary libraries
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Create mock data for admissions (seeded so the values are reproducible)
# NOTE(review): np.random.randint excludes its upper bound, so e.g.
# 'University Rating' spans 1-4 - confirm whether 1-5 was intended.
np.random.seed(42)
admissions_data = {
    'Serial No.': np.arange(1, 101),
    'GRE Score': np.random.randint(290, 340, size=100),
    'TOEFL Score': np.random.randint(90, 120, size=100),
    'University Rating': np.random.randint(1, 5, size=100),
    'SOP': np.random.uniform(1, 5, size=100),
    'LOR': np.random.uniform(1, 5, size=100),
    'CGPA': np.random.uniform(6, 10, size=100),
    'Research': np.random.randint(0, 2, size=100),
    'Chance of Admit': np.random.uniform(0.3, 1.0, size=100),
}
# Create DataFrame
admissions = pd.DataFrame(admissions_data)
# Columns examined in this exercise
exercise_columns = ['Serial No.', 'TOEFL Score', 'SOP', 'Chance of Admit']
# Visualize the variables in a pairplot
sns.pairplot(admissions[exercise_columns])
plt.show()
# Print Pearson's correlation coefficients (computed once, reused below)
exercise_corr = admissions[exercise_columns].corr()
print(exercise_corr)
# Visualize the coefficients in a heatmap
sns.heatmap(exercise_corr, annot=True)
plt.show()

# Metrics design and estimation
# Adding data to run the code
# Mock checkout records; columns are drawn in a fixed order from one
# seeded stream so the values are reproducible.
np.random.seed(42)
n_users = 100
checkout_data = {
    'user_id': np.arange(1, n_users + 1),
    'group': np.random.choice(['control', 'treatment'], size=n_users),
    'conversion': np.random.randint(0, 2, size=n_users),
    'gender': np.random.choice(['male', 'female'], size=n_users),
    'checkout_page': np.random.choice(['page1', 'page2'], size=n_users),
    'browser': np.random.choice(['chrome', 'firefox', 'safari', 'edge'], size=n_users),
    'purchased': np.random.randint(0, 2, size=n_users),
    'order_value': np.random.uniform(20, 500, size=n_users),  # Adding order_value column
}
# Assemble the DataFrame
checkout = pd.DataFrame(checkout_data)

# Python metrics estimation: mean of 'purchased' per gender
checkout.groupby('gender')['purchased'].mean()

# Python metrics estimation: mean order value per gender, restricted to
# chrome and safari users
chrome_or_safari = checkout['browser'].isin(['chrome', 'safari'])
checkout[chrome_or_safari].groupby('gender')['order_value'].mean()

# Python metrics estimation: mean order value and mean 'purchased' per browser
checkout.groupby('browser')[['order_value', 'purchased']].mean()