Skip to content

Overview of A/B Testing

What is A/B testing?
import pandas as pd
import numpy as np

# Build a small mock A/B-test dataset: 100 users randomly assigned to
# group A or B, each with a binary conversion outcome.
np.random.seed(42)
mock_records = {
    'user_id': np.arange(1, 101),
    'group': np.random.choice(['A', 'B'], size=100),
    'conversion': np.random.choice([0, 1], size=100),
}
checkout = pd.DataFrame(mock_records)

# Summarize column dtypes and non-null counts.
checkout.info()

# Attach a mock 'gender' column.
# NOTE(review): re-seeding with 42 replays the exact random stream used for
# 'group', so 'gender' mirrors the A/B assignment one-to-one — confirm this
# confound is intentional for the demo.
np.random.seed(42)
checkout['gender'] = np.random.choice(['Male', 'Female'], size=100)

# Bootstrap 3000 rows (sampling with replacement) and inspect the gender
# proportions of the resample.
sample_df = checkout.sample(n=3000, replace=True)
sample_df['gender'].value_counts(normalize=True)

# Gender proportions in the original 100-row frame, for comparison.
checkout['gender'].value_counts(normalize=True)
import pandas as pd
import numpy as np

# Mock checkout data: 100 users with group, conversion, gender, and the
# checkout page variant they were shown.
np.random.seed(42)
records = {
    'user_id': np.arange(1, 101),
    'group': np.random.choice(['A', 'B'], size=100),
    'conversion': np.random.choice([0, 1], size=100),
    'gender': np.random.choice(['Male', 'Female'], size=100),
    'checkout_page': np.random.choice(['Page1', 'Page2'], size=100),
}
checkout = pd.DataFrame(records)

# Random-assignment check: gender proportions within each checkout page
# should be roughly balanced if assignment is random.
checkout.groupby('checkout_page')['gender'].value_counts(normalize=True)
import pandas as pd
import numpy as np

# Mock checkout data, now including each user's browser.
np.random.seed(42)
records = {
    'user_id': np.arange(1, 101),
    'group': np.random.choice(['A', 'B'], size=100),
    'conversion': np.random.choice([0, 1], size=100),
    'gender': np.random.choice(['Male', 'Female'], size=100),
    'checkout_page': np.random.choice(['Page1', 'Page2'], size=100),
    'browser': np.random.choice(['Chrome', 'Firefox', 'Safari', 'Edge'], size=100),
}
checkout = pd.DataFrame(records)

# Browser share in the full dataset.
checkout['browser'].value_counts(normalize=True)

# Bootstrap 2000 rows (with replacement) and recompute browser share
# on the resample.
sample_df = checkout.sample(n=2000, replace=True)
sample_df['browser'].value_counts(normalize=True)

# Browser share broken out by checkout page.
checkout.groupby('checkout_page')['browser'].value_counts(normalize=True)
Why run experiments?

The formula for Pearson's correlation coefficient (r) can also be represented as:

$$ r = \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^{n} (x_i - \bar{x})^2} \, \sqrt{\sum_{i=1}^{n} (y_i - \bar{y})^2}} $$

where:

  • $x_i$ and $y_i$ are the individual data points,
  • $\bar{x}$ is the mean of the $x$ scores,
  • $\bar{y}$ is the mean of the $y$ scores.
# Correlations visual inspection

# Import necessary libraries
import seaborn as sns
import pandas as pd
import numpy as np

# Mock admissions data: serial number, GRE score, and admission chance.
np.random.seed(42)
mock_admissions = {
    'Serial No.': np.arange(1, 101),
    'GRE Score': np.random.randint(290, 340, size=100),
    'Chance of Admit': np.random.uniform(0.3, 1.0, size=100),
}
admissions = pd.DataFrame(mock_admissions)

# Pairwise scatterplots to eyeball correlations between the variables.
sns.pairplot(admissions[['Serial No.', 'GRE Score', 'Chance of Admit']])
# Pearson correlation heatmap

# Import necessary libraries
import seaborn as sns
import pandas as pd
import numpy as np

# Mock admissions data covering all predictors plus the target
# 'Chance of Admit'.
np.random.seed(42)
mock_admissions = {
    'Serial No.': np.arange(1, 101),
    'GRE Score': np.random.randint(290, 340, size=100),
    'TOEFL Score': np.random.randint(90, 120, size=100),
    'University Rating': np.random.randint(1, 5, size=100),
    'SOP': np.random.uniform(1, 5, size=100),
    'LOR': np.random.uniform(1, 5, size=100),
    'CGPA': np.random.uniform(6, 10, size=100),
    'Research': np.random.randint(0, 2, size=100),
    'Chance of Admit': np.random.uniform(0.3, 1.0, size=100),
}
admissions = pd.DataFrame(mock_admissions)

# Pearson correlation between GRE score and admission chance.
admissions['GRE Score'].corr(admissions['Chance of Admit'])

# Full correlation matrix rendered as an annotated heatmap.
sns.heatmap(admissions.corr(), annot=True)
# Exercises

# Import necessary libraries
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Mock admissions data covering all predictors plus the target
# 'Chance of Admit'.
np.random.seed(42)
mock_admissions = {
    'Serial No.': np.arange(1, 101),
    'GRE Score': np.random.randint(290, 340, size=100),
    'TOEFL Score': np.random.randint(90, 120, size=100),
    'University Rating': np.random.randint(1, 5, size=100),
    'SOP': np.random.uniform(1, 5, size=100),
    'LOR': np.random.uniform(1, 5, size=100),
    'CGPA': np.random.uniform(6, 10, size=100),
    'Research': np.random.randint(0, 2, size=100),
    'Chance of Admit': np.random.uniform(0.3, 1.0, size=100),
}
admissions = pd.DataFrame(mock_admissions)

# Columns the exercise focuses on.
exercise_cols = ['Serial No.', 'TOEFL Score', 'SOP', 'Chance of Admit']

# Visualize the variables in a pairplot
sns.pairplot(admissions[exercise_cols])
plt.show()

# Print Pearson's correlation coefficients
print(admissions[exercise_cols].corr())

# Visualize the coefficients in a heatmap
sns.heatmap(admissions[exercise_cols].corr(), annot=True)
plt.show()
Metrics design and estimation
# Adding data to run the code

# Mock checkout data with a purchase flag and an order value so the
# metric estimates below can run.
np.random.seed(42)
checkout_data = {
    'user_id': np.arange(1, 101),
    'group': np.random.choice(['control', 'treatment'], size=100),
    'conversion': np.random.randint(0, 2, size=100),
    'gender': np.random.choice(['male', 'female'], size=100),
    'checkout_page': np.random.choice(['page1', 'page2'], size=100),
    'browser': np.random.choice(['chrome', 'firefox', 'safari', 'edge'], size=100),
    'purchased': np.random.randint(0, 2, size=100),
    'order_value': np.random.uniform(20, 500, size=100)  # Adding order_value column
}
checkout = pd.DataFrame(checkout_data)

# Python metrics estimation: purchase rate by gender.
checkout.groupby('gender')['purchased'].mean()

# Python metrics estimation: mean order value by gender, restricted to
# Chrome and Safari users (isin is equivalent to the chained == | == filter).
chrome_or_safari = checkout['browser'].isin(['chrome', 'safari'])
checkout[chrome_or_safari].groupby('gender')['order_value'].mean()

# Python metrics estimation: mean order value and purchase rate per browser.
checkout.groupby('browser')[['order_value', 'purchased']].mean()