Skip to content

Experimental Design in Python

Experimental Design Preliminaries

Setting up experiments

import pandas as pd

# Demonstration data: the ten heights 150..240 (step 10) repeated 20 times,
# giving 200 rows in a fixed, repeating order.
data = {'height': list(range(150, 250, 10)) * 20}
height = pd.DataFrame(data)

# Non-random assignment: split purely by row position,
# first 100 rows versus everything after them.
group1_nonrandom = height.iloc[:100]
group2_nonrandom = height.iloc[100:]

# Side-by-side descriptive statistics, one column per group.
compare_df = pd.concat(
    {
        'group1': group1_nonrandom['height'].describe(),
        'group2': group2_nonrandom['height'].describe(),
    },
    axis=1,
)
print(compare_df)
# Import necessary library
import pandas as pd

# Create a sample DataFrame for heights
data = {'heights': [150, 160, 170, 180, 190, 200, 210, 220, 230, 240]}
heights = pd.DataFrame(data)

# Random assignment: draw half of the rows without replacement.
# random_state is fixed so the split is reproducible.
group1 = heights.sample(frac=0.5,
                        replace=False,
                        random_state=42)

# Everything that was not drawn forms the second group.
group2 = heights.drop(group1.index)

# Summarise the two random groups side by side. The comparison table has to
# be built from group1/group2 here; otherwise the print would show whatever
# compare_df was left over from an earlier section (or fail if none exists).
compare_df = pd.concat(
    [group1['heights'].describe(),
     group2['heights'].describe()],
    axis=1)
compare_df.columns = ['group1', 'group2']
print(compare_df)
import pandas as pd

# Demonstration data: weights 1..500 in ascending order.
data = {'weight': list(range(1, 501))}
weights = pd.DataFrame(data)


def _describe_pair(first, second):
    """Return the describe() output of both groups' 'weight' as two columns."""
    summary = pd.concat(
        [first['weight'].describe(), second['weight'].describe()],
        axis=1,
    )
    summary.columns = ['group1', 'group2']
    return summary


# Non-random assignment: first 250 rows versus the remaining rows.
group1_non_rand = weights.iloc[:250]
group2_non_rand = weights.iloc[250:]

# Because the data is sorted, the positional split separates low from high
# weights; the summary makes that visible.
compare_df_non_rand = _describe_pair(group1_non_rand, group2_non_rand)
print(compare_df_non_rand)

# Random assignment: draw half of the rows (seeded, without replacement) ...
group1_random = weights.sample(frac=0.5, random_state=42, replace=False)

# ... and put every row that was not drawn into the second group.
group2_random = weights.drop(index=group1_random.index)

# Summarise the random split the same way for comparison.
compare_df_random = _describe_pair(group1_random, group2_random)
print(compare_df_random)

Experimental data setup

import pandas as pd

# Demonstration data: ten customers with purchase amounts 100..550 (step 50).
data = {
    'CustomerID': list(range(1, 11)),
    'PurchaseAmount': list(range(100, 600, 50)),
}
ecom = pd.DataFrame(data)

# Block randomization in Python
# Block 1 is a seeded random half of the customers; block 2 is the rest.
# assign() attaches the label to the new frames and leaves ecom untouched.
group1 = ecom.sample(frac=0.5, random_state=42, replace=False).assign(Block=1)
group2 = ecom.drop(index=group1.index).assign(Block=2)

# Both blocks should have the same number of rows.
print(group1.shape[0], group2.shape[0])
# Visualizing splits

import seaborn as sns
import matplotlib.pyplot as plt

# Toy basket data: five baskets, each flagged 'yes'/'no' as a power user.
basket_records = {
    'basket_size': [1, 2, 3, 4, 5],
    'power_user': ['yes', 'no', 'yes', 'no', 'yes'],
}
ecom = pd.DataFrame(basket_records)

# Draw one filled KDE curve of basket_size per power_user value so the
# two groups' distributions can be compared on the same axes.
kde_options = dict(x='basket_size', hue='power_user', kind='kde', fill=True)
sns.displot(data=ecom, **kde_options)
plt.show()
# The first strata

# power_user is stored as the strings 'yes'/'no' in the ecom frame built
# above, so the filter must compare against 'yes' (comparing against 1
# selects no rows). .copy() makes the stratum an independent frame so that
# adding columns does not write into a slice of ecom.
strata_1 = ecom[ecom['power_user'] == 'yes'].copy()
strata_1['Block'] = 1

# Within the stratum, a random half becomes Treatment, the rest Control.
strata_1_g1 = strata_1.sample(frac=0.5, replace=False)
strata_1_g1['T_C'] = 'T'
strata_1_g2 = strata_1.drop(strata_1_g1.index)
strata_1_g2['T_C'] = 'C'

# The second strata

# Every row not in the first stratum.
strata_2 = ecom.drop(strata_1.index)
strata_2['Block'] = 2
strata_2_g1 = strata_2.sample(frac=0.5, replace=False)
strata_2_g1['T_C'] = 'T'
# Control for stratum 2 is what remains of strata_2 after removing its own
# treatment rows (not a second copy of stratum 1's control rows).
strata_2_g2 = strata_2.drop(strata_2_g1.index)
strata_2_g2['T_C'] = 'C'

# Confirming stratification

# Each original row should appear exactly once; the group sizes show how the
# rows are spread over Block x Treatment/Control x power_user.
ecom_stratified = pd.concat([strata_1_g1, strata_1_g2, strata_2_g1, strata_2_g2])
print(ecom_stratified.groupby(['Block', 'T_C', 'power_user']).size())
import pandas as pd

# Demonstration data: 100 subjects with productivity scores 100..199.
productivity_subjects = pd.DataFrame({
    'subject_id': range(1, 101),
    'productivity_score': range(100, 200)
})

# Demonstration data: 200 subjects; the first 100 are flagged high_wealth=1,
# the remaining 100 are high_wealth=0.
wealth_data = pd.DataFrame({
    'subject_id': range(1, 201),
    'high_wealth': [1 if i < 100 else 0 for i in range(200)]
})

# Exercises

# --- Block randomization -------------------------------------------------

# Randomly assign half (seeded so the split is reproducible)
block_1 = productivity_subjects.sample(frac=0.5, random_state=42, replace=False)

# Set the block column
block_1['block'] = 1

# Create second assignment and label: every subject not drawn into block 1
block_2 = productivity_subjects.drop(block_1.index)
block_2['block'] = 2

# Concatenate and print; both blocks should have the same count
productivity_combined = pd.concat([block_1, block_2], axis=0)
print(productivity_combined['block'].value_counts())

# --- Stratified randomization --------------------------------------------

# Create the first block. .copy() makes the stratum an independent frame so
# adding columns does not write into a filtered slice of wealth_data.
strata_1 = wealth_data[wealth_data['high_wealth'] == 1].copy()
strata_1['Block'] = 1

# Create two groups assigning to Treatment or Control
strata_1_g1 = strata_1.sample(frac=0.5, replace=False)
strata_1_g1['T_C'] = 'T'
strata_1_g2 = strata_1.drop(strata_1_g1.index)
strata_1_g2['T_C'] = 'C'

# Create the second block and assign groups
strata_2 = wealth_data[wealth_data['high_wealth'] == 0].copy()
strata_2['Block'] = 2

# Use frac=0.5 here as well: a fixed count of 90 would put 90 of this
# stratum's 100 rows into Treatment and only 10 into Control, whereas
# a fraction keeps the split balanced whatever the stratum size is.
strata_2_g1 = strata_2.sample(frac=0.5, replace=False)
strata_2_g1['T_C'] = 'T'
strata_2_g2 = strata_2.drop(strata_2_g1.index)
strata_2_g2['T_C'] = 'C'

# Concatenate the grouping work and show the size of each
# Block x Treatment/Control x high_wealth cell
wealth_data_stratified = pd.concat([strata_1_g1, strata_1_g2, strata_2_g1, strata_2_g2])
print(wealth_data_stratified.groupby(['Block','T_C', 'high_wealth']).size())
import pandas as pd

# Two toy assignment tables with identical Block / T_C layouts; they differ
# only in which rows carry high_free_time = 1.
data_1 = {
    'Block': ['A', 'A', 'B', 'B'],
    'T_C': ['T', 'C', 'T', 'C'],
    'high_free_time': [1, 0, 1, 0],
}
data_2 = {
    'Block': ['A', 'A', 'B', 'B'],
    'T_C': ['T', 'C', 'T', 'C'],
    'high_free_time': [0, 1, 0, 1],
}

df_1 = pd.DataFrame(data_1)
df_2 = pd.DataFrame(data_2)

# Grouping by all three columns and counting rows shows how
# 'high_free_time' is distributed over each Block / T_C combination.
strata_keys = ['Block', 'T_C', 'high_free_time']
df_1_stratification = df_1.groupby(strata_keys).size()
df_2_stratification = df_2.groupby(strata_keys).size()

# Notebook-style display of both results (no effect when run as a script).
(df_1_stratification, df_2_stratification)

Normal Data

# Import necessary libraries
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Demonstration data: ten salaries from 50,000 to 140,000 in steps of 10,000.
data = {'salary': list(range(50000, 150000, 10000))}
salaries = pd.DataFrame(data)

# Visualizing normal data
# A KDE curve of the salary column gives a first visual impression of
# the distribution's shape.
sns.displot(data=salaries, kind="kde", x='salary')
plt.show()
## QQ Plots

from statsmodels.graphics.gofplots import qqplot
from scipy.stats.distributions import norm
# Q-Q plot of the salary column against the normal distribution.
# line='s' adds the standardized reference line to the plot.
salary_values = salaries['salary']
qqplot(salary_values, dist=norm, line='s')
plt.show()