Skip to content
Course Notes: Experimental Design in Python
Experimental Design in Python
Experimental Design Preliminaries
Setting up experiments
import pandas as pd
# Demonstration data: ten height values repeated 20 times (200 rows)
data = {'height': [150, 160, 170, 180, 190, 200, 210, 220, 230, 240] * 20}
height = pd.DataFrame(data)

# Non-random assignment: split purely by row position
# (first 100 rows versus everything after them)
group1_nonrandom = height.iloc[:100]
group2_nonrandom = height.iloc[100:]

# Put the descriptive statistics of both groups side by side
group1_summary = group1_nonrandom['height'].describe()
group2_summary = group2_nonrandom['height'].describe()
compare_df = pd.concat(
    [group1_summary, group2_summary],
    axis=1,
    keys=['group1', 'group2'],
)
print(compare_df)
# Import necessary library
import pandas as pd
# Create a sample DataFrame for heights
data = {'heights': [150, 160, 170, 180, 190, 200, 210, 220, 230, 240]}
heights = pd.DataFrame(data)

# Random assignment: draw half of the rows without replacement (seeded so the
# split is reproducible); the rows that were not drawn form the second group.
group1 = heights.sample(frac=0.5, replace=False, random_state=42)
group2 = heights.drop(group1.index)

# Build the comparison table from the two groups created above, so that the
# printed statistics describe this random split.
compare_df = pd.concat(
    [group1['heights'].describe(), group2['heights'].describe()],
    axis=1,
)
compare_df.columns = ['group1', 'group2']
print(compare_df)
import pandas as pd
# Demonstration data: the weights 1 through 500
data = {'weight': list(range(1, 501))}
weights = pd.DataFrame(data)

# Non-random assignment: first 250 rows versus the remaining rows
group1_non_rand = weights.iloc[:250]
group2_non_rand = weights.iloc[250:]

# Descriptive statistics of the positional split, side by side
compare_df_non_rand = pd.concat(
    [
        group1_non_rand['weight'].describe(),
        group2_non_rand['weight'].describe(),
    ],
    axis=1,
    keys=['group1', 'group2'],
)
# Print to assess
print(compare_df_non_rand)

# Random assignment: sample half of the rows (seeded, without replacement);
# whatever was not sampled becomes the second group
group1_random = weights.sample(frac=0.5, replace=False, random_state=42)
group2_random = weights.drop(group1_random.index)

# Descriptive statistics of the random split, side by side
compare_df_random = pd.concat(
    [
        group1_random['weight'].describe(),
        group2_random['weight'].describe(),
    ],
    axis=1,
    keys=['group1', 'group2'],
)
print(compare_df_random)
Experimental data setup
import pandas as pd
# Demonstration data: ten customers with purchase amounts 100, 150, ..., 550
data = {
    'CustomerID': list(range(1, 11)),
    'PurchaseAmount': list(range(100, 551, 50)),
}
ecom = pd.DataFrame(data)

# Block randomization in Python: a seeded random half of the rows is
# labelled block 1, and every remaining row is labelled block 2
group1 = ecom.sample(frac=0.5, replace=False, random_state=42).assign(Block=1)
group2 = ecom.drop(group1.index).assign(Block=2)
print(len(group1), len(group2))
# Visualizing splits
import seaborn as sns
import matplotlib.pyplot as plt
# Small demonstration frame: five basket sizes, each tagged with whether the
# customer is a power user ('yes' / 'no')
basket_sizes = [1, 2, 3, 4, 5]
power_user_flags = ['yes', 'no', 'yes', 'no', 'yes']
ecom = pd.DataFrame({
    'basket_size': basket_sizes,
    'power_user': power_user_flags,
})

# Filled kernel-density plot of basket size, coloured by power_user value
sns.displot(data=ecom, x='basket_size', hue='power_user', kind='kde', fill=True)
plt.show()
# Our first strata
# First strata: the power users. The ecom frame labels them with the string
# 'yes', so the filter must compare against that value. Take an explicit copy
# so that adding columns does not write into a slice of ecom.
strata_1 = ecom[ecom['power_user'] == 'yes'].copy()
strata_1['Block'] = 1

# Within the strata, a seeded random half is Treatment and the rest Control
strata_1_g1 = strata_1.sample(frac=0.5, replace=False, random_state=42)
strata_1_g1['T_C'] = 'T'
strata_1_g2 = strata_1.drop(strata_1_g1.index)
strata_1_g2['T_C'] = 'C'

# The second strata: every row that is not in the first strata
strata_2 = ecom.drop(strata_1.index)
strata_2['Block'] = 2
strata_2_g1 = strata_2.sample(frac=0.5, replace=False, random_state=42)
strata_2_g1['T_C'] = 'T'
# The control group is the part of strata_2 that was not sampled into g1
strata_2_g2 = strata_2.drop(strata_2_g1.index)
strata_2_g2['T_C'] = 'C'

# Confirming stratification: count rows per block / assignment / power_user
ecom_stratified = pd.concat([strata_1_g1, strata_1_g2, strata_2_g1, strata_2_g2])
print(ecom_stratified.groupby(['Block', 'T_C', 'power_user']).size())
import pandas as pd
# Demonstration data: 100 productivity subjects, and 200 wealth records of
# which the first 100 are flagged high_wealth = 1 and the rest 0
productivity_subjects = pd.DataFrame({
    'subject_id': range(1, 101),
    'productivity_score': range(100, 200),
})
wealth_data = pd.DataFrame({
    'subject_id': range(1, 201),
    'high_wealth': [1 if i < 100 else 0 for i in range(200)],
})

# Exercises
# Randomly assign half
block_1 = productivity_subjects.sample(frac=0.5, random_state=42, replace=False)
# Set the block column
block_1['block'] = 1
# Create second assignment and label
block_2 = productivity_subjects.drop(block_1.index)
block_2['block'] = 2
# Concatenate and print
productivity_combined = pd.concat([block_1, block_2], axis=0)
print(productivity_combined['block'].value_counts())

# Create the first block. Copy the filtered rows so that adding columns does
# not write into a slice of wealth_data.
strata_1 = wealth_data[wealth_data['high_wealth'] == 1].copy()
strata_1['Block'] = 1
# Create two groups assigning to Treatment or Control (seeded for
# reproducibility)
strata_1_g1 = strata_1.sample(frac=0.5, replace=False, random_state=42)
strata_1_g1['T_C'] = 'T'
strata_1_g2 = strata_1.drop(strata_1_g1.index)
strata_1_g2['T_C'] = 'C'

# Create the second block and assign groups. Half of the strata goes to
# Treatment, matching the first block, so both blocks are split evenly.
strata_2 = wealth_data[wealth_data['high_wealth'] == 0].copy()
strata_2['Block'] = 2
strata_2_g1 = strata_2.sample(frac=0.5, replace=False, random_state=42)
strata_2_g1['T_C'] = 'T'
strata_2_g2 = strata_2.drop(strata_2_g1.index)
strata_2_g2['T_C'] = 'C'

# Concatenate the grouping work
wealth_data_stratified = pd.concat([strata_1_g1, strata_1_g2, strata_2_g1, strata_2_g2])
print(wealth_data_stratified.groupby(['Block', 'T_C', 'high_wealth']).size())
import pandas as pd
# Two small frames with the same Block / T_C layout but opposite
# high_free_time values
data_1 = {
    'Block': ['A', 'A', 'B', 'B'],
    'T_C': ['T', 'C', 'T', 'C'],
    'high_free_time': [1, 0, 1, 0],
}
data_2 = {
    'Block': ['A', 'A', 'B', 'B'],
    'T_C': ['T', 'C', 'T', 'C'],
    'high_free_time': [0, 1, 0, 1],
}
df_1 = pd.DataFrame(data_1)
df_2 = pd.DataFrame(data_2)

# Count rows per (Block, T_C, high_free_time) combination in each frame to
# check how 'high_free_time' is spread across blocks and assignments
strata_columns = ['Block', 'T_C', 'high_free_time']
df_1_stratification = df_1.groupby(strata_columns).size()
df_2_stratification = df_2.groupby(strata_columns).size()
df_1_stratification, df_2_stratification
Normal Data
# Import necessary libraries
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Demonstration data: ten salaries from 50,000 to 140,000 in steps of 10,000
data = {'salary': list(range(50000, 150000, 10000))}
salaries = pd.DataFrame(data)

# Visualizing the data: kernel-density plot of the salary column
sns.displot(data=salaries, kind="kde", x='salary')
plt.show()
## QQ Plots
from statsmodels.graphics.gofplots import qqplot
from scipy.stats.distributions import norm
# Q-Q plot of the salary column against the normal distribution, drawn with
# the 's' reference-line option
salary_column = salaries['salary']
qqplot(salary_column, dist=norm, line='s')
plt.show()