Skip to content
Statistical Thinking in Python (Part 2)
Statistical Thinking in Python (Part 2)
Run the hidden code cell below to import the data used in the course.
# Importing the course packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Importing the course datasets
# header=[0, 1]: the first two rows of anscombe.csv together form the column header.
anscombe = pd.read_csv('datasets/anscombe.csv', header=[0, 1])
# comment='#': everything after a '#' on a line is ignored by the parser.
bees = pd.read_csv('datasets/bee_sperm.csv', comment='#')
literacy_fertility = pd.read_csv('datasets/female_literacy_fertility.csv')
beaks_1975 = pd.read_csv('datasets/finch_beaks_1975.csv')
beaks_2012 = pd.read_csv('datasets/finch_beaks_2012.csv')
frogs = pd.read_csv('datasets/frog_tongue.csv', comment='#')
mlb = pd.read_csv('datasets/mlb_nohitters.csv')
# The weather file is whitespace-delimited and marks missing values with '---'.
# The regex is a raw string: '\s' in a normal literal is an invalid escape
# sequence (SyntaxWarning on Python 3.12+).
weather = pd.read_csv(
    'datasets/sheffield_weather_station.csv',
    comment='#',
    delimiter=r'\s+',
    na_values='---',
)
Take Notes
Add notes about the concepts you've learned and code cells with code you want to keep.
Function to perform a pairs bootstrap for linear regression
def draw_bs_pairs_linreg(x, y, size=1):
    """Perform pairs bootstrap for linear regression.

    Each replicate resamples the (x, y) pairs with replacement, keeping
    every x together with its y, and fits a first-degree polynomial to
    the resampled pairs with ``np.polyfit``.

    Parameters
    ----------
    x, y : array_like
        Paired observations. Any sequence accepted by ``np.asarray``
        works (NumPy arrays, lists, pandas Series).
    size : int, optional
        Number of bootstrap replicates to generate (default 1).

    Returns
    -------
    bs_slope_reps : numpy.ndarray
        Fitted slope of each replicate, shape ``(size,)``.
    bs_intercept_reps : numpy.ndarray
        Fitted intercept of each replicate, shape ``(size,)``.
    """
    # Coerce to arrays so the fancy indexing below is positional: plain
    # lists do not support it at all, and a pandas Series would look the
    # indices up by label instead of by position.
    x = np.asarray(x)
    y = np.asarray(y)

    # Set up array of indices to sample from: inds
    inds = np.arange(len(x))

    # Initialize replicates: bs_slope_reps, bs_intercept_reps
    bs_slope_reps = np.empty(size)
    bs_intercept_reps = np.empty(size)

    # Generate replicates
    for i in range(size):
        # Sampling indices (not values) keeps each x paired with its y
        bs_inds = np.random.choice(inds, size=len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        # polyfit returns coefficients highest degree first: slope, intercept
        bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(bs_x, bs_y, 1)

    return bs_slope_reps, bs_intercept_reps
Hypothesis Testing
def permutation_sample(data1, data2):
    """Generate a permutation sample from two data sets.

    The two inputs are pooled, the pooled values are randomly reordered,
    and the result is cut back into two arrays: the first with
    ``len(data1)`` entries, the second with the remainder.
    """
    # Where the shuffled pool is cut back into two pieces
    split_at = len(data1)

    # Pool both data sets and scramble the order
    shuffled = np.random.permutation(np.concatenate((data1, data2)))

    # Cut the shuffled pool at the original size of data1
    first, second = np.split(shuffled, [split_at])
    return first, second
# Overlay the ECDFs of 50 permutation samples as a faint background cloud
for _ in range(50):
    # Draw one permutation sample from the two rainfall data sets
    perm_sample_1, perm_sample_2 = permutation_sample(rain_june, rain_november)

    # First permuted sample: compute its ECDF and plot it in faint red
    x_1, y_1 = ecdf(perm_sample_1)
    _ = plt.plot(
        x_1, y_1,
        marker='.', linestyle='none', color='red', alpha=0.02,
    )

    # Second permuted sample: compute its ECDF and plot it in faint blue
    x_2, y_2 = ecdf(perm_sample_2)
    _ = plt.plot(
        x_2, y_2,
        marker='.', linestyle='none', color='blue', alpha=0.02,
    )

# ECDFs of the original (unpermuted) data, drawn fully opaque on top
x_1, y_1 = ecdf(rain_june)
_ = plt.plot(x_1, y_1, marker='.', linestyle='none', color='red')

x_2, y_2 = ecdf(rain_november)
_ = plt.plot(x_2, y_2, marker='.', linestyle='none', color='blue')

# Set margin, label axes, and show plot
plt.margins(0.02)
_ = plt.xlabel('monthly rainfall (mm)')
_ = plt.ylabel('ECDF')
plt.show()
Explore Datasets
Use the DataFrames imported in the first cell to explore the data and practice your skills!
- Show that the four sets of Anscombe data have the same slope and intercept. Plot the four sets separately and their best fit lines.
- Investigate whether beak length and beak depth are significantly different across years (`beaks_1975` vs. `beaks_2012`) or species (see the `species` column).
- Pick two continents from `literacy_fertility` and test the null hypothesis that the country-level female literacy rate is identically distributed between the two continents you've picked.