Skip to content

Statistical Thinking in Python (Part 2)

Run the hidden code cell below to import the data used in the course.

# Importing the course packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Importing the course datasets
# anscombe.csv has two header rows, so its columns load as a two-level MultiIndex.
anscombe = pd.read_csv('datasets/anscombe.csv', header=[0, 1])
# comment='#' makes the parser ignore everything after a '#' on a line.
bees = pd.read_csv('datasets/bee_sperm.csv', comment='#')
literacy_fertility = pd.read_csv('datasets/female_literacy_fertility.csv')
beaks_1975 = pd.read_csv('datasets/finch_beaks_1975.csv')
beaks_2012 = pd.read_csv('datasets/finch_beaks_2012.csv')
frogs = pd.read_csv('datasets/frog_tongue.csv', comment='#')
mlb = pd.read_csv('datasets/mlb_nohitters.csv')
# The weather file is whitespace-delimited and writes missing values as '---'.
# The delimiter is a regular expression, so it is spelled as a raw string:
# '\s' inside a normal string literal is an invalid escape sequence.
weather = pd.read_csv(
    'datasets/sheffield_weather_station.csv',
    comment='#',
    delimiter=r'\s+',
    na_values='---',
)

Take Notes

Add notes about the concepts you've learned and code cells with code you want to keep.

Function to perform a pairs bootstrap for linear regression

def draw_bs_pairs_linreg(x, y, size=1):
    """Perform pairs bootstrap for linear regression.

    Each replicate resamples the (x, y) pairs with replacement, keeping
    every x value matched with its own y value, and fits a straight line
    to the resampled pairs with ``np.polyfit``.

    Parameters
    ----------
    x, y : array_like
        Equal-length sequences holding the paired observations. Lists,
        NumPy arrays and pandas Series are all accepted.
    size : int, optional
        Number of bootstrap replicates to generate (default 1).

    Returns
    -------
    bs_slope_reps, bs_intercept_reps : numpy.ndarray
        Two arrays of length ``size`` holding the fitted slope and
        intercept of each replicate.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same length.
    """
    # Work on NumPy arrays so the fancy indexing below is always positional.
    # A plain list cannot be indexed with an index array, and a pandas Series
    # would look the indices up as labels, which picks the wrong rows (or
    # raises) whenever its index is not 0..n-1.
    x = np.asarray(x)
    y = np.asarray(y)

    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length, got {len(x)} and {len(y)}"
        )

    # Set up array of indices to sample from: inds
    inds = np.arange(0, len(x))

    # Initialize replicates: bs_slope_reps, bs_intercept_reps
    bs_slope_reps = np.empty(size)
    bs_intercept_reps = np.empty(size)

    # Generate replicates
    for i in range(size):
        # Sampling indices (not values) keeps each x paired with its y.
        bs_inds = np.random.choice(inds, size=len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        # polyfit returns coefficients highest degree first: slope, intercept.
        bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(bs_x, bs_y, 1)

    return bs_slope_reps, bs_intercept_reps

Hypothesis Testing

def permutation_sample(data1, data2):
    """Return a permutation sample drawn from two data sets.

    The two inputs are pooled, the pooled values are shuffled, and the
    result is cut back into two pieces: the first has ``len(data1)``
    entries and the second holds the remainder.
    """
    n_first = len(data1)

    # Pool both data sets and scramble the pooled values
    pooled = np.random.permutation(np.concatenate((data1, data2)))

    # Cut the shuffled pool at the size of the first data set
    perm_sample_1, perm_sample_2 = np.split(pooled, [n_first])

    return perm_sample_1, perm_sample_2
# NOTE(review): ecdf, rain_june and rain_november are not defined in the part
# of the notebook shown here; they must already exist before this cell runs.

# Keyword arguments shared by the plots: bare points, no connecting line
faint_points = dict(marker='.', linestyle='none', alpha=0.02)
solid_points = dict(marker='.', linestyle='none')

# Overlay the ECDFs of 50 permutation samples as nearly transparent points
for _ in range(50):
    # Shuffle the pooled June/November values and split them back in two
    perm_sample_1, perm_sample_2 = permutation_sample(rain_june, rain_november)

    # ECDF of each half of the permutation sample
    x_1, y_1 = ecdf(perm_sample_1)
    x_2, y_2 = ecdf(perm_sample_2)

    # First half in red, second half in blue
    _ = plt.plot(x_1, y_1, color='red', **faint_points)
    _ = plt.plot(x_2, y_2, color='blue', **faint_points)

# Draw the ECDFs of the unshuffled data on top, fully opaque
x_1, y_1 = ecdf(rain_june)
x_2, y_2 = ecdf(rain_november)
_ = plt.plot(x_1, y_1, color='red', **solid_points)
_ = plt.plot(x_2, y_2, color='blue', **solid_points)

# Pad the axes slightly, label them, and render the figure
plt.margins(0.02)
_ = plt.xlabel('monthly rainfall (mm)')
_ = plt.ylabel('ECDF')
plt.show()

Explore Datasets

Use the DataFrames imported in the first cell to explore the data and practice your skills!

  • Show that the four sets of Anscombe data have the same slope and intercept. Plot the four sets separately and their best fit lines.
  • Investigate whether beak length and beak depth are significantly different across years (beaks_1975 vs beaks_2012) or species (see species column).
  • Pick two continents from literacy_fertility and test the null hypothesis that the country-level female literacy rate is identically distributed between the two continents you've picked.