Project: Hypothesis Testing with Men's and Women's Soccer Matches

You're working as a sports journalist at a major online sports media company, specializing in soccer analysis and reporting. You've been watching both men's and women's international soccer matches for a number of years, and your gut instinct tells you that more goals are scored in women's international football matches than men's. This would make an interesting investigative article that your subscribers are bound to love, but you'll need to perform a valid statistical hypothesis test to be sure!

While scoping this project, you acknowledge that the sport has changed a lot over the years, and performances likely vary a lot depending on the tournament, so you decide to limit the data used in the analysis to only official FIFA World Cup matches (not including qualifiers) since 2002-01-01.

You create two datasets containing the results of every official men's and women's international football match, which you scraped from a reliable online source. This data is stored in two CSV files: women_results.csv and men_results.csv.

The question you are trying to determine the answer to is:

Are more goals scored in women's international soccer matches than men's?

You assume a 10% significance level, and use the following null and alternative hypotheses:

: The mean number of goals scored in women's international soccer matches is the same as men's.

: The mean number of goals scored in women's international soccer matches is greater than men's.

# Start your code here!
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt

# Import data
df_m = pd.read_csv('men_results.csv')
df_w = pd.read_csv('women_results.csv')

# Parse date column 
df_m['date'] = pd.to_datetime(df_m['date']).dt.strftime('%Y-%m-%d')
df_w['date'] = pd.to_datetime(df_w['date']).dt.strftime('%Y-%m-%d')

# Slice data as per conditions
df_m = df_m[(df_m['tournament'] == "FIFA World Cup") & (df_m['date'] >= "2002-01-01")]
df_w = df_w[(df_w['tournament'] == "FIFA World Cup") & (df_w['date'] >= "2002-01-01")]

# Group and transform data 
df_m_grouped = df_m.groupby(['home_team'])['home_score', 'away_score'].sum()
df_w_grouped = df_w.groupby(['home_team'])['home_score', 'away_score'].sum()

# Summarize
df_m_grouped['total_score'] = np.sum([df_m_grouped['home_score'], df_m_grouped['away_score']], axis=0)
df_w_grouped['total_score'] = np.sum([df_w_grouped['home_score'], df_w_grouped['away_score']], axis=0)

# Reshape
df_m_clean = df_m_grouped.drop(['home_score', 'away_score'], axis=1)
df_w_clean = df_w_grouped.drop(['home_score', 'away_score'], axis=1)

# Initial Graphical Exploratory Analysis

# Make x and y arrays to plot ecdf
x_m = np.sort(df_m_clean['total_score'])
n_m = len(x_m)
y_m = np.arange(1, n_m+1)/n_m

x_w = np.sort(df_w_clean['total_score'])
n_w = len(x_w)
y_w = np.arange(1, n_w+1)/n_w

# Plot ECDF
_ = plt.plot(x_m, y_m, marker='.', linestyle='none', color='blue')
_ = plt.plot(x_w, y_w, marker='.', linestyle='none', color='red')
_ = plt.xlabel('Total Goal Score')
_ = plt.ylabel('ECDF')
plt.margins(0.02)
plt.show()

# Compute the mean
df_m_clean_mean = np.mean(df_m_clean['total_score'])
df_w_clean_mean = np.mean(df_w_clean['total_score'])

# Compute difference of means (Observed Test Statistic)
diff_of_means_observed = df_w_clean_mean - df_m_clean_mean

# Compute combined mean to normalise datasets
combined_mean = (df_m_clean_mean + df_w_clean_mean) / 2

# new arrays to perform bootstrap on
men_shifted = (df_m_clean - df_m_clean_mean) + combined_mean
women_shifted = (df_w_clean - df_w_clean_mean) + combined_mean

# Convert to arrays
men_shifted_array = np.array(men_shifted)
women_shifted_array = np.array(women_shifted)

# Write function call signature for bootstrapping 
def draw_bs_reps(data, func, size=1):
    bs_replicates = np.empty(size)
    for i in range(size):
        bs_replicates[i] = func(np.random.choice(data.ravel(), len(data)))
    return bs_replicates    

# Acquire 10000 replicates of means
replicates_m = draw_bs_reps(men_shifted_array, np.mean, size=10000)
replicates_w = draw_bs_reps(women_shifted_array, np.mean, size=10000)

# Compute difference of means for replicates (Sample Test Statistic)
diff_of_means_reps = replicates_w - replicates_m

# Compute p-value
p = np.sum(diff_of_means_reps >= diff_of_means_observed)/len(diff_of_means_reps)

print('p-value: ', p)

# Conclusion: Assuming a significance level of 10% (0.1): In this case, since the p-value (0.3) is not less than 0.1, we fail to reject the null hypothesis and can conclude that there is not enough evidence to support the alternative hypothesis.

# Compute 95% Confidence Interval
conf_int = np.percentile(np.sort(diff_of_means_reps), [2.5, 97.5])
print('Difference of means: ', diff_of_means_observed,'\n' '95% Confidence Interval: ', conf_int)

# Histogram of difference of mean (replicates vs observed)
_ = plt.hist(diff_of_means_reps)
# Add a vertical line for the observed difference of means
plt.axvline(diff_of_means_observed, color='black', linestyle='--', label='Observed Difference')
plt.xlabel('Difference of Means')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# Computing p-value with Permutation Test

# Inner function
def permutation_sample(data_1, data_2):
    data = np.concatenate((data_1, data_2))
    permutted_data = np.random.permutation(data)
    perm_sample_1 = permutted_data[:len(data_1)]
    perm_sample_2 = permutted_data[len(data_1):]
    return perm_sample_1, perm_sample_2

# Outer function
def draw_perm_reps(data_1, data_2, func, size=1):
    perm_replicates = np.empty(size)
    for i in range(size):
        perm_sample_1, perm_sample_2 = permutation_sample(data_1, data_2)
        perm_replicates[i] = func(perm_sample_1, perm_sample_2)
    return perm_replicates

# function signature call
def diff_of_means(data_1, data_2):
    diff = np.mean(data_1) - np.mean(data_2)
    return diff
perm_replicates = draw_perm_reps(df_w_clean, df_m_clean, diff_of_means, 10000)
p = np.sum(perm_replicates >= diff_of_means_observed)/10000

# Print p-value
print(p)