Skip to content
1 hidden cell
2019 EPL data
2019 EPL Data
This dataset contains data of every game from the 2018-2019 season in the English Premier League.
Not sure where to begin? Scroll to the bottom to find challenges!
import pandas as pd
import plotly.express as px
import timeit
import numpy as np
soccer_df = pd.read_csv("soccer18-19.csv")# Inspect dataset
soccer_df.head(100)1 hidden cell
# Clean up date column
soccer_df.Date = pd.to_datetime(soccer_df.Date)
soccer_df.Date = soccer_df.Date.dt.strftime('%m-%d-%Y')
soccer_df = soccer_df.replace(to_replace=dates.keys(), value=dates.values())Functions - Useful and unuseful
Function to create a dataframe for the specified team
def create_team_df(team):
    at = soccer_df.loc[soccer_df.AwayTeam == team]
    ht = soccer_df.loc[soccer_df.HomeTeam == team]
    
    return at.merge(ht, how='outer')Calculate total Wins, Draws, losses, with Team as input (This is likely a slow solution)
%%timeit
def count_wins(team):
    team_df = create_team_df(team)
    counts = dict(team_df['FTR'].value_counts())
    draws = counts['D']
    away_wins_tup = np.where((team_df.FTR == 'A') & (team_df.AwayTeam == team))
    away_wins = len(away_wins_tup[0])
    
    home_wins_tup = np.where((team_df.FTR == 'H') & (team_df.HomeTeam == team))
    home_wins = len(home_wins_tup[0])
    
    away_losses_tup = np.where((team_df.FTR == 'A') & (team_df.AwayTeam != team))
    away_losses = len(away_losses_tup[0])
    
    home_losses_tup = np.where((team_df.FTR == 'H') & (team_df.HomeTeam != team))
    home_losses = len(home_losses_tup[0])
    
    results = {'wins': away_wins + home_wins, 'draws': draws, 'losses': away_losses + home_losses}
    return results
count_wins('Chelsea')chelsea_df = create_team_df('Chelsea')
che_subset_df = chelsea_df[['Date','HomeTeam','AwayTeam','HTHG',
                            'HTAG','HTR','FTHG','FTAG','FTR']]
che_subset_dfInvestigation into number of yellow cards given by referees
def get_yellows_by_column(ref, col):
    """ Narrow down DataFrame to show the ref
        This can be altered to filter for team, Date, etc
    """
    ref_df = soccer_df[soccer_df['Referee'] == ref]
    """ specify column """
    col = ref_df[col]
    return sum(col)#%%timeit
def get_card_total(ref, color):
    """Check to see if Referee name is valid"""
    if ref in soccer_df['Referee'].values:
        pass
    else:
        return 'Please use a valid Ref name'
    # Narrow down DataFrame to show the ref
    ref_df = soccer_df[soccer_df['Referee'] == ref]
    # Check color, perform calculation, warn of wrong color
    if color == 'red':
        red_total = sum(ref_df['HR']) + sum(ref_df['AR'])
        return red_total
    elif color == 'yellow':
        yellow_total = sum(ref_df['HY']) + sum(ref_df['AY'])
        return yellow_total
    else:
        raise ValueError("Please enter 'yellow' or 'red'")# Function to get yellow cards by Ref
def get_yellow_total(ref):
    """ Narrow down DataFrame to show the ref; Calculate yellow cards """
    ref_df = soccer_df[soccer_df['Referee'] == ref]
    yellow_total = sum(ref_df['HY']) + sum(ref_df['AY'])
    return yellow_total