Skip to content
Premier League
Soccer Data
This dataset contains data of every game from the 2018-2019 season in the English Premier League.
# Cell showing the raw dataset
import pandas as pd
soccer = pd.read_csv("soccer18-19.csv")
print(soccer.shape)
soccer.head(100)Data Dictionary
| Column | Explanation |
|---|---|
| Div | Division the game was played in |
| Date | The date the game was played |
| HomeTeam | The home team |
| AwayTeam | The away team |
| FTHG | Full time home goals |
| FTAG | Full time away goals |
| FTR | Full time result |
| HTHG | Half time home goals |
| HTAG | Half time away goals |
| HTR | Half time result |
| Referee | The referee of the game |
| HS | Number of shots taken by home team |
| AS | Number of shots taken by away team |
| HST | Number of shots taken by home team on target |
| AST | Number of shots taken by away team on target |
| HF | Number of fouls made by home team |
| AF | Number of fouls made by away team |
| HC | Number of corners taken by home team |
| AC | Number of corners taken by away team |
| HY | Number of yellow cards received by home team |
| AY | Number of yellow cards received by away team |
| HR | Number of red cards received by home team |
| AR | Number of red cards received by away team |
Source of dataset.
# PROMPT: What team commits the most fouls?
# Import pandas
import pandas as pd
# Load the document and explore the contents
soccer = pd.read_csv("soccer18-19.csv")
# Subset DataFrame to extract teams and the fouls they commit at home and away
home_fouls = soccer[['HomeTeam', 'HF']]
total_home_fouls = home_fouls.groupby('HomeTeam')['HF'].sum()
home_sorted = total_home_fouls.sort_values(ascending=False)
# home_sorted contains the number of home fouls in descending order
# Repeat, for away
away_fouls = soccer[['AwayTeam', 'AF']]
total_away_fouls = away_fouls.groupby('AwayTeam')['AF'].sum()
away_sorted = total_away_fouls.sort_values(ascending=False)
# away_sorted contains the number of away fouls in descending order
# Concatenate tables
total_sorted = pd.concat([home_sorted, away_sorted], axis=1)
# Create column to display total fouls
total_sorted['TF'] = total_sorted['HF'] + total_sorted['AF']
total_sorted = total_sorted.sort_values(by='TF', ascending=False)
# Make the team the index column
team_with_most_fouls = total_sorted['TF'].idxmax()
# Print
print('The team with the most fouls was ' + team_with_most_fouls)
total_sorted# PROMPT: Plot the percentage of games that ended in a draw over time
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load document
soccer = pd.read_csv("soccer18-19.csv")
# "soccer[soccer['FTR'] == 'D'.count()['FTR']]" provides the number of games that ended in a draw
# "soccer.count()['FTR']" provides the total number of games over the season
# Multiply denominator by 100
draw_pct = soccer[soccer['FTR'] == 'D'].count()['FTR'] / soccer.count()['FTR']*100
draw_pct = draw_pct.round(2)
# Get ready to make the Pie Chart
labels = 'Draw', 'No draw'
sizes = [draw_pct, (100-draw_pct)]
colors = ['#89CFF0', '#FF5733']
# Make the pie chart
fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, colors=colors, autopct='%.2f%%')
plt.show()
# Print
print('{}% of matches end in a draw.'.format(draw_pct))# PROMPT: Does the number of red cards a team receives have an effect on its probability of winning a game?
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load document
soccer = pd.read_csv("soccer18-19.csv")
# Make new column indicating a difference in red cards by team in a match
soccer['Red_Diff'] = soccer['HR'] - soccer['AR']
# Make new column indicating whether the home team won or not
soccer['HomeWin'] = soccer['FTR'].apply(lambda x: 1 if x == 'H' else 0)
# Group by the values of red card difference and calculate the mean of home team wins per value
red_diff_home_win = soccer.groupby('Red_Diff')['HomeWin'].mean()
# Plot the results
plt.plot(red_diff_home_win.index, red_diff_home_win.values)
plt.xlabel('Difference in Red Cards (Home - Away)')
plt.ylabel('Probability of Home Win')
plt.ylim(-0.05, 1.05)
plt.title('Effect of Red Cards on Probability of Home Win')
plt.show()# PROMPT: Make a league table from match data.
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load document
soccer = pd.read_csv("soccer18-19.csv")
# Create variables, group, sort
home_gf = soccer[['HomeTeam', 'FTHG']]
home_gf = home_gf.groupby('HomeTeam')['FTHG'].sum()
home_gf = home_gf.sort_values(ascending=False)
away_gf = soccer[['AwayTeam', 'FTAG']]
away_gf = away_gf.groupby('AwayTeam')['FTAG'].sum()
away_gf = away_gf.sort_values(ascending=False)
# Concatenate and add final column
total_gf = pd.concat([home_gf, away_gf], axis=1)
total_gf['GF'] = total_gf['FTHG'] + total_gf['FTAG']
total_gf = total_gf.drop(['FTHG', 'FTAG'], axis=1)
# Repeat process, but for goals allowed
home_ga = soccer[['HomeTeam', 'FTAG']]
home_ga = home_ga.groupby('HomeTeam')['FTAG'].sum()
home_ga = home_ga.sort_values(ascending=False)
away_ga = soccer[['AwayTeam', 'FTHG']]
away_ga = away_ga.groupby('AwayTeam')['FTHG'].sum()
away_ga = away_ga.sort_values(ascending=False)
total_ga = pd.concat([home_ga, away_ga], axis=1)
total_ga['GA'] = total_ga['FTAG'] + total_ga['FTHG']
total_ga = total_ga.drop(['FTAG', 'FTHG'], axis=1)
# Concatenate and add another column to new table
total_goals = pd.concat([total_gf, total_ga], axis=1)
total_goals['GD'] = total_goals['GF'] - total_goals['GA']
# Repeat the above steps, but for home and away points
home_points = soccer[['HomeTeam', 'FTR']]
home_points['HomePts'] = home_points['FTR'].apply(lambda x: 3 if x == 'H' else (1 if x == 'D' else 0))
home_points = home_points.groupby('HomeTeam')['HomePts'].sum()
home_points = home_points.sort_values(ascending=False)
away_points = soccer[['AwayTeam', 'FTR']]
away_points['AwayPts'] = away_points['FTR'].apply(lambda x: 3 if x == 'A' else (1 if x == 'D' else 0))
away_points = away_points.groupby('AwayTeam')['AwayPts'].sum()
away_points = away_points.sort_values(ascending=False)
total_points = pd.concat([home_points, away_points], axis=1)
total_points['Pts'] = total_points['HomePts'] + total_points['AwayPts']
total_points = total_points.drop(['HomePts', 'AwayPts'], axis=1)
# Merge, sort, and make the team the index
table_2018_19 = pd.concat([total_points, total_goals], axis=1)
table_2018_19 = table_2018_19.sort_values(by=['Pts', 'GD', 'GF'], ascending=[False, False, False])
table_2018_19['Team'] = table_2018_19.index
# What team won the Premier League?
champion = table_2018_19['Team'].iloc[0]
print(champion, 'won the 2018/19 Premier League.')
# The top four teams qualified for the 2019/20 UEFA Champions League. What teams qualified?
champions_league = table_2018_19['Team'].iloc[0:4]
print(f"{champions_league[0]}, {champions_league[1]}, {champions_league[2]}, and {champions_league[3]} qualified for the 2019/20 UEFA Champions League.")
# The fifth through seventh placed teams qualified for the 2019/20 UEFA Europa League. What teams qualified?
europa_league = table_2018_19['Team'].iloc[4:7]
print(f"{europa_league[0]}, {europa_league[1]}, and {europa_league[2]} qualified for the 2019/20 UEFA Europa League.")
# The three teams at the bottom of the table were relegated to the 2019/20 EFL Championship. What teams were relegated from the Premier League?
relegated_teams = table_2018_19['Team'].iloc[17:]
# For relegation, put the teams in reverse order
print(f"{relegated_teams[2]}, {relegated_teams[1]}, and {relegated_teams[0]} were relegated from the Premier League.")
# Drop the Team column, so the index doesn't appear twice
table_2018_19 = table_2018_19.drop('Team', axis=1)
print(table_2018_19)