Premier League

Soccer Data

This dataset contains data of every game from the 2018-2019 season in the English Premier League.

# Cell showing the raw dataset
import pandas as pd
soccer = pd.read_csv("soccer18-19.csv")
print(soccer.shape)
soccer.head(100)

Data Dictionary

Column	Explanation
Div	Division the game was played in
Date	The date the game was played
HomeTeam	The home team
AwayTeam	The away team
FTHG	Full time home goals
FTAG	Full time away goals
FTR	Full time result
HTHG	Half time home goals
HTAG	Half time away goals
HTR	Half time result
Referee	The referee of the game
HS	Number of shots taken by home team
AS	Number of shots taken by away team
HST	Number of shots taken by home team on target
AST	Number of shots taken by away team on target
HF	Number of fouls made by home team
AF	Number of fouls made by away team
HC	Number of corners taken by home team
AC	Number of corners taken by away team
HY	Number of yellow cards received by home team
AY	Number of yellow cards received by away team
HR	Number of red cards received by home team
AR	Number of red cards received by away team

Source of dataset.

# PROMPT: What team commits the most fouls?
# Import pandas
import pandas as pd

# Load the document and explore the contents
soccer = pd.read_csv("soccer18-19.csv")

# Subset DataFrame to extract teams and the fouls they commit at home and away
home_fouls = soccer[['HomeTeam', 'HF']]
total_home_fouls = home_fouls.groupby('HomeTeam')['HF'].sum()
home_sorted = total_home_fouls.sort_values(ascending=False)
# home_sorted contains the number of home fouls in descending order
# Repeat, for away
away_fouls = soccer[['AwayTeam', 'AF']]
total_away_fouls = away_fouls.groupby('AwayTeam')['AF'].sum()
away_sorted = total_away_fouls.sort_values(ascending=False)
# away_sorted contains the number of away fouls in descending order

# Concatenate tables
total_sorted = pd.concat([home_sorted, away_sorted], axis=1)
# Create column to display total fouls
total_sorted['TF'] = total_sorted['HF'] + total_sorted['AF']
total_sorted = total_sorted.sort_values(by='TF', ascending=False)
# Make the team the index column
team_with_most_fouls = total_sorted['TF'].idxmax()

# Print
print('The team with the most fouls was ' + team_with_most_fouls)
total_sorted

# PROMPT: Plot the percentage of games that ended in a draw over time
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load document
soccer = pd.read_csv("soccer18-19.csv")

# "soccer[soccer['FTR'] == 'D'.count()['FTR']]" provides the number of games that ended in a draw
# "soccer.count()['FTR']" provides the total number of games over the season
# Multiply denominator by 100
draw_pct = soccer[soccer['FTR'] == 'D'].count()['FTR'] / soccer.count()['FTR']*100
draw_pct = draw_pct.round(2)

# Get ready to make the Pie Chart
labels = 'Draw', 'No draw'
sizes = [draw_pct, (100-draw_pct)]
colors = ['#89CFF0', '#FF5733']
# Make the pie chart
fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, colors=colors, autopct='%.2f%%')
plt.show()

# Print
print('{}% of matches end in a draw.'.format(draw_pct))

# PROMPT: Does the number of red cards a team receives have an effect on its probability of winning a game?
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load document
soccer = pd.read_csv("soccer18-19.csv")

# Make new column indicating a difference in red cards by team in a match
soccer['Red_Diff'] = soccer['HR'] - soccer['AR']

# Make new column indicating whether the home team won or not
soccer['HomeWin'] = soccer['FTR'].apply(lambda x: 1 if x == 'H' else 0)

# Group by the values of red card difference and calculate the mean of home team wins per value
red_diff_home_win = soccer.groupby('Red_Diff')['HomeWin'].mean()

# Plot the results
plt.plot(red_diff_home_win.index, red_diff_home_win.values)
plt.xlabel('Difference in Red Cards (Home - Away)')
plt.ylabel('Probability of Home Win')
plt.ylim(-0.05, 1.05)
plt.title('Effect of Red Cards on Probability of Home Win')
plt.show()

# PROMPT: Make a league table from match data.
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load document
soccer = pd.read_csv("soccer18-19.csv")

# Create variables, group, sort
home_gf = soccer[['HomeTeam', 'FTHG']]
home_gf = home_gf.groupby('HomeTeam')['FTHG'].sum()
home_gf = home_gf.sort_values(ascending=False)
away_gf = soccer[['AwayTeam', 'FTAG']]
away_gf = away_gf.groupby('AwayTeam')['FTAG'].sum()
away_gf = away_gf.sort_values(ascending=False)
# Concatenate and add final column
total_gf = pd.concat([home_gf, away_gf], axis=1)
total_gf['GF'] = total_gf['FTHG'] + total_gf['FTAG']
total_gf = total_gf.drop(['FTHG', 'FTAG'], axis=1)

# Repeat process, but for goals allowed
home_ga = soccer[['HomeTeam', 'FTAG']]
home_ga = home_ga.groupby('HomeTeam')['FTAG'].sum()
home_ga = home_ga.sort_values(ascending=False)
away_ga = soccer[['AwayTeam', 'FTHG']]
away_ga = away_ga.groupby('AwayTeam')['FTHG'].sum()
away_ga = away_ga.sort_values(ascending=False)
total_ga = pd.concat([home_ga, away_ga], axis=1)
total_ga['GA'] = total_ga['FTAG'] + total_ga['FTHG']
total_ga = total_ga.drop(['FTAG', 'FTHG'], axis=1)

# Concatenate and add another column to new table
total_goals = pd.concat([total_gf, total_ga], axis=1)
total_goals['GD'] = total_goals['GF'] - total_goals['GA']

# Repeat the above steps, but for home and away points
home_points = soccer[['HomeTeam', 'FTR']]
home_points['HomePts'] = home_points['FTR'].apply(lambda x: 3 if x == 'H' else (1 if x == 'D' else 0))
home_points = home_points.groupby('HomeTeam')['HomePts'].sum()
home_points = home_points.sort_values(ascending=False)
away_points = soccer[['AwayTeam', 'FTR']]
away_points['AwayPts'] = away_points['FTR'].apply(lambda x: 3 if x == 'A' else (1 if x == 'D' else 0))
away_points = away_points.groupby('AwayTeam')['AwayPts'].sum()
away_points = away_points.sort_values(ascending=False)
total_points = pd.concat([home_points, away_points], axis=1)
total_points['Pts'] = total_points['HomePts'] + total_points['AwayPts']
total_points = total_points.drop(['HomePts', 'AwayPts'], axis=1)

# Merge, sort, and make the team the index
table_2018_19 = pd.concat([total_points, total_goals], axis=1)
table_2018_19 = table_2018_19.sort_values(by=['Pts', 'GD', 'GF'], ascending=[False, False, False])
table_2018_19['Team'] = table_2018_19.index

# What team won the Premier League?
champion = table_2018_19['Team'].iloc[0]
print(champion, 'won the 2018/19 Premier League.')

# The top four teams qualified for the 2019/20 UEFA Champions League. What teams qualified?
champions_league = table_2018_19['Team'].iloc[0:4]
print(f"{champions_league[0]}, {champions_league[1]}, {champions_league[2]}, and {champions_league[3]} qualified for the 2019/20 UEFA Champions League.")

# The fifth through seventh placed teams qualified for the 2019/20 UEFA Europa League. What teams qualified?
europa_league = table_2018_19['Team'].iloc[4:7]
print(f"{europa_league[0]}, {europa_league[1]}, and {europa_league[2]} qualified for the 2019/20 UEFA Europa League.")

# The three teams at the bottom of the table were relegated to the 2019/20 EFL Championship. What teams were relegated from the Premier League?
relegated_teams = table_2018_19['Team'].iloc[17:]
# For relegation, put the teams in reverse order
print(f"{relegated_teams[2]}, {relegated_teams[1]}, and {relegated_teams[0]} were relegated from the Premier League.")

# Drop the Team column, so the index doesn't appear twice
table_2018_19 = table_2018_19.drop('Team', axis=1)
print(table_2018_19)

.mfe-app-workspace-kj242g{position:absolute;top:-8px;}.mfe-app-workspace-11ezf91{display:inline-block;}.mfe-app-workspace-11ezf91:hover .Anchor__copyLink{visibility:visible;}Soccer Data

Data Dictionary

Soccer Data