Skip to content
1 hidden cell
Subqueries and intro to window functions
1 hidden cell
DataFrameas
df
variable
-- Use of a semi-join to select all languages spoken in the middle east. (A semi-join used a subquery in the WHERE statement)
SELECT DISTINCT name
FROM world.languages
WHERE code IN(
SELECT code
FROM world.countries
WHERE region = 'Middle East'
);DataFrameas
df
variable
-- SUBqueries inside WHERE and SELECT
-- subqueries in WHERE clauses is the most common as it is used for filtering data, (WHERE X IN ()). Here I am filtering for countries based on life expectancy in a different table.
SELECT code
FROM world.countries
WHERE code IN(
SELECT country_code
FROM world.populations
WHERE life_expectancy > 80 AND
life_expectancy IS NOT NULL);
----------------------------------------------------------------------------------------------------------------
-- Can I do the same with a join? Is this better? I have done avg for all years. (Im moving to Hongkong)
SELECT c.code, AVG(p.life_expectancy) AS average_life_expectancy
FROM world.countries AS c
JOIN world.populations AS p
ON c.code = p.country_code
WHERE life_expectancy IS NOT NULL
GROUP BY c.code
ORDER BY average_life_expectancy DESC;
-- Subqueries inside SELECT return 1 value
SELECT DISTINCT continent
FROM world.states;
-- Query how many monarchs for each continent? Subquery counts rows in monarchs table where the continent = continent in the main query.
SELECT DISTINCT s.continent,
(SELECT COUNT(*)
FROM world.monarchs AS m
WHERE m.continent = s.continent) AS monarch_count -- Always requires an alias like this
FROM world.states AS s
GROUP BY s.continent;
-- Now look at subqueries inside the from clause. This is a bit like building your own table to get results from.
SELECT DISTINCT monarchs.continent, most_recent
FROM world.monarchs,
(SELECT continent, MAX(indep_year) AS most_recent
FROM world.states
GROUP BY continent) AS sub
WHERE monarchs.continent = sub.continent;
-- How many languages are spoken in each country? Set up table 'l' with count for each country code.
SELECT countries.local_name, l.lang_number
FROM
world.countries,
(SELECT code, COUNT(name) AS lang_number
FROM world.languages
GROUP BY code) AS l
WHERE countries.code = l.code
ORDER BY lang_number DESC;
-- Could I have used a join here?
SELECT countries.local_name AS country, COUNT(languages.name) AS lang_number
FROM world.countries
JOIN world.languages
USING(code)
GROUP BY countries.local_name
ORDER BY lang_number DESC;
---------------------------- Code that ranks capitcal cities in terms of the percentage of pop that live in city area.
SELECT name, (city_proper_pop/metroarea_pop*100) AS city_perc
FROM world.cities
WHERE name IN (
SELECT capital
FROM world.countries
WHERE continent LIKE('%Europe') OR continent LIKE('%America')
)
AND (city_proper_pop/metroarea_pop*100) IS NOT NULL
ORDER BY city_perc DESC
LIMIT 10
;
DataFrameas
df
variable
-- Window Functions
-- How many goals were scored in each match in 2011/2012, and how did that compare to the average? Using sub in SELECT.
SELECT date,
(home_goal + away_goal) AS goals,
(SELECT
AVG(home_goal + away_goal)
FROM soccer.match
WHERE season = '2011/2012'
)
FROM soccer.match
WHERE season = '2011/2012';
-- Can do the same using the OVER syntax, using a window function.
SELECT
date,
(home_goal + away_goal) AS goals,
AVG(home_goal + away_goal) OVER() AS overall_average
FROM soccer.match
WHERE season = '2011/2012'; -- This fumctions just like a subquery in SELECT but is better.
-- OVER tells SQL to pass the aggregate value (avg) over the existing results set.
-- RANK creates a column numbering your dataset from highest to lowest or lowest to highest based on a column you specify
-- What is the rank of matches based on number of goals scored?
SELECT
date,
(home_goal + away_goal) AS goals,
RANK() OVER(ORDER BY home_goal + away_goal DESC) AS goals_rank
FROM soccer.match
WHERE season = '2011/2012';
-- Windows functions are processed after every other part of the query excep for ORDER BY. It also uses infomation in the results set rather than querying the database directly.
SELECT
-- Select the league name and average goals scored
l.name AS league,
AVG(m.home_goal + m.away_goal) AS avg_goals,
-- Rank leagues in descending order by average goals
RANK() OVER(ORDER BY AVG(m.home_goal + m.away_goal) DESC) AS league_rank
FROM soccer.league AS l
LEFT JOIN soccer.match AS m
ON l.id = m.country_id
WHERE m.season = '2011/2012'
GROUP BY l.name
-- Order the query by the rank you created
ORDER BY league_rank;
-- Window Partitions in OVER clause: Can calculate separate values for different catagories
-- AVG(home_goal) OVER(Partition BY season) will return averages which will change depending on season, rather than an average across all games.
-- Remember this from before:
SELECT
date,
(home_goal + away_goal) AS goals,
AVG(home_goal + away_goal) OVER() AS overall_average
FROM soccer.match; /* This returns an average for each match compares to overall average. */
-- If we introduced a partition into the window function the average will change dependant on another value.
SELECT
date,
(home_goal + away_goal) AS goals,
AVG(home_goal + away_goal) OVER(PARTITION BY season) AS season_average
FROM soccer.match; -- Different value for each season
-- Can partition by multiple columns:
SELECT
c.name,
m.season,
(home_goal + away_goal) AS goals,
AVG(home_goal + away_goal) OVER(PARTITION BY m.season, c.name) AS season_country_average
FROM soccer.country AS c
LEFT JOIN soccer.match AS m
ON c.id = m.country_id;
-- See the results, previously the GROUP BY fn would have returned one result for Belgium and 2011/2012, thus losing individual results for each match.
-- Example exercise partitoning by season and month for a certain teams games:
SELECT
date,
season,
home_goal,
away_goal,
CASE WHEN hometeam_id = 8673 THEN 'home'
ELSE 'away' END AS warsaw_location,
-- Calculate average goals partitioned by season and month
AVG(home_goal) OVER(PARTITION BY season,
EXTRACT(MONTH FROM date)) AS season_mo_home,
AVG(away_goal) OVER(PARTITION BY season,
EXTRACT(MONTH FROM date)) AS season_mo_away
FROM soccer.match
WHERE
hometeam_id = 8673
OR awayteam_id = 8673
ORDER BY (home_goal + away_goal) DESC;
-- Sliding windows, calculated value changes with each subsequent row in a data set (like a rolling average or total). Can also be partitioned by one or more columns.
-- General syntax: 'ROWS BETWEEN <start> AND <finish>''; <start> or <finish> could include: PRECEDING, FOLLOWING, UNBOUNDED PRECEDING, UNBOUNDED FOLLOWING, CURRENT ROW.
-- Manchester City Home Games
SELECT
date,
home_goal,
away_goal,
SUM(home_goal)
OVER(ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total
FROM soccer.match
WHERE hometeam_id = 8456 AND season = '2011/2012';
-- Change UNBOUNDED to '2' which returns running total and current and previous match:
SELECT
date,
home_goal,
away_goal,
SUM(home_goal)
OVER(ORDER BY date ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) AS last2
FROM soccer.match
WHERE hometeam_id = 8456 AND season = '2011/2012';
----- Example - using CURRENT ROW and UNBOUNDED FOLLOWING for revesing order of results.
SELECT
-- Select the date, home goal, and away goals
date,
home_goal,
away_goal,
-- Create a running total and running average of home goals
SUM(home_goal) OVER(ORDER BY date DESC
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS running_total,
AVG(home_goal) OVER(ORDER BY date DESC
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS running_avg
FROM match
WHERE
awayteam_id = 9908
AND season = '2011/2012';
--- Example below, need to study further
-- Set up the home team CTE
WITH home AS (
SELECT m.id, t.team_long_name,
CASE WHEN m.home_goal > m.away_goal THEN 'MU Win'
WHEN m.home_goal < m.away_goal THEN 'MU Loss'
ELSE 'Tie' END AS outcome
FROM match AS m
LEFT JOIN team AS t ON m.hometeam_id = t.team_api_id),
-- Set up the away team CTE
away AS (
SELECT m.id, t.team_long_name,
CASE WHEN m.home_goal > m.away_goal THEN 'MU Win'
WHEN m.home_goal < m.away_goal THEN 'MU Loss'
ELSE 'Tie' END AS outcome
FROM match AS m
LEFT JOIN team AS t ON m.awayteam_id = t.team_api_id)
-- Select team names, the date and goals
SELECT DISTINCT
date,
home.team_long_name AS home_team,
away.team_long_name AS away_team,
m.home_goal,
m.away_goal
-- Join the CTEs onto the match table
FROM match AS m
LEFT JOIN home ON m.id = home.id
LEFT JOIN away ON m.id = away.id
WHERE m.season = '2014/2015'
AND (home.team_long_name = 'Manchester United'
OR away.team_long_name = 'Manchester United');
-------- This is bananas
-- Set up the home team CTE
WITH home AS (
SELECT m.id, t.team_long_name,
CASE WHEN m.home_goal > m.away_goal THEN 'MU Win'
WHEN m.home_goal < m.away_goal THEN 'MU Loss'
ELSE 'Tie' END AS outcome
FROM soccer.match AS m
LEFT JOIN soccer.team AS t ON m.hometeam_id = t.team_api_id),
-- Set up the away team CTE
away AS (
SELECT m.id, t.team_long_name,
CASE WHEN m.home_goal > m.away_goal THEN 'MU Loss'
WHEN m.home_goal < m.away_goal THEN 'MU Win'
ELSE 'Tie' END AS outcome
FROM soccer.match AS m
LEFT JOIN soccer.team AS t ON m.awayteam_id = t.team_api_id)
-- Select columns and and rank the matches by goal difference
SELECT DISTINCT
date,
home.team_long_name AS home_team,
away.team_long_name AS away_team,
m.home_goal, m.away_goal,
RANK() OVER(ORDER BY ABS(home_goal - away_goal) DESC) as match_rank
-- Join the CTEs onto the match table
FROM soccer.match AS m
LEFT JOIN home ON m.id = home.id
LEFT JOIN away ON m.id = away.id
WHERE m.season = '2014/2015'
AND ((home.team_long_name = 'Manchester United' AND home.outcome = 'MU Loss')
OR (away.team_long_name = 'Manchester United' AND away.outcome = 'MU Loss'));