Skip to content

1 hidden cell
Spinner
DataFrameas
df
variable
-- Use of a semi-join to select all languages spoken in the middle east. (A semi-join used a subquery in the WHERE statement)
SELECT DISTINCT name
FROM world.languages
WHERE code IN(
    SELECT code
        FROM world.countries
        WHERE region = 'Middle East'
);
Spinner
DataFrameas
df
variable
-- SUBqueries inside WHERE and SELECT

-- subqueries in WHERE clauses is the most common as it is used for filtering data, (WHERE X IN ()). Here I am filtering for countries based on life expectancy in a different table. 

SELECT code
FROM world.countries
WHERE code IN(
    SELECT country_code
    FROM world.populations
    WHERE life_expectancy > 80 AND
          life_expectancy IS NOT NULL);
          
----------------------------------------------------------------------------------------------------------------

-- Can I do the same with a join? Is this better? I have done avg for all years. (Im moving to Hongkong)

SELECT c.code, AVG(p.life_expectancy) AS average_life_expectancy 
FROM world.countries AS c
JOIN world.populations AS p
ON c.code = p.country_code
WHERE life_expectancy IS NOT NULL
GROUP BY c.code
ORDER BY average_life_expectancy DESC;

-- Subqueries inside SELECT return 1 value 
    
SELECT DISTINCT continent
FROM world.states;

-- Query how many monarchs for each continent? Subquery counts rows in monarchs table where the continent = continent in the main query. 
SELECT DISTINCT s.continent, 
    (SELECT COUNT(*)
     FROM world.monarchs AS m
     WHERE m.continent = s.continent) AS monarch_count -- Always requires an alias like this
FROM world.states AS s
GROUP BY s.continent;

-- Now look at subqueries inside the from clause. This is a bit like building your own table to get results from.
SELECT DISTINCT monarchs.continent, most_recent
FROM world.monarchs,
(SELECT continent, MAX(indep_year) AS most_recent
FROM world.states
GROUP BY continent) AS sub
WHERE monarchs.continent = sub.continent;

-- How many languages are spoken in each country? Set up table 'l' with count for each country code. 
SELECT countries.local_name, l.lang_number
FROM 
    world.countries, 
    (SELECT code, COUNT(name) AS lang_number
        FROM world.languages
        GROUP BY code) AS l
WHERE countries.code = l.code
ORDER BY lang_number DESC; 

-- Could I have used a join here?
SELECT countries.local_name AS country, COUNT(languages.name) AS lang_number
FROM world.countries
JOIN world.languages
USING(code)
GROUP BY countries.local_name
ORDER BY lang_number DESC;

---------------------------- Code that ranks capitcal cities in terms of the percentage of pop that live in city area.
SELECT name, (city_proper_pop/metroarea_pop*100) AS city_perc
FROM world.cities
WHERE name IN (
    SELECT capital
    FROM world.countries
    WHERE continent LIKE('%Europe') OR continent LIKE('%America')
) 

AND (city_proper_pop/metroarea_pop*100) IS NOT NULL
ORDER BY city_perc DESC
LIMIT 10
;





Spinner
DataFrameas
df
variable
-- Window Functions

-- How many goals were scored in each match in 2011/2012, and how did that compare to the average? Using sub in SELECT.

SELECT date, 
(home_goal + away_goal) AS goals, 
(SELECT
    AVG(home_goal + away_goal)
FROM soccer.match
 WHERE season = '2011/2012'
)
FROM soccer.match
WHERE season = '2011/2012';

-- Can do the same using the OVER syntax, using a window function.

SELECT
    date,
    (home_goal + away_goal) AS goals,
    AVG(home_goal + away_goal) OVER() AS overall_average
FROM soccer.match
WHERE season = '2011/2012'; -- This fumctions just like a subquery in SELECT but is better.

-- OVER tells SQL to pass the aggregate value (avg) over the existing results set.

-- RANK creates a column numbering your dataset from highest to lowest or lowest to highest based on a column you specify

-- What is the rank of matches based on number of goals scored?

SELECT
    date,
    (home_goal + away_goal) AS goals,
    RANK() OVER(ORDER BY home_goal + away_goal DESC) AS goals_rank
FROM soccer.match
WHERE season = '2011/2012';

-- Windows functions are processed after every other part of the query excep for ORDER BY. It also uses infomation in the results set rather than querying the database directly.

SELECT 
	-- Select the league name and average goals scored
	l.name AS league,
    AVG(m.home_goal + m.away_goal) AS avg_goals,
    -- Rank leagues in descending order by average goals
    RANK() OVER(ORDER BY AVG(m.home_goal + m.away_goal) DESC) AS league_rank
FROM soccer.league AS l
LEFT JOIN soccer.match AS m 
ON l.id = m.country_id
WHERE m.season = '2011/2012'
GROUP BY l.name
-- Order the query by the rank you created
ORDER BY league_rank;

-- Window Partitions in OVER clause: Can calculate separate values for different catagories

-- AVG(home_goal) OVER(Partition BY season) will return averages which will change depending on season, rather than an average across all games.

-- Remember this from before:

SELECT
    date,
    (home_goal + away_goal) AS goals,
    AVG(home_goal + away_goal) OVER() AS overall_average
FROM soccer.match; /* This returns an average for each match compares to overall average. */ 

-- If we introduced a partition into the window function the average will change dependant on another value.

SELECT
    date,
    (home_goal + away_goal) AS goals,
    AVG(home_goal + away_goal) OVER(PARTITION BY season) AS season_average
FROM soccer.match; -- Different value for each season

-- Can partition by multiple columns:
SELECT 
    c.name,
    m.season,
    (home_goal + away_goal) AS goals,
    AVG(home_goal + away_goal) OVER(PARTITION BY m.season, c.name) AS season_country_average
FROM soccer.country AS c
LEFT JOIN soccer.match AS m
ON c.id = m.country_id;

-- See the results, previously the GROUP BY fn would have returned one result for Belgium and 2011/2012, thus losing individual results for each match.

-- Example exercise partitoning by season and month for a certain teams games:

SELECT 
	date,
	season,
	home_goal,
	away_goal,
	CASE WHEN hometeam_id = 8673 THEN 'home' 
         ELSE 'away' END AS warsaw_location,
	-- Calculate average goals partitioned by season and month
    AVG(home_goal) OVER(PARTITION BY season, 
         	EXTRACT(MONTH FROM date)) AS season_mo_home,
    AVG(away_goal) OVER(PARTITION BY season, 
            EXTRACT(MONTH FROM date)) AS season_mo_away
FROM soccer.match
WHERE 
	hometeam_id = 8673
    OR awayteam_id = 8673
ORDER BY (home_goal + away_goal) DESC;

-- Sliding windows, calculated value changes with each subsequent row in a data set (like a rolling average or total). Can also be partitioned by one or more columns.

-- General syntax: 'ROWS BETWEEN <start> AND <finish>''; <start> or <finish> could include: PRECEDING, FOLLOWING, UNBOUNDED PRECEDING, UNBOUNDED FOLLOWING, CURRENT ROW.

-- Manchester City Home Games
SELECT
date,
home_goal,
away_goal,
SUM(home_goal)
    OVER(ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total
FROM soccer.match
WHERE hometeam_id = 8456 AND season = '2011/2012';

-- Change UNBOUNDED to '2' which returns running total and current and previous match:
SELECT
date,
home_goal,
away_goal,
SUM(home_goal)
    OVER(ORDER BY date ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) AS last2
FROM soccer.match
WHERE hometeam_id = 8456 AND season = '2011/2012';

----- Example - using CURRENT ROW and UNBOUNDED FOLLOWING for revesing order of results.
SELECT 
	-- Select the date, home goal, and away goals
	date,
    home_goal,
    away_goal,
    -- Create a running total and running average of home goals
    SUM(home_goal) OVER(ORDER BY date DESC
         ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS running_total,
    AVG(home_goal) OVER(ORDER BY date DESC
         ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS running_avg
FROM match
WHERE 
	awayteam_id = 9908 
    AND season = '2011/2012';
    
    
--- Example below, need to study further

-- Set up the home team CTE
WITH home AS (
  SELECT m.id, t.team_long_name,
	  CASE WHEN m.home_goal > m.away_goal THEN 'MU Win'
		   WHEN m.home_goal < m.away_goal THEN 'MU Loss' 
  		   ELSE 'Tie' END AS outcome
  FROM match AS m
  LEFT JOIN team AS t ON m.hometeam_id = t.team_api_id),
-- Set up the away team CTE
away AS (
  SELECT m.id, t.team_long_name,
	  CASE WHEN m.home_goal > m.away_goal THEN 'MU Win'
		   WHEN m.home_goal < m.away_goal THEN 'MU Loss' 
  		   ELSE 'Tie' END AS outcome
  FROM match AS m
  LEFT JOIN team AS t ON m.awayteam_id = t.team_api_id)
-- Select team names, the date and goals
SELECT DISTINCT
    date,
    home.team_long_name AS home_team,
    away.team_long_name AS away_team,
    m.home_goal,
    m.away_goal
-- Join the CTEs onto the match table
FROM match AS m
LEFT JOIN home ON m.id = home.id
LEFT JOIN away ON m.id = away.id
WHERE m.season = '2014/2015'
      AND (home.team_long_name = 'Manchester United' 
           OR away.team_long_name = 'Manchester United');
           
           
-------- This is bananas
-- Set up the home team CTE
WITH home AS (
  SELECT m.id, t.team_long_name,
	  CASE WHEN m.home_goal > m.away_goal THEN 'MU Win'
		   WHEN m.home_goal < m.away_goal THEN 'MU Loss' 
  		   ELSE 'Tie' END AS outcome
  FROM soccer.match AS m
  LEFT JOIN soccer.team AS t ON m.hometeam_id = t.team_api_id),
-- Set up the away team CTE
away AS (
  SELECT m.id, t.team_long_name,
	  CASE WHEN m.home_goal > m.away_goal THEN 'MU Loss'
		   WHEN m.home_goal < m.away_goal THEN 'MU Win' 
  		   ELSE 'Tie' END AS outcome
  FROM soccer.match AS m
  LEFT JOIN soccer.team AS t ON m.awayteam_id = t.team_api_id)
-- Select columns and and rank the matches by goal difference
SELECT DISTINCT
    date,
    home.team_long_name AS home_team,
    away.team_long_name AS away_team,
    m.home_goal, m.away_goal,
    RANK() OVER(ORDER BY ABS(home_goal - away_goal) DESC) as match_rank
-- Join the CTEs onto the match table
FROM soccer.match AS m
LEFT JOIN home ON m.id = home.id
LEFT JOIN away ON m.id = away.id
WHERE m.season = '2014/2015'
      AND ((home.team_long_name = 'Manchester United' AND home.outcome = 'MU Loss')
      OR (away.team_long_name = 'Manchester United' AND away.outcome = 'MU Loss'));