Skip to content

Tables to be used:

Spinner
DataFrameas
film
variable
SELECT * FROM dvdrentals.film LIMIT 3;
Spinner
DataFrameas
customer
variable
SELECT * FROM dvdrentals.customer LIMIT 3;
Spinner
DataFrameas
address
variable
SELECT * FROM dvdrentals.address LIMIT 3;
Spinner
DataFrameas
category
variable
SELECT * FROM dvdrentals.category LIMIT 3;
Spinner
DataFrameas
actor
variable
SELECT * FROM dvdrentals.actor LIMIT 3;
Spinner
DataFrameas
film_actor
variable
SELECT * FROM dvdrentals.film_actor LIMIT 3;
Spinner
DataFrameas
rental
variable
SELECT * FROM dvdrentals.rental LIMIT 3;

Questions and Queries

1. Most popular actors among movie genres

Who are the 3 most frequently performing actors in each film category?
Spinner
DataFrameas
df
variable
WITH cte1 AS (
	SELECT f.film_id, fa.actor_id, category
	FROM dvdrentals.film AS f
	JOIN dvdrentals.category AS c USING(film_id)
	JOIN dvdrentals.film_actor AS fa USING(film_id)
	),
	cte2 AS(
	SELECT category, actor_id, COUNT(*) AS cnt
	FROM cte1
	GROUP BY category, actor_id
	)
SELECT category, INITCAP(first_name) || ' ' || INITCAP(last_name) AS actor
FROM (SELECT
		category,
		actor_id,
		ROW_NUMBER() OVER(PARTITION BY category ORDER BY cnt DESC) AS row_n
	  FROM cte2) AS ranked
JOIN dvdrentals.actor USING (actor_id)
WHERE row_n <= 3;

2. Exploring replacement cost differences

What are the mean and median replacement costs across different groupings of movies?
Spinner
DataFrameas
df1
variable
WITH r_year AS(
	SELECT
		'Release year: ' || release_year AS grouped_by,
		ROUND(AVG(replacement_cost)::NUMERIC,2) AS average_replacement_cost,
		PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY replacement_cost) AS median_replacement_cost
	FROM dvdrentals.film
	GROUP BY release_year
	),
	rat AS(
	SELECT
		'Rating: ' || rating AS grouped_by,
		ROUND(AVG(replacement_cost)::NUMERIC,2) AS average_replacement_cost,
		PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY replacement_cost) AS median_replacement_cost
	FROM dvdrentals.film
	GROUP BY rating
	),
	lan AS(
	SELECT
		'Language: ' || name AS grouped_by,
		ROUND(AVG(replacement_cost)::NUMERIC,2) AS average_replacement_cost,
		PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY replacement_cost) AS median_replacement_cost
	FROM dvdrentals.film
	JOIN dvdrentals.language USING(language_id)
	GROUP BY name
	),
	len_cat AS(
	SELECT
		CASE WHEN length >= 45 AND length < 80 THEN 'Length: 45-79 min'
			WHEN length >= 80 AND length < 115 THEN 'Length: 80-114 min'
			WHEN length >= 115 AND length < 150 THEN 'Length: 115-149 min'
			ELSE 'Length 150-185 min' END AS len_category,
		replacement_cost
	FROM dvdrentals.film
	)
SELECT *
FROM r_year
UNION ALL
SELECT *
FROM rat
UNION ALL
SELECT *
FROM lan
UNION ALL
SELECT
	len_category AS grouped_by,
	ROUND(AVG(replacement_cost)::NUMERIC,2) AS average_replacement_cost,
	PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY replacement_cost) AS median_replacement_cost
FROM len_cat
GROUP BY len_category;

3. Districts with the highest records of activity

Which districts are characterized by the highest number of active customers?
Spinner
DataFrameas
df2
variable
WITH cte AS (
	SELECT
		district,
		DENSE_RANK() OVER(ORDER BY active_customers DESC) AS rank
	FROM (SELECT district, SUM(active::integer) AS active_customers
		  FROM dvdrentals.customer as c 
		  JOIN dvdrentals.address as d using(address_id)
		  GROUP BY district) AS temp_table
	ORDER BY rank
	)
SELECT district
FROM cte
WHERE rank <= 3;