Skip to content
Own SQL Queries: DVD rentals
Tables to be used:
DataFrameas
film
variable
SELECT * FROM dvdrentals.film LIMIT 3;
DataFrameas
customer
variable
SELECT * FROM dvdrentals.customer LIMIT 3;
DataFrameas
address
variable
SELECT * FROM dvdrentals.address LIMIT 3;
DataFrameas
category
variable
SELECT * FROM dvdrentals.category LIMIT 3;
DataFrameas
actor
variable
SELECT * FROM dvdrentals.actor LIMIT 3;
DataFrameas
film_actor
variable
SELECT * FROM dvdrentals.film_actor LIMIT 3;
DataFrameas
rental
variable
SELECT * FROM dvdrentals.rental LIMIT 3;
Questions and Queries
1. Most popular actors among movie genres
Who are the 3 most frequently performing actors in each film category?
DataFrameas
df
variable
WITH cte1 AS (
SELECT f.film_id, fa.actor_id, category
FROM dvdrentals.film AS f
JOIN dvdrentals.category AS c USING(film_id)
JOIN dvdrentals.film_actor AS fa USING(film_id)
),
cte2 AS(
SELECT category, actor_id, COUNT(*) AS cnt
FROM cte1
GROUP BY category, actor_id
)
SELECT category, INITCAP(first_name) || ' ' || INITCAP(last_name) AS actor
FROM (SELECT
category,
actor_id,
ROW_NUMBER() OVER(PARTITION BY category ORDER BY cnt DESC) AS row_n
FROM cte2) AS ranked
JOIN dvdrentals.actor USING (actor_id)
WHERE row_n <= 3;
2. Exploring replacement cost differences
What are the mean and median replacement costs across different groupings of movies?
DataFrameas
df1
variable
WITH r_year AS(
SELECT
'Release year: ' || release_year AS grouped_by,
ROUND(AVG(replacement_cost)::NUMERIC,2) AS average_replacement_cost,
PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY replacement_cost) AS median_replacement_cost
FROM dvdrentals.film
GROUP BY release_year
),
rat AS(
SELECT
'Rating: ' || rating AS grouped_by,
ROUND(AVG(replacement_cost)::NUMERIC,2) AS average_replacement_cost,
PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY replacement_cost) AS median_replacement_cost
FROM dvdrentals.film
GROUP BY rating
),
lan AS(
SELECT
'Language: ' || name AS grouped_by,
ROUND(AVG(replacement_cost)::NUMERIC,2) AS average_replacement_cost,
PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY replacement_cost) AS median_replacement_cost
FROM dvdrentals.film
JOIN dvdrentals.language USING(language_id)
GROUP BY name
),
len_cat AS(
SELECT
CASE WHEN length >= 45 AND length < 80 THEN 'Length: 45-79 min'
WHEN length >= 80 AND length < 115 THEN 'Length: 80-114 min'
WHEN length >= 115 AND length < 150 THEN 'Length: 115-149 min'
ELSE 'Length 150-185 min' END AS len_category,
replacement_cost
FROM dvdrentals.film
)
SELECT *
FROM r_year
UNION ALL
SELECT *
FROM rat
UNION ALL
SELECT *
FROM lan
UNION ALL
SELECT
len_category AS grouped_by,
ROUND(AVG(replacement_cost)::NUMERIC,2) AS average_replacement_cost,
PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY replacement_cost) AS median_replacement_cost
FROM len_cat
GROUP BY len_category;
3. Districts with the highest records of activity
Which districts are characterized by the highest number of active customers?
DataFrameas
df2
variable
WITH cte AS (
SELECT
district,
DENSE_RANK() OVER(ORDER BY active_customers DESC) AS rank
FROM (SELECT district, SUM(active::integer) AS active_customers
FROM dvdrentals.customer as c
JOIN dvdrentals.address as d using(address_id)
GROUP BY district) AS temp_table
ORDER BY rank
)
SELECT district
FROM cte
WHERE rank <= 3;