Amazon US Reviews (Mobile Apps) — preprocessing (Python) and database cleanup (PostgreSQL)
%reset -f
import pandas as pd
import csv

file = 'amazon_reviews_us_Mobile_Apps_v1_00.tsv'  # input: raw Amazon reviews TSV dump

# Read the raw TSV. quoting=csv.QUOTE_NONE is required because review text
# contains unbalanced double quotes that would otherwise break field parsing.
df = pd.read_csv(
    file,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    engine='c'  # C engine is the fast path; switch to 'python' if parse errors occur
)

# Replace the HTML entity &quot; left over from the web export with a real
# double-quote character. (The previous code replaced '"' with '"' — a no-op
# introduced when the entity was decoded in the source file; matching the
# literal entity restores the stated intent. NOTE(review): if the raw dump
# uses a different escape, e.g. \", adjust the pattern accordingly.)
df.replace({'&quot;': '"'}, regex=True, inplace=True)

# PostgreSQL's COPY treats \N as SQL NULL, so encode missing values that way.
df.fillna(r'\N', inplace=True)

# Derive the output name: '..._v1_00.tsv' -> '..._v1_01.tsv'
# (drop the trailing '0.tsv', append '1.tsv').
output_file = file[:-5] + '1.tsv'

df.to_csv(
    output_file,
    sep='\t',                   # tab-delimited (TSV)
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,  # quote only fields containing special characters
    index=False,                # do not write the DataFrame index
    header=True                 # keep column headers for the DB import
)

print(f"File '{output_file}' has been written successfully.")

Tasks completed in the database

Removing duplicates

(Result stored as DataFrame variable `df`.)
-- REMOVE DUPLICATES
-- Keep exactly one physical row per review_id and delete the extra copies.
--
-- Why ctid: the table has no surrogate key yet, so duplicated rows are
-- indistinguishable by review_id alone. Deleting WHERE review_id IN
-- (... row_num > 1) would remove ALL copies of a duplicated review_id,
-- including the one we want to keep. PostgreSQL's system column ctid
-- identifies each physical row uniquely, so we can target only the
-- redundant copies. (The original also wrote DELETE FROM review_id —
-- a column name — instead of the table name.)
DELETE FROM amazon_us_reviews
WHERE ctid IN (
    SELECT ctid
    FROM (
        SELECT
            ctid,
            ROW_NUMBER() OVER (PARTITION BY review_id) AS row_num
        FROM amazon_us_reviews
    ) AS ranked
    WHERE ranked.row_num > 1
);

Data type optimization

(Result stored as DataFrame variable `df1`.)
-- ============================================================
-- Data type optimization
-- Inspect each column's value range, then tighten its type or
-- drop it where that is safe.
-- ============================================================

-- marketplace: inspect distinct values before deciding.
SELECT DISTINCT marketplace
FROM amazon_us_reviews;
-- Result: only 'US'.
ALTER TABLE amazon_us_reviews
DROP COLUMN	marketplace; -- redundant: the information is already in the table name

-- customer_id: check the value range.
SELECT 
	MIN(customer_id), 
	MAX(customer_id)
FROM amazon_us_reviews;
-- min: 10,001 & max: 53,096,592 — fits in INTEGER (max ~2.1e9), so
-- the existing INTEGER type is kept.

-- TEXT is the appropriate type for:
--	review_id
--	product_id

-- product_parent: check the value range.
SELECT 
	MIN(product_parent), 
	MAX(product_parent)
FROM amazon_us_reviews;
-- min: 222 & max: 999,999,945 — fits in INTEGER; kept as-is.

-- TEXT is the appropriate type for:
--	product_title
--	product_category

-- star_rating (typo fixed: original comment said "start_rating")
SELECT 
	MIN(star_rating), 
	MAX(star_rating)
FROM amazon_us_reviews;
-- Range 1..5 — SMALLINT (2 bytes) is sufficient.
ALTER TABLE amazon_us_reviews 
ALTER COLUMN star_rating TYPE SMALLINT;

-- helpful_votes: check the value range.
SELECT 
	MIN(helpful_votes), 
	MAX(helpful_votes)
FROM amazon_us_reviews; 
-- min: 0 & max: 47,524 — INTEGER kept (SMALLINT max 32,767 would overflow).

-- total_votes: check the value range.
SELECT 
	MIN(total_votes), 
	MAX(total_votes)
FROM amazon_us_reviews;
-- min: 0 & max: 48,362 — INTEGER kept for the same reason.

-- vine: inspect the domain of values.
SELECT 
	vine, 
	COUNT(*)
FROM amazon_us_reviews
GROUP BY vine; 
-- Only 'Y' (312,204) or 'N' (89,078,595) — a natural BOOLEAN.
ALTER TABLE amazon_us_reviews
ADD COLUMN vine_bool BOOLEAN;

UPDATE amazon_us_reviews
SET vine_bool = CASE
	WHEN vine = 'Y' THEN TRUE
	ELSE FALSE
END;

-- Sanity check: each (vine, vine_bool) pair's count must match the
-- original per-value counts before the text column is dropped.
SELECT 
	vine, vine_bool,
	COUNT(*)
FROM amazon_us_reviews
GROUP BY vine, vine_bool; -- counts match

ALTER TABLE amazon_us_reviews
DROP COLUMN vine; 
-- NOTE(review): the rename below was left commented out, so the column
-- stays named vine_bool. Consider running it so downstream queries can
-- keep using the original column name:
--ALTER TABLE amazon_us_reviews
--RENAME COLUMN vine_bool TO vine;

-- verified_purchase: same Y/N -> BOOLEAN conversion as vine.
SELECT 
	verified_purchase, 
	COUNT(*)
FROM amazon_us_reviews
GROUP BY verified_purchase; 
-- Only 'Y' (73,212,869) or 'N' (16,177,930).
ALTER TABLE amazon_us_reviews
ADD COLUMN verified_purchase_bool BOOLEAN;

UPDATE amazon_us_reviews
SET verified_purchase_bool = CASE
	WHEN verified_purchase = 'Y' THEN TRUE
	ELSE FALSE
END;

-- Sanity check before dropping the text column.
SELECT 
	verified_purchase, verified_purchase_bool,
	COUNT(*)
FROM amazon_us_reviews
GROUP BY verified_purchase, verified_purchase_bool; -- counts match

ALTER TABLE amazon_us_reviews
	DROP COLUMN verified_purchase; 

-- TEXT is the appropriate type for:
--    review_headline
--    review_body

-- review_date: no change applied here. NOTE(review): if it was loaded as
-- text, consider converting it to DATE — confirm the loaded type first.

Compute resource tuning

(Result stored as DataFrame variable `df5`.)
-- ============================================================
-- PostgreSQL server tuning.
-- NOTE(review): these values look like pgtune-style output for a
-- machine with ~16GB RAM / 4 cores — confirm against the actual host.
-- ALTER SYSTEM writes to postgresql.auto.conf; nothing takes effect
-- until SELECT pg_reload_conf() is run, and shared_buffers,
-- max_connections, max_worker_processes, wal_buffers, and huge_pages
-- require a full server restart.
-- ============================================================

-- Few connections needed for a single-user analytical workload.
ALTER SYSTEM SET max_connections = 40;

-- Memory: buffer cache, planner's cache-size estimate, and per-operation
-- memory for maintenance (VACUUM, CREATE INDEX) and sorts/hashes.
ALTER SYSTEM SET shared_buffers = '4GB';
ALTER SYSTEM SET effective_cache_size = '12GB';
ALTER SYSTEM SET maintenance_work_mem = '2047MB';
ALTER SYSTEM SET work_mem = '13107kB';

-- WAL sizing: larger WAL reduces checkpoint frequency during bulk loads.
ALTER SYSTEM SET wal_buffers = '16MB';
ALTER SYSTEM SET min_wal_size = '4GB';
ALTER SYSTEM SET max_wal_size = '16GB';

-- Spread checkpoint I/O over 90% of the checkpoint interval.
ALTER SYSTEM SET checkpoint_completion_target = 0.9;

-- Larger statistics sample for better plans on a big table.
ALTER SYSTEM SET default_statistics_target = 500;

-- Low random-access penalty, appropriate for SSD storage.
ALTER SYSTEM SET random_page_cost = 1.1;

-- Parallelism: allow parallel scans/aggregates and parallel maintenance.
ALTER SYSTEM SET max_worker_processes = 8;
ALTER SYSTEM SET max_parallel_workers_per_gather = 4;
ALTER SYSTEM SET max_parallel_workers = 8;
ALTER SYSTEM SET max_parallel_maintenance_workers = 4;

ALTER SYSTEM SET huge_pages = 'off';

-- Verify the settings as the server currently reports them
-- (pending changes only appear after reload/restart).
SELECT name, setting, unit
FROM pg_settings
WHERE name IN (
  'max_connections',
  'shared_buffers',
  'effective_cache_size',
  'maintenance_work_mem',
  'checkpoint_completion_target',
  'wal_buffers',
  'default_statistics_target',
  'random_page_cost',
  'work_mem',
  'huge_pages',
  'min_wal_size',
  'max_wal_size',
  'max_worker_processes',
  'max_parallel_workers_per_gather',
  'max_parallel_workers',
  'max_parallel_maintenance_workers'
);

Normalization

1NF
  • no repeating groups or columns
  • all values are atomic
  • review_id is the primary key for the amazon_us_reviews table — set review_id as the primary key.
2NF
  • product_title, product_category, and product_parent depend on product_id (or form a composite key)
  • marketplace depends on the product; however, the entire column contains a single unique value 'US', so this information is redundant and space-consuming, as it already exists in the table name amazon_us_reviews
  • customer_id can be independent but also depends on the review_id
  • all other columns depend on the review_id

Tables to create (and other actions)

  • products
  • customers?
  • delete marketplace column
3NF
  • all non-key columns do not depend on other non-key columns — no action needed.
Primary Key
  • Check for duplicates
  • Check for Nulls
  • Make primary key
(Result stored as DataFrame variable `df2`.)
-- Promote review_id to the table's primary key.

-- Pre-flight check 1: no review_id value may appear more than once
-- (this query must return zero rows).
SELECT
    review_id,
    COUNT(*)
FROM amazon_us_reviews
GROUP BY review_id
HAVING COUNT(*) > 1;

-- Pre-flight check 2: no review_id may be NULL
-- (this count must be 0).
SELECT COUNT(*)
FROM amazon_us_reviews
WHERE review_id IS NULL;

-- Enforce non-nullability explicitly, then attach the named
-- primary-key constraint.
ALTER TABLE amazon_us_reviews
    ALTER COLUMN review_id SET NOT NULL;

ALTER TABLE amazon_us_reviews
    ADD CONSTRAINT amazon_us_reviews_pkey PRIMARY KEY (review_id);

Customers table