Skip to content
Project - Amazon_US_data
# %reset -f  -- IPython magic; run it in its own notebook cell, not in a .py file
import pandas as pd
import csv

file = 'amazon_reviews_us_Mobile_Apps_v1_00.tsv'

# Read the raw TSV; QUOTE_NONE because review bodies contain stray quote chars.
df = pd.read_csv(
    file,                # Replace with your file path
    sep='\t',
    quoting=csv.QUOTE_NONE,
    engine='c'           # Optional: Use if you encounter errors
)

# Replace problematic HTML entities with proper characters.
# BUG FIX: the original mapped '"' -> '"' (a no-op); the intent, per the
# comment above, is to decode the HTML entity &quot; back to a literal quote.
df.replace({'&quot;': '"'}, regex=True, inplace=True)

# Replace nulls with '\N' in-place (the default NULL marker for PostgreSQL COPY).
df.fillna(r'\N', inplace=True)

# Write the DataFrame back to a TSV file, bumping the version suffix:
# ..._v1_00.tsv -> ..._v1_01.tsv  (file[:-5] drops the trailing '0.tsv').
output_file = file[:-5] + '1.tsv'
df.to_csv(
    output_file,
    sep='\t',                    # Use tab as the delimiter for TSV
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,   # Quote fields with special characters
    index=False,                 # Do not include the DataFrame index
    header=True                  # Include column headers
)
print(f"File '{output_file}' has been written successfully.")
Tasks Completed in DB
Removing Duplicates
DataFrame as `df` variable
-- REMOVE DUPLICATES
-- BUG FIX: the original ran DELETE FROM review_id (a column, not a table),
-- and even against the right table, WHERE review_id IN (<duplicated ids>)
-- would delete *every* copy of a duplicated review, not just the extras.
-- Delete by the physical row id (ctid) so exactly one copy survives.
WITH cte AS (
    SELECT
        ctid,
        ROW_NUMBER() OVER (PARTITION BY review_id) AS row_num
    FROM amazon_us_reviews
)
DELETE FROM amazon_us_reviews
WHERE ctid IN (
    SELECT ctid
    FROM cte
    WHERE row_num > 1
);
Data Type Optimization
DataFrame as `df1` variable
-- marketplace: inspect the value domain before acting
SELECT marketplace
FROM amazon_us_reviews
GROUP BY marketplace;
-- only 'US' present: the column is redundant
ALTER TABLE amazon_us_reviews
DROP COLUMN marketplace; -- information contained in table name
-- customer_id: profile the value range to pick a numeric type
SELECT
    MIN(customer_id) AS min_customer_id,
    MAX(customer_id) AS max_customer_id
FROM amazon_us_reviews;
-- min: 10,001 & max: 53,096,592 -> INTEGER is a good choice

-- review_id, product_id: TEXT is the appropriate choice

-- product_parent: profile the value range
SELECT
    MIN(product_parent) AS min_product_parent,
    MAX(product_parent) AS max_product_parent
FROM amazon_us_reviews;
-- min: 222 & max: 999,999,945 -> INTEGER is a good choice

-- product_title, product_category: TEXT is the appropriate choice

-- star_rating: profile the value range
SELECT
    MIN(star_rating) AS min_star_rating,
    MAX(star_rating) AS max_star_rating
FROM amazon_us_reviews;
-- values 1 to 5 only -> SMALLINT suffices
ALTER TABLE amazon_us_reviews
ALTER COLUMN star_rating TYPE SMALLINT;
-- helpful_votes: profile the value range
SELECT
    MIN(helpful_votes) AS min_helpful_votes,
    MAX(helpful_votes) AS max_helpful_votes
FROM amazon_us_reviews;
-- min: 0 & max: 47,524 -> INTEGER is a good choice

-- total_votes: profile the value range
SELECT
    MIN(total_votes) AS min_total_votes,
    MAX(total_votes) AS max_total_votes
FROM amazon_us_reviews;
-- min: 0 & max: 48,362 -> INTEGER is a good choice
-- vine: inspect the value domain
SELECT
    vine,
    COUNT(*)
FROM amazon_us_reviews
GROUP BY vine;
-- only 'Y' (312204) or 'N' (89078595) -> convert to BOOLEAN
ALTER TABLE amazon_us_reviews
ADD COLUMN vine_bool BOOLEAN;
-- simple CASE: 'Y' -> TRUE, anything else (incl. NULL) -> FALSE,
-- same mapping as the original searched CASE
UPDATE amazon_us_reviews
SET vine_bool = CASE vine WHEN 'Y' THEN TRUE ELSE FALSE END;
-- sanity check: per-value counts must match the original column
SELECT
    vine, vine_bool,
    COUNT(*)
FROM amazon_us_reviews
GROUP BY vine, vine_bool; -- counts match
ALTER TABLE amazon_us_reviews
DROP COLUMN vine;
-- NOTE(review): the rename back to 'vine' was deliberately left commented
-- out; downstream queries must use 'vine_bool' until it is applied.
--ALTER TABLE amazon_us_reviews
--RENAME COLUMN vine_bool TO vine;
-- verified_purchase: inspect the value domain
SELECT
    verified_purchase,
    COUNT(*)
FROM amazon_us_reviews
GROUP BY verified_purchase;
-- only 'Y' (73212869) or 'N' (16177930) -> convert to BOOLEAN
ALTER TABLE amazon_us_reviews
ADD COLUMN verified_purchase_bool BOOLEAN;
-- simple CASE: 'Y' -> TRUE, anything else (incl. NULL) -> FALSE,
-- same mapping as the original searched CASE
UPDATE amazon_us_reviews
SET verified_purchase_bool = CASE verified_purchase WHEN 'Y' THEN TRUE ELSE FALSE END;
-- sanity check: per-value counts must match before dropping the original
SELECT
    verified_purchase, verified_purchase_bool,
    COUNT(*)
FROM amazon_us_reviews
GROUP BY verified_purchase, verified_purchase_bool; -- counts match
ALTER TABLE amazon_us_reviews
DROP COLUMN verified_purchase;

-- TEXT remains the appropriate choice for:
-- review_headline
-- review_body
-- review_date
-- NOTE(review): review_date stored as TEXT; consider a DATE column for
-- range queries and correct ordering — confirm the source format first.
Compute Resource Tuning
DataFrame as `df5` variable
-- PostgreSQL server tuning (values are staged to postgresql.auto.conf).
-- NOTE: ALTER SYSTEM does not apply settings immediately. Run
-- SELECT pg_reload_conf(); afterwards, and a full server restart is
-- required for shared_buffers, max_connections, wal_buffers,
-- max_worker_processes and huge_pages to take effect.
ALTER SYSTEM SET max_connections = 40;
ALTER SYSTEM SET shared_buffers = '4GB';
ALTER SYSTEM SET effective_cache_size = '12GB';
ALTER SYSTEM SET maintenance_work_mem = '2047MB';
ALTER SYSTEM SET work_mem = '13107kB';
ALTER SYSTEM SET wal_buffers = '16MB';
ALTER SYSTEM SET min_wal_size = '4GB';
ALTER SYSTEM SET max_wal_size = '16GB';
ALTER SYSTEM SET checkpoint_completion_target = 0.9;
ALTER SYSTEM SET default_statistics_target = 500;
ALTER SYSTEM SET random_page_cost = 1.1;
ALTER SYSTEM SET max_worker_processes = 8;
ALTER SYSTEM SET max_parallel_workers_per_gather = 4;
ALTER SYSTEM SET max_parallel_workers = 8;
ALTER SYSTEM SET max_parallel_maintenance_workers = 4;
ALTER SYSTEM SET huge_pages = 'off';
-- Verify the staged settings (ORDER BY added for deterministic output)
SELECT name, setting, unit
FROM pg_settings
WHERE name IN (
    'max_connections',
    'shared_buffers',
    'effective_cache_size',
    'maintenance_work_mem',
    'checkpoint_completion_target',
    'wal_buffers',
    'default_statistics_target',
    'random_page_cost',
    'work_mem',
    'huge_pages',
    'min_wal_size',
    'max_wal_size',
    'max_worker_processes',
    'max_parallel_workers_per_gather',
    'max_parallel_workers',
    'max_parallel_maintenance_workers'
)
ORDER BY name;
Normalization
1NF
- no repeating groups or columns
- all values are atomic
- review_id is the primary key for the amazon_us_reviews table. Action: set review_id as the primary key.
2NF
- product_title, product_category, and product_parent depend on product_id (or form a composite key)
- marketplace can depend on the product; however, the entire column contains a single unique value 'US'. This information is redundant and space-consuming, as it already exists in the table name amazon_us_reviews.
- customer_id can be independent but also depends on the review_id
- all other columns depend on the review_id
Tables to create (and other actions)
- products
- customers?
- delete marketplace column
3NF
- all non-key columns do not depend on other non-key columns. No action needed.
Primary Key
- Check for duplicates
- Check for Nulls
- Make primary key
DataFrame as `df2` variable
-- Promote review_id to primary key of amazon_us_reviews.
-- Pre-check 1: no duplicate review_id values (expect zero rows)
SELECT
    review_id,
    COUNT(*)
FROM amazon_us_reviews
GROUP BY review_id
HAVING COUNT(*) >= 2;
-- Pre-check 2: no NULL review_id values (expect 0)
SELECT COUNT(*)
FROM amazon_us_reviews
WHERE review_id IS NULL;
-- Both checks clean: enforce NOT NULL, then add the named constraint
ALTER TABLE amazon_us_reviews
ALTER COLUMN review_id SET NOT NULL;
ALTER TABLE amazon_us_reviews
ADD CONSTRAINT amazon_us_reviews_pkey PRIMARY KEY (review_id);
Customers table