# Amazon Web Scraper Project

# import libraries
from bs4 import BeautifulSoup
import requests
import time
import datetime
import smtplib

import csv

import pandas as pd

# Connect to a website
URL = 'https://www.amazon.com/dp/B01B3ET8IG/ref=sspa_dk_detail_2?psc=1&pd_rd_i=B01B3ET8IG&pd_rd_w=0uSIj&content-id=amzn1.sym.eb7c1ac5-7c51-4df5-ba34-ca810f1f119a&pf_rd_p=eb7c1ac5-7c51-4df5-ba34-ca810f1f119a&pf_rd_r=7G3F0KCT5SPC3C7SVWGV&pd_rd_wg=daOen&pd_rd_r=9b1280e6-b1c6-4982-812b-daa63e2baafb&s=apparel&sp_csd=d2lkZ2V0TmFtZT1zcF9kZXRhaWw'

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}

# Single source of truth for the dataset location, so the file that is
# written below is the same file that is read back and appended to.
CSV_PATH = 'AmazonWebScrapperDataset.csv'

# Without a timeout requests waits indefinitely on a stalled connection.
page = requests.get(URL, headers=headers, timeout=30)
# Fail here with a clear HTTP error instead of later on a missing element.
page.raise_for_status()

soup1 = BeautifulSoup(page.content, 'html.parser')

# Re-parse the pretty-printed markup; the extra whitespace this puts around
# text nodes is removed by the strip() calls further down.
soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')

title_tag = soup2.find(id='productTitle')
if title_tag is None:
    raise ValueError("No element with id 'productTitle' found in the page")
title = title_tag.get_text()

print(title)

price_block = soup2.find('span', {'class': 'a-price'})
price_tag = price_block.find('span') if price_block is not None else None
if price_tag is None:
    raise ValueError("No 'a-price' span with a nested span found in the page")
price = price_tag.text
print(price)

# Clean up the data a little bit

# [1:] drops the first character of the price text (the currency symbol).
price = price.strip()[1:]
title = title.strip()
print(title)
print(price)

# Create a timestamp for the output to track when the data was collected

today = datetime.date.today()

print(today)

header = ['Title', 'Price', 'Date']
data = [title, price, today]

# 'w' starts the dataset from scratch: header row plus the first data row.
with open(CSV_PATH, 'w', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerow(data)

df = pd.read_csv(CSV_PATH)
print(df)

# Appending data to the csv
# NOTE: this appends the same row that was just written above, so the file
# ends up with two identical rows for this run.

with open(CSV_PATH, 'a+', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    writer.writerow(data)

# Combine all of the above code into one function to automate the process

def check_price():
    """Scrape the product page once and append one row to the CSV dataset.

    Downloads the hard-coded Amazon product page, extracts the product
    title and the text of the first span inside the first ``a-price`` span,
    and appends ``[title, price, date]`` to ``AmazonWebScrapperDataset.csv``
    in the current working directory. When that file is new or empty the
    header row is written first.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
        ValueError: If the title or price element is missing from the
            returned page.
    """
    import csv
    import datetime

    URL = 'https://www.amazon.com/dp/B01B3ET8IG/ref=sspa_dk_detail_2?psc=1&pd_rd_i=B01B3ET8IG&pd_rd_w=0uSIj&content-id=amzn1.sym.eb7c1ac5-7c51-4df5-ba34-ca810f1f119a&pf_rd_p=eb7c1ac5-7c51-4df5-ba34-ca810f1f119a&pf_rd_r=7G3F0KCT5SPC3C7SVWGV&pd_rd_wg=daOen&pd_rd_r=9b1280e6-b1c6-4982-812b-daa63e2baafb&s=apparel&sp_csd=d2lkZ2V0TmFtZT1zcF9kZXRhaWw'

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}

    # Without a timeout a stalled connection would block the caller forever.
    page = requests.get(URL, headers=headers, timeout=30)
    # Surface HTTP errors directly rather than as a missing-element failure.
    page.raise_for_status()

    raw_soup = BeautifulSoup(page.content, 'html.parser')

    # Re-parse the pretty-printed markup; the whitespace this adds around
    # text nodes is removed by strip() below.
    soup = BeautifulSoup(raw_soup.prettify(), 'html.parser')

    title_tag = soup.find(id='productTitle')
    if title_tag is None:
        raise ValueError("No element with id 'productTitle' found in the page")

    price_block = soup.find('span', {'class': 'a-price'})
    price_tag = price_block.find('span') if price_block is not None else None
    if price_tag is None:
        raise ValueError("No 'a-price' span with a nested span found in the page")

    title = title_tag.get_text().strip()
    # [1:] drops the first character of the price text (the currency symbol).
    price = price_tag.text.strip()[1:]

    today = datetime.date.today()

    header = ['Title', 'Price', 'Date']
    data = [title, price, today]

    with open('AmazonWebScrapperDataset.csv', 'a', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        # Append mode opens positioned at end-of-file, so position 0 means
        # the file is new or empty and still needs its header row.
        if f.tell() == 0:
            writer.writerow(header)
        writer.writerow(data)

import pandas as pd

# Runs check_price once per day (86400 seconds) and inputs data into your CSV.
# A failed scrape is reported and retried on the next cycle instead of ending
# the whole run. Ctrl+C (or interrupting the notebook kernel) stops the loop
# so that the collected dataset is printed below; previously that code could
# never be reached because the loop had no exit.
try:
    while True:
        try:
            check_price()
        except (requests.RequestException, AttributeError, ValueError) as err:
            # Network problems or a page without the expected elements.
            print('check_price failed:', err)
        time.sleep(86400)
except KeyboardInterrupt:
    print('Stopped price tracking.')

# Read back the same relative path that check_price appends to.
df = pd.read_csv('AmazonWebScrapperDataset.csv')
print(df)