# Web Scraping Practice
# Ensure Scrapy and cloudscraper are installed.
# NOTE: '!pip' lines are Jupyter/IPython cell magics; they are not valid
# plain Python and only work inside a notebook.
!pip install scrapy
!pip install cloudscraper

import scrapy
from scrapy.crawler import CrawlerProcess
import csv
import os
import pandas as pd
import cloudscraper

# Smoke test: cloudscraper works around Cloudflare's anti-bot page, so this
# print confirms thecrag.com is reachable before running the crawler.
scraper = cloudscraper.create_scraper()
print(scraper.get('https://www.thecrag.com/en/climbing/world').text)
class CragScraper(scrapy.Spider):
    """Crawl thecrag.com from the world index down to individual crags in
    the United Kingdom, appending one CSV row per crag to crag_data.csv.

    Every level of the site hierarchy (continent -> country -> region ->
    sub-region -> area -> zone -> crag) uses the same listing markup
    (``span.primary-node-name a``), so a single helper follows the child
    links and records each node's name in request meta under its level key.
    """

    name = 'crag_scraper'

    # Hierarchy keys carried through request meta, in crawl order.
    HIERARCHY = ['continent', 'country', 'region', 'sub_region', 'area',
                 'zone', 'crag']
    # Full CSV schema, declared once instead of duplicated in both branches.
    FIELDNAMES = HIERARCHY + ['location', 'routes', 'climb_type', 'grades']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # BUG FIX: Scrapy never calls open_spider()/close_spider() on a
        # Spider -- those are item-pipeline hooks -- so the original left
        # self.writer undefined and parse_routes crashed.  Open the file
        # here instead; closed() (which Scrapy *does* call) releases it.
        self.open_spider(self)

    def open_spider(self, spider):
        """Open crag_data.csv for appending; write a header only when the
        file is new or empty."""
        needs_header = (not os.path.exists('crag_data.csv')
                        or os.path.getsize('crag_data.csv') == 0)
        self.csvfile = open('crag_data.csv', 'a', newline='')
        self.writer = csv.DictWriter(self.csvfile, fieldnames=self.FIELDNAMES)
        if needs_header:
            self.writer.writeheader()

    def close_spider(self, spider):
        """Flush and close the CSV file."""
        self.csvfile.close()

    def closed(self, reason):
        # Scrapy invokes spider.closed(reason) when the crawl finishes.
        self.close_spider(self)

    def start_requests(self):
        yield scrapy.Request(url='https://www.thecrag.com/en/climbing/world',
                             callback=self.parse_continents)

    # --- internal helpers -------------------------------------------------

    def _inherited_meta(self, response):
        """Copy only our hierarchy/location keys out of response.meta,
        leaving Scrapy's internal meta entries (depth, etc.) behind."""
        keys = self.HIERARCHY + ['location']
        return {k: response.meta[k] for k in keys if k in response.meta}

    def _follow_nodes(self, response, level, callback, extra_meta=None):
        """Yield one request per child node link on the page, recording the
        node's name under *level* in the request meta."""
        base = self._inherited_meta(response)
        if extra_meta:
            base.update(extra_meta)
        for node in response.css('span.primary-node-name a'):
            meta = dict(base)
            meta[level] = node.css('::text').get()
            yield response.follow(url=node.css('::attr(href)').get(),
                                  callback=callback, meta=meta)

    # --- one callback per hierarchy level ---------------------------------

    def parse_continents(self, response):
        yield from self._follow_nodes(response, 'continent', self.parse_countries)

    def parse_countries(self, response):
        # Restrict the crawl to the United Kingdom.
        for request in self._follow_nodes(response, 'country', self.parse_regions):
            if request.meta['country'] in ('United Kingdom', 'UK'):
                yield request

    def parse_regions(self, response):
        yield from self._follow_nodes(response, 'region', self.parse_sub_regions)

    def parse_sub_regions(self, response):
        yield from self._follow_nodes(response, 'sub_region', self.parse_areas)

    def parse_areas(self, response):
        yield from self._follow_nodes(response, 'area', self.parse_zones)

    def parse_zones(self, response):
        yield from self._follow_nodes(response, 'zone', self.parse_crags)

    def parse_crags(self, response):
        # The nav header carries a human-readable location string for the zone.
        location = response.css('li.nav-header::text').get()
        yield from self._follow_nodes(response, 'crag', self.parse_routes,
                                      extra_meta={'location': location})

    def parse_routes(self, response):
        """Terminal callback: collect route names/types/grades for one crag
        and append the row to the CSV."""
        row = self._inherited_meta(response)
        row['routes'] = response.css('span.primary-node-name a::text').getall()
        row['climb_type'] = response.css('span.tags::text').getall()
        row['grades'] = response.css('span.gb3::text').getall()
        self.writer.writerow(row)
# Run the spider and block until the crawl finishes.
process = CrawlerProcess()
process.crawl(CragScraper)
process.start()

# Check whether the output file was created.
print("Files in the current directory after running the scraper:")
print(os.listdir(os.getcwd()))

# Read the CSV file into a pandas DataFrame if it exists.
if 'crag_data.csv' in os.listdir(os.getcwd()):
    crag_df = pd.read_csv('crag_data.csv')
    print(crag_df.head(10))
else:
    # BUG FIX: clearer failure message (was "is not working").
    print("The file 'crag_data.csv' was not created")

# BUG FIX: this import was fused onto the previous print() line by the
# notebook export; split onto its own line (it begins the next cell).
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
class CragScraper(scrapy.Spider):
    """Verbose variant of the crag crawler: walks thecrag.com's hierarchy
    down to UK crags, printing every node it visits, and accumulates the
    scraped rows in ``CragScraper.data``."""

    name = 'crag_scraper'

    # BUG FIX: class-level accumulator.  The driver code below builds a
    # *new* CragScraper() after the crawl to read .data; with the original
    # instance-level list that fresh object always saw [].  Storing the
    # list on the class lets any instance observe the collected rows.
    data = []

    def __init__(self, *args, **kwargs):
        # BUG FIX: forward args to scrapy.Spider.__init__ (the original
        # skipped the base initializer, dropping any spider kwargs).
        super().__init__(*args, **kwargs)

    def start_requests(self):
        urls = ['https://www.thecrag.com/en/climbing/world']
        for url in urls:
            print(f"Starting with URL: {url}")
            yield scrapy.Request(url=url, callback=self.parse_continents)

    def parse_continents(self, response):
        continents = response.css('span.primary-node-name a')
        for continent in continents:
            link = continent.css('::attr(href)').get()
            name = continent.css('::text').get()
            print(f"Found continent: {name} with link: {link}")
            yield response.follow(url=link, callback=self.parse_countries,
                                  meta={'continent': name})

    def parse_countries(self, response):
        countries = response.css('span.primary-node-name a')
        continent = response.meta['continent']
        for country in countries:
            link = country.css('::attr(href)').get()
            name = country.css('::text').get()
            print(f"Found country: {name} in continent: {continent} with link: {link}")
            # Restrict the crawl to the United Kingdom.
            if name in ['United Kingdom', 'UK']:
                yield response.follow(url=link, callback=self.parse_regions,
                                      meta={'continent': continent, 'country': name})

    def parse_regions(self, response):
        regions = response.css('span.primary-node-name a')
        continent = response.meta['continent']
        country = response.meta['country']
        for region in regions:
            link = region.css('::attr(href)').get()
            name = region.css('::text').get()
            print(f"Found region: {name} in country: {country} with link: {link}")
            yield response.follow(url=link, callback=self.parse_sub_regions,
                                  meta={'continent': continent, 'country': country,
                                        'region': name})

    def parse_sub_regions(self, response):
        sub_regions = response.css('span.primary-node-name a')
        continent = response.meta['continent']
        country = response.meta['country']
        region = response.meta['region']
        for sub_region in sub_regions:
            link = sub_region.css('::attr(href)').get()
            name = sub_region.css('::text').get()
            print(f"Found sub-region: {name} in region: {region} with link: {link}")
            yield response.follow(url=link, callback=self.parse_areas,
                                  meta={'continent': continent, 'country': country,
                                        'region': region, 'sub_region': name})

    def parse_areas(self, response):
        areas = response.css('span.primary-node-name a')
        continent = response.meta['continent']
        country = response.meta['country']
        region = response.meta['region']
        sub_region = response.meta['sub_region']
        for area in areas:
            link = area.css('::attr(href)').get()
            name = area.css('::text').get()
            print(f"Found area: {name} in sub-region: {sub_region} with link: {link}")
            yield response.follow(url=link, callback=self.parse_zones,
                                  meta={'continent': continent, 'country': country,
                                        'region': region, 'sub_region': sub_region,
                                        'area': name})

    def parse_zones(self, response):
        zones = response.css('span.primary-node-name a')
        continent = response.meta['continent']
        country = response.meta['country']
        region = response.meta['region']
        sub_region = response.meta['sub_region']
        area = response.meta['area']
        for zone in zones:
            link = zone.css('::attr(href)').get()
            name = zone.css('::text').get()
            print(f"Found zone: {name} in area: {area} with link: {link}")
            yield response.follow(url=link, callback=self.parse_crags,
                                  meta={'continent': continent, 'country': country,
                                        'region': region, 'sub_region': sub_region,
                                        'area': area, 'zone': name})

    def parse_crags(self, response):
        crags = response.css('span.primary-node-name a')
        continent = response.meta['continent']
        country = response.meta['country']
        region = response.meta['region']
        sub_region = response.meta['sub_region']
        area = response.meta['area']
        zone = response.meta['zone']
        # The nav header carries a human-readable location string for the zone.
        location = response.css('li.nav-header::text').get()
        for crag in crags:
            link = crag.css('::attr(href)').get()
            name = crag.css('::text').get()
            print(f"Found crag: {name} in zone: {zone} with link: {link}")
            yield response.follow(url=link, callback=self.parse_routes,
                                  meta={'continent': continent, 'country': country,
                                        'region': region, 'sub_region': sub_region,
                                        'area': area, 'zone': zone, 'crag': name,
                                        'location': location})

    def parse_routes(self, response):
        """Terminal callback: collect route names/types/grades for one crag
        and append the row to the shared data list."""
        # .getall()/.get() are the non-deprecated spellings of the old
        # extract()/extract_first().
        routes = response.css('span.primary-node-name a::text').getall()
        climb_type = response.css('span.tags::text').getall()
        grades = response.css('span.gb3::text').getall()
        dict_crags = {
            'continent': response.meta['continent'],
            'country': response.meta['country'],
            'region': response.meta['region'],
            'sub_region': response.meta['sub_region'],
            'area': response.meta['area'],
            'zone': response.meta['zone'],
            'crag': response.meta['crag'],
            'location': response.meta['location'],
            'routes': routes,
            'climb_type': climb_type,
            'grades': grades
        }
        print(f"Collected data: {dict_crags}")
        self.data.append(dict_crags)
# Run the spider.  Keep a handle on the Crawler so the spider instance that
# actually performed the crawl (and its collected .data) is reachable
# afterwards.
process = CrawlerProcess()
crawler = process.create_crawler(CragScraper)
process.crawl(crawler)
process.start()

# BUG FIX: the original built a *new* CragScraper() here, whose .data list
# is always empty.  Read the data from the spider that ran the crawl.
# NOTE(review): assumes crawler.spider is still set after the crawl
# completes -- confirm against the installed Scrapy version.
scraper = crawler.spider
df = pd.DataFrame(scraper.data)

# Display the first few rows of the DataFrame.
print(df.head())
# Ensure Scrapy is installed ('!pip' is a Jupyter cell magic; not valid
# outside IPython/Jupyter).
!pip install scrapy

import scrapy
from scrapy.crawler import CrawlerProcess
import csv
import os
import pandas as pd
class CragScraper(scrapy.Spider):
    """Crawl thecrag.com down to individual UK crags, appending one CSV row
    per crag to crag_data.csv and accumulating the same rows in
    ``CragScraper.data`` for in-memory use after the crawl.

    Every level of the site hierarchy uses the same listing markup
    (``span.primary-node-name a``), so a single helper follows child links
    and records each node's name in request meta under its level key.
    """

    name = 'crag_scraper'

    # Hierarchy keys carried through request meta, in crawl order.
    HIERARCHY = ['continent', 'country', 'region', 'sub_region', 'area',
                 'zone', 'crag']
    # Full CSV schema, declared once instead of duplicated in both branches.
    FIELDNAMES = HIERARCHY + ['location', 'routes', 'climb_type', 'grades']

    # BUG FIX: class-level accumulator.  The driver code below reads .data
    # from a fresh CragScraper() after the crawl; with the original
    # instance-level list that fresh object always saw [].
    data = []

    def __init__(self, *args, **kwargs):
        # BUG FIX: forward args to scrapy.Spider.__init__ (original skipped it).
        super().__init__(*args, **kwargs)
        # BUG FIX: Scrapy never calls open_spider()/close_spider() on a
        # Spider -- those are item-pipeline hooks -- so the original's CSV
        # file was never opened and self.writer never existed.  Wire them
        # up here and in closed(), which Scrapy *does* call.
        self.open_spider(self)

    def open_spider(self, spider):
        """Open crag_data.csv for appending; write a header only when the
        file is new or empty."""
        needs_header = (not os.path.exists('crag_data.csv')
                        or os.path.getsize('crag_data.csv') == 0)
        self.csvfile = open('crag_data.csv', 'a', newline='')
        self.writer = csv.DictWriter(self.csvfile, fieldnames=self.FIELDNAMES)
        if needs_header:
            self.writer.writeheader()

    def close_spider(self, spider):
        """Flush and close the CSV file."""
        self.csvfile.close()

    def closed(self, reason):
        # Scrapy invokes spider.closed(reason) when the crawl finishes.
        self.close_spider(self)

    def start_requests(self):
        yield scrapy.Request(url='https://www.thecrag.com/en/climbing/world',
                             callback=self.parse_continents)

    # --- internal helpers -------------------------------------------------

    def _inherited_meta(self, response):
        """Copy only our hierarchy/location keys out of response.meta,
        leaving Scrapy's internal meta entries behind."""
        keys = self.HIERARCHY + ['location']
        return {k: response.meta[k] for k in keys if k in response.meta}

    def _follow_nodes(self, response, level, callback, extra_meta=None):
        """Yield one request per child node link on the page, recording the
        node's name under *level* in the request meta."""
        base = self._inherited_meta(response)
        if extra_meta:
            base.update(extra_meta)
        for node in response.css('span.primary-node-name a'):
            meta = dict(base)
            meta[level] = node.css('::text').get()
            yield response.follow(url=node.css('::attr(href)').get(),
                                  callback=callback, meta=meta)

    # --- one callback per hierarchy level ---------------------------------

    def parse_continents(self, response):
        yield from self._follow_nodes(response, 'continent', self.parse_countries)

    def parse_countries(self, response):
        # Restrict the crawl to the United Kingdom.
        for request in self._follow_nodes(response, 'country', self.parse_regions):
            if request.meta['country'] in ('United Kingdom', 'UK'):
                yield request

    def parse_regions(self, response):
        yield from self._follow_nodes(response, 'region', self.parse_sub_regions)

    def parse_sub_regions(self, response):
        yield from self._follow_nodes(response, 'sub_region', self.parse_areas)

    def parse_areas(self, response):
        yield from self._follow_nodes(response, 'area', self.parse_zones)

    def parse_zones(self, response):
        yield from self._follow_nodes(response, 'zone', self.parse_crags)

    def parse_crags(self, response):
        # The nav header carries a human-readable location string for the zone.
        location = response.css('li.nav-header::text').get()
        yield from self._follow_nodes(response, 'crag', self.parse_routes,
                                      extra_meta={'location': location})

    def parse_routes(self, response):
        """Terminal callback: collect route names/types/grades for one crag,
        persist the row to CSV, and keep it in memory for the driver."""
        row = self._inherited_meta(response)
        row['routes'] = response.css('span.primary-node-name a::text').getall()
        row['climb_type'] = response.css('span.tags::text').getall()
        row['grades'] = response.css('span.gb3::text').getall()
        self.writer.writerow(row)
        self.data.append(row)
# Run the spider via an explicit Crawler so the spider instance that
# actually ran is reachable once the crawl completes.
process = CrawlerProcess()
crawler = process.create_crawler(CragScraper)
process.crawl(crawler)
process.start()

# BUG FIX: the original instantiated a *new* CragScraper() here, whose
# .data list is always empty.  Read from the spider that ran the crawl.
# NOTE(review): assumes crawler.spider is still set after the crawl
# completes -- confirm against the installed Scrapy version.
scraper = crawler.spider
df_crag = pd.DataFrame(scraper.data)
print(df_crag.head(10))