Skip to content
Web scraping in Python
from scrapy import Selector
import requests
Introduction to HTML
# Write and run code here
# Minimal HTML document used as the running example: a head with a title
# and a body holding two paragraphs.
html = (
    "\n"
    "<html>\n"
    "<head>\n"
    "<title>Intro HTML</title>\n"
    "</head>\n"
    "<body>\n"
    "<p>Hello World!</p>\n"
    "<p>Enjoy Datacamp!</p>\n"
    "</body>\n"
    "</html>\n"
)

# Absolute path: the p children of the second div child of body.
# The sample document above contains no div, so it matches nothing there.
xpath = "/html/body/div[2]/p"

# Any span, at any depth, whose class attribute is exactly "span-class".
xpath2 = '//span[@class="span-class"]'
XPaths and selectors
# The wildcard step selects every direct child of body, whatever its tag.
xpath = "/html/body/*"
# how_many_elements is not defined in this file; presumably the exercise
# environment supplies it and it reports the number of matches.
how_many_elements(xpath)
<html>
<body>
<div>
<p>Hello World!</p>
<div>
<p>Choose DataCamp!</p>
</div>
</div>
<div>
<p>Thanks for Watching!</p>
</div>
</body>
</html>
# Absolute path to a p that sits two div levels below body
# (body > div > div > p).
xpath = "/html/body/div/div/p"
# print_element_text is not defined in this file; presumably it prints the
# text of whatever the expression selects.
print_element_text(xpath)
# XPath to the p children of the element, of any tag, whose id is "div3".
xpath = '//*[@id="div3"]/p'
# Show the text of the selection (helper comes from outside this file).
print_element_text(xpath)
# XPath to p elements selected by class. The comparison is against the whole
# attribute string, so only class="class-1 class-2" (same order, single
# space) matches.
xpath = '//p[@class="class-1 class-2"]'
# Show the selected text (helper comes from outside this file).
print_element_text(xpath)
# XPath to the href attribute of the a children of the p whose id is "p2".
xpath = '//p[@id="p2"]/a/@href'
# Show the selection(s); exactly one is expected. print_attribute is not
# defined in this file.
print_attribute(xpath)
# XPath to the href attribute of every a element whose href value contains
# the substring "package-snippet".
# NOTE(review): the filter tests @href itself; if the links were meant to be
# picked by their class, the predicate should test @class instead - confirm.
xpath = '//a[contains(@href,"package-snippet")]/@href'
# Report how many nodes were selected, then preview them. Both helpers are
# defined outside this file.
how_many_elements(xpath)
preview(xpath)
# Chained selection: first every div in the document, then, relative to each
# of those divs, the third p child of each span child.
# NOTE(review): sel is used here before the assignment below, so this line
# relies on sel already existing when it runs - confirm the intended order.
sel.xpath("//div").xpath("./span/p[3]")
# Wrap the HTML source held in html in a Selector.
sel = Selector(text=html)
# SelectorList of every div element in the document.
divs = sel.xpath("//div")
# Download the page source. url is not defined in this file and is expected
# to be provided by the surrounding environment.
# A timeout is passed explicitly because requests waits indefinitely by
# default, so an unresponsive server would otherwise hang the script.
# .content is the raw response body (bytes), so html now holds bytes rather
# than the str assigned to it earlier in the file.
html = requests.get(url, timeout=10).content
# Create the Selector object sel from the downloaded source.
# NOTE(review): passing bytes presumably relies on the page being UTF-8
# encoded - confirm for the target site.
sel = Selector(text=html)
# Print the expected element count next to the number actually found;
# '//*' selects every element in the document.
print("There are 1020 elements in the HTML document.")
print("You have found: ", len(sel.xpath("//*")))