Skip to content

Web scraping in Python

from scrapy import Selector
import requests

Introduction to HTML

# Minimal HTML document used as the running example: a head with a title
# and a body holding two paragraphs.
html = """
<html>
  <head>
    <title>Intro HTML</title>
  </head>
  <body>
    <p>Hello World!</p>
    <p>Enjoy Datacamp!</p>
  </body>
</html>
"""

# Example XPath strings. Note that the document above contains no div or
# span elements, so these illustrate the notation rather than match it.
# Absolute path: p children of the second div child of body.
xpath = "/html/body/div[2]/p"
# Any span, at any depth, whose class attribute is exactly "span-class".
xpath2 = '//span[@class="span-class"]'

Xpaths and selectors

# XPath matching every direct child of the body element, whatever its tag
xpath = "/html/body/*"

# Report how many elements the path selects
# (how_many_elements is a helper supplied outside this file)
how_many_elements(xpath)
<html>
  <body>
    <div>
      <p>Hello World!</p>
      <div>
        <p>Choose DataCamp!</p>
      </div>
    </div>
    <div>
      <p>Thanks for Watching!</p>
    </div>
  </body>
</html>

# XPath to the paragraph that sits inside a div nested in a body-level div
xpath = "/html/body/div/div/p"

# Show the text of the selected element
# (print_element_text is a helper supplied outside this file)
print_element_text(xpath)
# XPath to the p children of whichever element carries id="div3"
xpath = '//*[@id="div3"]/p'

# Show the text of the selection
print_element_text(xpath)
# XPath to p elements whose class attribute is exactly "class-1 class-2"
# (the @class="..." test compares the whole attribute string)
xpath = '//p[@class="class-1 class-2"]'

# Show the text of the selected element
print_element_text(xpath)
# XPath to the href attribute of a elements directly under the p with id="p2"
xpath = '//p[@id="p2"]/a/@href'

# Show the selected attribute value(s); exactly one is expected
# (print_attribute is a helper supplied outside this file)
print_attribute(xpath)
# XPath to the href attribute of every a element whose href contains
# the substring "package-snippet"
xpath = '//a[contains(@href,"package-snippet")]/@href'

# Report how many matches there are
how_many_elements(xpath)
# Then show a preview of the matches
# (how_many_elements and preview are helpers supplied outside this file)
preview(xpath)
# Create a Selector that parses the string html as an HTML document
sel = Selector(text=html)

# Create a SelectorList of all div elements in the HTML document
divs = sel.xpath("//div")

# xpath calls can be chained: the second path starts with "./", so it is
# evaluated relative to each div selected by the first call. This
# expression needs sel, so it is placed after the Selector is created.
sel.xpath('//div').xpath('./span/p[3]')

# Download the page. url is not defined in this snippet and must be set
# beforehand. The timeout keeps a stalled server from blocking forever,
# and raise_for_status() prevents an HTTP error page (4xx/5xx) from being
# parsed and counted as if it were the real document.
response = requests.get(url, timeout=30)
response.raise_for_status()

# html holds the HTML source as returned by the server (.content is the
# raw response body as bytes, not a decoded str)
html = response.content

# Create the Selector object sel from html
sel = Selector(text=html)

# Print the expected element count next to the count actually found;
# '//*' selects every element in the document
print("There are 1020 elements in the HTML document.")
print("You have found: ", len(sel.xpath('//*')))