Skip to content
Course Notes: Web Scraping in Python
  • AI Chat
  • Code
  • Report
  • Web scraping in Python

    from scrapy import Selector
    import requests

    Introduction to HTML

    # Minimal HTML document used by the introductory examples.
    html = '''
    <html>
      <head>
        <title>Intro HTML</title>
      </head>
      <body>
        <p>Hello World!</p>
        <p>Enjoy Datacamp!</p>
      </body>
    </html>
    '''

    # Two XPath syntax examples (the document above has no div or span,
    # so they illustrate notation rather than select anything from it):
    # - absolute path: the <p> children of the second <div> child of <body>
    xpath = "/html/body/div[2]/p"
    # - attribute predicate: every <span>, at any depth, whose class
    #   attribute is exactly "span-class"
    xpath2 = '//span[@class="span-class"]'

    XPaths and selectors

    # The trailing wildcard matches every element that is a direct child
    # of <body>, whatever its tag name.
    xpath = "/html/body/*"

    # Report how many elements the path selects.
    how_many_elements(xpath)
    <html>
      <body>
        <div>
          <p>Hello World!</p>
          <div>
            <p>Choose DataCamp!</p>
          </div>
        </div>
        <div>
          <p>Thanks for Watching!</p>
        </div>
      </body>
    </html>
    
    # Walk body -> div -> div -> p. In the HTML shown above only the first
    # top-level div contains a nested div, so this path reaches the
    # "Choose DataCamp!" paragraph.
    xpath = "/html/body/div/div/p"

    # Show the text of the selected element.
    print_element_text(xpath)
    # Create an XPath string for the p children of whichever element
    # (any tag, any depth) has id="div3".
    xpath = '//*[@id="div3"]/p'

    # Print out the text of the selection.
    print_element_text(xpath)
    # Select p elements by class. The predicate is an exact string match,
    # so the class attribute must read "class-1 class-2" verbatim.
    xpath = '//p[@class="class-1 class-2"]'

    # Print out the text of the selection.
    print_element_text(xpath)
    # Ending the path with /@href selects the attribute itself rather than
    # the element: here, the href of each <a> child of the p with id="p2".
    xpath = '//p[@id="p2"]/a/@href'

    # Print out the selection(s); there should be only one.
    print_attribute(xpath)
    # Select the href attribute of every <a> whose href contains the
    # substring "package-snippet".
    # NOTE(review): "package-snippet" looks like a class name; if so, the
    # predicate should probably test @class instead of @href. Confirm
    # against the target page before relying on this selection.
    xpath = '//a[contains(@href,"package-snippet")]/@href'

    # Print out how many elements are selected, then preview them.
    how_many_elements(xpath)
    preview(xpath)
    # Chained selection: start from every div, then, relative to each one
    # (leading "./"), take the third p child of its span children.
    # NOTE(review): `sel` is used here before the assignment below;
    # presumably this line comes from a separate example. Confirm.
    sel.xpath("//div").xpath("./span/p[3]")

    # Build a Selector that parses the `html` string as its document.
    sel = Selector(text=html)

    # SelectorList holding every div element found in that document.
    divs = sel.xpath('//div')
    
    
    # Download the page at `url` and keep the body of the response as the
    # HTML source.
    html = requests.get(url).content

    # Parse the downloaded source into a Selector.
    sel = Selector(text=html)

    # "//*" matches every element in the document, so the length of that
    # selection is the total element count, printed next to the expected
    # figure for comparison.
    print("There are 1020 elements in the HTML document.")
    print("You have found: ", len(sel.xpath("//*")))