Skip to content
Course Notes: Web Scraping in Python
  • AI Chat
  • Code
  • Report
  • Course Notes

    Use this workspace to take notes, store code snippets, or build your own interactive cheatsheet! The datasets used in this course are available in the datasets folder.

    # Import any packages you want to use here
    

    Take Notes

    Add notes here about the concepts you've learned and code cells with code you want to keep.

    Add your notes here

    Your job in this exercise is to create an XPath that selects the href attribute values of all hyperlink (a) elements whose class attribute contains the string "package-snippet". If you do it correctly, you should find that your XPath string selects 10 elements and that the preview shows links.

    Fill in the blanks below to assign to the variable xpath an XPath string that selects the href attribute values of all hyperlink (a) elements whose class attribute contains the string "package-snippet". Remember that the contains() function is used inside the XPath string to check whether an attribute value contains a particular string.

    # Create an xpath to the href attributes:
    #   //a                         -> every a element anywhere in the document
    #   [contains(@class,"...")]    -> keep those whose class attribute contains the substring
    #   /@href                      -> step to the href attribute of each remaining a
    xpath = '//a[contains(@class,"package-snippet")]/@href'
    
    # Print out how many elements are selected
    # NOTE(review): how_many_elements and preview are not defined in these notes;
    # presumably they are helpers supplied by the exercise environment.
    how_many_elements( xpath )
    # Preview the selected elements
    preview( xpath )
    Run cancelled
    # First select every div in the document; then, relative to each of those
    # divs, select the third p child of each of its span children.
    all_divs = sel.xpath( '//div' )
    res = all_divs.xpath( './span/p[3]' )
    print( res )
    !pip install scrapy
    # Import a scrapy Selector
    from scrapy import Selector
    
    # Import requests
    import requests
    url = 'https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short'
    
    # Download the page. The timeout keeps this cell from hanging forever on a
    # stalled connection, and raise_for_status() stops on an HTTP error instead
    # of silently counting the elements of an error page.
    response = requests.get( url, timeout = 30 )
    response.raise_for_status()
    
    # html holds the raw response body (bytes, not a decoded str)
    html = response.content
    
    # Create the Selector object sel from html
    sel = Selector( text = html )
    
    # Print out the number of elements in the HTML document.
    # The 1020 in the message is a hard-coded expected count; '//*' matches
    # every element in the parsed document.
    print( "There are 1020 elements in the HTML document.")
    print( "You have found: ", len( sel.xpath('//*') ) )
    Run cancelled
    # Import a scrapy Selector
    from scrapy import Selector
    
    # Import requests
    import requests
    url = 'https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short'
    
    # Download the page. The timeout keeps this cell from hanging forever on a
    # stalled connection, and raise_for_status() stops on an HTTP error instead
    # of silently counting the elements of an error page.
    response = requests.get( url, timeout = 30 )
    response.raise_for_status()
    
    # html holds the raw response body (bytes, not a decoded str)
    html = response.content
    
    # Create the Selector object sel from html
    sel = Selector( text = html )
    
    # Print out the number of elements in the HTML document.
    # The 1020 in the message is a hard-coded expected count; '//*' matches
    # every element in the parsed document.
    print( "There are 1020 elements in the HTML document.")
    print( "You have found: ", len( sel.xpath('//*') ) )
    Run cancelled
    # Create the XPath string equivalent to the CSS Locator below:
    # the first span child of body, then any a descendant of that span.
    xpath = '/html/body/span[1]//a'
    
    # Create the CSS Locator string equivalent to the XPath above:
    # '>' is a direct-child step (like '/'), a space is a descendant step (like '//').
    css_locator = 'html > body > span:nth-of-type(1) a'
    Run cancelled
    # Create the XPath string equivalent to the CSS Locator below:
    # any div whose id is "uid", its span children, then any h4 descendant of those spans.
    xpath = '//div[@id="uid"]/span//h4'
    
    # Create the CSS Locator string equivalent to the XPath above:
    # 'div#uid' matches on id, '>' is a direct-child step, a space is a descendant step.
    css_locator = 'div#uid > span h4'
    from scrapy import Selector
    
    # Create a selector from the html (of a secret website).
    # Built once: parsing the same html a second time would only rebuild an
    # identical selector.
    # NOTE(review): html is not defined in this cell; presumably it is supplied
    # by the exercise environment.
    sel = Selector( text = html )
    
    # Fill in the blank: div elements that belong to the class course-block
    css_locator = 'div.course-block'
    
    # Print the number of selected elements.
    # NOTE(review): how_many_elements is not defined in these notes; presumably
    # a helper supplied by the exercise environment.
    how_many_elements( css_locator )
    # Create the CSS Locator to all children of the element whose id is uid:
    # '#uid' matches on id, '>' restricts to direct children, '*' matches any element.
    css_locator = '#uid > *'
    from scrapy import Selector
    
    # Create a selector object from a secret website
    # NOTE(review): html is not defined in this cell; presumably it is supplied
    # by the exercise environment.
    sel = Selector( text = html )
    
    # Select all hyperlinks that are direct children ('>') of div elements
    # belonging to class "course-block"
    course_as = sel.css( 'div.course-block > a' )
    
    # Selecting all href attributes chaining with css
    # ('::attr(href)' extracts the href attribute of each selected a element)
    hrefs_from_css = course_as.css( '::attr(href)' )
    
    # Selecting all href attributes chaining with xpath
    # ('./@href' is relative to each selected a element; equivalent to the css form above)
    hrefs_from_xpath = course_as.xpath( './@href' )
    # Create an XPath string to the desired text:
    # text() selects the text nodes directly inside the p element whose id is "p3".
    xpath = '//p[@id="p3"]/text()'
    
    # Create a CSS Locator string to the desired text:
    # '::text' is the CSS counterpart of text() for the same p element.
    css_locator = 'p#p3::text'
    
    # Print the text from our selections
    # NOTE(review): print_results is not defined in these notes; presumably a
    # helper supplied by the exercise environment.
    print_results( xpath, css_locator )