Skip to content

In this workspace, you'll scrape the novel Moby Dick from the website Project Gutenberg (which contains a large corpus of books) using the Python requests package. You'll extract words from this web data using BeautifulSoup before analyzing the distribution of words using the Natural Language Toolkit (nltk) and Counter.

The Data Science pipeline you'll build in this workspace can be used to visualize the word frequency distributions of any novel you can find on Project Gutenberg.

# Import and download packages
import requests
from bs4 import BeautifulSoup
import nltk
from collections import Counter
# Fetch nltk's stop-word corpus once up front (no-op if already cached locally)
nltk.download('stopwords')

# Start coding here... 

Import libraries and request the HTML page

# Cell 1: Import libraries and request the HTML page

import requests

# Project Gutenberg's HTML edition of Moby Dick (hosted on DataCamp's S3 bucket)
url = "https://s3.amazonaws.com/assets.datacamp.com/production/project_147/datasets/2701-h.htm"

# Fetch the page; force UTF-8 so the decoded text is consistent regardless
# of what charset the server reports, then keep the raw HTML string.
response = requests.get(url)
response.encoding = 'utf-8'
html = response.text

Parse HTML and extract text with BeautifulSoup

# Cell 2: Parse HTML and extract text with BeautifulSoup

from bs4 import BeautifulSoup

# Create BeautifulSoup object using html.parser
html_soup = BeautifulSoup(html, "html.parser")

# Extract the visible text of the book from the parsed HTML.
# BUG FIX: the original called soup.get_text(), but the parsed object was
# bound to html_soup above — `soup` was never defined, raising NameError.
moby_text = html_soup.get_text()

Tokenize the text using RegexpTokenizer

# Cell 3: Tokenize the text using RegexpTokenizer

import nltk
from nltk.tokenize import RegexpTokenizer

# \w+ matches runs of alphanumeric characters, so punctuation is dropped
tokenizer = RegexpTokenizer(r'\w+')

# Split the whole novel into word tokens
tokens = tokenizer.tokenize(moby_text)

# Normalize to lowercase so counts aren't split by capitalization
words = [word.lower() for word in tokens]

Remove stopwords from the lowercased words

# Cell 4: Remove stopwords and create stop_words

from nltk.corpus import stopwords

# Download stopwords if needed (no-op when the corpus is already cached)
nltk.download('stopwords')

# Create stop_words: the English stop words from nltk.
# IMPROVEMENT: stored as a set — the original kept a list, which makes each
# `word not in stop_words` test O(len(stop_words)); a set makes it O(1).
# Downstream code only performs membership tests, so this is backward-compatible.
stop_words = set(stopwords.words('english'))

# Keep only words that carry meaning (drop "the", "a", "of", ...)
words_no_stop = [word for word in words if word not in stop_words]

Count words with Counter and display top ten

# Cell 5: Count word frequencies

from collections import Counter

# Tally how many times each remaining word occurs
count = Counter(words_no_stop)

# The ten most frequent words, as (word, frequency) pairs
top_ten = count.most_common(10)
print(top_ten)