Course Notes: Introduction to Natural Language Processing in Python
Building a bag-of-words with Counter
# Import Counter and the NLTK word tokenizer
from collections import Counter
from nltk.tokenize import word_tokenize
# Tokenize the article (the raw article text loaded by the course) into words: tokens
tokens = word_tokenize(article)
# Convert the tokens into lowercase: lower_tokens
lower_tokens = [t.lower() for t in tokens]
# Create a Counter with the lowercase tokens: bow_simple
bow_simple = Counter(lower_tokens)
# Print the 10 most common tokens
print(bow_simple.most_common(10))
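A quick self-contained version of the same pattern, using a made-up sample sentence instead of the course article (nltk.download('punkt') may be needed once for the tokenizer data):
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize
# nltk.download('punkt')  # uncomment if the tokenizer data is missing
sample = "The cat sat on the mat. The mat was warm."
sample_tokens = [t.lower() for t in word_tokenize(sample)]
# Count the lowercase tokens and show the most common ones
print(Counter(sample_tokens).most_common(3))  # e.g. [('the', 3), ('mat', 2), ('.', 2)]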
Example code for using the lemmatizer
# Import WordNetLemmatizer and the NLTK English stop words
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Build the set of English stop words (nltk.download('stopwords') may be needed once): english_stops
english_stops = set(stopwords.words('english'))
# Retain alphabetic words: alpha_only
alpha_only = [t for t in lower_tokens if t.isalpha()]
# Remove all stop words: no_stops
no_stops = [t for t in alpha_only if t not in english_stops]
# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
# Lemmatize all tokens into a new list: lemmatized
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]
# Create the bag-of-words: bow
bow = Counter(lemmatized)
# Print the 10 most common tokens
print(bow.most_common(10))
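Worth remembering: WordNetLemmatizer.lemmatize() treats every word as a noun unless you pass a pos tag, so verbs only get reduced with pos='v'. A small illustrative check, not from the course (nltk.download('wordnet') may be needed once):
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running"))           # 'running' (default pos is noun)
print(lemmatizer.lemmatize("running", pos="v"))  # 'run'
print(lemmatizer.lemmatize("geese"))             # 'goose'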
Creating and querying a corpus
# Import Dictionary
from gensim.corpora.dictionary import Dictionary
# Create a Dictionary from the articles (each article is a list of tokens): dictionary
dictionary = Dictionary(articles)
# Select the id for "computer": computer_id
computer_id = dictionary.token2id.get("computer")
# Use computer_id with the dictionary to print the word
print(dictionary.get(computer_id))
# Create a bag-of-words corpus (one doc2bow vector per article): corpus
corpus = [dictionary.doc2bow(article) for article in articles]
# Print the first 10 word ids with their frequency counts from the fifth document
print(corpus[4][:10])
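To see what Dictionary and doc2bow actually store, a tiny hypothetical corpus is easier to read than the course articles (the documents below are made up; exact ids may differ):
from gensim.corpora.dictionary import Dictionary
toy_docs = [["computer", "science", "computer"], ["data", "science"]]
toy_dict = Dictionary(toy_docs)
print(toy_dict.token2id)              # e.g. {'computer': 0, 'science': 1, 'data': 2}
print(toy_dict.doc2bow(toy_docs[0]))  # [(0, 2), (1, 1)] -> 'computer' twice, 'science' once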
# Save the fifth document: doc
doc = corpus[4]
# Sort the doc's (word_id, count) pairs by descending frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)
# Print the top 5 words of the document alongside their counts
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)
# Import defaultdict and itertools for corpus-wide word counts
from collections import defaultdict
import itertools
# Create the defaultdict of total counts across all documents: total_word_count
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count
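A natural next step (this is what the course does with the totals, sketched from memory): sort the defaultdict and look up the most frequent words across the whole corpus.
# Create a sorted list from the defaultdict: sorted_word_count
sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True)
# Print the top 5 words across all documents alongside their counts
for word_id, word_count in sorted_word_count[:5]:
    print(dictionary.get(word_id), word_count)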
Named entity recognition with spaCy
# Import spacy
import spacy
# Load the small English pipeline, keeping only the components NER needs: nlp
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser'])
# Create a new document: doc
doc = nlp(article)
# Print all of the found entities and their labels
for ent in doc.ents:
    print(ent.label_, ent.text)
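For a quick sanity check without the course article, the same NER loop on a made-up sentence (labels and spans depend on the model version):
sample_doc = nlp("Apple is looking at buying a U.K. startup for $1 billion.")
for ent in sample_doc.ents:
    print(ent.label_, ent.text)
# Typically prints something like: ORG Apple / GPE U.K. / MONEY $1 billion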