Skip to content
Topic Identification with Tf-idf
Identify and visualize the words that characterize a text within a collection of texts. Tf–idf, short for term frequency–inverse document frequency, is an information retrieval technique that reflects how important a word is to a document in a collection of documents. A tf-idf value of 0 indicates generic terms; higher values indicate words that uniquely identify a given document.
%%capture
!pip install wordcloud
# Load packages
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
# Upload your data as .txt files to the data folder.
# `reference` maps each file's base name (without the extension) to its text;
# `corpus` holds the same texts in the same order, ready for vectorizing.
reference = dict()
corpus = []
# Sort the paths: glob returns matches in arbitrary order, and a fixed order
# keeps the document order (and every table derived from it) reproducible.
for filepath in sorted(glob.glob("data/*.txt")):
    # splitext removes only the trailing extension, whereas str.replace would
    # also strip a ".txt" that happens to occur in the middle of a file name.
    basename = os.path.splitext(os.path.basename(filepath))[0]
    # Explicit encoding so the result does not depend on the platform default.
    with open(filepath, 'r', encoding='utf-8') as f:
        # Turn line breaks into spaces: deleting them outright would fuse the
        # last word of one line with the first word of the next.
        content = f.read().replace("\n", ' ')
    reference[basename] = content
    corpus.append(content)
# Create a tf-idf matrix: one row per document in `corpus`, one column per term
vectorizer = TfidfVectorizer(
    stop_words='english',  # drop terms from the built-in English stop-word list
    ngram_range=(1, 1),    # single words only (unigrams)
    max_df=.6,             # ignore terms found in more than 60% of the documents
    min_df=.01,            # ignore terms found in fewer than 1% of the documents
)
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()
# toarray() gives a plain ndarray (todense() returns the discouraged np.matrix),
# and the DataFrame is built from it directly instead of from a nested list.
dense = X.toarray()
denselist = dense.tolist()  # kept so later cells that use this name still work
df = pd.DataFrame(dense, columns=feature_names)
df
# Collect, for every document, its 30 highest-scoring terms as (term, score) pairs
top_dict = {}
data = df.transpose()
data.columns = reference.keys()
for doc_name in data.columns:
    ranked = data[doc_name].sort_values(ascending=False)
    best = ranked.head(30)
    top_dict[doc_name] = list(zip(best.index, best.values))
# Print the top 15 words said by each President
for president, top_words in top_dict.items():
    print(president)
    # The slice end is exclusive, so [:15] is what yields 15 entries
    # (the previous [0:14] printed only 14).
    print(', '.join(word for word, _score in top_words[:15]))
    print('---')
# Prep TF-IDF matrix for word clouds: transpose it and label the columns
# with the keys of `reference`
data = df.T
data.columns = list(reference)
# Color callback that paints every word in the same (near-)black tone
def black_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return one fixed HSL color string, whatever the arguments are.

    None of the parameters is used; they are accepted so the function can be
    passed as ``color_func`` to ``WordCloud.recolor``.
    """
    near_black = "hsl(0,100%, 1%)"
    return near_black
# Build the word cloud from the scores stored in one column of `data`
wc = WordCloud(
    background_color="white",  # select background color
    width=3000,                # set width
    height=2000,               # set height
    max_words=500,             # set max amount of words
)
wc.generate_from_frequencies(data['biden'])  # choose column for wordcloud
wc.recolor(color_func=black_color_func)      # set the word color to black

# Draw the cloud without axes and write it to disk
plt.figure(figsize=[15, 10])                 # set the figsize
plt.imshow(wc, interpolation="bilinear");    # plot the wordcloud
plt.axis("off")                              # remove plot axes
plt.savefig('wordcloud.png')                 # pick name and save as png