Skip to content
Topic Identification with Tf-idf
  • AI Chat
  • Code
  • Report
  • Topic Identification with Tf-idf

    Identify and visualize the words that characterize a text within a collection of texts. Tf–idf, short for term frequency–inverse document frequency, is an information retrieval technique that reflects how important a word is to a document in a collection of documents. A tf-idf value of 0 indicates generic terms; higher values indicate words that uniquely identify a given document.

    %%capture
    !pip install wordcloud
    # Load packages
    import matplotlib.pyplot as plt
    import pandas as pd 
    import numpy as np
    import os 
    import glob
    from sklearn.feature_extraction.text import TfidfVectorizer
    from wordcloud import WordCloud
    # Upload your data as .txt files to the data folder
    def load_corpus(pattern="data/*.txt"):
        """Read every text file matching *pattern* into memory.

        Returns a ``(reference, corpus)`` pair: ``reference`` maps each file's
        name (without its extension) to its text, and ``corpus`` lists the same
        texts in the same order.
        """
        reference = {}
        # glob returns paths in arbitrary order; sort so runs are reproducible.
        for filepath in sorted(glob.glob(pattern)):
            # splitext removes only the final extension, so a name such as
            # "a.txt.txt" cannot collapse onto the same key as "a.txt".
            basename = os.path.splitext(os.path.basename(filepath))[0]
            with open(filepath, encoding="utf-8") as f:
                # Turn line breaks into spaces; deleting them outright would
                # glue the last word of a line to the first word of the next.
                reference[basename] = f.read().replace("\n", " ")
        return reference, list(reference.values())

    reference, corpus = load_corpus()
    # Create a Tfidf matrix
    vectorizer = TfidfVectorizer(
        stop_words='english',   # drop scikit-learn's built-in English stop words
        ngram_range=(1, 1),     # single words only (unigrams)
        max_df=.6,              # ignore terms found in more than 60% of the documents
        min_df=.01,             # ignore terms found in fewer than 1% of the documents
    )

    # One row per document (in corpus order), one column per vocabulary term.
    X = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()
    # toarray() gives a plain ndarray in one step; going through todense()
    # (an np.matrix) and then a Python list of lists copies the data twice.
    dense = X.toarray()
    df = pd.DataFrame(dense, columns=feature_names)
    df
    # Find the top words in each document
    data = df.transpose()             # rows: terms, columns: documents
    data.columns = list(reference)    # label each column with its document name

    # Map each document name to its 30 highest-weighted (term, tf-idf) pairs,
    # strongest first.
    top_dict = {}
    for name in data.columns:
        top = data[name].sort_values(ascending=False).head(30)
        top_dict[name] = list(zip(top.index, top.values))

    # Print the top 15 words said by each President
    for president, top_words in top_dict.items():
        print(president)
        # [:15] keeps fifteen entries; the earlier [0:14] stopped one short.
        print(', '.join(word for word, _score in top_words[:15]))
        print('---')
    # Prep TF-IDF Matrix for Word Clouds
    # Flip the matrix so that every document becomes one column of term
    # weights, then name those columns after the documents.
    data = df.T
    data.columns = reference.keys()
    
    # Color callback for the word cloud: every word gets the same dark shade
    def black_color_func(word, font_size, position,orientation,random_state=None, **kwargs):
        """Return one fixed, near-black HSL color string, ignoring all arguments."""
        return "hsl(0,100%, 1%)"
    
    # Document (a column of `data`) whose words are drawn in the cloud
    document = 'biden'
    if document not in data.columns:
        # Fail with the list of valid names rather than a bare KeyError.
        raise KeyError(f"no document named {document!r}; available: {list(data.columns)}")

    wc = WordCloud(
        background_color="white",   # select background color
        width=3000,                 # set width
        height=2000,                # set height
        max_words=500,              # set max amount of words
    ).generate_from_frequencies(data[document])   # size words by the document's tf-idf weights

    wc.recolor(color_func=black_color_func)       # set the word color to black
    plt.figure(figsize=[15, 10])                  # set the figsize
    plt.imshow(wc, interpolation="bilinear")      # plot the wordcloud
    plt.axis("off")                               # remove plot axes
    plt.savefig('wordcloud.png')                  # pick name and save as png