Skip to content
Large Language Models
  • AI Chat
  • Code
  • Report
  • Spinner

    OpenAI's New Embedding Models

    Import relevant libraries

    %%bash
    pip -q install tiktoken
    pip -q install openai
    import os
    import tiktoken
    import numpy as np
    import pandas as pd
    from openai import OpenAI
    from sklearn.metrics.pairwise import cosine_similarity

    Load the dataset

    # Load the document sample from a local Parquet file
    # (the file name suggests a CORD-19 sample -- confirm against the data source).
    scientific_docs = pd.read_parquet("./data/cord19_df_sample.parquet")
    # Preview the first two rows.
    scientific_docs.head(2)
    # Percentage of null values per column: null count * 100 / number of rows.
    percent_missing = scientific_docs.isnull().sum() * 100 / len(scientific_docs)
    percent_missing
    def concatenate_columns_with_null_handling(df, body_text_column,
                                               abstract_column,
                                               title_column,
                                               new_col_name,
                                               separator=""):
        """
        Concatenate three text columns into a new column, treating nulls as ''.

        The fields are joined in the order body text, abstract, title.

        Args:
            df: DataFrame holding the three source columns. It is modified
                in place: ``new_col_name`` is added (or overwritten).
            body_text_column: Name of the column placed first.
            abstract_column: Name of the column placed second.
            title_column: Name of the column placed last.
            new_col_name: Name of the column that receives the result.
            separator: String inserted between consecutive fields. The
                default ``""`` reproduces plain concatenation; pass e.g.
                ``" "`` so the last word of one field does not fuse with the
                first word of the next. The separator is inserted even when
                a field is empty.

        Returns:
            The same DataFrame object that was passed in.
        """
        source_columns = (body_text_column, abstract_column, title_column)

        # Replace nulls with '' first so one missing field does not turn the
        # whole concatenated value into NaN.
        combined = df[source_columns[0]].fillna('')
        for column in source_columns[1:]:
            combined = combined + separator + df[column].fillna('')

        df[new_col_name] = combined

        return df
    # Add a "concatenated_text" column built from body text, abstract and title.
    # The helper writes the column into the DataFrame it receives and returns
    # that same object, so new_scientific_docs and scientific_docs are the
    # same DataFrame.
    new_scientific_docs = concatenate_columns_with_null_handling(scientific_docs, 
                                                                 "body_text", 
                                                                 "abstract", 
                                                                 "title", 
                                                                 "concatenated_text")
    # Preview the first three rows, including the new column.
    new_scientific_docs.head(3)

    Now we can count the total number of tokens in each document.

    def num_tokens_from_text(text: str, encoding_name="cl100k_base"):
        """
        Count how many tokens ``text`` encodes to.

        The tokenizer is looked up by name via ``tiktoken.get_encoding``;
        the result is the length of the token sequence it produces.
        """
        tokenizer = tiktoken.get_encoding(encoding_name)
        return len(tokenizer.encode(text))
    
    # Example: count the tokens of one short sentence using the default encoding.
    text = "This article is about new OpenAI Embeddings."
    
    num_tokens = num_tokens_from_text(text)
    
    print(f"Number of tokens: {num_tokens}")

    The function is applied as follows: