Skip to content

Medical professionals often summarize patient encounters in transcripts written in natural language, which include details about symptoms, diagnoses, and treatments. These transcripts can be reused for other medical documentation, such as insurance claims, but because they are densely packed with medical information, extracting the key data accurately can be challenging.

You and your team at Lakeside Healthcare Network have decided to leverage the OpenAI API to automatically extract medical information from these transcripts and automate the matching with the appropriate ICD-10 codes. ICD-10 codes are a standardized system used worldwide for diagnosing and billing purposes, such as insurance claims processing.

The Data

The dataset contains anonymized medical transcriptions categorized by specialty.

transcriptions.csv

| Column | Description |
| --- | --- |
| "medical_specialty" | The medical specialty associated with each transcription. |
| "transcription" | Detailed medical transcription texts, with insights into the medical case. |
# Import the necessary libraries
import pandas as pd
from openai import OpenAI
import json
# Load the raw transcriptions. The rest of the script reads the
# 'medical_specialty' and 'transcription' columns from this frame.
df = pd.read_csv("data/transcriptions.csv")
# Bare expression: presumably a notebook-cell preview of the first rows.
df.head()
# Start from an empty frame; the 'age', 'recommended_treatment', 'ICD_code'
# and 'medical_specialty' columns are added to it further down.
df_structured = pd.DataFrame()
df_structured.head()
# Initialize the OpenAI client.
# NOTE(review): `json` is already imported earlier in this file and `requests`
# is not referenced anywhere in the visible code; both imports are left as-is.
import json
import requests

# `OpenAI` is imported from the `openai` package at the top of the file.
# No arguments are passed, so credentials are presumably picked up from the
# environment (e.g. OPENAI_API_KEY) — TODO confirm.
client = OpenAI()

# Tool schema passed to the chat completion call (`tools=`) so the model
# returns the patient's age and a recommended treatment as structured JSON
# arguments instead of free text.
#
# The model only ever sees the prompt text, never the DataFrame, and the
# schema has no medical-specialty output, so the description states just what
# the two parameters actually carry. Both parameters are marked as required
# because the caller reads both keys from the returned arguments.
function_definition = [{
    'type': 'function',
    'function': {
        'name': 'get_age_and_recommended_treatment',
        'description': (
            'Extract the age of the patient stated in a medical transcription '
            'and provide a recommended treatment based on the transcription '
            'and its medical specialty.'
        ),
        'parameters': {
            'type': 'object',
            'properties': {
                'age': {
                    'type': 'string',
                    'description': 'The age of the patient found in the transcription',
                },
                'recommended_treatment': {
                    'type': 'string',
                    'description': 'The recommended treatment for the patient based on the transcription and medical specialty',
                },
            },
            'required': ['age', 'recommended_treatment'],
        },
    },
}]

def get_age_and_recommended_treatment(rec_treatment):
    """Return ``rec_treatment`` unchanged.

    This is the local counterpart of the tool of the same name declared in
    ``function_definition``. The extraction loop calls it once with the
    ``age`` argument and once with the ``recommended_treatment`` argument
    produced by the model; it performs no transformation or validation.

    Args:
        rec_treatment: Any value taken from the parsed tool-call arguments.

    Returns:
        The very same object that was passed in.
    """
    return rec_treatment

def _extract_age_and_treatment(response):
    """Pull ``(age, recommended_treatment)`` out of one chat completion.

    Args:
        response: The object returned by ``client.chat.completions.create``.

    Returns:
        A ``(age, treatment)`` tuple. Either element is ``None`` when the
        model left that field out. ``(None, None)`` is returned when the
        message carries no tool call, when no tool call targets
        ``get_age_and_recommended_treatment``, or when the arguments are not
        a valid JSON object.
    """
    # Guard against a missing/empty tool_calls list so indexing cannot fail.
    tool_calls = response.choices[0].message.tool_calls or []
    if not tool_calls:
        print('Not allowed')
        return None, None

    # Look at every tool call rather than only the first one.
    for tool_call in tool_calls:
        function_call = tool_call.function
        if function_call.name != "get_age_and_recommended_treatment":
            continue

        try:
            args = json.loads(function_call.arguments)
        except json.JSONDecodeError:
            # The arguments string is model-generated and may be malformed.
            break
        if not isinstance(args, dict):
            break

        # .get() instead of [] so one omitted field yields None for that
        # field rather than a KeyError that aborts the whole run.
        age = get_age_and_recommended_treatment(args.get('age'))
        treatment = get_age_and_recommended_treatment(args.get('recommended_treatment'))
        return age, treatment

    print('Cannot find')
    return None, None


age_list = []
recommended_list = []

# One API call per row so both result lists end up exactly as long as df.
for specialty, transcription_text in zip(df['medical_specialty'], df['transcription']):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Extract age and recommend treatment per transcription If requested data is not found, reply with 'Apologies, I could not found the requested data'. Do not assume the values if it is not available."},
            {"role": "user", "content": f"Medical specialty: {specialty}\n"
                                        f"Transcription: {transcription_text}"},
        ],
        tools=function_definition,
    )

    age, treatment = _extract_age_and_treatment(response)
    age_list.append(age)
    recommended_list.append(treatment)

# Insert the extracted age and recommended treatment as new columns
df_structured['age'] = age_list
df_structured['recommended_treatment'] = recommended_list

df_structured
# ICD-10 Code Matching
# Ask the model for one ICD-10 code per recommended treatment and store the
# answers in a new 'ICD_code' column, row-aligned with 'recommended_treatment'.
ICD_codes = []

for recommended_treatment in df_structured['recommended_treatment']:
    # Rows where extraction failed hold None (or NaN). Sending the literal
    # text "None" to the model would only produce a made-up code, so skip the
    # API call and keep the row empty.
    if recommended_treatment is None or (
        isinstance(recommended_treatment, float) and pd.isna(recommended_treatment)
    ):
        ICD_codes.append(None)
        continue

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            # The prompts name ICD-10 explicitly: the project brief asks for
            # ICD-10 codes, and a bare "ICD" leaves the revision open.
            {"role": "system", "content": "Provide one answer only. Keep it short and only reply the code in International Classification of Diseases, 10th revision (ICD-10)"},
            {"role": "user", "content": f"Match this recommended treatment to the corresponding ICD-10 code: {recommended_treatment}"},
        ],
    )

    code = response.choices[0].message.content
    # Store None for an empty reply and trim surrounding whitespace so the
    # column holds just the code.
    ICD_codes.append(code.strip() if code else None)


df_structured['ICD_code'] = ICD_codes
df_structured
# Display final dataframe
# Attach the specialty from the source data (aligned on the row index), then
# put the columns in the order used for reporting.
df_structured = df_structured.assign(
    medical_specialty=df['medical_specialty'],
)[['age', 'medical_specialty', 'recommended_treatment', 'ICD_code']]
df_structured