Medical professionals often summarize patient encounters in transcripts written in natural language, which include details about symptoms, diagnosis, and treatments. These transcripts can be used for other medical documentation, such as for insurance purposes, but as they are densely packed with medical information, extracting the key data accurately can be challenging.
You and your team at Lakeside Healthcare Network have decided to leverage the OpenAI API to automatically extract medical information from these transcripts and automate the matching with the appropriate ICD-10 codes. ICD-10 codes are a standardized system used worldwide for diagnosing and billing purposes, such as insurance claims processing.
The Data
The dataset contains anonymized medical transcriptions categorized by specialty.
transcriptions.csv
| Column | Description |
|---|---|
| "medical_specialty" | The medical specialty associated with each transcription. |
| "transcription" | Detailed medical transcription texts, with insights into the medical case. |
# Import the necessary libraries
import pandas as pd
from openai import OpenAI
import json# Load the data
# Read the transcription dataset and preview its first rows.
df = pd.read_csv("data/transcriptions.csv")
df.head()  # bare expression: only displayed when run as a notebook cell
import openai
import pandas as pd
import json
import os
df = pd.read_csv("data/transcriptions.csv")

# Authentication: take the key from the environment and fail fast when it is
# absent, blank, or still the placeholder value.
openai.api_key = os.getenv("OPENAI_API_KEY")
if openai.api_key in (None, "", "your_openai_api_key_here"):
    raise ValueError("OpenAI API key not set. Please set the OPENAI_API_KEY environment variable.")
def extract_info_from_transcription(transcription):
    """Extract age, specialty and recommended treatment from a transcription.

    The transcription is sent to the chat-completions endpoint with a prompt
    asking for a JSON object, and the reply is parsed into a dict.

    Args:
        transcription: Free-text medical transcription.

    Returns:
        dict: Always contains the keys "age", "medical_specialty" and
        "recommended_treatment". A value is None when the input is blank or
        not a string, when the reply is not a JSON object, or when the model
        left that key out. Extra keys returned by the model are kept.
    """
    empty_result = {"age": None, "medical_specialty": None, "recommended_treatment": None}

    # Blank text or a non-string cell (e.g. NaN for a missing CSV value)
    # carries nothing to extract; skip the API call rather than let the
    # model invent values.
    if not isinstance(transcription, str) or not transcription.strip():
        return empty_result

    messages = [
        {"role": "system", "content": "You are a medical data extraction assistant."},
        {"role": "user", "content": (
            f"Extract the following information from the medical transcription below:\n"
            f"- Age of the patient\n"
            f"- Medical specialty of the transcription\n"
            f"- Recommended treatment\n"
            f"\n"
            f"Provide the output in JSON format as:\n"
            f"{{\n"
            f'  "age": "...",\n'
            f'  "medical_specialty": "...",\n'
            f'  "recommended_treatment": "..."\n'
            f"}}\n"
            f"\n"
            f"Medical transcription:\n"
            f"'''{transcription}'''"
        )}
    ]
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=200,
        temperature=0
    )

    # The message content may be None, so guard before calling .strip().
    content = (response.choices[0].message.content or "").strip()

    # Replies are often wrapped in a Markdown code fence or preceded by a
    # sentence of lead-in text; keep only the outermost {...} span.
    start, end = content.find("{"), content.rfind("}")
    if start != -1 and end > start:
        content = content[start:end + 1]

    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        return empty_result
    if not isinstance(data, dict):
        return empty_result

    # Fill in any key the model omitted so callers can index safely.
    return {**empty_result, **data}
# Example usage: run the extractor over every transcription in the dataset.
extraction = [
    extract_info_from_transcription(transcription)
    for transcription in df['transcription']
]
print(extraction)
import openai
import json
def map_treatment_to_icd(recommended_treatment):
    """Use the OpenAI API to find the ICD code(s) for a treatment description.

    Args:
        recommended_treatment: Description of the recommended treatment.

    Returns:
        list: The ICD codes parsed from the model's JSON reply. An empty list
        is returned when no treatment is given or when the reply does not
        contain a JSON list.
    """
    # No treatment means nothing to look up: skip the API call instead of
    # asking the model to code the literal text "None" or an empty string.
    if not recommended_treatment or not str(recommended_treatment).strip():
        return []

    messages = [
        {"role": "system", "content": "You are a medical data extraction assistant specialized in matching treatments with ICD codes"},
        {"role": "user", "content": f'Given the following recommended treatment, provide the corresponding ICD code(s) in the format JSON list - Recommended treatment: {recommended_treatment}'}
    ]
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=200,
        temperature=0
    )

    # The message content may be None, so guard before calling .strip().
    text_response = (response.choices[0].message.content or "").strip()

    # Keep only the outermost [...] span so a Markdown code fence or lead-in
    # sentence around the list does not defeat the JSON parser.
    start, end = text_response.find("["), text_response.rfind("]")
    if start != -1 and end > start:
        text_response = text_response[start:end + 1]

    try:
        icd_codes = json.loads(text_response)
    except json.JSONDecodeError:
        return []
    # Valid JSON that is not a list (e.g. a bare string) is not usable here.
    return icd_codes if isinstance(icd_codes, list) else []
# Pull the recommended treatment out of every extraction result. `.get` is
# used so an entry without the key yields None instead of raising KeyError,
# and the loop variable no longer rebinds the builtin `dict`.
recommended_treatments = [item.get('recommended_treatment') for item in extraction]

# Look up the ICD codes for each treatment, keeping the same order.
all_icd_codes = [map_treatment_to_icd(treatment) for treatment in recommended_treatments]
print(all_icd_codes)


def process_transcriptions(df):
    """Process each transcription in the DataFrame and return a structured DataFrame.

    Args:
        df: DataFrame with a 'transcription' column.

    Returns:
        pandas.DataFrame: One row per transcription with the columns 'age',
        'medical_specialty', 'recommended_treatment' and 'icd_codes'. The
        columns are present even when `df` has no rows.
    """
    columns = ['age', 'medical_specialty', 'recommended_treatment', 'icd_codes']
    records = []
    for transcription in df['transcription']:
        extracted = extract_info_from_transcription(transcription)
        icd_codes = map_treatment_to_icd(extracted.get('recommended_treatment', ''))
        records.append({
            'age': extracted.get('age'),
            'medical_specialty': extracted.get('medical_specialty'),
            'recommended_treatment': extracted.get('recommended_treatment'),
            'icd_codes': icd_codes,
        })
    # Passing `columns` keeps the schema stable for an empty input.
    return pd.DataFrame(records, columns=columns)
# Test example: run the whole pipeline and preview the first rows.
df_structured = process_transcriptions(df)
print(df_structured.head())
import json
import pandas as pd
from openai import OpenAI
# authentication
# The client is built with no arguments.
# NOTE(review): presumably it reads the API key from the environment — confirm.
client = OpenAI()
# Load the data
df = pd.read_csv("data/transcriptions.csv")
# Bare expression: the preview is not assigned or printed, so it is only
# visible when this runs as a notebook cell.
df.head()
# function to extract age and recommended treatment/procedure
def extract_info_with_openai(transcription):
    """Extracts age and recommended treatment from a transcription using OpenAI.

    The model is made to answer through the `extract_medical_data` tool, and
    that call's JSON arguments are returned as a dict.

    Args:
        transcription: Free-text medical transcription.

    Returns:
        dict: Parsed tool-call arguments. The tool schema declares the keys
        'Age' and 'Recommended Treatment/Procedure'; callers should still
        check for them, since the schema does not mark them as required.
    """
    # Two separate messages. Putting both "role"/"content" pairs in a single
    # dict literal keeps only the last pair, which drops the system prompt.
    messages = [
        {
            "role": "system",
            "content": "You are a healthcare professional extracting patient data. Always return both the age and recommended treatment. If the information is missing, still create the field and specify 'Unknown'.",
        },
        {
            "role": "user",
            "content": f"Please extract and return both the patient's age and recommended treatment from the following transcription. Transcription: {transcription}.",
        },
    ]
    # NOTE(review): 'Age' is typed as integer while the system prompt asks for
    # 'Unknown' when the age is missing — confirm which one is intended.
    function_definition = [
        {
            'type': 'function',
            'function': {
                'name': 'extract_medical_data',
                'description': 'Get the age and recommended treatment from the input text. Always return both age and recommended treatment.',
                'parameters': {
                    'type': 'object',
                    'properties': {
                        'Age': {
                            'type': 'integer',
                            'description': 'Age of the patient'
                        },
                        'Recommended Treatment/Procedure': {
                            'type': 'string',
                            'description': 'Recommended treatment or procedure for the patient'
                        }
                    }
                }
            }
        }
    ]
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        tools=function_definition,
        # Force the tool call. Otherwise the model may reply in plain text,
        # leaving `tool_calls` as None and making the subscript below fail.
        tool_choice={'type': 'function', 'function': {'name': 'extract_medical_data'}},
    )
    return json.loads(response.choices[0].message.tool_calls[0].function.arguments)
# Smoke-test the extractor on the first row only. Nothing from this loop is
# stored, and the processing loop further down runs the extraction again for
# every row, so iterating the whole DataFrame here only repeats the API calls.
for index, row in df.head(1).iterrows():
    medical_specialty = row['medical_specialty']
    extracted_data = extract_info_with_openai(row['transcription'])
# function to look up ICD codes for a recommended treatment/procedure
def get_icd_codes(treatment):
    """Retrieves ICD codes for a given treatment using OpenAI.

    Args:
        treatment: Treatment or procedure description, or 'Unknown'.

    Returns:
        The model's reply text listing the codes, or the string 'Unknown'
        when no usable treatment was supplied (None, blank, or any casing
        of 'unknown').
    """
    # Nothing to look up: return the sentinel without spending an API call.
    if treatment is None:
        return 'Unknown'
    normalized = str(treatment).strip().lower()
    if not normalized or normalized == 'unknown':
        return 'Unknown'

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{
            "role": "user",
            "content": f"Provide the ICD codes for the following treatment or procedure: {treatment}. Return the answer as a list of codes. Please only include the codes and no other information."
        }],
        temperature=0.3
    )
    return response.choices[0].message.content
# Accumulates one dict of extracted fields per transcription.
processed_data = []

# Walk the DataFrame row by row: extract the fields, look up the ICD codes,
# then attach the specialty and the codes to the extracted dict.
for index, row in df.iterrows():
    medical_specialty = row['medical_specialty']
    extracted_data = extract_info_with_openai(row['transcription'])

    # Only look up codes when the extractor actually returned a treatment.
    if 'Recommended Treatment/Procedure' in extracted_data:
        icd_code = get_icd_codes(extracted_data["Recommended Treatment/Procedure"])
    else:
        icd_code = 'Unknown'

    extracted_data.update({"Medical Specialty": medical_specialty, "ICD Code": icd_code})
    processed_data.append(extracted_data)

# Build the final table from the collected rows and show it.
df_structured = pd.DataFrame(processed_data)
print(df_structured)