Skip to content

Medical professionals often summarize patient encounters in transcripts written in natural language, which include details about symptoms, diagnosis, and treatments. These transcripts can be used for other medical documentation, such as for insurance purposes, but as they are densely packed with medical information, extracting the key data accurately can be challenging.

You and your team at Lakeside Healthcare Network have decided to leverage the OpenAI API to automatically extract medical information from these transcripts and automate the matching with the appropriate ICD-10 codes. ICD-10 is a standardized coding system used worldwide for diagnosis and billing purposes, such as insurance claims processing.

The Data

The dataset contains anonymized medical transcriptions categorized by specialty.

transcriptions.csv

| Column | Description |
| --- | --- |
| "medical_specialty" | The medical specialty associated with each transcription. |
| "transcription" | Detailed medical transcription texts, with insights into the medical case. |
# Import the necessary libraries
import json
import re

import pandas as pd
from openai import OpenAI
# Read the transcription dataset and preview its first rows
transcriptions_path = "data/transcriptions.csv"
df = pd.read_csv(transcriptions_path)
df.head()
# Create the OpenAI client used for every API request in this file
client = OpenAI()

## Start coding here, use as many cells as you need
# Step 1: Define a function to extract patient data using OpenAI API
def extract_medical_data(transcription, specialty):
    """
    Extract patient age, recommended treatment/procedure, and ICD-10 code
    from a medical transcription using OpenAI tool (function) calling.

    Args:
        transcription: Free-text medical transcription. Missing values
            (None, NaN, or blank strings) are not sent to the API.
        specialty: Medical specialty associated with the transcription.

    Returns:
        A dict with at least the keys "age", "medical_specialty",
        "recommended_treatment" and "icd10_code". Fields that could not be
        extracted are None; "medical_specialty" falls back to `specialty`.

    Raises:
        Errors raised by the OpenAI client while sending the request are
        not caught here and propagate to the caller.
    """

    # Record returned when nothing can be extracted; also supplies defaults
    # for any key the model leaves out of its reply.
    fallback = {
        "age": None,
        "medical_specialty": specialty,
        "recommended_treatment": None,
        "icd10_code": None
    }

    # pandas yields NaN (a float) for empty cells; sending "nan" or a blank
    # prompt to the model would only produce made-up values.
    if not isinstance(transcription, str) or not transcription.strip():
        return fallback

    # Define a schema for structured extraction
    tool_name = "extract_patient_data"
    tools = [
        {
            "type": "function",
            "function": {
                "name": tool_name,
                "description": "Extract patient details and ICD-10 code from transcription",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "age": {"type": "string", "description": "Age of the patient"},
                        "medical_specialty": {"type": "string", "description": "Medical specialty"},
                        "recommended_treatment": {"type": "string", "description": "Recommended treatment or procedure"},
                        "icd10_code": {"type": "string", "description": "ICD-10 code corresponding to diagnosis/treatment"}
                    },
                    "required": ["age", "medical_specialty", "recommended_treatment", "icd10_code"]
                }
            }
        }
    ]

    # Create the message for the model
    messages = [
        {"role": "system", "content": "You are a medical coding assistant that extracts structured information from medical transcripts."},
        {"role": "user", "content": f"Medical specialty: {specialty}\nTranscription: {transcription}"}
    ]

    # Call the OpenAI Chat Completions API. `tools` / `tool_choice` replace
    # the deprecated `functions` / `function_call` parameters.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        tools=tools,
        tool_choice={"type": "function", "function": {"name": tool_name}}  # Force the model to call the tool
    )

    # Parse the tool-call arguments into a dict
    try:
        arguments = response.choices[0].message.tool_calls[0].function.arguments
        parsed = json.loads(arguments)
    except (AttributeError, IndexError, TypeError, ValueError) as e:
        # ValueError covers json.JSONDecodeError; the others cover a reply
        # that carries no tool call at all.
        print("Error parsing response:", e)
        return fallback

    if not isinstance(parsed, dict):
        print("Error parsing response:", "expected a JSON object")
        return fallback

    # Fill in any key the model omitted so callers can index safely.
    return {**fallback, **parsed}
# Step 2: Define a function to match treatment to ICD-10 code
def match_icd10_code(treatment_description):
    """
    Match a treatment or procedure with an ICD-10 code.

    This example asks the model for the code and then pulls the code out of
    the reply. In production, you might load an ICD-10 CSV and do fuzzy
    matching instead.

    Args:
        treatment_description: Text describing the treatment or procedure.
            Missing values (None, NaN, or blank strings) are not sent to
            the API.

    Returns:
        The first ICD-10-shaped code found in the model's reply (for
        example "Z51.11"), or None when the input is missing or the reply
        contains no recognizable code.

    Raises:
        Errors raised by the OpenAI client while sending the request are
        not caught here and propagate to the caller.
    """

    # Nothing to look up: avoid an API call that could only return a guess.
    if not isinstance(treatment_description, str) or not treatment_description.strip():
        return None

    messages = [
        {"role": "system", "content": "You are an expert in ICD-10 coding. Respond with only the ICD-10 code, with no explanation."},
        {"role": "user", "content": f"Find the most appropriate ICD-10 code for the treatment: {treatment_description}"}
    ]

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages
    )

    # `content` can be None, so normalize before stripping.
    reply = (response.choices[0].message.content or "").strip()

    # The model may still wrap the code in a sentence or markdown; keep only
    # the code itself: letter, digit, alphanumeric, then an optional
    # dot-separated extension of up to four characters.
    found = re.search(r"\b[A-Z][0-9][0-9A-Z](?:\.[0-9A-Z]{1,4})?\b", reply)
    return found.group(0) if found else None
# Step 3: Process all transcriptions
structured_results = []

# Only two columns are needed per row, so walk them in lockstep instead of
# materializing a Series for every row with iterrows().
for transcription, specialty in zip(df["transcription"], df["medical_specialty"]):
    # Extract age & treatment
    extracted = extract_medical_data(transcription, specialty)

    # Perform ICD-10 code matching explicitly. .get() keeps the run going
    # when the extracted record has no "recommended_treatment" key.
    treatment = extracted.get("recommended_treatment")
    extracted["icd10_code"] = match_icd10_code(treatment) if treatment else None

    structured_results.append(extracted)

# Step 4: Convert results to DataFrame
df_structured = pd.DataFrame(structured_results)
df_structured.head()