Using machine learning techniques to identify factors influencing eustachian tube score

1. Cleaning and preparing the data

# Create a vector of German terms and their English equivalents

german_terms <- c(

  "keine Operation" = "no surgery",

  "Nasenseptum + AE" = "nasal septum + middle ear",

  "Zahnimplantat OK" = "upper jaw dental implant",

  "Tonsillektomie" = "tonsillectomy",

  "Resektion Speicheldrüse UK li" = "resection of left submandibular gland",

  "Adenoidektomie" = "adenoidectomy",

  "TE+AE" = "tympanic membrane + middle ear",

  "Beck´sche Bohrung li" = "left Beck's drilling",

  "Sinus max Fensterung li" = "left maxillary sinus fenestration",

  "Ohrmuschelrekonstruktion bei Knorpelhypertrophie" = "auricular reconstruction due to cartilage hypertrophy",

  "Verstärkung rundes Fenster re" = "reinforcement of right round window",

  "Nasenseptum + TE" = "nasal septum + tympanic membrane",

  "Tympanoplastik li" = "left tympanoplasty",

  "Nasenseptum" = "nasal septum", "selten (1-2/y)" = "rarely", "häufig" = "frequently (>2/y)", "nie" = "never"

)

replace_german_terms <- function(text) {

  # Replace German terms with English equivalents

  for (term in names(german_terms)) {

    text <- stringr::str_replace_all(text, term, german_terms[term])

  }

  return(text)

}

# Load necessary library
library(dplyr)

# Read the CSV file into a dataframe
data <- read.csv('druckausgleich.csv', sep = ",", dec =",")

# Remove empty rows
cleaned_data <- data %>% 
  filter(!is.na(X)) |>  
  janitor::clean_names() |> 
  dplyr::select(-x,-geschlecht_r, -laufend, -var00001) |>
  dplyr::mutate_if(is.character, stringr::str_trim) |> 
  mutate_if(is.character, factor) |> 
  dplyr::mutate(operationen = replace_german_terms(operationen))

# Replace terms in all columns
cleaned_data <- cleaned_data %>%
  mutate(across(everything(), ~ stringr::str_replace_all(., c(
    "selten" = "rarely", 
    "häufig" = "frequently", 
    "nie" = "never", 
    "männlich" = "m", 
    "weiblich" = "f", 
    "kein" = "no",
    "nicht beeinträchtigt" = "not impaired",
    "bisher nicht aufgefallen" = "not noticed so far",
    "vorhanden, aber nicht beeinträchtigt" = "present but not impaired",
    "beeinträchtigt" = "impaired",
    "stark beeinträchtigt" = "severely impaired", "Abflug" = "takeoff", "Abtauchen" = "diving", "Landeanflug" = "approach", "Auftauchen" = "surfacing",

"Rechtshänder" = "right-handed", "Linkshänder" = "left-handed",

"noe Allergien" = "no allergies", "allergische Rhinitis" = "allergic rhinitis", "Medikamente" = "medications", "sonstige" = "other", "Lebensmittel" = "food",

"regelrecht" = "normal", "flach" = "flat",

"vernarbt" = "scarred", "myringosklerotische Plaques vorne unten" = "myringosclerotic plaques front bottom", "etwas atroph" = "slightly atrophic", "etwas schuppig belegt" = "slightly scaly", "leicht vernarbt, Rötung, Zn Spülung bei Cerumen" = "slightly scarred, redness, Zn irrigation with cerumen", "regelrecht, etwas Cerumenauflagerung" = "normal, slight cerumen overlay", "minimal vernarbt" = "minimally scarred", "etwas gefäßinjiziert" = "slightly vascularized", "gestielte Exostose" = "pedunculated exostosis", "myringosklerotische Plaques sichelförmig hinten und vorne unten" = "myringosclerotic plaques crescent-shaped back and front bottom", "Rötung, Zn Spülung bei Cerumen" = "redness, Zn irrigation with cerumen", "intaktes, weiß mattes TF, Va Zn TyPla" = "intact, white matte TM, Va Zn TyPla",

"etwas gefäßinjiziert" = "slightly vascularized", "unverändert" = "unchanged", "minimal gefäßinjiziert" = "minimally vascularized", "deutlich gefäßinjiziert" = "clearly vascularized",

"prompt" = "prompt", "prompt mit schneller Rückstellbewegung" = "prompt with quick return movement", "leicht verzögert" = "slightly delayed", "schnelle, kleine Bewegung" = "quick, small movement", "nicht beeinträchtigt" = "not impaired", "bisher nicht aufgefallen" = "not noticed so far", "vorhanden, aber nicht beeinträchtigt" = "present, but not impaired", "beeinträchtigt" = "impaired", "stark beeinträchtigt" = "severely impaired", "schwach" = "weak", "schwach und verzögert" = "weak and delayed", "nicht erkennbar" = "not recognizable", "atemsynchron" = "breath-synchronous",

"vollständig" = "complete", "Druckkammer fehlt" = "pressure chamber missing", "Tymp+TMM fehlt" = "Tymp+TMM missing", "TMM fehlt" = "TMM missing",

"bestanden" = "passed", "Abbruch 1. Tauchgang" = "aborted 1st dive", "Abbruch 2. Tauchgang" = "aborted 2nd dive", "Daten technisch unvollständig" = "data technically incomplete", "Taucher" = "diver", "Nicht-Taucher" = "non-diver", "Nicht" = "non", "immer" = "always", "eingeschränkt" = "restricted", "aber" = "but", "vorhanden" = "present", "stark" = "strongly", "schwach" = "slightly", "und" = "and", "verzögert" = "delayed", "Daten" = "data", "technisch" = "technically"
  )))) 

# Display the cleaned data
cleaned_data |> 
  glimpse()

# Load necessary libraries
library(dplyr)
library(stringr)

# Assuming df is the dataframe containing the columns
# Convert columns with numeric values stored as characters to numeric

# Function to clean and convert character columns to numeric
clean_numeric_column <- function(column) {
  column <- str_replace_all(column, "'", "")  # Remove single quotes
  as.numeric(column)
}

# Identify columns that are characters but contain numeric values, including negative values and NAs
numeric_columns <- cleaned_data %>%
  select_if(~ all(str_detect(., "^-?[0-9.']*$"))) %>%
  names()

# Convert these columns to numeric
cleaned_data <- cleaned_data %>%
  mutate(across(all_of(numeric_columns), clean_numeric_column))

# Select only character columns
character_columns <- cleaned_data %>%
  select(where(is.character)) %>%
  names()

# Combine cleaned numeric columns with original character columns
final_data <- cleaned_data %>%
  select(all_of(character_columns), all_of(numeric_columns))

glimpse(final_data)

# Select and print column names containing either 're' or 'li'
selected_columns <- colnames(final_data)[str_detect(colnames(final_data), "re|li")]
selected_columns

unique(cleaned_data$allergien)

final_data <- final_data |>

  dplyr::rename_with(

    .cols = where(is.numeric),

    .fn = ~ gsub("_(re|li)(.*)", "\\2_\\1", .)

  ) |> 
   rename(ets7_re = ets7re, ets7_li = ets7li) |>

  tidyr::pivot_longer(

    cols = tidyselect::matches(".*_(re|li)$"),

    names_to = c(".value", "side"),

    names_pattern = "(.*)_(re|li)"

  ) |>

  dplyr::mutate(side = dplyr::recode(side, re = "right", li = "left")) |> 
  filter_all(any_vars(!is.na(.))) |> 
  mutate_if(is.character, factor)

glimpse(final_data |> 
	    select_if(is.numeric))

# Summarize the measurements and create new columns
final_data <- final_data %>%
  rowwise() %>%
  mutate(
    ohr_ab_sl_summe = sum(c_across(matches("^ohr_ab_sl_\\d+$")), na.rm = TRUE),
    ohr_auf_sl_summe = sum(c_across(matches("^ohr_sl_auf_\\d+$")), na.rm = TRUE),
    ohr_ab_fa_summe = sum(c_across(matches("^ohr_ab_fa_\\d+$")), na.rm = TRUE),
    ohr_auf_fa_summe = sum(c_across(matches("^ohr_fa_auf_\\d+$")), na.rm = TRUE),
    taster_ab_sl_summe = sum(c_across(matches("^taster_ab_sl_\\d+$")), na.rm = TRUE),
    taster_auf_sl_summe = sum(c_across(matches("^taster_auf_sl_\\d+$")), na.rm = TRUE),
    taster_ab_fa_summe = sum(c_across(matches("^taster_ab_fa_\\d+$")), na.rm = TRUE),
    taster_auf_fa_summe = sum(c_across(matches("^taster_auf_fa_\\d+$")), na.rm = TRUE)
  ) %>%
  ungroup()

# Remove the original measurement columns
druckversuch <- final_data %>%
  select(-matches("^ohr_sl_auf_\\d+$|^ohr_ab_sl_\\d+$|^ohr_ab_fa_\\d+$|^ohr_fa_auf_\\d+$|^taster_ab_sl_\\d+$|^taster_auf_sl_\\d+$|^taster_ab_fa_\\d+$|^taster_auf_fa_\\d+$"))

druckversuch |> 
 write.csv("druckversuch.csv")

unique(druckversuch$druckkammer)

str(druckversuch)

The dataset was cleaned and transformed. Especially left and right side were defined as factors and put together in a long format. Further the words and terms were translated into English. I will keep on with defining the data columns, checking the data for normality and performing descriptive statistics on the data.

‌
‌
‌

Using machine learning techniques to identify factors influencing eustachian tube score

.mfe-app-workspace-kj242g{position:absolute;top:-8px;}.mfe-app-workspace-11ezf91{display:inline-block;}.mfe-app-workspace-11ezf91:hover .Anchor__copyLink{visibility:visible;}1. Cleaning and preparing the data

1. Cleaning and preparing the data