Skip to content
New Workbook
Sign up
Using machine learning techniques to identify factors influencing eustachian tube score

1. Cleaning and preparing the data

# German-to-English dictionary consumed by replace_german_terms().
# Names are the German source terms, values are the English replacements.
# Entries are applied in the order listed, so combined terms
# (e.g. "Nasenseptum + AE") must stay ahead of their parts ("Nasenseptum").
# NOTE(review): in German ENT usage "TE"/"AE" commonly abbreviate
# Tonsillektomie/Adenotomie - confirm that "tympanic membrane"/"middle ear"
# are the intended translations.
german_terms <- c(
  # Surgical history
  "keine Operation" = "no surgery",
  "Nasenseptum + AE" = "nasal septum + middle ear",
  "Zahnimplantat OK" = "upper jaw dental implant",
  "Tonsillektomie" = "tonsillectomy",
  "Resektion Speicheldrüse UK li" = "resection of left submandibular gland",
  "Adenoidektomie" = "adenoidectomy",
  "TE+AE" = "tympanic membrane + middle ear",
  "Beck´sche Bohrung li" = "left Beck's drilling",
  "Sinus max Fensterung li" = "left maxillary sinus fenestration",
  "Ohrmuschelrekonstruktion bei Knorpelhypertrophie" =
    "auricular reconstruction due to cartilage hypertrophy",
  "Verstärkung rundes Fenster re" = "reinforcement of right round window",
  "Nasenseptum + TE" = "nasal septum + tympanic membrane",
  "Tympanoplastik li" = "left tympanoplasty",
  "Nasenseptum" = "nasal septum",
  # Frequency terms
  "selten (1-2/y)" = "rarely",
  "häufig" = "frequently (>2/y)",
  "nie" = "never"
)

#' Translate German terms in a text vector into English
#'
#' Each name of `terms` is searched for as a literal substring of `text` and
#' replaced by the corresponding value. Terms are applied in the order they
#' appear in `terms`, so longer combined terms should be listed before the
#' shorter terms they contain.
#'
#' @param text Character vector (or anything `as.character()` can coerce,
#'   such as a factor). Missing values stay missing.
#' @param terms Named character vector: names are the German terms, values
#'   their English replacements. Defaults to the file-level `german_terms`.
#' @return A character vector of the same length as `text`.
replace_german_terms <- function(text, terms = german_terms) {
  for (term in names(terms)) {
    # fixed = TRUE: keys such as "TE+AE" or "selten (1-2/y)" contain regex
    # metacharacters and would not match their own literal text if they were
    # interpreted as regular expressions.
    text <- gsub(term, terms[[term]], text, fixed = TRUE)
  }
  text
}
# Load necessary library
library(dplyr)

# Import the raw export; both the field separator and the decimal mark are
# declared as commas.
data <- read.csv("druckausgleich.csv", sep = ",", dec = ",")

# Build the working table:
#   * keep only rows that have a value in column X,
#   * normalise the column names with janitor,
#   * drop the helper columns that are not analysed,
#   * trim surrounding whitespace from text columns and store them as factors,
#   * translate the surgery descriptions (this turns `operationen` back into
#     a character column).
cleaned_data <- data |>
  filter(!is.na(X)) |>
  janitor::clean_names() |>
  dplyr::select(-x, -geschlecht_r, -laufend, -var00001) |>
  dplyr::mutate(
    across(where(is.character), \(column) factor(stringr::str_trim(column)))
  ) |>
  dplyr::mutate(operationen = replace_german_terms(operationen))

# Translate the remaining German values into English.
#
# Names are German source terms, values are the English replacements. The
# dictionary holds complete cell values (e.g. "Daten technisch unvollständig")
# as well as short fragments ("und", "aber", "Nicht") that act as a fallback
# for values without an entry of their own.
# Each key is listed once. Where the earlier version listed a key twice with
# different translations, the variant that was effectively applied is kept
# ("present, but not impaired", "weak").
term_translations <- c(
  # Frequencies, sex, generic negation
  "selten" = "rarely",
  "häufig" = "frequently",
  "nie" = "never",
  "männlich" = "m",
  "weiblich" = "f",
  "kein" = "no",
  # Subjective impairment
  "nicht beeinträchtigt" = "not impaired",
  "bisher nicht aufgefallen" = "not noticed so far",
  "vorhanden, aber nicht beeinträchtigt" = "present, but not impaired",
  "beeinträchtigt" = "impaired",
  "stark beeinträchtigt" = "severely impaired",
  # Moment of pressure change
  "Abflug" = "takeoff",
  "Abtauchen" = "diving",
  "Landeanflug" = "approach",
  "Auftauchen" = "surfacing",
  # Handedness
  "Rechtshänder" = "right-handed",
  "Linkshänder" = "left-handed",
  # Allergies
  "keine Allergien" = "no allergies",
  "allergische Rhinitis" = "allergic rhinitis",
  "Medikamente" = "medications",
  "sonstige" = "other",
  "Lebensmittel" = "food",
  # Tympanometry shape
  "regelrecht" = "normal",
  "flach" = "flat",
  # Eardrum findings
  "vernarbt" = "scarred",
  "myringosklerotische Plaques vorne unten" =
    "myringosclerotic plaques front bottom",
  "etwas atroph" = "slightly atrophic",
  "etwas schuppig belegt" = "slightly scaly",
  "leicht vernarbt, Rötung, Zn Spülung bei Cerumen" =
    "slightly scarred, redness, Zn irrigation with cerumen",
  "regelrecht, etwas Cerumenauflagerung" = "normal, slight cerumen overlay",
  "minimal vernarbt" = "minimally scarred",
  "etwas gefäßinjiziert" = "slightly vascularized",
  "gestielte Exostose" = "pedunculated exostosis",
  "myringosklerotische Plaques sichelförmig hinten und vorne unten" =
    "myringosclerotic plaques crescent-shaped back and front bottom",
  "Rötung, Zn Spülung bei Cerumen" = "redness, Zn irrigation with cerumen",
  "intaktes, weiß mattes TF, Va Zn TyPla" =
    "intact, white matte TM, Va Zn TyPla",
  "unverändert" = "unchanged",
  "minimal gefäßinjiziert" = "minimally vascularized",
  "deutlich gefäßinjiziert" = "clearly vascularized",
  # Valsalva / swallowing response
  "prompt mit schneller Rückstellbewegung" =
    "prompt with quick return movement",
  "leicht verzögert" = "slightly delayed",
  "schnelle, kleine Bewegung" = "quick, small movement",
  "schwach" = "weak",
  "schwach und verzögert" = "weak and delayed",
  "nicht erkennbar" = "not recognizable",
  "atemsynchron" = "breath-synchronous",
  # Data completeness
  "vollständig" = "complete",
  "Druckkammer fehlt" = "pressure chamber missing",
  "Tymp+TMM fehlt" = "Tymp+TMM missing",
  "TMM fehlt" = "TMM missing",
  # Pressure chamber outcome and cohort
  "bestanden" = "passed",
  "Abbruch 1. Tauchgang" = "aborted 1st dive",
  "Abbruch 2. Tauchgang" = "aborted 2nd dive",
  "Daten technisch unvollständig" = "data technically incomplete",
  "Taucher" = "diver",
  "Nicht-Taucher" = "non-diver",
  # Fallback fragments
  "Nicht" = "non",
  "immer" = "always",
  "eingeschränkt" = "restricted",
  "aber" = "but",
  "vorhanden" = "present",
  "stark" = "strongly",
  "und" = "and",
  "verzögert" = "delayed",
  "Daten" = "data",
  "technisch" = "technically"
)

# Apply the longest keys first. Otherwise a short key rewrites part of a
# longer phrase before that phrase gets its turn: "vollständig" inside
# "Daten technisch unvollständig" produced "data technically uncomplete", and
# "beeinträchtigt" inside "stark beeinträchtigt" produced "strongly impaired".
term_translations <- term_translations[
  order(-nchar(names(term_translations)))
]

# Replace every dictionary key found in `column`, returning character values.
translate_terms <- function(column) {
  column <- as.character(column)
  for (term in names(term_translations)) {
    # fixed = TRUE: keys such as "Tymp+TMM fehlt" or "Abbruch 1. Tauchgang"
    # contain regex metacharacters and are meant literally.
    column <- gsub(term, term_translations[[term]], column, fixed = TRUE)
  }
  column
}

# Every column is turned into text here; the numeric ones are detected and
# parsed again in the next step. `operationen` is skipped by the translation
# because replace_german_terms() has already translated it, and a second pass
# would rewrite English text (the fragment "und" turns "round" into "roand").
cleaned_data <- cleaned_data |>
  mutate(
    across(everything(), as.character),
    across(-any_of("operationen"), translate_terms)
  )

# Display the cleaned data
cleaned_data |>
  glimpse()
# Load necessary libraries
library(dplyr)
library(stringr)

# Parse a column of numbers stored as text.
# Single quotes are stripped first; as.numeric() then turns every value that
# still is not a valid number into NA.
clean_numeric_column <- function(column) {
  column |>
    str_remove_all("'") |>
    as.numeric()
}

# TRUE when every non-missing value of `column` consists only of digits,
# dots and single quotes, optionally preceded by a minus sign.
# na.rm = TRUE matters: str_detect() returns NA for missing values, and
# without it all() would return NA instead of TRUE/FALSE for any column that
# contains a missing value.
is_numeric_text <- function(column) {
  all(str_detect(column, "^-?[0-9.']*$"), na.rm = TRUE)
}

# Names of the columns that hold numbers stored as text
numeric_columns <- cleaned_data |>
  select(where(is_numeric_text)) |>
  names()

# Parse those columns into real numbers
cleaned_data <- cleaned_data |>
  mutate(across(all_of(numeric_columns), clean_numeric_column))

# Names of the columns that are still text after the conversion
character_columns <- cleaned_data |>
  select(where(is.character)) |>
  names()

# Reassemble the table: text columns first, then the parsed numeric columns
final_data <- cleaned_data |>
  select(all_of(character_columns), all_of(numeric_columns))
glimpse(final_data)
# List the column names that contain "re" or "li" anywhere in the name
selected_columns <- grep("re|li", colnames(final_data), value = TRUE)
selected_columns
# Show the distinct values of the allergy column
unique(cleaned_data[["allergien"]])
final_data <- final_data |>
  # Move the side marker ("re"/"li") to the end of each numeric column name
  dplyr::rename_with(
    .fn = \(column_name) gsub("_(re|li)(.*)", "\\2_\\1", column_name),
    .cols = where(is.numeric)
  ) |>
  # The ETS7 columns have no underscore before the side marker
  rename(ets7_re = ets7re, ets7_li = ets7li) |>
  # One row per participant and side: "<name>_re"/"<name>_li" become a single
  # "<name>" column plus a `side` column
  tidyr::pivot_longer(
    cols = tidyselect::matches(".*_(re|li)$"),
    names_to = c(".value", "side"),
    names_pattern = "(.*)_(re|li)"
  ) |>
  dplyr::mutate(side = dplyr::recode(side, re = "right", li = "left")) |>
  # Keep rows that have at least one non-missing value
  filter(if_any(everything(), \(column) !is.na(column))) |>
  mutate(across(where(is.character), factor))

# Overview of the numeric columns
final_data |>
  select(where(is.numeric)) |>
  glimpse()
# Regular expressions selecting the repeated measurements that feed each sum
# column (name = new sum column, value = pattern of its source columns).
measurement_patterns <- c(
  ohr_ab_sl_summe = "^ohr_ab_sl_\\d+$",
  ohr_auf_sl_summe = "^ohr_sl_auf_\\d+$",
  ohr_ab_fa_summe = "^ohr_ab_fa_\\d+$",
  ohr_auf_fa_summe = "^ohr_fa_auf_\\d+$",
  taster_ab_sl_summe = "^taster_ab_sl_\\d+$",
  taster_auf_sl_summe = "^taster_auf_sl_\\d+$",
  taster_ab_fa_summe = "^taster_ab_fa_\\d+$",
  taster_auf_fa_summe = "^taster_auf_fa_\\d+$"
)

# Row-wise sums computed with the vectorised rowSums() instead of a
# per-row rowwise() + c_across() loop; missing values are ignored.
# NOTE(review): a row whose measurements are all missing gets a sum of 0
# (as before), which cannot be told apart from a measured 0 - confirm that
# this is intended before modelling.
measurement_sums <- lapply(measurement_patterns, function(pattern) {
  source_columns <- dplyr::select(final_data, matches(pattern))
  unname(rowSums(source_columns, na.rm = TRUE))
})

# Summarize the measurements and create new columns
final_data <- final_data %>%
  mutate(!!!measurement_sums)

# Remove the original measurement columns
druckversuch <- final_data %>%
  dplyr::select(-matches(paste(measurement_patterns, collapse = "|")))

The dataset has been cleaned and transformed. In particular, the left- and right-ear measurements were combined into a long format with `side` as a factor, and the German words and terms were translated into English. Next, I will define the meaning of each column, check the data for normality, and compute descriptive statistics.

2. Defining the column meanings

  1. proband: Identifier for the participant.

  2. kohorte: Cohort group (e.g., diver, non-diver).

  3. geschlecht: Gender (m = male, f = female).

  4. operationen: History of surgeries.

  5. rhinosinusitis: Frequency of rhinosinusitis (sinus infection).

  6. otitiden: Frequency of otitis (ear infections).

  7. rauchen: Smoking status.

  8. druckexposition: Frequency of pressure exposure.

  9. tbs_subj: Subjective assessment of ear function.

  10. tbs_wann: Time of subjective assessment.

  11. handigkeit: Handedness (right-handed, left-handed).

  12. allergien: Allergies.

  13. indikator: Indicator of completeness.

  14. druckkammer: Result of pressure chamber test.

  15. alter: Age of the participant.

  16. tauchgange: Number of dives.

  17. ets7re: ETS7 measurement for the right ear.

  18. ets7li: ETS7 measurement for the left ear.

  19. etdq7: ETDQ7 score (Eustachian Tube Dysfunction Questionnaire).

  20. taster_ab_sl_1: Measurement 1 for "ab" (down) and "sl" (sound conduction).

  21. taster_ab_sl_2: Measurement 2 for "ab" (down) and "sl" (sound conduction).

  22. taster_ab_sl_3: Measurement 3 for "ab" (down) and "sl" (sound conduction).

  23. taster_auf_sl_1: Measurement 1 for "auf" (up) and "sl" (sound conduction).

  24. taster_auf_sl_2: Measurement 2 for "auf" (up) and "sl" (sound conduction).

  25. taster_auf_sl_3: Measurement 3 for "auf" (up) and "sl" (sound conduction).

  26. taster_ab_fa_1: Measurement 1 for "ab" (down) and "fa" (frequency analysis).

  27. taster_ab_fa_2: Measurement 2 for "ab" (down) and "fa" (frequency analysis).

  28. taster_ab_fa_3: Measurement 3 for "ab" (down) and "fa" (frequency analysis).

  29. taster_auf_fa_1: Measurement 1 for "auf" (up) and "fa" (frequency analysis).

  30. taster_auf_fa_2: Measurement 2 for "auf" (up) and "fa" (frequency analysis).

  31. taster_auf_fa_3: Measurement 3 for "auf" (up) and "fa" (frequency analysis).

  32. taster_ab_sl_summe: Sum of "ab" (down) and "sl" (sound conduction) measurements.

  33. taster_auf_sl_summe: Sum of "auf" (up) and "sl" (sound conduction) measurements.

  34. taster_ab_fa_summe: Sum of "ab" (down) and "fa" (frequency analysis) measurements.

  35. taster_auf_fa_summe: Sum of "auf" (up) and "fa" (frequency analysis) measurements.

  36. side: Side (right or left).

  37. tymp: Tympanometry result.

  38. subj_valsalva: Subjective Valsalva maneuver result.

  39. subj_schlucken: Subjective swallowing result.

  40. t_fpra: Tympanic membrane pre-test result.

  41. t_fpost: Tympanic membrane post-test result.

  42. obj_valsalva: Objective Valsalva maneuver result.

  43. r_tymp: Resistance or reflectance in tympanometry.

  44. v_tymp: Volume of the ear canal in tympanometry.

  45. s_tymp: Stiffness or compliance in tympanometry.

  46. gehorgangvol: Ear canal volume.

  47. resonanzfreq: Resonance frequency.

  48. spitzendruck: Peak pressure.

  49. c226hz: Compliance at 226 Hz.

  50. x_1000hz: Measurement at 1000 Hz.

  51. tmm30: Tympanometric measurement at 30 Hz.

  52. tmm30_t: Tympanometric measurement at 30 Hz, specific parameter (e.g., time).

  53. tmm30_p: Tympanometric measurement at 30 Hz, peak pressure.

  54. tmm40: Tympanometric measurement at 40 Hz.

  55. tm_mr40_t: Tympanometric measurement at 40 Hz, specific method or response, parameter (e.g., time).

  56. tmm40_p: Tympanometric measurement at 40 Hz, peak pressure.

  57. tmm50: Tympanometric measurement at 50 Hz.

  58. tm_mr50_t: Tympanometric measurement at 50 Hz, specific method or response, parameter (e.g., time).

  59. tmm50_p: Tympanometric measurement at 50 Hz, peak pressure.

  60. tmm40_t: Tympanometric measurement at 40 Hz, specific parameter (e.g., time).

  61. tmm50_t: Tympanometric measurement at 50 Hz, specific parameter (e.g., time).

  62. ohr_ab_sl_1: Ear measurement 1 for "ab" (down) and "sl" (sound conduction).

  63. ohr_ab_sl_2: Ear measurement 2 for "ab" (down) and "sl" (sound conduction).

  64. ohr_ab_sl_3: Ear measurement 3 for "ab" (down) and "sl" (sound conduction).

  65. ohr_sl_auf_1: Ear measurement 1 for "auf" (up) and "sl" (sound conduction).

  66. ohr_sl_auf_2: Ear measurement 2 for "auf" (up) and "sl" (sound conduction).

  67. ohr_sl_auf_3: Ear measurement 3 for "auf" (up) and "sl" (sound conduction).

  68. ohr_ab_fa_1: Ear measurement 1 for "ab" (down) and "fa" (frequency analysis).

  69. ohr_ab_fa_2: Ear measurement 2 for "ab" (down) and "fa" (frequency analysis).

  70. ohr_ab_fa_3: Ear measurement 3 for "ab" (down) and "fa" (frequency analysis).

  71. ohr_fa_auf_1: Ear measurement 1 for "auf" (up) and "fa" (frequency analysis).

  72. ohr_fa_auf_2: Ear measurement 2 for "auf" (up) and "fa" (frequency analysis).

  73. ohr_fa_auf_3: Ear measurement 3 for "auf" (up) and "fa" (frequency analysis).

  74. ohr_ab_sl_summe: Sum of ear measurements for "ab" (down) and "sl" (sound conduction).

  75. ohr_auf_sl_summe: Sum of ear measurements for "auf" (up) and "sl" (sound conduction).

  76. ohr_ab_fa_summe: Sum of ear measurements for "ab" (down) and "fa" (frequency analysis).

  77. ohr_auf_fa_summe: Sum of ear measurements for "auf" (up) and "fa" (frequency analysis).

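These interpretations are inferred from common medical abbreviations and have not been checked against the study's codebook, so the actual meanings may differ. In particular, in eustachian tube diagnostics "TMM" usually stands for tubomanometry, which is performed at 30, 40 and 50 mbar, so the `tmm30`/`tmm40`/`tmm50` entries may describe pressures rather than frequencies; the expansions given for "sl" and "fa" are likewise unverified.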

3. Descriptive statistics