Skip to content

My first project: On the Importance of Handwashing - 14/03/2024

Hungarian physician Dr. Ignaz Semmelweis worked at the Vienna General Hospital with childbed fever patients. Childbed fever is a deadly disease affecting women who have just given birth, and in the early 1840s, as many as 10% of the women giving birth at the Vienna General Hospital died from it. Dr. Semmelweis discovered that the cause was the contaminated hands of the doctors delivering the babies, and on June 1st, 1847, he decreed that everyone should wash their hands — an unorthodox and controversial request at a time when nobody in Vienna knew about bacteria.

You will reanalyze the data that made Semmelweis discover the importance of handwashing and its impact on the hospital.

The data is stored as two CSV files within the data folder.

yearly_deaths_by_clinic.csv contains the number of births and deaths at the two clinics of the Vienna General Hospital for each year from 1841 to 1846.

Column | Description
year | Years (1841-1846)
births | Number of births
deaths | Number of deaths
clinic | Clinic 1 or clinic 2

monthly_deaths.csv contains data from 'Clinic 1' of the hospital where most deaths occurred.

Column | Description
date | Date (YYYY-MM-DD)
births | Number of births
deaths | Number of deaths
# Imported libraries
library(tidyverse)
library(dplyr)

# Start coding here..

# 1. Load the CSV files into the yearly and monthly data frames and check them.
# Each file is read once with readr's read_csv() and the result is kept.
# read_csv() parses the YYYY-MM-DD `date` column of the monthly file as a Date,
# whereas base read.csv() would leave it as text, which cannot be drawn as a
# continuous line over time.

yearly <- read_csv("data/yearly_deaths_by_clinic.csv", show_col_types = FALSE)
yearly

monthly <- read_csv("data/monthly_deaths.csv", show_col_types = FALSE)
monthly
# 2. Add a new column with the proportions

# yearly: proportion of deaths per birth, one value per row (year and clinic).
yearly <- yearly %>%
  mutate(proportion_deaths = deaths / births)

# monthly: proportion of deaths per birth, one value per month.
monthly <- monthly %>%
  mutate(proportion_deaths = deaths / births)

# (opt) mutate(monthly, year = as.numeric(substr(date, 1, 4)))
# Line plot of the yearly proportion of deaths, one colored line per clinic.
library(ggplot2)

ggplot(yearly, aes(x = year, y = proportion_deaths, color = clinic)) +
  geom_line()

# Line plot of the monthly proportion of deaths.
# as.Date() leaves a Date column unchanged and converts YYYY-MM-DD text to a
# Date. With a text x value ggplot2 treats every month as its own group, so
# geom_line() has no points to connect and the plot comes out empty.
# labs() keeps the axis title as "date" instead of "as.Date(date)".
ggplot(monthly, aes(x = as.Date(date), y = proportion_deaths)) +
  geom_line() +
  labs(x = "date")
# Visualize the threshold:
# 1. Save the threshold date (the day handwashing was decreed) to a variable.
# It is stored as a Date so that comparisons are chronological instead of
# relying on how text happens to sort.

threshold_date <- as.Date("1847-06-01")

threshold_date

# Sanity checks on the comparison and on the types involved.
threshold_date < as.Date("1848-06-01")

class(threshold_date)

# [[ ]] extracts the column itself; [ ] would return a one-column data frame.
class(monthly[["date"]])

# One TRUE/FALSE per month: TRUE where the month is before the threshold.
# as.Date() is a no-op on a Date column and converts YYYY-MM-DD text.
threshold_date > as.Date(monthly[["date"]])


# 2. Create a new column, handwashing_started, holding TRUE/FALSE values.
# TRUE marks months on or after threshold_date, i.e. months in which
# handwashing had started at the clinic; FALSE marks earlier months.
# year and month are cut out of the "YYYY-MM-DD" text of the date:
# characters 1-4 are the year, characters 6-7 the month.

monthly <- monthly %>%
  mutate(
    handwashing_started = date >= threshold_date,
    year = as.numeric(substr(date, 1, 4)),
    month = as.numeric(substr(date, 6, 7))
  )

monthly

# 3. Plot the new df with different colored lines depending on
# handwashing_started.
# One panel per year, month on the x axis; the line color switches between
# the months before and after the threshold.
ggplot(data = monthly) +
  geom_line(
    aes(
      x = month,
      y = proportion_deaths,
      color = handwashing_started,
      group = year
    )
  ) +
  facet_wrap(vars(year))
# On the Importance of Handwashing — Semmelweis Analysis
# Diana Nicuțari

library(tidyverse)

# 1) Load and inspect the data -----------------------------------------------

# Both files are read with read_csv(); show_col_types = FALSE silences the
# column-specification message.
yearly <- "data/yearly_deaths_by_clinic.csv" %>%
  read_csv(show_col_types = FALSE)
monthly <- "data/monthly_deaths.csv" %>%
  read_csv(show_col_types = FALSE)

# Compact overview of the columns and their types.
yearly %>% glimpse()
monthly %>% glimpse()

# 2) Add a new column with the proportions -----------------------------------

# Adds proportion_deaths (deaths divided by births) to a data frame that has
# `deaths` and `births` columns, and returns the extended data frame.
add_proportion_deaths <- function(df) {
  mutate(df, proportion_deaths = deaths / births)
}

yearly <- add_proportion_deaths(yearly)
monthly <- add_proportion_deaths(monthly)

# 3) Make a line plot for each data frame ------------------------------------

# Yearly plot: proportion of deaths per year, one colored line (with points)
# per clinic.
p_yearly <- yearly %>%
  ggplot(aes(x = year, y = proportion_deaths, color = clinic)) +
  geom_line(linewidth = 1) +
  geom_point() +
  ggtitle("Yearly Proportion of Deaths by Clinic (1841–1846)") +
  xlab("Year") +
  ylab("Deaths / Births") +
  labs(color = "Clinic")

# Monthly plot: proportion of deaths over time (Clinic 1).
p_monthly <- monthly %>%
  ggplot(aes(x = date, y = proportion_deaths)) +
  geom_line(linewidth = 1) +
  ggtitle("Monthly Proportion of Deaths (Clinic 1)") +
  xlab("Date") +
  ylab("Deaths / Births")

# Render both plots, yearly first.
p_yearly
p_monthly

# 4) Visualize the threshold (handwashing starts) ----------------------------

# Date from which handwashing counts as started.
threshold_date <- as.Date("1847-06-01")

# TRUE for months on or after the threshold, FALSE for earlier months.
monthly <- mutate(monthly, handwashing_started = date >= threshold_date)

# Monthly proportion of deaths colored by period, with a dashed vertical
# line drawn at the threshold date.
p_monthly_threshold <- monthly %>%
  ggplot(aes(x = date, y = proportion_deaths, color = handwashing_started)) +
  geom_line(linewidth = 1) +
  geom_vline(xintercept = threshold_date, linetype = "dashed") +
  ggtitle(
    "Monthly Death Proportion Before vs After Handwashing",
    subtitle = "Dashed line indicates start of handwashing (1847-06-01)"
  ) +
  xlab("Date") +
  ylab("Deaths / Births") +
  labs(color = "Handwashing started")

p_monthly_threshold

# 5) Calculate the mean proportion of deaths ---------------------------------

# One row per value of handwashing_started, holding the mean monthly
# proportion of deaths for that period. Missing proportions are ignored and
# the result is returned ungrouped.
monthly_summary <- summarise(
  group_by(monthly, handwashing_started),
  mean_proportion_deaths = mean(proportion_deaths, na.rm = TRUE),
  .groups = "drop"
)

monthly_summary

# Print the class of the threshold variable as a final type check.
class(threshold_date)