Skip to content
Donald
  • AI Chat
  • Code
  • Report
  • # Start coding here...
    # Install readxl only when it is missing, instead of re-installing it on
    # every run, then attach it.
    if (!requireNamespace("readxl", quietly = TRUE)) {
      install.packages("readxl")
    }
    library(readxl)

    # Workbook and worksheet that hold the patient-level data
    file1 <- "Donald Radu new.xlsx"
    sheet_name1 <- "Final_new"

    # Read the worksheet and convert the result to a plain data.frame
    df <- read_excel(file1, sheet = sheet_name1)
    df <- as.data.frame(df)

    # Recode gender in place: "F" becomes "1" and "M" becomes "0".
    # The replacement values are strings, so the column stays character.
    df$gender[df$gender == "F"] <- "1"
    df$gender[df$gender == "M"] <- "0"
    
    # stringr supplies str_count()
    library(stringr)

    # Number of metastatic sites: commas in `locations` plus one.
    # NOTE(review): this assumes `locations` is a comma-separated list of site
    # codes; an NA value yields NA and an empty string yields 1 - confirm that
    # both are acceptable.
    df$num_metastases <- str_count(df$locations, ",") + 1

    # Liver metastasis flag (1/0): `locations` contains the code "2".
    # NOTE(review): grepl() matches substrings, so a multi-digit code such as
    # "12" would also set this flag - confirm that all codes are single digits.
    df$liver_meta <- as.numeric(grepl("2", df$locations))

    # Histology flags (1/0) derived from `patho`:
    #   "1" = signet ring histological type
    #   "2" = poorly (presumably poorly differentiated - confirm)
    #   "3" = tubular
    df$signet <- as.numeric(grepl("1", df$patho))
    df$poorly <- as.numeric(grepl("2", df$patho))
    # Fix: this flag used to be overwritten with a copy of `poorly`; it is now
    # derived from code "3" as originally intended.
    df$tubular <- as.numeric(grepl("3", df$patho))

    # Total time on treatment across all lines: time1 + time_maintenance +
    # time2 + time3, with NAs ignored (a row of only NAs sums to 0).
    df$tot_tot <- rowSums(
      df[, c("time1", "time_maintenance", "time2", "time3")],
      na.rm = TRUE
    )

    # Time on first-line treatment including maintenance: time1 +
    # time_maintenance, with NAs ignored.
    df$tot_time1 <- rowSums(df[, c("time1", "time_maintenance")], na.rm = TRUE)

    # Display the updated dataframe
    df
    # Cohort size; per the original note, every row is a patient who received
    # line 1. The outer parentheses assign and print in one step.
    (sum_tot <- nrow(df))

    # Rows with outcome code 2 on any line, or outcome code 0 on line 3
    df_og_data <- subset(
      df,
      outcome_line1 %in% 2 | outcome_line2 %in% 2 | outcome_line3 %in% c(2, 0)
    )
    df_og_data
    nrow(df_og_data)

    # Rows with outcome code 3 on any line
    df_lost_data <- subset(
      df,
      outcome_line1 %in% 3 | outcome_line2 %in% 3 | outcome_line3 %in% 3
    )
    df_lost_data
    nrow(df_lost_data)

    # Rows with outcome code 1 on any line
    df_new <- subset(
      df,
      outcome_line1 %in% 1 | outcome_line2 %in% 1 | outcome_line3 %in% 1
    )
    df_new

    # Patients who received at least line 2 (line-2 outcome code 0 or 1)
    line2_equal_or_more_data <- subset(
      df,
      line2_yes %in% 1 & outcome_line2 %in% c(0, 1)
    )
    nrow(line2_equal_or_more_data)
    mean(line2_equal_or_more_data$age)

    # Patients who received at least line 3 (line-3 outcome code 0 or 1)
    line3_equal_or_more_data <- subset(
      df,
      line3_yes %in% 1 & outcome_line3 %in% c(0, 1)
    )
    nrow(line3_equal_or_more_data)
    mean(line3_equal_or_more_data$age)

    Subsets

    # Line 1 only: line1_yes is 1 and the line-1 outcome code is 1
    line1_only_data <- subset(df, line1_yes %in% 1 & outcome_line1 %in% 1)
    nrow(line1_only_data)
    mean(line1_only_data$age)
    mean(line1_only_data$tot_time1)

    # Lines 1 and 2 only: line2_yes is 1 and the line-2 outcome code is 1
    line2_only_data <- subset(df, line2_yes %in% 1 & outcome_line2 %in% 1)
    nrow(line2_only_data)
    mean(line2_only_data$age)
    mean(line2_only_data$tot_time1)

    # Lines 1, 2 and 3: line3_yes is 1 and the line-3 outcome code is 1
    line3_only_data <- subset(df, line3_yes %in% 1 & outcome_line3 %in% 1)
    nrow(line3_only_data)
    mean(line3_only_data$age)
    mean(line3_only_data$tot_time1)
    line1_only_data

    # Head counts per subset, taken as the sum of the 0/1 indicator column.
    # The outer parentheses assign and print in one step.
    (sum_line1_only <- sum(line1_only_data$line1_yes))
    (sum_line2_only <- sum(line2_only_data$line2_yes))
    (sum_line3_only <- sum(line3_only_data$line3_yes))

    # From here on sum_tot is the combined size of the three subsets,
    # replacing any value it held before.
    (sum_tot <- sum_line1_only + sum_line2_only + sum_line3_only)

    # Share of patients in each subset, as a fraction of the combined total
    (perc_line1_only <- sum_line1_only / sum_tot)
    (perc_line2_only <- sum_line2_only / sum_tot)
    (perc_line3_only <- sum_line3_only / sum_tot)

    Age

    # Age distribution for each treatment-line subset, one plot per subset.
    # The calls are kept separate and literal because hist() derives its default
    # title and x-axis label from the argument expression.
    hist(line1_only_data$age)
    hist(line2_only_data$age)
    hist(line3_only_data$age)
    Hidden output
    # ---- Central tendency of age, one subset at a time ----

    # Line 1 only subset
    mean_age_line1 <- mean(line1_only_data$age)
    median_age_line1 <- median(line1_only_data$age)
    cat("Mean age of patients who received up to line 1: ", mean_age_line1, "\n")
    cat("Median age of patients who received up to line 1: ", median_age_line1, "\n")

    # Line 2 subset
    mean_age_line2 <- mean(line2_only_data$age)
    median_age_line2 <- median(line2_only_data$age)
    cat("Mean age of patients who received up to line 2: ", mean_age_line2, "\n")
    cat("Median age of patients who received up to line 2: ", median_age_line2, "\n")

    # Line 3 subset
    mean_age_line3 <- mean(line3_only_data$age)
    median_age_line3 <- median(line3_only_data$age)
    cat("Mean age of patients who received line 3: ", mean_age_line3, "\n")
    cat("Median age of patients who received up to line 3: ", median_age_line3, "\n")

    # ---- Spread: standard deviation of age per subset ----
    sd_age_line1 <- sd(line1_only_data$age)
    sd_age_line2 <- sd(line2_only_data$age)
    sd_age_line3 <- sd(line3_only_data$age)

    cat("Standard deviation for the mean age of patients who received up to line 1: ", sd_age_line1, "\n")
    cat("Standard deviation for the mean age of patients who received up to line 2: ", sd_age_line2, "\n")
    cat("Standard deviation for the mean age of patients who received up to line 3: ", sd_age_line3, "\n")

    # ---- Quartiles and interquartile range of age per subset ----
    # The iqr_* values hold Q3 - Q1; the printed lines report the Q1 and Q3
    # bounds themselves.
    q1_age_line1 <- quantile(line1_only_data$age, probs = 0.25)
    q3_age_line1 <- quantile(line1_only_data$age, probs = 0.75)
    iqr_age_line1 <- q3_age_line1 - q1_age_line1

    q1_age_line2 <- quantile(line2_only_data$age, probs = 0.25)
    q3_age_line2 <- quantile(line2_only_data$age, probs = 0.75)
    iqr_age_line2 <- q3_age_line2 - q1_age_line2

    q1_age_line3 <- quantile(line3_only_data$age, probs = 0.25)
    q3_age_line3 <- quantile(line3_only_data$age, probs = 0.75)
    iqr_age_line3 <- q3_age_line3 - q1_age_line3

    cat("Interquartile range for age in line 1: ", q1_age_line1, "-", q3_age_line1, "\n")
    cat("Interquartile range for age in line 2: ", q1_age_line2, "-", q3_age_line2, "\n")
    cat("Interquartile range for age in line 3: ", q1_age_line3, "-", q3_age_line3, "\n")
    
    # Age vectors for the three treatment-line subsets
    line1_data_age <- line1_only_data$age
    line2_data_age <- line2_only_data$age
    line3_data_age <- line3_only_data$age

    # Two-sample Wilcoxon rank-sum (Mann-Whitney) test of age: line 2 vs line 3
    mwu_test <- wilcox.test(x = line2_data_age, y = line3_data_age)
    cat("Mann-Whitney U test results line2vsline3:\n")
    cat("U statistic = ", mwu_test$statistic, "\n")
    cat("p-value = ", mwu_test$p.value, "\n")

    # Same test for line 1 vs line 2; mwu_test is reused, so it holds only the
    # most recent comparison afterwards.
    mwu_test <- wilcox.test(x = line1_data_age, y = line2_data_age)
    cat("Mann-Whitney U test results line1vsline2:\n")
    cat("U statistic = ", mwu_test$statistic, "\n")
    cat("p-value = ", mwu_test$p.value, "\n")
    
    
    # car provides leveneTest(); install it only if it is missing, then attach
    if (!requireNamespace("car", quietly = TRUE)) {
      install.packages("car")
    }
    library(car)

    # Shapiro-Wilk normality test on age within each subset
    shapiro_line1 <- shapiro.test(line1_data_age)
    shapiro_line2 <- shapiro.test(line2_data_age)
    shapiro_line3 <- shapiro.test(line3_data_age)

    cat("Shapiro-Wilk test for line 1: ", shapiro_line1$p.value, "\n")
    cat("Shapiro-Wilk test for line 2: ", shapiro_line2$p.value, "\n")
    cat("Shapiro-Wilk test for line 3: ", shapiro_line3$p.value, "\n")

    # Long-format frame: one row per patient, with age and subset label
    age_group_data <- data.frame(
      age = c(line1_data_age, line2_data_age, line3_data_age),
      group = factor(
        rep(
          c("Line1", "Line2", "Line3"),
          times = c(
            length(line1_data_age),
            length(line2_data_age),
            length(line3_data_age)
          )
        )
      )
    )

    # Levene's test for equal variances of age across the three subsets
    levene_test <- leveneTest(age ~ group, data = age_group_data)

    cat("Levene's test: ", levene_test[1, "Pr(>F)"], "\n")

    # Run the pairwise t-tests only when all four assumption checks give
    # p > 0.05; otherwise print a notice.
    if (all(c(shapiro_line1$p.value,
              shapiro_line2$p.value,
              shapiro_line3$p.value,
              levene_test[1, "Pr(>F)"]) > 0.05)) {
      t_test_line1_line2 <- t.test(line1_data_age, line2_data_age)
      t_test_line1_line3 <- t.test(line1_data_age, line3_data_age)
      t_test_line2_line3 <- t.test(line2_data_age, line3_data_age)

      cat("T-test result for line 1 vs line 2: ", t_test_line1_line2$p.value, "\n")
      cat("T-test result for line 1 vs line 3: ", t_test_line1_line3$p.value, "\n")
      cat("T-test result for line 2 vs line 3: ", t_test_line2_line3$p.value, "\n")
    } else {
      cat("Assumptions for the t-test are not met.\n")
    }