# Skip to content
# Load the patient dataset from the Excel workbook into a base data frame.
# readxl is installed only when it is missing, so re-running the script does
# not reinstall the package on every execution.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
library(readxl)
# Specify the file path and sheet name
file1 <- "Donald Radu new.xlsx"
sheet_name1 <- "Final_new"
# Load the specified sheet, then convert the result to a plain data.frame
df <- read_excel(file1, sheet = sheet_name1)
df <- as.data.frame(df)
# Recode gender as character codes: "F" -> "1", "M" -> "0".
# Any other value is left untouched.
df$gender[df$gender == "F"] <- "1"
df$gender[df$gender == "M"] <- "0"
# load stringr package
library(stringr)
# Number of metastatic sites: number of commas in `locations` plus one
df$num_metastases <- str_count(df$locations, ",") + 1
# Indicator columns (1 = code present, 0 = absent), built by searching for a
# code inside the text of `locations` / `patho`.
# NOTE(review): grepl() matches substrings, so "2" would also match a
# multi-digit code such as "12" -- confirm that all codes are single digits.
# "2" in `locations` = liver metastasis
df$liver_meta <- as.numeric(grepl("2", df$locations))
# "1" in `patho` = signet ring histological type
df$signet <- as.numeric(grepl("1", df$patho))
# "2" in `patho` (presumably poorly differentiated -- TODO confirm coding)
df$poorly <- as.numeric(grepl("2", df$patho))
# "3" in `patho` (presumably tubular -- TODO confirm coding).
# This indicator is derived from its own pattern; deriving it from
# df$poorly would make it an exact duplicate of the `poorly` column.
df$tubular <- as.numeric(grepl("3", df$patho))
# Total time on treatment: time1 + time_maintenance + time2 + time3,
# ignoring NAs (a row with every component missing sums to 0)
df$tot_tot <- rowSums(df[, c("time1", "time_maintenance", "time2", "time3")], na.rm = TRUE)
# Time on first line including maintenance: time1 + time_maintenance,
# ignoring NAs
df$tot_time1 <- rowSums(df[, c("time1", "time_maintenance")], na.rm = TRUE)
# Display the updated dataframe
df
# Total number of patients (patients that received line 1)
sum_tot <- nrow(df)
sum_tot
# Rows where any line's outcome is coded 2, or the line-3 outcome is coded 0.
# Rows whose condition evaluates to NA are dropped.
df_og_data <- df[
  which(with(
    df,
    outcome_line1 == 2 | outcome_line2 == 2 | outcome_line3 == 2 |
      outcome_line3 == 0
  )), ,
  drop = FALSE
]
df_og_data
nrow(df_og_data)
# Rows where any line's outcome is coded 3
df_lost_data <- df[
  which(with(
    df,
    outcome_line1 == 3 | outcome_line2 == 3 | outcome_line3 == 3
  )), ,
  drop = FALSE
]
df_lost_data
nrow(df_lost_data)
# Rows where any line's outcome is coded 1
df_new <- df[
  which(with(
    df,
    outcome_line1 == 1 | outcome_line2 == 1 | outcome_line3 == 1
  )), ,
  drop = FALSE
]
df_new
# Patients who received at least line 2: line2_yes is 1 and the line-2
# outcome is coded 0 or 1
line2_equal_or_more_data <- df[
  which(with(
    df,
    line2_yes == 1 & (outcome_line2 == 0 | outcome_line2 == 1)
  )), ,
  drop = FALSE
]
nrow(line2_equal_or_more_data)
mean(line2_equal_or_more_data$age)
# Patients who received at least line 3: line3_yes is 1 and the line-3
# outcome is coded 0 or 1
line3_equal_or_more_data <- df[
  which(with(
    df,
    line3_yes == 1 & (outcome_line3 == 0 | outcome_line3 == 1)
  )), ,
  drop = FALSE
]
nrow(line3_equal_or_more_data)
mean(line3_equal_or_more_data$age)
# Subsets ----
# Build the three cohorts first, then report on each one.
# Line 1 only: line1_yes is 1 and the line-1 outcome is coded 1
line1_only_data <- df[
  which(with(df, line1_yes == 1 & outcome_line1 == 1)), ,
  drop = FALSE
]
# Line 1 and line 2 only: line2_yes is 1 and the line-2 outcome is coded 1
line2_only_data <- df[
  which(with(df, line2_yes == 1 & outcome_line2 == 1)), ,
  drop = FALSE
]
# Lines 1, 2 and 3: line3_yes is 1 and the line-3 outcome is coded 1
line3_only_data <- df[
  which(with(df, line3_yes == 1 & outcome_line3 == 1)), ,
  drop = FALSE
]
# Size, mean age and mean tot_time1 of the line-1-only cohort
nrow(line1_only_data)
mean(line1_only_data$age)
mean(line1_only_data$tot_time1)
# Same three figures for the line-2 cohort
nrow(line2_only_data)
mean(line2_only_data$age)
mean(line2_only_data$tot_time1)
# Same three figures for the line-3 cohort
nrow(line3_only_data)
mean(line3_only_data$age)
mean(line3_only_data$tot_time1)
line1_only_data
# Cohort sizes, obtained by summing the *_yes flag inside each cohort
sum_line1_only <- sum(line1_only_data$line1_yes)
sum_line1_only
sum_line2_only <- sum(line2_only_data$line2_yes)
sum_line2_only
sum_line3_only <- sum(line3_only_data$line3_yes)
sum_line3_only
# Overall total is redefined here as the sum of the three cohort sizes
sum_tot <- sum_line1_only + sum_line2_only + sum_line3_only
sum_tot
# Share of patients in each cohort
perc_line1_only <- sum_line1_only / sum_tot
perc_line1_only
perc_line2_only <- sum_line2_only / sum_tot
perc_line2_only
perc_line3_only <- sum_line3_only / sum_tot
perc_line3_only
# Age ----
# Histogram of age for each of the three last-line cohorts.
# The calls are left exactly as written: hist() builds its default title and
# x-axis label from the argument expression, so rewriting the argument would
# change the plots.
hist(line1_only_data$age)
hist(line2_only_data$age)
hist(line3_only_data$age)
# Hidden output
# Descriptive statistics for age in each last-line cohort: mean, median,
# standard deviation and quartiles. Every statistic is stored in its own
# variable and printed with cat().

# Mean and median age, cohort that received up to line 1
mean_age_line1 <- mean(line1_only_data$age)
median_age_line1 <- median(line1_only_data$age)
# Print the results
cat("Mean age of patients who received up to line 1: ", mean_age_line1, "\n")
cat("Median age of patients who received up to line 1: ", median_age_line1, "\n")
# Mean and median age, cohort that received up to line 2
mean_age_line2 <- mean(line2_only_data$age)
median_age_line2 <- median(line2_only_data$age)
# Print the results
cat("Mean age of patients who received up to line 2: ", mean_age_line2, "\n")
cat("Median age of patients who received up to line 2: ", median_age_line2, "\n")
# Mean and median age, cohort that received up to line 3
mean_age_line3 <- mean(line3_only_data$age)
median_age_line3 <- median(line3_only_data$age)
# Print the results (label worded "up to line 3" to match the other cohorts)
cat("Mean age of patients who received up to line 3: ", mean_age_line3, "\n")
cat("Median age of patients who received up to line 3: ", median_age_line3, "\n")
# Standard deviation of age in each cohort
sd_age_line1 <- sd(line1_only_data$age)
sd_age_line2 <- sd(line2_only_data$age)
sd_age_line3 <- sd(line3_only_data$age)
# Print the results
cat("Standard deviation for the mean age of patients who received up to line 1: ", sd_age_line1, "\n")
cat("Standard deviation for the mean age of patients who received up to line 2: ", sd_age_line2, "\n")
cat("Standard deviation for the mean age of patients who received up to line 3: ", sd_age_line3, "\n")
# First and third quartiles of age in line 1
q1_age_line1 <- quantile(line1_only_data$age, 0.25)
q3_age_line1 <- quantile(line1_only_data$age, 0.75)
# First and third quartiles of age in line 2
q1_age_line2 <- quantile(line2_only_data$age, 0.25)
q3_age_line2 <- quantile(line2_only_data$age, 0.75)
# First and third quartiles of age in line 3
q1_age_line3 <- quantile(line3_only_data$age, 0.25)
q3_age_line3 <- quantile(line3_only_data$age, 0.75)
# Width of the interquartile range (Q3 - Q1) per cohort. These widths are
# stored only; the printed lines below show the Q1 and Q3 bounds instead.
iqr_age_line1 <- q3_age_line1 - q1_age_line1
iqr_age_line2 <- q3_age_line2 - q1_age_line2
iqr_age_line3 <- q3_age_line3 - q1_age_line3
# Print the interquartile range as "Q1 - Q3"
cat("Interquartile range for age in line 1: ", q1_age_line1, "-", q3_age_line1, "\n")
cat("Interquartile range for age in line 2: ", q1_age_line2, "-", q3_age_line2, "\n")
cat("Interquartile range for age in line 3: ", q1_age_line3, "-", q3_age_line3, "\n")
# Pull the age vectors out of the three cohort data frames
line1_data_age <- line1_only_data$age
line2_data_age <- line2_only_data$age
line3_data_age <- line3_only_data$age

# Line 2 vs. line 3: two-sample Wilcoxon rank-sum (Mann-Whitney U) test on age
mwu_test <- wilcox.test(x = line2_data_age, y = line3_data_age)
# Report the test statistic and the p-value
cat("Mann-Whitney U test results line2vsline3:\n")
cat("U statistic = ", mwu_test[["statistic"]], "\n")
cat("p-value = ", mwu_test[["p.value"]], "\n")

# Line 1 vs. line 2: same test; the result replaces the previous `mwu_test`
mwu_test <- wilcox.test(x = line1_data_age, y = line2_data_age)
# Report the test statistic and the p-value
cat("Mann-Whitney U test results line1vsline2:\n")
cat("U statistic = ", mwu_test[["statistic"]], "\n")
cat("p-value = ", mwu_test[["p.value"]], "\n")
# Make sure the 'car' package is installed, then attach it; leveneTest()
# below is called after this attach.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)

# Shapiro-Wilk normality test on age, one per cohort
shapiro_line1 <- shapiro.test(line1_data_age)
shapiro_line2 <- shapiro.test(line2_data_age)
shapiro_line3 <- shapiro.test(line3_data_age)
cat("Shapiro-Wilk test for line 1: ", shapiro_line1$p.value, "\n")
cat("Shapiro-Wilk test for line 2: ", shapiro_line2$p.value, "\n")
cat("Shapiro-Wilk test for line 3: ", shapiro_line3$p.value, "\n")

# Stack the three age vectors into one long table with a cohort label
age_group_data <- data.frame(
  age = c(line1_data_age, line2_data_age, line3_data_age),
  group = factor(rep(
    c("Line1", "Line2", "Line3"),
    times = c(
      length(line1_data_age),
      length(line2_data_age),
      length(line3_data_age)
    )
  ))
)

# Levene's test for homogeneity of variances across the three cohorts
levene_test <- leveneTest(age ~ group, data = age_group_data)
cat("Levene's test: ", levene_test[1, "Pr(>F)"], "\n")

# Run the pairwise t-tests only when all four assumption checks have
# p-values above 0.05
if (all(c(
  shapiro_line1$p.value,
  shapiro_line2$p.value,
  shapiro_line3$p.value,
  levene_test[1, "Pr(>F)"]
) > 0.05)) {
  t_test_line1_line2 <- t.test(line1_data_age, line2_data_age)
  t_test_line1_line3 <- t.test(line1_data_age, line3_data_age)
  t_test_line2_line3 <- t.test(line2_data_age, line3_data_age)
  cat("T-test result for line 1 vs line 2: ", t_test_line1_line2$p.value, "\n")
  cat("T-test result for line 1 vs line 3: ", t_test_line1_line3$p.value, "\n")
  cat("T-test result for line 2 vs line 3: ", t_test_line2_line3$p.value, "\n")
} else {
  cat("Assumptions for the t-test are not met.\n")
}