--- title: Slides execute: cache: true freeze: auto include: true echo: false number-sections: true --- ```{r} library(tidyverse) library(dplyr) library(ggplot2) library(survival) library(emmeans) library(foreign) library(gtsummary) library(gt) library(ggsurvfit) ``` ```{r} dat <- read.csv("./unos.txt", sep = "\t") names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death", "year", "sex", "tx.type", "follow.up") dat <- dat |> mutate( sex = factor(sex, levels = c(0,1), labels = c("Female","Male")), tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")), hla.match = factor(hla.match), year = factor(year) ) ``` # Introduction: research question ## Survival after transplantation (M) ## Identify Predictors (M) ## Why Survival Analysis (L) Motivation: study the distribution of time to event $T$. Example: time of death after kidney transplant. ```{r} ex <- data.frame( id = c(1, 2, 3, 4, 5), transplant = c(2000, 2000, 2001, 2003, 2004), death = c(2005, 2009, 2005, 2004, 2016) ) ex_table_1 <- ex |> gt() |> cols_label( id = "ID", transplant = "Transplant", death = "Death" ) %>% cols_align(align = "center", columns = everything()) ex_plot_real_time <- ggplot(ex) + geom_segment(aes(x = transplant, xend = death, y = factor(id), yend = factor(id)), color = "grey50", linewidth = 1) + geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) + geom_point(aes(x = death, y = factor(id), color = "death"), size = 3) + scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) + scale_y_discrete(limits = rev) + labs(x = "Year", y = "Subject ID", color = "Event") + theme_minimal() ``` ```{r} ex_table_1 ex_plot_real_time ``` --- We then calculate time to death for each respondents. With this, all we need is to fit a linear model `t ~ X` or `log(t) ~ X` ```{r} ex_t <- ex |> mutate( t = death - transplant ) ex_table_2 <- ex_t |> gt() |> cols_label( id = "ID", transplant = "Transplant", death = "Death", t = "Time to death" ) |> cols_align(align = "center", columns = everything()) ex_plot_uniform_time <- ggplot(ex_t) + geom_segment(aes(x = 0, xend = t, y = factor(id), yend = factor(id)), color = "grey50", linewidth = 1) + geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) + geom_point(aes(x = t, y = factor(id), color = "death"), size = 3) + scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) + scale_y_discrete(limits = rev) + labs(x = "Year", y = "Subject ID", color = "Event") + theme_minimal() ``` ```{r} ex_table_2 ex_plot_uniform_time ``` --- What if we do not know when exactly some respondents die? Scenario 1: the study ends at the year 2008? ```{r} ex_t_c <- ex_t |> mutate( death_censored = if_else(death <= 2008, death, 2008), death_censored_txt = if_else(death <= 2008, as.character(death), "> 2008"), status = if_else(death <= 2008, 1, 0), t_1 = death_censored - transplant, t_1_txt = if_else(death <= 2008, as.character(t_1), paste(">", as.character(t_1))) ) ``` ```{r} ex_table_3 <- ex_t_c |> select(id, transplant, death_censored_txt, t_1_txt) |> gt() |> cols_label( id = "ID", transplant = "Transplant", death_censored_txt = "Death", t_1_txt = "Time to death" ) |> cols_align(align = "center", columns = everything()) ex_plot_real_time_1 <- ggplot(ex_t_c) + geom_segment(aes(x = transplant, xend = death_censored, y = factor(id), yend = factor(id)), color = "grey50", linewidth = 1) + geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) + geom_point(aes(x = death_censored, y = factor(id), color = if_else(status == 1, "death", "censored")), size = 3) + scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D", "censored" = "orange")) + scale_y_discrete(limits = rev) + labs(x = "Year", y = "Subject ID", color = "Event") + xlim(2000, 2016) + geom_vline(xintercept = 2008, linetype = "dashed", color = "orange", linewidth = 0.8) + theme_minimal() ex_plot_uniform_time_1 <- ggplot(ex_t_c) + geom_segment(aes(x = 0, xend = t_1, y = factor(id), yend = factor(id)), color = "grey50", linewidth = 1) + geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) + geom_point(aes(x = t_1, y = factor(id), color = if_else(status == 1, "death", "censored")), size = 3) + scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D", "censored" = "orange")) + scale_y_discrete(limits = rev) + labs(x = "Year", y = "Subject ID", color = "Event") + theme_minimal() ``` ```{r} ex_table_3 ex_plot_real_time_1 ex_plot_uniform_time_1 ``` **Right Censoring**: only observe the event (death) if it occurs before a certain time (2008). --- Scenario 2: respondent 3 move away; loss follow up ```{r} ex_alt <- ex_t_c ex_alt$death_censored_txt[3] <- "> 2005" ex_alt$status[3] <- 0 ex_alt$t_1_txt[3] <- "> 4" ex_table_4 <- ex_alt |> select(id, transplant, death_censored_txt, t_1_txt) |> gt() |> cols_label( id = "ID", transplant = "Transplant", death_censored_txt = "Death", t_1_txt = "Time to death" ) |> cols_align(align = "center", columns = everything()) ex_plot_real_time_2 <- ggplot(ex_alt) + geom_segment(aes(x = transplant, xend = death_censored, y = factor(id), yend = factor(id)), color = "grey50", linewidth = 1) + geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) + geom_point(aes(x = death_censored, y = factor(id), color = if_else(status == 1, "death", "censored")), size = 3) + scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D", "censored" = "orange")) + scale_y_discrete(limits = rev) + labs(x = "Year", y = "Subject ID", color = "Event") + xlim(2000, 2016) + geom_vline(xintercept = 2008, linetype = "dashed", color = "orange", linewidth = 0.8) + theme_minimal() ex_plot_uniform_time_2 <- ggplot(ex_alt) + geom_segment(aes(x = 0, xend = t_1, y = factor(id), yend = factor(id)), color = "grey50", linewidth = 1) + geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) + geom_point(aes(x = t_1, y = factor(id), color = if_else(status == 1, "death", "censored")), size = 3) + scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D", "censored" = "orange")) + scale_y_discrete(limits = rev) + labs(x = "Year", y = "Subject ID", color = "Event") + theme_minimal() ``` ```{r} ex_table_4 ex_plot_real_time_2 ex_plot_uniform_time_2 ``` --- How many patients are right censored? ```{r} dat |> mutate(Overall = "Overall") |> pivot_longer( cols = c(Overall, sex, tx.type), names_to = "Attribute", values_to = "Category" ) |> count(Attribute, Category, death) |> ggplot(aes(x = Category, y = n, fill = factor(death))) + geom_col(position = "dodge") + facet_wrap(~ Attribute, scales = "free_x") + labs(x = "Group", y = "Count", fill = "Death Status") + theme_minimal() ``` ## Challenges in Survival Analysis (L) - **Right Censoring**: only observe the event if it occurs before a certain time. - **Left Censoring**: event has occurred prior to the start of a research - Follow up every 3 years. - Event has occurred -> event happened sometime before follow up. - **Left Truncation**: delayed entry; respondents are included only if they survived long enough. - Start follow up with patients 100 days after the transplant - Patients dies within 100 day wouldn't be included in the dataset - **Right Truncation**: respondents are included only if they have already experienced the event. - Retrospective analysis from deceased patients between 1990 and 2000. - Patients not dead before 2000 are not included in the dataset. # Method & Result ## Explain Dataset (M) ::: {.callout-note} # Things to highlight in figures - sex and recipient age are similar across `tx.type` - Living donors have less `hla.match` then cadaveric donors - There is age difference between different `tx.type` ::: ### `sex` ~ `tx.tpye` ::: {.callout-note} `sex` are similar across `tx.type` ::: ```{r, fig.width=7, fig.height=5} dat |> count(tx.type, sex) |> group_by(tx.type) |> mutate( total = sum(n), percent = n / total ) |> ungroup() |> ggplot() + geom_col(aes(x = tx.type, y = percent, fill = sex), position = "dodge") + labs( title = "Percentage of Sex in Transplant Type", y = "Percentage", x = "Transplant Type", fill = NULL ) + scale_x_discrete( labels = c("Cadaveric" = "Deceased"), ) + theme_minimal() + theme( legend.position = "bottom", plot.title = element_text(size = 17, face = "bold"), axis.title = element_text(size = 15), axis.text = element_text(size = 10), legend.text = element_text(size = 12), plot.margin = margin(20, 30, 20, 30) ) ``` ## Table 1 (L) ### HLA match `hla.match` ```{r} dat$hla.match |> table(useNA = "always") ``` ```{r} ggplot(dat, aes(x = hla.match)) + geom_bar() + labs(x = "HLA match", y = "Count") + theme_minimal() ``` ```{r} ggplot(dat, aes(x = hla.match, fill = tx.type)) + geom_bar(position = "dodge") + labs(x = "HLA match", y = "Count") + theme_minimal() ``` ::: {.callout-warning} TODO: change to percentage within each `tx.type` ::: ::: {.callout-note} Living donors have less `hla.match` then cadaveric donors ::: ### Donor age `age.donor` ```{r} #| echo: true mean(dat$age.donor, na.rm = TRUE) median(dat$age.donor, na.rm = TRUE) ``` ```{r} ggplot(dat, aes(x = age.donor)) + geom_bar() + labs(x = "Donor Age", y = "Count") + theme_minimal() ``` ```{r, fig.width = 8, fig.height = 12} ggplot(dat, aes(x = age.donor)) + geom_bar() + labs(x = "Donor Age", y = "Count") + theme_minimal() + facet_grid(hla.match ~ .) ``` ```{r, fig.width= 8, fig.height= 12} ggplot(dat, aes(x = age.donor, color = tx.type)) + geom_density() + labs(x = "Donor Age", y = "Density") + theme_minimal() + facet_grid(hla.match ~ .) ``` ```{r} ggplot(dat, aes(x = age.donor, color = hla.match)) + geom_density() + labs(x = "Donor Age", y = "Count") + theme_minimal() ``` ```{r} ggplot(dat, aes(x = as.factor(tx.type), y = age.donor)) + geom_boxplot() + theme_minimal() ``` ::: {.callout-note} There is age difference between different `tx.type` ::: ### Recipient Age `age.rec` ```{r} #| echo: true mean(dat$age.rec, na.rm = TRUE) median(dat$age.rec, na.rm = TRUE) ``` ```{r} ggplot(dat, aes(x = age.rec)) + geom_bar() + labs(x = "Recipient Age", y = "Count") + theme_minimal() ``` ```{r} ggplot(dat, aes(x = as.factor(age.rec), y = age.donor)) + geom_boxplot() + theme_minimal() ``` ```{r} ggplot(dat, aes(x = age.rec, fill = tx.type)) + geom_bar(position = "dodge") + theme_minimal() ``` ```{r} ggplot(dat, aes(x = age.rec, color = hla.match)) + geom_density() + theme_minimal() ``` ```{r} ggplot(dat, aes(x = as.factor(age.rec), y = cold.isc)) + geom_boxplot() + theme_minimal() ``` ### Cold Ischemia Time `cold.isc` ```{r} #| echo: true mean(dat$cold.isc, na.rm = TRUE) median(dat$cold.isc, na.rm = TRUE) ``` ```{r} ggplot(dat, aes(x = cold.isc)) + geom_density() + labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") + theme_minimal() ``` ```{r} ggplot(dat, aes(x = cold.isc, color = tx.type, group = tx.type)) + geom_density() + labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") + theme_minimal() ``` ```{r} ggplot(dat, aes(x = cold.isc)) + geom_density() + theme_minimal() + facet_grid(tx.type ~ .) ``` ### Transplant Type `tx.type` ```{r} dat$tx.type |> table(useNA = "always") ``` ```{r} #| echo: true dat$tx.type |> table() ``` ```{r} ggplot(dat, aes(x = tx.type)) + geom_bar() + labs(x = "Transplant Type", y = "Count") + theme_minimal() ``` ### Year `year` ```{r} dat$year |> table() ``` ```{r} ggplot(dat, aes(x = year)) + geom_bar() + labs(x = "Year", y = "Count") + theme_minimal() ``` ```{r} ggplot(dat, aes(x = year, y = follow.up)) + geom_boxplot() ``` ```{r} ggplot(dat, aes(x = year, fill = tx.type)) + geom_bar(position = "dodge") ``` ```{r} ggplot(dat, aes(x = year, y = age.rec)) + geom_boxplot() ``` ```{r} ggplot(dat, aes(x = year, y = age.donor)) + geom_boxplot() ``` ## Kaplan-Meier ### Overall (L) ```{r} km_all <- survfit(Surv(follow.up, death) ~ 1, data = dat) summary(km_all, times = c(4, 8, 12)) ``` ```{r} km_all |> ggsurvfit(type = "survival") + add_confidence_interval() + scale_y_continuous(limits = c(0, 1), labels = scales::label_percent()) + labs( x = "Years of Follow-up", y = "Overall Survival Probability" ) + theme_minimal() ``` ### `tx.type` (M) ```{r, fig.width=7, fig.height=5} km.tx <- survfit(Surv(follow.up, death) ~ tx.type, data = dat) km.tx |> ggsurvfit(type = "survival") + add_confidence_interval() + scale_x_continuous( breaks = seq(0, 20, by = 4) ) + scale_y_continuous( limits = c(0, 1), breaks = seq(0, 1, by = 0.2), labels = scales::label_number(accuracy = 0.1) ) + labs( x = "Years of Follow-up", y = "Overall Survival Probability" ) + theme_minimal() + theme( legend.position = "bottom", plot.title = element_text(size = 17, face = "bold"), axis.title = element_text(size = 15), axis.text = element_text(size = 10), legend.text = element_text(size = 12), plot.margin = margin(20, 30, 20, 30) ) ``` ## Cox Model ```{r} get.life.table <- function(dat, time.intervals) { n.pop <- nrow(dat) dat |> recode.dat(time.intervals) |> group_by(fu.interval) |> summarize( n.censored = sum(.data$death == 0), n.event = sum(.data$death), ) |> ungroup() |> calculate.hazard(n.pop) } get.life.table.by.groups <- function(dat, time.intervals, grps) { grps |> lapply(function(grp) { dat |> get.life.table.by.group(time.intervals, grp) |> mutate( grp.name = grp, grp.value = pick(1)[[1]] ) |> select(-1) }) |> bind_rows() } get.life.table.by.group <- function(dat, time.intervals, grp) { dat |> recode.dat(time.intervals) |> group_by(fu.interval, .data[[grp]]) |> summarize( n.censored = sum(.data$death == 0), n.event = sum(.data$death), .groups = "keep" ) |> ungroup(fu.interval) |> group_modify(function(df.sub, grp) { grp.name <- names(grp) grp.value <- grp[[1]] n.pop <- (dat[[grp.name]] == grp.value) |> sum() calculate.hazard(df.sub, n.pop) }) |> ungroup() } calculate.hazard <- function(life.table, n.pop) { n.removed <- life.table$n.event + life.table$n.censored n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)]) life.table |> mutate( n.at.risk = n.pop - n.removed.accum, # TODO: how to account for censored? How do we adjust for uneven interval? hazard.rate = n.event / n.at.risk ) } recode.dat <- function(dat, time.intervals) { df <- dat[dat$follow.up <= sum(time.intervals), ] time.points <- cumsum(time.intervals) df$fu.interval <- sapply(df$follow.up, function(time) { time.points[time <= time.points][1] }) df } ``` ### `death` Distribution (?) ```{r} # dat |> # mutate( # accum.death = cumsum(death), # accum.censored = if_else(death == 1, 0, 1) |> cumsum() # ) |> # ggplot() + # geom_pont(aes(x = follow.up, y = accum.death, color = death)) ``` ```{r} ``` ```{r} plot.death = dat[dat$death == 1,] |> ggplot(aes(x = follow.up)) + geom_histogram(bins = 50) plot.death ``` ### Overall (L) ::: {.callout-note} skip overall, jump directly to `tx.type` ::: ```{r} time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1) get.life.table(dat, time.intervals) ``` ### `tx.type` (M) ### `age` (continuous) (L) ### `age` (categorical) (M) ### Full model (L) ```{r} m1 <- coxph(Surv(follow.up, death) ~ hla.match + tx.type, data = dat) summary(m1) m2 <- coxph(Surv(follow.up, death) ~ hla.match * tx.type, data = dat) summary(m2) anova(m1, m2, test = "LRT") ``` ### Discussion: include `year` or not? ## Assumption Testing # Conclusion (M + L)