--- title: "foo" execute: cache: true freeze: auto number-sections: true --- ```{r} library(tidyverse) library(dplyr) library(ggplot2) library(survival) library(emmeans) library(foreign) library(gtsummary) library(ggsurvfit) ``` ```{r} dat <- read.csv("./unos.txt", sep = "\t") head(dat) names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death", "year", "sex", "tx.type", "follow.up") ``` Complete case ```{r} nrow(dat) ``` ```{r} sapply(names(dat), function(col) { is.na(dat[, col]) |> sum() }) ``` # Exercise ## Exercise 1 > Illustrate in a table the characteristics of the population (age, sex, race, donor, . . . ). ```{r} hist(dat$age.donor) hist(dat$age.rec) hist(dat$cold.isc) hist(dat$year) hist(dat$follow.up) unique(dat$hla.match) unique(dat$year) ``` Change the order!! ```{r} dat.table1 <- dat |> mutate( sex = factor(sex, levels = c(0,1), labels = c("Female","Male")), tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")), hla.match = factor(hla.match), year = factor(year) ) ``` ```{r} table1 <- dat.table1 |> select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |> tbl_summary( statistic = list( all_continuous() ~ "{median} ({p25}, {p75})" ), label = list( hla.match ~ "HLA matches, n(%)", age.donor ~ "Donor age, median (IQR)", age.rec ~ "Recipient age, median (IQR)", cold.isc ~ "Cold ischemic time (hours), median (IQR), ", year ~ "Year of transplant", sex ~ "Sex, n(%)", tx.type ~ "Transplant type, n(%)" ), missing = "ifany" ) |> modify_footnote(all_stat_cols() ~ NA) table1 ``` ```{r} table.split <- dat.table1 |> select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |> tbl_summary( by = tx.type, statistic = list( all_continuous() ~ "{median} ({p25}, {p75})" ), label = list( hla.match ~ "HLA matches, n(%)", age.donor ~ "Donor age, median (IQR)", age.rec ~ "Recipient age, median (IQR)", cold.isc ~ "Cold ischemic time (hours), median (IQR), ", year ~ "Year of transplant", sex ~ "Sex, n(%)" ), missing = "ifany" ) |> add_overall() |> modify_footnote(all_stat_cols() ~ NA) table.split ``` ```{r} table.split <- dat.table1 |> select(sex, age.rec, age.donor, hla.match, cold.isc, tx.type) |> tbl_summary( by = tx.type, statistic = list( all_continuous() ~ "{median} ({p25}, {p75})" ), label = list( hla.match ~ "HLA matches, n(%)", age.donor ~ "Donor age, median (IQR)", age.rec ~ "Recipient age, median (IQR)", cold.isc ~ "Cold ischemic time (hours), median (IQR), ", sex ~ "Sex, n(%)" ), missing = "ifany" ) |> add_overall() |> modify_footnote(all_stat_cols() ~ NA) table.split ``` Calculate median follow-up (reverse Kaplan-Meier method), with 95% CI ```{r} rev_km <- survfit(Surv(follow.up, death == 0) ~ 1, data = dat) summary(rev_km)$table median_followup <- summary(rev_km)$table["median"] confidence_interval <- c(summary(rev_km)$table["0.95LCL"], summary(rev_km)$table["0.95UCL"]) median_followup confidence_interval ``` ```{r} g <- ggplot(dat) ``` ```{r} g + geom_point(aes(x = follow.up, y = death)) g + geom_density(aes(x = follow.up)) g + geom_density(aes(x = age.donor)) g + geom_bar(aes(x = age.rec)) g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor)) # dat$age.1 |> table() ``` ```{r} g + geom_boxplot(aes(x = year, y = follow.up, group = year)) ``` ## Exercise 2 Plot the Kaplan-Meier overall survival curve for pediatric kid- ney transplant recipients for the first 12 years after transplantation. !!!!!!!! just set xlim to 12, do not remove those individuals !!!!!!! ```{r} km <- survfit(Surv(follow.up, death) ~ 1, data = dat[dat$follow.up <= 12, ]) plot(km) ``` ## Exercise 3 We are going to compare mortality rates (hazard functions) between children whose transplanted kidney was provided by a living donor (in general a family member) and those whose source was recently deceased (variable donor type: `txtype`). Use the life table method to calculate the death rates for the first 5 years for each group (take in the first year intervals of 4 months and then look at each year) and show the results in a table. Estimate the hazard ratio in each time interval as the ratio between the mortality rates in the two groups. What do you notice? ```{r} dat.5 <- dat[dat$follow.up <= 5, ] head(dat.5) classify_time_interval = function(fu) { if (fu <= 1/3) { return(1/3) } else if (fu <= 2/3) { return(2/3) } else if (fu <= 1) { return(1) } ceiling(fu) } dat.5$fu.interval <- sapply(dat.5$follow.up, classify_time_interval) table(dat.5$fu.interval) head(dat.5) ``` ```{r} dat.5.life <- dat.5 |> group_by(fu.interval) |> summarize( n.censored = sum(death == 0), n.event = sum(death), n.at.risk = nrow(dat), ) for (i in 2:nrow(dat.5.life)) { j <- i - 1 n.censored.pre <- dat.5.life$n.censored[j] n.event.pre <- dat.5.life$n.event[j] n.at.risk.pre <- dat.5.life$n.at.risk[j] n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre dat.5.life$n.at.risk[i] <- n.at.risk } print(nrow(dat)) dat.5.life ``` n.removed.acc ```{r} dat.5.life <- dat.5.life |> mutate( hazard.rate = n.event / n.at.risk ) ``` --- ```{r} get_life_table = function(dat.sub, dat) { dat.sub <- dat.sub |> group_by(fu.interval) |> summarize( n.censored = sum(death == 0), n.event = sum(death), n.at.risk = nrow(dat), ) for (i in 2:nrow(dat.sub)) { j <- i - 1 n.censored.pre <- dat.sub$n.censored[j] n.event.pre <- dat.sub$n.event[j] n.at.risk.pre <- dat.sub$n.at.risk[j] n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre dat.sub$n.at.risk[i] <- n.at.risk } dat.sub <- dat.sub |> mutate( hazard.rate = n.event / n.at.risk ) return(dat.sub) } ``` ```{r} dat.5.tx0 = dat.5[dat.5$tx.type == 0, ] dat.5.tx1 = dat.5[dat.5$tx.type == 1, ] ``` ```{r} tx0.life <- get_life_table(dat.5.tx0, dat[dat$tx.type == 0, ]) tx0.life ``` ```{r} tx1.life <- get_life_table(dat.5.tx1, dat[dat$tx.type == 1, ]) tx1.life ``` ```{r} tx1.life$hazard.rate / tx0.life$hazard.rate ``` ```{r} hazard.df <- data.frame( fu.interval = tx1.life$fu.interval, hazard.rate.0 = tx0.life$hazard.rate, hazard.rate.1 = tx1.life$hazard.rate, hazard.ratio = tx1.life$hazard.rate / tx0.life$hazard.rate ) ggplot(hazard.df, aes(x = fu.interval)) + geom_line(aes(y = hazard.rate.0), color = "blue") + geom_line(aes(y = hazard.rate.1), color = "orange") ggplot(hazard.df, aes(x = fu.interval)) + geom_line(aes(y = hazard.ratio)) ``` --- General Solution: ```{r} get.life.table <- function(dat, time.intervals) { n.pop <- nrow(dat) dat |> recode.dat(time.intervals) |> group_by(fu.interval) |> summarize( n.censored = sum(.data$death == 0), n.event = sum(.data$death), ) |> ungroup() |> calculate.hazard(n.pop) } get.life.table.by.groups <- function(dat, time.intervals, grps) { grps |> lapply(function(grp) { dat |> get.life.table.by.group(time.intervals, grp) |> mutate( grp.name = grp, grp.value = pick(1)[[1]] ) |> select(-1) }) |> bind_rows() } get.life.table.by.group <- function(dat, time.intervals, grp) { dat |> recode.dat(time.intervals) |> group_by(fu.interval, .data[[grp]]) |> summarize( n.censored = sum(.data$death == 0), n.event = sum(.data$death), .groups = "keep" ) |> ungroup(fu.interval) |> group_modify(function(df.sub, grp) { grp.name <- names(grp) grp.value <- grp[[1]] n.pop <- (dat[[grp.name]] == grp.value) |> sum() calculate.hazard(df.sub, n.pop) }) |> ungroup() } calculate.hazard <- function(life.table, n.pop) { n.removed <- life.table$n.event + life.table$n.censored n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)]) life.table |> mutate( n.at.risk = n.pop - n.removed.accum, # TODO: how to account for censored? How do we adjust for uneven interval? hazard.rate = n.event / n.at.risk ) } recode.dat <- function(dat, time.intervals) { df <- dat[dat$follow.up <= sum(time.intervals), ] time.points <- cumsum(time.intervals) df$fu.interval <- sapply(df$follow.up, function(time) { time.points[time <= time.points][1] }) df } ``` ```{r} time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1) grps <- c("tx.type", "sex", "hla.match", "cold.isc") get.life.table.by.groups(dat, time.intervals, grps) ``` ```{r} grps <- c("tx.type", "sex", "hla.match", "cold.isc") grps <- c("tx.type") plot.hazard.rate.by.groups <- function(dat, time.intervals, grps) { dat |> get.life.table.by.groups(time.intervals, grps) |> ggplot(aes(x = fu.interval, y = hazard.rate, color = paste(grp.name, grp.value, sep = " = "))) + geom_point() + geom_line() + labs( color = "Group" ) + theme( legend.position = "bottom" ) } ``` ```{r} time.intervals <- c(rep(1/3, 3), rep(1, 4)) plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) ``` ```{r} time.intervals <- rep(1, 5) plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) ``` ```{r} time.intervals <- rep(1/3, 15) plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) ``` ```{r} time.intervals <- c(rep(1/3, 3*3), rep(1, 2)) plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) ``` ```{r} time.intervals <- c(rep(1/4, 4), rep(1, 4)) plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) ``` ```{r} time.intervals <- c(rep(1/4, 4*2), rep(1, 3)) plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) ``` ```{r} time.intervals <- c(rep(1/4, 4*2), rep(1/3, 3*3), rep(1, 6)) plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) ``` --- Compare ```{r} time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1) get.life.table.by.groups(dat, time.intervals, grps) ``` ```{r} hazard.df ``` ## Exercise 4 Show a plot with Kaplan-Meier survival curves for the two donor types. ```{r} km.tx <- survfit(Surv(follow.up, death) ~ tx.type, data = dat[dat$follow.up <= 12, ]) plot(km.tx, col = c("blue", "orange")) legend(legend = c("cadaveric", "living"), "bottomleft", lwd = 2, col = c("blue", "orange")) ``` ## Exercise 5 Fit a univariate Cox model with predictor donor type. Report the hazard ratio and 95% confidence interval and interpret the result obtained. ```{r} cox.tx <- coxph(Surv(follow.up, death) ~ tx.type, data = dat) summary(cox.tx) ``` ```{r} log.hazard.ratio.hat <- cox.tx$coefficient hazard.ratio.hat <- log.hazard.ratio.hat |> exp() ``` ```{r} log.hazard.ratio.se <- summary(cox.tx)$coefficients[3] log.hazard.ratio.hat.lower.bound <- log.hazard.ratio.hat - qnorm(0.975) * log.hazard.ratio.se log.hazard.ratio.hat.upper.bound <- log.hazard.ratio.hat + qnorm(0.975) * log.hazard.ratio.se c(log.hazard.ratio.hat.lower.bound, log.hazard.ratio.hat.upper.bound) |> exp() ``` ::: {.callout-note} Patients receiving kidney transplants from living donors (`tx.type = 1`) has 1.91 higher hazard rate than those receiving from cadaveric donors (`tx.type = 0`). Since 1 is not in the 95% confidence interval, the result is statistically significant. ::: # Exercise 6 Research shows that an important determinant of mortality after kidney transplant is the age of the recipient. Fit a Cox model with age as predictor and estimate the hazard ratio and its confidence interval. Consider age first as continuous variable and then divide into categories. ## Continuous ```{r} cox.age.rec <- coxph(Surv(follow.up, death) ~ age.rec, data = dat) cox.age.rec |> summary() ``` ## Categorical (Case 1) ```{r} recode.age <- function(dat, ages) { df$age.group <- sapply(df$follow.up, function(time) { time.points[time <= time.points][1] }) df } ``` ```{r} dat.age.rec <- dat[!is.na(dat$age.rec), ] class.age <- function(age) { if (age < 2) { return("0-1") } else if (age < 6) { return("2-5") } else if (age < 11) { return("6-10") } else if (age < 19) { return("11-18") } } dat.age.rec$age.group <- sapply(dat.age.rec$age.rec, class.age) |> factor(levels = c( "0-1", "2-5", "6-10", "11-18" )) cox.age.rec <- coxph(Surv(follow.up, death) ~ age.group, data = dat.age.rec) ``` ```{r} emmeans(cox.age.rec, pairwise ~ age.group, type = "response") ``` # Exercise 7 Fit a multivariate Cox model by using other predictors and describe your results. ```{r} sapply(names(dat), function(col) { is.na(dat[, col]) |> sum() }) ``` ```{r} names(dat) ``` ```{r} Waldtest <- function(cox.fit, i.betas){ q <- length(i.betas) coefs <- cox.fit$coefficients # print(cox.fit$var) var <- cox.fit$var[i.betas, i.betas] Wald <- coefs[i.betas]%*%solve(var)%*%coefs[i.betas] p.value <- 1-pchisq(as.numeric(Wald), df = q) return(cbind(Wald, p.value)) } ``` ```{r} fit.cox.model <- function(surv, i.betas) { fit.tx <- coxph(surv, data = dat) Waldtest(fit.tx, i.betas) } ``` Start with `tx.type` ```{r} surv <- Surv(follow.up, death) ~ tx.type + age.rec fit.cox.model(surv, 1:2) # NOTE: Why is this 1??? ``` Iteration 1: `tx.type + age + ?` ```{r} names(dat) ``` ```{r} surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match fit.cox.model(surv, 3) surv <- Surv(follow.up, death) ~ tx.type + age.rec + age.donor fit.cox.model(surv, 3) surv <- Surv(follow.up, death) ~ tx.type + age.rec + cold.isc fit.cox.model(surv, 3) surv <- Surv(follow.up, death) ~ tx.type + age.rec + year fit.cox.model(surv, 3) surv <- Surv(follow.up, death) ~ tx.type + age.rec + sex fit.cox.model(surv, 3) ``` Add `hla.match` Iteration 2: `tx.type + age.rec + hla.match + ?` ```{r} surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor fit.cox.model(surv, 4) surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + cold.isc fit.cox.model(surv, 4) surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + year fit.cox.model(surv, 4) surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + sex fit.cox.model(surv, 4) ``` Add `age.donor` Iteration 2: `tx.type + age.rec + hla.match + age.donor + ?` ```{r} surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + cold.isc fit.cox.model(surv, 5) surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + year fit.cox.model(surv, 5) surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + sex fit.cox.model(surv, 5) ``` Add `year` ```{r} surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + year + cold.isc fit.cox.model(surv, 6) surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + year + sex fit.cox.model(surv, 6) ``` ```{r} ``` Next, we test for each remaining variables # Exercise 8 Estimate the survival function for specific covariate patterns. Based on the previous results choose the best predictors. Exercise 9 — Check the proportional hazards assumption. You may use the function cox.zph. Discuss the result and possible implications. H0: PH-assumption holds H1: PH-assumption doesn't hold ```{r} # different cox model!! cox.final <- coxph(Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor, data = dat) cox.zph(cox.final) cox.final ``` Exercise 10 — Plot the Schoenfeld residuals and comment. A non-random pattern or slope in a plot of scaled residuals against time indicates a violation, suggesting the covariate's impact changes, while a horizontal line around zero supports the assumption. ```{r} plot(cox.zph(cox.final)) ``` ```{r, fig.width=10, fig.height=10} plot(cox.zph(cox.final)) ``` NOTES: left censoring: event happened before time zero -> not applicable left truncation: not applicable?? right censoring: we saw this in the follow up times, administrative censoring high ! No left truncation, since our target population is patients who received a kidney transplant. So we don't care about patients who died before receiving a transplant. ! ADD: we used a Cox model but we already saw that the PH assumption doesn't hold ! ? remove year from prediction model, since it is not a patient characteristic and may not be stable for future predictions. It mainly captures changes in clinical practice rather than individual risk.