diff --git a/report.Rmd b/report.Rmd index f030138..89e97dc 100644 --- a/report.Rmd +++ b/report.Rmd @@ -1,593 +1,709 @@ ---- -title: foo -execute: - cache: true - freeze: auto -number-sections: true ---- - -```{r} -library(tidyverse) -library(dplyr) -library(ggplot2) -library(survival) -library(emmeans) -library(foreign) -# library(gtsummary) -``` - -```{r} -dat <- read.csv("./unos.txt", sep = "\t") -head(dat) -names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death", - "year", "sex", "tx.type", "follow.up") -``` - -Complete case - -```{r} -nrow(dat) -``` - - -```{r} -sapply(names(dat), function(col) { - is.na(dat[, col]) |> sum() -}) -``` - - - -# Exercise - -## Exercise 1 - -> Illustrate in a table the characteristics of the population (age, sex, race, -donor, . . . ). - -```{r} -g <- ggplot(dat) -``` - -```{r} -g + geom_point(aes(x = follow.up, y = death)) -g + geom_density(aes(x = follow.up)) - -g + geom_density(aes(x = age.donor)) -g + geom_bar(aes(x = age.rec)) - -g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor)) - -# dat$age.1 |> table() -``` - -```{r} -g + geom_boxplot(aes(x = year, y = follow.up, group = year)) -``` - -## Exercise 2 - -Plot the Kaplan-Meier overall survival curve for pediatric kid- -ney transplant recipients for the first 12 years after transplantation. - -```{r} -km <- survfit(Surv(follow.up, death) ~ 1, data = dat[dat$follow.up <= 12, ]) -plot(km) - - - -``` - -## Exercise 3 - -We are going to compare mortality rates (hazard functions) -between children whose transplanted kidney was provided by a living donor -(in general a family member) and those whose source was recently deceased -(variable donor type: `txtype`). Use the life table method to calculate the death -rates for the first 5 years for each group (take in the first year intervals of 4 -months and then look at each year) and show the results in a table. Estimate -the hazard ratio in each time interval as the ratio between the mortality rates -in the two groups. What do you notice? - -```{r} -dat.5 <- dat[dat$follow.up <= 5, ] -head(dat.5) -classify_time_interval = function(fu) { - if (fu <= 1/3) { - return(1/3) - } else if (fu <= 2/3) { - return(2/3) - } else if (fu <= 1) { - return(1) - } - ceiling(fu) -} - -dat.5$fu.interval <- sapply(dat.5$follow.up, classify_time_interval) -table(dat.5$fu.interval) -head(dat.5) -``` - -```{r} -dat.5.life <- dat.5 |> - group_by(fu.interval) |> - summarize( - n.censored = sum(death == 0), - n.event = sum(death), - n.at.risk = nrow(dat), - ) - -for (i in 2:nrow(dat.5.life)) { - j <- i - 1 - - n.censored.pre <- dat.5.life$n.censored[j] - n.event.pre <- dat.5.life$n.event[j] - n.at.risk.pre <- dat.5.life$n.at.risk[j] - - n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre - - dat.5.life$n.at.risk[i] <- n.at.risk -} - -print(nrow(dat)) -dat.5.life - -``` -n.removed.acc -```{r} -dat.5.life <- dat.5.life |> - mutate( - hazard.rate = n.event / n.at.risk - ) - -``` - ---- - - -```{r} -get_life_table = function(dat.sub, dat) { - dat.sub <- dat.sub |> - group_by(fu.interval) |> - summarize( - n.censored = sum(death == 0), - n.event = sum(death), - n.at.risk = nrow(dat), - ) - - for (i in 2:nrow(dat.sub)) { - j <- i - 1 - - n.censored.pre <- dat.sub$n.censored[j] - n.event.pre <- dat.sub$n.event[j] - n.at.risk.pre <- dat.sub$n.at.risk[j] - - n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre - - dat.sub$n.at.risk[i] <- n.at.risk - } - - dat.sub <- dat.sub |> - mutate( - hazard.rate = n.event / n.at.risk - ) - - return(dat.sub) -} -``` - -```{r} -dat.5.tx0 = dat.5[dat.5$tx.type == 0, ] -dat.5.tx1 = dat.5[dat.5$tx.type == 1, ] -``` - -```{r} -tx0.life <- get_life_table(dat.5.tx0, dat[dat$tx.type == 0, ]) -tx0.life -``` - -```{r} -tx1.life <- get_life_table(dat.5.tx1, dat[dat$tx.type == 1, ]) -tx1.life -``` - - -```{r} -tx1.life$hazard.rate / tx0.life$hazard.rate -``` - -```{r} -hazard.df <- data.frame( - fu.interval = tx1.life$fu.interval, - hazard.rate.0 = tx0.life$hazard.rate, - hazard.rate.1 = tx1.life$hazard.rate, - hazard.ratio = tx1.life$hazard.rate / tx0.life$hazard.rate -) - -ggplot(hazard.df, aes(x = fu.interval)) + - geom_line(aes(y = hazard.rate.0), color = "blue") + - geom_line(aes(y = hazard.rate.1), color = "orange") - -ggplot(hazard.df, aes(x = fu.interval)) + - geom_line(aes(y = hazard.ratio)) - -``` - ---- - -General Solution: - -```{r} -get.life.table <- function(dat, time.intervals) { - n.pop <- nrow(dat) - - dat |> - recode.dat(time.intervals) |> - group_by(fu.interval) |> - summarize( - n.censored = sum(.data$death == 0), - n.event = sum(.data$death), - ) |> - ungroup() |> - calculate.hazard(n.pop) -} - -get.life.table.by.groups <- function(dat, time.intervals, grps) { - grps |> - lapply(function(grp) { - dat |> - get.life.table.by.group(time.intervals, grp) |> - mutate( - grp.name = grp, - grp.value = pick(1)[[1]] - ) |> - select(-1) - }) |> - bind_rows() -} - -get.life.table.by.group <- function(dat, time.intervals, grp) { - dat |> - recode.dat(time.intervals) |> - group_by(fu.interval, .data[[grp]]) |> - summarize( - n.censored = sum(.data$death == 0), - n.event = sum(.data$death), - .groups = "keep" - ) |> - ungroup(fu.interval) |> - group_modify(function(df.sub, grp) { - grp.name <- names(grp) - grp.value <- grp[[1]] - n.pop <- (dat[[grp.name]] == grp.value) |> sum() - calculate.hazard(df.sub, n.pop) - }) |> - ungroup() -} - -calculate.hazard <- function(life.table, n.pop) { - n.removed <- life.table$n.event + life.table$n.censored - n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)]) - life.table |> - mutate( - n.at.risk = n.pop - n.removed.accum, - # TODO: how to account for censored? How do we adjust for uneven interval? - hazard.rate = n.event / n.at.risk - ) -} - -recode.dat <- function(dat, time.intervals) { - df <- dat[dat$follow.up <= sum(time.intervals), ] - time.points <- cumsum(time.intervals) - df$fu.interval <- sapply(df$follow.up, function(time) { - time.points[time <= time.points][1] - }) - - df -} - - - -``` - -```{r} -time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1) -grps <- c("tx.type", "sex", "hla.match", "cold.isc") - -get.life.table.by.groups(dat, time.intervals, grps) -``` - -```{r} -grps <- c("tx.type", "sex", "hla.match", "cold.isc") -grps <- c("tx.type") - - -plot.hazard.rate.by.groups <- function(dat, time.intervals, grps) { - dat |> - get.life.table.by.groups(time.intervals, grps) |> - ggplot(aes(x = fu.interval, y = hazard.rate, - color = paste(grp.name, grp.value, sep = " = "))) + - geom_point() + - geom_line() + - labs( - color = "Group" - ) + - theme( - legend.position = "bottom" - ) -} - -``` -```{r} -time.intervals <- c(rep(1/3, 3), rep(1, 4)) -plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) -``` - -```{r} -time.intervals <- rep(1, 5) -plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) -``` - -```{r} -time.intervals <- rep(1/3, 15) -plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) -``` - -```{r} -time.intervals <- c(rep(1/3, 3*3), rep(1, 2)) -plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) -``` - -```{r} -time.intervals <- c(rep(1/4, 4), rep(1, 4)) -plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) -``` - -```{r} -time.intervals <- c(rep(1/4, 4*2), rep(1, 3)) -plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) -``` - -```{r} -time.intervals <- c(rep(1/4, 4*2), rep(1/3, 3*3), rep(1, 6)) -plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) -``` - ---- - -Compare - -```{r} -time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1) -get.life.table.by.groups(dat, time.intervals, grps) -``` - -```{r} -hazard.df -``` - - - -## Exercise 4 - -Show a plot with Kaplan-Meier survival curves for the two donor types. - -```{r} -km.tx <- survfit(Surv(follow.up, death) ~ tx.type, data = dat[dat$follow.up <= 12, ]) - -plot(km.tx, col = c("blue", "orange")) -legend(legend = c("cadaveric", "living"), "bottomleft", lwd = 2, col = c("blue", "orange")) -``` - -## Exercise 5 - -Fit a univariate Cox model with predictor donor type. Report -the hazard ratio and 95% confidence interval and interpret the result obtained. - -```{r} -cox.tx <- coxph(Surv(follow.up, death) ~ tx.type, data = dat) -summary(cox.tx) -``` - -```{r} -log.hazard.ratio.hat <- cox.tx$coefficient -hazard.ratio.hat <- log.hazard.ratio.hat |> exp() - -``` - -```{r} -log.hazard.ratio.se <- summary(cox.tx)$coefficients[3] -log.hazard.ratio.hat.lower.bound <- log.hazard.ratio.hat - qnorm(0.975) * log.hazard.ratio.se -log.hazard.ratio.hat.upper.bound <- log.hazard.ratio.hat + qnorm(0.975) * log.hazard.ratio.se - -c(log.hazard.ratio.hat.lower.bound, log.hazard.ratio.hat.upper.bound) |> exp() -``` - -::: {.callout-note} -Patients receiving kidney transplants from living donors (`tx.type = 1`) has -1.91 higher hazard rate than those receiving from cadaveric donors -(`tx.type = 0`). - -Since 1 is not in the 95% confidence interval, the result is statistically -significant. -::: - - -# Exercise 6 - -Research shows that an important determinant of mortality -after kidney transplant is the age of the recipient. Fit a Cox model with age -as predictor and estimate the hazard ratio and its confidence interval. Consider -age first as continuous variable and then divide into categories. - -## Continuous - -```{r} -cox.age.rec <- coxph(Surv(follow.up, death) ~ age.rec, data = dat) -cox.age.rec |> summary() -``` - -## Categorical (Case 1) - -```{r} -recode.age <- function(dat, ages) { - df$age.group <- sapply(df$follow.up, function(time) { - time.points[time <= time.points][1] - }) - - df -} -``` - -```{r} -dat.age.rec <- dat[!is.na(dat$age.rec), ] -class.age <- function(age) { - if (age < 2) { - return("0,1") - } else if (age < 6) { - return("2,3,4,5") - } else if (age < 11) { - return("6,7,8,9,10") - } else if (age < 19) { - return("11-18") - } -} -``` - -```{r} -dat.age.rec$age.group <- sapply(dat.age.rec$age.rec, class.age) |> - factor(levels = c( - "0,1", - "2,3,4,5", - "6,7,8,9,10", - "11-18" - )) - - -cox.age.rec <- coxph(Surv(follow.up, death) ~ age.group, data = dat.age.rec) -cox.age.rec |> summary() -``` - -```{r} -emmeans(cox.age.rec, pairwise ~ age.group, type = "response") -``` - - -# Exercise 7 - -Fit a multivariate Cox model by using other predictors and -describe your results. - - -```{r} -sapply(names(dat), function(col) { - is.na(dat[, col]) |> sum() -}) -``` - -```{r} -names(dat) -``` - -```{r} -Waldtest <- function(cox.fit, i.betas){ - q <- length(i.betas) - coefs <- cox.fit$coefficients - # print(cox.fit$var) - var <- cox.fit$var[i.betas, i.betas] - Wald <- coefs[i.betas]%*%solve(var)%*%coefs[i.betas] - p.value <- 1-pchisq(as.numeric(Wald), df = q) - return(cbind(Wald, p.value)) -} -``` - - -```{r} -fit.cox.model <- function(surv, i.betas) { - fit.tx <- coxph(surv, data = dat) - Waldtest(fit.tx, i.betas) -} -``` - -Start with `tx.type` - -```{r} -surv <- Surv(follow.up, death) ~ tx.type + age.rec -fit.cox.model(surv, 1:2) # NOTE: Why is this 1??? -``` - -Iteration 1: `tx.type + age + ?` - -```{r} -names(dat) -``` - -```{r} -surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match -fit.cox.model(surv, 3) -surv <- Surv(follow.up, death) ~ tx.type + age.rec + age.donor -fit.cox.model(surv, 3) -surv <- Surv(follow.up, death) ~ tx.type + age.rec + cold.isc -fit.cox.model(surv, 3) -surv <- Surv(follow.up, death) ~ tx.type + age.rec + year -fit.cox.model(surv, 3) -surv <- Surv(follow.up, death) ~ tx.type + age.rec + sex -fit.cox.model(surv, 3) -``` - -Add `hla.match` - -Iteration 2: `tx.type + age.rec + hla.match + ?` - -```{r} -surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor -fit.cox.model(surv, 4) -surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + cold.isc -fit.cox.model(surv, 4) -surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + year -fit.cox.model(surv, 4) -surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + sex -fit.cox.model(surv, 4) -``` - -Add `age.donor` - -Iteration 2: `tx.type + age.rec + hla.match + age.donor + ?` - -```{r} -surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + - cold.isc -fit.cox.model(surv, 5) -surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + - year -fit.cox.model(surv, 5) -surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + - sex -fit.cox.model(surv, 5) -``` - -Add `year` - -```{r} -surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + - year + cold.isc -fit.cox.model(surv, 6) -surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + - year + sex -fit.cox.model(surv, 6) -``` - - - -Next, we test for each remaining variables - -# Exercise 8 - -Estimate the survival function for specific covariate patterns. -Based on the previous results choose the best predictors. - -Exercise 9 — Check the proportional hazards assumption. You may use the -function cox.zph. Discuss the result and possible implications. - -Exercise 10 — Plot the Schoenfeld residuals and comment. - +--- +title: foo +execute: + cache: true + freeze: auto +number-sections: true +--- + +```{r} +library(tidyverse) +library(dplyr) +library(ggplot2) +library(survival) +library(emmeans) +library(foreign) +library(gtsummary) +``` + +```{r} +dat <- read.csv("./unos.txt", sep = "\t") +head(dat) +names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death", + "year", "sex", "tx.type", "follow.up") +``` + +Complete case + +```{r} +nrow(dat) +``` + + +```{r} +sapply(names(dat), function(col) { + is.na(dat[, col]) |> sum() +}) +``` + + + +# Exercise + +## Exercise 1 + +> Illustrate in a table the characteristics of the population (age, sex, race, +donor, . . . ). + +```{r} +hist(dat$age.donor) +hist(dat$age.rec) +hist(dat$cold.isc) +hist(dat$year) +hist(dat$follow.up) + +unique(dat$hla.match) +unique(dat$year) +``` +Change the order!! + +```{r} +dat.table1 <- dat |> + mutate( + sex = factor(sex, levels = c(0,1), labels = c("Female","Male")), + tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")), + hla.match = factor(hla.match), + year = factor(year) + ) +``` + +```{r} +table1 <- dat.table1 |> + select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |> + tbl_summary( + statistic = list( + all_continuous() ~ "{median} ({p25}, {p75})" + ), + label = list( + hla.match ~ "HLA matches, n(%)", + age.donor ~ "Donor age, median (IQR)", + age.rec ~ "Recipient age, median (IQR)", + cold.isc ~ "Cold ischemic time (hours), median (IQR), ", + year ~ "Year of transplant", + sex ~ "Sex, n(%)", + tx.type ~ "Transplant type, n(%)" + ), + missing = "ifany" + ) |> + modify_footnote(all_stat_cols() ~ NA) + +table1 +``` + +```{r} +table.split <- dat.table1 |> + select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |> + tbl_summary( + by = tx.type, + statistic = list( + all_continuous() ~ "{median} ({p25}, {p75})" + ), + label = list( + hla.match ~ "HLA matches, n(%)", + age.donor ~ "Donor age, median (IQR)", + age.rec ~ "Recipient age, median (IQR)", + cold.isc ~ "Cold ischemic time (hours), median (IQR), ", + year ~ "Year of transplant", + sex ~ "Sex, n(%)" + ), + missing = "ifany" + ) |> + add_overall() |> + modify_footnote(all_stat_cols() ~ NA) + +table.split +``` + + +Calculate median follow-up (reverse Kaplan-Meier method), with 95% CI +```{r} +rev_km <- survfit(Surv(follow.up, death == 0) ~ 1, data = dat) + +summary(rev_km)$table + +median_followup <- summary(rev_km)$table["median"] +confidence_interval <- c(summary(rev_km)$table["0.95LCL"], summary(rev_km)$table["0.95UCL"]) +median_followup +confidence_interval +``` + + +```{r} +g <- ggplot(dat) +``` + +```{r} +g + geom_point(aes(x = follow.up, y = death)) +g + geom_density(aes(x = follow.up)) + +g + geom_density(aes(x = age.donor)) +g + geom_bar(aes(x = age.rec)) + +g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor)) + +# dat$age.1 |> table() +``` + +```{r} +g + geom_boxplot(aes(x = year, y = follow.up, group = year)) +``` + +## Exercise 2 + +Plot the Kaplan-Meier overall survival curve for pediatric kid- +ney transplant recipients for the first 12 years after transplantation. + +!!!!!!!! just set xlim to 12, do not remove those individuals !!!!!!! + +```{r} +km <- survfit(Surv(follow.up, death) ~ 1, data = dat[dat$follow.up <= 12, ]) +plot(km) +``` + +## Exercise 3 + +We are going to compare mortality rates (hazard functions) +between children whose transplanted kidney was provided by a living donor +(in general a family member) and those whose source was recently deceased +(variable donor type: `txtype`). Use the life table method to calculate the death +rates for the first 5 years for each group (take in the first year intervals of 4 +months and then look at each year) and show the results in a table. Estimate +the hazard ratio in each time interval as the ratio between the mortality rates +in the two groups. What do you notice? + +```{r} +dat.5 <- dat[dat$follow.up <= 5, ] +head(dat.5) +classify_time_interval = function(fu) { + if (fu <= 1/3) { + return(1/3) + } else if (fu <= 2/3) { + return(2/3) + } else if (fu <= 1) { + return(1) + } + ceiling(fu) +} + +dat.5$fu.interval <- sapply(dat.5$follow.up, classify_time_interval) +table(dat.5$fu.interval) +head(dat.5) +``` + +```{r} +dat.5.life <- dat.5 |> + group_by(fu.interval) |> + summarize( + n.censored = sum(death == 0), + n.event = sum(death), + n.at.risk = nrow(dat), + ) + +for (i in 2:nrow(dat.5.life)) { + j <- i - 1 + + n.censored.pre <- dat.5.life$n.censored[j] + n.event.pre <- dat.5.life$n.event[j] + n.at.risk.pre <- dat.5.life$n.at.risk[j] + + n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre + + dat.5.life$n.at.risk[i] <- n.at.risk +} + +print(nrow(dat)) +dat.5.life + +``` +n.removed.acc +```{r} +dat.5.life <- dat.5.life |> + mutate( + hazard.rate = n.event / n.at.risk + ) + +``` + +--- + + +```{r} +get_life_table = function(dat.sub, dat) { + dat.sub <- dat.sub |> + group_by(fu.interval) |> + summarize( + n.censored = sum(death == 0), + n.event = sum(death), + n.at.risk = nrow(dat), + ) + + for (i in 2:nrow(dat.sub)) { + j <- i - 1 + + n.censored.pre <- dat.sub$n.censored[j] + n.event.pre <- dat.sub$n.event[j] + n.at.risk.pre <- dat.sub$n.at.risk[j] + + n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre + + dat.sub$n.at.risk[i] <- n.at.risk + } + + dat.sub <- dat.sub |> + mutate( + hazard.rate = n.event / n.at.risk + ) + + return(dat.sub) +} +``` + +```{r} +dat.5.tx0 = dat.5[dat.5$tx.type == 0, ] +dat.5.tx1 = dat.5[dat.5$tx.type == 1, ] +``` + +```{r} +tx0.life <- get_life_table(dat.5.tx0, dat[dat$tx.type == 0, ]) +tx0.life +``` + +```{r} +tx1.life <- get_life_table(dat.5.tx1, dat[dat$tx.type == 1, ]) +tx1.life +``` + + +```{r} +tx1.life$hazard.rate / tx0.life$hazard.rate +``` + +```{r} +hazard.df <- data.frame( + fu.interval = tx1.life$fu.interval, + hazard.rate.0 = tx0.life$hazard.rate, + hazard.rate.1 = tx1.life$hazard.rate, + hazard.ratio = tx1.life$hazard.rate / tx0.life$hazard.rate +) + +ggplot(hazard.df, aes(x = fu.interval)) + + geom_line(aes(y = hazard.rate.0), color = "blue") + + geom_line(aes(y = hazard.rate.1), color = "orange") + +ggplot(hazard.df, aes(x = fu.interval)) + + geom_line(aes(y = hazard.ratio)) + +``` + +--- + +General Solution: + +```{r} +get.life.table <- function(dat, time.intervals) { + n.pop <- nrow(dat) + + dat |> + recode.dat(time.intervals) |> + group_by(fu.interval) |> + summarize( + n.censored = sum(.data$death == 0), + n.event = sum(.data$death), + ) |> + ungroup() |> + calculate.hazard(n.pop) +} + +get.life.table.by.groups <- function(dat, time.intervals, grps) { + grps |> + lapply(function(grp) { + dat |> + get.life.table.by.group(time.intervals, grp) |> + mutate( + grp.name = grp, + grp.value = pick(1)[[1]] + ) |> + select(-1) + }) |> + bind_rows() +} + +get.life.table.by.group <- function(dat, time.intervals, grp) { + dat |> + recode.dat(time.intervals) |> + group_by(fu.interval, .data[[grp]]) |> + summarize( + n.censored = sum(.data$death == 0), + n.event = sum(.data$death), + .groups = "keep" + ) |> + ungroup(fu.interval) |> + group_modify(function(df.sub, grp) { + grp.name <- names(grp) + grp.value <- grp[[1]] + n.pop <- (dat[[grp.name]] == grp.value) |> sum() + calculate.hazard(df.sub, n.pop) + }) |> + ungroup() +} + +calculate.hazard <- function(life.table, n.pop) { + n.removed <- life.table$n.event + life.table$n.censored + n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)]) + life.table |> + mutate( + n.at.risk = n.pop - n.removed.accum, + # TODO: how to account for censored? How do we adjust for uneven interval? + hazard.rate = n.event / n.at.risk + ) +} + +recode.dat <- function(dat, time.intervals) { + df <- dat[dat$follow.up <= sum(time.intervals), ] + time.points <- cumsum(time.intervals) + df$fu.interval <- sapply(df$follow.up, function(time) { + time.points[time <= time.points][1] + }) + + df +} + + + +``` + +```{r} +time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1) +grps <- c("tx.type", "sex", "hla.match", "cold.isc") + +get.life.table.by.groups(dat, time.intervals, grps) +``` + +```{r} +grps <- c("tx.type", "sex", "hla.match", "cold.isc") +grps <- c("tx.type") + + +plot.hazard.rate.by.groups <- function(dat, time.intervals, grps) { + dat |> + get.life.table.by.groups(time.intervals, grps) |> + ggplot(aes(x = fu.interval, y = hazard.rate, + color = paste(grp.name, grp.value, sep = " = "))) + + geom_point() + + geom_line() + + labs( + color = "Group" + ) + + theme( + legend.position = "bottom" + ) +} + +``` +```{r} +time.intervals <- c(rep(1/3, 3), rep(1, 4)) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +```{r} +time.intervals <- rep(1, 5) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +```{r} +time.intervals <- rep(1/3, 15) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +```{r} +time.intervals <- c(rep(1/3, 3*3), rep(1, 2)) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +```{r} +time.intervals <- c(rep(1/4, 4), rep(1, 4)) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +```{r} +time.intervals <- c(rep(1/4, 4*2), rep(1, 3)) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +```{r} +time.intervals <- c(rep(1/4, 4*2), rep(1/3, 3*3), rep(1, 6)) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +--- + +Compare + +```{r} +time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1) +get.life.table.by.groups(dat, time.intervals, grps) +``` + +```{r} +hazard.df +``` + + + +## Exercise 4 + +Show a plot with Kaplan-Meier survival curves for the two donor types. + +```{r} +km.tx <- survfit(Surv(follow.up, death) ~ tx.type, data = dat[dat$follow.up <= 12, ]) + +plot(km.tx, col = c("blue", "orange")) +legend(legend = c("cadaveric", "living"), "bottomleft", lwd = 2, col = c("blue", "orange")) +``` + +## Exercise 5 + +Fit a univariate Cox model with predictor donor type. Report +the hazard ratio and 95% confidence interval and interpret the result obtained. + +```{r} +cox.tx <- coxph(Surv(follow.up, death) ~ tx.type, data = dat) +summary(cox.tx) +``` + +```{r} +log.hazard.ratio.hat <- cox.tx$coefficient +hazard.ratio.hat <- log.hazard.ratio.hat |> exp() + +``` + +```{r} +log.hazard.ratio.se <- summary(cox.tx)$coefficients[3] +log.hazard.ratio.hat.lower.bound <- log.hazard.ratio.hat - qnorm(0.975) * log.hazard.ratio.se +log.hazard.ratio.hat.upper.bound <- log.hazard.ratio.hat + qnorm(0.975) * log.hazard.ratio.se + +c(log.hazard.ratio.hat.lower.bound, log.hazard.ratio.hat.upper.bound) |> exp() +``` + +::: {.callout-note} +Patients receiving kidney transplants from living donors (`tx.type = 1`) has +1.91 higher hazard rate than those receiving from cadaveric donors +(`tx.type = 0`). + +Since 1 is not in the 95% confidence interval, the result is statistically +significant. +::: + + +# Exercise 6 + +Research shows that an important determinant of mortality +after kidney transplant is the age of the recipient. Fit a Cox model with age +as predictor and estimate the hazard ratio and its confidence interval. Consider +age first as continuous variable and then divide into categories. + +## Continuous + +```{r} +cox.age.rec <- coxph(Surv(follow.up, death) ~ age.rec, data = dat) +cox.age.rec |> summary() +``` + +## Categorical (Case 1) + +```{r} +recode.age <- function(dat, ages) { + df$age.group <- sapply(df$follow.up, function(time) { + time.points[time <= time.points][1] + }) + + df +} +``` + +```{r} +dat.age.rec <- dat[!is.na(dat$age.rec), ] +class.age <- function(age) { + if (age < 2) { + return("0,1") + } else if (age < 6) { + return("2,3,4,5") + } else if (age < 11) { + return("6,7,8,9,10") + } else if (age < 19) { + return("11-18") + } +} +``` + +```{r} +dat.age.rec$age.group <- sapply(dat.age.rec$age.rec, class.age) |> + factor(levels = c( + "0,1", + "2,3,4,5", + "6,7,8,9,10", + "11-18" + )) + + +cox.age.rec <- coxph(Surv(follow.up, death) ~ age.group, data = dat.age.rec) +cox.age.rec |> summary() +``` + +```{r} +emmeans(cox.age.rec, pairwise ~ age.group, type = "response") +``` + + +# Exercise 7 + +Fit a multivariate Cox model by using other predictors and +describe your results. + + +```{r} +sapply(names(dat), function(col) { + is.na(dat[, col]) |> sum() +}) +``` + +```{r} +names(dat) +``` + +```{r} +Waldtest <- function(cox.fit, i.betas){ + q <- length(i.betas) + coefs <- cox.fit$coefficients + # print(cox.fit$var) + var <- cox.fit$var[i.betas, i.betas] + Wald <- coefs[i.betas]%*%solve(var)%*%coefs[i.betas] + p.value <- 1-pchisq(as.numeric(Wald), df = q) + return(cbind(Wald, p.value)) +} +``` + + +```{r} +fit.cox.model <- function(surv, i.betas) { + fit.tx <- coxph(surv, data = dat) + Waldtest(fit.tx, i.betas) +} +``` + +Start with `tx.type` + +```{r} +surv <- Surv(follow.up, death) ~ tx.type + age.rec +fit.cox.model(surv, 1:2) # NOTE: Why is this 1??? +``` + +Iteration 1: `tx.type + age + ?` + +```{r} +names(dat) +``` + +```{r} +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match +fit.cox.model(surv, 3) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + age.donor +fit.cox.model(surv, 3) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + cold.isc +fit.cox.model(surv, 3) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + year +fit.cox.model(surv, 3) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + sex +fit.cox.model(surv, 3) +``` + +Add `hla.match` + +Iteration 2: `tx.type + age.rec + hla.match + ?` + +```{r} +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +fit.cox.model(surv, 4) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + cold.isc +fit.cox.model(surv, 4) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + year +fit.cox.model(surv, 4) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + sex +fit.cox.model(surv, 4) +``` + +Add `age.donor` + +Iteration 2: `tx.type + age.rec + hla.match + age.donor + ?` + +```{r} +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + + cold.isc +fit.cox.model(surv, 5) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + + year +fit.cox.model(surv, 5) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + + sex +fit.cox.model(surv, 5) +``` + +Add `year` + +```{r} +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + + year + cold.isc +fit.cox.model(surv, 6) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + + year + sex +fit.cox.model(surv, 6) +``` + +```{r} + +``` + + +Next, we test for each remaining variables + +# Exercise 8 + +Estimate the survival function for specific covariate patterns. +Based on the previous results choose the best predictors. + +Exercise 9 — Check the proportional hazards assumption. You may use the +function cox.zph. Discuss the result and possible implications. + +H0: PH-assumption holds +H1: PH-assumption doesn't hold + +```{r} +# different cox model!! +cox.final <- coxph(Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor, data = dat) +cox.zph(cox.final) +``` + + +Exercise 10 — Plot the Schoenfeld residuals and comment. +A non-random pattern or slope in a plot of scaled residuals against time indicates a violation, suggesting the covariate's impact changes, while a horizontal line around zero supports the assumption. + +```{r} +plot(cox.zph(cox.final)) +``` + + + +NOTES: +left censoring: event happened before time zero -> not applicable +left truncation: not applicable?? +right censoring: we saw this in the follow up times, administrative censoring high + +! No left truncation, since our target population is patients who received a +kidney transplant. So we don't care about patients who died before receiving +a transplant. + +! ADD: we used a Cox model but we already saw that the PH assumption doesn't hold ! + +? remove year from prediction model, since it is not a patient characteristic and may not be stable for future predictions. It mainly captures changes in clinical practice rather than individual risk. + + diff --git a/report.html b/report.html index 4c9fd4b..8afe5ee 100644 --- a/report.html +++ b/report.html @@ -99,24 +99,17 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
-
library(tidyverse)
-
-
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
-✔ dplyr     1.1.4     ✔ readr     2.1.5
-✔ forcats   1.0.0     ✔ stringr   1.5.1
-✔ lubridate 1.9.4     ✔ tibble    3.3.0
-✔ purrr     1.1.0     ✔ tidyr     1.3.1
-── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
-✖ dplyr::filter() masks stats::filter()
-✖ dplyr::lag()    masks stats::lag()
-ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
-
-
library(survival)
-# library(gtsummary)
+
library(tidyverse)
+library(dplyr)
+library(ggplot2)
+library(survival)
+library(emmeans)
+library(foreign)
+library(gtsummary)
-
dat <- read.csv("./unos.txt", sep = "\t")
-head(dat)
+
dat <- read.csv("./unos.txt", sep = "\t")
+head(dat)
  hlamat age age.1 cold_isc death year sex txtype fu
 1      2  25    15        6     0 2002   0      1  0
@@ -126,10 +119,26 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
 5      2  17    12       28     1 1997   1      1  0
 6      3  44    13        1     0 2002   1      0  0
+
names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
+                "year", "sex", "tx.type", "follow.up")
+
+

Complete case

+
+
nrow(dat)
+
+
[1] 9775
+
-
names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
-                "year", "sex", "tx.type", "follow.up")
+
sapply(names(dat), function(col) {
+  is.na(dat[, col]) |> sum()
+})
+
+
hla.match age.donor   age.rec  cold.isc     death      year       sex   tx.type 
+      234       113         9      2250         0         0         0         0 
+follow.up 
+        0 
+

1 Exercise

@@ -139,10 +148,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin

Illustrate in a table the characteristics of the population (age, sex, race, donor, . . . ).

-
g <- ggplot(dat)
-
-
-
g + geom_point(aes(x = follow.up, y = death))
+
hist(dat$age.donor)
@@ -150,7 +156,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
-
g +  geom_density(aes(x = follow.up))
+
hist(dat$age.rec)
@@ -158,11 +164,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
-
g +  geom_density(aes(x = age.donor))
-
-
Warning: Removed 113 rows containing non-finite outside the scale range
-(`stat_density()`).
-
+
hist(dat$cold.isc)
@@ -170,11 +172,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
-
g +  geom_bar(aes(x = age.rec))
-
-
Warning: Removed 9 rows containing non-finite outside the scale range
-(`stat_count()`).
-
+
hist(dat$year)
@@ -182,7 +180,1435 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
-
g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor))
+
hist(dat$follow.up)
+
+
+
+

+
+
+
+
unique(dat$hla.match)
+
+
[1]  2  1  0  3  4 NA  5  6
+
+
unique(dat$year)
+
+
 [1] 2002 1999 1997 2001 1994 1995 1996 1993 1990 1991 1992 2000 1998
+
+
+

Change the order!!

+
+
dat.table1 <- dat |> 
+  mutate(
+    sex = factor(sex, levels = c(0,1), labels = c("Female","Male")),
+    tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")),
+    hla.match = factor(hla.match),
+    year = factor(year)
+  )
+
+
+
table1 <- dat.table1 |> 
+  select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |> 
+  tbl_summary(
+    statistic = list(
+      all_continuous() ~ "{median} ({p25}, {p75})"
+    ),
+    label = list(
+      hla.match ~ "HLA matches, n(%)",
+      age.donor ~ "Donor age, median (IQR)",
+      age.rec ~ "Recipient age, median (IQR)",
+      cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
+      year ~ "Year of transplant",
+      sex ~ "Sex, n(%)",
+      tx.type ~ "Transplant type, n(%)"
+    ),
+    missing = "ifany"
+  ) |> 
+  modify_footnote(all_stat_cols() ~ NA)
+
+table1
+
+
+ + + ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CharacteristicN = 9,775
HLA matches, n(%)
+
    0784 (8.2%)
    11,504 (16%)
    21,553 (16%)
    33,778 (40%)
    41,278 (13%)
    5365 (3.8%)
    6279 (2.9%)
    Unknown234
Donor age, median (IQR)33 (21, 41)
    Unknown113
Recipient age, median (IQR)13 (8, 16)
    Unknown9
Cold ischemic time (hours), median (IQR),7 (1, 19)
    Unknown2,250
Year of transplant
+
    1990728 (7.4%)
    1991734 (7.5%)
    1992666 (6.8%)
    1993736 (7.5%)
    1994787 (8.1%)
    1995813 (8.3%)
    1996789 (8.1%)
    1997745 (7.6%)
    1998740 (7.6%)
    1999842 (8.6%)
    2000706 (7.2%)
    2001834 (8.5%)
    2002655 (6.7%)
Sex, n(%)
+
    Female4,014 (41%)
    Male5,761 (59%)
Transplant type, n(%)
+
    Cadaveric5,148 (53%)
    Living4,627 (47%)
+ +
+
+
+
+
table.split <- dat.table1 |> 
+  select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |> 
+  tbl_summary(
+    by = tx.type,
+    statistic = list(
+      all_continuous() ~ "{median} ({p25}, {p75})"
+    ),
+    label = list(
+      hla.match ~ "HLA matches, n(%)",
+      age.donor ~ "Donor age, median (IQR)",
+      age.rec ~ "Recipient age, median (IQR)",
+      cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
+      year ~ "Year of transplant",
+      sex ~ "Sex, n(%)"
+    ),
+    missing = "ifany"
+  ) |> 
+  add_overall() |> 
+  modify_footnote(all_stat_cols() ~ NA)
+
+table.split
+
+
+ + + ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CharacteristicOverall
+N = 9,775
Cadaveric
+N = 5,148
Living
+N = 4,627
HLA matches, n(%)
+

+

+
    0784 (8.2%)125 (2.5%)659 (14%)
    11,504 (16%)160 (3.2%)1,344 (29%)
    21,553 (16%)228 (4.6%)1,325 (29%)
    33,778 (40%)3,039 (61%)739 (16%)
    41,278 (13%)1,007 (20%)271 (5.9%)
    5365 (3.8%)241 (4.9%)124 (2.7%)
    6279 (2.9%)153 (3.1%)126 (2.7%)
    Unknown23419539
Donor age, median (IQR)33 (21, 41)37 (31, 42)22 (15, 37)
    Unknown1131121
Recipient age, median (IQR)13 (8, 16)13 (7, 16)14 (9, 16)
    Unknown972
Cold ischemic time (hours), median (IQR),7 (1, 19)1 (0, 1)19 (13, 25)
    Unknown2,2501,490760
Year of transplant
+

+

+
    1990728 (7.4%)318 (6.2%)410 (8.9%)
    1991734 (7.5%)387 (7.5%)347 (7.5%)
    1992666 (6.8%)345 (6.7%)321 (6.9%)
    1993736 (7.5%)403 (7.8%)333 (7.2%)
    1994787 (8.1%)355 (6.9%)432 (9.3%)
    1995813 (8.3%)427 (8.3%)386 (8.3%)
    1996789 (8.1%)416 (8.1%)373 (8.1%)
    1997745 (7.6%)408 (7.9%)337 (7.3%)
    1998740 (7.6%)412 (8.0%)328 (7.1%)
    1999842 (8.6%)436 (8.5%)406 (8.8%)
    2000706 (7.2%)392 (7.6%)314 (6.8%)
    2001834 (8.5%)483 (9.4%)351 (7.6%)
    2002655 (6.7%)366 (7.1%)289 (6.2%)
Sex, n(%)
+

+

+
    Female4,014 (41%)2,105 (41%)1,909 (41%)
    Male5,761 (59%)3,043 (59%)2,718 (59%)
+ +
+
+
+

Calculate median follow-up (reverse Kaplan-Meier method), with 95% CI

+
+
rev_km <- survfit(Surv(follow.up, death == 0) ~ 1, data = dat)
+
+summary(rev_km)$table
+
+
     records        n.max      n.start       events        rmean    se(rmean) 
+9.775000e+03 9.775000e+03 9.775000e+03 9.310000e+03 4.055027e+00 3.202711e-02 
+      median      0.95LCL      0.95UCL 
+3.380822e+00 3.227397e+00 3.567123e+00 
+
+
median_followup <- summary(rev_km)$table["median"]
+confidence_interval <- c(summary(rev_km)$table["0.95LCL"], summary(rev_km)$table["0.95UCL"])
+median_followup
+
+
  median 
+3.380822 
+
+
confidence_interval
+
+
 0.95LCL  0.95UCL 
+3.227397 3.567123 
+
+
+
+
g <- ggplot(dat)
+
+
+
g + geom_point(aes(x = follow.up, y = death))
+
+
+
+

+
+
+
+
g +  geom_density(aes(x = follow.up))
+
+
+
+

+
+
+
+
g +  geom_density(aes(x = age.donor))
+
+
Warning: Removed 113 rows containing non-finite outside the scale range
+(`stat_density()`).
+
+
+
+
+

+
+
+
+
g +  geom_bar(aes(x = age.rec))
+
+
Warning: Removed 9 rows containing non-finite outside the scale range
+(`stat_count()`).
+
+
+
+
+

+
+
+
+
g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor))
Warning: Removed 113 rows containing missing values or values outside the scale range
 (`stat_boxplot()`).
@@ -194,23 +1620,34 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
-

+

+
+
+
+
# dat$age.1 |> table()
+
+
+
g + geom_boxplot(aes(x = year, y = follow.up, group = year))
+
+
+
+

-
# dat$age.1 |> table()

1.2 Exercise 2

Plot the Kaplan-Meier overall survival curve for pediatric kid- ney transplant recipients for the first 12 years after transplantation.

+

!!!!!!!! just set xlim to 12, do not remove those individuals !!!!!!!

-
km <- survfit(Surv(follow.up, death) ~ 1, data = dat[dat$follow.up  <= 12, ])
-plot(km)
+
km <- survfit(Surv(follow.up, death) ~ 1, data = dat[dat$follow.up  <= 12, ])
+plot(km)
-

+

@@ -220,8 +1657,8 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin

1.3 Exercise 3

We are going to compare mortality rates (hazard functions) between children whose transplanted kidney was provided by a living donor (in general a family member) and those whose source was recently deceased (variable donor type: txtype). Use the life table method to calculate the death rates for the first 5 years for each group (take in the first year intervals of 4 months and then look at each year) and show the results in a table. Estimate the hazard ratio in each time interval as the ratio between the mortality rates in the two groups. What do you notice?

-
dat.5 <- dat[dat$follow.up <= 5, ]
-head(dat.5)
+
dat.5 <- dat[dat$follow.up <= 5, ]
+head(dat.5)
  hla.match age.donor age.rec cold.isc death year sex tx.type follow.up
 1         2        25      15        6     0 2002   0       1         0
@@ -231,19 +1668,19 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
 5         2        17      12       28     1 1997   1       1         0
 6         3        44      13        1     0 2002   1       0         0
-
classify_time_interval = function(fu) {
-  if (fu <= 1/3) {
-    return(1/3)
-  } else if (fu <= 2/3) {
-    return(2/3)
-  } else if (fu <= 1) {
-    return(1)
-  }
-  ceiling(fu)
-}
-
-dat.5$fu.interval <- sapply(dat.5$follow.up, classify_time_interval)
-table(dat.5$fu.interval)
+
classify_time_interval = function(fu) {
+  if (fu <= 1/3) {
+    return(1/3)
+  } else if (fu <= 2/3) {
+    return(2/3)
+  } else if (fu <= 1) {
+    return(1)
+  }
+  ceiling(fu)
+}
+
+dat.5$fu.interval <- sapply(dat.5$follow.up, classify_time_interval)
+table(dat.5$fu.interval)

 0.333333333333333 0.666666666666667                 1                 2 
@@ -251,7 +1688,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
                 3                 4                 5 
              1184              1093               950 
-
head(dat.5)
+
head(dat.5)
  hla.match age.donor age.rec cold.isc death year sex tx.type follow.up
 1         2        25      15        6     0 2002   0       1         0
@@ -270,31 +1707,31 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
 
-
dat.5.life <- dat.5 |>
-  group_by(fu.interval) |>
-  summarize(
-    n.censored = sum(death == 0),
-    n.event = sum(death),
-    n.at.risk = nrow(dat),
-  )
-
-for (i in 2:nrow(dat.5.life)) {
-  j <- i - 1
-
-  n.censored.pre <- dat.5.life$n.censored[j]
-  n.event.pre <- dat.5.life$n.event[j]
-  n.at.risk.pre <- dat.5.life$n.at.risk[j]
-
-  n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
-
-  dat.5.life$n.at.risk[i] <- n.at.risk
-}
-
-print(nrow(dat))
+
dat.5.life <- dat.5 |>
+  group_by(fu.interval) |>
+  summarize(
+    n.censored = sum(death == 0),
+    n.event = sum(death),
+    n.at.risk = nrow(dat),
+  )
+
+for (i in 2:nrow(dat.5.life)) {
+  j <- i - 1
+
+  n.censored.pre <- dat.5.life$n.censored[j]
+  n.event.pre <- dat.5.life$n.event[j]
+  n.at.risk.pre <- dat.5.life$n.at.risk[j]
+
+  n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
+
+  dat.5.life$n.at.risk[i] <- n.at.risk
+}
+
+print(nrow(dat))
[1] 9775
-
dat.5.life
+
dat.5.life
# A tibble: 7 × 4
   fu.interval n.censored n.event n.at.risk
@@ -308,126 +1745,355 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
 7       5            919      31      4122
+

n.removed.acc

-
dat.5.life <- dat.5.life |>
-  mutate(
-     hazard.rate = n.event / n.at.risk
-  )
+
dat.5.life <- dat.5.life |>
+  mutate(
+     hazard.rate = n.event / n.at.risk
+  )

-
get_life_table = function(dat) {
-  dat <- dat |>
-    group_by(fu.interval) |>
-    summarize(
-      n.censored = sum(death == 0),
-      n.event = sum(death),
-      n.at.risk = nrow(dat),
-    )
-
-  for (i in 2:nrow(dat)) {
-    j <- i - 1
-
-    n.censored.pre <- dat$n.censored[j]
-    n.event.pre <- dat$n.event[j]
-    n.at.risk.pre <- dat$n.at.risk[j]
-
-    n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
-
-    dat$n.at.risk[i] <- n.at.risk
-  }
-
-  dat <- dat |>
-    mutate(
-       hazard.rate = n.event / n.at.risk
-    )
-
-  return(dat)
-}
+
get_life_table = function(dat.sub, dat) {
+  dat.sub <- dat.sub |>
+    group_by(fu.interval) |>
+    summarize(
+      n.censored = sum(death == 0),
+      n.event = sum(death),
+      n.at.risk = nrow(dat),
+    )
+
+  for (i in 2:nrow(dat.sub)) {
+    j <- i - 1
+
+    n.censored.pre <- dat.sub$n.censored[j]
+    n.event.pre <- dat.sub$n.event[j]
+    n.at.risk.pre <- dat.sub$n.at.risk[j]
+
+    n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
+
+    dat.sub$n.at.risk[i] <- n.at.risk
+  }
+
+  dat.sub <- dat.sub |>
+    mutate(
+       hazard.rate = n.event / n.at.risk
+    )
+
+  return(dat.sub)
+}
-
dat.5.tx0 = dat.5[dat.5$tx.type == 0, ]
-dat.5.tx1 = dat.5[dat.5$tx.type == 1, ]
+
dat.5.tx0 = dat.5[dat.5$tx.type == 0, ]
+dat.5.tx1 = dat.5[dat.5$tx.type == 1, ]
-
tx0.life <- get_life_table(dat.5.tx0)
-tx0.life
+
tx0.life <- get_life_table(dat.5.tx0, dat[dat$tx.type == 0, ])
+tx0.life
# A tibble: 7 × 5
   fu.interval n.censored n.event n.at.risk hazard.rate
         <dbl>      <int>   <int>     <int>       <dbl>
-1       0.333        463      49      3378     0.0145 
-2       0.667        253      13      2866     0.00454
-3       1            340      11      2600     0.00423
-4       2            591      24      2249     0.0107 
-5       3            568      17      1634     0.0104 
-6       4            521      16      1049     0.0153 
-7       5            501      11       512     0.0215 
+1 0.333 463 49 5148 0.00952 +2 0.667 253 13 4636 0.00280 +3 1 340 11 4370 0.00252 +4 2 591 24 4019 0.00597 +5 3 568 17 3404 0.00499 +6 4 521 16 2819 0.00568 +7 5 501 11 2282 0.00482
-
tx1.life <- get_life_table(dat.5.tx1)
-tx1.life
+
tx1.life <- get_life_table(dat.5.tx1, dat[dat$tx.type == 1, ])
+tx1.life
# A tibble: 7 × 5
   fu.interval n.censored n.event n.at.risk hazard.rate
         <dbl>      <int>   <int>     <int>       <dbl>
-1       0.333        367     103      3225     0.0319 
-2       0.667        238      24      2755     0.00871
-3       1            273      16      2493     0.00642
-4       2            573      38      2204     0.0172 
-5       3            575      24      1593     0.0151 
-6       4            532      24       994     0.0241 
-7       5            418      20       438     0.0457 
+1 0.333 367 103 4627 0.0223 +2 0.667 238 24 4157 0.00577 +3 1 273 16 3895 0.00411 +4 2 573 38 3606 0.0105 +5 3 575 24 2995 0.00801 +6 4 532 24 2396 0.0100 +7 5 418 20 1840 0.0109
-
tx1.life$hazard.rate / tx0.life$hazard.rate
+
tx1.life$hazard.rate / tx0.life$hazard.rate
-
[1] 2.201766 1.920536 1.516975 1.615661 1.448100 1.582998 2.125363
+
[1] 2.338731 2.058881 1.631929 1.764675 1.604557 1.764816 2.254941
-
hazard.df <- data.frame(
-  fu.interval = tx1.life$fu.interval,
-  hazard.rate.0 = tx0.life$hazard.rate,
-  hazard.rate.1 = tx1.life$hazard.rate,
-  hazard.ratio = tx1.life$hazard.rate / tx0.life$hazard.rate
-) 
-
-ggplot(hazard.df, aes(x = fu.interval)) +
-  geom_line(aes(y = hazard.rate.0), color = "blue") + 
-  geom_line(aes(y = hazard.rate.1), color = "orange")
+
hazard.df <- data.frame(
+  fu.interval = tx1.life$fu.interval,
+  hazard.rate.0 = tx0.life$hazard.rate,
+  hazard.rate.1 = tx1.life$hazard.rate,
+  hazard.ratio = tx1.life$hazard.rate / tx0.life$hazard.rate
+) 
+
+ggplot(hazard.df, aes(x = fu.interval)) +
+  geom_line(aes(y = hazard.rate.0), color = "blue") + 
+  geom_line(aes(y = hazard.rate.1), color = "orange")
-

+

-
ggplot(hazard.df, aes(x = fu.interval)) +
-  geom_line(aes(y = hazard.ratio))
+
ggplot(hazard.df, aes(x = fu.interval)) +
+  geom_line(aes(y = hazard.ratio))
-

+

+
+

General Solution:

+
+
get.life.table <- function(dat, time.intervals) {
+  n.pop <- nrow(dat)
+
+  dat |>
+    recode.dat(time.intervals) |>
+    group_by(fu.interval) |>
+    summarize(
+      n.censored = sum(.data$death == 0),
+      n.event = sum(.data$death),
+    ) |>
+    ungroup() |>
+    calculate.hazard(n.pop)
+}
+
+get.life.table.by.groups <- function(dat, time.intervals, grps) {
+  grps |>
+    lapply(function(grp) {
+      dat |>
+        get.life.table.by.group(time.intervals, grp) |>
+        mutate(
+          grp.name = grp,
+          grp.value = pick(1)[[1]]
+        ) |>
+        select(-1)
+    }) |>
+    bind_rows()
+}
+
+get.life.table.by.group <- function(dat, time.intervals, grp) {
+  dat |>
+    recode.dat(time.intervals) |>
+    group_by(fu.interval, .data[[grp]]) |>
+    summarize(
+      n.censored = sum(.data$death == 0),
+      n.event = sum(.data$death),
+      .groups = "keep"
+    ) |>
+    ungroup(fu.interval) |>
+    group_modify(function(df.sub, grp) {
+      grp.name <- names(grp)
+      grp.value <- grp[[1]]
+      n.pop <- (dat[[grp.name]] == grp.value) |> sum()
+      calculate.hazard(df.sub, n.pop)
+    }) |>
+    ungroup()
+}
+
+calculate.hazard <- function(life.table, n.pop) {
+  n.removed <- life.table$n.event + life.table$n.censored
+  n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)])
+  life.table |>
+    mutate(
+      n.at.risk = n.pop - n.removed.accum,
+      # TODO: how to account for censored? How do we adjust for uneven interval?
+      hazard.rate = n.event / n.at.risk
+    )
+}
+
+recode.dat <- function(dat, time.intervals) {
+  df <- dat[dat$follow.up <= sum(time.intervals), ]
+  time.points <- cumsum(time.intervals)
+  df$fu.interval <- sapply(df$follow.up, function(time) {
+      time.points[time <= time.points][1]
+  })
+
+  df
+}
+
+
+
time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
+grps <- c("tx.type", "sex", "hla.match", "cold.isc")
+
+get.life.table.by.groups(dat, time.intervals, grps)
+
+
# A tibble: 593 × 7
+   fu.interval n.censored n.event n.at.risk hazard.rate grp.name grp.value
+         <dbl>      <int>   <int>     <dbl>       <dbl> <chr>        <dbl>
+ 1       0.333        463      49      5148     0.00952 tx.type          0
+ 2       0.667        253      13      4636     0.00280 tx.type          0
+ 3       1            340      11      4370     0.00252 tx.type          0
+ 4       2            591      24      4019     0.00597 tx.type          0
+ 5       3            568      17      3404     0.00499 tx.type          0
+ 6       4            521      16      2819     0.00568 tx.type          0
+ 7       5            501      11      2282     0.00482 tx.type          0
+ 8       0.333        367     103      4627     0.0223  tx.type          1
+ 9       0.667        238      24      4157     0.00577 tx.type          1
+10       1            273      16      3895     0.00411 tx.type          1
+# ℹ 583 more rows
+
+
+
+
grps <- c("tx.type", "sex", "hla.match", "cold.isc")
+grps <- c("tx.type")
+
+
+plot.hazard.rate.by.groups <- function(dat, time.intervals, grps) {
+  dat |>
+    get.life.table.by.groups(time.intervals, grps) |>
+    ggplot(aes(x = fu.interval, y = hazard.rate,
+               color = paste(grp.name, grp.value, sep = " = "))) +
+    geom_point() +
+    geom_line() +
+    labs(
+      color = "Group"
+    ) +
+    theme(
+      legend.position = "bottom"
+    )
+}
+
+
+
time.intervals <- c(rep(1/3, 3), rep(1, 4))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+
+
+
+

+
+
+
+
+
+
time.intervals <- rep(1, 5) 
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+
+
+
+

+
+
+
+
+
+
time.intervals <- rep(1/3, 15)
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+
+
+
+

+
+
+
+
+
+
time.intervals <- c(rep(1/3, 3*3), rep(1, 2))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+
+
+
+

+
+
+
+
+
+
time.intervals <- c(rep(1/4, 4), rep(1, 4))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+
+
+
+

+
+
+
+
+
+
time.intervals <- c(rep(1/4, 4*2), rep(1, 3))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+
+
+
+

+
+
+
+
+
+
time.intervals <- c(rep(1/4, 4*2), rep(1/3, 3*3), rep(1, 6))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+
+
+
+

+
+
+
+
+
+

Compare

+
+
time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
+get.life.table.by.groups(dat, time.intervals, grps)
+
+
# A tibble: 14 × 7
+   fu.interval n.censored n.event n.at.risk hazard.rate grp.name grp.value
+         <dbl>      <int>   <int>     <dbl>       <dbl> <chr>        <int>
+ 1       0.333        463      49      5148     0.00952 tx.type          0
+ 2       0.667        253      13      4636     0.00280 tx.type          0
+ 3       1            340      11      4370     0.00252 tx.type          0
+ 4       2            591      24      4019     0.00597 tx.type          0
+ 5       3            568      17      3404     0.00499 tx.type          0
+ 6       4            521      16      2819     0.00568 tx.type          0
+ 7       5            501      11      2282     0.00482 tx.type          0
+ 8       0.333        367     103      4627     0.0223  tx.type          1
+ 9       0.667        238      24      4157     0.00577 tx.type          1
+10       1            273      16      3895     0.00411 tx.type          1
+11       2            573      38      3606     0.0105  tx.type          1
+12       3            575      24      2995     0.00801 tx.type          1
+13       4            532      24      2396     0.0100  tx.type          1
+14       5            418      20      1840     0.0109  tx.type          1
+
+
+
+
hazard.df
+
+
  fu.interval hazard.rate.0 hazard.rate.1 hazard.ratio
+1   0.3333333   0.009518260   0.022260644     2.338731
+2   0.6666667   0.002804142   0.005773394     2.058881
+3   1.0000000   0.002517162   0.004107831     1.631929
+4   2.0000000   0.005971635   0.010537992     1.764675
+5   3.0000000   0.004994125   0.008013356     1.604557
+6   4.0000000   0.005675772   0.010016694     1.764816
+7   5.0000000   0.004820333   0.010869565     2.254941
+
+

1.4 Exercise 4

Show a plot with Kaplan-Meier survival curves for the two donor types.

-
km.tx <- survfit(Surv(follow.up, death) ~ tx.type, data = dat[dat$follow.up  <= 12, ])
-
-plot(km.tx, col = c("blue", "orange"))
-legend(legend = c("cadaveric", "living"), "bottomleft", lwd = 2, col = c("blue", "orange"))
+
km.tx <- survfit(Surv(follow.up, death) ~ tx.type, data = dat[dat$follow.up  <= 12, ])
+
+plot(km.tx, col = c("blue", "orange"))
+legend(legend = c("cadaveric", "living"), "bottomleft", lwd = 2, col = c("blue", "orange"))
-

+

@@ -437,8 +2103,8 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin

1.5 Exercise 5

Fit a univariate Cox model with predictor donor type. Report the hazard ratio and 95% confidence interval and interpret the result obtained.

-
cox <- coxph(Surv(follow.up, death) ~ tx.type, data = dat)
-summary(cox)
+
cox.tx <- coxph(Surv(follow.up, death) ~ tx.type, data = dat)
+summary(cox.tx)
Call:
 coxph(formula = Surv(follow.up, death) ~ tx.type, data = dat)
@@ -460,27 +2126,369 @@ Score (logrank) test = 47.09  on 1 df,   p=7e-12
-
1.90539
+
log.hazard.ratio.hat <- cox.tx$coefficient
+hazard.ratio.hat <- log.hazard.ratio.hat |> exp()
+
+
+
log.hazard.ratio.se <- summary(cox.tx)$coefficients[3]
+log.hazard.ratio.hat.lower.bound <- log.hazard.ratio.hat - qnorm(0.975) * log.hazard.ratio.se
+log.hazard.ratio.hat.upper.bound <- log.hazard.ratio.hat + qnorm(0.975) * log.hazard.ratio.se
+
+c(log.hazard.ratio.hat.lower.bound, log.hazard.ratio.hat.upper.bound) |> exp()
-
[1] 1.90539
+
 tx.type  tx.type 
+1.579899 2.297932 
+
+
+
+
+
+ +
+
+Note +
+
+
+

Patients receiving kidney transplants from living donors (tx.type = 1) has 1.91 higher hazard rate than those receiving from cadaveric donors (tx.type = 0).

+

Since 1 is not in the 95% confidence interval, the result is statistically significant.

+
+
+
+ +
+

2 Exercise 6

+

Research shows that an important determinant of mortality after kidney transplant is the age of the recipient. Fit a Cox model with age as predictor and estimate the hazard ratio and its confidence interval. Consider age first as continuous variable and then divide into categories.

+
+

2.1 Continuous

+
+
cox.age.rec <- coxph(Surv(follow.up, death) ~ age.rec, data = dat)
+cox.age.rec |> summary()
+
+
Call:
+coxph(formula = Surv(follow.up, death) ~ age.rec, data = dat)
+
+  n= 9766, number of events= 464 
+   (9 observations deleted due to missingness)
+
+             coef exp(coef)  se(coef)      z Pr(>|z|)    
+age.rec -0.042550  0.958342  0.008434 -5.045 4.53e-07 ***
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+        exp(coef) exp(-coef) lower .95 upper .95
+age.rec    0.9583      1.043    0.9426    0.9743
+
+Concordance= 0.586  (se = 0.016 )
+Likelihood ratio test= 24.81  on 1 df,   p=6e-07
+Wald test            = 25.45  on 1 df,   p=5e-07
+Score (logrank) test = 25.75  on 1 df,   p=4e-07
+
+
+
+
+

2.2 Categorical (Case 1)

+
+
recode.age <- function(dat, ages) {
+  df$age.group <- sapply(df$follow.up, function(time) {
+      time.points[time <= time.points][1]
+  })
+
+  df
+}
+
+
+
dat.age.rec <- dat[!is.na(dat$age.rec), ]
+class.age <- function(age) {
+  if (age < 2) { 
+    return("0,1")
+  } else if (age < 6) {
+    return("2,3,4,5")
+  } else if (age < 11) {
+    return("6,7,8,9,10")
+  } else if (age < 19) {
+    return("11-18")
+  }
+}
+
+
+
dat.age.rec$age.group <- sapply(dat.age.rec$age.rec, class.age) |>
+  factor(levels = c(
+    "0,1",
+    "2,3,4,5",
+    "6,7,8,9,10",
+    "11-18"
+  ))
+
+
+cox.age.rec <- coxph(Surv(follow.up, death) ~ age.group, data = dat.age.rec)
+cox.age.rec |> summary()
+
+
Call:
+coxph(formula = Surv(follow.up, death) ~ age.group, data = dat.age.rec)
+
+  n= 9766, number of events= 464 
+
+                       coef exp(coef) se(coef)      z Pr(>|z|)    
+age.group2,3,4,5    -0.5947    0.5517   0.1767 -3.366 0.000764 ***
+age.group6,7,8,9,10 -1.0578    0.3472   0.1776 -5.956 2.59e-09 ***
+age.group11-18      -1.0192    0.3609   0.1513 -6.736 1.63e-11 ***
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+                    exp(coef) exp(-coef) lower .95 upper .95
+age.group2,3,4,5       0.5517      1.813    0.3902    0.7801
+age.group6,7,8,9,10    0.3472      2.880    0.2452    0.4918
+age.group11-18         0.3609      2.771    0.2683    0.4855
+
+Concordance= 0.584  (se = 0.015 )
+Likelihood ratio test= 44.93  on 3 df,   p=1e-09
+Wald test            = 53.97  on 3 df,   p=1e-11
+Score (logrank) test = 57.87  on 3 df,   p=2e-12
-
1.90539 + 1.96 * exp(0.09558)
+
emmeans(cox.age.rec, pairwise ~ age.group, type = "response")
-
[1] 4.061972
-
-
(2.298 - 1.58) / 2 |> exp() / 2
-
-
[1] 0.04858537
+
$emmeans
+ age.group  response     SE  df asymp.LCL asymp.UCL
+ 0,1           1.000 0.0000 Inf     1.000     1.000
+ 2,3,4,5       0.552 0.0975 Inf     0.390     0.780
+ 6,7,8,9,10    0.347 0.0617 Inf     0.245     0.492
+ 11-18         0.361 0.0546 Inf     0.268     0.485
+
+Confidence level used: 0.95 
+Intervals are back-transformed from the log scale 
+
+$contrasts
+ contrast             ratio    SE  df null z.ratio p.value
+ 0,1 / 2,3,4,5        1.813 0.320 Inf    1   3.366  0.0043
+ 0,1 / 6,7,8,9,10     2.880 0.511 Inf    1   5.956  <.0001
+ 0,1 / (11-18)        2.771 0.419 Inf    1   6.736  <.0001
+ 2,3,4,5 / 6,7,8,9,10 1.589 0.251 Inf    1   2.928  0.0179
+ 2,3,4,5 / (11-18)    1.529 0.196 Inf    1   3.314  0.0051
+ 6,7,8,9,10 / (11-18) 0.962 0.124 Inf    1  -0.298  0.9908
+
+P value adjustment: tukey method for comparing a family of 4 estimates 
+Tests are performed on the log scale 
-

Exercise 6 — Research shows that an important determinant of mortality after kidney transplant is the age of the recipient. Fit a Cox model with age as predictor and estimate the hazard ratio and its confidence interval. Consider age first as continuous variable and then divide into categories.

-

Exercise 7 — Fit a multivariate Cox model by using other predictors and describe your results.

-

Exercise 8 — Estimate the survival function for specific covariate patterns. Based on the previous results choose the best predictors.

-

Exercise 9 — Check the proportional hazards assumption. You may use the function cox.zph. Discuss the result and possible implications. Exercise 10 — Plot the Schoenfeld residuals and comment.

+
+

3 Exercise 7

+

Fit a multivariate Cox model by using other predictors and describe your results.

+
+
sapply(names(dat), function(col) {
+  is.na(dat[, col]) |> sum()
+})
+
+
hla.match age.donor   age.rec  cold.isc     death      year       sex   tx.type 
+      234       113         9      2250         0         0         0         0 
+follow.up 
+        0 
+
+
+
+
names(dat)
+
+
[1] "hla.match" "age.donor" "age.rec"   "cold.isc"  "death"     "year"     
+[7] "sex"       "tx.type"   "follow.up"
+
+
+
+
Waldtest <- function(cox.fit, i.betas){
+  q <- length(i.betas)
+  coefs <- cox.fit$coefficients
+  # print(cox.fit$var)
+  var <- cox.fit$var[i.betas, i.betas]
+  Wald <- coefs[i.betas]%*%solve(var)%*%coefs[i.betas]
+  p.value <- 1-pchisq(as.numeric(Wald), df = q)
+  return(cbind(Wald, p.value))
+}
+
+
+
fit.cox.model <- function(surv, i.betas) {
+  fit.tx <- coxph(surv, data = dat)
+  Waldtest(fit.tx, i.betas)
+}
+
+

Start with tx.type

+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec
+fit.cox.model(surv, 1:2) # NOTE: Why is this 1???
+
+
             p.value
+[1,] 77.3778       0
+
+
+

Iteration 1: tx.type + age + ?

+
+
names(dat)
+
+
[1] "hla.match" "age.donor" "age.rec"   "cold.isc"  "death"     "year"     
+[7] "sex"       "tx.type"   "follow.up"
+
+
+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match
+fit.cox.model(surv, 3)
+
+
                  p.value
+[1,] 8.146717 0.004313919
+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + age.donor
+fit.cox.model(surv, 3)
+
+
                 p.value
+[1,] 6.509223 0.01073164
+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + cold.isc
+fit.cox.model(surv, 3)
+
+
                 p.value
+[1,] 0.7057259 0.4008664
+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + year
+fit.cox.model(surv, 3)
+
+
                 p.value
+[1,] 4.680593 0.03050521
+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + sex
+fit.cox.model(surv, 3)
+
+
                p.value
+[1,] 1.921176 0.1657271
+
+
+

Add hla.match

+

Iteration 2: tx.type + age.rec + hla.match + ?

+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor
+fit.cox.model(surv, 4)
+
+
                p.value
+[1,] 5.211567 0.0224371
+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + cold.isc
+fit.cox.model(surv, 4)
+
+
                 p.value
+[1,] 0.9157965 0.3385811
+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + year
+fit.cox.model(surv, 4)
+
+
                p.value
+[1,] 4.94216 0.02620927
+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + sex
+fit.cox.model(surv, 4)
+
+
                p.value
+[1,] 0.981911 0.3217275
+
+
+

Add age.donor

+

Iteration 2: tx.type + age.rec + hla.match + age.donor + ?

+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+  cold.isc
+fit.cox.model(surv, 5)
+
+
                 p.value
+[1,] 0.4450261 0.5047065
+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+  year
+fit.cox.model(surv, 5)
+
+
                 p.value
+[1,] 4.367751 0.03662531
+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+  sex
+fit.cox.model(surv, 5)
+
+
                 p.value
+[1,] 0.9343691 0.3337302
+
+
+

Add year

+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + 
+  year + cold.isc
+fit.cox.model(surv, 6)
+
+
                 p.value
+[1,] 0.1614349 0.6878388
+
+
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + 
+  year + sex
+fit.cox.model(surv, 6)
+
+
                 p.value
+[1,] 0.9641705 0.3261383
+
+
+

Next, we test for each remaining variables

+
+
+

4 Exercise 8

+

Estimate the survival function for specific covariate patterns. Based on the previous results choose the best predictors.

+

Exercise 9 — Check the proportional hazards assumption. You may use the function cox.zph. Discuss the result and possible implications.

+

H0: PH-assumption holds H1: PH-assumption doesn’t hold

+
+
# different cox model!!
+cox.final <- coxph(Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor, data = dat)
+cox.zph(cox.final)
+
+
          chisq df       p
+tx.type    2.56  1 0.10948
+age.rec   35.98  1 2.0e-09
+hla.match  8.22  1 0.00414
+age.donor 11.05  1 0.00089
+GLOBAL    47.37  4 1.3e-09
+
+
+

Exercise 10 — Plot the Schoenfeld residuals and comment. A non-random pattern or slope in a plot of scaled residuals against time indicates a violation, suggesting the covariate’s impact changes, while a horizontal line around zero supports the assumption.

+
+
plot(cox.zph(cox.final))
+
+
+
+

+
+
+
+
+
+
+

+
+
+
+
+
+
+

+
+
+
+
+
+
+

+
+
+
+
+

NOTES: left censoring: event happened before time zero -> not applicable left truncation: not applicable?? right censoring: we saw this in the follow up times, administrative censoring high

+

! No left truncation, since our target population is patients who received a kidney transplant. So we don’t care about patients who died before receiving a transplant.

+

! ADD: we used a Cox model but we already saw that the PH assumption doesn’t hold !

+

? remove year from prediction model, since it is not a patient characteristic and may not be stable for future predictions. It mainly captures changes in clinical practice rather than individual risk.

+
diff --git a/slides.html b/slides.html new file mode 100644 index 0000000..80b495b --- /dev/null +++ b/slides.html @@ -0,0 +1,3075 @@ + + + + + + + + + +Slides + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+

Slides

+
+ + + +
+ + + + +
+ + + +
+ + +
+

1 Introduction: research question

+
+

1.1 Survival after transplantation (M)

+
+
+

1.2 Identify Predictors (M)

+
+
+

1.3 Why Survival Analysis (L)

+

Motivation: study the distribution of time to event \(T\).

+

Example: time of death after kidney transplant.

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
IDTransplantDeath
120002005
220002009
320012005
420032004
520042016
+ +
+
+
+
+
+

+
+
+
+
+
+

We then calculate time to death for each respondents.

+

With this, all we need is to fit a linear model t ~ X or log(t) ~ X

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
IDTransplantDeathTime to death
1200020055
2200020099
3200120054
4200320041
52004201612
+ +
+
+
+
+
+

+
+
+
+
+
+

What if we do not know when exactly some respondents die?

+

Scenario 1: the study ends at the year 2008?

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
IDTransplantDeathTime to death
1200020055
22000> 2008> 8
3200120054
4200320041
52004> 2008> 4
+ +
+
+
+
+
+

+
+
+
+
+
+
+

+
+
+
+
+

Right Censoring: only observe the event (death) if it occurs before a certain time (2008).

+
+

Scenario 2: respondent 3 move away; loss follow up

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
IDTransplantDeathTime to death
1200020055
22000> 2008> 8
32001> 2005> 4
4200320041
52004> 2008> 4
+ +
+
+
+
+
+

+
+
+
+
+
+
+

+
+
+
+
+
+

How many patients are right censored?

+
+
+
+
+

+
+
+
+
+
+
+

1.4 Challenges in Survival Analysis (L)

+
    +
  • Right Censoring: only observe the event if it occurs before a certain time.
  • +
  • Left Censoring: event has occurred prior to the start of a research +
      +
    • Follow up every 3 years.
    • +
    • Event has occurred -> event happened sometime before follow up.
    • +
  • +
  • Left Truncation: delayed entry; respondents are included only if they survived long enough. +
      +
    • Start follow up with patients 100 days after the transplant
    • +
    • Patients dies within 100 day wouldn’t be included in the dataset
    • +
  • +
  • Right Truncation: respondents are included only if they have already experienced the event. +
      +
    • Retrospective analysis from deceased patients between 1990 and 2000.
    • +
    • Patients not dead before 2000 are not included in the dataset.
    • +
  • +
+
+
+
+

2 Method & Result

+
+

2.1 Explain Dataset (M)

+
+
+

2.2 Table 1 (L)

+
+

2.2.1 HLA match hla.match

+
+
+

+   0    1    2    3    4    5    6 <NA> 
+ 784 1504 1553 3778 1278  365  279  234 
+
+
+
+
+
+
+

+
+
+
+
+
+
+
+
+

+
+
+
+
+
+
+

2.2.2 Donor age age.donor

+
+
mean(dat$age.donor, na.rm = TRUE)
+
+
[1] 31.29952
+
+
median(dat$age.donor, na.rm = TRUE)
+
+
[1] 33
+
+
+
+
+
Warning: Removed 113 rows containing non-finite outside the scale range
+(`stat_count()`).
+
+
+
+
+

+
+
+
+
+
+
+
Warning: Removed 113 rows containing non-finite outside the scale range
+(`stat_count()`).
+
+
+
+
+

+
+
+
+
+
+
+
Warning: Removed 113 rows containing non-finite outside the scale range
+(`stat_density()`).
+
+
+
+
+

+
+
+
+
+
+
+

2.2.3 Recipient Age age.rec

+
+
mean(dat$age.rec, na.rm = TRUE)
+
+
[1] 11.64653
+
+
median(dat$age.rec, na.rm = TRUE)
+
+
[1] 13
+
+
+
+
+
Warning: Removed 9 rows containing non-finite outside the scale range
+(`stat_count()`).
+
+
+
+
+

+
+
+
+
+
+
+
Warning: Removed 113 rows containing non-finite outside the scale range
+(`stat_boxplot()`).
+
+
+
+
+

+
+
+
+
+
+
+
Warning: Removed 9 rows containing non-finite outside the scale range
+(`stat_count()`).
+
+
+
+
+

+
+
+
+
+
+
+
Warning: Removed 9 rows containing non-finite outside the scale range
+(`stat_density()`).
+
+
+
+
+

+
+
+
+
+
+
+
Warning: Removed 2250 rows containing non-finite outside the scale range
+(`stat_boxplot()`).
+
+
+
+
+

+
+
+
+
+
+
+

2.2.4 Cold Ischemia Time cold.isc

+
+
mean(dat$cold.isc, na.rm = TRUE)
+
+
[1] 10.85967
+
+
median(dat$cold.isc, na.rm = TRUE)
+
+
[1] 7
+
+
+
+
+
Warning: Removed 2250 rows containing non-finite outside the scale range
+(`stat_density()`).
+
+
+
+
+

+
+
+
+
+
+
+
Warning: Removed 2250 rows containing non-finite outside the scale range
+(`stat_density()`).
+
+
+
+
+

+
+
+
+
+
+
+

2.2.5 Transplant Type tx.type

+
+
dat$tx.type |> table()
+
+

+Cadaveric    Living 
+     5148      4627 
+
+
+
+
+
+
+

+
+
+
+
+
+
+

2.2.6 Year year

+
+
+

+1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 
+ 728  734  666  736  787  813  789  745  740  842  706  834  655 
+
+
+
+
+
+
+

+
+
+
+
+
+
+
+
+

+
+
+
+
+
+
+
+
+

+
+
+
+
+
+
+
Warning: Removed 9 rows containing non-finite outside the scale range
+(`stat_boxplot()`).
+
+
+
+
+

+
+
+
+
+
+
+
Warning: Removed 113 rows containing non-finite outside the scale range
+(`stat_boxplot()`).
+
+
+
+
+

+
+
+
+
+
+
+
+

2.3 Overall Kaplan-Meier

+
+
+
+
+

+
+
+
+
+
+
+

2.4 Hazard Functions

+
+

2.4.1 death Distribution (?)

+
+
+
+
+

+
+
+
+
+
+
+
+
+

+
+
+
+
+
+
+

2.4.2 Overall (?)

+
+
+
# A tibble: 7 × 5
+  fu.interval n.censored n.event n.at.risk hazard.rate
+        <dbl>      <int>   <int>     <dbl>       <dbl>
+1       0.333        830     152      9775     0.0155 
+2       0.667        491      37      8793     0.00421
+3       1            613      27      8265     0.00327
+4       2           1164      62      7625     0.00813
+5       3           1143      41      6399     0.00641
+6       4           1053      40      5215     0.00767
+7       5            919      31      4122     0.00752
+
+
+
+
+

2.4.3 tx.type (M)

+
+
+
+

2.5 Cox Model

+
+

2.5.1 age (continuous) (L)

+
+
+

2.5.2 age (categorical) (M)

+
+
+

2.5.3 Full model (L)

+
+
+
Call:
+coxph(formula = Surv(follow.up, death) ~ hla.match + tx.type, 
+    data = dat)
+
+  n= 9541, number of events= 451 
+   (234 observations deleted due to missingness)
+
+                  coef exp(coef) se(coef)      z Pr(>|z|)   
+hla.match1    -0.08845   0.91535  0.17308 -0.511  0.60933   
+hla.match2    -0.36340   0.69531  0.17919 -2.028  0.04256 * 
+hla.match3    -0.50810   0.60164  0.18298 -2.777  0.00549 **
+hla.match4    -0.65410   0.51991  0.22221 -2.944  0.00324 **
+hla.match5    -0.39445   0.67405  0.29410 -1.341  0.17985   
+hla.match6    -0.51491   0.59755  0.31489 -1.635  0.10201   
+tx.typeLiving  0.39049   1.47771  0.12688  3.078  0.00209 **
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+              exp(coef) exp(-coef) lower .95 upper .95
+hla.match1       0.9153     1.0925    0.6520    1.2850
+hla.match2       0.6953     1.4382    0.4894    0.9879
+hla.match3       0.6016     1.6621    0.4203    0.8612
+hla.match4       0.5199     1.9234    0.3363    0.8037
+hla.match5       0.6740     1.4836    0.3787    1.1996
+hla.match6       0.5976     1.6735    0.3224    1.1077
+tx.typeLiving    1.4777     0.6767    1.1524    1.8949
+
+Concordance= 0.613  (se = 0.014 )
+Likelihood ratio test= 56.79  on 7 df,   p=7e-10
+Wald test            = 57.08  on 7 df,   p=6e-10
+Score (logrank) test = 59.51  on 7 df,   p=2e-10
+
+
+
Call:
+coxph(formula = Surv(follow.up, death) ~ hla.match * tx.type, 
+    data = dat)
+
+  n= 9541, number of events= 451 
+   (234 observations deleted due to missingness)
+
+                            coef exp(coef) se(coef)      z Pr(>|z|)  
+hla.match1                0.8094    2.2467   0.8165  0.991   0.3215  
+hla.match2                1.1371    3.1176   0.7530  1.510   0.1311  
+hla.match3                0.4217    1.5245   0.7140  0.591   0.5548  
+hla.match4                0.1094    1.1156   0.7341  0.149   0.8816  
+hla.match5                0.4754    1.6087   0.7820  0.608   0.5432  
+hla.match6                0.5836    1.7925   0.8021  0.728   0.4669  
+tx.typeLiving             1.3801    3.9752   0.7218  1.912   0.0559 .
+hla.match1:tx.typeLiving -0.9614    0.3823   0.8355 -1.151   0.2498  
+hla.match2:tx.typeLiving -1.6578    0.1905   0.7762 -2.136   0.0327 *
+hla.match3:tx.typeLiving -1.0096    0.3644   0.7461 -1.353   0.1760  
+hla.match4:tx.typeLiving -0.5175    0.5960   0.7863 -0.658   0.5105  
+hla.match5:tx.typeLiving -0.8601    0.4231   0.8805 -0.977   0.3286  
+hla.match6:tx.typeLiving -1.3342    0.2634   0.9113 -1.464   0.1432  
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+                         exp(coef) exp(-coef) lower .95 upper .95
+hla.match1                  2.2467     0.4451   0.45341   11.1322
+hla.match2                  3.1176     0.3208   0.71257   13.6400
+hla.match3                  1.5245     0.6559   0.37617    6.1787
+hla.match4                  1.1156     0.8964   0.26460    4.7032
+hla.match5                  1.6087     0.6216   0.34737    7.4498
+hla.match6                  1.7925     0.5579   0.37212    8.6342
+tx.typeLiving               3.9752     0.2516   0.96600   16.3580
+hla.match1:tx.typeLiving    0.3823     2.6155   0.07435    1.9662
+hla.match2:tx.typeLiving    0.1905     5.2480   0.04162    0.8723
+hla.match3:tx.typeLiving    0.3644     2.7444   0.08443    1.5726
+hla.match4:tx.typeLiving    0.5960     1.6778   0.12763    2.7836
+hla.match5:tx.typeLiving    0.4231     2.3635   0.07533    2.3763
+hla.match6:tx.typeLiving    0.2634     3.7971   0.04414    1.5712
+
+Concordance= 0.615  (se = 0.014 )
+Likelihood ratio test= 66.78  on 13 df,   p=3e-09
+Wald test            = 65.9  on 13 df,   p=5e-09
+Score (logrank) test = 69.92  on 13 df,   p=8e-10
+
+
+
Analysis of Deviance Table
+ Cox model: response is  Surv(follow.up, death)
+ Model 1: ~ hla.match + tx.type
+ Model 2: ~ hla.match * tx.type
+   loglik  Chisq Df Pr(>|Chi|)
+1 -3848.5                     
+2 -3843.5 9.9891  6     0.1251
+
+
+
+
+

2.5.4 Discussion: include year or not?

+
+
+
+

2.6 Assumption Testing

+
+
+
+

3 Conclusion (M + L)

+
+ +
+ + +
+ + + + + \ No newline at end of file diff --git a/slides.qmd b/slides.qmd new file mode 100644 index 0000000..81b33bb --- /dev/null +++ b/slides.qmd @@ -0,0 +1,609 @@ +--- +title: Slides +execute: + cache: true + freeze: auto + include: true + echo: false +number-sections: true +--- + +```{r} +library(tidyverse) +library(dplyr) +library(ggplot2) +library(survival) +library(emmeans) +library(foreign) +library(gtsummary) +library(gt) +library(ggsurvfit) +``` + +```{r} +dat <- read.csv("./unos.txt", sep = "\t") +names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death", + "year", "sex", "tx.type", "follow.up") +dat <- dat |> + mutate( + sex = factor(sex, levels = c(0,1), labels = c("Female","Male")), + tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")), + hla.match = factor(hla.match), + year = factor(year) + ) +``` + +# Introduction: research question + +## Survival after transplantation (M) + +## Identify Predictors (M) + +## Why Survival Analysis (L) + +Motivation: study the distribution of time to event $T$. + +Example: time of death after kidney transplant. + + +```{r} +ex <- data.frame( + id = c(1, 2, 3, 4, 5), + transplant = c(2000, 2000, 2001, 2003, 2004), + death = c(2005, 2009, 2005, 2004, 2016) +) + +ex_table_1 <- ex |> + gt() |> + cols_label( + id = "ID", + transplant = "Transplant", + death = "Death" + ) %>% + cols_align(align = "center", columns = everything()) + +ex_plot_real_time <- ggplot(ex) + + geom_segment(aes(x = transplant, xend = death, y = factor(id), + yend = factor(id)), color = "grey50", linewidth = 1) + + geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) + + geom_point(aes(x = death, y = factor(id), color = "death"), size = 3) + + scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) + + scale_y_discrete(limits = rev) + + labs(x = "Year", y = "Subject ID", color = "Event") + + theme_minimal() +``` + +```{r} +ex_table_1 +ex_plot_real_time +``` + +--- + +We then calculate time to death for each respondents. + +With this, all we need is to fit a linear model `t ~ X` or `log(t) ~ X` + +```{r} +ex_t <- ex |> + mutate( + t = death - transplant + ) + +ex_table_2 <- ex_t |> + gt() |> + cols_label( + id = "ID", + transplant = "Transplant", + death = "Death", + t = "Time to death" + ) |> + cols_align(align = "center", columns = everything()) + +ex_plot_uniform_time <- ggplot(ex_t) + + geom_segment(aes(x = 0, xend = t, y = factor(id), + yend = factor(id)), color = "grey50", linewidth = 1) + + geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) + + geom_point(aes(x = t, y = factor(id), color = "death"), size = 3) + + scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) + + scale_y_discrete(limits = rev) + + labs(x = "Year", y = "Subject ID", color = "Event") + + theme_minimal() + +``` + + +```{r} +ex_table_2 +ex_plot_uniform_time +``` + +--- + +What if we do not know when exactly some respondents die? + +Scenario 1: the study ends at the year 2008? + +```{r} +ex_t_c <- ex_t |> + mutate( + death_censored = if_else(death <= 2008, death, 2008), + death_censored_txt = if_else(death <= 2008, as.character(death), "> 2008"), + status = if_else(death <= 2008, 1, 0), + t_1 = death_censored - transplant, + t_1_txt = if_else(death <= 2008, as.character(t_1), + paste(">", as.character(t_1))) + ) +``` + +```{r} +ex_table_3 <- ex_t_c |> + select(id, transplant, death_censored_txt, t_1_txt) |> + gt() |> + cols_label( + id = "ID", + transplant = "Transplant", + death_censored_txt = "Death", + t_1_txt = "Time to death" + ) |> + cols_align(align = "center", columns = everything()) + +ex_plot_real_time_1 <- ggplot(ex_t_c) + + geom_segment(aes(x = transplant, xend = death_censored, y = factor(id), + yend = factor(id)), color = "grey50", linewidth = 1) + + geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) + + geom_point(aes(x = death_censored, y = factor(id), color = + if_else(status == 1, "death", "censored")), size = 3) + + scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D", + "censored" = "orange")) + + scale_y_discrete(limits = rev) + + labs(x = "Year", y = "Subject ID", color = "Event") + + xlim(2000, 2016) + + geom_vline(xintercept = 2008, linetype = "dashed", color = "orange", + linewidth = 0.8) + + theme_minimal() + + +ex_plot_uniform_time_1 <- ggplot(ex_t_c) + + geom_segment(aes(x = 0, xend = t_1, y = factor(id), + yend = factor(id)), color = "grey50", linewidth = 1) + + geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) + + geom_point(aes(x = t_1, y = factor(id), color = + if_else(status == 1, "death", "censored")), size = 3) + + scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D", + "censored" = "orange")) + + scale_y_discrete(limits = rev) + + labs(x = "Year", y = "Subject ID", color = "Event") + + theme_minimal() +``` + + +```{r} +ex_table_3 +ex_plot_real_time_1 +ex_plot_uniform_time_1 +``` + +**Right Censoring**: only observe the event (death) if it occurs before a +certain time (2008). + +--- + +Scenario 2: respondent 3 move away; loss follow up + +```{r} +ex_alt <- ex_t_c +ex_alt$death_censored_txt[3] <- "> 2005" +ex_alt$status[3] <- 0 +ex_alt$t_1_txt[3] <- "> 4" + + +ex_table_4 <- ex_alt |> + select(id, transplant, death_censored_txt, t_1_txt) |> + gt() |> + cols_label( + id = "ID", + transplant = "Transplant", + death_censored_txt = "Death", + t_1_txt = "Time to death" + ) |> + cols_align(align = "center", columns = everything()) + +ex_plot_real_time_2 <- ggplot(ex_alt) + + geom_segment(aes(x = transplant, xend = death_censored, y = factor(id), + yend = factor(id)), color = "grey50", linewidth = 1) + + geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) + + geom_point(aes(x = death_censored, y = factor(id), color = + if_else(status == 1, "death", "censored")), size = 3) + + scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D", + "censored" = "orange")) + + scale_y_discrete(limits = rev) + + labs(x = "Year", y = "Subject ID", color = "Event") + + xlim(2000, 2016) + + geom_vline(xintercept = 2008, linetype = "dashed", color = "orange", + linewidth = 0.8) + + theme_minimal() + + +ex_plot_uniform_time_2 <- ggplot(ex_alt) + + geom_segment(aes(x = 0, xend = t_1, y = factor(id), + yend = factor(id)), color = "grey50", linewidth = 1) + + geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) + + geom_point(aes(x = t_1, y = factor(id), color = + if_else(status == 1, "death", "censored")), size = 3) + + scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D", + "censored" = "orange")) + + scale_y_discrete(limits = rev) + + labs(x = "Year", y = "Subject ID", color = "Event") + + theme_minimal() +``` + +```{r} +ex_table_4 +ex_plot_real_time_2 +ex_plot_uniform_time_2 +``` + +--- + +How many patients are right censored? + +```{r} +dat |> + mutate(Overall = "Overall") |> + pivot_longer( + cols = c(Overall, sex, tx.type), + names_to = "Attribute", + values_to = "Category" + ) |> + count(Attribute, Category, death) |> + ggplot(aes(x = Category, y = n, fill = factor(death))) + + geom_col(position = "dodge") + + facet_wrap(~ Attribute, scales = "free_x") + + labs(x = "Group", y = "Count", fill = "Death Status") + + theme_minimal() +``` + +## Challenges in Survival Analysis (L) + +- **Right Censoring**: only observe the event if it occurs before a certain + time. +- **Left Censoring**: event has occurred prior to the start of a research + - Follow up every 3 years. + - Event has occurred -> event happened sometime before follow up. +- **Left Truncation**: delayed entry; respondents are included only if they + survived long enough. + - Start follow up with patients 100 days after the transplant + - Patients dies within 100 day wouldn't be included in the dataset +- **Right Truncation**: respondents are included only if they have already + experienced the event. + - Retrospective analysis from deceased patients between 1990 and 2000. + - Patients not dead before 2000 are not included in the dataset. + +# Method & Result + +## Explain Dataset (M) + +## Table 1 (L) + + +### HLA match `hla.match` + +```{r} +dat$hla.match |> table(useNA = "always") +``` + +```{r} +ggplot(dat, aes(x = hla.match)) + + geom_bar() + + labs(x = "HLA match", y = "Count") + + theme_minimal() + +``` + +```{r} +ggplot(dat, aes(x = hla.match, fill = tx.type)) + + geom_bar(position = "dodge") + + labs(x = "HLA match", y = "Count") + + theme_minimal() +``` + +### Donor age `age.donor` + +```{r} +#| echo: true +mean(dat$age.donor, na.rm = TRUE) +median(dat$age.donor, na.rm = TRUE) +``` + +```{r} +ggplot(dat, aes(x = age.donor)) + + geom_bar() + + labs(x = "Donor Age", y = "Count") + + theme_minimal() +``` + +```{r} +#| fig-width: 8 +#| fig-height: 12 +ggplot(dat, aes(x = age.donor)) + + geom_bar() + + labs(x = "Donor Age", y = "Count") + + theme_minimal() + + facet_grid(hla.match ~ .) +``` + +```{r} +ggplot(dat, aes(x = age.donor, color = hla.match)) + + geom_density() + + labs(x = "Donor Age", y = "Count") + + theme_minimal() +``` + + +### Recipient Age `age.rec` + +```{r} +#| echo: true +mean(dat$age.rec, na.rm = TRUE) +median(dat$age.rec, na.rm = TRUE) +``` + + +```{r} +ggplot(dat, aes(x = age.rec)) + + geom_bar() + + labs(x = "Recipient Age", y = "Count") + + theme_minimal() + +``` + +```{r} +ggplot(dat, aes(x = as.factor(age.rec), y = age.donor)) + + geom_boxplot() + + theme_minimal() + +``` + +```{r} +ggplot(dat, aes(x = age.rec, fill = tx.type)) + + geom_bar(position = "dodge") + + theme_minimal() + +``` + +```{r} +ggplot(dat, aes(x = age.rec, color = hla.match)) + + geom_density() + + theme_minimal() +``` + +```{r} +ggplot(dat, aes(x = as.factor(age.rec), y = cold.isc)) + + geom_boxplot() + + theme_minimal() +``` + + +### Cold Ischemia Time `cold.isc` + +```{r} +#| echo: true +mean(dat$cold.isc, na.rm = TRUE) +median(dat$cold.isc, na.rm = TRUE) +``` + +```{r} +ggplot(dat, aes(x = cold.isc)) + + geom_density() + + labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") + + theme_minimal() +``` + +```{r} +ggplot(dat, aes(x = cold.isc, color = tx.type, group = tx.type)) + + geom_density() + + labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") + + theme_minimal() +``` + +### Transplant Type `tx.type` + +```{r} +#| echo: true +dat$tx.type |> table() +``` + +```{r} +ggplot(dat, aes(x = tx.type)) + + geom_bar() + + labs(x = "Transplant Type", y = "Count") + + theme_minimal() + +``` + +### Year `year` + +```{r} +dat$year |> table() +``` + +```{r} +ggplot(dat, aes(x = year)) + + geom_bar() + + labs(x = "Year", y = "Count") + + theme_minimal() +``` + +```{r} +ggplot(dat, aes(x = year, y = follow.up)) + + geom_boxplot() +``` + +```{r} +ggplot(dat, aes(x = year, fill = tx.type)) + + geom_bar(position = "dodge") +``` + +```{r} +ggplot(dat, aes(x = year, y = age.rec)) + + geom_boxplot() +``` + +```{r} +ggplot(dat, aes(x = year, y = age.donor)) + + geom_boxplot() +``` + +## Overall Kaplan-Meier + +```{r} +km_all <- survfit(Surv(follow.up, death) ~ 1, data = dat) +``` + +```{r} +km_all |> + ggsurvfit(type = "survival") + + add_confidence_interval() + + scale_y_continuous(limits = c(0, 1), labels = scales::label_percent()) + + labs( + x = "Years of Follow-up", + y = "Overall Survival Probability" + ) + + theme_minimal() +``` + +## Hazard Functions + +```{r} +get.life.table <- function(dat, time.intervals) { + n.pop <- nrow(dat) + + dat |> + recode.dat(time.intervals) |> + group_by(fu.interval) |> + summarize( + n.censored = sum(.data$death == 0), + n.event = sum(.data$death), + ) |> + ungroup() |> + calculate.hazard(n.pop) +} + +get.life.table.by.groups <- function(dat, time.intervals, grps) { + grps |> + lapply(function(grp) { + dat |> + get.life.table.by.group(time.intervals, grp) |> + mutate( + grp.name = grp, + grp.value = pick(1)[[1]] + ) |> + select(-1) + }) |> + bind_rows() +} + +get.life.table.by.group <- function(dat, time.intervals, grp) { + dat |> + recode.dat(time.intervals) |> + group_by(fu.interval, .data[[grp]]) |> + summarize( + n.censored = sum(.data$death == 0), + n.event = sum(.data$death), + .groups = "keep" + ) |> + ungroup(fu.interval) |> + group_modify(function(df.sub, grp) { + grp.name <- names(grp) + grp.value <- grp[[1]] + n.pop <- (dat[[grp.name]] == grp.value) |> sum() + calculate.hazard(df.sub, n.pop) + }) |> + ungroup() +} + +calculate.hazard <- function(life.table, n.pop) { + n.removed <- life.table$n.event + life.table$n.censored + n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)]) + life.table |> + mutate( + n.at.risk = n.pop - n.removed.accum, + # TODO: how to account for censored? How do we adjust for uneven interval? + hazard.rate = n.event / n.at.risk + ) +} + +recode.dat <- function(dat, time.intervals) { + df <- dat[dat$follow.up <= sum(time.intervals), ] + time.points <- cumsum(time.intervals) + df$fu.interval <- sapply(df$follow.up, function(time) { + time.points[time <= time.points][1] + }) + + df +} +``` + +### `death` Distribution (?) + +```{r} +dat |> + mutate( + accum.death = cumsum(death), + accum.censored = if_else(death == 1, 0, 1) |> cumsum() + ) |> + ggplot() + + geom_step(aes(x = follow.up, y = accum.death)) +``` + +```{r} + +``` + +```{r} +plot.death = dat[dat$death == 1,] |> + ggplot(aes(x = follow.up)) + + geom_histogram(bins = 50) +plot.death +``` + + +### Overall (?) + +```{r} +time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1) +get.life.table(dat, time.intervals) +``` + +### `tx.type` (M) + + +```{r} + +``` + + +## Cox Model + +### `age` (continuous) (L) + +### `age` (categorical) (M) + +### Full model (L) + +```{r} +m1 <- coxph(Surv(follow.up, death) ~ hla.match + tx.type, data = dat) +summary(m1) + +m2 <- coxph(Surv(follow.up, death) ~ hla.match * tx.type, data = dat) +summary(m2) + +anova(m1, m2, test = "LRT") +``` + +### Discussion: include `year` or not? + +## Assumption Testing + +# Conclusion (M + L)