diff --git a/report.Rmd b/report.Rmd
index f030138..89e97dc 100644
--- a/report.Rmd
+++ b/report.Rmd
@@ -1,593 +1,709 @@
----
-title: foo
-execute:
- cache: true
- freeze: auto
-number-sections: true
----
-
-```{r}
-library(tidyverse)
-library(dplyr)
-library(ggplot2)
-library(survival)
-library(emmeans)
-library(foreign)
-# library(gtsummary)
-```
-
-```{r}
-dat <- read.csv("./unos.txt", sep = "\t")
-head(dat)
-names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
- "year", "sex", "tx.type", "follow.up")
-```
-
-Complete case
-
-```{r}
-nrow(dat)
-```
-
-
-```{r}
-sapply(names(dat), function(col) {
- is.na(dat[, col]) |> sum()
-})
-```
-
-
-
-# Exercise
-
-## Exercise 1
-
-> Illustrate in a table the characteristics of the population (age, sex, race,
-donor, . . . ).
-
-```{r}
-g <- ggplot(dat)
-```
-
-```{r}
-g + geom_point(aes(x = follow.up, y = death))
-g + geom_density(aes(x = follow.up))
-
-g + geom_density(aes(x = age.donor))
-g + geom_bar(aes(x = age.rec))
-
-g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor))
-
-# dat$age.1 |> table()
-```
-
-```{r}
-g + geom_boxplot(aes(x = year, y = follow.up, group = year))
-```
-
-## Exercise 2
-
-Plot the Kaplan-Meier overall survival curve for pediatric kid-
-ney transplant recipients for the first 12 years after transplantation.
-
-```{r}
-km <- survfit(Surv(follow.up, death) ~ 1, data = dat[dat$follow.up <= 12, ])
-plot(km)
-
-
-
-```
-
-## Exercise 3
-
-We are going to compare mortality rates (hazard functions)
-between children whose transplanted kidney was provided by a living donor
-(in general a family member) and those whose source was recently deceased
-(variable donor type: `txtype`). Use the life table method to calculate the death
-rates for the first 5 years for each group (take in the first year intervals of 4
-months and then look at each year) and show the results in a table. Estimate
-the hazard ratio in each time interval as the ratio between the mortality rates
-in the two groups. What do you notice?
-
-```{r}
-dat.5 <- dat[dat$follow.up <= 5, ]
-head(dat.5)
-classify_time_interval = function(fu) {
- if (fu <= 1/3) {
- return(1/3)
- } else if (fu <= 2/3) {
- return(2/3)
- } else if (fu <= 1) {
- return(1)
- }
- ceiling(fu)
-}
-
-dat.5$fu.interval <- sapply(dat.5$follow.up, classify_time_interval)
-table(dat.5$fu.interval)
-head(dat.5)
-```
-
-```{r}
-dat.5.life <- dat.5 |>
- group_by(fu.interval) |>
- summarize(
- n.censored = sum(death == 0),
- n.event = sum(death),
- n.at.risk = nrow(dat),
- )
-
-for (i in 2:nrow(dat.5.life)) {
- j <- i - 1
-
- n.censored.pre <- dat.5.life$n.censored[j]
- n.event.pre <- dat.5.life$n.event[j]
- n.at.risk.pre <- dat.5.life$n.at.risk[j]
-
- n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
-
- dat.5.life$n.at.risk[i] <- n.at.risk
-}
-
-print(nrow(dat))
-dat.5.life
-
-```
-n.removed.acc
-```{r}
-dat.5.life <- dat.5.life |>
- mutate(
- hazard.rate = n.event / n.at.risk
- )
-
-```
-
----
-
-
-```{r}
-get_life_table = function(dat.sub, dat) {
- dat.sub <- dat.sub |>
- group_by(fu.interval) |>
- summarize(
- n.censored = sum(death == 0),
- n.event = sum(death),
- n.at.risk = nrow(dat),
- )
-
- for (i in 2:nrow(dat.sub)) {
- j <- i - 1
-
- n.censored.pre <- dat.sub$n.censored[j]
- n.event.pre <- dat.sub$n.event[j]
- n.at.risk.pre <- dat.sub$n.at.risk[j]
-
- n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
-
- dat.sub$n.at.risk[i] <- n.at.risk
- }
-
- dat.sub <- dat.sub |>
- mutate(
- hazard.rate = n.event / n.at.risk
- )
-
- return(dat.sub)
-}
-```
-
-```{r}
-dat.5.tx0 = dat.5[dat.5$tx.type == 0, ]
-dat.5.tx1 = dat.5[dat.5$tx.type == 1, ]
-```
-
-```{r}
-tx0.life <- get_life_table(dat.5.tx0, dat[dat$tx.type == 0, ])
-tx0.life
-```
-
-```{r}
-tx1.life <- get_life_table(dat.5.tx1, dat[dat$tx.type == 1, ])
-tx1.life
-```
-
-
-```{r}
-tx1.life$hazard.rate / tx0.life$hazard.rate
-```
-
-```{r}
-hazard.df <- data.frame(
- fu.interval = tx1.life$fu.interval,
- hazard.rate.0 = tx0.life$hazard.rate,
- hazard.rate.1 = tx1.life$hazard.rate,
- hazard.ratio = tx1.life$hazard.rate / tx0.life$hazard.rate
-)
-
-ggplot(hazard.df, aes(x = fu.interval)) +
- geom_line(aes(y = hazard.rate.0), color = "blue") +
- geom_line(aes(y = hazard.rate.1), color = "orange")
-
-ggplot(hazard.df, aes(x = fu.interval)) +
- geom_line(aes(y = hazard.ratio))
-
-```
-
----
-
-General Solution:
-
-```{r}
-get.life.table <- function(dat, time.intervals) {
- n.pop <- nrow(dat)
-
- dat |>
- recode.dat(time.intervals) |>
- group_by(fu.interval) |>
- summarize(
- n.censored = sum(.data$death == 0),
- n.event = sum(.data$death),
- ) |>
- ungroup() |>
- calculate.hazard(n.pop)
-}
-
-get.life.table.by.groups <- function(dat, time.intervals, grps) {
- grps |>
- lapply(function(grp) {
- dat |>
- get.life.table.by.group(time.intervals, grp) |>
- mutate(
- grp.name = grp,
- grp.value = pick(1)[[1]]
- ) |>
- select(-1)
- }) |>
- bind_rows()
-}
-
-get.life.table.by.group <- function(dat, time.intervals, grp) {
- dat |>
- recode.dat(time.intervals) |>
- group_by(fu.interval, .data[[grp]]) |>
- summarize(
- n.censored = sum(.data$death == 0),
- n.event = sum(.data$death),
- .groups = "keep"
- ) |>
- ungroup(fu.interval) |>
- group_modify(function(df.sub, grp) {
- grp.name <- names(grp)
- grp.value <- grp[[1]]
- n.pop <- (dat[[grp.name]] == grp.value) |> sum()
- calculate.hazard(df.sub, n.pop)
- }) |>
- ungroup()
-}
-
-calculate.hazard <- function(life.table, n.pop) {
- n.removed <- life.table$n.event + life.table$n.censored
- n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)])
- life.table |>
- mutate(
- n.at.risk = n.pop - n.removed.accum,
- # TODO: how to account for censored? How do we adjust for uneven interval?
- hazard.rate = n.event / n.at.risk
- )
-}
-
-recode.dat <- function(dat, time.intervals) {
- df <- dat[dat$follow.up <= sum(time.intervals), ]
- time.points <- cumsum(time.intervals)
- df$fu.interval <- sapply(df$follow.up, function(time) {
- time.points[time <= time.points][1]
- })
-
- df
-}
-
-
-
-```
-
-```{r}
-time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
-grps <- c("tx.type", "sex", "hla.match", "cold.isc")
-
-get.life.table.by.groups(dat, time.intervals, grps)
-```
-
-```{r}
-grps <- c("tx.type", "sex", "hla.match", "cold.isc")
-grps <- c("tx.type")
-
-
-plot.hazard.rate.by.groups <- function(dat, time.intervals, grps) {
- dat |>
- get.life.table.by.groups(time.intervals, grps) |>
- ggplot(aes(x = fu.interval, y = hazard.rate,
- color = paste(grp.name, grp.value, sep = " = "))) +
- geom_point() +
- geom_line() +
- labs(
- color = "Group"
- ) +
- theme(
- legend.position = "bottom"
- )
-}
-
-```
-```{r}
-time.intervals <- c(rep(1/3, 3), rep(1, 4))
-plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
-```
-
-```{r}
-time.intervals <- rep(1, 5)
-plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
-```
-
-```{r}
-time.intervals <- rep(1/3, 15)
-plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
-```
-
-```{r}
-time.intervals <- c(rep(1/3, 3*3), rep(1, 2))
-plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
-```
-
-```{r}
-time.intervals <- c(rep(1/4, 4), rep(1, 4))
-plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
-```
-
-```{r}
-time.intervals <- c(rep(1/4, 4*2), rep(1, 3))
-plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
-```
-
-```{r}
-time.intervals <- c(rep(1/4, 4*2), rep(1/3, 3*3), rep(1, 6))
-plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
-```
-
----
-
-Compare
-
-```{r}
-time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
-get.life.table.by.groups(dat, time.intervals, grps)
-```
-
-```{r}
-hazard.df
-```
-
-
-
-## Exercise 4
-
-Show a plot with Kaplan-Meier survival curves for the two donor types.
-
-```{r}
-km.tx <- survfit(Surv(follow.up, death) ~ tx.type, data = dat[dat$follow.up <= 12, ])
-
-plot(km.tx, col = c("blue", "orange"))
-legend(legend = c("cadaveric", "living"), "bottomleft", lwd = 2, col = c("blue", "orange"))
-```
-
-## Exercise 5
-
-Fit a univariate Cox model with predictor donor type. Report
-the hazard ratio and 95% confidence interval and interpret the result obtained.
-
-```{r}
-cox.tx <- coxph(Surv(follow.up, death) ~ tx.type, data = dat)
-summary(cox.tx)
-```
-
-```{r}
-log.hazard.ratio.hat <- cox.tx$coefficient
-hazard.ratio.hat <- log.hazard.ratio.hat |> exp()
-
-```
-
-```{r}
-log.hazard.ratio.se <- summary(cox.tx)$coefficients[3]
-log.hazard.ratio.hat.lower.bound <- log.hazard.ratio.hat - qnorm(0.975) * log.hazard.ratio.se
-log.hazard.ratio.hat.upper.bound <- log.hazard.ratio.hat + qnorm(0.975) * log.hazard.ratio.se
-
-c(log.hazard.ratio.hat.lower.bound, log.hazard.ratio.hat.upper.bound) |> exp()
-```
-
-::: {.callout-note}
-Patients receiving kidney transplants from living donors (`tx.type = 1`) has
-1.91 higher hazard rate than those receiving from cadaveric donors
-(`tx.type = 0`).
-
-Since 1 is not in the 95% confidence interval, the result is statistically
-significant.
-:::
-
-
-# Exercise 6
-
-Research shows that an important determinant of mortality
-after kidney transplant is the age of the recipient. Fit a Cox model with age
-as predictor and estimate the hazard ratio and its confidence interval. Consider
-age first as continuous variable and then divide into categories.
-
-## Continuous
-
-```{r}
-cox.age.rec <- coxph(Surv(follow.up, death) ~ age.rec, data = dat)
-cox.age.rec |> summary()
-```
-
-## Categorical (Case 1)
-
-```{r}
-recode.age <- function(dat, ages) {
- df$age.group <- sapply(df$follow.up, function(time) {
- time.points[time <= time.points][1]
- })
-
- df
-}
-```
-
-```{r}
-dat.age.rec <- dat[!is.na(dat$age.rec), ]
-class.age <- function(age) {
- if (age < 2) {
- return("0,1")
- } else if (age < 6) {
- return("2,3,4,5")
- } else if (age < 11) {
- return("6,7,8,9,10")
- } else if (age < 19) {
- return("11-18")
- }
-}
-```
-
-```{r}
-dat.age.rec$age.group <- sapply(dat.age.rec$age.rec, class.age) |>
- factor(levels = c(
- "0,1",
- "2,3,4,5",
- "6,7,8,9,10",
- "11-18"
- ))
-
-
-cox.age.rec <- coxph(Surv(follow.up, death) ~ age.group, data = dat.age.rec)
-cox.age.rec |> summary()
-```
-
-```{r}
-emmeans(cox.age.rec, pairwise ~ age.group, type = "response")
-```
-
-
-# Exercise 7
-
-Fit a multivariate Cox model by using other predictors and
-describe your results.
-
-
-```{r}
-sapply(names(dat), function(col) {
- is.na(dat[, col]) |> sum()
-})
-```
-
-```{r}
-names(dat)
-```
-
-```{r}
-Waldtest <- function(cox.fit, i.betas){
- q <- length(i.betas)
- coefs <- cox.fit$coefficients
- # print(cox.fit$var)
- var <- cox.fit$var[i.betas, i.betas]
- Wald <- coefs[i.betas]%*%solve(var)%*%coefs[i.betas]
- p.value <- 1-pchisq(as.numeric(Wald), df = q)
- return(cbind(Wald, p.value))
-}
-```
-
-
-```{r}
-fit.cox.model <- function(surv, i.betas) {
- fit.tx <- coxph(surv, data = dat)
- Waldtest(fit.tx, i.betas)
-}
-```
-
-Start with `tx.type`
-
-```{r}
-surv <- Surv(follow.up, death) ~ tx.type + age.rec
-fit.cox.model(surv, 1:2) # NOTE: Why is this 1???
-```
-
-Iteration 1: `tx.type + age + ?`
-
-```{r}
-names(dat)
-```
-
-```{r}
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match
-fit.cox.model(surv, 3)
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + age.donor
-fit.cox.model(surv, 3)
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + cold.isc
-fit.cox.model(surv, 3)
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + year
-fit.cox.model(surv, 3)
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + sex
-fit.cox.model(surv, 3)
-```
-
-Add `hla.match`
-
-Iteration 2: `tx.type + age.rec + hla.match + ?`
-
-```{r}
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor
-fit.cox.model(surv, 4)
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + cold.isc
-fit.cox.model(surv, 4)
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + year
-fit.cox.model(surv, 4)
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + sex
-fit.cox.model(surv, 4)
-```
-
-Add `age.donor`
-
-Iteration 2: `tx.type + age.rec + hla.match + age.donor + ?`
-
-```{r}
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
- cold.isc
-fit.cox.model(surv, 5)
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
- year
-fit.cox.model(surv, 5)
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
- sex
-fit.cox.model(surv, 5)
-```
-
-Add `year`
-
-```{r}
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
- year + cold.isc
-fit.cox.model(surv, 6)
-surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
- year + sex
-fit.cox.model(surv, 6)
-```
-
-
-
-Next, we test for each remaining variables
-
-# Exercise 8
-
-Estimate the survival function for specific covariate patterns.
-Based on the previous results choose the best predictors.
-
-Exercise 9 — Check the proportional hazards assumption. You may use the
-function cox.zph. Discuss the result and possible implications.
-
-Exercise 10 — Plot the Schoenfeld residuals and comment.
-
+---
+title: foo
+execute:
+ cache: true
+ freeze: auto
+number-sections: true
+---
+
+```{r}
+library(tidyverse)
+library(dplyr)
+library(ggplot2)
+library(survival)
+library(emmeans)
+library(foreign)
+library(gtsummary)
+```
+
+```{r}
+dat <- read.csv("./unos.txt", sep = "\t")
+head(dat)
+names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
+ "year", "sex", "tx.type", "follow.up")
+```
+
+Complete case
+
+```{r}
+nrow(dat)
+```
+
+
+```{r}
+sapply(names(dat), function(col) {
+ is.na(dat[, col]) |> sum()
+})
+```
+
+
+
+# Exercise
+
+## Exercise 1
+
+> Illustrate in a table the characteristics of the population (age, sex, race,
+donor, . . . ).
+
+```{r}
+hist(dat$age.donor)
+hist(dat$age.rec)
+hist(dat$cold.isc)
+hist(dat$year)
+hist(dat$follow.up)
+
+unique(dat$hla.match)
+unique(dat$year)
+```
+Change the order!!
+
+```{r}
+dat.table1 <- dat |>
+ mutate(
+ sex = factor(sex, levels = c(0,1), labels = c("Female","Male")),
+ tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")),
+ hla.match = factor(hla.match),
+ year = factor(year)
+ )
+```
+
+```{r}
+table1 <- dat.table1 |>
+ select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |>
+ tbl_summary(
+ statistic = list(
+ all_continuous() ~ "{median} ({p25}, {p75})"
+ ),
+ label = list(
+ hla.match ~ "HLA matches, n(%)",
+ age.donor ~ "Donor age, median (IQR)",
+ age.rec ~ "Recipient age, median (IQR)",
+ cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
+ year ~ "Year of transplant",
+ sex ~ "Sex, n(%)",
+ tx.type ~ "Transplant type, n(%)"
+ ),
+ missing = "ifany"
+ ) |>
+ modify_footnote(all_stat_cols() ~ NA)
+
+table1
+```
+
+```{r}
+table.split <- dat.table1 |>
+ select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |>
+ tbl_summary(
+ by = tx.type,
+ statistic = list(
+ all_continuous() ~ "{median} ({p25}, {p75})"
+ ),
+ label = list(
+ hla.match ~ "HLA matches, n(%)",
+ age.donor ~ "Donor age, median (IQR)",
+ age.rec ~ "Recipient age, median (IQR)",
+ cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
+ year ~ "Year of transplant",
+ sex ~ "Sex, n(%)"
+ ),
+ missing = "ifany"
+ ) |>
+ add_overall() |>
+ modify_footnote(all_stat_cols() ~ NA)
+
+table.split
+```
+
+
+Calculate median follow-up (reverse Kaplan-Meier method), with 95% CI
+```{r}
+rev_km <- survfit(Surv(follow.up, death == 0) ~ 1, data = dat)
+
+summary(rev_km)$table
+
+median_followup <- summary(rev_km)$table["median"]
+confidence_interval <- c(summary(rev_km)$table["0.95LCL"], summary(rev_km)$table["0.95UCL"])
+median_followup
+confidence_interval
+```
+
+
+```{r}
+g <- ggplot(dat)
+```
+
+```{r}
+g + geom_point(aes(x = follow.up, y = death))
+g + geom_density(aes(x = follow.up))
+
+g + geom_density(aes(x = age.donor))
+g + geom_bar(aes(x = age.rec))
+
+g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor))
+
+# dat$age.1 |> table()
+```
+
+```{r}
+g + geom_boxplot(aes(x = year, y = follow.up, group = year))
+```
+
+## Exercise 2
+
+Plot the Kaplan-Meier overall survival curve for pediatric kid-
+ney transplant recipients for the first 12 years after transplantation.
+
+!!!!!!!! just set xlim to 12, do not remove those individuals !!!!!!!
+
+```{r}
+km <- survfit(Surv(follow.up, death) ~ 1, data = dat[dat$follow.up <= 12, ])
+plot(km)
+```
+
+## Exercise 3
+
+We are going to compare mortality rates (hazard functions)
+between children whose transplanted kidney was provided by a living donor
+(in general a family member) and those whose source was recently deceased
+(variable donor type: `txtype`). Use the life table method to calculate the death
+rates for the first 5 years for each group (take in the first year intervals of 4
+months and then look at each year) and show the results in a table. Estimate
+the hazard ratio in each time interval as the ratio between the mortality rates
+in the two groups. What do you notice?
+
+```{r}
+dat.5 <- dat[dat$follow.up <= 5, ]
+head(dat.5)
+classify_time_interval = function(fu) {
+ if (fu <= 1/3) {
+ return(1/3)
+ } else if (fu <= 2/3) {
+ return(2/3)
+ } else if (fu <= 1) {
+ return(1)
+ }
+ ceiling(fu)
+}
+
+dat.5$fu.interval <- sapply(dat.5$follow.up, classify_time_interval)
+table(dat.5$fu.interval)
+head(dat.5)
+```
+
+```{r}
+dat.5.life <- dat.5 |>
+ group_by(fu.interval) |>
+ summarize(
+ n.censored = sum(death == 0),
+ n.event = sum(death),
+ n.at.risk = nrow(dat),
+ )
+
+for (i in 2:nrow(dat.5.life)) {
+ j <- i - 1
+
+ n.censored.pre <- dat.5.life$n.censored[j]
+ n.event.pre <- dat.5.life$n.event[j]
+ n.at.risk.pre <- dat.5.life$n.at.risk[j]
+
+ n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
+
+ dat.5.life$n.at.risk[i] <- n.at.risk
+}
+
+print(nrow(dat))
+dat.5.life
+
+```
+n.removed.acc
+```{r}
+dat.5.life <- dat.5.life |>
+ mutate(
+ hazard.rate = n.event / n.at.risk
+ )
+
+```
+
+---
+
+
+```{r}
+get_life_table = function(dat.sub, dat) {
+ dat.sub <- dat.sub |>
+ group_by(fu.interval) |>
+ summarize(
+ n.censored = sum(death == 0),
+ n.event = sum(death),
+ n.at.risk = nrow(dat),
+ )
+
+ for (i in 2:nrow(dat.sub)) {
+ j <- i - 1
+
+ n.censored.pre <- dat.sub$n.censored[j]
+ n.event.pre <- dat.sub$n.event[j]
+ n.at.risk.pre <- dat.sub$n.at.risk[j]
+
+ n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
+
+ dat.sub$n.at.risk[i] <- n.at.risk
+ }
+
+ dat.sub <- dat.sub |>
+ mutate(
+ hazard.rate = n.event / n.at.risk
+ )
+
+ return(dat.sub)
+}
+```
+
+```{r}
+dat.5.tx0 = dat.5[dat.5$tx.type == 0, ]
+dat.5.tx1 = dat.5[dat.5$tx.type == 1, ]
+```
+
+```{r}
+tx0.life <- get_life_table(dat.5.tx0, dat[dat$tx.type == 0, ])
+tx0.life
+```
+
+```{r}
+tx1.life <- get_life_table(dat.5.tx1, dat[dat$tx.type == 1, ])
+tx1.life
+```
+
+
+```{r}
+tx1.life$hazard.rate / tx0.life$hazard.rate
+```
+
+```{r}
+hazard.df <- data.frame(
+ fu.interval = tx1.life$fu.interval,
+ hazard.rate.0 = tx0.life$hazard.rate,
+ hazard.rate.1 = tx1.life$hazard.rate,
+ hazard.ratio = tx1.life$hazard.rate / tx0.life$hazard.rate
+)
+
+ggplot(hazard.df, aes(x = fu.interval)) +
+ geom_line(aes(y = hazard.rate.0), color = "blue") +
+ geom_line(aes(y = hazard.rate.1), color = "orange")
+
+ggplot(hazard.df, aes(x = fu.interval)) +
+ geom_line(aes(y = hazard.ratio))
+
+```
+
+---
+
+General Solution:
+
+```{r}
+get.life.table <- function(dat, time.intervals) {
+ n.pop <- nrow(dat)
+
+ dat |>
+ recode.dat(time.intervals) |>
+ group_by(fu.interval) |>
+ summarize(
+ n.censored = sum(.data$death == 0),
+ n.event = sum(.data$death),
+ ) |>
+ ungroup() |>
+ calculate.hazard(n.pop)
+}
+
+get.life.table.by.groups <- function(dat, time.intervals, grps) {
+ grps |>
+ lapply(function(grp) {
+ dat |>
+ get.life.table.by.group(time.intervals, grp) |>
+ mutate(
+ grp.name = grp,
+ grp.value = pick(1)[[1]]
+ ) |>
+ select(-1)
+ }) |>
+ bind_rows()
+}
+
+get.life.table.by.group <- function(dat, time.intervals, grp) {
+ dat |>
+ recode.dat(time.intervals) |>
+ group_by(fu.interval, .data[[grp]]) |>
+ summarize(
+ n.censored = sum(.data$death == 0),
+ n.event = sum(.data$death),
+ .groups = "keep"
+ ) |>
+ ungroup(fu.interval) |>
+ group_modify(function(df.sub, grp) {
+ grp.name <- names(grp)
+ grp.value <- grp[[1]]
+ n.pop <- (dat[[grp.name]] == grp.value) |> sum()
+ calculate.hazard(df.sub, n.pop)
+ }) |>
+ ungroup()
+}
+
+calculate.hazard <- function(life.table, n.pop) {
+ n.removed <- life.table$n.event + life.table$n.censored
+ n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)])
+ life.table |>
+ mutate(
+ n.at.risk = n.pop - n.removed.accum,
+ # TODO: how to account for censored? How do we adjust for uneven interval?
+ hazard.rate = n.event / n.at.risk
+ )
+}
+
+recode.dat <- function(dat, time.intervals) {
+ df <- dat[dat$follow.up <= sum(time.intervals), ]
+ time.points <- cumsum(time.intervals)
+ df$fu.interval <- sapply(df$follow.up, function(time) {
+ time.points[time <= time.points][1]
+ })
+
+ df
+}
+
+
+
+```
+
+```{r}
+time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
+grps <- c("tx.type", "sex", "hla.match", "cold.isc")
+
+get.life.table.by.groups(dat, time.intervals, grps)
+```
+
+```{r}
+grps <- c("tx.type", "sex", "hla.match", "cold.isc")
+grps <- c("tx.type")
+
+
+plot.hazard.rate.by.groups <- function(dat, time.intervals, grps) {
+ dat |>
+ get.life.table.by.groups(time.intervals, grps) |>
+ ggplot(aes(x = fu.interval, y = hazard.rate,
+ color = paste(grp.name, grp.value, sep = " = "))) +
+ geom_point() +
+ geom_line() +
+ labs(
+ color = "Group"
+ ) +
+ theme(
+ legend.position = "bottom"
+ )
+}
+
+```
+```{r}
+time.intervals <- c(rep(1/3, 3), rep(1, 4))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+```{r}
+time.intervals <- rep(1, 5)
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+```{r}
+time.intervals <- rep(1/3, 15)
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+```{r}
+time.intervals <- c(rep(1/3, 3*3), rep(1, 2))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+```{r}
+time.intervals <- c(rep(1/4, 4), rep(1, 4))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+```{r}
+time.intervals <- c(rep(1/4, 4*2), rep(1, 3))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+```{r}
+time.intervals <- c(rep(1/4, 4*2), rep(1/3, 3*3), rep(1, 6))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+---
+
+Compare
+
+```{r}
+time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
+get.life.table.by.groups(dat, time.intervals, grps)
+```
+
+```{r}
+hazard.df
+```
+
+
+
+## Exercise 4
+
+Show a plot with Kaplan-Meier survival curves for the two donor types.
+
+```{r}
+km.tx <- survfit(Surv(follow.up, death) ~ tx.type, data = dat[dat$follow.up <= 12, ])
+
+plot(km.tx, col = c("blue", "orange"))
+legend(legend = c("cadaveric", "living"), "bottomleft", lwd = 2, col = c("blue", "orange"))
+```
+
+## Exercise 5
+
+Fit a univariate Cox model with predictor donor type. Report
+the hazard ratio and 95% confidence interval and interpret the result obtained.
+
+```{r}
+cox.tx <- coxph(Surv(follow.up, death) ~ tx.type, data = dat)
+summary(cox.tx)
+```
+
+```{r}
+log.hazard.ratio.hat <- cox.tx$coefficient
+hazard.ratio.hat <- log.hazard.ratio.hat |> exp()
+
+```
+
+```{r}
+log.hazard.ratio.se <- summary(cox.tx)$coefficients[3]
+log.hazard.ratio.hat.lower.bound <- log.hazard.ratio.hat - qnorm(0.975) * log.hazard.ratio.se
+log.hazard.ratio.hat.upper.bound <- log.hazard.ratio.hat + qnorm(0.975) * log.hazard.ratio.se
+
+c(log.hazard.ratio.hat.lower.bound, log.hazard.ratio.hat.upper.bound) |> exp()
+```
+
+::: {.callout-note}
+Patients receiving kidney transplants from living donors (`tx.type = 1`) has
+1.91 higher hazard rate than those receiving from cadaveric donors
+(`tx.type = 0`).
+
+Since 1 is not in the 95% confidence interval, the result is statistically
+significant.
+:::
+
+
+# Exercise 6
+
+Research shows that an important determinant of mortality
+after kidney transplant is the age of the recipient. Fit a Cox model with age
+as predictor and estimate the hazard ratio and its confidence interval. Consider
+age first as continuous variable and then divide into categories.
+
+## Continuous
+
+```{r}
+cox.age.rec <- coxph(Surv(follow.up, death) ~ age.rec, data = dat)
+cox.age.rec |> summary()
+```
+
+## Categorical (Case 1)
+
+```{r}
+recode.age <- function(dat, ages) {
+ df$age.group <- sapply(df$follow.up, function(time) {
+ time.points[time <= time.points][1]
+ })
+
+ df
+}
+```
+
+```{r}
+dat.age.rec <- dat[!is.na(dat$age.rec), ]
+class.age <- function(age) {
+ if (age < 2) {
+ return("0,1")
+ } else if (age < 6) {
+ return("2,3,4,5")
+ } else if (age < 11) {
+ return("6,7,8,9,10")
+ } else if (age < 19) {
+ return("11-18")
+ }
+}
+```
+
+```{r}
+dat.age.rec$age.group <- sapply(dat.age.rec$age.rec, class.age) |>
+ factor(levels = c(
+ "0,1",
+ "2,3,4,5",
+ "6,7,8,9,10",
+ "11-18"
+ ))
+
+
+cox.age.rec <- coxph(Surv(follow.up, death) ~ age.group, data = dat.age.rec)
+cox.age.rec |> summary()
+```
+
+```{r}
+emmeans(cox.age.rec, pairwise ~ age.group, type = "response")
+```
+
+
+# Exercise 7
+
+Fit a multivariate Cox model by using other predictors and
+describe your results.
+
+
+```{r}
+sapply(names(dat), function(col) {
+ is.na(dat[, col]) |> sum()
+})
+```
+
+```{r}
+names(dat)
+```
+
+```{r}
+Waldtest <- function(cox.fit, i.betas){
+ q <- length(i.betas)
+ coefs <- cox.fit$coefficients
+ # print(cox.fit$var)
+ var <- cox.fit$var[i.betas, i.betas]
+ Wald <- coefs[i.betas]%*%solve(var)%*%coefs[i.betas]
+ p.value <- 1-pchisq(as.numeric(Wald), df = q)
+ return(cbind(Wald, p.value))
+}
+```
+
+
+```{r}
+fit.cox.model <- function(surv, i.betas) {
+ fit.tx <- coxph(surv, data = dat)
+ Waldtest(fit.tx, i.betas)
+}
+```
+
+Start with `tx.type`
+
+```{r}
+surv <- Surv(follow.up, death) ~ tx.type + age.rec
+fit.cox.model(surv, 1:2) # NOTE: Why is this 1???
+```
+
+Iteration 1: `tx.type + age + ?`
+
+```{r}
+names(dat)
+```
+
+```{r}
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match
+fit.cox.model(surv, 3)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + age.donor
+fit.cox.model(surv, 3)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + cold.isc
+fit.cox.model(surv, 3)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + year
+fit.cox.model(surv, 3)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + sex
+fit.cox.model(surv, 3)
+```
+
+Add `hla.match`
+
+Iteration 2: `tx.type + age.rec + hla.match + ?`
+
+```{r}
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor
+fit.cox.model(surv, 4)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + cold.isc
+fit.cox.model(surv, 4)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + year
+fit.cox.model(surv, 4)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + sex
+fit.cox.model(surv, 4)
+```
+
+Add `age.donor`
+
+Iteration 2: `tx.type + age.rec + hla.match + age.donor + ?`
+
+```{r}
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+ cold.isc
+fit.cox.model(surv, 5)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+ year
+fit.cox.model(surv, 5)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+ sex
+fit.cox.model(surv, 5)
+```
+
+Add `year`
+
+```{r}
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+ year + cold.isc
+fit.cox.model(surv, 6)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+ year + sex
+fit.cox.model(surv, 6)
+```
+
+```{r}
+
+```
+
+
+Next, we test for each remaining variables
+
+# Exercise 8
+
+Estimate the survival function for specific covariate patterns.
+Based on the previous results choose the best predictors.
+
+Exercise 9 — Check the proportional hazards assumption. You may use the
+function cox.zph. Discuss the result and possible implications.
+
+H0: PH-assumption holds
+H1: PH-assumption doesn't hold
+
+```{r}
+# different cox model!!
+cox.final <- coxph(Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor, data = dat)
+cox.zph(cox.final)
+```
+
+
+Exercise 10 — Plot the Schoenfeld residuals and comment.
+A non-random pattern or slope in a plot of scaled residuals against time indicates a violation, suggesting the covariate's impact changes, while a horizontal line around zero supports the assumption.
+
+```{r}
+plot(cox.zph(cox.final))
+```
+
+
+
+NOTES:
+left censoring: event happened before time zero -> not applicable
+left truncation: not applicable??
+right censoring: we saw this in the follow up times, administrative censoring high
+
+! No left truncation, since our target population is patients who received a
+kidney transplant. So we don't care about patients who died before receiving
+a transplant.
+
+! ADD: we used a Cox model but we already saw that the PH assumption doesn't hold !
+
+? remove year from prediction model, since it is not a patient characteristic and may not be stable for future predictions. It mainly captures changes in clinical practice rather than individual risk.
+
+
diff --git a/report.html b/report.html
index 4c9fd4b..8afe5ee 100644
--- a/report.html
+++ b/report.html
@@ -99,24 +99,17 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
-
-
-
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
-✔ dplyr 1.1.4 ✔ readr 2.1.5
-✔ forcats 1.0.0 ✔ stringr 1.5.1
-✔ lubridate 1.9.4 ✔ tibble 3.3.0
-✔ purrr 1.1.0 ✔ tidyr 1.3.1
-── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
-✖ dplyr::filter() masks stats::filter()
-✖ dplyr::lag() masks stats::lag()
-ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
-
-
library (survival)
-# library(gtsummary)
+
library (tidyverse)
+library (dplyr)
+library (ggplot2)
+library (survival)
+library (emmeans)
+library (foreign)
+library (gtsummary)
-
dat <- read.csv ("./unos.txt" , sep = " \t " )
-head (dat)
+
dat <- read.csv ("./unos.txt" , sep = " \t " )
+head (dat)
hlamat age age.1 cold_isc death year sex txtype fu
1 2 25 15 6 0 2002 0 1 0
@@ -126,10 +119,26 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
5 2 17 12 28 1 1997 1 1 0
6 3 44 13 1 0 2002 1 0 0
+
names (dat) <- c ("hla.match" , "age.donor" , "age.rec" , "cold.isc" , "death" ,
+ "year" , "sex" , "tx.type" , "follow.up" )
+
+Complete case
+
-
names (dat) <- c ("hla.match" , "age.donor" , "age.rec" , "cold.isc" , "death" ,
- "year" , "sex" , "tx.type" , "follow.up" )
+
sapply (names (dat), function (col) {
+ is.na (dat[, col]) |> sum ()
+ })
+
+
hla.match age.donor age.rec cold.isc death year sex tx.type
+ 234 113 9 2250 0 0 0 0
+follow.up
+ 0
+
Exercise
@@ -139,10 +148,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
Illustrate in a table the characteristics of the population (age, sex, race, donor, . . . ).
-
-
g + geom_point (aes (x = follow.up, y = death))
+
@@ -150,7 +156,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
-
g + geom_density (aes (x = follow.up))
+
@@ -158,11 +164,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
-
g + geom_density (aes (x = age.donor))
-
-
Warning: Removed 113 rows containing non-finite outside the scale range
-(`stat_density()`).
-
+
@@ -170,11 +172,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
-
g + geom_bar (aes (x = age.rec))
-
-
Warning: Removed 9 rows containing non-finite outside the scale range
-(`stat_count()`).
-
+
@@ -182,7 +180,1435 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
-
g + geom_boxplot (aes (x = age.rec, y = age.donor, group = age.donor))
+
+
+
+
+
+
+
+
+
+
+
+
+
[1] 2002 1999 1997 2001 1994 1995 1996 1993 1990 1991 1992 2000 1998
+
+
+Change the order!!
+
+
dat.table1 <- dat |>
+ mutate (
+ sex = factor (sex, levels = c (0 ,1 ), labels = c ("Female" ,"Male" )),
+ tx.type = factor (tx.type, levels = c (0 ,1 ), labels = c ("Cadaveric" ,"Living" )),
+ hla.match = factor (hla.match),
+ year = factor (year)
+ )
+
+
+
table1 <- dat.table1 |>
+ select (hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |>
+ tbl_summary (
+ statistic = list (
+ all_continuous () ~ "{median} ({p25}, {p75})"
+ ),
+ label = list (
+ hla.match ~ "HLA matches, n(%)" ,
+ age.donor ~ "Donor age, median (IQR)" ,
+ age.rec ~ "Recipient age, median (IQR)" ,
+ cold.isc ~ "Cold ischemic time (hours), median (IQR), " ,
+ year ~ "Year of transplant" ,
+ sex ~ "Sex, n(%)" ,
+ tx.type ~ "Transplant type, n(%)"
+ ),
+ missing = "ifany"
+ ) |>
+ modify_footnote (all_stat_cols () ~ NA )
+
+ table1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+HLA matches, n(%)
+
+
+
+
+ 0
+784 (8.2%)
+
+
+ 1
+1,504 (16%)
+
+
+ 2
+1,553 (16%)
+
+
+ 3
+3,778 (40%)
+
+
+ 4
+1,278 (13%)
+
+
+ 5
+365 (3.8%)
+
+
+ 6
+279 (2.9%)
+
+
+ Unknown
+234
+
+
+Donor age, median (IQR)
+33 (21, 41)
+
+
+ Unknown
+113
+
+
+Recipient age, median (IQR)
+13 (8, 16)
+
+
+ Unknown
+9
+
+
+Cold ischemic time (hours), median (IQR),
+7 (1, 19)
+
+
+ Unknown
+2,250
+
+
+Year of transplant
+
+
+
+
+ 1990
+728 (7.4%)
+
+
+ 1991
+734 (7.5%)
+
+
+ 1992
+666 (6.8%)
+
+
+ 1993
+736 (7.5%)
+
+
+ 1994
+787 (8.1%)
+
+
+ 1995
+813 (8.3%)
+
+
+ 1996
+789 (8.1%)
+
+
+ 1997
+745 (7.6%)
+
+
+ 1998
+740 (7.6%)
+
+
+ 1999
+842 (8.6%)
+
+
+ 2000
+706 (7.2%)
+
+
+ 2001
+834 (8.5%)
+
+
+ 2002
+655 (6.7%)
+
+
+Sex, n(%)
+
+
+
+
+ Female
+4,014 (41%)
+
+
+ Male
+5,761 (59%)
+
+
+Transplant type, n(%)
+
+
+
+
+ Cadaveric
+5,148 (53%)
+
+
+ Living
+4,627 (47%)
+
+
+
+
+
+
+
+
+
table.split <- dat.table1 |>
+ select (hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |>
+ tbl_summary (
+ by = tx.type,
+ statistic = list (
+ all_continuous () ~ "{median} ({p25}, {p75})"
+ ),
+ label = list (
+ hla.match ~ "HLA matches, n(%)" ,
+ age.donor ~ "Donor age, median (IQR)" ,
+ age.rec ~ "Recipient age, median (IQR)" ,
+ cold.isc ~ "Cold ischemic time (hours), median (IQR), " ,
+ year ~ "Year of transplant" ,
+ sex ~ "Sex, n(%)"
+ ),
+ missing = "ifany"
+ ) |>
+ add_overall () |>
+ modify_footnote (all_stat_cols () ~ NA )
+
+ table.split
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+HLA matches, n(%)
+
+
+
+
+
+
+
+
+ 0
+784 (8.2%)
+125 (2.5%)
+659 (14%)
+
+
+ 1
+1,504 (16%)
+160 (3.2%)
+1,344 (29%)
+
+
+ 2
+1,553 (16%)
+228 (4.6%)
+1,325 (29%)
+
+
+ 3
+3,778 (40%)
+3,039 (61%)
+739 (16%)
+
+
+ 4
+1,278 (13%)
+1,007 (20%)
+271 (5.9%)
+
+
+ 5
+365 (3.8%)
+241 (4.9%)
+124 (2.7%)
+
+
+ 6
+279 (2.9%)
+153 (3.1%)
+126 (2.7%)
+
+
+ Unknown
+234
+195
+39
+
+
+Donor age, median (IQR)
+33 (21, 41)
+37 (31, 42)
+22 (15, 37)
+
+
+ Unknown
+113
+112
+1
+
+
+Recipient age, median (IQR)
+13 (8, 16)
+13 (7, 16)
+14 (9, 16)
+
+
+ Unknown
+9
+7
+2
+
+
+Cold ischemic time (hours), median (IQR),
+7 (1, 19)
+1 (0, 1)
+19 (13, 25)
+
+
+ Unknown
+2,250
+1,490
+760
+
+
+Year of transplant
+
+
+
+
+
+
+
+
+ 1990
+728 (7.4%)
+318 (6.2%)
+410 (8.9%)
+
+
+ 1991
+734 (7.5%)
+387 (7.5%)
+347 (7.5%)
+
+
+ 1992
+666 (6.8%)
+345 (6.7%)
+321 (6.9%)
+
+
+ 1993
+736 (7.5%)
+403 (7.8%)
+333 (7.2%)
+
+
+ 1994
+787 (8.1%)
+355 (6.9%)
+432 (9.3%)
+
+
+ 1995
+813 (8.3%)
+427 (8.3%)
+386 (8.3%)
+
+
+ 1996
+789 (8.1%)
+416 (8.1%)
+373 (8.1%)
+
+
+ 1997
+745 (7.6%)
+408 (7.9%)
+337 (7.3%)
+
+
+ 1998
+740 (7.6%)
+412 (8.0%)
+328 (7.1%)
+
+
+ 1999
+842 (8.6%)
+436 (8.5%)
+406 (8.8%)
+
+
+ 2000
+706 (7.2%)
+392 (7.6%)
+314 (6.8%)
+
+
+ 2001
+834 (8.5%)
+483 (9.4%)
+351 (7.6%)
+
+
+ 2002
+655 (6.7%)
+366 (7.1%)
+289 (6.2%)
+
+
+Sex, n(%)
+
+
+
+
+
+
+
+
+ Female
+4,014 (41%)
+2,105 (41%)
+1,909 (41%)
+
+
+ Male
+5,761 (59%)
+3,043 (59%)
+2,718 (59%)
+
+
+
+
+
+
+
+Calculate median follow-up (reverse Kaplan-Meier method), with 95% CI
+
+
rev_km <- survfit (Surv (follow.up, death == 0 ) ~ 1 , data = dat)
+
+summary (rev_km)$ table
+
+
records n.max n.start events rmean se(rmean)
+9.775000e+03 9.775000e+03 9.775000e+03 9.310000e+03 4.055027e+00 3.202711e-02
+ median 0.95LCL 0.95UCL
+3.380822e+00 3.227397e+00 3.567123e+00
+
+
median_followup <- summary (rev_km)$ table["median" ]
+ confidence_interval <- c (summary (rev_km)$ table["0.95LCL" ], summary (rev_km)$ table["0.95UCL" ])
+ median_followup
+
+
+
+
0.95LCL 0.95UCL
+3.227397 3.567123
+
+
+
+
+
g + geom_point (aes (x = follow.up, y = death))
+
+
+
+
+
+
+
+
g + geom_density (aes (x = follow.up))
+
+
+
+
+
+
+
+
g + geom_density (aes (x = age.donor))
+
+
Warning: Removed 113 rows containing non-finite outside the scale range
+(`stat_density()`).
+
+
+
+
+
+
+
+
+
g + geom_bar (aes (x = age.rec))
+
+
Warning: Removed 9 rows containing non-finite outside the scale range
+(`stat_count()`).
+
+
+
+
+
+
+
+
+
g + geom_boxplot (aes (x = age.rec, y = age.donor, group = age.donor))
Warning: Removed 113 rows containing missing values or values outside the scale range
(`stat_boxplot()`).
@@ -194,23 +1620,34 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
-
+
+
+
+
+
+
+
+
g + geom_boxplot (aes (x = year, y = follow.up, group = year))
+
+
+
+
-
Exercise 2
Plot the Kaplan-Meier overall survival curve for pediatric kid- ney transplant recipients for the first 12 years after transplantation.
+!!!!!!!! just set xlim to 12, do not remove those individuals !!!!!!!
-
km <- survfit (Surv (follow.up, death) ~ 1 , data = dat[dat$ follow.up <= 12 , ])
-plot (km)
+
km <- survfit (Surv (follow.up, death) ~ 1 , data = dat[dat$ follow.up <= 12 , ])
+plot (km)
-
+
@@ -220,8 +1657,8 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
Exercise 3
We are going to compare mortality rates (hazard functions) between children whose transplanted kidney was provided by a living donor (in general a family member) and those whose source was recently deceased (variable donor type: txtype). Use the life table method to calculate the death rates for the first 5 years for each group (take in the first year intervals of 4 months and then look at each year) and show the results in a table. Estimate the hazard ratio in each time interval as the ratio between the mortality rates in the two groups. What do you notice?
-
dat.5 <- dat[dat$ follow.up <= 5 , ]
-head (dat.5 )
+
dat.5 <- dat[dat$ follow.up <= 5 , ]
+head (dat.5 )
hla.match age.donor age.rec cold.isc death year sex tx.type follow.up
1 2 25 15 6 0 2002 0 1 0
@@ -231,19 +1668,19 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
5 2 17 12 28 1 1997 1 1 0
6 3 44 13 1 0 2002 1 0 0
-
classify_time_interval = function (fu) {
- if (fu <= 1 / 3 ) {
- return (1 / 3 )
- } else if (fu <= 2 / 3 ) {
- return (2 / 3 )
- } else if (fu <= 1 ) {
- return (1 )
- }
- ceiling (fu)
- }
-
- dat.5 $ fu.interval <- sapply (dat.5 $ follow.up, classify_time_interval)
-table (dat.5 $ fu.interval)
+
classify_time_interval = function (fu) {
+ if (fu <= 1 / 3 ) {
+ return (1 / 3 )
+ } else if (fu <= 2 / 3 ) {
+ return (2 / 3 )
+ } else if (fu <= 1 ) {
+ return (1 )
+ }
+ ceiling (fu)
+ }
+
+ dat.5 $ fu.interval <- sapply (dat.5 $ follow.up, classify_time_interval)
+table (dat.5 $ fu.interval)
0.333333333333333 0.666666666666667 1 2
@@ -251,7 +1688,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
3 4 5
1184 1093 950
-
+
hla.match age.donor age.rec cold.isc death year sex tx.type follow.up
1 2 25 15 6 0 2002 0 1 0
@@ -270,31 +1707,31 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
-
dat.5. life <- dat.5 |>
- group_by (fu.interval) |>
- summarize (
- n.censored = sum (death == 0 ),
- n.event = sum (death),
- n.at.risk = nrow (dat),
- )
-
-for (i in 2 : nrow (dat.5. life)) {
- j <- i - 1
-
- n.censored.pre <- dat.5. life$ n.censored[j]
- n.event.pre <- dat.5. life$ n.event[j]
- n.at.risk.pre <- dat.5. life$ n.at.risk[j]
-
- n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
-
- dat.5. life$ n.at.risk[i] <- n.at.risk
- }
-
-print (nrow (dat))
+
dat.5. life <- dat.5 |>
+ group_by (fu.interval) |>
+ summarize (
+ n.censored = sum (death == 0 ),
+ n.event = sum (death),
+ n.at.risk = nrow (dat),
+ )
+
+for (i in 2 : nrow (dat.5. life)) {
+ j <- i - 1
+
+ n.censored.pre <- dat.5. life$ n.censored[j]
+ n.event.pre <- dat.5. life$ n.event[j]
+ n.at.risk.pre <- dat.5. life$ n.at.risk[j]
+
+ n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
+
+ dat.5. life$ n.at.risk[i] <- n.at.risk
+ }
+
+print (nrow (dat))
-
+
# A tibble: 7 × 4
fu.interval n.censored n.event n.at.risk
@@ -308,126 +1745,355 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
7 5 919 31 4122
+
n.removed.acc
-
dat.5. life <- dat.5. life |>
- mutate (
- hazard.rate = n.event / n.at.risk
- )
+
dat.5. life <- dat.5. life |>
+ mutate (
+ hazard.rate = n.event / n.at.risk
+ )
-
get_life_table = function (dat) {
- dat <- dat |>
- group_by (fu.interval) |>
- summarize (
- n.censored = sum (death == 0 ),
- n.event = sum (death),
- n.at.risk = nrow (dat),
- )
-
- for (i in 2 : nrow (dat)) {
- j <- i - 1
-
- n.censored.pre <- dat$ n.censored[j]
- n.event.pre <- dat$ n.event[j]
- n.at.risk.pre <- dat$ n.at.risk[j]
-
- n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
-
- dat$ n.at.risk[i] <- n.at.risk
- }
-
- dat <- dat |>
- mutate (
- hazard.rate = n.event / n.at.risk
- )
-
- return (dat)
- }
+
get_life_table = function (dat.sub, dat) {
+ dat.sub <- dat.sub |>
+ group_by (fu.interval) |>
+ summarize (
+ n.censored = sum (death == 0 ),
+ n.event = sum (death),
+ n.at.risk = nrow (dat),
+ )
+
+ for (i in 2 : nrow (dat.sub)) {
+ j <- i - 1
+
+ n.censored.pre <- dat.sub$ n.censored[j]
+ n.event.pre <- dat.sub$ n.event[j]
+ n.at.risk.pre <- dat.sub$ n.at.risk[j]
+
+ n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
+
+ dat.sub$ n.at.risk[i] <- n.at.risk
+ }
+
+ dat.sub <- dat.sub |>
+ mutate (
+ hazard.rate = n.event / n.at.risk
+ )
+
+ return (dat.sub)
+ }
-
dat.5. tx0 = dat.5 [dat.5 $ tx.type == 0 , ]
- dat.5. tx1 = dat.5 [dat.5 $ tx.type == 1 , ]
+
dat.5. tx0 = dat.5 [dat.5 $ tx.type == 0 , ]
+ dat.5. tx1 = dat.5 [dat.5 $ tx.type == 1 , ]
-
tx0.life <- get_life_table (dat.5. tx0)
- tx0.life
+
tx0.life <- get_life_table (dat.5. tx0, dat[dat$ tx.type == 0 , ])
+ tx0.life
# A tibble: 7 × 5
fu.interval n.censored n.event n.at.risk hazard.rate
<dbl> <int> <int> <int> <dbl>
-1 0.333 463 49 3378 0.0145
-2 0.667 253 13 2866 0.00454
-3 1 340 11 2600 0.00423
-4 2 591 24 2249 0.0107
-5 3 568 17 1634 0.0104
-6 4 521 16 1049 0.0153
-7 5 501 11 512 0.0215
+1 0.333 463 49 5148 0.00952
+2 0.667 253 13 4636 0.00280
+3 1 340 11 4370 0.00252
+4 2 591 24 4019 0.00597
+5 3 568 17 3404 0.00499
+6 4 521 16 2819 0.00568
+7 5 501 11 2282 0.00482
-
tx1.life <- get_life_table (dat.5. tx1)
- tx1.life
+
tx1.life <- get_life_table (dat.5. tx1, dat[dat$ tx.type == 1 , ])
+ tx1.life
# A tibble: 7 × 5
fu.interval n.censored n.event n.at.risk hazard.rate
<dbl> <int> <int> <int> <dbl>
-1 0.333 367 103 3225 0.0319
-2 0.667 238 24 2755 0.00871
-3 1 273 16 2493 0.00642
-4 2 573 38 2204 0.0172
-5 3 575 24 1593 0.0151
-6 4 532 24 994 0.0241
-7 5 418 20 438 0.0457
+1 0.333 367 103 4627 0.0223
+2 0.667 238 24 4157 0.00577
+3 1 273 16 3895 0.00411
+4 2 573 38 3606 0.0105
+5 3 575 24 2995 0.00801
+6 4 532 24 2396 0.0100
+7 5 418 20 1840 0.0109
-
tx1.life$ hazard.rate / tx0.life$ hazard.rate
+
tx1.life$ hazard.rate / tx0.life$ hazard.rate
-
[1] 2.201766 1.920536 1.516975 1.615661 1.448100 1.582998 2.125363
+
[1] 2.338731 2.058881 1.631929 1.764675 1.604557 1.764816 2.254941
-
hazard.df <- data.frame (
- fu.interval = tx1.life$ fu.interval,
- hazard.rate.0 = tx0.life$ hazard.rate,
- hazard.rate.1 = tx1.life$ hazard.rate,
- hazard.ratio = tx1.life$ hazard.rate / tx0.life$ hazard.rate
- )
-
-ggplot (hazard.df, aes (x = fu.interval)) +
- geom_line (aes (y = hazard.rate.0 ), color = "blue" ) +
- geom_line (aes (y = hazard.rate.1 ), color = "orange" )
+
hazard.df <- data.frame (
+ fu.interval = tx1.life$ fu.interval,
+ hazard.rate.0 = tx0.life$ hazard.rate,
+ hazard.rate.1 = tx1.life$ hazard.rate,
+ hazard.ratio = tx1.life$ hazard.rate / tx0.life$ hazard.rate
+ )
+
+ggplot (hazard.df, aes (x = fu.interval)) +
+ geom_line (aes (y = hazard.rate.0 ), color = "blue" ) +
+ geom_line (aes (y = hazard.rate.1 ), color = "orange" )
-
+
-
ggplot (hazard.df, aes (x = fu.interval)) +
- geom_line (aes (y = hazard.ratio))
+
ggplot (hazard.df, aes (x = fu.interval)) +
+ geom_line (aes (y = hazard.ratio))
-
+
+
+
General Solution:
+
+
get.life.table <- function (dat, time.intervals) {
+ n.pop <- nrow (dat)
+
+ dat |>
+ recode.dat (time.intervals) |>
+ group_by (fu.interval) |>
+ summarize (
+ n.censored = sum (.data$ death == 0 ),
+ n.event = sum (.data$ death),
+ ) |>
+ ungroup () |>
+ calculate.hazard (n.pop)
+ }
+
+ get.life.table.by.groups <- function (dat, time.intervals, grps) {
+ grps |>
+ lapply (function (grp) {
+ dat |>
+ get.life.table.by.group (time.intervals, grp) |>
+ mutate (
+ grp.name = grp,
+ grp.value = pick (1 )[[1 ]]
+ ) |>
+ select (- 1 )
+ }) |>
+ bind_rows ()
+ }
+
+ get.life.table.by.group <- function (dat, time.intervals, grp) {
+ dat |>
+ recode.dat (time.intervals) |>
+ group_by (fu.interval, .data[[grp]]) |>
+ summarize (
+ n.censored = sum (.data$ death == 0 ),
+ n.event = sum (.data$ death),
+ .groups = "keep"
+ ) |>
+ ungroup (fu.interval) |>
+ group_modify (function (df.sub, grp) {
+ grp.name <- names (grp)
+ grp.value <- grp[[1 ]]
+ n.pop <- (dat[[grp.name]] == grp.value) |> sum ()
+ calculate.hazard (df.sub, n.pop)
+ }) |>
+ ungroup ()
+ }
+
+ calculate.hazard <- function (life.table, n.pop) {
+ n.removed <- life.table$ n.event + life.table$ n.censored
+ n.removed.accum <- c (0 , cumsum (n.removed)[- length (n.removed)])
+ life.table |>
+ mutate (
+ n.at.risk = n.pop - n.removed.accum,
+ # TODO : how to account for censored? How do we adjust for uneven interval?
+ hazard.rate = n.event / n.at.risk
+ )
+ }
+
+ recode.dat <- function (dat, time.intervals) {
+ df <- dat[dat$ follow.up <= sum (time.intervals), ]
+ time.points <- cumsum (time.intervals)
+ df$ fu.interval <- sapply (df$ follow.up, function (time) {
+ time.points[time <= time.points][1 ]
+ })
+
+ df
+ }
+
+
+
time.intervals <- c (1 / 3 , 1 / 3 , 1 / 3 , 1 , 1 , 1 , 1 )
+ grps <- c ("tx.type" , "sex" , "hla.match" , "cold.isc" )
+
+get.life.table.by.groups (dat, time.intervals, grps)
+
+
# A tibble: 593 × 7
+ fu.interval n.censored n.event n.at.risk hazard.rate grp.name grp.value
+ <dbl> <int> <int> <dbl> <dbl> <chr> <dbl>
+ 1 0.333 463 49 5148 0.00952 tx.type 0
+ 2 0.667 253 13 4636 0.00280 tx.type 0
+ 3 1 340 11 4370 0.00252 tx.type 0
+ 4 2 591 24 4019 0.00597 tx.type 0
+ 5 3 568 17 3404 0.00499 tx.type 0
+ 6 4 521 16 2819 0.00568 tx.type 0
+ 7 5 501 11 2282 0.00482 tx.type 0
+ 8 0.333 367 103 4627 0.0223 tx.type 1
+ 9 0.667 238 24 4157 0.00577 tx.type 1
+10 1 273 16 3895 0.00411 tx.type 1
+# ℹ 583 more rows
+
+
+
+
grps <- c ("tx.type" , "sex" , "hla.match" , "cold.isc" )
+ grps <- c ("tx.type" )
+
+
+ plot.hazard.rate.by.groups <- function (dat, time.intervals, grps) {
+ dat |>
+ get.life.table.by.groups (time.intervals, grps) |>
+ ggplot (aes (x = fu.interval, y = hazard.rate,
+ color = paste (grp.name, grp.value, sep = " = " ))) +
+ geom_point () +
+ geom_line () +
+ labs (
+ color = "Group"
+ ) +
+ theme (
+ legend.position = "bottom"
+ )
+ }
+
+
+
time.intervals <- c (rep (1 / 3 , 3 ), rep (1 , 4 ))
+plot.hazard.rate.by.groups (dat, time.intervals, c ("tx.type" ))
+
+
+
+
+
+
+
+
+
+
time.intervals <- rep (1 , 5 )
+plot.hazard.rate.by.groups (dat, time.intervals, c ("tx.type" ))
+
+
+
+
+
+
+
+
+
+
time.intervals <- rep (1 / 3 , 15 )
+plot.hazard.rate.by.groups (dat, time.intervals, c ("tx.type" ))
+
+
+
+
+
+
+
+
+
+
time.intervals <- c (rep (1 / 3 , 3 * 3 ), rep (1 , 2 ))
+plot.hazard.rate.by.groups (dat, time.intervals, c ("tx.type" ))
+
+
+
+
+
+
+
+
+
+
time.intervals <- c (rep (1 / 4 , 4 ), rep (1 , 4 ))
+plot.hazard.rate.by.groups (dat, time.intervals, c ("tx.type" ))
+
+
+
+
+
+
+
+
+
+
time.intervals <- c (rep (1 / 4 , 4 * 2 ), rep (1 , 3 ))
+plot.hazard.rate.by.groups (dat, time.intervals, c ("tx.type" ))
+
+
+
+
+
+
+
+
+
+
time.intervals <- c (rep (1 / 4 , 4 * 2 ), rep (1 / 3 , 3 * 3 ), rep (1 , 6 ))
+plot.hazard.rate.by.groups (dat, time.intervals, c ("tx.type" ))
+
+
+
+
+
+
+
+
+
+
Compare
+
+
time.intervals <- c (1 / 3 , 1 / 3 , 1 / 3 , 1 , 1 , 1 , 1 )
+get.life.table.by.groups (dat, time.intervals, grps)
+
+
# A tibble: 14 × 7
+ fu.interval n.censored n.event n.at.risk hazard.rate grp.name grp.value
+ <dbl> <int> <int> <dbl> <dbl> <chr> <int>
+ 1 0.333 463 49 5148 0.00952 tx.type 0
+ 2 0.667 253 13 4636 0.00280 tx.type 0
+ 3 1 340 11 4370 0.00252 tx.type 0
+ 4 2 591 24 4019 0.00597 tx.type 0
+ 5 3 568 17 3404 0.00499 tx.type 0
+ 6 4 521 16 2819 0.00568 tx.type 0
+ 7 5 501 11 2282 0.00482 tx.type 0
+ 8 0.333 367 103 4627 0.0223 tx.type 1
+ 9 0.667 238 24 4157 0.00577 tx.type 1
+10 1 273 16 3895 0.00411 tx.type 1
+11 2 573 38 3606 0.0105 tx.type 1
+12 3 575 24 2995 0.00801 tx.type 1
+13 4 532 24 2396 0.0100 tx.type 1
+14 5 418 20 1840 0.0109 tx.type 1
+
+
+
+
+
+
fu.interval hazard.rate.0 hazard.rate.1 hazard.ratio
+1 0.3333333 0.009518260 0.022260644 2.338731
+2 0.6666667 0.002804142 0.005773394 2.058881
+3 1.0000000 0.002517162 0.004107831 1.631929
+4 2.0000000 0.005971635 0.010537992 1.764675
+5 3.0000000 0.004994125 0.008013356 1.604557
+6 4.0000000 0.005675772 0.010016694 1.764816
+7 5.0000000 0.004820333 0.010869565 2.254941
+
+
Exercise 4
Show a plot with Kaplan-Meier survival curves for the two donor types.
-
km.tx <- survfit (Surv (follow.up, death) ~ tx.type, data = dat[dat$ follow.up <= 12 , ])
-
-plot (km.tx, col = c ("blue" , "orange" ))
-legend (legend = c ("cadaveric" , "living" ), "bottomleft" , lwd = 2 , col = c ("blue" , "orange" ))
+
km.tx <- survfit (Surv (follow.up, death) ~ tx.type, data = dat[dat$ follow.up <= 12 , ])
+
+plot (km.tx, col = c ("blue" , "orange" ))
+legend (legend = c ("cadaveric" , "living" ), "bottomleft" , lwd = 2 , col = c ("blue" , "orange" ))
-
+
@@ -437,8 +2103,8 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
Exercise 5
Fit a univariate Cox model with predictor donor type. Report the hazard ratio and 95% confidence interval and interpret the result obtained.
-
cox <- coxph (Surv (follow.up, death) ~ tx.type, data = dat)
-summary (cox)
+
cox.tx <- coxph (Surv (follow.up, death) ~ tx.type, data = dat)
+summary (cox.tx)
Call:
coxph(formula = Surv(follow.up, death) ~ tx.type, data = dat)
@@ -460,27 +2126,369 @@ Score (logrank) test = 47.09 on 1 df, p=7e-12
-
+
log.hazard.ratio.hat <- cox.tx$ coefficient
+ hazard.ratio.hat <- log.hazard.ratio.hat |> exp ()
+
+
+
log.hazard.ratio.se <- summary (cox.tx)$ coefficients[3 ]
+ log.hazard.ratio.hat.lower.bound <- log.hazard.ratio.hat - qnorm (0.975 ) * log.hazard.ratio.se
+ log.hazard.ratio.hat.upper.bound <- log.hazard.ratio.hat + qnorm (0.975 ) * log.hazard.ratio.se
+
+c (log.hazard.ratio.hat.lower.bound, log.hazard.ratio.hat.upper.bound) |> exp ()
-
[1] 1.90539
+
tx.type tx.type
+1.579899 2.297932
+
+
+
+
+
+
Patients receiving kidney transplants from living donors (tx.type = 1) has 1.91 higher hazard rate than those receiving from cadaveric donors (tx.type = 0).
+
Since 1 is not in the 95% confidence interval, the result is statistically significant.
+
+
+
+
+
+ Exercise 6
+Research shows that an important determinant of mortality after kidney transplant is the age of the recipient. Fit a Cox model with age as predictor and estimate the hazard ratio and its confidence interval. Consider age first as continuous variable and then divide into categories.
+
+ Continuous
+
+
cox.age.rec <- coxph (Surv (follow.up, death) ~ age.rec, data = dat)
+ cox.age.rec |> summary ()
+
+
Call:
+coxph(formula = Surv(follow.up, death) ~ age.rec, data = dat)
+
+ n= 9766, number of events= 464
+ (9 observations deleted due to missingness)
+
+ coef exp(coef) se(coef) z Pr(>|z|)
+age.rec -0.042550 0.958342 0.008434 -5.045 4.53e-07 ***
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+ exp(coef) exp(-coef) lower .95 upper .95
+age.rec 0.9583 1.043 0.9426 0.9743
+
+Concordance= 0.586 (se = 0.016 )
+Likelihood ratio test= 24.81 on 1 df, p=6e-07
+Wald test = 25.45 on 1 df, p=5e-07
+Score (logrank) test = 25.75 on 1 df, p=4e-07
+
+
+
+
+ Categorical (Case 1)
+
+
recode.age <- function (dat, ages) {
+ df$ age.group <- sapply (df$ follow.up, function (time) {
+ time.points[time <= time.points][1 ]
+ })
+
+ df
+ }
+
+
+
dat.age.rec <- dat[! is.na (dat$ age.rec), ]
+ class.age <- function (age) {
+ if (age < 2 ) {
+ return ("0,1" )
+ } else if (age < 6 ) {
+ return ("2,3,4,5" )
+ } else if (age < 11 ) {
+ return ("6,7,8,9,10" )
+ } else if (age < 19 ) {
+ return ("11-18" )
+ }
+ }
+
+
+
dat.age.rec$ age.group <- sapply (dat.age.rec$ age.rec, class.age) |>
+ factor (levels = c (
+ "0,1" ,
+ "2,3,4,5" ,
+ "6,7,8,9,10" ,
+ "11-18"
+ ))
+
+
+ cox.age.rec <- coxph (Surv (follow.up, death) ~ age.group, data = dat.age.rec)
+ cox.age.rec |> summary ()
+
+
Call:
+coxph(formula = Surv(follow.up, death) ~ age.group, data = dat.age.rec)
+
+ n= 9766, number of events= 464
+
+ coef exp(coef) se(coef) z Pr(>|z|)
+age.group2,3,4,5 -0.5947 0.5517 0.1767 -3.366 0.000764 ***
+age.group6,7,8,9,10 -1.0578 0.3472 0.1776 -5.956 2.59e-09 ***
+age.group11-18 -1.0192 0.3609 0.1513 -6.736 1.63e-11 ***
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+ exp(coef) exp(-coef) lower .95 upper .95
+age.group2,3,4,5 0.5517 1.813 0.3902 0.7801
+age.group6,7,8,9,10 0.3472 2.880 0.2452 0.4918
+age.group11-18 0.3609 2.771 0.2683 0.4855
+
+Concordance= 0.584 (se = 0.015 )
+Likelihood ratio test= 44.93 on 3 df, p=1e-09
+Wald test = 53.97 on 3 df, p=1e-11
+Score (logrank) test = 57.87 on 3 df, p=2e-12
-
1.90539 + 1.96 * exp (0.09558 )
+
emmeans (cox.age.rec, pairwise ~ age.group, type = "response" )
-
(2.298 - 1.58 ) / 2 |> exp () / 2
-
-
[1] 0.04858537
+
$emmeans
+ age.group response SE df asymp.LCL asymp.UCL
+ 0,1 1.000 0.0000 Inf 1.000 1.000
+ 2,3,4,5 0.552 0.0975 Inf 0.390 0.780
+ 6,7,8,9,10 0.347 0.0617 Inf 0.245 0.492
+ 11-18 0.361 0.0546 Inf 0.268 0.485
+
+Confidence level used: 0.95
+Intervals are back-transformed from the log scale
+
+$contrasts
+ contrast ratio SE df null z.ratio p.value
+ 0,1 / 2,3,4,5 1.813 0.320 Inf 1 3.366 0.0043
+ 0,1 / 6,7,8,9,10 2.880 0.511 Inf 1 5.956 <.0001
+ 0,1 / (11-18) 2.771 0.419 Inf 1 6.736 <.0001
+ 2,3,4,5 / 6,7,8,9,10 1.589 0.251 Inf 1 2.928 0.0179
+ 2,3,4,5 / (11-18) 1.529 0.196 Inf 1 3.314 0.0051
+ 6,7,8,9,10 / (11-18) 0.962 0.124 Inf 1 -0.298 0.9908
+
+P value adjustment: tukey method for comparing a family of 4 estimates
+Tests are performed on the log scale
-Exercise 6 — Research shows that an important determinant of mortality after kidney transplant is the age of the recipient. Fit a Cox model with age as predictor and estimate the hazard ratio and its confidence interval. Consider age first as continuous variable and then divide into categories.
-Exercise 7 — Fit a multivariate Cox model by using other predictors and describe your results.
-Exercise 8 — Estimate the survival function for specific covariate patterns. Based on the previous results choose the best predictors.
-Exercise 9 — Check the proportional hazards assumption. You may use the function cox.zph. Discuss the result and possible implications. Exercise 10 — Plot the Schoenfeld residuals and comment.
+
+ Exercise 7
+Fit a multivariate Cox model by using other predictors and describe your results.
+
+
sapply (names (dat), function (col) {
+ is.na (dat[, col]) |> sum ()
+ })
+
+
hla.match age.donor age.rec cold.isc death year sex tx.type
+ 234 113 9 2250 0 0 0 0
+follow.up
+ 0
+
+
+
+
+
+
[1] "hla.match" "age.donor" "age.rec" "cold.isc" "death" "year"
+[7] "sex" "tx.type" "follow.up"
+
+
+
+
Waldtest <- function (cox.fit, i.betas){
+ q <- length (i.betas)
+ coefs <- cox.fit$ coefficients
+ # print(cox.fit$var)
+ var <- cox.fit$ var[i.betas, i.betas]
+ Wald <- coefs[i.betas]%*% solve (var)%*% coefs[i.betas]
+ p.value <- 1 - pchisq (as.numeric (Wald), df = q)
+ return (cbind (Wald, p.value))
+ }
+
+
+
fit.cox.model <- function (surv, i.betas) {
+ fit.tx <- coxph (surv, data = dat)
+ Waldtest (fit.tx, i.betas)
+ }
+
+Start with tx.type
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec
+fit.cox.model (surv, 1 : 2 ) # NOTE : Why is this 1???
+
+
p.value
+[1,] 77.3778 0
+
+
+Iteration 1: tx.type + age + ?
+
+
+
+
[1] "hla.match" "age.donor" "age.rec" "cold.isc" "death" "year"
+[7] "sex" "tx.type" "follow.up"
+
+
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + hla.match
+fit.cox.model (surv, 3 )
+
+
p.value
+[1,] 8.146717 0.004313919
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + age.donor
+fit.cox.model (surv, 3 )
+
+
p.value
+[1,] 6.509223 0.01073164
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + cold.isc
+fit.cox.model (surv, 3 )
+
+
p.value
+[1,] 0.7057259 0.4008664
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + year
+fit.cox.model (surv, 3 )
+
+
p.value
+[1,] 4.680593 0.03050521
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + sex
+fit.cox.model (surv, 3 )
+
+
p.value
+[1,] 1.921176 0.1657271
+
+
+Add hla.match
+Iteration 2: tx.type + age.rec + hla.match + ?
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + hla.match + age.donor
+fit.cox.model (surv, 4 )
+
+
p.value
+[1,] 5.211567 0.0224371
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + hla.match + cold.isc
+fit.cox.model (surv, 4 )
+
+
p.value
+[1,] 0.9157965 0.3385811
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + hla.match + year
+fit.cox.model (surv, 4 )
+
+
p.value
+[1,] 4.94216 0.02620927
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + hla.match + sex
+fit.cox.model (surv, 4 )
+
+
p.value
+[1,] 0.981911 0.3217275
+
+
+Add age.donor
+Iteration 2: tx.type + age.rec + hla.match + age.donor + ?
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+ cold.isc
+fit.cox.model (surv, 5 )
+
+
p.value
+[1,] 0.4450261 0.5047065
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+ year
+fit.cox.model (surv, 5 )
+
+
p.value
+[1,] 4.367751 0.03662531
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+ sex
+fit.cox.model (surv, 5 )
+
+
p.value
+[1,] 0.9343691 0.3337302
+
+
+Add year
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+ year + cold.isc
+fit.cox.model (surv, 6 )
+
+
p.value
+[1,] 0.1614349 0.6878388
+
+
surv <- Surv (follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+ year + sex
+fit.cox.model (surv, 6 )
+
+
p.value
+[1,] 0.9641705 0.3261383
+
+
+Next, we test for each remaining variables
+
+
+ Exercise 8
+Estimate the survival function for specific covariate patterns. Based on the previous results choose the best predictors.
+Exercise 9 — Check the proportional hazards assumption. You may use the function cox.zph. Discuss the result and possible implications.
+H0: PH-assumption holds H1: PH-assumption doesn’t hold
+
+
# different cox model!!
+ cox.final <- coxph (Surv (follow.up, death) ~ tx.type + age.rec + hla.match + age.donor, data = dat)
+cox.zph (cox.final)
+
+
chisq df p
+tx.type 2.56 1 0.10948
+age.rec 35.98 1 2.0e-09
+hla.match 8.22 1 0.00414
+age.donor 11.05 1 0.00089
+GLOBAL 47.37 4 1.3e-09
+
+
+Exercise 10 — Plot the Schoenfeld residuals and comment. A non-random pattern or slope in a plot of scaled residuals against time indicates a violation, suggesting the covariate’s impact changes, while a horizontal line around zero supports the assumption.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+NOTES: left censoring: event happened before time zero -> not applicable left truncation: not applicable?? right censoring: we saw this in the follow up times, administrative censoring high
+! No left truncation, since our target population is patients who received a kidney transplant. So we don’t care about patients who died before receiving a transplant.
+! ADD: we used a Cox model but we already saw that the PH assumption doesn’t hold !
+? remove year from prediction model, since it is not a patient characteristic and may not be stable for future predictions. It mainly captures changes in clinical practice rather than individual risk.
+
diff --git a/slides.html b/slides.html
new file mode 100644
index 0000000..80b495b
--- /dev/null
+++ b/slides.html
@@ -0,0 +1,3075 @@
+
+
+
+
+
+
+
+
+
+
Slides
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Introduction: research question
+
+ Survival after transplantation (M)
+
+
+ Identify Predictors (M)
+
+
+ Why Survival Analysis (L)
+Motivation: study the distribution of time to event \(T\) .
+Example: time of death after kidney transplant.
+
+
+
+
+
+
+
+
+
+
+
+1
+2000
+2005
+
+
+2
+2000
+2009
+
+
+3
+2001
+2005
+
+
+4
+2003
+2004
+
+
+5
+2004
+2016
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+We then calculate time to death for each respondents.
+With this, all we need is to fit a linear model t ~ X or log(t) ~ X
+
+
+
+
+
+
+
+
+
+
+
+1
+2000
+2005
+5
+
+
+2
+2000
+2009
+9
+
+
+3
+2001
+2005
+4
+
+
+4
+2003
+2004
+1
+
+
+5
+2004
+2016
+12
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+What if we do not know when exactly some respondents die?
+Scenario 1: the study ends at the year 2008?
+
+
+
+
+
+
+
+
+
+
+
+1
+2000
+2005
+5
+
+
+2
+2000
+> 2008
+> 8
+
+
+3
+2001
+2005
+4
+
+
+4
+2003
+2004
+1
+
+
+5
+2004
+> 2008
+> 4
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Right Censoring : only observe the event (death) if it occurs before a certain time (2008).
+
+Scenario 2: respondent 3 move away; loss follow up
+
+
+
+
+
+
+
+
+
+
+
+1
+2000
+2005
+5
+
+
+2
+2000
+> 2008
+> 8
+
+
+3
+2001
+> 2005
+> 4
+
+
+4
+2003
+2004
+1
+
+
+5
+2004
+> 2008
+> 4
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+How many patients are right censored?
+
+
+
+
+
+
+
+
+
+
+
+ Challenges in Survival Analysis (L)
+
+Right Censoring : only observe the event if it occurs before a certain time.
+Left Censoring : event has occurred prior to the start of a research
+
+Follow up every 3 years.
+Event has occurred -> event happened sometime before follow up.
+
+Left Truncation : delayed entry; respondents are included only if they survived long enough.
+
+Start follow up with patients 100 days after the transplant
+Patients dies within 100 day wouldn’t be included in the dataset
+
+Right Truncation : respondents are included only if they have already experienced the event.
+
+Retrospective analysis from deceased patients between 1990 and 2000.
+Patients not dead before 2000 are not included in the dataset.
+
+
+
+
+
+ Method & Result
+
+
+ Table 1 (L)
+
+ HLA match hla.match
+
+
+
+ 0 1 2 3 4 5 6 <NA>
+ 784 1504 1553 3778 1278 365 279 234
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Donor age age.donor
+
+
mean (dat$ age.donor, na.rm = TRUE )
+
+
median (dat$ age.donor, na.rm = TRUE )
+
+
+
+
+
Warning: Removed 113 rows containing non-finite outside the scale range
+(`stat_count()`).
+
+
+
+
+
+
+
+
+
+
+
+
Warning: Removed 113 rows containing non-finite outside the scale range
+(`stat_count()`).
+
+
+
+
+
+
+
+
+
+
+
+
Warning: Removed 113 rows containing non-finite outside the scale range
+(`stat_density()`).
+
+
+
+
+
+
+
+
+
+
+
+ Recipient Age age.rec
+
+
mean (dat$ age.rec, na.rm = TRUE )
+
+
median (dat$ age.rec, na.rm = TRUE )
+
+
+
+
+
Warning: Removed 9 rows containing non-finite outside the scale range
+(`stat_count()`).
+
+
+
+
+
+
+
+
+
+
+
+
Warning: Removed 113 rows containing non-finite outside the scale range
+(`stat_boxplot()`).
+
+
+
+
+
+
+
+
+
+
+
+
Warning: Removed 9 rows containing non-finite outside the scale range
+(`stat_count()`).
+
+
+
+
+
+
+
+
+
+
+
+
Warning: Removed 9 rows containing non-finite outside the scale range
+(`stat_density()`).
+
+
+
+
+
+
+
+
+
+
+
+
Warning: Removed 2250 rows containing non-finite outside the scale range
+(`stat_boxplot()`).
+
+
+
+
+
+
+
+
+
+
+
+ Cold Ischemia Time cold.isc
+
+
mean (dat$ cold.isc, na.rm = TRUE )
+
+
median (dat$ cold.isc, na.rm = TRUE )
+
+
+
+
+
Warning: Removed 2250 rows containing non-finite outside the scale range
+(`stat_density()`).
+
+
+
+
+
+
+
+
+
+
+
+
Warning: Removed 2250 rows containing non-finite outside the scale range
+(`stat_density()`).
+
+
+
+
+
+
+
+
+
+
+
+ Transplant Type tx.type
+
+
+
+
+Cadaveric Living
+ 5148 4627
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Year year
+
+
+
+1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002
+ 728 734 666 736 787 813 789 745 740 842 706 834 655
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Warning: Removed 9 rows containing non-finite outside the scale range
+(`stat_boxplot()`).
+
+
+
+
+
+
+
+
+
+
+
+
Warning: Removed 113 rows containing non-finite outside the scale range
+(`stat_boxplot()`).
+
+
+
+
+
+
+
+
+
+
+
+
+ Overall Kaplan-Meier
+
+
+
+
+
+
+
+
+
+
+
+ Hazard Functions
+
+ death Distribution (?)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Overall (?)
+
+
+
# A tibble: 7 × 5
+ fu.interval n.censored n.event n.at.risk hazard.rate
+ <dbl> <int> <int> <dbl> <dbl>
+1 0.333 830 152 9775 0.0155
+2 0.667 491 37 8793 0.00421
+3 1 613 27 8265 0.00327
+4 2 1164 62 7625 0.00813
+5 3 1143 41 6399 0.00641
+6 4 1053 40 5215 0.00767
+7 5 919 31 4122 0.00752
+
+
+
+
+
+
+ Cox Model
+
+
+ age (categorical) (M)
+
+
+ Full model (L)
+
+
+
Call:
+coxph(formula = Surv(follow.up, death) ~ hla.match + tx.type,
+ data = dat)
+
+ n= 9541, number of events= 451
+ (234 observations deleted due to missingness)
+
+ coef exp(coef) se(coef) z Pr(>|z|)
+hla.match1 -0.08845 0.91535 0.17308 -0.511 0.60933
+hla.match2 -0.36340 0.69531 0.17919 -2.028 0.04256 *
+hla.match3 -0.50810 0.60164 0.18298 -2.777 0.00549 **
+hla.match4 -0.65410 0.51991 0.22221 -2.944 0.00324 **
+hla.match5 -0.39445 0.67405 0.29410 -1.341 0.17985
+hla.match6 -0.51491 0.59755 0.31489 -1.635 0.10201
+tx.typeLiving 0.39049 1.47771 0.12688 3.078 0.00209 **
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+ exp(coef) exp(-coef) lower .95 upper .95
+hla.match1 0.9153 1.0925 0.6520 1.2850
+hla.match2 0.6953 1.4382 0.4894 0.9879
+hla.match3 0.6016 1.6621 0.4203 0.8612
+hla.match4 0.5199 1.9234 0.3363 0.8037
+hla.match5 0.6740 1.4836 0.3787 1.1996
+hla.match6 0.5976 1.6735 0.3224 1.1077
+tx.typeLiving 1.4777 0.6767 1.1524 1.8949
+
+Concordance= 0.613 (se = 0.014 )
+Likelihood ratio test= 56.79 on 7 df, p=7e-10
+Wald test = 57.08 on 7 df, p=6e-10
+Score (logrank) test = 59.51 on 7 df, p=2e-10
+
+
+
Call:
+coxph(formula = Surv(follow.up, death) ~ hla.match * tx.type,
+ data = dat)
+
+ n= 9541, number of events= 451
+ (234 observations deleted due to missingness)
+
+ coef exp(coef) se(coef) z Pr(>|z|)
+hla.match1 0.8094 2.2467 0.8165 0.991 0.3215
+hla.match2 1.1371 3.1176 0.7530 1.510 0.1311
+hla.match3 0.4217 1.5245 0.7140 0.591 0.5548
+hla.match4 0.1094 1.1156 0.7341 0.149 0.8816
+hla.match5 0.4754 1.6087 0.7820 0.608 0.5432
+hla.match6 0.5836 1.7925 0.8021 0.728 0.4669
+tx.typeLiving 1.3801 3.9752 0.7218 1.912 0.0559 .
+hla.match1:tx.typeLiving -0.9614 0.3823 0.8355 -1.151 0.2498
+hla.match2:tx.typeLiving -1.6578 0.1905 0.7762 -2.136 0.0327 *
+hla.match3:tx.typeLiving -1.0096 0.3644 0.7461 -1.353 0.1760
+hla.match4:tx.typeLiving -0.5175 0.5960 0.7863 -0.658 0.5105
+hla.match5:tx.typeLiving -0.8601 0.4231 0.8805 -0.977 0.3286
+hla.match6:tx.typeLiving -1.3342 0.2634 0.9113 -1.464 0.1432
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+ exp(coef) exp(-coef) lower .95 upper .95
+hla.match1 2.2467 0.4451 0.45341 11.1322
+hla.match2 3.1176 0.3208 0.71257 13.6400
+hla.match3 1.5245 0.6559 0.37617 6.1787
+hla.match4 1.1156 0.8964 0.26460 4.7032
+hla.match5 1.6087 0.6216 0.34737 7.4498
+hla.match6 1.7925 0.5579 0.37212 8.6342
+tx.typeLiving 3.9752 0.2516 0.96600 16.3580
+hla.match1:tx.typeLiving 0.3823 2.6155 0.07435 1.9662
+hla.match2:tx.typeLiving 0.1905 5.2480 0.04162 0.8723
+hla.match3:tx.typeLiving 0.3644 2.7444 0.08443 1.5726
+hla.match4:tx.typeLiving 0.5960 1.6778 0.12763 2.7836
+hla.match5:tx.typeLiving 0.4231 2.3635 0.07533 2.3763
+hla.match6:tx.typeLiving 0.2634 3.7971 0.04414 1.5712
+
+Concordance= 0.615 (se = 0.014 )
+Likelihood ratio test= 66.78 on 13 df, p=3e-09
+Wald test = 65.9 on 13 df, p=5e-09
+Score (logrank) test = 69.92 on 13 df, p=8e-10
+
+
+
Analysis of Deviance Table
+ Cox model: response is Surv(follow.up, death)
+ Model 1: ~ hla.match + tx.type
+ Model 2: ~ hla.match * tx.type
+ loglik Chisq Df Pr(>|Chi|)
+1 -3848.5
+2 -3843.5 9.9891 6 0.1251
+
+
+
+
+ Discussion: include year or not?
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/slides.qmd b/slides.qmd
new file mode 100644
index 0000000..81b33bb
--- /dev/null
+++ b/slides.qmd
@@ -0,0 +1,609 @@
+---
+title: Slides
+execute:
+ cache: true
+ freeze: auto
+ include: true
+ echo: false
+number-sections: true
+---
+
+```{r}
+library(tidyverse)
+library(dplyr)
+library(ggplot2)
+library(survival)
+library(emmeans)
+library(foreign)
+library(gtsummary)
+library(gt)
+library(ggsurvfit)
+```
+
+```{r}
+dat <- read.csv("./unos.txt", sep = "\t")
+names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
+ "year", "sex", "tx.type", "follow.up")
+dat <- dat |>
+ mutate(
+ sex = factor(sex, levels = c(0,1), labels = c("Female","Male")),
+ tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")),
+ hla.match = factor(hla.match),
+ year = factor(year)
+ )
+```
+
+# Introduction: research question
+
+## Survival after transplantation (M)
+
+## Identify Predictors (M)
+
+## Why Survival Analysis (L)
+
+Motivation: study the distribution of time to event $T$.
+
+Example: time of death after kidney transplant.
+
+
+```{r}
+ex <- data.frame(
+ id = c(1, 2, 3, 4, 5),
+ transplant = c(2000, 2000, 2001, 2003, 2004),
+ death = c(2005, 2009, 2005, 2004, 2016)
+)
+
+ex_table_1 <- ex |>
+ gt() |>
+ cols_label(
+ id = "ID",
+ transplant = "Transplant",
+ death = "Death"
+ ) %>%
+ cols_align(align = "center", columns = everything())
+
+ex_plot_real_time <- ggplot(ex) +
+ geom_segment(aes(x = transplant, xend = death, y = factor(id),
+ yend = factor(id)), color = "grey50", linewidth = 1) +
+ geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
+ geom_point(aes(x = death, y = factor(id), color = "death"), size = 3) +
+ scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) +
+ scale_y_discrete(limits = rev) +
+ labs(x = "Year", y = "Subject ID", color = "Event") +
+ theme_minimal()
+```
+
+```{r}
+ex_table_1
+ex_plot_real_time
+```
+
+---
+
+We then calculate time to death for each respondents.
+
+With this, all we need is to fit a linear model `t ~ X` or `log(t) ~ X`
+
+```{r}
+ex_t <- ex |>
+ mutate(
+ t = death - transplant
+ )
+
+ex_table_2 <- ex_t |>
+ gt() |>
+ cols_label(
+ id = "ID",
+ transplant = "Transplant",
+ death = "Death",
+ t = "Time to death"
+ ) |>
+ cols_align(align = "center", columns = everything())
+
+ex_plot_uniform_time <- ggplot(ex_t) +
+ geom_segment(aes(x = 0, xend = t, y = factor(id),
+ yend = factor(id)), color = "grey50", linewidth = 1) +
+ geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
+ geom_point(aes(x = t, y = factor(id), color = "death"), size = 3) +
+ scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) +
+ scale_y_discrete(limits = rev) +
+ labs(x = "Year", y = "Subject ID", color = "Event") +
+ theme_minimal()
+
+```
+
+
+```{r}
+ex_table_2
+ex_plot_uniform_time
+```
+
+---
+
+What if we do not know when exactly some respondents die?
+
+Scenario 1: the study ends at the year 2008?
+
+```{r}
+ex_t_c <- ex_t |>
+ mutate(
+ death_censored = if_else(death <= 2008, death, 2008),
+ death_censored_txt = if_else(death <= 2008, as.character(death), "> 2008"),
+ status = if_else(death <= 2008, 1, 0),
+ t_1 = death_censored - transplant,
+ t_1_txt = if_else(death <= 2008, as.character(t_1),
+ paste(">", as.character(t_1)))
+ )
+```
+
+```{r}
+ex_table_3 <- ex_t_c |>
+ select(id, transplant, death_censored_txt, t_1_txt) |>
+ gt() |>
+ cols_label(
+ id = "ID",
+ transplant = "Transplant",
+ death_censored_txt = "Death",
+ t_1_txt = "Time to death"
+ ) |>
+ cols_align(align = "center", columns = everything())
+
+ex_plot_real_time_1 <- ggplot(ex_t_c) +
+ geom_segment(aes(x = transplant, xend = death_censored, y = factor(id),
+ yend = factor(id)), color = "grey50", linewidth = 1) +
+ geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
+ geom_point(aes(x = death_censored, y = factor(id), color =
+ if_else(status == 1, "death", "censored")), size = 3) +
+ scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
+ "censored" = "orange")) +
+ scale_y_discrete(limits = rev) +
+ labs(x = "Year", y = "Subject ID", color = "Event") +
+ xlim(2000, 2016) +
+ geom_vline(xintercept = 2008, linetype = "dashed", color = "orange",
+ linewidth = 0.8) +
+ theme_minimal()
+
+
+ex_plot_uniform_time_1 <- ggplot(ex_t_c) +
+ geom_segment(aes(x = 0, xend = t_1, y = factor(id),
+ yend = factor(id)), color = "grey50", linewidth = 1) +
+ geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
+ geom_point(aes(x = t_1, y = factor(id), color =
+ if_else(status == 1, "death", "censored")), size = 3) +
+ scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
+ "censored" = "orange")) +
+ scale_y_discrete(limits = rev) +
+ labs(x = "Year", y = "Subject ID", color = "Event") +
+ theme_minimal()
+```
+
+
+```{r}
+ex_table_3
+ex_plot_real_time_1
+ex_plot_uniform_time_1
+```
+
+**Right Censoring**: only observe the event (death) if it occurs before a
+certain time (2008).
+
+---
+
+Scenario 2: respondent 3 move away; loss follow up
+
+```{r}
+ex_alt <- ex_t_c
+ex_alt$death_censored_txt[3] <- "> 2005"
+ex_alt$status[3] <- 0
+ex_alt$t_1_txt[3] <- "> 4"
+
+
+ex_table_4 <- ex_alt |>
+ select(id, transplant, death_censored_txt, t_1_txt) |>
+ gt() |>
+ cols_label(
+ id = "ID",
+ transplant = "Transplant",
+ death_censored_txt = "Death",
+ t_1_txt = "Time to death"
+ ) |>
+ cols_align(align = "center", columns = everything())
+
+ex_plot_real_time_2 <- ggplot(ex_alt) +
+ geom_segment(aes(x = transplant, xend = death_censored, y = factor(id),
+ yend = factor(id)), color = "grey50", linewidth = 1) +
+ geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
+ geom_point(aes(x = death_censored, y = factor(id), color =
+ if_else(status == 1, "death", "censored")), size = 3) +
+ scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
+ "censored" = "orange")) +
+ scale_y_discrete(limits = rev) +
+ labs(x = "Year", y = "Subject ID", color = "Event") +
+ xlim(2000, 2016) +
+ geom_vline(xintercept = 2008, linetype = "dashed", color = "orange",
+ linewidth = 0.8) +
+ theme_minimal()
+
+
+ex_plot_uniform_time_2 <- ggplot(ex_alt) +
+ geom_segment(aes(x = 0, xend = t_1, y = factor(id),
+ yend = factor(id)), color = "grey50", linewidth = 1) +
+ geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
+ geom_point(aes(x = t_1, y = factor(id), color =
+ if_else(status == 1, "death", "censored")), size = 3) +
+ scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
+ "censored" = "orange")) +
+ scale_y_discrete(limits = rev) +
+ labs(x = "Year", y = "Subject ID", color = "Event") +
+ theme_minimal()
+```
+
+```{r}
+ex_table_4
+ex_plot_real_time_2
+ex_plot_uniform_time_2
+```
+
+---
+
+How many patients are right censored?
+
+```{r}
+dat |>
+ mutate(Overall = "Overall") |>
+ pivot_longer(
+ cols = c(Overall, sex, tx.type),
+ names_to = "Attribute",
+ values_to = "Category"
+ ) |>
+ count(Attribute, Category, death) |>
+ ggplot(aes(x = Category, y = n, fill = factor(death))) +
+ geom_col(position = "dodge") +
+ facet_wrap(~ Attribute, scales = "free_x") +
+ labs(x = "Group", y = "Count", fill = "Death Status") +
+ theme_minimal()
+```
+
+## Challenges in Survival Analysis (L)
+
+- **Right Censoring**: only observe the event if it occurs before a certain
+ time.
+- **Left Censoring**: event has occurred prior to the start of a research
+ - Follow up every 3 years.
+ - Event has occurred -> event happened sometime before follow up.
+- **Left Truncation**: delayed entry; respondents are included only if they
+ survived long enough.
+ - Start follow up with patients 100 days after the transplant
+ - Patients dies within 100 day wouldn't be included in the dataset
+- **Right Truncation**: respondents are included only if they have already
+ experienced the event.
+ - Retrospective analysis from deceased patients between 1990 and 2000.
+ - Patients not dead before 2000 are not included in the dataset.
+
+# Method & Result
+
+## Explain Dataset (M)
+
+## Table 1 (L)
+
+
+### HLA match `hla.match`
+
+```{r}
+dat$hla.match |> table(useNA = "always")
+```
+
+```{r}
+ggplot(dat, aes(x = hla.match)) +
+ geom_bar() +
+ labs(x = "HLA match", y = "Count") +
+ theme_minimal()
+
+```
+
+```{r}
+ggplot(dat, aes(x = hla.match, fill = tx.type)) +
+ geom_bar(position = "dodge") +
+ labs(x = "HLA match", y = "Count") +
+ theme_minimal()
+```
+
+### Donor age `age.donor`
+
+```{r}
+#| echo: true
+mean(dat$age.donor, na.rm = TRUE)
+median(dat$age.donor, na.rm = TRUE)
+```
+
+```{r}
+ggplot(dat, aes(x = age.donor)) +
+ geom_bar() +
+ labs(x = "Donor Age", y = "Count") +
+ theme_minimal()
+```
+
+```{r}
+#| fig-width: 8
+#| fig-height: 12
+ggplot(dat, aes(x = age.donor)) +
+ geom_bar() +
+ labs(x = "Donor Age", y = "Count") +
+ theme_minimal() +
+ facet_grid(hla.match ~ .)
+```
+
+```{r}
+ggplot(dat, aes(x = age.donor, color = hla.match)) +
+ geom_density() +
+ labs(x = "Donor Age", y = "Count") +
+ theme_minimal()
+```
+
+
+### Recipient Age `age.rec`
+
+```{r}
+#| echo: true
+mean(dat$age.rec, na.rm = TRUE)
+median(dat$age.rec, na.rm = TRUE)
+```
+
+
+```{r}
+ggplot(dat, aes(x = age.rec)) +
+ geom_bar() +
+ labs(x = "Recipient Age", y = "Count") +
+ theme_minimal()
+
+```
+
+```{r}
+ggplot(dat, aes(x = as.factor(age.rec), y = age.donor)) +
+ geom_boxplot() +
+ theme_minimal()
+
+```
+
+```{r}
+ggplot(dat, aes(x = age.rec, fill = tx.type)) +
+ geom_bar(position = "dodge") +
+ theme_minimal()
+
+```
+
+```{r}
+ggplot(dat, aes(x = age.rec, color = hla.match)) +
+ geom_density() +
+ theme_minimal()
+```
+
+```{r}
+ggplot(dat, aes(x = as.factor(age.rec), y = cold.isc)) +
+ geom_boxplot() +
+ theme_minimal()
+```
+
+
+### Cold Ischemia Time `cold.isc`
+
+```{r}
+#| echo: true
+mean(dat$cold.isc, na.rm = TRUE)
+median(dat$cold.isc, na.rm = TRUE)
+```
+
+```{r}
+ggplot(dat, aes(x = cold.isc)) +
+ geom_density() +
+ labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") +
+ theme_minimal()
+```
+
+```{r}
+ggplot(dat, aes(x = cold.isc, color = tx.type, group = tx.type)) +
+ geom_density() +
+ labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") +
+ theme_minimal()
+```
+
+### Transplant Type `tx.type`
+
+```{r}
+#| echo: true
+dat$tx.type |> table()
+```
+
+```{r}
+ggplot(dat, aes(x = tx.type)) +
+ geom_bar() +
+ labs(x = "Transplant Type", y = "Count") +
+ theme_minimal()
+
+```
+
+### Year `year`
+
+```{r}
+dat$year |> table()
+```
+
+```{r}
+ggplot(dat, aes(x = year)) +
+ geom_bar() +
+ labs(x = "Year", y = "Count") +
+ theme_minimal()
+```
+
+```{r}
+ggplot(dat, aes(x = year, y = follow.up)) +
+ geom_boxplot()
+```
+
+```{r}
+ggplot(dat, aes(x = year, fill = tx.type)) +
+ geom_bar(position = "dodge")
+```
+
+```{r}
+ggplot(dat, aes(x = year, y = age.rec)) +
+ geom_boxplot()
+```
+
+```{r}
+ggplot(dat, aes(x = year, y = age.donor)) +
+ geom_boxplot()
+```
+
+## Overall Kaplan-Meier
+
+```{r}
+km_all <- survfit(Surv(follow.up, death) ~ 1, data = dat)
+```
+
+```{r}
+km_all |>
+ ggsurvfit(type = "survival") +
+ add_confidence_interval() +
+ scale_y_continuous(limits = c(0, 1), labels = scales::label_percent()) +
+ labs(
+ x = "Years of Follow-up",
+ y = "Overall Survival Probability"
+ ) +
+ theme_minimal()
+```
+
+## Hazard Functions
+
+```{r}
+get.life.table <- function(dat, time.intervals) {
+ n.pop <- nrow(dat)
+
+ dat |>
+ recode.dat(time.intervals) |>
+ group_by(fu.interval) |>
+ summarize(
+ n.censored = sum(.data$death == 0),
+ n.event = sum(.data$death),
+ ) |>
+ ungroup() |>
+ calculate.hazard(n.pop)
+}
+
+get.life.table.by.groups <- function(dat, time.intervals, grps) {
+ grps |>
+ lapply(function(grp) {
+ dat |>
+ get.life.table.by.group(time.intervals, grp) |>
+ mutate(
+ grp.name = grp,
+ grp.value = pick(1)[[1]]
+ ) |>
+ select(-1)
+ }) |>
+ bind_rows()
+}
+
+get.life.table.by.group <- function(dat, time.intervals, grp) {
+ dat |>
+ recode.dat(time.intervals) |>
+ group_by(fu.interval, .data[[grp]]) |>
+ summarize(
+ n.censored = sum(.data$death == 0),
+ n.event = sum(.data$death),
+ .groups = "keep"
+ ) |>
+ ungroup(fu.interval) |>
+ group_modify(function(df.sub, grp) {
+ grp.name <- names(grp)
+ grp.value <- grp[[1]]
+ n.pop <- (dat[[grp.name]] == grp.value) |> sum()
+ calculate.hazard(df.sub, n.pop)
+ }) |>
+ ungroup()
+}
+
+calculate.hazard <- function(life.table, n.pop) {
+ n.removed <- life.table$n.event + life.table$n.censored
+ n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)])
+ life.table |>
+ mutate(
+ n.at.risk = n.pop - n.removed.accum,
+ # TODO: how to account for censored? How do we adjust for uneven interval?
+ hazard.rate = n.event / n.at.risk
+ )
+}
+
+recode.dat <- function(dat, time.intervals) {
+ df <- dat[dat$follow.up <= sum(time.intervals), ]
+ time.points <- cumsum(time.intervals)
+ df$fu.interval <- sapply(df$follow.up, function(time) {
+ time.points[time <= time.points][1]
+ })
+
+ df
+}
+```
+
+### `death` Distribution (?)
+
+```{r}
+dat |>
+ mutate(
+ accum.death = cumsum(death),
+ accum.censored = if_else(death == 1, 0, 1) |> cumsum()
+ ) |>
+ ggplot() +
+ geom_step(aes(x = follow.up, y = accum.death))
+```
+
+```{r}
+
+```
+
+```{r}
+plot.death = dat[dat$death == 1,] |>
+ ggplot(aes(x = follow.up)) +
+ geom_histogram(bins = 50)
+plot.death
+```
+
+
+### Overall (?)
+
+```{r}
+time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
+get.life.table(dat, time.intervals)
+```
+
+### `tx.type` (M)
+
+
+```{r}
+
+```
+
+
+## Cox Model
+
+### `age` (continuous) (L)
+
+### `age` (categorical) (M)
+
+### Full model (L)
+
+```{r}
+m1 <- coxph(Surv(follow.up, death) ~ hla.match + tx.type, data = dat)
+summary(m1)
+
+m2 <- coxph(Surv(follow.up, death) ~ hla.match * tx.type, data = dat)
+summary(m2)
+
+anova(m1, m2, test = "LRT")
+```
+
+### Discussion: include `year` or not?
+
+## Assumption Testing
+
+# Conclusion (M + L)