discussion 2

2026-05-14 09:15:59 +02:00 · 2026-05-14 09:15:59 +02:00 · bce631d9e0
commit bce631d9e0
parent 7b94e4bf64
2 changed files with 377 additions and 23 deletions
--- a/report.Rmd
+++ b/report.Rmd
@ -8,20 +8,34 @@ number-sections: true

 ```{r}
 library(tidyverse)
+library(dplyr)
+library(ggplot2)
 library(survival)
+library(emmeans)
+library(foreign)
 # library(gtsummary)
 ```

 ```{r}
 dat <- read.csv("./unos.txt", sep = "\t")
 head(dat)
-```
-
-```{r}
 names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
                "year", "sex", "tx.type", "follow.up")
 ```

+Complete case
+
+```{r}
+nrow(dat)
+```
+
+
+```{r}
+sapply(names(dat), function(col) {
+  is.na(dat[, col]) |> sum()
+})
+```
+


 # Exercise
@ -47,6 +61,10 @@ g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor))
 # dat$age.1 |> table()
 ```

+```{r}
+g + geom_boxplot(aes(x = year, y = follow.up, group = year))
+```
+
 ## Exercise 2

 Plot the Kaplan-Meier overall survival curve for pediatric kid-
@ -115,7 +133,7 @@ print(nrow(dat))
 dat.5.life

 ```
-
+n.removed.acc
 ```{r}
 dat.5.life <- dat.5.life |>
  mutate(
@ -128,8 +146,8 @@ dat.5.life <- dat.5.life |>


 ```{r}
-get_life_table = function(dat) {
-  dat <- dat |>
+get_life_table = function(dat.sub, dat) {
+  dat.sub <- dat.sub |>
    group_by(fu.interval) |>
    summarize(
      n.censored = sum(death == 0),
@ -137,24 +155,24 @@ get_life_table = function(dat) {
      n.at.risk = nrow(dat),
    )

-  for (i in 2:nrow(dat)) {
+  for (i in 2:nrow(dat.sub)) {
    j <- i - 1

-    n.censored.pre <- dat$n.censored[j]
-    n.event.pre <- dat$n.event[j]
-    n.at.risk.pre <- dat$n.at.risk[j]
+    n.censored.pre <- dat.sub$n.censored[j]
+    n.event.pre <- dat.sub$n.event[j]
+    n.at.risk.pre <- dat.sub$n.at.risk[j]

    n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre

-    dat$n.at.risk[i] <- n.at.risk
+    dat.sub$n.at.risk[i] <- n.at.risk
  }

-  dat <- dat |>
+  dat.sub <- dat.sub |>
    mutate(
       hazard.rate = n.event / n.at.risk
    )

-  return(dat)
+  return(dat.sub)
 }
 ```

@ -164,12 +182,12 @@ dat.5.tx1 = dat.5[dat.5$tx.type == 1, ]
 ```

 ```{r}
-tx0.life <- get_life_table(dat.5.tx0)
+tx0.life <- get_life_table(dat.5.tx0, dat[dat$tx.type == 0, ])
 tx0.life
 ```

 ```{r}
-tx1.life <- get_life_table(dat.5.tx1)
+tx1.life <- get_life_table(dat.5.tx1, dat[dat$tx.type == 1, ])
 tx1.life
 ```

@ -195,6 +213,159 @@ ggplot(hazard.df, aes(x = fu.interval)) +

 ```

+---
+
+General Solution:
+
+```{r}
+get.life.table <- function(dat, time.intervals) {
+  n.pop <- nrow(dat)
+
+  dat |>
+    recode.dat(time.intervals) |>
+    group_by(fu.interval) |>
+    summarize(
+      n.censored = sum(.data$death == 0),
+      n.event = sum(.data$death),
+    ) |>
+    ungroup() |>
+    calculate.hazard(n.pop)
+}
+
+get.life.table.by.groups <- function(dat, time.intervals, grps) {
+  grps |>
+    lapply(function(grp) {
+      dat |>
+        get.life.table.by.group(time.intervals, grp) |>
+        mutate(
+          grp.name = grp,
+          grp.value = pick(1)[[1]]
+        ) |>
+        select(-1)
+    }) |>
+    bind_rows()
+}
+
+get.life.table.by.group <- function(dat, time.intervals, grp) {
+  dat |>
+    recode.dat(time.intervals) |>
+    group_by(fu.interval, .data[[grp]]) |>
+    summarize(
+      n.censored = sum(.data$death == 0),
+      n.event = sum(.data$death),
+      .groups = "keep"
+    ) |>
+    ungroup(fu.interval) |>
+    group_modify(function(df.sub, grp) {
+      grp.name <- names(grp)
+      grp.value <- grp[[1]]
+      n.pop <- (dat[[grp.name]] == grp.value) |> sum()
+      calculate.hazard(df.sub, n.pop)
+    }) |>
+    ungroup()
+}
+
+calculate.hazard <- function(life.table, n.pop) {
+  n.removed <- life.table$n.event + life.table$n.censored
+  n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)])
+  life.table |>
+    mutate(
+      n.at.risk = n.pop - n.removed.accum,
+      # TODO: how to account for censored? How do we adjust for uneven interval?
+      hazard.rate = n.event / n.at.risk
+    )
+}
+
+recode.dat <- function(dat, time.intervals) {
+  df <- dat[dat$follow.up <= sum(time.intervals), ]
+  time.points <- cumsum(time.intervals)
+  df$fu.interval <- sapply(df$follow.up, function(time) {
+      time.points[time <= time.points][1]
+  })
+
+  df
+}
+
+
+
+```
+
+```{r}
+time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
+grps <- c("tx.type", "sex", "hla.match", "cold.isc")
+
+get.life.table.by.groups(dat, time.intervals, grps)
+```
+
+```{r}
+grps <- c("tx.type", "sex", "hla.match", "cold.isc")
+grps <- c("tx.type")
+
+
+plot.hazard.rate.by.groups <- function(dat, time.intervals, grps) {
+  dat |>
+    get.life.table.by.groups(time.intervals, grps) |>
+    ggplot(aes(x = fu.interval, y = hazard.rate,
+               color = paste(grp.name, grp.value, sep = " = "))) +
+    geom_point() +
+    geom_line() +
+    labs(
+      color = "Group"
+    ) +
+    theme(
+      legend.position = "bottom"
+    )
+}
+
+```
+```{r}
+time.intervals <- c(rep(1/3, 3), rep(1, 4))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+```{r}
+time.intervals <- rep(1, 5) 
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+```{r}
+time.intervals <- rep(1/3, 15)
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+```{r}
+time.intervals <- c(rep(1/3, 3*3), rep(1, 2))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+```{r}
+time.intervals <- c(rep(1/4, 4), rep(1, 4))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+```{r}
+time.intervals <- c(rep(1/4, 4*2), rep(1, 3))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+```{r}
+time.intervals <- c(rep(1/4, 4*2), rep(1/3, 3*3), rep(1, 6))
+plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
+```
+
+---
+
+Compare
+
+```{r}
+time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
+get.life.table.by.groups(dat, time.intervals, grps)
+```
+
+```{r}
+hazard.df
+```
+


 ## Exercise 4
@ -214,31 +385,209 @@ Fit a univariate Cox model with predictor donor type. Report
 the hazard ratio and 95% confidence interval and interpret the result obtained.

 ```{r}
-cox <- coxph(Surv(follow.up, death) ~ tx.type, data = dat)
-summary(cox)
+cox.tx <- coxph(Surv(follow.up, death) ~ tx.type, data = dat)
+summary(cox.tx)
 ```

 ```{r}
-1.90539
+log.hazard.ratio.hat <- cox.tx$coefficient
+hazard.ratio.hat <- log.hazard.ratio.hat |> exp()
+
 ```

 ```{r}
-1.90539 + 1.96 * exp(0.09558)
-(2.298 - 1.58) / 2 |> exp() / 2
+log.hazard.ratio.se <- summary(cox.tx)$coefficients[3]
+log.hazard.ratio.hat.lower.bound <- log.hazard.ratio.hat - qnorm(0.975) * log.hazard.ratio.se
+log.hazard.ratio.hat.upper.bound <- log.hazard.ratio.hat + qnorm(0.975) * log.hazard.ratio.se
+
+c(log.hazard.ratio.hat.lower.bound, log.hazard.ratio.hat.upper.bound) |> exp()
 ```

-Exercise 6 — Research shows that an important determinant of mortality
+::: {.callout-note}
+Patients receiving kidney transplants from living donors (`tx.type = 1`) has
+1.91 higher hazard rate than those receiving from cadaveric donors 
+(`tx.type = 0`).
+
+Since 1 is not in the 95% confidence interval, the result is statistically 
+significant.
+:::
+
+
+# Exercise 6
+
+Research shows that an important determinant of mortality
 after kidney transplant is the age of the recipient. Fit a Cox model with age
 as predictor and estimate the hazard ratio and its confidence interval. Consider
 age first as continuous variable and then divide into categories.

-Exercise 7 — Fit a multivariate Cox model by using other predictors and
+## Continuous
+
+```{r}
+cox.age.rec <- coxph(Surv(follow.up, death) ~ age.rec, data = dat)
+cox.age.rec |> summary()
+```
+
+## Categorical (Case 1)
+
+```{r}
+recode.age <- function(dat, ages) {
+  df$age.group <- sapply(df$follow.up, function(time) {
+      time.points[time <= time.points][1]
+  })
+
+  df
+}
+```
+
+```{r}
+dat.age.rec <- dat[!is.na(dat$age.rec), ]
+class.age <- function(age) {
+  if (age < 2) { 
+    return("0,1")
+  } else if (age < 6) {
+    return("2,3,4,5")
+  } else if (age < 11) {
+    return("6,7,8,9,10")
+  } else if (age < 19) {
+    return("11-18")
+  }
+}
+```
+
+```{r}
+dat.age.rec$age.group <- sapply(dat.age.rec$age.rec, class.age) |>
+  factor(levels = c(
+    "0,1",
+    "2,3,4,5",
+    "6,7,8,9,10",
+    "11-18"
+  ))
+
+
+cox.age.rec <- coxph(Surv(follow.up, death) ~ age.group, data = dat.age.rec)
+cox.age.rec |> summary()
+```
+
+```{r}
+emmeans(cox.age.rec, pairwise ~ age.group, type = "response")
+```
+
+
+# Exercise 7
+
+Fit a multivariate Cox model by using other predictors and
 describe your results.

-Exercise 8 — Estimate the survival function for specific covariate patterns.
+
+```{r}
+sapply(names(dat), function(col) {
+  is.na(dat[, col]) |> sum()
+})
+```
+
+```{r}
+names(dat)
+```
+
+```{r}
+Waldtest <- function(cox.fit, i.betas){
+  q <- length(i.betas)
+  coefs <- cox.fit$coefficients
+  # print(cox.fit$var)
+  var <- cox.fit$var[i.betas, i.betas]
+  Wald <- coefs[i.betas]%*%solve(var)%*%coefs[i.betas]
+  p.value <- 1-pchisq(as.numeric(Wald), df = q)
+  return(cbind(Wald, p.value))
+}
+```
+
+
+```{r}
+fit.cox.model <- function(surv, i.betas) {
+  fit.tx <- coxph(surv, data = dat)
+  Waldtest(fit.tx, i.betas)
+}
+```
+
+Start with `tx.type`
+
+```{r}
+surv <- Surv(follow.up, death) ~ tx.type + age.rec
+fit.cox.model(surv, 1:2) # NOTE: Why is this 1???
+```
+
+Iteration 1: `tx.type + age + ?`
+
+```{r}
+names(dat)
+```
+
+```{r}
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match
+fit.cox.model(surv, 3)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + age.donor
+fit.cox.model(surv, 3)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + cold.isc
+fit.cox.model(surv, 3)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + year
+fit.cox.model(surv, 3)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + sex
+fit.cox.model(surv, 3)
+```
+
+Add `hla.match`
+
+Iteration 2: `tx.type + age.rec + hla.match + ?`
+
+```{r}
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor
+fit.cox.model(surv, 4)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + cold.isc
+fit.cox.model(surv, 4)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + year
+fit.cox.model(surv, 4)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + sex
+fit.cox.model(surv, 4)
+```
+
+Add `age.donor`
+
+Iteration 2: `tx.type + age.rec + hla.match + age.donor + ?`
+
+```{r}
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+  cold.isc
+fit.cox.model(surv, 5)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+  year
+fit.cox.model(surv, 5)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
+  sex
+fit.cox.model(surv, 5)
+```
+
+Add `year`
+
+```{r}
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + 
+  year + cold.isc
+fit.cox.model(surv, 6)
+surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + 
+  year + sex
+fit.cox.model(surv, 6)
+```
+
+
+
+Next, we test for each remaining variables
+
+# Exercise 8
+
+Estimate the survival function for specific covariate patterns.
 Based on the previous results choose the best predictors.

 Exercise 9 — Check the proportional hazards assumption. You may use the
 function cox.zph. Discuss the result and possible implications.
+
 Exercise 10 — Plot the Schoenfeld residuals and comment.