foo

library(tidyverse)
library(dplyr)
library(ggplot2)
library(survival)
library(emmeans)
library(foreign)
library(gtsummary)
dat <- read.csv("./unos.txt", sep = "\t")
head(dat)
  hlamat age age.1 cold_isc death year sex txtype fu
1      2  25    15        6     0 2002   0      1  0
2      1  42    10       11     1 1999   0      1  0
3      1   9    14        8     0 2002   1      1  0
4      0  35    12       26     0 2002   1      1  0
5      2  17    12       28     1 1997   1      1  0
6      3  44    13        1     0 2002   1      0  0
names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
                "year", "sex", "tx.type", "follow.up")

Complete case

nrow(dat)
[1] 9775
sapply(names(dat), function(col) {
  is.na(dat[, col]) |> sum()
})
hla.match age.donor   age.rec  cold.isc     death      year       sex   tx.type 
      234       113         9      2250         0         0         0         0 
follow.up 
        0 

1 Exercise

1.1 Exercise 1

Illustrate in a table the characteristics of the population (age, sex, race, donor, . . . ).

hist(dat$age.donor)

hist(dat$age.rec)

hist(dat$cold.isc)

hist(dat$year)

hist(dat$follow.up)

unique(dat$hla.match)
[1]  2  1  0  3  4 NA  5  6
unique(dat$year)
 [1] 2002 1999 1997 2001 1994 1995 1996 1993 1990 1991 1992 2000 1998

Change the order!!

dat.table1 <- dat |> 
  mutate(
    sex = factor(sex, levels = c(0,1), labels = c("Female","Male")),
    tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")),
    hla.match = factor(hla.match),
    year = factor(year)
  )
table1 <- dat.table1 |> 
  select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |> 
  tbl_summary(
    statistic = list(
      all_continuous() ~ "{median} ({p25}, {p75})"
    ),
    label = list(
      hla.match ~ "HLA matches, n(%)",
      age.donor ~ "Donor age, median (IQR)",
      age.rec ~ "Recipient age, median (IQR)",
      cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
      year ~ "Year of transplant",
      sex ~ "Sex, n(%)",
      tx.type ~ "Transplant type, n(%)"
    ),
    missing = "ifany"
  ) |> 
  modify_footnote(all_stat_cols() ~ NA)

table1
Characteristic N = 9,775
HLA matches, n(%)
    0 784 (8.2%)
    1 1,504 (16%)
    2 1,553 (16%)
    3 3,778 (40%)
    4 1,278 (13%)
    5 365 (3.8%)
    6 279 (2.9%)
    Unknown 234
Donor age, median (IQR) 33 (21, 41)
    Unknown 113
Recipient age, median (IQR) 13 (8, 16)
    Unknown 9
Cold ischemic time (hours), median (IQR), 7 (1, 19)
    Unknown 2,250
Year of transplant
    1990 728 (7.4%)
    1991 734 (7.5%)
    1992 666 (6.8%)
    1993 736 (7.5%)
    1994 787 (8.1%)
    1995 813 (8.3%)
    1996 789 (8.1%)
    1997 745 (7.6%)
    1998 740 (7.6%)
    1999 842 (8.6%)
    2000 706 (7.2%)
    2001 834 (8.5%)
    2002 655 (6.7%)
Sex, n(%)
    Female 4,014 (41%)
    Male 5,761 (59%)
Transplant type, n(%)
    Cadaveric 5,148 (53%)
    Living 4,627 (47%)
table.split <- dat.table1 |> 
  select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |> 
  tbl_summary(
    by = tx.type,
    statistic = list(
      all_continuous() ~ "{median} ({p25}, {p75})"
    ),
    label = list(
      hla.match ~ "HLA matches, n(%)",
      age.donor ~ "Donor age, median (IQR)",
      age.rec ~ "Recipient age, median (IQR)",
      cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
      year ~ "Year of transplant",
      sex ~ "Sex, n(%)"
    ),
    missing = "ifany"
  ) |> 
  add_overall() |> 
  modify_footnote(all_stat_cols() ~ NA)

table.split
Characteristic Overall
N = 9,775
Cadaveric
N = 5,148
Living
N = 4,627
HLA matches, n(%)


    0 784 (8.2%) 125 (2.5%) 659 (14%)
    1 1,504 (16%) 160 (3.2%) 1,344 (29%)
    2 1,553 (16%) 228 (4.6%) 1,325 (29%)
    3 3,778 (40%) 3,039 (61%) 739 (16%)
    4 1,278 (13%) 1,007 (20%) 271 (5.9%)
    5 365 (3.8%) 241 (4.9%) 124 (2.7%)
    6 279 (2.9%) 153 (3.1%) 126 (2.7%)
    Unknown 234 195 39
Donor age, median (IQR) 33 (21, 41) 37 (31, 42) 22 (15, 37)
    Unknown 113 112 1
Recipient age, median (IQR) 13 (8, 16) 13 (7, 16) 14 (9, 16)
    Unknown 9 7 2
Cold ischemic time (hours), median (IQR), 7 (1, 19) 1 (0, 1) 19 (13, 25)
    Unknown 2,250 1,490 760
Year of transplant


    1990 728 (7.4%) 318 (6.2%) 410 (8.9%)
    1991 734 (7.5%) 387 (7.5%) 347 (7.5%)
    1992 666 (6.8%) 345 (6.7%) 321 (6.9%)
    1993 736 (7.5%) 403 (7.8%) 333 (7.2%)
    1994 787 (8.1%) 355 (6.9%) 432 (9.3%)
    1995 813 (8.3%) 427 (8.3%) 386 (8.3%)
    1996 789 (8.1%) 416 (8.1%) 373 (8.1%)
    1997 745 (7.6%) 408 (7.9%) 337 (7.3%)
    1998 740 (7.6%) 412 (8.0%) 328 (7.1%)
    1999 842 (8.6%) 436 (8.5%) 406 (8.8%)
    2000 706 (7.2%) 392 (7.6%) 314 (6.8%)
    2001 834 (8.5%) 483 (9.4%) 351 (7.6%)
    2002 655 (6.7%) 366 (7.1%) 289 (6.2%)
Sex, n(%)


    Female 4,014 (41%) 2,105 (41%) 1,909 (41%)
    Male 5,761 (59%) 3,043 (59%) 2,718 (59%)

Calculate median follow-up (reverse Kaplan-Meier method), with 95% CI

rev_km <- survfit(Surv(follow.up, death == 0) ~ 1, data = dat)

summary(rev_km)$table
     records        n.max      n.start       events        rmean    se(rmean) 
9.775000e+03 9.775000e+03 9.775000e+03 9.310000e+03 4.055027e+00 3.202711e-02 
      median      0.95LCL      0.95UCL 
3.380822e+00 3.227397e+00 3.567123e+00 
median_followup <- summary(rev_km)$table["median"]
confidence_interval <- c(summary(rev_km)$table["0.95LCL"], summary(rev_km)$table["0.95UCL"])
median_followup
  median 
3.380822 
confidence_interval
 0.95LCL  0.95UCL 
3.227397 3.567123 
g <- ggplot(dat)
g + geom_point(aes(x = follow.up, y = death))

g +  geom_density(aes(x = follow.up))

g +  geom_density(aes(x = age.donor))
Warning: Removed 113 rows containing non-finite outside the scale range
(`stat_density()`).

g +  geom_bar(aes(x = age.rec))
Warning: Removed 9 rows containing non-finite outside the scale range
(`stat_count()`).

g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor))
Warning: Removed 113 rows containing missing values or values outside the scale range
(`stat_boxplot()`).
Warning: Removed 7 rows containing non-finite outside the scale range
(`stat_boxplot()`).

# dat$age.1 |> table()
g + geom_boxplot(aes(x = year, y = follow.up, group = year))

1.2 Exercise 2

Plot the Kaplan-Meier overall survival curve for pediatric kid- ney transplant recipients for the first 12 years after transplantation.

!!!!!!!! just set xlim to 12, do not remove those individuals !!!!!!!

km <- survfit(Surv(follow.up, death) ~ 1, data = dat[dat$follow.up  <= 12, ])
plot(km)

1.3 Exercise 3

We are going to compare mortality rates (hazard functions) between children whose transplanted kidney was provided by a living donor (in general a family member) and those whose source was recently deceased (variable donor type: txtype). Use the life table method to calculate the death rates for the first 5 years for each group (take in the first year intervals of 4 months and then look at each year) and show the results in a table. Estimate the hazard ratio in each time interval as the ratio between the mortality rates in the two groups. What do you notice?

dat.5 <- dat[dat$follow.up <= 5, ]
head(dat.5)
  hla.match age.donor age.rec cold.isc death year sex tx.type follow.up
1         2        25      15        6     0 2002   0       1         0
2         1        42      10       11     1 1999   0       1         0
3         1         9      14        8     0 2002   1       1         0
4         0        35      12       26     0 2002   1       1         0
5         2        17      12       28     1 1997   1       1         0
6         3        44      13        1     0 2002   1       0         0
classify_time_interval = function(fu) {
  if (fu <= 1/3) {
    return(1/3)
  } else if (fu <= 2/3) {
    return(2/3)
  } else if (fu <= 1) {
    return(1)
  }
  ceiling(fu)
}

dat.5$fu.interval <- sapply(dat.5$follow.up, classify_time_interval)
table(dat.5$fu.interval)

0.333333333333333 0.666666666666667                 1                 2 
              982               528               640              1226 
                3                 4                 5 
             1184              1093               950 
head(dat.5)
  hla.match age.donor age.rec cold.isc death year sex tx.type follow.up
1         2        25      15        6     0 2002   0       1         0
2         1        42      10       11     1 1999   0       1         0
3         1         9      14        8     0 2002   1       1         0
4         0        35      12       26     0 2002   1       1         0
5         2        17      12       28     1 1997   1       1         0
6         3        44      13        1     0 2002   1       0         0
  fu.interval
1   0.3333333
2   0.3333333
3   0.3333333
4   0.3333333
5   0.3333333
6   0.3333333
dat.5.life <- dat.5 |>
  group_by(fu.interval) |>
  summarize(
    n.censored = sum(death == 0),
    n.event = sum(death),
    n.at.risk = nrow(dat),
  )

for (i in 2:nrow(dat.5.life)) {
  j <- i - 1

  n.censored.pre <- dat.5.life$n.censored[j]
  n.event.pre <- dat.5.life$n.event[j]
  n.at.risk.pre <- dat.5.life$n.at.risk[j]

  n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre

  dat.5.life$n.at.risk[i] <- n.at.risk
}

print(nrow(dat))
[1] 9775
dat.5.life
# A tibble: 7 × 4
  fu.interval n.censored n.event n.at.risk
        <dbl>      <int>   <int>     <int>
1       0.333        830     152      9775
2       0.667        491      37      8793
3       1            613      27      8265
4       2           1164      62      7625
5       3           1143      41      6399
6       4           1053      40      5215
7       5            919      31      4122

n.removed.acc

dat.5.life <- dat.5.life |>
  mutate(
     hazard.rate = n.event / n.at.risk
  )

get_life_table = function(dat.sub, dat) {
  dat.sub <- dat.sub |>
    group_by(fu.interval) |>
    summarize(
      n.censored = sum(death == 0),
      n.event = sum(death),
      n.at.risk = nrow(dat),
    )

  for (i in 2:nrow(dat.sub)) {
    j <- i - 1

    n.censored.pre <- dat.sub$n.censored[j]
    n.event.pre <- dat.sub$n.event[j]
    n.at.risk.pre <- dat.sub$n.at.risk[j]

    n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre

    dat.sub$n.at.risk[i] <- n.at.risk
  }

  dat.sub <- dat.sub |>
    mutate(
       hazard.rate = n.event / n.at.risk
    )

  return(dat.sub)
}
dat.5.tx0 = dat.5[dat.5$tx.type == 0, ]
dat.5.tx1 = dat.5[dat.5$tx.type == 1, ]
tx0.life <- get_life_table(dat.5.tx0, dat[dat$tx.type == 0, ])
tx0.life
# A tibble: 7 × 5
  fu.interval n.censored n.event n.at.risk hazard.rate
        <dbl>      <int>   <int>     <int>       <dbl>
1       0.333        463      49      5148     0.00952
2       0.667        253      13      4636     0.00280
3       1            340      11      4370     0.00252
4       2            591      24      4019     0.00597
5       3            568      17      3404     0.00499
6       4            521      16      2819     0.00568
7       5            501      11      2282     0.00482
tx1.life <- get_life_table(dat.5.tx1, dat[dat$tx.type == 1, ])
tx1.life
# A tibble: 7 × 5
  fu.interval n.censored n.event n.at.risk hazard.rate
        <dbl>      <int>   <int>     <int>       <dbl>
1       0.333        367     103      4627     0.0223 
2       0.667        238      24      4157     0.00577
3       1            273      16      3895     0.00411
4       2            573      38      3606     0.0105 
5       3            575      24      2995     0.00801
6       4            532      24      2396     0.0100 
7       5            418      20      1840     0.0109 
tx1.life$hazard.rate / tx0.life$hazard.rate
[1] 2.338731 2.058881 1.631929 1.764675 1.604557 1.764816 2.254941
hazard.df <- data.frame(
  fu.interval = tx1.life$fu.interval,
  hazard.rate.0 = tx0.life$hazard.rate,
  hazard.rate.1 = tx1.life$hazard.rate,
  hazard.ratio = tx1.life$hazard.rate / tx0.life$hazard.rate
) 

ggplot(hazard.df, aes(x = fu.interval)) +
  geom_line(aes(y = hazard.rate.0), color = "blue") + 
  geom_line(aes(y = hazard.rate.1), color = "orange")

ggplot(hazard.df, aes(x = fu.interval)) +
  geom_line(aes(y = hazard.ratio))


General Solution:

get.life.table <- function(dat, time.intervals) {
  n.pop <- nrow(dat)

  dat |>
    recode.dat(time.intervals) |>
    group_by(fu.interval) |>
    summarize(
      n.censored = sum(.data$death == 0),
      n.event = sum(.data$death),
    ) |>
    ungroup() |>
    calculate.hazard(n.pop)
}

get.life.table.by.groups <- function(dat, time.intervals, grps) {
  grps |>
    lapply(function(grp) {
      dat |>
        get.life.table.by.group(time.intervals, grp) |>
        mutate(
          grp.name = grp,
          grp.value = pick(1)[[1]]
        ) |>
        select(-1)
    }) |>
    bind_rows()
}

get.life.table.by.group <- function(dat, time.intervals, grp) {
  dat |>
    recode.dat(time.intervals) |>
    group_by(fu.interval, .data[[grp]]) |>
    summarize(
      n.censored = sum(.data$death == 0),
      n.event = sum(.data$death),
      .groups = "keep"
    ) |>
    ungroup(fu.interval) |>
    group_modify(function(df.sub, grp) {
      grp.name <- names(grp)
      grp.value <- grp[[1]]
      n.pop <- (dat[[grp.name]] == grp.value) |> sum()
      calculate.hazard(df.sub, n.pop)
    }) |>
    ungroup()
}

calculate.hazard <- function(life.table, n.pop) {
  n.removed <- life.table$n.event + life.table$n.censored
  n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)])
  life.table |>
    mutate(
      n.at.risk = n.pop - n.removed.accum,
      # TODO: how to account for censored? How do we adjust for uneven interval?
      hazard.rate = n.event / n.at.risk
    )
}

recode.dat <- function(dat, time.intervals) {
  df <- dat[dat$follow.up <= sum(time.intervals), ]
  time.points <- cumsum(time.intervals)
  df$fu.interval <- sapply(df$follow.up, function(time) {
      time.points[time <= time.points][1]
  })

  df
}
time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
grps <- c("tx.type", "sex", "hla.match", "cold.isc")

get.life.table.by.groups(dat, time.intervals, grps)
# A tibble: 593 × 7
   fu.interval n.censored n.event n.at.risk hazard.rate grp.name grp.value
         <dbl>      <int>   <int>     <dbl>       <dbl> <chr>        <dbl>
 1       0.333        463      49      5148     0.00952 tx.type          0
 2       0.667        253      13      4636     0.00280 tx.type          0
 3       1            340      11      4370     0.00252 tx.type          0
 4       2            591      24      4019     0.00597 tx.type          0
 5       3            568      17      3404     0.00499 tx.type          0
 6       4            521      16      2819     0.00568 tx.type          0
 7       5            501      11      2282     0.00482 tx.type          0
 8       0.333        367     103      4627     0.0223  tx.type          1
 9       0.667        238      24      4157     0.00577 tx.type          1
10       1            273      16      3895     0.00411 tx.type          1
# ℹ 583 more rows
grps <- c("tx.type", "sex", "hla.match", "cold.isc")
grps <- c("tx.type")


plot.hazard.rate.by.groups <- function(dat, time.intervals, grps) {
  dat |>
    get.life.table.by.groups(time.intervals, grps) |>
    ggplot(aes(x = fu.interval, y = hazard.rate,
               color = paste(grp.name, grp.value, sep = " = "))) +
    geom_point() +
    geom_line() +
    labs(
      color = "Group"
    ) +
    theme(
      legend.position = "bottom"
    )
}
time.intervals <- c(rep(1/3, 3), rep(1, 4))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))

time.intervals <- rep(1, 5) 
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))

time.intervals <- rep(1/3, 15)
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))

time.intervals <- c(rep(1/3, 3*3), rep(1, 2))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))

time.intervals <- c(rep(1/4, 4), rep(1, 4))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))

time.intervals <- c(rep(1/4, 4*2), rep(1, 3))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))

time.intervals <- c(rep(1/4, 4*2), rep(1/3, 3*3), rep(1, 6))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))


Compare

time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
get.life.table.by.groups(dat, time.intervals, grps)
# A tibble: 14 × 7
   fu.interval n.censored n.event n.at.risk hazard.rate grp.name grp.value
         <dbl>      <int>   <int>     <dbl>       <dbl> <chr>        <int>
 1       0.333        463      49      5148     0.00952 tx.type          0
 2       0.667        253      13      4636     0.00280 tx.type          0
 3       1            340      11      4370     0.00252 tx.type          0
 4       2            591      24      4019     0.00597 tx.type          0
 5       3            568      17      3404     0.00499 tx.type          0
 6       4            521      16      2819     0.00568 tx.type          0
 7       5            501      11      2282     0.00482 tx.type          0
 8       0.333        367     103      4627     0.0223  tx.type          1
 9       0.667        238      24      4157     0.00577 tx.type          1
10       1            273      16      3895     0.00411 tx.type          1
11       2            573      38      3606     0.0105  tx.type          1
12       3            575      24      2995     0.00801 tx.type          1
13       4            532      24      2396     0.0100  tx.type          1
14       5            418      20      1840     0.0109  tx.type          1
hazard.df
  fu.interval hazard.rate.0 hazard.rate.1 hazard.ratio
1   0.3333333   0.009518260   0.022260644     2.338731
2   0.6666667   0.002804142   0.005773394     2.058881
3   1.0000000   0.002517162   0.004107831     1.631929
4   2.0000000   0.005971635   0.010537992     1.764675
5   3.0000000   0.004994125   0.008013356     1.604557
6   4.0000000   0.005675772   0.010016694     1.764816
7   5.0000000   0.004820333   0.010869565     2.254941

1.4 Exercise 4

Show a plot with Kaplan-Meier survival curves for the two donor types.

km.tx <- survfit(Surv(follow.up, death) ~ tx.type, data = dat[dat$follow.up  <= 12, ])

plot(km.tx, col = c("blue", "orange"))
legend(legend = c("cadaveric", "living"), "bottomleft", lwd = 2, col = c("blue", "orange"))

1.5 Exercise 5

Fit a univariate Cox model with predictor donor type. Report the hazard ratio and 95% confidence interval and interpret the result obtained.

cox.tx <- coxph(Surv(follow.up, death) ~ tx.type, data = dat)
summary(cox.tx)
Call:
coxph(formula = Surv(follow.up, death) ~ tx.type, data = dat)

  n= 9775, number of events= 465 

           coef exp(coef) se(coef)     z Pr(>|z|)    
tx.type 0.64469   1.90539  0.09558 6.745 1.53e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

        exp(coef) exp(-coef) lower .95 upper .95
tx.type     1.905     0.5248      1.58     2.298

Concordance= 0.586  (se = 0.012 )
Likelihood ratio test= 47.1  on 1 df,   p=7e-12
Wald test            = 45.5  on 1 df,   p=2e-11
Score (logrank) test = 47.09  on 1 df,   p=7e-12
log.hazard.ratio.hat <- cox.tx$coefficient
hazard.ratio.hat <- log.hazard.ratio.hat |> exp()
log.hazard.ratio.se <- summary(cox.tx)$coefficients[3]
log.hazard.ratio.hat.lower.bound <- log.hazard.ratio.hat - qnorm(0.975) * log.hazard.ratio.se
log.hazard.ratio.hat.upper.bound <- log.hazard.ratio.hat + qnorm(0.975) * log.hazard.ratio.se

c(log.hazard.ratio.hat.lower.bound, log.hazard.ratio.hat.upper.bound) |> exp()
 tx.type  tx.type 
1.579899 2.297932 
Note

Patients receiving kidney transplants from living donors (tx.type = 1) has 1.91 higher hazard rate than those receiving from cadaveric donors (tx.type = 0).

Since 1 is not in the 95% confidence interval, the result is statistically significant.

2 Exercise 6

Research shows that an important determinant of mortality after kidney transplant is the age of the recipient. Fit a Cox model with age as predictor and estimate the hazard ratio and its confidence interval. Consider age first as continuous variable and then divide into categories.

2.1 Continuous

cox.age.rec <- coxph(Surv(follow.up, death) ~ age.rec, data = dat)
cox.age.rec |> summary()
Call:
coxph(formula = Surv(follow.up, death) ~ age.rec, data = dat)

  n= 9766, number of events= 464 
   (9 observations deleted due to missingness)

             coef exp(coef)  se(coef)      z Pr(>|z|)    
age.rec -0.042550  0.958342  0.008434 -5.045 4.53e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

        exp(coef) exp(-coef) lower .95 upper .95
age.rec    0.9583      1.043    0.9426    0.9743

Concordance= 0.586  (se = 0.016 )
Likelihood ratio test= 24.81  on 1 df,   p=6e-07
Wald test            = 25.45  on 1 df,   p=5e-07
Score (logrank) test = 25.75  on 1 df,   p=4e-07

2.2 Categorical (Case 1)

recode.age <- function(dat, ages) {
  df$age.group <- sapply(df$follow.up, function(time) {
      time.points[time <= time.points][1]
  })

  df
}
dat.age.rec <- dat[!is.na(dat$age.rec), ]
class.age <- function(age) {
  if (age < 2) { 
    return("0,1")
  } else if (age < 6) {
    return("2,3,4,5")
  } else if (age < 11) {
    return("6,7,8,9,10")
  } else if (age < 19) {
    return("11-18")
  }
}
dat.age.rec$age.group <- sapply(dat.age.rec$age.rec, class.age) |>
  factor(levels = c(
    "0,1",
    "2,3,4,5",
    "6,7,8,9,10",
    "11-18"
  ))


cox.age.rec <- coxph(Surv(follow.up, death) ~ age.group, data = dat.age.rec)
cox.age.rec |> summary()
Call:
coxph(formula = Surv(follow.up, death) ~ age.group, data = dat.age.rec)

  n= 9766, number of events= 464 

                       coef exp(coef) se(coef)      z Pr(>|z|)    
age.group2,3,4,5    -0.5947    0.5517   0.1767 -3.366 0.000764 ***
age.group6,7,8,9,10 -1.0578    0.3472   0.1776 -5.956 2.59e-09 ***
age.group11-18      -1.0192    0.3609   0.1513 -6.736 1.63e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                    exp(coef) exp(-coef) lower .95 upper .95
age.group2,3,4,5       0.5517      1.813    0.3902    0.7801
age.group6,7,8,9,10    0.3472      2.880    0.2452    0.4918
age.group11-18         0.3609      2.771    0.2683    0.4855

Concordance= 0.584  (se = 0.015 )
Likelihood ratio test= 44.93  on 3 df,   p=1e-09
Wald test            = 53.97  on 3 df,   p=1e-11
Score (logrank) test = 57.87  on 3 df,   p=2e-12
emmeans(cox.age.rec, pairwise ~ age.group, type = "response")
$emmeans
 age.group  response     SE  df asymp.LCL asymp.UCL
 0,1           1.000 0.0000 Inf     1.000     1.000
 2,3,4,5       0.552 0.0975 Inf     0.390     0.780
 6,7,8,9,10    0.347 0.0617 Inf     0.245     0.492
 11-18         0.361 0.0546 Inf     0.268     0.485

Confidence level used: 0.95 
Intervals are back-transformed from the log scale 

$contrasts
 contrast             ratio    SE  df null z.ratio p.value
 0,1 / 2,3,4,5        1.813 0.320 Inf    1   3.366  0.0043
 0,1 / 6,7,8,9,10     2.880 0.511 Inf    1   5.956  <.0001
 0,1 / (11-18)        2.771 0.419 Inf    1   6.736  <.0001
 2,3,4,5 / 6,7,8,9,10 1.589 0.251 Inf    1   2.928  0.0179
 2,3,4,5 / (11-18)    1.529 0.196 Inf    1   3.314  0.0051
 6,7,8,9,10 / (11-18) 0.962 0.124 Inf    1  -0.298  0.9908

P value adjustment: tukey method for comparing a family of 4 estimates 
Tests are performed on the log scale 

3 Exercise 7

Fit a multivariate Cox model by using other predictors and describe your results.

sapply(names(dat), function(col) {
  is.na(dat[, col]) |> sum()
})
hla.match age.donor   age.rec  cold.isc     death      year       sex   tx.type 
      234       113         9      2250         0         0         0         0 
follow.up 
        0 
names(dat)
[1] "hla.match" "age.donor" "age.rec"   "cold.isc"  "death"     "year"     
[7] "sex"       "tx.type"   "follow.up"
Waldtest <- function(cox.fit, i.betas){
  q <- length(i.betas)
  coefs <- cox.fit$coefficients
  # print(cox.fit$var)
  var <- cox.fit$var[i.betas, i.betas]
  Wald <- coefs[i.betas]%*%solve(var)%*%coefs[i.betas]
  p.value <- 1-pchisq(as.numeric(Wald), df = q)
  return(cbind(Wald, p.value))
}
fit.cox.model <- function(surv, i.betas) {
  fit.tx <- coxph(surv, data = dat)
  Waldtest(fit.tx, i.betas)
}

Start with tx.type

surv <- Surv(follow.up, death) ~ tx.type + age.rec
fit.cox.model(surv, 1:2) # NOTE: Why is this 1???
             p.value
[1,] 77.3778       0

Iteration 1: tx.type + age + ?

names(dat)
[1] "hla.match" "age.donor" "age.rec"   "cold.isc"  "death"     "year"     
[7] "sex"       "tx.type"   "follow.up"
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match
fit.cox.model(surv, 3)
                  p.value
[1,] 8.146717 0.004313919
surv <- Surv(follow.up, death) ~ tx.type + age.rec + age.donor
fit.cox.model(surv, 3)
                 p.value
[1,] 6.509223 0.01073164
surv <- Surv(follow.up, death) ~ tx.type + age.rec + cold.isc
fit.cox.model(surv, 3)
                 p.value
[1,] 0.7057259 0.4008664
surv <- Surv(follow.up, death) ~ tx.type + age.rec + year
fit.cox.model(surv, 3)
                 p.value
[1,] 4.680593 0.03050521
surv <- Surv(follow.up, death) ~ tx.type + age.rec + sex
fit.cox.model(surv, 3)
                p.value
[1,] 1.921176 0.1657271

Add hla.match

Iteration 2: tx.type + age.rec + hla.match + ?

surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor
fit.cox.model(surv, 4)
                p.value
[1,] 5.211567 0.0224371
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + cold.isc
fit.cox.model(surv, 4)
                 p.value
[1,] 0.9157965 0.3385811
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + year
fit.cox.model(surv, 4)
                p.value
[1,] 4.94216 0.02620927
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + sex
fit.cox.model(surv, 4)
                p.value
[1,] 0.981911 0.3217275

Add age.donor

Iteration 2: tx.type + age.rec + hla.match + age.donor + ?

surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
  cold.isc
fit.cox.model(surv, 5)
                 p.value
[1,] 0.4450261 0.5047065
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
  year
fit.cox.model(surv, 5)
                 p.value
[1,] 4.367751 0.03662531
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
  sex
fit.cox.model(surv, 5)
                 p.value
[1,] 0.9343691 0.3337302

Add year

surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + 
  year + cold.isc
fit.cox.model(surv, 6)
                 p.value
[1,] 0.1614349 0.6878388
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + 
  year + sex
fit.cox.model(surv, 6)
                 p.value
[1,] 0.9641705 0.3261383

Next, we test for each remaining variables

4 Exercise 8

Estimate the survival function for specific covariate patterns. Based on the previous results choose the best predictors.

Exercise 9 — Check the proportional hazards assumption. You may use the function cox.zph. Discuss the result and possible implications.

H0: PH-assumption holds H1: PH-assumption doesn’t hold

# different cox model!!
cox.final <- coxph(Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor, data = dat)
cox.zph(cox.final)
          chisq df       p
tx.type    2.56  1 0.10948
age.rec   35.98  1 2.0e-09
hla.match  8.22  1 0.00414
age.donor 11.05  1 0.00089
GLOBAL    47.37  4 1.3e-09

Exercise 10 — Plot the Schoenfeld residuals and comment. A non-random pattern or slope in a plot of scaled residuals against time indicates a violation, suggesting the covariate’s impact changes, while a horizontal line around zero supports the assumption.

plot(cox.zph(cox.final))

NOTES: left censoring: event happened before time zero -> not applicable left truncation: not applicable?? right censoring: we saw this in the follow up times, administrative censoring high

! No left truncation, since our target population is patients who received a kidney transplant. So we don’t care about patients who died before receiving a transplant.

! ADD: we used a Cox model but we already saw that the PH assumption doesn’t hold !

? remove year from prediction model, since it is not a patient characteristic and may not be stable for future predictions. It mainly captures changes in clinical practice rather than individual risk.