foo

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.4     ✔ tibble    3.3.0
✔ purrr     1.1.0     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(survival)
# library(gtsummary)
dat <- read.csv("./unos.txt", sep = "\t")
head(dat)
  hlamat age age.1 cold_isc death year sex txtype fu
1      2  25    15        6     0 2002   0      1  0
2      1  42    10       11     1 1999   0      1  0
3      1   9    14        8     0 2002   1      1  0
4      0  35    12       26     0 2002   1      1  0
5      2  17    12       28     1 1997   1      1  0
6      3  44    13        1     0 2002   1      0  0
names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
                "year", "sex", "tx.type", "follow.up")

1 Exercise

1.1 Exercise 1

Illustrate in a table the characteristics of the population (age, sex, race, donor, . . . ).

g <- ggplot(dat)
g + geom_point(aes(x = follow.up, y = death))

g +  geom_density(aes(x = follow.up))

g +  geom_density(aes(x = age.donor))
Warning: Removed 113 rows containing non-finite outside the scale range
(`stat_density()`).

g +  geom_bar(aes(x = age.rec))
Warning: Removed 9 rows containing non-finite outside the scale range
(`stat_count()`).

g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor))
Warning: Removed 113 rows containing missing values or values outside the scale range
(`stat_boxplot()`).
Warning: Removed 7 rows containing non-finite outside the scale range
(`stat_boxplot()`).

# dat$age.1 |> table()

1.2 Exercise 2

Plot the Kaplan-Meier overall survival curve for pediatric kid- ney transplant recipients for the first 12 years after transplantation.

km <- survfit(Surv(follow.up, death) ~ 1, data = dat[dat$follow.up  <= 12, ])
plot(km)

1.3 Exercise 3

We are going to compare mortality rates (hazard functions) between children whose transplanted kidney was provided by a living donor (in general a family member) and those whose source was recently deceased (variable donor type: txtype). Use the life table method to calculate the death rates for the first 5 years for each group (take in the first year intervals of 4 months and then look at each year) and show the results in a table. Estimate the hazard ratio in each time interval as the ratio between the mortality rates in the two groups. What do you notice?

dat.5 <- dat[dat$follow.up <= 5, ]
head(dat.5)
  hla.match age.donor age.rec cold.isc death year sex tx.type follow.up
1         2        25      15        6     0 2002   0       1         0
2         1        42      10       11     1 1999   0       1         0
3         1         9      14        8     0 2002   1       1         0
4         0        35      12       26     0 2002   1       1         0
5         2        17      12       28     1 1997   1       1         0
6         3        44      13        1     0 2002   1       0         0
classify_time_interval = function(fu) {
  if (fu <= 1/3) {
    return(1/3)
  } else if (fu <= 2/3) {
    return(2/3)
  } else if (fu <= 1) {
    return(1)
  }
  ceiling(fu)
}

dat.5$fu.interval <- sapply(dat.5$follow.up, classify_time_interval)
table(dat.5$fu.interval)

0.333333333333333 0.666666666666667                 1                 2 
              982               528               640              1226 
                3                 4                 5 
             1184              1093               950 
head(dat.5)
  hla.match age.donor age.rec cold.isc death year sex tx.type follow.up
1         2        25      15        6     0 2002   0       1         0
2         1        42      10       11     1 1999   0       1         0
3         1         9      14        8     0 2002   1       1         0
4         0        35      12       26     0 2002   1       1         0
5         2        17      12       28     1 1997   1       1         0
6         3        44      13        1     0 2002   1       0         0
  fu.interval
1   0.3333333
2   0.3333333
3   0.3333333
4   0.3333333
5   0.3333333
6   0.3333333
dat.5.life <- dat.5 |>
  group_by(fu.interval) |>
  summarize(
    n.censored = sum(death == 0),
    n.event = sum(death),
    n.at.risk = nrow(dat),
  )

for (i in 2:nrow(dat.5.life)) {
  j <- i - 1

  n.censored.pre <- dat.5.life$n.censored[j]
  n.event.pre <- dat.5.life$n.event[j]
  n.at.risk.pre <- dat.5.life$n.at.risk[j]

  n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre

  dat.5.life$n.at.risk[i] <- n.at.risk
}

print(nrow(dat))
[1] 9775
dat.5.life
# A tibble: 7 × 4
  fu.interval n.censored n.event n.at.risk
        <dbl>      <int>   <int>     <int>
1       0.333        830     152      9775
2       0.667        491      37      8793
3       1            613      27      8265
4       2           1164      62      7625
5       3           1143      41      6399
6       4           1053      40      5215
7       5            919      31      4122
dat.5.life <- dat.5.life |>
  mutate(
     hazard.rate = n.event / n.at.risk
  )

get_life_table = function(dat) {
  dat <- dat |>
    group_by(fu.interval) |>
    summarize(
      n.censored = sum(death == 0),
      n.event = sum(death),
      n.at.risk = nrow(dat),
    )

  for (i in 2:nrow(dat)) {
    j <- i - 1

    n.censored.pre <- dat$n.censored[j]
    n.event.pre <- dat$n.event[j]
    n.at.risk.pre <- dat$n.at.risk[j]

    n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre

    dat$n.at.risk[i] <- n.at.risk
  }

  dat <- dat |>
    mutate(
       hazard.rate = n.event / n.at.risk
    )

  return(dat)
}
dat.5.tx0 = dat.5[dat.5$tx.type == 0, ]
dat.5.tx1 = dat.5[dat.5$tx.type == 1, ]
tx0.life <- get_life_table(dat.5.tx0)
tx0.life
# A tibble: 7 × 5
  fu.interval n.censored n.event n.at.risk hazard.rate
        <dbl>      <int>   <int>     <int>       <dbl>
1       0.333        463      49      3378     0.0145 
2       0.667        253      13      2866     0.00454
3       1            340      11      2600     0.00423
4       2            591      24      2249     0.0107 
5       3            568      17      1634     0.0104 
6       4            521      16      1049     0.0153 
7       5            501      11       512     0.0215 
tx1.life <- get_life_table(dat.5.tx1)
tx1.life
# A tibble: 7 × 5
  fu.interval n.censored n.event n.at.risk hazard.rate
        <dbl>      <int>   <int>     <int>       <dbl>
1       0.333        367     103      3225     0.0319 
2       0.667        238      24      2755     0.00871
3       1            273      16      2493     0.00642
4       2            573      38      2204     0.0172 
5       3            575      24      1593     0.0151 
6       4            532      24       994     0.0241 
7       5            418      20       438     0.0457 
tx1.life$hazard.rate / tx0.life$hazard.rate
[1] 2.201766 1.920536 1.516975 1.615661 1.448100 1.582998 2.125363
hazard.df <- data.frame(
  fu.interval = tx1.life$fu.interval,
  hazard.rate.0 = tx0.life$hazard.rate,
  hazard.rate.1 = tx1.life$hazard.rate,
  hazard.ratio = tx1.life$hazard.rate / tx0.life$hazard.rate
) 

ggplot(hazard.df, aes(x = fu.interval)) +
  geom_line(aes(y = hazard.rate.0), color = "blue") + 
  geom_line(aes(y = hazard.rate.1), color = "orange")

ggplot(hazard.df, aes(x = fu.interval)) +
  geom_line(aes(y = hazard.ratio))

1.4 Exercise 4

Show a plot with Kaplan-Meier survival curves for the two donor types.

km.tx <- survfit(Surv(follow.up, death) ~ tx.type, data = dat[dat$follow.up  <= 12, ])

plot(km.tx, col = c("blue", "orange"))
legend(legend = c("cadaveric", "living"), "bottomleft", lwd = 2, col = c("blue", "orange"))

1.5 Exercise 5

Fit a univariate Cox model with predictor donor type. Report the hazard ratio and 95% confidence interval and interpret the result obtained.

cox <- coxph(Surv(follow.up, death) ~ tx.type, data = dat)
summary(cox)
Call:
coxph(formula = Surv(follow.up, death) ~ tx.type, data = dat)

  n= 9775, number of events= 465 

           coef exp(coef) se(coef)     z Pr(>|z|)    
tx.type 0.64469   1.90539  0.09558 6.745 1.53e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

        exp(coef) exp(-coef) lower .95 upper .95
tx.type     1.905     0.5248      1.58     2.298

Concordance= 0.586  (se = 0.012 )
Likelihood ratio test= 47.1  on 1 df,   p=7e-12
Wald test            = 45.5  on 1 df,   p=2e-11
Score (logrank) test = 47.09  on 1 df,   p=7e-12
1.90539
[1] 1.90539
1.90539 + 1.96 * exp(0.09558)
[1] 4.061972
(2.298 - 1.58) / 2 |> exp() / 2
[1] 0.04858537

Exercise 6 — Research shows that an important determinant of mortality after kidney transplant is the age of the recipient. Fit a Cox model with age as predictor and estimate the hazard ratio and its confidence interval. Consider age first as continuous variable and then divide into categories.

Exercise 7 — Fit a multivariate Cox model by using other predictors and describe your results.

Exercise 8 — Estimate the survival function for specific covariate patterns. Based on the previous results choose the best predictors.

Exercise 9 — Check the proportional hazards assumption. You may use the function cox.zph. Discuss the result and possible implications. Exercise 10 — Plot the Schoenfeld residuals and comment.