library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(survival)
library(emmeans)
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(foreign)
library(gtsummary)
library(ggsurvfit)
dat <- read.csv("./unos.txt", sep = "\t")
head(dat)
## hlamat age age.1 cold_isc death year sex txtype fu
## 1 2 25 15 6 0 2002 0 1 0
## 2 1 42 10 11 1 1999 0 1 0
## 3 1 9 14 8 0 2002 1 1 0
## 4 0 35 12 26 0 2002 1 1 0
## 5 2 17 12 28 1 1997 1 1 0
## 6 3 44 13 1 0 2002 1 0 0
names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
"year", "sex", "tx.type", "follow.up")
Complete case
nrow(dat)
## [1] 9775
sapply(names(dat), function(col) {
is.na(dat[, col]) |> sum()
})
## hla.match age.donor age.rec cold.isc death year sex tx.type
## 234 113 9 2250 0 0 0 0
## follow.up
## 0
Illustrate in a table the characteristics of the population (age, sex, race, donor, . . . ).
hist(dat$age.donor)
hist(dat$age.rec)
hist(dat$cold.isc)
hist(dat$year)
hist(dat$follow.up)
unique(dat$hla.match)
## [1] 2 1 0 3 4 NA 5 6
unique(dat$year)
## [1] 2002 1999 1997 2001 1994 1995 1996 1993 1990 1991 1992 2000 1998
Change the order!!
dat.table1 <- dat |>
mutate(
sex = factor(sex, levels = c(0,1), labels = c("Female","Male")),
tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")),
hla.match = factor(hla.match),
year = factor(year)
)
table1 <- dat.table1 |>
select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |>
tbl_summary(
statistic = list(
all_continuous() ~ "{median} ({p25}, {p75})"
),
label = list(
hla.match ~ "HLA matches, n(%)",
age.donor ~ "Donor age, median (IQR)",
age.rec ~ "Recipient age, median (IQR)",
cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
year ~ "Year of transplant",
sex ~ "Sex, n(%)",
tx.type ~ "Transplant type, n(%)"
),
missing = "ifany"
) |>
modify_footnote(all_stat_cols() ~ NA)
table1
| Characteristic | N = 9,775 |
|---|---|
| HLA matches, n(%) | |
| 0 | 784 (8.2%) |
| 1 | 1,504 (16%) |
| 2 | 1,553 (16%) |
| 3 | 3,778 (40%) |
| 4 | 1,278 (13%) |
| 5 | 365 (3.8%) |
| 6 | 279 (2.9%) |
| Unknown | 234 |
| Donor age, median (IQR) | 33 (21, 41) |
| Unknown | 113 |
| Recipient age, median (IQR) | 13 (8, 16) |
| Unknown | 9 |
| Cold ischemic time (hours), median (IQR), | 7 (1, 19) |
| Unknown | 2,250 |
| Year of transplant | |
| 1990 | 728 (7.4%) |
| 1991 | 734 (7.5%) |
| 1992 | 666 (6.8%) |
| 1993 | 736 (7.5%) |
| 1994 | 787 (8.1%) |
| 1995 | 813 (8.3%) |
| 1996 | 789 (8.1%) |
| 1997 | 745 (7.6%) |
| 1998 | 740 (7.6%) |
| 1999 | 842 (8.6%) |
| 2000 | 706 (7.2%) |
| 2001 | 834 (8.5%) |
| 2002 | 655 (6.7%) |
| Sex, n(%) | |
| Female | 4,014 (41%) |
| Male | 5,761 (59%) |
| Transplant type, n(%) | |
| Cadaveric | 5,148 (53%) |
| Living | 4,627 (47%) |
table.split <- dat.table1 |>
select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |>
tbl_summary(
by = tx.type,
statistic = list(
all_continuous() ~ "{median} ({p25}, {p75})"
),
label = list(
hla.match ~ "HLA matches, n(%)",
age.donor ~ "Donor age, median (IQR)",
age.rec ~ "Recipient age, median (IQR)",
cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
year ~ "Year of transplant",
sex ~ "Sex, n(%)"
),
missing = "ifany"
) |>
add_overall() |>
modify_footnote(all_stat_cols() ~ NA)
table.split
| Characteristic | Overall N = 9,775 |
Cadaveric N = 5,148 |
Living N = 4,627 |
|---|---|---|---|
| HLA matches, n(%) | |||
| 0 | 784 (8.2%) | 125 (2.5%) | 659 (14%) |
| 1 | 1,504 (16%) | 160 (3.2%) | 1,344 (29%) |
| 2 | 1,553 (16%) | 228 (4.6%) | 1,325 (29%) |
| 3 | 3,778 (40%) | 3,039 (61%) | 739 (16%) |
| 4 | 1,278 (13%) | 1,007 (20%) | 271 (5.9%) |
| 5 | 365 (3.8%) | 241 (4.9%) | 124 (2.7%) |
| 6 | 279 (2.9%) | 153 (3.1%) | 126 (2.7%) |
| Unknown | 234 | 195 | 39 |
| Donor age, median (IQR) | 33 (21, 41) | 37 (31, 42) | 22 (15, 37) |
| Unknown | 113 | 112 | 1 |
| Recipient age, median (IQR) | 13 (8, 16) | 13 (7, 16) | 14 (9, 16) |
| Unknown | 9 | 7 | 2 |
| Cold ischemic time (hours), median (IQR), | 7 (1, 19) | 1 (0, 1) | 19 (13, 25) |
| Unknown | 2,250 | 1,490 | 760 |
| Year of transplant | |||
| 1990 | 728 (7.4%) | 318 (6.2%) | 410 (8.9%) |
| 1991 | 734 (7.5%) | 387 (7.5%) | 347 (7.5%) |
| 1992 | 666 (6.8%) | 345 (6.7%) | 321 (6.9%) |
| 1993 | 736 (7.5%) | 403 (7.8%) | 333 (7.2%) |
| 1994 | 787 (8.1%) | 355 (6.9%) | 432 (9.3%) |
| 1995 | 813 (8.3%) | 427 (8.3%) | 386 (8.3%) |
| 1996 | 789 (8.1%) | 416 (8.1%) | 373 (8.1%) |
| 1997 | 745 (7.6%) | 408 (7.9%) | 337 (7.3%) |
| 1998 | 740 (7.6%) | 412 (8.0%) | 328 (7.1%) |
| 1999 | 842 (8.6%) | 436 (8.5%) | 406 (8.8%) |
| 2000 | 706 (7.2%) | 392 (7.6%) | 314 (6.8%) |
| 2001 | 834 (8.5%) | 483 (9.4%) | 351 (7.6%) |
| 2002 | 655 (6.7%) | 366 (7.1%) | 289 (6.2%) |
| Sex, n(%) | |||
| Female | 4,014 (41%) | 2,105 (41%) | 1,909 (41%) |
| Male | 5,761 (59%) | 3,043 (59%) | 2,718 (59%) |
table.split <- dat.table1 |>
select(sex, age.rec, age.donor, hla.match, cold.isc, tx.type) |>
tbl_summary(
by = tx.type,
statistic = list(
all_continuous() ~ "{median} ({p25}, {p75})"
),
label = list(
hla.match ~ "HLA matches, n(%)",
age.donor ~ "Donor age, median (IQR)",
age.rec ~ "Recipient age, median (IQR)",
cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
sex ~ "Sex, n(%)"
),
missing = "ifany"
) |>
add_overall() |>
modify_footnote(all_stat_cols() ~ NA)
table.split
| Characteristic | Overall N = 9,775 |
Cadaveric N = 5,148 |
Living N = 4,627 |
|---|---|---|---|
| Sex, n(%) | |||
| Female | 4,014 (41%) | 2,105 (41%) | 1,909 (41%) |
| Male | 5,761 (59%) | 3,043 (59%) | 2,718 (59%) |
| Recipient age, median (IQR) | 13 (8, 16) | 13 (7, 16) | 14 (9, 16) |
| Unknown | 9 | 7 | 2 |
| Donor age, median (IQR) | 33 (21, 41) | 37 (31, 42) | 22 (15, 37) |
| Unknown | 113 | 112 | 1 |
| HLA matches, n(%) | |||
| 0 | 784 (8.2%) | 125 (2.5%) | 659 (14%) |
| 1 | 1,504 (16%) | 160 (3.2%) | 1,344 (29%) |
| 2 | 1,553 (16%) | 228 (4.6%) | 1,325 (29%) |
| 3 | 3,778 (40%) | 3,039 (61%) | 739 (16%) |
| 4 | 1,278 (13%) | 1,007 (20%) | 271 (5.9%) |
| 5 | 365 (3.8%) | 241 (4.9%) | 124 (2.7%) |
| 6 | 279 (2.9%) | 153 (3.1%) | 126 (2.7%) |
| Unknown | 234 | 195 | 39 |
| Cold ischemic time (hours), median (IQR), | 7 (1, 19) | 1 (0, 1) | 19 (13, 25) |
| Unknown | 2,250 | 1,490 | 760 |
Calculate median follow-up (reverse Kaplan-Meier method), with 95% CI
rev_km <- survfit(Surv(follow.up, death == 0) ~ 1, data = dat)
summary(rev_km)$table
## records n.max n.start events rmean se(rmean)
## 9.775000e+03 9.775000e+03 9.775000e+03 9.310000e+03 4.055027e+00 3.202711e-02
## median 0.95LCL 0.95UCL
## 3.380822e+00 3.227397e+00 3.567123e+00
median_followup <- summary(rev_km)$table["median"]
confidence_interval <- c(summary(rev_km)$table["0.95LCL"], summary(rev_km)$table["0.95UCL"])
median_followup
## median
## 3.380822
confidence_interval
## 0.95LCL 0.95UCL
## 3.227397 3.567123
g <- ggplot(dat)
g + geom_point(aes(x = follow.up, y = death))
g + geom_density(aes(x = follow.up))
g + geom_density(aes(x = age.donor))
## Warning: Removed 113 rows containing non-finite outside the scale range
## (`stat_density()`).
g + geom_bar(aes(x = age.rec))
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_count()`).
g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor))
## Warning: Removed 113 rows containing missing values or values outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# dat$age.1 |> table()
g + geom_boxplot(aes(x = year, y = follow.up, group = year))
Plot the Kaplan-Meier overall survival curve for pediatric kid- ney transplant recipients for the first 12 years after transplantation.
!!!!!!!! just set xlim to 12, do not remove those individuals !!!!!!!
km <- survfit(Surv(follow.up, death) ~ 1, data = dat[dat$follow.up <= 12, ])
plot(km)
We are going to compare mortality rates (hazard functions) between
children whose transplanted kidney was provided by a living donor (in
general a family member) and those whose source was recently deceased
(variable donor type: txtype). Use the life table method to
calculate the death rates for the first 5 years for each group (take in
the first year intervals of 4 months and then look at each year) and
show the results in a table. Estimate the hazard ratio in each time
interval as the ratio between the mortality rates in the two groups.
What do you notice?
dat.5 <- dat[dat$follow.up <= 5, ]
head(dat.5)
## hla.match age.donor age.rec cold.isc death year sex tx.type follow.up
## 1 2 25 15 6 0 2002 0 1 0
## 2 1 42 10 11 1 1999 0 1 0
## 3 1 9 14 8 0 2002 1 1 0
## 4 0 35 12 26 0 2002 1 1 0
## 5 2 17 12 28 1 1997 1 1 0
## 6 3 44 13 1 0 2002 1 0 0
classify_time_interval = function(fu) {
if (fu <= 1/3) {
return(1/3)
} else if (fu <= 2/3) {
return(2/3)
} else if (fu <= 1) {
return(1)
}
ceiling(fu)
}
dat.5$fu.interval <- sapply(dat.5$follow.up, classify_time_interval)
table(dat.5$fu.interval)
##
## 0.333333333333333 0.666666666666667 1 2
## 982 528 640 1226
## 3 4 5
## 1184 1093 950
head(dat.5)
## hla.match age.donor age.rec cold.isc death year sex tx.type follow.up
## 1 2 25 15 6 0 2002 0 1 0
## 2 1 42 10 11 1 1999 0 1 0
## 3 1 9 14 8 0 2002 1 1 0
## 4 0 35 12 26 0 2002 1 1 0
## 5 2 17 12 28 1 1997 1 1 0
## 6 3 44 13 1 0 2002 1 0 0
## fu.interval
## 1 0.3333333
## 2 0.3333333
## 3 0.3333333
## 4 0.3333333
## 5 0.3333333
## 6 0.3333333
dat.5.life <- dat.5 |>
group_by(fu.interval) |>
summarize(
n.censored = sum(death == 0),
n.event = sum(death),
n.at.risk = nrow(dat),
)
for (i in 2:nrow(dat.5.life)) {
j <- i - 1
n.censored.pre <- dat.5.life$n.censored[j]
n.event.pre <- dat.5.life$n.event[j]
n.at.risk.pre <- dat.5.life$n.at.risk[j]
n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
dat.5.life$n.at.risk[i] <- n.at.risk
}
print(nrow(dat))
## [1] 9775
dat.5.life
## # A tibble: 7 × 4
## fu.interval n.censored n.event n.at.risk
## <dbl> <int> <int> <int>
## 1 0.333 830 152 9775
## 2 0.667 491 37 8793
## 3 1 613 27 8265
## 4 2 1164 62 7625
## 5 3 1143 41 6399
## 6 4 1053 40 5215
## 7 5 919 31 4122
n.removed.acc
dat.5.life <- dat.5.life |>
mutate(
hazard.rate = n.event / n.at.risk
)
get_life_table = function(dat.sub, dat) {
dat.sub <- dat.sub |>
group_by(fu.interval) |>
summarize(
n.censored = sum(death == 0),
n.event = sum(death),
n.at.risk = nrow(dat),
)
for (i in 2:nrow(dat.sub)) {
j <- i - 1
n.censored.pre <- dat.sub$n.censored[j]
n.event.pre <- dat.sub$n.event[j]
n.at.risk.pre <- dat.sub$n.at.risk[j]
n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
dat.sub$n.at.risk[i] <- n.at.risk
}
dat.sub <- dat.sub |>
mutate(
hazard.rate = n.event / n.at.risk
)
return(dat.sub)
}
dat.5.tx0 = dat.5[dat.5$tx.type == 0, ]
dat.5.tx1 = dat.5[dat.5$tx.type == 1, ]
tx0.life <- get_life_table(dat.5.tx0, dat[dat$tx.type == 0, ])
tx0.life
## # A tibble: 7 × 5
## fu.interval n.censored n.event n.at.risk hazard.rate
## <dbl> <int> <int> <int> <dbl>
## 1 0.333 463 49 5148 0.00952
## 2 0.667 253 13 4636 0.00280
## 3 1 340 11 4370 0.00252
## 4 2 591 24 4019 0.00597
## 5 3 568 17 3404 0.00499
## 6 4 521 16 2819 0.00568
## 7 5 501 11 2282 0.00482
tx1.life <- get_life_table(dat.5.tx1, dat[dat$tx.type == 1, ])
tx1.life
## # A tibble: 7 × 5
## fu.interval n.censored n.event n.at.risk hazard.rate
## <dbl> <int> <int> <int> <dbl>
## 1 0.333 367 103 4627 0.0223
## 2 0.667 238 24 4157 0.00577
## 3 1 273 16 3895 0.00411
## 4 2 573 38 3606 0.0105
## 5 3 575 24 2995 0.00801
## 6 4 532 24 2396 0.0100
## 7 5 418 20 1840 0.0109
tx1.life$hazard.rate / tx0.life$hazard.rate
## [1] 2.338731 2.058881 1.631929 1.764675 1.604557 1.764816 2.254941
hazard.df <- data.frame(
fu.interval = tx1.life$fu.interval,
hazard.rate.0 = tx0.life$hazard.rate,
hazard.rate.1 = tx1.life$hazard.rate,
hazard.ratio = tx1.life$hazard.rate / tx0.life$hazard.rate
)
ggplot(hazard.df, aes(x = fu.interval)) +
geom_line(aes(y = hazard.rate.0), color = "blue") +
geom_line(aes(y = hazard.rate.1), color = "orange")
ggplot(hazard.df, aes(x = fu.interval)) +
geom_line(aes(y = hazard.ratio))
General Solution:
get.life.table <- function(dat, time.intervals) {
n.pop <- nrow(dat)
dat |>
recode.dat(time.intervals) |>
group_by(fu.interval) |>
summarize(
n.censored = sum(.data$death == 0),
n.event = sum(.data$death),
) |>
ungroup() |>
calculate.hazard(n.pop)
}
get.life.table.by.groups <- function(dat, time.intervals, grps) {
grps |>
lapply(function(grp) {
dat |>
get.life.table.by.group(time.intervals, grp) |>
mutate(
grp.name = grp,
grp.value = pick(1)[[1]]
) |>
select(-1)
}) |>
bind_rows()
}
get.life.table.by.group <- function(dat, time.intervals, grp) {
dat |>
recode.dat(time.intervals) |>
group_by(fu.interval, .data[[grp]]) |>
summarize(
n.censored = sum(.data$death == 0),
n.event = sum(.data$death),
.groups = "keep"
) |>
ungroup(fu.interval) |>
group_modify(function(df.sub, grp) {
grp.name <- names(grp)
grp.value <- grp[[1]]
n.pop <- (dat[[grp.name]] == grp.value) |> sum()
calculate.hazard(df.sub, n.pop)
}) |>
ungroup()
}
calculate.hazard <- function(life.table, n.pop) {
n.removed <- life.table$n.event + life.table$n.censored
n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)])
life.table |>
mutate(
n.at.risk = n.pop - n.removed.accum,
# TODO: how to account for censored? How do we adjust for uneven interval?
hazard.rate = n.event / n.at.risk
)
}
recode.dat <- function(dat, time.intervals) {
df <- dat[dat$follow.up <= sum(time.intervals), ]
time.points <- cumsum(time.intervals)
df$fu.interval <- sapply(df$follow.up, function(time) {
time.points[time <= time.points][1]
})
df
}
time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
grps <- c("tx.type", "sex", "hla.match", "cold.isc")
get.life.table.by.groups(dat, time.intervals, grps)
## # A tibble: 593 × 7
## fu.interval n.censored n.event n.at.risk hazard.rate grp.name grp.value
## <dbl> <int> <int> <dbl> <dbl> <chr> <dbl>
## 1 0.333 463 49 5148 0.00952 tx.type 0
## 2 0.667 253 13 4636 0.00280 tx.type 0
## 3 1 340 11 4370 0.00252 tx.type 0
## 4 2 591 24 4019 0.00597 tx.type 0
## 5 3 568 17 3404 0.00499 tx.type 0
## 6 4 521 16 2819 0.00568 tx.type 0
## 7 5 501 11 2282 0.00482 tx.type 0
## 8 0.333 367 103 4627 0.0223 tx.type 1
## 9 0.667 238 24 4157 0.00577 tx.type 1
## 10 1 273 16 3895 0.00411 tx.type 1
## # ℹ 583 more rows
grps <- c("tx.type", "sex", "hla.match", "cold.isc")
grps <- c("tx.type")
plot.hazard.rate.by.groups <- function(dat, time.intervals, grps) {
dat |>
get.life.table.by.groups(time.intervals, grps) |>
ggplot(aes(x = fu.interval, y = hazard.rate,
color = paste(grp.name, grp.value, sep = " = "))) +
geom_point() +
geom_line() +
labs(
color = "Group"
) +
theme(
legend.position = "bottom"
)
}
time.intervals <- c(rep(1/3, 3), rep(1, 4))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
time.intervals <- rep(1, 5)
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
time.intervals <- rep(1/3, 15)
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
time.intervals <- c(rep(1/3, 3*3), rep(1, 2))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
time.intervals <- c(rep(1/4, 4), rep(1, 4))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
time.intervals <- c(rep(1/4, 4*2), rep(1, 3))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
time.intervals <- c(rep(1/4, 4*2), rep(1/3, 3*3), rep(1, 6))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
Compare
time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
get.life.table.by.groups(dat, time.intervals, grps)
## # A tibble: 14 × 7
## fu.interval n.censored n.event n.at.risk hazard.rate grp.name grp.value
## <dbl> <int> <int> <dbl> <dbl> <chr> <int>
## 1 0.333 463 49 5148 0.00952 tx.type 0
## 2 0.667 253 13 4636 0.00280 tx.type 0
## 3 1 340 11 4370 0.00252 tx.type 0
## 4 2 591 24 4019 0.00597 tx.type 0
## 5 3 568 17 3404 0.00499 tx.type 0
## 6 4 521 16 2819 0.00568 tx.type 0
## 7 5 501 11 2282 0.00482 tx.type 0
## 8 0.333 367 103 4627 0.0223 tx.type 1
## 9 0.667 238 24 4157 0.00577 tx.type 1
## 10 1 273 16 3895 0.00411 tx.type 1
## 11 2 573 38 3606 0.0105 tx.type 1
## 12 3 575 24 2995 0.00801 tx.type 1
## 13 4 532 24 2396 0.0100 tx.type 1
## 14 5 418 20 1840 0.0109 tx.type 1
hazard.df
## fu.interval hazard.rate.0 hazard.rate.1 hazard.ratio
## 1 0.3333333 0.009518260 0.022260644 2.338731
## 2 0.6666667 0.002804142 0.005773394 2.058881
## 3 1.0000000 0.002517162 0.004107831 1.631929
## 4 2.0000000 0.005971635 0.010537992 1.764675
## 5 3.0000000 0.004994125 0.008013356 1.604557
## 6 4.0000000 0.005675772 0.010016694 1.764816
## 7 5.0000000 0.004820333 0.010869565 2.254941
Show a plot with Kaplan-Meier survival curves for the two donor types.
km.tx <- survfit(Surv(follow.up, death) ~ tx.type, data = dat[dat$follow.up <= 12, ])
plot(km.tx, col = c("blue", "orange"))
legend(legend = c("cadaveric", "living"), "bottomleft", lwd = 2, col = c("blue", "orange"))
Fit a univariate Cox model with predictor donor type. Report the hazard ratio and 95% confidence interval and interpret the result obtained.
cox.tx <- coxph(Surv(follow.up, death) ~ tx.type, data = dat)
summary(cox.tx)
## Call:
## coxph(formula = Surv(follow.up, death) ~ tx.type, data = dat)
##
## n= 9775, number of events= 465
##
## coef exp(coef) se(coef) z Pr(>|z|)
## tx.type 0.64469 1.90539 0.09558 6.745 1.53e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## tx.type 1.905 0.5248 1.58 2.298
##
## Concordance= 0.586 (se = 0.012 )
## Likelihood ratio test= 47.1 on 1 df, p=7e-12
## Wald test = 45.5 on 1 df, p=2e-11
## Score (logrank) test = 47.09 on 1 df, p=7e-12
log.hazard.ratio.hat <- cox.tx$coefficient
hazard.ratio.hat <- log.hazard.ratio.hat |> exp()
log.hazard.ratio.se <- summary(cox.tx)$coefficients[3]
log.hazard.ratio.hat.lower.bound <- log.hazard.ratio.hat - qnorm(0.975) * log.hazard.ratio.se
log.hazard.ratio.hat.upper.bound <- log.hazard.ratio.hat + qnorm(0.975) * log.hazard.ratio.se
c(log.hazard.ratio.hat.lower.bound, log.hazard.ratio.hat.upper.bound) |> exp()
## tx.type tx.type
## 1.579899 2.297932
Patients receiving kidney transplants from living donors
(tx.type = 1) has 1.91 higher hazard rate than those
receiving from cadaveric donors (tx.type = 0).
Since 1 is not in the 95% confidence interval, the result is statistically significant.
Research shows that an important determinant of mortality after kidney transplant is the age of the recipient. Fit a Cox model with age as predictor and estimate the hazard ratio and its confidence interval. Consider age first as continuous variable and then divide into categories.
cox.age.rec <- coxph(Surv(follow.up, death) ~ age.rec, data = dat)
cox.age.rec |> summary()
## Call:
## coxph(formula = Surv(follow.up, death) ~ age.rec, data = dat)
##
## n= 9766, number of events= 464
## (9 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age.rec -0.042550 0.958342 0.008434 -5.045 4.53e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age.rec 0.9583 1.043 0.9426 0.9743
##
## Concordance= 0.586 (se = 0.016 )
## Likelihood ratio test= 24.81 on 1 df, p=6e-07
## Wald test = 25.45 on 1 df, p=5e-07
## Score (logrank) test = 25.75 on 1 df, p=4e-07
recode.age <- function(dat, ages) {
df$age.group <- sapply(df$follow.up, function(time) {
time.points[time <= time.points][1]
})
df
}
dat.age.rec <- dat[!is.na(dat$age.rec), ]
class.age <- function(age) {
if (age < 2) {
return("0-1")
} else if (age < 6) {
return("2-5")
} else if (age < 11) {
return("6-10")
} else if (age < 19) {
return("11-18")
}
}
dat.age.rec$age.group <- sapply(dat.age.rec$age.rec, class.age) |>
factor(levels = c(
"0-1",
"2-5",
"6-10",
"11-18"
))
cox.age.rec <- coxph(Surv(follow.up, death) ~ age.group, data = dat.age.rec)
emmeans(cox.age.rec, pairwise ~ age.group, type = "response")
## $emmeans
## age.group response SE df asymp.LCL asymp.UCL
## 0-1 1.000 0.0000 Inf 1.000 1.000
## 2-5 0.552 0.0975 Inf 0.390 0.780
## 6-10 0.347 0.0617 Inf 0.245 0.492
## 11-18 0.361 0.0546 Inf 0.268 0.485
##
## Confidence level used: 0.95
## Intervals are back-transformed from the log scale
##
## $contrasts
## contrast ratio SE df null z.ratio p.value
## (0-1) / (2-5) 1.813 0.320 Inf 1 3.366 0.0043
## (0-1) / (6-10) 2.880 0.511 Inf 1 5.956 <.0001
## (0-1) / (11-18) 2.771 0.419 Inf 1 6.736 <.0001
## (2-5) / (6-10) 1.589 0.251 Inf 1 2.928 0.0179
## (2-5) / (11-18) 1.529 0.196 Inf 1 3.314 0.0051
## (6-10) / (11-18) 0.962 0.124 Inf 1 -0.298 0.9908
##
## P value adjustment: tukey method for comparing a family of 4 estimates
## Tests are performed on the log scale
Fit a multivariate Cox model by using other predictors and describe your results.
sapply(names(dat), function(col) {
is.na(dat[, col]) |> sum()
})
## hla.match age.donor age.rec cold.isc death year sex tx.type
## 234 113 9 2250 0 0 0 0
## follow.up
## 0
names(dat)
## [1] "hla.match" "age.donor" "age.rec" "cold.isc" "death" "year"
## [7] "sex" "tx.type" "follow.up"
Waldtest <- function(cox.fit, i.betas){
q <- length(i.betas)
coefs <- cox.fit$coefficients
# print(cox.fit$var)
var <- cox.fit$var[i.betas, i.betas]
Wald <- coefs[i.betas]%*%solve(var)%*%coefs[i.betas]
p.value <- 1-pchisq(as.numeric(Wald), df = q)
return(cbind(Wald, p.value))
}
fit.cox.model <- function(surv, i.betas) {
fit.tx <- coxph(surv, data = dat)
Waldtest(fit.tx, i.betas)
}
Start with tx.type
surv <- Surv(follow.up, death) ~ tx.type + age.rec
fit.cox.model(surv, 1:2) # NOTE: Why is this 1???
## p.value
## [1,] 77.3778 0
Iteration 1: tx.type + age + ?
names(dat)
## [1] "hla.match" "age.donor" "age.rec" "cold.isc" "death" "year"
## [7] "sex" "tx.type" "follow.up"
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match
fit.cox.model(surv, 3)
## p.value
## [1,] 8.146717 0.004313919
surv <- Surv(follow.up, death) ~ tx.type + age.rec + age.donor
fit.cox.model(surv, 3)
## p.value
## [1,] 6.509223 0.01073164
surv <- Surv(follow.up, death) ~ tx.type + age.rec + cold.isc
fit.cox.model(surv, 3)
## p.value
## [1,] 0.7057259 0.4008664
surv <- Surv(follow.up, death) ~ tx.type + age.rec + year
fit.cox.model(surv, 3)
## p.value
## [1,] 4.680593 0.03050521
surv <- Surv(follow.up, death) ~ tx.type + age.rec + sex
fit.cox.model(surv, 3)
## p.value
## [1,] 1.921176 0.1657271
Add hla.match
Iteration 2: tx.type + age.rec + hla.match + ?
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor
fit.cox.model(surv, 4)
## p.value
## [1,] 5.211567 0.0224371
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + cold.isc
fit.cox.model(surv, 4)
## p.value
## [1,] 0.9157965 0.3385811
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + year
fit.cox.model(surv, 4)
## p.value
## [1,] 4.94216 0.02620927
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + sex
fit.cox.model(surv, 4)
## p.value
## [1,] 0.981911 0.3217275
Add age.donor
Iteration 2:
tx.type + age.rec + hla.match + age.donor + ?
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
cold.isc
fit.cox.model(surv, 5)
## p.value
## [1,] 0.4450261 0.5047065
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
year
fit.cox.model(surv, 5)
## p.value
## [1,] 4.367751 0.03662531
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
sex
fit.cox.model(surv, 5)
## p.value
## [1,] 0.9343691 0.3337302
Add year
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
year + cold.isc
fit.cox.model(surv, 6)
## p.value
## [1,] 0.1614349 0.6878388
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
year + sex
fit.cox.model(surv, 6)
## p.value
## [1,] 0.9641705 0.3261383
Next, we test for each remaining variables
Estimate the survival function for specific covariate patterns. Based on the previous results choose the best predictors.
Exercise 9 — Check the proportional hazards assumption. You may use the function cox.zph. Discuss the result and possible implications.
H0: PH-assumption holds H1: PH-assumption doesn’t hold
# different cox model!!
cox.final <- coxph(Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor, data = dat)
cox.zph(cox.final)
## chisq df p
## tx.type 2.56 1 0.10948
## age.rec 35.98 1 2.0e-09
## hla.match 8.22 1 0.00414
## age.donor 11.05 1 0.00089
## GLOBAL 47.37 4 1.3e-09
cox.final
## Call:
## coxph(formula = Surv(follow.up, death) ~ tx.type + age.rec +
## hla.match + age.donor, data = dat)
##
## coef exp(coef) se(coef) z p
## tx.type 0.408215 1.504130 0.121728 3.354 0.000798
## age.rec -0.043724 0.957218 0.008875 -4.927 8.36e-07
## hla.match -0.107323 0.898236 0.040127 -2.675 0.007483
## age.donor -0.008810 0.991228 0.003859 -2.283 0.022437
##
## Likelihood ratio test=85.48 on 4 df, p=< 2.2e-16
## n= 9433, number of events= 449
## (342 observations deleted due to missingness)
Exercise 10 — Plot the Schoenfeld residuals and comment. A non-random pattern or slope in a plot of scaled residuals against time indicates a violation, suggesting the covariate’s impact changes, while a horizontal line around zero supports the assumption.
plot(cox.zph(cox.final))
plot(cox.zph(cox.final))
NOTES: left censoring: event happened before time zero -> not applicable left truncation: not applicable?? right censoring: we saw this in the follow up times, administrative censoring high
! No left truncation, since our target population is patients who received a kidney transplant. So we don’t care about patients who died before receiving a transplant.
! ADD: we used a Cox model but we already saw that the PH assumption doesn’t hold !
? remove year from prediction model, since it is not a patient characteristic and may not be stable for future predictions. It mainly captures changes in clinical practice rather than individual risk.