discussion 2

This commit is contained in:
Louis Chih-Ming Lee 2026-05-14 09:15:59 +02:00
parent 7b94e4bf64
commit bce631d9e0
2 changed files with 377 additions and 23 deletions

5
.lintr Normal file
View file

@ -0,0 +1,5 @@
linters: lintr::linters_with_defaults(
object_name_linter = lintr::object_name_linter(
styles = c("snake_case", "dotted.case")
)
)

View file

@ -8,20 +8,34 @@ number-sections: true
```{r} ```{r}
library(tidyverse) library(tidyverse)
library(dplyr)
library(ggplot2)
library(survival) library(survival)
library(emmeans)
library(foreign)
# library(gtsummary) # library(gtsummary)
``` ```
```{r} ```{r}
dat <- read.csv("./unos.txt", sep = "\t") dat <- read.csv("./unos.txt", sep = "\t")
head(dat) head(dat)
```
```{r}
names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death", names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
"year", "sex", "tx.type", "follow.up") "year", "sex", "tx.type", "follow.up")
``` ```
Complete case
```{r}
nrow(dat)
```
```{r}
sapply(names(dat), function(col) {
is.na(dat[, col]) |> sum()
})
```
# Exercise # Exercise
@ -47,6 +61,10 @@ g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor))
# dat$age.1 |> table() # dat$age.1 |> table()
``` ```
```{r}
g + geom_boxplot(aes(x = year, y = follow.up, group = year))
```
## Exercise 2 ## Exercise 2
Plot the Kaplan-Meier overall survival curve for pediatric kid- Plot the Kaplan-Meier overall survival curve for pediatric kid-
@ -115,7 +133,7 @@ print(nrow(dat))
dat.5.life dat.5.life
``` ```
n.removed.acc
```{r} ```{r}
dat.5.life <- dat.5.life |> dat.5.life <- dat.5.life |>
mutate( mutate(
@ -128,8 +146,8 @@ dat.5.life <- dat.5.life |>
```{r} ```{r}
get_life_table = function(dat) { get_life_table = function(dat.sub, dat) {
dat <- dat |> dat.sub <- dat.sub |>
group_by(fu.interval) |> group_by(fu.interval) |>
summarize( summarize(
n.censored = sum(death == 0), n.censored = sum(death == 0),
@ -137,24 +155,24 @@ get_life_table = function(dat) {
n.at.risk = nrow(dat), n.at.risk = nrow(dat),
) )
for (i in 2:nrow(dat)) { for (i in 2:nrow(dat.sub)) {
j <- i - 1 j <- i - 1
n.censored.pre <- dat$n.censored[j] n.censored.pre <- dat.sub$n.censored[j]
n.event.pre <- dat$n.event[j] n.event.pre <- dat.sub$n.event[j]
n.at.risk.pre <- dat$n.at.risk[j] n.at.risk.pre <- dat.sub$n.at.risk[j]
n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre
dat$n.at.risk[i] <- n.at.risk dat.sub$n.at.risk[i] <- n.at.risk
} }
dat <- dat |> dat.sub <- dat.sub |>
mutate( mutate(
hazard.rate = n.event / n.at.risk hazard.rate = n.event / n.at.risk
) )
return(dat) return(dat.sub)
} }
``` ```
@ -164,12 +182,12 @@ dat.5.tx1 = dat.5[dat.5$tx.type == 1, ]
``` ```
```{r} ```{r}
tx0.life <- get_life_table(dat.5.tx0) tx0.life <- get_life_table(dat.5.tx0, dat[dat$tx.type == 0, ])
tx0.life tx0.life
``` ```
```{r} ```{r}
tx1.life <- get_life_table(dat.5.tx1) tx1.life <- get_life_table(dat.5.tx1, dat[dat$tx.type == 1, ])
tx1.life tx1.life
``` ```
@ -195,6 +213,159 @@ ggplot(hazard.df, aes(x = fu.interval)) +
``` ```
---
General Solution:
```{r}
get.life.table <- function(dat, time.intervals) {
n.pop <- nrow(dat)
dat |>
recode.dat(time.intervals) |>
group_by(fu.interval) |>
summarize(
n.censored = sum(.data$death == 0),
n.event = sum(.data$death),
) |>
ungroup() |>
calculate.hazard(n.pop)
}
get.life.table.by.groups <- function(dat, time.intervals, grps) {
grps |>
lapply(function(grp) {
dat |>
get.life.table.by.group(time.intervals, grp) |>
mutate(
grp.name = grp,
grp.value = pick(1)[[1]]
) |>
select(-1)
}) |>
bind_rows()
}
get.life.table.by.group <- function(dat, time.intervals, grp) {
dat |>
recode.dat(time.intervals) |>
group_by(fu.interval, .data[[grp]]) |>
summarize(
n.censored = sum(.data$death == 0),
n.event = sum(.data$death),
.groups = "keep"
) |>
ungroup(fu.interval) |>
group_modify(function(df.sub, grp) {
grp.name <- names(grp)
grp.value <- grp[[1]]
n.pop <- (dat[[grp.name]] == grp.value) |> sum()
calculate.hazard(df.sub, n.pop)
}) |>
ungroup()
}
calculate.hazard <- function(life.table, n.pop) {
n.removed <- life.table$n.event + life.table$n.censored
n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)])
life.table |>
mutate(
n.at.risk = n.pop - n.removed.accum,
# TODO: how to account for censored? How do we adjust for uneven interval?
hazard.rate = n.event / n.at.risk
)
}
recode.dat <- function(dat, time.intervals) {
df <- dat[dat$follow.up <= sum(time.intervals), ]
time.points <- cumsum(time.intervals)
df$fu.interval <- sapply(df$follow.up, function(time) {
time.points[time <= time.points][1]
})
df
}
```
```{r}
time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
grps <- c("tx.type", "sex", "hla.match", "cold.isc")
get.life.table.by.groups(dat, time.intervals, grps)
```
```{r}
grps <- c("tx.type", "sex", "hla.match", "cold.isc")
grps <- c("tx.type")
plot.hazard.rate.by.groups <- function(dat, time.intervals, grps) {
dat |>
get.life.table.by.groups(time.intervals, grps) |>
ggplot(aes(x = fu.interval, y = hazard.rate,
color = paste(grp.name, grp.value, sep = " = "))) +
geom_point() +
geom_line() +
labs(
color = "Group"
) +
theme(
legend.position = "bottom"
)
}
```
```{r}
time.intervals <- c(rep(1/3, 3), rep(1, 4))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```
```{r}
time.intervals <- rep(1, 5)
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```
```{r}
time.intervals <- rep(1/3, 15)
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```
```{r}
time.intervals <- c(rep(1/3, 3*3), rep(1, 2))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```
```{r}
time.intervals <- c(rep(1/4, 4), rep(1, 4))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```
```{r}
time.intervals <- c(rep(1/4, 4*2), rep(1, 3))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```
```{r}
time.intervals <- c(rep(1/4, 4*2), rep(1/3, 3*3), rep(1, 6))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```
---
Compare
```{r}
time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
get.life.table.by.groups(dat, time.intervals, grps)
```
```{r}
hazard.df
```
## Exercise 4 ## Exercise 4
@ -214,31 +385,209 @@ Fit a univariate Cox model with predictor donor type. Report
the hazard ratio and 95% confidence interval and interpret the result obtained. the hazard ratio and 95% confidence interval and interpret the result obtained.
```{r} ```{r}
cox <- coxph(Surv(follow.up, death) ~ tx.type, data = dat) cox.tx <- coxph(Surv(follow.up, death) ~ tx.type, data = dat)
summary(cox) summary(cox.tx)
``` ```
```{r} ```{r}
1.90539 log.hazard.ratio.hat <- cox.tx$coefficient
hazard.ratio.hat <- log.hazard.ratio.hat |> exp()
``` ```
```{r} ```{r}
1.90539 + 1.96 * exp(0.09558) log.hazard.ratio.se <- summary(cox.tx)$coefficients[3]
(2.298 - 1.58) / 2 |> exp() / 2 log.hazard.ratio.hat.lower.bound <- log.hazard.ratio.hat - qnorm(0.975) * log.hazard.ratio.se
log.hazard.ratio.hat.upper.bound <- log.hazard.ratio.hat + qnorm(0.975) * log.hazard.ratio.se
c(log.hazard.ratio.hat.lower.bound, log.hazard.ratio.hat.upper.bound) |> exp()
``` ```
Exercise 6 — Research shows that an important determinant of mortality ::: {.callout-note}
Patients receiving kidney transplants from living donors (`tx.type = 1`) has
1.91 higher hazard rate than those receiving from cadaveric donors
(`tx.type = 0`).
Since 1 is not in the 95% confidence interval, the result is statistically
significant.
:::
# Exercise 6
Research shows that an important determinant of mortality
after kidney transplant is the age of the recipient. Fit a Cox model with age after kidney transplant is the age of the recipient. Fit a Cox model with age
as predictor and estimate the hazard ratio and its confidence interval. Consider as predictor and estimate the hazard ratio and its confidence interval. Consider
age first as continuous variable and then divide into categories. age first as continuous variable and then divide into categories.
Exercise 7 — Fit a multivariate Cox model by using other predictors and ## Continuous
```{r}
cox.age.rec <- coxph(Surv(follow.up, death) ~ age.rec, data = dat)
cox.age.rec |> summary()
```
## Categorical (Case 1)
```{r}
recode.age <- function(dat, ages) {
df$age.group <- sapply(df$follow.up, function(time) {
time.points[time <= time.points][1]
})
df
}
```
```{r}
dat.age.rec <- dat[!is.na(dat$age.rec), ]
class.age <- function(age) {
if (age < 2) {
return("0,1")
} else if (age < 6) {
return("2,3,4,5")
} else if (age < 11) {
return("6,7,8,9,10")
} else if (age < 19) {
return("11-18")
}
}
```
```{r}
dat.age.rec$age.group <- sapply(dat.age.rec$age.rec, class.age) |>
factor(levels = c(
"0,1",
"2,3,4,5",
"6,7,8,9,10",
"11-18"
))
cox.age.rec <- coxph(Surv(follow.up, death) ~ age.group, data = dat.age.rec)
cox.age.rec |> summary()
```
```{r}
emmeans(cox.age.rec, pairwise ~ age.group, type = "response")
```
# Exercise 7
Fit a multivariate Cox model by using other predictors and
describe your results. describe your results.
Exercise 8 — Estimate the survival function for specific covariate patterns.
```{r}
sapply(names(dat), function(col) {
is.na(dat[, col]) |> sum()
})
```
```{r}
names(dat)
```
```{r}
Waldtest <- function(cox.fit, i.betas){
q <- length(i.betas)
coefs <- cox.fit$coefficients
# print(cox.fit$var)
var <- cox.fit$var[i.betas, i.betas]
Wald <- coefs[i.betas]%*%solve(var)%*%coefs[i.betas]
p.value <- 1-pchisq(as.numeric(Wald), df = q)
return(cbind(Wald, p.value))
}
```
```{r}
fit.cox.model <- function(surv, i.betas) {
fit.tx <- coxph(surv, data = dat)
Waldtest(fit.tx, i.betas)
}
```
Start with `tx.type`
```{r}
surv <- Surv(follow.up, death) ~ tx.type + age.rec
fit.cox.model(surv, 1:2) # NOTE: Why is this 1???
```
Iteration 1: `tx.type + age + ?`
```{r}
names(dat)
```
```{r}
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match
fit.cox.model(surv, 3)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + age.donor
fit.cox.model(surv, 3)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + cold.isc
fit.cox.model(surv, 3)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + year
fit.cox.model(surv, 3)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + sex
fit.cox.model(surv, 3)
```
Add `hla.match`
Iteration 2: `tx.type + age.rec + hla.match + ?`
```{r}
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor
fit.cox.model(surv, 4)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + cold.isc
fit.cox.model(surv, 4)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + year
fit.cox.model(surv, 4)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + sex
fit.cox.model(surv, 4)
```
Add `age.donor`
Iteration 2: `tx.type + age.rec + hla.match + age.donor + ?`
```{r}
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
cold.isc
fit.cox.model(surv, 5)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
year
fit.cox.model(surv, 5)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
sex
fit.cox.model(surv, 5)
```
Add `year`
```{r}
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
year + cold.isc
fit.cox.model(surv, 6)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
year + sex
fit.cox.model(surv, 6)
```
Next, we test for each remaining variables
# Exercise 8
Estimate the survival function for specific covariate patterns.
Based on the previous results choose the best predictors. Based on the previous results choose the best predictors.
Exercise 9 — Check the proportional hazards assumption. You may use the Exercise 9 — Check the proportional hazards assumption. You may use the
function cox.zph. Discuss the result and possible implications. function cox.zph. Discuss the result and possible implications.
Exercise 10 — Plot the Schoenfeld residuals and comment. Exercise 10 — Plot the Schoenfeld residuals and comment.