some changes

This commit is contained in:
Louis Chih-Ming Lee 2026-05-19 09:37:05 +02:00
parent bce631d9e0
commit a23e66a9ad
4 changed files with 6565 additions and 757 deletions

View file

@ -13,7 +13,7 @@ library(ggplot2)
library(survival)
library(emmeans)
library(foreign)
# library(gtsummary)
library(gtsummary)
```
```{r}
@ -45,6 +45,89 @@ sapply(names(dat), function(col) {
> Illustrate in a table the characteristics of the population (age, sex, race,
donor, . . . ).
```{r}
hist(dat$age.donor)
hist(dat$age.rec)
hist(dat$cold.isc)
hist(dat$year)
hist(dat$follow.up)
unique(dat$hla.match)
unique(dat$year)
```
Change the order!!
```{r}
dat.table1 <- dat |>
mutate(
sex = factor(sex, levels = c(0,1), labels = c("Female","Male")),
tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")),
hla.match = factor(hla.match),
year = factor(year)
)
```
```{r}
table1 <- dat.table1 |>
select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |>
tbl_summary(
statistic = list(
all_continuous() ~ "{median} ({p25}, {p75})"
),
label = list(
hla.match ~ "HLA matches, n(%)",
age.donor ~ "Donor age, median (IQR)",
age.rec ~ "Recipient age, median (IQR)",
cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
year ~ "Year of transplant",
sex ~ "Sex, n(%)",
tx.type ~ "Transplant type, n(%)"
),
missing = "ifany"
) |>
modify_footnote(all_stat_cols() ~ NA)
table1
```
```{r}
table.split <- dat.table1 |>
select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |>
tbl_summary(
by = tx.type,
statistic = list(
all_continuous() ~ "{median} ({p25}, {p75})"
),
label = list(
hla.match ~ "HLA matches, n(%)",
age.donor ~ "Donor age, median (IQR)",
age.rec ~ "Recipient age, median (IQR)",
cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
year ~ "Year of transplant",
sex ~ "Sex, n(%)"
),
missing = "ifany"
) |>
add_overall() |>
modify_footnote(all_stat_cols() ~ NA)
table.split
```
Calculate median follow-up (reverse Kaplan-Meier method), with 95% CI
```{r}
rev_km <- survfit(Surv(follow.up, death == 0) ~ 1, data = dat)
summary(rev_km)$table
median_followup <- summary(rev_km)$table["median"]
confidence_interval <- c(summary(rev_km)$table["0.95LCL"], summary(rev_km)$table["0.95UCL"])
median_followup
confidence_interval
```
```{r}
g <- ggplot(dat)
```
@ -70,12 +153,11 @@ g + geom_boxplot(aes(x = year, y = follow.up, group = year))
Plot the Kaplan-Meier overall survival curve for pediatric kid-
ney transplant recipients for the first 12 years after transplantation.
!!!!!!!! just set xlim to 12, do not remove those individuals !!!!!!!
```{r}
km <- survfit(Surv(follow.up, death) ~ 1, data = dat[dat$follow.up <= 12, ])
plot(km)
```
## Exercise 3
@ -577,6 +659,9 @@ surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
fit.cox.model(surv, 6)
```
```{r}
```
Next, we test for each remaining variables
@ -589,5 +674,36 @@ Based on the previous results choose the best predictors.
Exercise 9 — Check the proportional hazards assumption. You may use the
function cox.zph. Discuss the result and possible implications.
Exercise 10 — Plot the Schoenfeld residuals and comment.
H0: PH-assumption holds
H1: PH-assumption doesn't hold
```{r}
# different cox model!!
cox.final <- coxph(Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor, data = dat)
cox.zph(cox.final)
```
Exercise 10 — Plot the Schoenfeld residuals and comment.
A non-random pattern or slope in a plot of scaled residuals against time indicates a violation, suggesting the covariate's impact changes, while a horizontal line around zero supports the assumption.
```{r}
plot(cox.zph(cox.final))
```
NOTES:
left censoring: event happened before time zero -> not applicable
left truncation: not applicable??
right censoring: we saw this in the follow up times, administrative censoring high
! No left truncation, since our target population is patients who received a
kidney transplant. So we don't care about patients who died before receiving
a transplant.
! ADD: we used a Cox model but we already saw that the PH assumption doesn't hold !
? remove year from prediction model, since it is not a patient characteristic and may not be stable for future predictions. It mainly captures changes in clinical practice rather than individual risk.

File diff suppressed because it is too large Load diff

3075
slides.html Normal file

File diff suppressed because it is too large Load diff

609
slides.qmd Normal file
View file

@ -0,0 +1,609 @@
---
title: Slides
execute:
cache: true
freeze: auto
include: true
echo: false
number-sections: true
---
```{r}
library(tidyverse)
library(dplyr)
library(ggplot2)
library(survival)
library(emmeans)
library(foreign)
library(gtsummary)
library(gt)
library(ggsurvfit)
```
```{r}
dat <- read.csv("./unos.txt", sep = "\t")
names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
"year", "sex", "tx.type", "follow.up")
dat <- dat |>
mutate(
sex = factor(sex, levels = c(0,1), labels = c("Female","Male")),
tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")),
hla.match = factor(hla.match),
year = factor(year)
)
```
# Introduction: research question
## Survival after transplantation (M)
## Identify Predictors (M)
## Why Survival Analysis (L)
Motivation: study the distribution of time to event $T$.
Example: time of death after kidney transplant.
```{r}
ex <- data.frame(
id = c(1, 2, 3, 4, 5),
transplant = c(2000, 2000, 2001, 2003, 2004),
death = c(2005, 2009, 2005, 2004, 2016)
)
ex_table_1 <- ex |>
gt() |>
cols_label(
id = "ID",
transplant = "Transplant",
death = "Death"
) %>%
cols_align(align = "center", columns = everything())
ex_plot_real_time <- ggplot(ex) +
geom_segment(aes(x = transplant, xend = death, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = death, y = factor(id), color = "death"), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
theme_minimal()
```
```{r}
ex_table_1
ex_plot_real_time
```
---
We then calculate time to death for each respondents.
With this, all we need is to fit a linear model `t ~ X` or `log(t) ~ X`
```{r}
ex_t <- ex |>
mutate(
t = death - transplant
)
ex_table_2 <- ex_t |>
gt() |>
cols_label(
id = "ID",
transplant = "Transplant",
death = "Death",
t = "Time to death"
) |>
cols_align(align = "center", columns = everything())
ex_plot_uniform_time <- ggplot(ex_t) +
geom_segment(aes(x = 0, xend = t, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = t, y = factor(id), color = "death"), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
theme_minimal()
```
```{r}
ex_table_2
ex_plot_uniform_time
```
---
What if we do not know when exactly some respondents die?
Scenario 1: the study ends at the year 2008?
```{r}
ex_t_c <- ex_t |>
mutate(
death_censored = if_else(death <= 2008, death, 2008),
death_censored_txt = if_else(death <= 2008, as.character(death), "> 2008"),
status = if_else(death <= 2008, 1, 0),
t_1 = death_censored - transplant,
t_1_txt = if_else(death <= 2008, as.character(t_1),
paste(">", as.character(t_1)))
)
```
```{r}
ex_table_3 <- ex_t_c |>
select(id, transplant, death_censored_txt, t_1_txt) |>
gt() |>
cols_label(
id = "ID",
transplant = "Transplant",
death_censored_txt = "Death",
t_1_txt = "Time to death"
) |>
cols_align(align = "center", columns = everything())
ex_plot_real_time_1 <- ggplot(ex_t_c) +
geom_segment(aes(x = transplant, xend = death_censored, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = death_censored, y = factor(id), color =
if_else(status == 1, "death", "censored")), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
"censored" = "orange")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
xlim(2000, 2016) +
geom_vline(xintercept = 2008, linetype = "dashed", color = "orange",
linewidth = 0.8) +
theme_minimal()
ex_plot_uniform_time_1 <- ggplot(ex_t_c) +
geom_segment(aes(x = 0, xend = t_1, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = t_1, y = factor(id), color =
if_else(status == 1, "death", "censored")), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
"censored" = "orange")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
theme_minimal()
```
```{r}
ex_table_3
ex_plot_real_time_1
ex_plot_uniform_time_1
```
**Right Censoring**: only observe the event (death) if it occurs before a
certain time (2008).
---
Scenario 2: respondent 3 move away; loss follow up
```{r}
ex_alt <- ex_t_c
ex_alt$death_censored_txt[3] <- "> 2005"
ex_alt$status[3] <- 0
ex_alt$t_1_txt[3] <- "> 4"
ex_table_4 <- ex_alt |>
select(id, transplant, death_censored_txt, t_1_txt) |>
gt() |>
cols_label(
id = "ID",
transplant = "Transplant",
death_censored_txt = "Death",
t_1_txt = "Time to death"
) |>
cols_align(align = "center", columns = everything())
ex_plot_real_time_2 <- ggplot(ex_alt) +
geom_segment(aes(x = transplant, xend = death_censored, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = death_censored, y = factor(id), color =
if_else(status == 1, "death", "censored")), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
"censored" = "orange")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
xlim(2000, 2016) +
geom_vline(xintercept = 2008, linetype = "dashed", color = "orange",
linewidth = 0.8) +
theme_minimal()
ex_plot_uniform_time_2 <- ggplot(ex_alt) +
geom_segment(aes(x = 0, xend = t_1, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = t_1, y = factor(id), color =
if_else(status == 1, "death", "censored")), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
"censored" = "orange")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
theme_minimal()
```
```{r}
ex_table_4
ex_plot_real_time_2
ex_plot_uniform_time_2
```
---
How many patients are right censored?
```{r}
dat |>
mutate(Overall = "Overall") |>
pivot_longer(
cols = c(Overall, sex, tx.type),
names_to = "Attribute",
values_to = "Category"
) |>
count(Attribute, Category, death) |>
ggplot(aes(x = Category, y = n, fill = factor(death))) +
geom_col(position = "dodge") +
facet_wrap(~ Attribute, scales = "free_x") +
labs(x = "Group", y = "Count", fill = "Death Status") +
theme_minimal()
```
## Challenges in Survival Analysis (L)
- **Right Censoring**: only observe the event if it occurs before a certain
time.
- **Left Censoring**: event has occurred prior to the start of a research
- Follow up every 3 years.
- Event has occurred -> event happened sometime before follow up.
- **Left Truncation**: delayed entry; respondents are included only if they
survived long enough.
- Start follow up with patients 100 days after the transplant
- Patients dies within 100 day wouldn't be included in the dataset
- **Right Truncation**: respondents are included only if they have already
experienced the event.
- Retrospective analysis from deceased patients between 1990 and 2000.
- Patients not dead before 2000 are not included in the dataset.
# Method & Result
## Explain Dataset (M)
## Table 1 (L)
### HLA match `hla.match`
```{r}
dat$hla.match |> table(useNA = "always")
```
```{r}
ggplot(dat, aes(x = hla.match)) +
geom_bar() +
labs(x = "HLA match", y = "Count") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = hla.match, fill = tx.type)) +
geom_bar(position = "dodge") +
labs(x = "HLA match", y = "Count") +
theme_minimal()
```
### Donor age `age.donor`
```{r}
#| echo: true
mean(dat$age.donor, na.rm = TRUE)
median(dat$age.donor, na.rm = TRUE)
```
```{r}
ggplot(dat, aes(x = age.donor)) +
geom_bar() +
labs(x = "Donor Age", y = "Count") +
theme_minimal()
```
```{r}
#| fig-width: 8
#| fig-height: 12
ggplot(dat, aes(x = age.donor)) +
geom_bar() +
labs(x = "Donor Age", y = "Count") +
theme_minimal() +
facet_grid(hla.match ~ .)
```
```{r}
ggplot(dat, aes(x = age.donor, color = hla.match)) +
geom_density() +
labs(x = "Donor Age", y = "Count") +
theme_minimal()
```
### Recipient Age `age.rec`
```{r}
#| echo: true
mean(dat$age.rec, na.rm = TRUE)
median(dat$age.rec, na.rm = TRUE)
```
```{r}
ggplot(dat, aes(x = age.rec)) +
geom_bar() +
labs(x = "Recipient Age", y = "Count") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = as.factor(age.rec), y = age.donor)) +
geom_boxplot() +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = age.rec, fill = tx.type)) +
geom_bar(position = "dodge") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = age.rec, color = hla.match)) +
geom_density() +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = as.factor(age.rec), y = cold.isc)) +
geom_boxplot() +
theme_minimal()
```
### Cold Ischemia Time `cold.isc`
```{r}
#| echo: true
mean(dat$cold.isc, na.rm = TRUE)
median(dat$cold.isc, na.rm = TRUE)
```
```{r}
ggplot(dat, aes(x = cold.isc)) +
geom_density() +
labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = cold.isc, color = tx.type, group = tx.type)) +
geom_density() +
labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") +
theme_minimal()
```
### Transplant Type `tx.type`
```{r}
#| echo: true
dat$tx.type |> table()
```
```{r}
ggplot(dat, aes(x = tx.type)) +
geom_bar() +
labs(x = "Transplant Type", y = "Count") +
theme_minimal()
```
### Year `year`
```{r}
dat$year |> table()
```
```{r}
ggplot(dat, aes(x = year)) +
geom_bar() +
labs(x = "Year", y = "Count") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = year, y = follow.up)) +
geom_boxplot()
```
```{r}
ggplot(dat, aes(x = year, fill = tx.type)) +
geom_bar(position = "dodge")
```
```{r}
ggplot(dat, aes(x = year, y = age.rec)) +
geom_boxplot()
```
```{r}
ggplot(dat, aes(x = year, y = age.donor)) +
geom_boxplot()
```
## Overall Kaplan-Meier
```{r}
km_all <- survfit(Surv(follow.up, death) ~ 1, data = dat)
```
```{r}
km_all |>
ggsurvfit(type = "survival") +
add_confidence_interval() +
scale_y_continuous(limits = c(0, 1), labels = scales::label_percent()) +
labs(
x = "Years of Follow-up",
y = "Overall Survival Probability"
) +
theme_minimal()
```
## Hazard Functions
```{r}
get.life.table <- function(dat, time.intervals) {
n.pop <- nrow(dat)
dat |>
recode.dat(time.intervals) |>
group_by(fu.interval) |>
summarize(
n.censored = sum(.data$death == 0),
n.event = sum(.data$death),
) |>
ungroup() |>
calculate.hazard(n.pop)
}
get.life.table.by.groups <- function(dat, time.intervals, grps) {
grps |>
lapply(function(grp) {
dat |>
get.life.table.by.group(time.intervals, grp) |>
mutate(
grp.name = grp,
grp.value = pick(1)[[1]]
) |>
select(-1)
}) |>
bind_rows()
}
get.life.table.by.group <- function(dat, time.intervals, grp) {
dat |>
recode.dat(time.intervals) |>
group_by(fu.interval, .data[[grp]]) |>
summarize(
n.censored = sum(.data$death == 0),
n.event = sum(.data$death),
.groups = "keep"
) |>
ungroup(fu.interval) |>
group_modify(function(df.sub, grp) {
grp.name <- names(grp)
grp.value <- grp[[1]]
n.pop <- (dat[[grp.name]] == grp.value) |> sum()
calculate.hazard(df.sub, n.pop)
}) |>
ungroup()
}
calculate.hazard <- function(life.table, n.pop) {
n.removed <- life.table$n.event + life.table$n.censored
n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)])
life.table |>
mutate(
n.at.risk = n.pop - n.removed.accum,
# TODO: how to account for censored? How do we adjust for uneven interval?
hazard.rate = n.event / n.at.risk
)
}
recode.dat <- function(dat, time.intervals) {
df <- dat[dat$follow.up <= sum(time.intervals), ]
time.points <- cumsum(time.intervals)
df$fu.interval <- sapply(df$follow.up, function(time) {
time.points[time <= time.points][1]
})
df
}
```
### `death` Distribution (?)
```{r}
dat |>
mutate(
accum.death = cumsum(death),
accum.censored = if_else(death == 1, 0, 1) |> cumsum()
) |>
ggplot() +
geom_step(aes(x = follow.up, y = accum.death))
```
```{r}
```
```{r}
plot.death = dat[dat$death == 1,] |>
ggplot(aes(x = follow.up)) +
geom_histogram(bins = 50)
plot.death
```
### Overall (?)
```{r}
time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
get.life.table(dat, time.intervals)
```
### `tx.type` (M)
```{r}
```
## Cox Model
### `age` (continuous) (L)
### `age` (categorical) (M)
### Full model (L)
```{r}
m1 <- coxph(Surv(follow.up, death) ~ hla.match + tx.type, data = dat)
summary(m1)
m2 <- coxph(Surv(follow.up, death) ~ hla.match * tx.type, data = dat)
summary(m2)
anova(m1, m2, test = "LRT")
```
### Discussion: include `year` or not?
## Assumption Testing
# Conclusion (M + L)