some changes

This commit is contained in:
Louis Chih-Ming Lee 2026-05-19 09:37:05 +02:00
parent bce631d9e0
commit a23e66a9ad
4 changed files with 6565 additions and 757 deletions

1302
report.Rmd

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

3075
slides.html Normal file

File diff suppressed because it is too large Load diff

609
slides.qmd Normal file
View file

@ -0,0 +1,609 @@
---
title: Slides
execute:
cache: true
freeze: auto
include: true
echo: false
number-sections: true
---
```{r}
library(tidyverse)
library(dplyr)
library(ggplot2)
library(survival)
library(emmeans)
library(foreign)
library(gtsummary)
library(gt)
library(ggsurvfit)
```
```{r}
dat <- read.csv("./unos.txt", sep = "\t")
names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
"year", "sex", "tx.type", "follow.up")
dat <- dat |>
mutate(
sex = factor(sex, levels = c(0,1), labels = c("Female","Male")),
tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")),
hla.match = factor(hla.match),
year = factor(year)
)
```
# Introduction: research question
## Survival after transplantation (M)
## Identify Predictors (M)
## Why Survival Analysis (L)
Motivation: study the distribution of time to event $T$.
Example: time of death after kidney transplant.
```{r}
ex <- data.frame(
id = c(1, 2, 3, 4, 5),
transplant = c(2000, 2000, 2001, 2003, 2004),
death = c(2005, 2009, 2005, 2004, 2016)
)
ex_table_1 <- ex |>
gt() |>
cols_label(
id = "ID",
transplant = "Transplant",
death = "Death"
) %>%
cols_align(align = "center", columns = everything())
ex_plot_real_time <- ggplot(ex) +
geom_segment(aes(x = transplant, xend = death, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = death, y = factor(id), color = "death"), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
theme_minimal()
```
```{r}
ex_table_1
ex_plot_real_time
```
---
We then calculate time to death for each respondents.
With this, all we need is to fit a linear model `t ~ X` or `log(t) ~ X`
```{r}
ex_t <- ex |>
mutate(
t = death - transplant
)
ex_table_2 <- ex_t |>
gt() |>
cols_label(
id = "ID",
transplant = "Transplant",
death = "Death",
t = "Time to death"
) |>
cols_align(align = "center", columns = everything())
ex_plot_uniform_time <- ggplot(ex_t) +
geom_segment(aes(x = 0, xend = t, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = t, y = factor(id), color = "death"), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
theme_minimal()
```
```{r}
ex_table_2
ex_plot_uniform_time
```
---
What if we do not know when exactly some respondents die?
Scenario 1: the study ends at the year 2008?
```{r}
ex_t_c <- ex_t |>
mutate(
death_censored = if_else(death <= 2008, death, 2008),
death_censored_txt = if_else(death <= 2008, as.character(death), "> 2008"),
status = if_else(death <= 2008, 1, 0),
t_1 = death_censored - transplant,
t_1_txt = if_else(death <= 2008, as.character(t_1),
paste(">", as.character(t_1)))
)
```
```{r}
ex_table_3 <- ex_t_c |>
select(id, transplant, death_censored_txt, t_1_txt) |>
gt() |>
cols_label(
id = "ID",
transplant = "Transplant",
death_censored_txt = "Death",
t_1_txt = "Time to death"
) |>
cols_align(align = "center", columns = everything())
ex_plot_real_time_1 <- ggplot(ex_t_c) +
geom_segment(aes(x = transplant, xend = death_censored, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = death_censored, y = factor(id), color =
if_else(status == 1, "death", "censored")), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
"censored" = "orange")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
xlim(2000, 2016) +
geom_vline(xintercept = 2008, linetype = "dashed", color = "orange",
linewidth = 0.8) +
theme_minimal()
ex_plot_uniform_time_1 <- ggplot(ex_t_c) +
geom_segment(aes(x = 0, xend = t_1, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = t_1, y = factor(id), color =
if_else(status == 1, "death", "censored")), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
"censored" = "orange")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
theme_minimal()
```
```{r}
ex_table_3
ex_plot_real_time_1
ex_plot_uniform_time_1
```
**Right Censoring**: only observe the event (death) if it occurs before a
certain time (2008).
---
Scenario 2: respondent 3 move away; loss follow up
```{r}
ex_alt <- ex_t_c
ex_alt$death_censored_txt[3] <- "> 2005"
ex_alt$status[3] <- 0
ex_alt$t_1_txt[3] <- "> 4"
ex_table_4 <- ex_alt |>
select(id, transplant, death_censored_txt, t_1_txt) |>
gt() |>
cols_label(
id = "ID",
transplant = "Transplant",
death_censored_txt = "Death",
t_1_txt = "Time to death"
) |>
cols_align(align = "center", columns = everything())
ex_plot_real_time_2 <- ggplot(ex_alt) +
geom_segment(aes(x = transplant, xend = death_censored, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = death_censored, y = factor(id), color =
if_else(status == 1, "death", "censored")), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
"censored" = "orange")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
xlim(2000, 2016) +
geom_vline(xintercept = 2008, linetype = "dashed", color = "orange",
linewidth = 0.8) +
theme_minimal()
ex_plot_uniform_time_2 <- ggplot(ex_alt) +
geom_segment(aes(x = 0, xend = t_1, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = t_1, y = factor(id), color =
if_else(status == 1, "death", "censored")), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
"censored" = "orange")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
theme_minimal()
```
```{r}
ex_table_4
ex_plot_real_time_2
ex_plot_uniform_time_2
```
---
How many patients are right censored?
```{r}
dat |>
mutate(Overall = "Overall") |>
pivot_longer(
cols = c(Overall, sex, tx.type),
names_to = "Attribute",
values_to = "Category"
) |>
count(Attribute, Category, death) |>
ggplot(aes(x = Category, y = n, fill = factor(death))) +
geom_col(position = "dodge") +
facet_wrap(~ Attribute, scales = "free_x") +
labs(x = "Group", y = "Count", fill = "Death Status") +
theme_minimal()
```
## Challenges in Survival Analysis (L)
- **Right Censoring**: only observe the event if it occurs before a certain
time.
- **Left Censoring**: event has occurred prior to the start of a research
- Follow up every 3 years.
- Event has occurred -> event happened sometime before follow up.
- **Left Truncation**: delayed entry; respondents are included only if they
survived long enough.
- Start follow up with patients 100 days after the transplant
- Patients dies within 100 day wouldn't be included in the dataset
- **Right Truncation**: respondents are included only if they have already
experienced the event.
- Retrospective analysis from deceased patients between 1990 and 2000.
- Patients not dead before 2000 are not included in the dataset.
# Method & Result
## Explain Dataset (M)
## Table 1 (L)
### HLA match `hla.match`
```{r}
dat$hla.match |> table(useNA = "always")
```
```{r}
ggplot(dat, aes(x = hla.match)) +
geom_bar() +
labs(x = "HLA match", y = "Count") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = hla.match, fill = tx.type)) +
geom_bar(position = "dodge") +
labs(x = "HLA match", y = "Count") +
theme_minimal()
```
### Donor age `age.donor`
```{r}
#| echo: true
mean(dat$age.donor, na.rm = TRUE)
median(dat$age.donor, na.rm = TRUE)
```
```{r}
ggplot(dat, aes(x = age.donor)) +
geom_bar() +
labs(x = "Donor Age", y = "Count") +
theme_minimal()
```
```{r}
#| fig-width: 8
#| fig-height: 12
ggplot(dat, aes(x = age.donor)) +
geom_bar() +
labs(x = "Donor Age", y = "Count") +
theme_minimal() +
facet_grid(hla.match ~ .)
```
```{r}
ggplot(dat, aes(x = age.donor, color = hla.match)) +
geom_density() +
labs(x = "Donor Age", y = "Count") +
theme_minimal()
```
### Recipient Age `age.rec`
```{r}
#| echo: true
mean(dat$age.rec, na.rm = TRUE)
median(dat$age.rec, na.rm = TRUE)
```
```{r}
ggplot(dat, aes(x = age.rec)) +
geom_bar() +
labs(x = "Recipient Age", y = "Count") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = as.factor(age.rec), y = age.donor)) +
geom_boxplot() +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = age.rec, fill = tx.type)) +
geom_bar(position = "dodge") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = age.rec, color = hla.match)) +
geom_density() +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = as.factor(age.rec), y = cold.isc)) +
geom_boxplot() +
theme_minimal()
```
### Cold Ischemia Time `cold.isc`
```{r}
#| echo: true
mean(dat$cold.isc, na.rm = TRUE)
median(dat$cold.isc, na.rm = TRUE)
```
```{r}
ggplot(dat, aes(x = cold.isc)) +
geom_density() +
labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = cold.isc, color = tx.type, group = tx.type)) +
geom_density() +
labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") +
theme_minimal()
```
### Transplant Type `tx.type`
```{r}
#| echo: true
dat$tx.type |> table()
```
```{r}
ggplot(dat, aes(x = tx.type)) +
geom_bar() +
labs(x = "Transplant Type", y = "Count") +
theme_minimal()
```
### Year `year`
```{r}
dat$year |> table()
```
```{r}
ggplot(dat, aes(x = year)) +
geom_bar() +
labs(x = "Year", y = "Count") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = year, y = follow.up)) +
geom_boxplot()
```
```{r}
ggplot(dat, aes(x = year, fill = tx.type)) +
geom_bar(position = "dodge")
```
```{r}
ggplot(dat, aes(x = year, y = age.rec)) +
geom_boxplot()
```
```{r}
ggplot(dat, aes(x = year, y = age.donor)) +
geom_boxplot()
```
## Overall Kaplan-Meier
```{r}
km_all <- survfit(Surv(follow.up, death) ~ 1, data = dat)
```
```{r}
km_all |>
ggsurvfit(type = "survival") +
add_confidence_interval() +
scale_y_continuous(limits = c(0, 1), labels = scales::label_percent()) +
labs(
x = "Years of Follow-up",
y = "Overall Survival Probability"
) +
theme_minimal()
```
## Hazard Functions
```{r}
get.life.table <- function(dat, time.intervals) {
n.pop <- nrow(dat)
dat |>
recode.dat(time.intervals) |>
group_by(fu.interval) |>
summarize(
n.censored = sum(.data$death == 0),
n.event = sum(.data$death),
) |>
ungroup() |>
calculate.hazard(n.pop)
}
get.life.table.by.groups <- function(dat, time.intervals, grps) {
grps |>
lapply(function(grp) {
dat |>
get.life.table.by.group(time.intervals, grp) |>
mutate(
grp.name = grp,
grp.value = pick(1)[[1]]
) |>
select(-1)
}) |>
bind_rows()
}
get.life.table.by.group <- function(dat, time.intervals, grp) {
dat |>
recode.dat(time.intervals) |>
group_by(fu.interval, .data[[grp]]) |>
summarize(
n.censored = sum(.data$death == 0),
n.event = sum(.data$death),
.groups = "keep"
) |>
ungroup(fu.interval) |>
group_modify(function(df.sub, grp) {
grp.name <- names(grp)
grp.value <- grp[[1]]
n.pop <- (dat[[grp.name]] == grp.value) |> sum()
calculate.hazard(df.sub, n.pop)
}) |>
ungroup()
}
calculate.hazard <- function(life.table, n.pop) {
n.removed <- life.table$n.event + life.table$n.censored
n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)])
life.table |>
mutate(
n.at.risk = n.pop - n.removed.accum,
# TODO: how to account for censored? How do we adjust for uneven interval?
hazard.rate = n.event / n.at.risk
)
}
recode.dat <- function(dat, time.intervals) {
df <- dat[dat$follow.up <= sum(time.intervals), ]
time.points <- cumsum(time.intervals)
df$fu.interval <- sapply(df$follow.up, function(time) {
time.points[time <= time.points][1]
})
df
}
```
### `death` Distribution (?)
```{r}
dat |>
mutate(
accum.death = cumsum(death),
accum.censored = if_else(death == 1, 0, 1) |> cumsum()
) |>
ggplot() +
geom_step(aes(x = follow.up, y = accum.death))
```
```{r}
```
```{r}
plot.death = dat[dat$death == 1,] |>
ggplot(aes(x = follow.up)) +
geom_histogram(bins = 50)
plot.death
```
### Overall (?)
```{r}
time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
get.life.table(dat, time.intervals)
```
### `tx.type` (M)
```{r}
```
## Cox Model
### `age` (continuous) (L)
### `age` (categorical) (M)
### Full model (L)
```{r}
m1 <- coxph(Surv(follow.up, death) ~ hla.match + tx.type, data = dat)
summary(m1)
m2 <- coxph(Surv(follow.up, death) ~ hla.match * tx.type, data = dat)
summary(m2)
anova(m1, m2, test = "LRT")
```
### Discussion: include `year` or not?
## Assumption Testing
# Conclusion (M + L)