kidney/slides.Rmd
2026-05-19 17:25:57 +02:00

728 lines
16 KiB
Text

---
title: Slides
execute:
cache: true
freeze: auto
include: true
echo: false
number-sections: true
---
```{r}
library(tidyverse)
library(dplyr)
library(ggplot2)
library(survival)
library(emmeans)
library(foreign)
library(gtsummary)
library(gt)
library(ggsurvfit)
```
```{r}
dat <- read.csv("./unos.txt", sep = "\t")
names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
"year", "sex", "tx.type", "follow.up")
dat <- dat |>
mutate(
sex = factor(sex, levels = c(0,1), labels = c("Female","Male")),
tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")),
hla.match = factor(hla.match),
year = factor(year)
)
```
# Introduction: research question
## Survival after transplantation (M)
## Identify Predictors (M)
## Why Survival Analysis (L)
Motivation: study the distribution of time to event $T$.
Example: time of death after kidney transplant.
```{r}
ex <- data.frame(
id = c(1, 2, 3, 4, 5),
transplant = c(2000, 2000, 2001, 2003, 2004),
death = c(2005, 2009, 2005, 2004, 2016)
)
ex_table_1 <- ex |>
gt() |>
cols_label(
id = "ID",
transplant = "Transplant",
death = "Death"
) %>%
cols_align(align = "center", columns = everything())
ex_plot_real_time <- ggplot(ex) +
geom_segment(aes(x = transplant, xend = death, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = death, y = factor(id), color = "death"), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
theme_minimal()
```
```{r}
ex_table_1
ex_plot_real_time
```
---
We then calculate time to death for each respondents.
With this, all we need is to fit a linear model `t ~ X` or `log(t) ~ X`
```{r}
ex_t <- ex |>
mutate(
t = death - transplant
)
ex_table_2 <- ex_t |>
gt() |>
cols_label(
id = "ID",
transplant = "Transplant",
death = "Death",
t = "Time to death"
) |>
cols_align(align = "center", columns = everything())
ex_plot_uniform_time <- ggplot(ex_t) +
geom_segment(aes(x = 0, xend = t, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = t, y = factor(id), color = "death"), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
theme_minimal()
```
```{r}
ex_table_2
ex_plot_uniform_time
```
---
What if we do not know when exactly some respondents die?
Scenario 1: the study ends at the year 2008?
```{r}
ex_t_c <- ex_t |>
mutate(
death_censored = if_else(death <= 2008, death, 2008),
death_censored_txt = if_else(death <= 2008, as.character(death), "> 2008"),
status = if_else(death <= 2008, 1, 0),
t_1 = death_censored - transplant,
t_1_txt = if_else(death <= 2008, as.character(t_1),
paste(">", as.character(t_1)))
)
```
```{r}
ex_table_3 <- ex_t_c |>
select(id, transplant, death_censored_txt, t_1_txt) |>
gt() |>
cols_label(
id = "ID",
transplant = "Transplant",
death_censored_txt = "Death",
t_1_txt = "Time to death"
) |>
cols_align(align = "center", columns = everything())
ex_plot_real_time_1 <- ggplot(ex_t_c) +
geom_segment(aes(x = transplant, xend = death_censored, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = death_censored, y = factor(id), color =
if_else(status == 1, "death", "censored")), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
"censored" = "orange")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
xlim(2000, 2016) +
geom_vline(xintercept = 2008, linetype = "dashed", color = "orange",
linewidth = 0.8) +
theme_minimal()
ex_plot_uniform_time_1 <- ggplot(ex_t_c) +
geom_segment(aes(x = 0, xend = t_1, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = t_1, y = factor(id), color =
if_else(status == 1, "death", "censored")), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
"censored" = "orange")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
theme_minimal()
```
```{r}
ex_table_3
ex_plot_real_time_1
ex_plot_uniform_time_1
```
**Right Censoring**: only observe the event (death) if it occurs before a
certain time (2008).
---
Scenario 2: respondent 3 move away; loss follow up
```{r}
ex_alt <- ex_t_c
ex_alt$death_censored_txt[3] <- "> 2005"
ex_alt$status[3] <- 0
ex_alt$t_1_txt[3] <- "> 4"
ex_table_4 <- ex_alt |>
select(id, transplant, death_censored_txt, t_1_txt) |>
gt() |>
cols_label(
id = "ID",
transplant = "Transplant",
death_censored_txt = "Death",
t_1_txt = "Time to death"
) |>
cols_align(align = "center", columns = everything())
ex_plot_real_time_2 <- ggplot(ex_alt) +
geom_segment(aes(x = transplant, xend = death_censored, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = death_censored, y = factor(id), color =
if_else(status == 1, "death", "censored")), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
"censored" = "orange")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
xlim(2000, 2016) +
geom_vline(xintercept = 2008, linetype = "dashed", color = "orange",
linewidth = 0.8) +
theme_minimal()
ex_plot_uniform_time_2 <- ggplot(ex_alt) +
geom_segment(aes(x = 0, xend = t_1, y = factor(id),
yend = factor(id)), color = "grey50", linewidth = 1) +
geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
geom_point(aes(x = t_1, y = factor(id), color =
if_else(status == 1, "death", "censored")), size = 3) +
scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
"censored" = "orange")) +
scale_y_discrete(limits = rev) +
labs(x = "Year", y = "Subject ID", color = "Event") +
theme_minimal()
```
```{r}
ex_table_4
ex_plot_real_time_2
ex_plot_uniform_time_2
```
---
How many patients are right censored?
```{r}
dat |>
mutate(Overall = "Overall") |>
pivot_longer(
cols = c(Overall, sex, tx.type),
names_to = "Attribute",
values_to = "Category"
) |>
count(Attribute, Category, death) |>
ggplot(aes(x = Category, y = n, fill = factor(death))) +
geom_col(position = "dodge") +
facet_wrap(~ Attribute, scales = "free_x") +
labs(x = "Group", y = "Count", fill = "Death Status") +
theme_minimal()
```
## Challenges in Survival Analysis (L)
- **Right Censoring**: only observe the event if it occurs before a certain
time.
- **Left Censoring**: event has occurred prior to the start of a research
- Follow up every 3 years.
- Event has occurred -> event happened sometime before follow up.
- **Left Truncation**: delayed entry; respondents are included only if they
survived long enough.
- Start follow up with patients 100 days after the transplant
- Patients dies within 100 day wouldn't be included in the dataset
- **Right Truncation**: respondents are included only if they have already
experienced the event.
- Retrospective analysis from deceased patients between 1990 and 2000.
- Patients not dead before 2000 are not included in the dataset.
# Method & Result
## Explain Dataset (M)
::: {.callout-note}
# Things to highlight in figures
- sex and recipient age are similar across `tx.type`
- Living donors have less `hla.match` then cadaveric donors
- There is age difference between different `tx.type`
:::
### `sex` ~ `tx.tpye`
::: {.callout-note}
`sex` are similar across `tx.type`
:::
```{r, fig.width=7, fig.height=5}
dat |>
count(tx.type, sex) |>
group_by(tx.type) |>
mutate(
total = sum(n),
percent = n / total
) |>
ungroup() |>
ggplot() +
geom_col(aes(x = tx.type, y = percent, fill = sex), position = "dodge") +
labs(
title = "Percentage of Sex in Transplant Type",
y = "Percentage",
x = "Transplant Type",
fill = NULL
) +
scale_x_discrete(
labels = c("Cadaveric" = "Deceased"),
) +
theme_minimal() +
theme(
legend.position = "bottom",
plot.title = element_text(size = 17, face = "bold"),
axis.title = element_text(size = 15),
axis.text = element_text(size = 10),
legend.text = element_text(size = 12),
plot.margin = margin(20, 30, 20, 30)
)
```
## Table 1 (L)
### HLA match `hla.match`
```{r}
dat$hla.match |> table(useNA = "always")
```
```{r}
ggplot(dat, aes(x = hla.match)) +
geom_bar() +
labs(x = "HLA match", y = "Count") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = hla.match, fill = tx.type)) +
geom_bar(position = "dodge") +
labs(x = "HLA match", y = "Count") +
theme_minimal()
```
::: {.callout-warning}
TODO: change to percentage within each `tx.type`
:::
::: {.callout-note}
Living donors have less `hla.match` then cadaveric donors
:::
### Donor age `age.donor`
```{r}
#| echo: true
mean(dat$age.donor, na.rm = TRUE)
median(dat$age.donor, na.rm = TRUE)
```
```{r}
ggplot(dat, aes(x = age.donor)) +
geom_bar() +
labs(x = "Donor Age", y = "Count") +
theme_minimal()
```
```{r, fig.width = 8, fig.height = 12}
ggplot(dat, aes(x = age.donor)) +
geom_bar() +
labs(x = "Donor Age", y = "Count") +
theme_minimal() +
facet_grid(hla.match ~ .)
```
```{r, fig.width= 8, fig.height= 12}
ggplot(dat, aes(x = age.donor, color = tx.type)) +
geom_density() +
labs(x = "Donor Age", y = "Density") +
theme_minimal() +
facet_grid(hla.match ~ .)
```
```{r}
ggplot(dat, aes(x = age.donor, color = hla.match)) +
geom_density() +
labs(x = "Donor Age", y = "Count") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = as.factor(tx.type), y = age.donor)) +
geom_boxplot() +
theme_minimal()
```
::: {.callout-note}
There is age difference between different `tx.type`
:::
### Recipient Age `age.rec`
```{r}
#| echo: true
mean(dat$age.rec, na.rm = TRUE)
median(dat$age.rec, na.rm = TRUE)
```
```{r}
ggplot(dat, aes(x = age.rec)) +
geom_bar() +
labs(x = "Recipient Age", y = "Count") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = as.factor(age.rec), y = age.donor)) +
geom_boxplot() +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = age.rec, fill = tx.type)) +
geom_bar(position = "dodge") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = age.rec, color = hla.match)) +
geom_density() +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = as.factor(age.rec), y = cold.isc)) +
geom_boxplot() +
theme_minimal()
```
### Cold Ischemia Time `cold.isc`
```{r}
#| echo: true
mean(dat$cold.isc, na.rm = TRUE)
median(dat$cold.isc, na.rm = TRUE)
```
```{r}
ggplot(dat, aes(x = cold.isc)) +
geom_density() +
labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = cold.isc, color = tx.type, group = tx.type)) +
geom_density() +
labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = cold.isc)) +
geom_density() +
theme_minimal() +
facet_grid(tx.type ~ .)
```
### Transplant Type `tx.type`
```{r}
dat$tx.type |> table(useNA = "always")
```
```{r}
#| echo: true
dat$tx.type |> table()
```
```{r}
ggplot(dat, aes(x = tx.type)) +
geom_bar() +
labs(x = "Transplant Type", y = "Count") +
theme_minimal()
```
### Year `year`
```{r}
dat$year |> table()
```
```{r}
ggplot(dat, aes(x = year)) +
geom_bar() +
labs(x = "Year", y = "Count") +
theme_minimal()
```
```{r}
ggplot(dat, aes(x = year, y = follow.up)) +
geom_boxplot()
```
```{r}
ggplot(dat, aes(x = year, fill = tx.type)) +
geom_bar(position = "dodge")
```
```{r}
ggplot(dat, aes(x = year, y = age.rec)) +
geom_boxplot()
```
```{r}
ggplot(dat, aes(x = year, y = age.donor)) +
geom_boxplot()
```
## Kaplan-Meier
### Overall (L)
```{r}
km_all <- survfit(Surv(follow.up, death) ~ 1, data = dat)
summary(km_all, times = c(4, 8, 12))
```
```{r}
km_all |>
ggsurvfit(type = "survival") +
add_confidence_interval() +
scale_y_continuous(limits = c(0, 1), labels = scales::label_percent()) +
labs(
x = "Years of Follow-up",
y = "Overall Survival Probability"
) +
theme_minimal()
```
### `tx.type` (M)
```{r, fig.width=7, fig.height=5}
km.tx <- survfit(Surv(follow.up, death) ~ tx.type, data = dat)
km.tx |>
ggsurvfit(type = "survival") +
add_confidence_interval() +
scale_x_continuous(
breaks = seq(0, 20, by = 4)
) +
scale_y_continuous(
limits = c(0, 1),
breaks = seq(0, 1, by = 0.2),
labels = scales::label_number(accuracy = 0.1)
) +
labs(
x = "Years of Follow-up",
y = "Overall Survival Probability"
) +
theme_minimal() +
theme(
legend.position = "bottom",
plot.title = element_text(size = 17, face = "bold"),
axis.title = element_text(size = 15),
axis.text = element_text(size = 10),
legend.text = element_text(size = 12),
plot.margin = margin(20, 30, 20, 30)
)
```
## Cox Model
```{r}
get.life.table <- function(dat, time.intervals) {
n.pop <- nrow(dat)
dat |>
recode.dat(time.intervals) |>
group_by(fu.interval) |>
summarize(
n.censored = sum(.data$death == 0),
n.event = sum(.data$death),
) |>
ungroup() |>
calculate.hazard(n.pop)
}
get.life.table.by.groups <- function(dat, time.intervals, grps) {
grps |>
lapply(function(grp) {
dat |>
get.life.table.by.group(time.intervals, grp) |>
mutate(
grp.name = grp,
grp.value = pick(1)[[1]]
) |>
select(-1)
}) |>
bind_rows()
}
get.life.table.by.group <- function(dat, time.intervals, grp) {
dat |>
recode.dat(time.intervals) |>
group_by(fu.interval, .data[[grp]]) |>
summarize(
n.censored = sum(.data$death == 0),
n.event = sum(.data$death),
.groups = "keep"
) |>
ungroup(fu.interval) |>
group_modify(function(df.sub, grp) {
grp.name <- names(grp)
grp.value <- grp[[1]]
n.pop <- (dat[[grp.name]] == grp.value) |> sum()
calculate.hazard(df.sub, n.pop)
}) |>
ungroup()
}
calculate.hazard <- function(life.table, n.pop) {
n.removed <- life.table$n.event + life.table$n.censored
n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)])
life.table |>
mutate(
n.at.risk = n.pop - n.removed.accum,
# TODO: how to account for censored? How do we adjust for uneven interval?
hazard.rate = n.event / n.at.risk
)
}
recode.dat <- function(dat, time.intervals) {
df <- dat[dat$follow.up <= sum(time.intervals), ]
time.points <- cumsum(time.intervals)
df$fu.interval <- sapply(df$follow.up, function(time) {
time.points[time <= time.points][1]
})
df
}
```
### `death` Distribution (?)
```{r}
# dat |>
# mutate(
# accum.death = cumsum(death),
# accum.censored = if_else(death == 1, 0, 1) |> cumsum()
# ) |>
# ggplot() +
# geom_pont(aes(x = follow.up, y = accum.death, color = death))
```
```{r}
```
```{r}
plot.death = dat[dat$death == 1,] |>
ggplot(aes(x = follow.up)) +
geom_histogram(bins = 50)
plot.death
```
### Overall (L)
::: {.callout-note}
skip overall, jump directly to `tx.type`
:::
```{r}
time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
get.life.table(dat, time.intervals)
```
### `tx.type` (M)
### `age` (continuous) (L)
### `age` (categorical) (M)
### Full model (L)
```{r}
m1 <- coxph(Surv(follow.up, death) ~ hla.match + tx.type, data = dat)
summary(m1)
m2 <- coxph(Surv(follow.up, death) ~ hla.match * tx.type, data = dat)
summary(m2)
anova(m1, m2, test = "LRT")
```
### Discussion: include `year` or not?
## Assumption Testing
# Conclusion (M + L)