some changes

2026-05-19 09:37:05 +02:00 · 2026-05-19 09:37:05 +02:00 · a23e66a9ad
commit a23e66a9ad
parent bce631d9e0
4 changed files with 6565 additions and 757 deletions
--- a/report.Rmd
+++ b/report.Rmd
@ -13,7 +13,7 @@ library(ggplot2)
 library(survival)
 library(emmeans)
 library(foreign)
-# library(gtsummary)
+library(gtsummary)
 ```

 ```{r}
@ -45,6 +45,89 @@ sapply(names(dat), function(col) {
 > Illustrate in a table the characteristics of the population (age, sex, race, 
 donor, . . . ).

+```{r}
+hist(dat$age.donor)
+hist(dat$age.rec)
+hist(dat$cold.isc)
+hist(dat$year)
+hist(dat$follow.up)
+
+unique(dat$hla.match)
+unique(dat$year)
+```
+Change the order!!
+
+```{r}
+dat.table1 <- dat |> 
+  mutate(
+    sex = factor(sex, levels = c(0,1), labels = c("Female","Male")),
+    tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")),
+    hla.match = factor(hla.match),
+    year = factor(year)
+  )
+```
+
+```{r}
+table1 <- dat.table1 |> 
+  select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |> 
+  tbl_summary(
+    statistic = list(
+      all_continuous() ~ "{median} ({p25}, {p75})"
+    ),
+    label = list(
+      hla.match ~ "HLA matches, n(%)",
+      age.donor ~ "Donor age, median (IQR)",
+      age.rec ~ "Recipient age, median (IQR)",
+      cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
+      year ~ "Year of transplant",
+      sex ~ "Sex, n(%)",
+      tx.type ~ "Transplant type, n(%)"
+    ),
+    missing = "ifany"
+  ) |> 
+  modify_footnote(all_stat_cols() ~ NA)
+
+table1
+```
+
+```{r}
+table.split <- dat.table1 |> 
+  select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |> 
+  tbl_summary(
+    by = tx.type,
+    statistic = list(
+      all_continuous() ~ "{median} ({p25}, {p75})"
+    ),
+    label = list(
+      hla.match ~ "HLA matches, n(%)",
+      age.donor ~ "Donor age, median (IQR)",
+      age.rec ~ "Recipient age, median (IQR)",
+      cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
+      year ~ "Year of transplant",
+      sex ~ "Sex, n(%)"
+    ),
+    missing = "ifany"
+  ) |> 
+  add_overall() |> 
+  modify_footnote(all_stat_cols() ~ NA)
+
+table.split
+```
+
+
+Calculate median follow-up (reverse Kaplan-Meier method), with 95% CI
+```{r}
+rev_km <- survfit(Surv(follow.up, death == 0) ~ 1, data = dat)
+
+summary(rev_km)$table
+
+median_followup <- summary(rev_km)$table["median"]
+confidence_interval <- c(summary(rev_km)$table["0.95LCL"], summary(rev_km)$table["0.95UCL"])
+median_followup
+confidence_interval
+```
+
+
 ```{r}
 g <- ggplot(dat)
 ```
@ -70,12 +153,11 @@ g + geom_boxplot(aes(x = year, y = follow.up, group = year))
 Plot the Kaplan-Meier overall survival curve for pediatric kid-
 ney transplant recipients for the first 12 years after transplantation.

+!!!!!!!! just set xlim to 12, do not remove those individuals !!!!!!!
+
 ```{r}
 km <- survfit(Surv(follow.up, death) ~ 1, data = dat[dat$follow.up  <= 12, ])
 plot(km)
-
-
-
 ```

 ## Exercise 3
@ -577,6 +659,9 @@ surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
 fit.cox.model(surv, 6)
 ```

+```{r}
+
+```


 Next, we test for each remaining variables
@ -589,5 +674,36 @@ Based on the previous results choose the best predictors.
 Exercise 9 — Check the proportional hazards assumption. You may use the
 function cox.zph. Discuss the result and possible implications.

-Exercise 10 — Plot the Schoenfeld residuals and comment.
+H0: PH-assumption holds
+H1: PH-assumption doesn't hold
+
+```{r}
+# different cox model!!
+cox.final <- coxph(Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor, data = dat)
+cox.zph(cox.final)
+```
+
+
+Exercise 10 — Plot the Schoenfeld residuals and comment.
+A non-random pattern or slope in a plot of scaled residuals against time indicates a violation, suggesting the covariate's impact changes, while a horizontal line around zero supports the assumption.
+
+```{r}
+plot(cox.zph(cox.final))
+```
+
+
+
+NOTES:
+left censoring: event happened before time zero -> not applicable
+left truncation: not applicable??
+right censoring: we saw this in the follow up times, administrative censoring high
+
+! No left truncation, since our target population is patients who received a 
+kidney transplant. So we don't care about patients who died before receiving
+a transplant.
+
+! ADD: we used a Cox model but we already saw that the PH assumption doesn't hold !
+
+? remove year from prediction model, since it is not a patient characteristic and may not be stable for future predictions. It mainly captures changes in clinical practice rather than individual risk.
+

--- a/report.html
+++ b/report.html
--- a/slides.html
+++ b/slides.html
--- a/slides.qmd
+++ b/slides.qmd
@ -0,0 +1,609 @@
+---
+title: Slides
+execute:
+  cache: true
+  freeze: auto
+  include: true
+  echo: false
+number-sections: true
+---
+
+```{r}
+library(tidyverse)
+library(dplyr)
+library(ggplot2)
+library(survival)
+library(emmeans)
+library(foreign)
+library(gtsummary)
+library(gt)
+library(ggsurvfit)
+```
+
+```{r}
+dat <- read.csv("./unos.txt", sep = "\t")
+names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
+                "year", "sex", "tx.type", "follow.up")
+dat <- dat |> 
+  mutate(
+    sex = factor(sex, levels = c(0,1), labels = c("Female","Male")),
+    tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")),
+    hla.match = factor(hla.match),
+    year = factor(year)
+  )
+```
+
+# Introduction: research question
+
+## Survival after transplantation (M)
+
+## Identify Predictors (M)
+
+## Why Survival Analysis (L)
+
+Motivation: study the distribution of time to event $T$.
+
+Example: time of death after kidney transplant. 
+
+
+```{r}
+ex <- data.frame(
+  id = c(1, 2, 3, 4, 5),
+  transplant = c(2000, 2000, 2001, 2003, 2004),
+  death = c(2005, 2009, 2005, 2004, 2016)
+)
+
+ex_table_1 <- ex |>
+  gt() |>
+  cols_label(
+    id = "ID",
+    transplant = "Transplant",
+    death = "Death"
+  ) %>%
+  cols_align(align = "center", columns = everything())
+
+ex_plot_real_time <- ggplot(ex) +
+  geom_segment(aes(x = transplant, xend = death, y = factor(id),
+                   yend = factor(id)), color = "grey50", linewidth = 1) +
+  geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
+  geom_point(aes(x = death, y = factor(id), color = "death"), size = 3) +
+  scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) +
+  scale_y_discrete(limits = rev) +
+  labs(x = "Year", y = "Subject ID", color = "Event") +
+  theme_minimal()
+```
+
+```{r}
+ex_table_1
+ex_plot_real_time
+```
+
+---
+
+We then calculate time to death for each respondents. 
+
+With this, all we need is to fit a linear model `t ~ X` or `log(t) ~ X`
+
+```{r}
+ex_t <- ex |>
+  mutate(
+    t = death - transplant
+  )
+
+ex_table_2 <- ex_t |>
+  gt() |>
+  cols_label(
+    id = "ID",
+    transplant = "Transplant",
+    death = "Death",
+    t = "Time to death"
+  ) |>
+  cols_align(align = "center", columns = everything())
+
+ex_plot_uniform_time <- ggplot(ex_t) +
+  geom_segment(aes(x = 0, xend = t, y = factor(id),
+                   yend = factor(id)), color = "grey50", linewidth = 1) +
+  geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
+  geom_point(aes(x = t, y = factor(id), color = "death"), size = 3) +
+  scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D")) +
+  scale_y_discrete(limits = rev) +
+  labs(x = "Year", y = "Subject ID", color = "Event") +
+  theme_minimal()
+
+```
+
+
+```{r}
+ex_table_2
+ex_plot_uniform_time
+```
+
+---
+
+What if we do not know when exactly some respondents die?
+
+Scenario 1: the study ends at the year 2008?
+
+```{r}
+ex_t_c <- ex_t |>
+  mutate(
+    death_censored = if_else(death <= 2008, death, 2008),
+    death_censored_txt = if_else(death <= 2008, as.character(death), "> 2008"),
+    status = if_else(death <= 2008, 1, 0),
+    t_1 = death_censored - transplant,
+    t_1_txt = if_else(death <= 2008, as.character(t_1),
+                      paste(">", as.character(t_1)))
+  )
+```
+
+```{r}
+ex_table_3 <- ex_t_c |>
+  select(id, transplant, death_censored_txt, t_1_txt) |>
+  gt() |>
+  cols_label(
+    id = "ID",
+    transplant = "Transplant",
+    death_censored_txt = "Death",
+    t_1_txt = "Time to death"
+  ) |>
+  cols_align(align = "center", columns = everything())
+
+ex_plot_real_time_1 <- ggplot(ex_t_c) +
+  geom_segment(aes(x = transplant, xend = death_censored, y = factor(id),
+                   yend = factor(id)), color = "grey50", linewidth = 1) +
+  geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
+  geom_point(aes(x = death_censored, y = factor(id), color =
+                   if_else(status == 1, "death", "censored")), size = 3) +
+  scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
+                                "censored" = "orange")) +
+  scale_y_discrete(limits = rev) +
+  labs(x = "Year", y = "Subject ID", color = "Event") +
+  xlim(2000, 2016) +
+  geom_vline(xintercept = 2008, linetype = "dashed", color = "orange",
+             linewidth = 0.8) +
+  theme_minimal()
+
+
+ex_plot_uniform_time_1 <- ggplot(ex_t_c) +
+  geom_segment(aes(x = 0, xend = t_1, y = factor(id),
+                   yend = factor(id)), color = "grey50", linewidth = 1) +
+  geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
+  geom_point(aes(x = t_1, y = factor(id), color =
+                   if_else(status == 1, "death", "censored")), size = 3) +
+  scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
+                                "censored" = "orange")) +
+  scale_y_discrete(limits = rev) +
+  labs(x = "Year", y = "Subject ID", color = "Event") +
+  theme_minimal()
+```
+
+
+```{r}
+ex_table_3
+ex_plot_real_time_1
+ex_plot_uniform_time_1
+```
+
+**Right Censoring**: only observe the event (death) if it occurs before a 
+certain time (2008).
+
+---
+
+Scenario 2: respondent 3 move away; loss follow up
+
+```{r}
+ex_alt <- ex_t_c
+ex_alt$death_censored_txt[3] <- "> 2005"
+ex_alt$status[3] <- 0
+ex_alt$t_1_txt[3] <- "> 4"
+
+
+ex_table_4 <- ex_alt |>
+  select(id, transplant, death_censored_txt, t_1_txt) |>
+  gt() |>
+  cols_label(
+    id = "ID",
+    transplant = "Transplant",
+    death_censored_txt = "Death",
+    t_1_txt = "Time to death"
+  ) |>
+  cols_align(align = "center", columns = everything())
+
+ex_plot_real_time_2 <- ggplot(ex_alt) +
+  geom_segment(aes(x = transplant, xend = death_censored, y = factor(id),
+                   yend = factor(id)), color = "grey50", linewidth = 1) +
+  geom_point(aes(x = transplant, y = factor(id), color = "trans"), size = 3) +
+  geom_point(aes(x = death_censored, y = factor(id), color =
+                   if_else(status == 1, "death", "censored")), size = 3) +
+  scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
+                                "censored" = "orange")) +
+  scale_y_discrete(limits = rev) +
+  labs(x = "Year", y = "Subject ID", color = "Event") +
+  xlim(2000, 2016) +
+  geom_vline(xintercept = 2008, linetype = "dashed", color = "orange",
+             linewidth = 0.8) +
+  theme_minimal()
+
+
+ex_plot_uniform_time_2 <- ggplot(ex_alt) +
+  geom_segment(aes(x = 0, xend = t_1, y = factor(id),
+                   yend = factor(id)), color = "grey50", linewidth = 1) +
+  geom_point(aes(x = 0, y = factor(id), color = "trans"), size = 3) +
+  geom_point(aes(x = t_1, y = factor(id), color =
+                   if_else(status == 1, "death", "censored")), size = 3) +
+  scale_color_manual(values = c("trans" = "#00BFC4", "death" = "#F8766D",
+                                "censored" = "orange")) +
+  scale_y_discrete(limits = rev) +
+  labs(x = "Year", y = "Subject ID", color = "Event") +
+  theme_minimal()
+```
+
+```{r}
+ex_table_4
+ex_plot_real_time_2
+ex_plot_uniform_time_2
+```
+
+---
+
+How many patients are right censored?
+
+```{r}
+dat |>
+  mutate(Overall = "Overall") |>
+  pivot_longer(
+    cols = c(Overall, sex, tx.type), 
+    names_to = "Attribute", 
+    values_to = "Category"
+  ) |>
+  count(Attribute, Category, death) |>
+  ggplot(aes(x = Category, y = n, fill = factor(death))) +
+  geom_col(position = "dodge") +
+  facet_wrap(~ Attribute, scales = "free_x") +
+  labs(x = "Group", y = "Count", fill = "Death Status") +
+  theme_minimal()
+```
+
+## Challenges in Survival Analysis (L)
+
+- **Right Censoring**: only observe the event if it occurs before a certain 
+  time.
+- **Left Censoring**: event has occurred prior to the start of a research
+  - Follow up every 3 years.
+  - Event has occurred -> event happened sometime before follow up.
+- **Left Truncation**: delayed entry; respondents are included only if they
+  survived long enough.
+  - Start follow up with patients 100 days after the transplant
+  - Patients dies within 100 day wouldn't be included in the dataset
+- **Right Truncation**: respondents are included only if they have already
+  experienced the event.
+  - Retrospective analysis from deceased patients between 1990 and 2000.
+  - Patients not dead before 2000 are not included in the dataset.
+
+# Method & Result
+
+## Explain Dataset (M)
+
+## Table 1 (L)
+
+
+### HLA match `hla.match`
+
+```{r}
+dat$hla.match |> table(useNA = "always")
+```
+
+```{r}
+ggplot(dat, aes(x = hla.match)) +
+  geom_bar() +
+  labs(x = "HLA match", y = "Count") +
+  theme_minimal()
+
+```
+
+```{r}
+ggplot(dat, aes(x = hla.match, fill = tx.type)) +
+  geom_bar(position = "dodge") +
+  labs(x = "HLA match", y = "Count") +
+  theme_minimal()
+```
+
+### Donor age `age.donor`
+
+```{r}
+#| echo: true
+mean(dat$age.donor, na.rm = TRUE)
+median(dat$age.donor, na.rm = TRUE)
+```
+
+```{r}
+ggplot(dat, aes(x = age.donor)) +
+  geom_bar() +
+  labs(x = "Donor Age", y = "Count") +
+  theme_minimal()
+```
+
+```{r}
+#| fig-width: 8
+#| fig-height: 12
+ggplot(dat, aes(x = age.donor)) +
+  geom_bar() +
+  labs(x = "Donor Age", y = "Count") +
+  theme_minimal() +
+  facet_grid(hla.match ~ .)
+```
+
+```{r}
+ggplot(dat, aes(x = age.donor, color = hla.match)) +
+  geom_density() +
+  labs(x = "Donor Age", y = "Count") +
+  theme_minimal()
+```
+
+
+### Recipient Age `age.rec`
+
+```{r}
+#| echo: true
+mean(dat$age.rec, na.rm = TRUE)
+median(dat$age.rec, na.rm = TRUE)
+```
+
+
+```{r}
+ggplot(dat, aes(x = age.rec)) +
+  geom_bar() +
+  labs(x = "Recipient Age", y = "Count") +
+  theme_minimal()
+
+```
+
+```{r}
+ggplot(dat, aes(x = as.factor(age.rec), y = age.donor)) +
+  geom_boxplot() +
+  theme_minimal()
+
+```
+
+```{r}
+ggplot(dat, aes(x = age.rec, fill = tx.type)) +
+  geom_bar(position = "dodge") +
+  theme_minimal()
+
+```
+
+```{r}
+ggplot(dat, aes(x = age.rec, color = hla.match)) +
+  geom_density() +
+  theme_minimal()
+```
+
+```{r}
+ggplot(dat, aes(x = as.factor(age.rec), y = cold.isc)) +
+  geom_boxplot() +
+  theme_minimal()
+```
+
+
+### Cold Ischemia Time `cold.isc`
+
+```{r}
+#| echo: true
+mean(dat$cold.isc, na.rm = TRUE)
+median(dat$cold.isc, na.rm = TRUE)
+```
+
+```{r}
+ggplot(dat, aes(x = cold.isc)) +
+  geom_density() +
+  labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") +
+  theme_minimal()
+```
+
+```{r}
+ggplot(dat, aes(x = cold.isc, color = tx.type, group = tx.type)) +
+  geom_density() +
+  labs(x = "Cold Ischemia Time (hours)", y = "Probability Density") +
+  theme_minimal()
+```
+
+### Transplant Type `tx.type`
+
+```{r}
+#| echo: true
+dat$tx.type |> table()
+```
+
+```{r}
+ggplot(dat, aes(x = tx.type)) +
+  geom_bar() +
+  labs(x = "Transplant Type", y = "Count") +
+  theme_minimal()
+
+```
+
+### Year `year`
+
+```{r}
+dat$year |> table()
+```
+
+```{r}
+ggplot(dat, aes(x = year)) +
+  geom_bar() +
+  labs(x = "Year", y = "Count") +
+  theme_minimal()
+```
+
+```{r}
+ggplot(dat, aes(x = year, y = follow.up)) +
+  geom_boxplot()
+```
+
+```{r}
+ggplot(dat, aes(x = year, fill = tx.type)) +
+  geom_bar(position = "dodge")
+```
+
+```{r}
+ggplot(dat, aes(x = year, y = age.rec)) +
+  geom_boxplot()
+```
+
+```{r}
+ggplot(dat, aes(x = year, y = age.donor)) +
+  geom_boxplot()
+```
+
+## Overall Kaplan-Meier
+
+```{r}
+km_all <- survfit(Surv(follow.up, death) ~ 1, data = dat)
+```
+
+```{r}
+km_all |>
+  ggsurvfit(type = "survival") +
+  add_confidence_interval() +
+  scale_y_continuous(limits = c(0, 1), labels = scales::label_percent()) +
+  labs(
+    x = "Years of Follow-up",
+    y = "Overall Survival Probability"
+  ) +
+  theme_minimal()
+```
+
+## Hazard Functions 
+
+```{r}
+get.life.table <- function(dat, time.intervals) {
+  n.pop <- nrow(dat)
+
+  dat |>
+    recode.dat(time.intervals) |>
+    group_by(fu.interval) |>
+    summarize(
+      n.censored = sum(.data$death == 0),
+      n.event = sum(.data$death),
+    ) |>
+    ungroup() |>
+    calculate.hazard(n.pop)
+}
+
+get.life.table.by.groups <- function(dat, time.intervals, grps) {
+  grps |>
+    lapply(function(grp) {
+      dat |>
+        get.life.table.by.group(time.intervals, grp) |>
+        mutate(
+          grp.name = grp,
+          grp.value = pick(1)[[1]]
+        ) |>
+        select(-1)
+    }) |>
+    bind_rows()
+}
+
+get.life.table.by.group <- function(dat, time.intervals, grp) {
+  dat |>
+    recode.dat(time.intervals) |>
+    group_by(fu.interval, .data[[grp]]) |>
+    summarize(
+      n.censored = sum(.data$death == 0),
+      n.event = sum(.data$death),
+      .groups = "keep"
+    ) |>
+    ungroup(fu.interval) |>
+    group_modify(function(df.sub, grp) {
+      grp.name <- names(grp)
+      grp.value <- grp[[1]]
+      n.pop <- (dat[[grp.name]] == grp.value) |> sum()
+      calculate.hazard(df.sub, n.pop)
+    }) |>
+    ungroup()
+}
+
+calculate.hazard <- function(life.table, n.pop) {
+  n.removed <- life.table$n.event + life.table$n.censored
+  n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)])
+  life.table |>
+    mutate(
+      n.at.risk = n.pop - n.removed.accum,
+      # TODO: how to account for censored? How do we adjust for uneven interval?
+      hazard.rate = n.event / n.at.risk
+    )
+}
+
+recode.dat <- function(dat, time.intervals) {
+  df <- dat[dat$follow.up <= sum(time.intervals), ]
+  time.points <- cumsum(time.intervals)
+  df$fu.interval <- sapply(df$follow.up, function(time) {
+      time.points[time <= time.points][1]
+  })
+
+  df
+}
+```
+
+### `death` Distribution (?)
+
+```{r}
+dat |>
+  mutate(
+    accum.death = cumsum(death),
+    accum.censored = if_else(death == 1, 0, 1) |> cumsum()
+  ) |>
+  ggplot() +
+  geom_step(aes(x = follow.up, y = accum.death))
+```
+
+```{r}
+
+```
+
+```{r}
+plot.death = dat[dat$death == 1,] |>
+  ggplot(aes(x = follow.up)) +
+  geom_histogram(bins = 50)
+plot.death
+```
+
+
+### Overall (?)
+
+```{r}
+time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
+get.life.table(dat, time.intervals)
+```
+
+### `tx.type` (M)
+
+
+```{r}
+
+```
+
+
+## Cox Model
+
+### `age` (continuous) (L)
+
+### `age` (categorical) (M)
+
+### Full model (L)
+
+```{r}
+m1 <- coxph(Surv(follow.up, death) ~ hla.match + tx.type, data = dat)
+summary(m1)
+
+m2 <- coxph(Surv(follow.up, death) ~ hla.match * tx.type, data = dat)
+summary(m2)
+
+anova(m1, m2, test = "LRT")
+```
+
+### Discussion: include `year` or not?
+
+## Assumption Testing
+
+# Conclusion (M + L)