---
title: "foo"
execute:
  cache: true
  freeze: auto
number-sections: true
---

```{r}
library(tidyverse)
library(dplyr)
library(ggplot2)
library(survival)
library(emmeans)
library(foreign)
library(gtsummary)
library(ggsurvfit)
```

```{r}
dat <- read.csv("./unos.txt", sep = "\t")
head(dat)
names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death",
                "year", "sex", "tx.type", "follow.up")
```

Complete case

```{r}
nrow(dat)
```


```{r}
sapply(names(dat), function(col) {
  is.na(dat[, col]) |> sum()
})
```


# Exercise

## Exercise 1 

> Illustrate in a table the characteristics of the population (age, sex, race, 
donor, . . . ).

```{r}
hist(dat$age.donor)
hist(dat$age.rec)
hist(dat$cold.isc)
hist(dat$year)
hist(dat$follow.up)

unique(dat$hla.match)
unique(dat$year)
```
Change the order!!

```{r}
dat.table1 <- dat |> 
  mutate(
    sex = factor(sex, levels = c(0,1), labels = c("Female","Male")),
    tx.type = factor(tx.type, levels = c(0,1), labels = c("Cadaveric","Living")),
    hla.match = factor(hla.match),
    year = factor(year)
  )
```

```{r}
table1 <- dat.table1 |> 
  select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |> 
  tbl_summary(
    statistic = list(
      all_continuous() ~ "{median} ({p25}, {p75})"
    ),
    label = list(
      hla.match ~ "HLA matches, n(%)",
      age.donor ~ "Donor age, median (IQR)",
      age.rec ~ "Recipient age, median (IQR)",
      cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
      year ~ "Year of transplant",
      sex ~ "Sex, n(%)",
      tx.type ~ "Transplant type, n(%)"
    ),
    missing = "ifany"
  ) |> 
  modify_footnote(all_stat_cols() ~ NA)

table1
```

```{r}
table.split <- dat.table1 |> 
  select(hla.match, age.donor, age.rec, cold.isc, year, sex, tx.type) |> 
  tbl_summary(
    by = tx.type,
    statistic = list(
      all_continuous() ~ "{median} ({p25}, {p75})"
    ),
    label = list(
      hla.match ~ "HLA matches, n(%)",
      age.donor ~ "Donor age, median (IQR)",
      age.rec ~ "Recipient age, median (IQR)",
      cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
      year ~ "Year of transplant",
      sex ~ "Sex, n(%)"
    ),
    missing = "ifany"
  ) |> 
  add_overall() |> 
  modify_footnote(all_stat_cols() ~ NA)

table.split
```

```{r}
table.split <- dat.table1 |> 
  select(sex, age.rec, age.donor, hla.match, cold.isc, tx.type) |> 
  tbl_summary(
    by = tx.type,
    statistic = list(
      all_continuous() ~ "{median} ({p25}, {p75})"
    ),
    label = list(
      hla.match ~ "HLA matches, n(%)",
      age.donor ~ "Donor age, median (IQR)",
      age.rec ~ "Recipient age, median (IQR)",
      cold.isc ~ "Cold ischemic time (hours), median (IQR), ",
      sex ~ "Sex, n(%)"
    ),
    missing = "ifany"
  ) |> 
  add_overall() |> 
  modify_footnote(all_stat_cols() ~ NA)

table.split
```


Calculate median follow-up (reverse Kaplan-Meier method), with 95% CI
```{r}
rev_km <- survfit(Surv(follow.up, death == 0) ~ 1, data = dat)

summary(rev_km)$table

median_followup <- summary(rev_km)$table["median"]
confidence_interval <- c(summary(rev_km)$table["0.95LCL"], summary(rev_km)$table["0.95UCL"])
median_followup
confidence_interval
```


```{r}
g <- ggplot(dat)
```

```{r}
g + geom_point(aes(x = follow.up, y = death))
g +  geom_density(aes(x = follow.up))

g +  geom_density(aes(x = age.donor))
g +  geom_bar(aes(x = age.rec))

g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor))

# dat$age.1 |> table()
```

```{r}
g + geom_boxplot(aes(x = year, y = follow.up, group = year))
```

## Exercise 2

Plot the Kaplan-Meier overall survival curve for pediatric kid-
ney transplant recipients for the first 12 years after transplantation.

!!!!!!!! just set xlim to 12, do not remove those individuals !!!!!!!

```{r}
km <- survfit(Surv(follow.up, death) ~ 1, data = dat[dat$follow.up  <= 12, ])
plot(km)
```

## Exercise 3

We are going to compare mortality rates (hazard functions)
between children whose transplanted kidney was provided by a living donor
(in general a family member) and those whose source was recently deceased
(variable donor type: `txtype`). Use the life table method to calculate the death
rates for the first 5 years for each group (take in the first year intervals of 4
months and then look at each year) and show the results in a table. Estimate
the hazard ratio in each time interval as the ratio between the mortality rates
in the two groups. What do you notice?

```{r}
dat.5 <- dat[dat$follow.up <= 5, ]
head(dat.5)
classify_time_interval = function(fu) {
  if (fu <= 1/3) {
    return(1/3)
  } else if (fu <= 2/3) {
    return(2/3)
  } else if (fu <= 1) {
    return(1)
  }
  ceiling(fu)
}

dat.5$fu.interval <- sapply(dat.5$follow.up, classify_time_interval)
table(dat.5$fu.interval)
head(dat.5)
```

```{r}
dat.5.life <- dat.5 |>
  group_by(fu.interval) |>
  summarize(
    n.censored = sum(death == 0),
    n.event = sum(death),
    n.at.risk = nrow(dat),
  )

for (i in 2:nrow(dat.5.life)) {
  j <- i - 1

  n.censored.pre <- dat.5.life$n.censored[j]
  n.event.pre <- dat.5.life$n.event[j]
  n.at.risk.pre <- dat.5.life$n.at.risk[j]

  n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre

  dat.5.life$n.at.risk[i] <- n.at.risk
}

print(nrow(dat))
dat.5.life

```
n.removed.acc
```{r}
dat.5.life <- dat.5.life |>
  mutate(
     hazard.rate = n.event / n.at.risk
  )

```

---


```{r}
get_life_table = function(dat.sub, dat) {
  dat.sub <- dat.sub |>
    group_by(fu.interval) |>
    summarize(
      n.censored = sum(death == 0),
      n.event = sum(death),
      n.at.risk = nrow(dat),
    )

  for (i in 2:nrow(dat.sub)) {
    j <- i - 1

    n.censored.pre <- dat.sub$n.censored[j]
    n.event.pre <- dat.sub$n.event[j]
    n.at.risk.pre <- dat.sub$n.at.risk[j]

    n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre

    dat.sub$n.at.risk[i] <- n.at.risk
  }

  dat.sub <- dat.sub |>
    mutate(
       hazard.rate = n.event / n.at.risk
    )

  return(dat.sub)
}
```

```{r}
dat.5.tx0 = dat.5[dat.5$tx.type == 0, ]
dat.5.tx1 = dat.5[dat.5$tx.type == 1, ]
```

```{r}
tx0.life <- get_life_table(dat.5.tx0, dat[dat$tx.type == 0, ])
tx0.life
```

```{r}
tx1.life <- get_life_table(dat.5.tx1, dat[dat$tx.type == 1, ])
tx1.life
```


```{r}
tx1.life$hazard.rate / tx0.life$hazard.rate
```

```{r}
hazard.df <- data.frame(
  fu.interval = tx1.life$fu.interval,
  hazard.rate.0 = tx0.life$hazard.rate,
  hazard.rate.1 = tx1.life$hazard.rate,
  hazard.ratio = tx1.life$hazard.rate / tx0.life$hazard.rate
) 

ggplot(hazard.df, aes(x = fu.interval)) +
  geom_line(aes(y = hazard.rate.0), color = "blue") + 
  geom_line(aes(y = hazard.rate.1), color = "orange")

ggplot(hazard.df, aes(x = fu.interval)) +
  geom_line(aes(y = hazard.ratio))

```

---

General Solution:

```{r}
get.life.table <- function(dat, time.intervals) {
  n.pop <- nrow(dat)

  dat |>
    recode.dat(time.intervals) |>
    group_by(fu.interval) |>
    summarize(
      n.censored = sum(.data$death == 0),
      n.event = sum(.data$death),
    ) |>
    ungroup() |>
    calculate.hazard(n.pop)
}

get.life.table.by.groups <- function(dat, time.intervals, grps) {
  grps |>
    lapply(function(grp) {
      dat |>
        get.life.table.by.group(time.intervals, grp) |>
        mutate(
          grp.name = grp,
          grp.value = pick(1)[[1]]
        ) |>
        select(-1)
    }) |>
    bind_rows()
}

get.life.table.by.group <- function(dat, time.intervals, grp) {
  dat |>
    recode.dat(time.intervals) |>
    group_by(fu.interval, .data[[grp]]) |>
    summarize(
      n.censored = sum(.data$death == 0),
      n.event = sum(.data$death),
      .groups = "keep"
    ) |>
    ungroup(fu.interval) |>
    group_modify(function(df.sub, grp) {
      grp.name <- names(grp)
      grp.value <- grp[[1]]
      n.pop <- (dat[[grp.name]] == grp.value) |> sum()
      calculate.hazard(df.sub, n.pop)
    }) |>
    ungroup()
}

calculate.hazard <- function(life.table, n.pop) {
  n.removed <- life.table$n.event + life.table$n.censored
  n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)])
  life.table |>
    mutate(
      n.at.risk = n.pop - n.removed.accum,
      # TODO: how to account for censored? How do we adjust for uneven interval?
      hazard.rate = n.event / n.at.risk
    )
}

recode.dat <- function(dat, time.intervals) {
  df <- dat[dat$follow.up <= sum(time.intervals), ]
  time.points <- cumsum(time.intervals)
  df$fu.interval <- sapply(df$follow.up, function(time) {
      time.points[time <= time.points][1]
  })

  df
}
```

```{r}
time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
grps <- c("tx.type", "sex", "hla.match", "cold.isc")

get.life.table.by.groups(dat, time.intervals, grps)
```

```{r}
grps <- c("tx.type", "sex", "hla.match", "cold.isc")
grps <- c("tx.type")


plot.hazard.rate.by.groups <- function(dat, time.intervals, grps) {
  dat |>
    get.life.table.by.groups(time.intervals, grps) |>
    ggplot(aes(x = fu.interval, y = hazard.rate,
               color = paste(grp.name, grp.value, sep = " = "))) +
    geom_point() +
    geom_line() +
    labs(
      color = "Group"
    ) +
    theme(
      legend.position = "bottom"
    )
}

```
```{r}
time.intervals <- c(rep(1/3, 3), rep(1, 4))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```

```{r}
time.intervals <- rep(1, 5) 
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```

```{r}
time.intervals <- rep(1/3, 15)
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```

```{r}
time.intervals <- c(rep(1/3, 3*3), rep(1, 2))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```

```{r}
time.intervals <- c(rep(1/4, 4), rep(1, 4))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```

```{r}
time.intervals <- c(rep(1/4, 4*2), rep(1, 3))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```

```{r}
time.intervals <- c(rep(1/4, 4*2), rep(1/3, 3*3), rep(1, 6))
plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type"))
```

---

Compare

```{r}
time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1)
get.life.table.by.groups(dat, time.intervals, grps)
```

```{r}
hazard.df
```


## Exercise 4

Show a plot with Kaplan-Meier survival curves for the two donor types.

```{r}
km.tx <- survfit(Surv(follow.up, death) ~ tx.type, data = dat[dat$follow.up  <= 12, ])

plot(km.tx, col = c("blue", "orange"))
legend(legend = c("cadaveric", "living"), "bottomleft", lwd = 2, col = c("blue", "orange"))
```

## Exercise 5

Fit a univariate Cox model with predictor donor type. Report
the hazard ratio and 95% confidence interval and interpret the result obtained.

```{r}
cox.tx <- coxph(Surv(follow.up, death) ~ tx.type, data = dat)
summary(cox.tx)
```

```{r}
log.hazard.ratio.hat <- cox.tx$coefficient
hazard.ratio.hat <- log.hazard.ratio.hat |> exp()

```

```{r}
log.hazard.ratio.se <- summary(cox.tx)$coefficients[3]
log.hazard.ratio.hat.lower.bound <- log.hazard.ratio.hat - qnorm(0.975) * log.hazard.ratio.se
log.hazard.ratio.hat.upper.bound <- log.hazard.ratio.hat + qnorm(0.975) * log.hazard.ratio.se

c(log.hazard.ratio.hat.lower.bound, log.hazard.ratio.hat.upper.bound) |> exp()
```

::: {.callout-note}
Patients receiving kidney transplants from living donors (`tx.type = 1`) has
1.91 higher hazard rate than those receiving from cadaveric donors 
(`tx.type = 0`).

Since 1 is not in the 95% confidence interval, the result is statistically 
significant.
:::


# Exercise 6

Research shows that an important determinant of mortality
after kidney transplant is the age of the recipient. Fit a Cox model with age
as predictor and estimate the hazard ratio and its confidence interval. Consider
age first as continuous variable and then divide into categories.

## Continuous

```{r}
cox.age.rec <- coxph(Surv(follow.up, death) ~ age.rec, data = dat)
cox.age.rec |> summary()
```

## Categorical (Case 1)

```{r}
recode.age <- function(dat, ages) {
  df$age.group <- sapply(df$follow.up, function(time) {
      time.points[time <= time.points][1]
  })

  df
}
```

```{r}
dat.age.rec <- dat[!is.na(dat$age.rec), ]
class.age <- function(age) {
  if (age < 2) { 
    return("0-1")
  } else if (age < 6) {
    return("2-5")
  } else if (age < 11) {
    return("6-10")
  } else if (age < 19) {
    return("11-18")
  }
}
dat.age.rec$age.group <- sapply(dat.age.rec$age.rec, class.age) |>
  factor(levels = c(
    "0-1",
    "2-5",
    "6-10",
    "11-18"
  ))


cox.age.rec <- coxph(Surv(follow.up, death) ~ age.group, data = dat.age.rec)
```

```{r}
emmeans(cox.age.rec, pairwise ~ age.group, type = "response")
```


# Exercise 7

Fit a multivariate Cox model by using other predictors and
describe your results.


```{r}
sapply(names(dat), function(col) {
  is.na(dat[, col]) |> sum()
})
```

```{r}
names(dat)
```

```{r}
Waldtest <- function(cox.fit, i.betas){
  q <- length(i.betas)
  coefs <- cox.fit$coefficients
  # print(cox.fit$var)
  var <- cox.fit$var[i.betas, i.betas]
  Wald <- coefs[i.betas]%*%solve(var)%*%coefs[i.betas]
  p.value <- 1-pchisq(as.numeric(Wald), df = q)
  return(cbind(Wald, p.value))
}
```


```{r}
fit.cox.model <- function(surv, i.betas) {
  fit.tx <- coxph(surv, data = dat)
  Waldtest(fit.tx, i.betas)
}
```

Start with `tx.type`

```{r}
surv <- Surv(follow.up, death) ~ tx.type + age.rec
fit.cox.model(surv, 1:2) # NOTE: Why is this 1???
```

Iteration 1: `tx.type + age + ?`

```{r}
names(dat)
```

```{r}
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match
fit.cox.model(surv, 3)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + age.donor
fit.cox.model(surv, 3)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + cold.isc
fit.cox.model(surv, 3)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + year
fit.cox.model(surv, 3)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + sex
fit.cox.model(surv, 3)
```

Add `hla.match`

Iteration 2: `tx.type + age.rec + hla.match + ?`

```{r}
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor
fit.cox.model(surv, 4)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + cold.isc
fit.cox.model(surv, 4)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + year
fit.cox.model(surv, 4)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + sex
fit.cox.model(surv, 4)
```

Add `age.donor`

Iteration 2: `tx.type + age.rec + hla.match + age.donor + ?`

```{r}
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
  cold.isc
fit.cox.model(surv, 5)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
  year
fit.cox.model(surv, 5)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +
  sex
fit.cox.model(surv, 5)
```

Add `year`

```{r}
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + 
  year + cold.isc
fit.cox.model(surv, 6)
surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + 
  year + sex
fit.cox.model(surv, 6)
```

```{r}

```


Next, we test for each remaining variables

# Exercise 8

Estimate the survival function for specific covariate patterns.
Based on the previous results choose the best predictors.

Exercise 9 — Check the proportional hazards assumption. You may use the
function cox.zph. Discuss the result and possible implications.

H0: PH-assumption holds
H1: PH-assumption doesn't hold

```{r}
# different cox model!!
cox.final <- coxph(Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor, data = dat)
cox.zph(cox.final)
cox.final
```


Exercise 10 — Plot the Schoenfeld residuals and comment.
A non-random pattern or slope in a plot of scaled residuals against time indicates a violation, suggesting the covariate's impact changes, while a horizontal line around zero supports the assumption.

```{r}
plot(cox.zph(cox.final))
```

```{r, fig.width=10, fig.height=10}
plot(cox.zph(cox.final))
```


NOTES:
left censoring: event happened before time zero -> not applicable
left truncation: not applicable??
right censoring: we saw this in the follow up times, administrative censoring high

! No left truncation, since our target population is patients who received a 
kidney transplant. So we don't care about patients who died before receiving
a transplant.

! ADD: we used a Cox model but we already saw that the PH assumption doesn't hold !

? remove year from prediction model, since it is not a patient characteristic and may not be stable for future predictions. It mainly captures changes in clinical practice rather than individual risk.