diff --git a/.lintr b/.lintr new file mode 100644 index 0000000..a2491c9 --- /dev/null +++ b/.lintr @@ -0,0 +1,5 @@ +linters: lintr::linters_with_defaults( + object_name_linter = lintr::object_name_linter( + styles = c("snake_case", "dotted.case") + ) + ) diff --git a/report.Rmd b/report.Rmd index 8d1c83f..f030138 100644 --- a/report.Rmd +++ b/report.Rmd @@ -8,20 +8,34 @@ number-sections: true ```{r} library(tidyverse) +library(dplyr) +library(ggplot2) library(survival) +library(emmeans) +library(foreign) # library(gtsummary) ``` ```{r} dat <- read.csv("./unos.txt", sep = "\t") head(dat) -``` - -```{r} names(dat) <- c("hla.match", "age.donor", "age.rec", "cold.isc", "death", "year", "sex", "tx.type", "follow.up") ``` +Complete case + +```{r} +nrow(dat) +``` + + +```{r} +sapply(names(dat), function(col) { + is.na(dat[, col]) |> sum() +}) +``` + # Exercise @@ -47,6 +61,10 @@ g + geom_boxplot(aes(x = age.rec, y = age.donor, group = age.donor)) # dat$age.1 |> table() ``` +```{r} +g + geom_boxplot(aes(x = year, y = follow.up, group = year)) +``` + ## Exercise 2 Plot the Kaplan-Meier overall survival curve for pediatric kid- @@ -115,7 +133,7 @@ print(nrow(dat)) dat.5.life ``` - +n.removed.acc ```{r} dat.5.life <- dat.5.life |> mutate( @@ -128,8 +146,8 @@ dat.5.life <- dat.5.life |> ```{r} -get_life_table = function(dat) { - dat <- dat |> +get_life_table = function(dat.sub, dat) { + dat.sub <- dat.sub |> group_by(fu.interval) |> summarize( n.censored = sum(death == 0), @@ -137,24 +155,24 @@ get_life_table = function(dat) { n.at.risk = nrow(dat), ) - for (i in 2:nrow(dat)) { + for (i in 2:nrow(dat.sub)) { j <- i - 1 - n.censored.pre <- dat$n.censored[j] - n.event.pre <- dat$n.event[j] - n.at.risk.pre <- dat$n.at.risk[j] + n.censored.pre <- dat.sub$n.censored[j] + n.event.pre <- dat.sub$n.event[j] + n.at.risk.pre <- dat.sub$n.at.risk[j] n.at.risk <- n.at.risk.pre - n.event.pre - n.censored.pre - dat$n.at.risk[i] <- n.at.risk + dat.sub$n.at.risk[i] <- n.at.risk } - dat <- dat |> + dat.sub <- dat.sub |> mutate( hazard.rate = n.event / n.at.risk ) - return(dat) + return(dat.sub) } ``` @@ -164,12 +182,12 @@ dat.5.tx1 = dat.5[dat.5$tx.type == 1, ] ``` ```{r} -tx0.life <- get_life_table(dat.5.tx0) +tx0.life <- get_life_table(dat.5.tx0, dat[dat$tx.type == 0, ]) tx0.life ``` ```{r} -tx1.life <- get_life_table(dat.5.tx1) +tx1.life <- get_life_table(dat.5.tx1, dat[dat$tx.type == 1, ]) tx1.life ``` @@ -195,6 +213,159 @@ ggplot(hazard.df, aes(x = fu.interval)) + ``` +--- + +General Solution: + +```{r} +get.life.table <- function(dat, time.intervals) { + n.pop <- nrow(dat) + + dat |> + recode.dat(time.intervals) |> + group_by(fu.interval) |> + summarize( + n.censored = sum(.data$death == 0), + n.event = sum(.data$death), + ) |> + ungroup() |> + calculate.hazard(n.pop) +} + +get.life.table.by.groups <- function(dat, time.intervals, grps) { + grps |> + lapply(function(grp) { + dat |> + get.life.table.by.group(time.intervals, grp) |> + mutate( + grp.name = grp, + grp.value = pick(1)[[1]] + ) |> + select(-1) + }) |> + bind_rows() +} + +get.life.table.by.group <- function(dat, time.intervals, grp) { + dat |> + recode.dat(time.intervals) |> + group_by(fu.interval, .data[[grp]]) |> + summarize( + n.censored = sum(.data$death == 0), + n.event = sum(.data$death), + .groups = "keep" + ) |> + ungroup(fu.interval) |> + group_modify(function(df.sub, grp) { + grp.name <- names(grp) + grp.value <- grp[[1]] + n.pop <- (dat[[grp.name]] == grp.value) |> sum() + calculate.hazard(df.sub, n.pop) + }) |> + ungroup() +} + +calculate.hazard <- function(life.table, n.pop) { + n.removed <- life.table$n.event + life.table$n.censored + n.removed.accum <- c(0, cumsum(n.removed)[-length(n.removed)]) + life.table |> + mutate( + n.at.risk = n.pop - n.removed.accum, + # TODO: how to account for censored? How do we adjust for uneven interval? + hazard.rate = n.event / n.at.risk + ) +} + +recode.dat <- function(dat, time.intervals) { + df <- dat[dat$follow.up <= sum(time.intervals), ] + time.points <- cumsum(time.intervals) + df$fu.interval <- sapply(df$follow.up, function(time) { + time.points[time <= time.points][1] + }) + + df +} + + + +``` + +```{r} +time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1) +grps <- c("tx.type", "sex", "hla.match", "cold.isc") + +get.life.table.by.groups(dat, time.intervals, grps) +``` + +```{r} +grps <- c("tx.type", "sex", "hla.match", "cold.isc") +grps <- c("tx.type") + + +plot.hazard.rate.by.groups <- function(dat, time.intervals, grps) { + dat |> + get.life.table.by.groups(time.intervals, grps) |> + ggplot(aes(x = fu.interval, y = hazard.rate, + color = paste(grp.name, grp.value, sep = " = "))) + + geom_point() + + geom_line() + + labs( + color = "Group" + ) + + theme( + legend.position = "bottom" + ) +} + +``` +```{r} +time.intervals <- c(rep(1/3, 3), rep(1, 4)) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +```{r} +time.intervals <- rep(1, 5) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +```{r} +time.intervals <- rep(1/3, 15) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +```{r} +time.intervals <- c(rep(1/3, 3*3), rep(1, 2)) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +```{r} +time.intervals <- c(rep(1/4, 4), rep(1, 4)) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +```{r} +time.intervals <- c(rep(1/4, 4*2), rep(1, 3)) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +```{r} +time.intervals <- c(rep(1/4, 4*2), rep(1/3, 3*3), rep(1, 6)) +plot.hazard.rate.by.groups(dat, time.intervals, c("tx.type")) +``` + +--- + +Compare + +```{r} +time.intervals <- c(1/3, 1/3, 1/3, 1, 1, 1, 1) +get.life.table.by.groups(dat, time.intervals, grps) +``` + +```{r} +hazard.df +``` + ## Exercise 4 @@ -214,31 +385,209 @@ Fit a univariate Cox model with predictor donor type. Report the hazard ratio and 95% confidence interval and interpret the result obtained. ```{r} -cox <- coxph(Surv(follow.up, death) ~ tx.type, data = dat) -summary(cox) +cox.tx <- coxph(Surv(follow.up, death) ~ tx.type, data = dat) +summary(cox.tx) ``` ```{r} -1.90539 +log.hazard.ratio.hat <- cox.tx$coefficient +hazard.ratio.hat <- log.hazard.ratio.hat |> exp() + ``` ```{r} -1.90539 + 1.96 * exp(0.09558) -(2.298 - 1.58) / 2 |> exp() / 2 +log.hazard.ratio.se <- summary(cox.tx)$coefficients[3] +log.hazard.ratio.hat.lower.bound <- log.hazard.ratio.hat - qnorm(0.975) * log.hazard.ratio.se +log.hazard.ratio.hat.upper.bound <- log.hazard.ratio.hat + qnorm(0.975) * log.hazard.ratio.se + +c(log.hazard.ratio.hat.lower.bound, log.hazard.ratio.hat.upper.bound) |> exp() ``` -Exercise 6 — Research shows that an important determinant of mortality +::: {.callout-note} +Patients receiving kidney transplants from living donors (`tx.type = 1`) has +1.91 higher hazard rate than those receiving from cadaveric donors +(`tx.type = 0`). + +Since 1 is not in the 95% confidence interval, the result is statistically +significant. +::: + + +# Exercise 6 + +Research shows that an important determinant of mortality after kidney transplant is the age of the recipient. Fit a Cox model with age as predictor and estimate the hazard ratio and its confidence interval. Consider age first as continuous variable and then divide into categories. -Exercise 7 — Fit a multivariate Cox model by using other predictors and +## Continuous + +```{r} +cox.age.rec <- coxph(Surv(follow.up, death) ~ age.rec, data = dat) +cox.age.rec |> summary() +``` + +## Categorical (Case 1) + +```{r} +recode.age <- function(dat, ages) { + df$age.group <- sapply(df$follow.up, function(time) { + time.points[time <= time.points][1] + }) + + df +} +``` + +```{r} +dat.age.rec <- dat[!is.na(dat$age.rec), ] +class.age <- function(age) { + if (age < 2) { + return("0,1") + } else if (age < 6) { + return("2,3,4,5") + } else if (age < 11) { + return("6,7,8,9,10") + } else if (age < 19) { + return("11-18") + } +} +``` + +```{r} +dat.age.rec$age.group <- sapply(dat.age.rec$age.rec, class.age) |> + factor(levels = c( + "0,1", + "2,3,4,5", + "6,7,8,9,10", + "11-18" + )) + + +cox.age.rec <- coxph(Surv(follow.up, death) ~ age.group, data = dat.age.rec) +cox.age.rec |> summary() +``` + +```{r} +emmeans(cox.age.rec, pairwise ~ age.group, type = "response") +``` + + +# Exercise 7 + +Fit a multivariate Cox model by using other predictors and describe your results. -Exercise 8 — Estimate the survival function for specific covariate patterns. + +```{r} +sapply(names(dat), function(col) { + is.na(dat[, col]) |> sum() +}) +``` + +```{r} +names(dat) +``` + +```{r} +Waldtest <- function(cox.fit, i.betas){ + q <- length(i.betas) + coefs <- cox.fit$coefficients + # print(cox.fit$var) + var <- cox.fit$var[i.betas, i.betas] + Wald <- coefs[i.betas]%*%solve(var)%*%coefs[i.betas] + p.value <- 1-pchisq(as.numeric(Wald), df = q) + return(cbind(Wald, p.value)) +} +``` + + +```{r} +fit.cox.model <- function(surv, i.betas) { + fit.tx <- coxph(surv, data = dat) + Waldtest(fit.tx, i.betas) +} +``` + +Start with `tx.type` + +```{r} +surv <- Surv(follow.up, death) ~ tx.type + age.rec +fit.cox.model(surv, 1:2) # NOTE: Why is this 1??? +``` + +Iteration 1: `tx.type + age + ?` + +```{r} +names(dat) +``` + +```{r} +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match +fit.cox.model(surv, 3) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + age.donor +fit.cox.model(surv, 3) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + cold.isc +fit.cox.model(surv, 3) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + year +fit.cox.model(surv, 3) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + sex +fit.cox.model(surv, 3) +``` + +Add `hla.match` + +Iteration 2: `tx.type + age.rec + hla.match + ?` + +```{r} +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor +fit.cox.model(surv, 4) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + cold.isc +fit.cox.model(surv, 4) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + year +fit.cox.model(surv, 4) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + sex +fit.cox.model(surv, 4) +``` + +Add `age.donor` + +Iteration 2: `tx.type + age.rec + hla.match + age.donor + ?` + +```{r} +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + + cold.isc +fit.cox.model(surv, 5) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + + year +fit.cox.model(surv, 5) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + + sex +fit.cox.model(surv, 5) +``` + +Add `year` + +```{r} +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + + year + cold.isc +fit.cox.model(surv, 6) +surv <- Surv(follow.up, death) ~ tx.type + age.rec + hla.match + age.donor + + year + sex +fit.cox.model(surv, 6) +``` + + + +Next, we test for each remaining variables + +# Exercise 8 + +Estimate the survival function for specific covariate patterns. Based on the previous results choose the best predictors. Exercise 9 — Check the proportional hazards assumption. You may use the function cox.zph. Discuss the result and possible implications. + Exercise 10 — Plot the Schoenfeld residuals and comment.