rggplot2survminer

Plotting a Cox PH model using ggforest in RStudio when a factor is stratified?


I am trying to find a way to make a forest plot of hazard ratios from a Cox-PH model when one of the model variables needs to be stratified. For a non-stratified model, the ggforest() function is excellent. Running some example code

library(survival)
library(survminer)

model <- coxph(Surv(time, status) ~ sex + rx + adhere,
               data = colon )
ggforest(model)

colon <- within(colon, {
  sex <- factor(sex, labels = c("female", "male"))
  differ <- factor(differ, labels = c("well", "moderate", "poor"))
  extent <- factor(extent, labels = c("submuc.", "muscle", "serosa", "contig."))
})
bigmodel <-
  coxph(Surv(time, status) ~ sex + rx + adhere + differ + extent + node4,
        data = colon )
ggforest(bigmodel)

produces this graph

non-stratified figure

However if I have to correct for non-proportionality with stratification

stratamodel <- coxph(Surv(time, status) ~ sex + strata(rx) + adhere + differ + extent + node4,
                     data = colon )
ggforest(stratamodel)

The following error message appears:

"Error in [.data.frame(data, , var) : undefined columns selected

In addition: Warning message:

In .get_data(model, data = data) : The data argument is not provided. Data will be extracted from model fit."

Any suggestions for how to get the information that ggforest needs from the strata model so that it can produce a plot? Thanks for your help!


Solution

  • I gather that the desired forest plot is one that simply skips the stratified RX variable in the model's formula. If so, we can simply insert an if clause inside, to ignore formula parts that don't correspond exactly to column names in the data (e.g. "strata(rx)" is not a column name).

    Approach 1

    If you are comfortable with R sufficiently to modify functions, run trace(ggforest, edit = TRUE) and replace the allTerms <- lapply(...) (around lines 10-25) in the pop-up window with the following version:

    allTerms <- lapply(seq_along(terms), function(i) {
      var <- names(terms)[i]
      if(var %in% colnames(data)) {
        if (terms[i] %in% c("factor", "character")) {
          adf <- as.data.frame(table(data[, var]))
          cbind(var = var, adf, pos = 1:nrow(adf))
        }
        else if (terms[i] == "numeric") {
          data.frame(var = var, Var1 = "", Freq = nrow(data), 
                     pos = 1)
        }
        else {
          vars = grep(paste0("^", var, "*."), coef$term, 
                      value = TRUE)
          data.frame(var = vars, Var1 = "", Freq = nrow(data), 
                     pos = seq_along(vars))
        }
      } else {
        message(var, "is not found in data columns, and will be skipped.")
      }
      
    })
    
    ggforest(stratamodel) # this should work after the modification
    

    plot

    The modification will not persist to subsequent R sessions. If you want to reverse the modification within the current session, simply run untrace(ggforest).

    Approach 2

    If you prefer to have a permanently modified version of the function for future use / don't want to muck around with a library's function, save the function below:

    ggforest2 <- function (model, data = NULL, main = "Hazard ratio", 
                           cpositions = c(0.02, 0.22, 0.4), fontsize = 0.7, 
                           refLabel = "reference", noDigits = 2) {
      conf.high <- conf.low <- estimate <- NULL
      stopifnot(class(model) == "coxph")
      data <- survminer:::.get_data(model, data = data)
      terms <- attr(model$terms, "dataClasses")[-1]
      coef <- as.data.frame(broom::tidy(model))
      gmodel <- broom::glance(model)
      allTerms <- lapply(seq_along(terms), function(i) {
        var <- names(terms)[i]
        if(var %in% colnames(data)) {
          if (terms[i] %in% c("factor", "character")) {
            adf <- as.data.frame(table(data[, var]))
            cbind(var = var, adf, pos = 1:nrow(adf))
          }
          else if (terms[i] == "numeric") {
            data.frame(var = var, Var1 = "", Freq = nrow(data), 
                       pos = 1)
          }
          else {
            vars = grep(paste0("^", var, "*."), coef$term, 
                        value = TRUE)
            data.frame(var = vars, Var1 = "", Freq = nrow(data), 
                       pos = seq_along(vars))
          }
        } else {
          message(var, "is not found in data columns, and will be skipped.")
        }    
      })
      allTermsDF <- do.call(rbind, allTerms)
      colnames(allTermsDF) <- c("var", "level", "N", 
                                "pos")
      inds <- apply(allTermsDF[, 1:2], 1, paste0, collapse = "")
      rownames(coef) <- gsub(coef$term, pattern = "`", replacement = "")
      toShow <- cbind(allTermsDF, coef[inds, ])[, c("var", "level", "N", "p.value", 
                                                    "estimate", "conf.low", 
                                                    "conf.high", "pos")]
      toShowExp <- toShow[, 5:7]
      toShowExp[is.na(toShowExp)] <- 0
      toShowExp <- format(exp(toShowExp), digits = noDigits)
      toShowExpClean <- data.frame(toShow, pvalue = signif(toShow[, 4], noDigits + 1), 
                                   toShowExp)
      toShowExpClean$stars <- paste0(round(toShowExpClean$p.value, noDigits + 1), " ", 
                                     ifelse(toShowExpClean$p.value < 0.05, "*", ""), 
                                     ifelse(toShowExpClean$p.value < 0.01, "*", ""), 
                                     ifelse(toShowExpClean$p.value < 0.001, "*", ""))
      toShowExpClean$ci <- paste0("(", toShowExpClean[, "conf.low.1"], 
                                  " - ", toShowExpClean[, "conf.high.1"], ")")
      toShowExpClean$estimate.1[is.na(toShowExpClean$estimate)] = refLabel
      toShowExpClean$stars[which(toShowExpClean$p.value < 0.001)] = "<0.001 ***"
      toShowExpClean$stars[is.na(toShowExpClean$estimate)] = ""
      toShowExpClean$ci[is.na(toShowExpClean$estimate)] = ""
      toShowExpClean$estimate[is.na(toShowExpClean$estimate)] = 0
      toShowExpClean$var = as.character(toShowExpClean$var)
      toShowExpClean$var[duplicated(toShowExpClean$var)] = ""
      toShowExpClean$N <- paste0("(N=", toShowExpClean$N, ")")
      toShowExpClean <- toShowExpClean[nrow(toShowExpClean):1, ]
      rangeb <- range(toShowExpClean$conf.low, toShowExpClean$conf.high, 
                      na.rm = TRUE)
      breaks <- axisTicks(rangeb/2, log = TRUE, nint = 7)
      rangeplot <- rangeb
      rangeplot[1] <- rangeplot[1] - diff(rangeb)
      rangeplot[2] <- rangeplot[2] + 0.15 * diff(rangeb)
      width <- diff(rangeplot)
      y_variable <- rangeplot[1] + cpositions[1] * width
      y_nlevel <- rangeplot[1] + cpositions[2] * width
      y_cistring <- rangeplot[1] + cpositions[3] * width
      y_stars <- rangeb[2]
      x_annotate <- seq_len(nrow(toShowExpClean))
      annot_size_mm <- fontsize * 
        as.numeric(grid::convertX(unit(theme_get()$text$size, "pt"), "mm"))
      p <- ggplot(toShowExpClean, 
                  aes(seq_along(var), exp(estimate))) + 
        geom_rect(aes(xmin = seq_along(var) - 0.5, 
                      xmax = seq_along(var) + 0.5, 
                      ymin = exp(rangeplot[1]), 
                      ymax = exp(rangeplot[2]), 
                      fill = ordered(seq_along(var)%%2 + 1))) +
        scale_fill_manual(values = c("#FFFFFF33",  "#00000033"), guide = "none") + 
        geom_point(pch = 15, size = 4) + 
        geom_errorbar(aes(ymin = exp(conf.low), ymax = exp(conf.high)), 
                      width = 0.15) + 
        geom_hline(yintercept = 1, linetype = 3) + 
        coord_flip(ylim = exp(rangeplot)) + 
        ggtitle(main) + 
        scale_y_log10(name = "", labels = sprintf("%g", breaks), 
                      expand = c(0.02, 0.02), breaks = breaks) + 
        theme_light() +
        theme(panel.grid.minor.y = element_blank(), 
              panel.grid.minor.x = element_blank(), 
              panel.grid.major.y = element_blank(), 
              legend.position = "none", 
              panel.border = element_blank(), 
              axis.title.y = element_blank(), 
              axis.text.y = element_blank(), 
              axis.ticks.y = element_blank(), 
              plot.title = element_text(hjust = 0.5)) + 
        xlab("") + 
        annotate(geom = "text", x = x_annotate, y = exp(y_variable), 
                 label = toShowExpClean$var, fontface = "bold", 
                 hjust = 0, size = annot_size_mm) + 
        annotate(geom = "text", x = x_annotate, y = exp(y_nlevel), hjust = 0, 
                 label = toShowExpClean$level, 
                 vjust = -0.1, size = annot_size_mm) + 
        annotate(geom = "text", x = x_annotate, y = exp(y_nlevel), 
                 label = toShowExpClean$N, fontface = "italic", hjust = 0, 
                 vjust = ifelse(toShowExpClean$level == "", 0.5, 1.1),
                 size = annot_size_mm) + 
        annotate(geom = "text", x = x_annotate, y = exp(y_cistring), 
                 label = toShowExpClean$estimate.1, size = annot_size_mm, 
                 vjust = ifelse(toShowExpClean$estimate.1 == "reference", 0.5, -0.1)) + 
        annotate(geom = "text", x = x_annotate, y = exp(y_cistring), 
                 label = toShowExpClean$ci, size = annot_size_mm, 
                 vjust = 1.1, fontface = "italic") + 
        annotate(geom = "text", x = x_annotate, y = exp(y_stars), 
                 label = toShowExpClean$stars, size = annot_size_mm, 
                 hjust = -0.2, fontface = "italic") +
        annotate(geom = "text", x = 0.5, y = exp(y_variable), 
                 label = paste0("# Events: ", gmodel$nevent, 
                                "; Global p-value (Log-Rank): ", 
                                format.pval(gmodel$p.value.log, eps = ".001"), 
                                " \nAIC: ", round(gmodel$AIC, 2), 
                                "; Concordance Index: ", round(gmodel$concordance, 2)), 
                 size = annot_size_mm, hjust = 0, vjust = 1.2, fontface = "italic")
      gt <- ggplot_gtable(ggplot_build(p))
      gt$layout$clip[gt$layout$name == "panel"] <- "off"
      ggpubr::as_ggplot(gt)
    }
    

    It's a snapshot of the ggforest function as it currently stands, with the same modification as above. If the package's creator makes modifications to the package in the future, this can break or become outdated. But for now, ggforest2(stratamodel) will yield the same result as Approach 1.