rggplot2facet-gridr-forestplotforest-plots

How to add correctly positioned labels/titles to a ggplot Forest Plot in R?


I have created the following forest plot (Plot 1), but I would like to add labels/titles in line and above the y-axis values (i.e., "Analyte") and above the p-values included on each of the panels/forest plot (i.e., "p-value"). In plot 1, I have managed to add a the title/label but am not able to position it in line with the values of "Assay" on the y-axis. Plot 2 below shows the desired output.

Plot 1 (Current Output):

enter image description here

As an example, the following plot (Plot 2) demonstrates how I would like the title/labels to be displayed (i.e., I would like them to appear in line with the relevant information analytes/p-values).

Plot 2 (Desired Layout):

enter image description here

Question: How can I modify my existing code for plot 1 to create plot 2?

Note that there is one value of QM_P_trend for each comparison (AvsB and CvsB) of each Assay/protein, but it is repeated within the data for each level of tertile.

Thank you!

Below is the R code I used to generate Plot 1, along with the data:



###################################################################################
# CREATE DATA -------------------------------------------------------------------#
###################################################################################

data <- structure(list(Carrier_comparison = c("AvsB", "AvsB", "AvsB", "CvsB", 
            "CvsB", "CvsB", "AvsB", "AvsB", "AvsB", "CvsB", "CvsB", "CvsB", 
            "AvsB", "AvsB", "AvsB", "CvsB", "CvsB", "CvsB"), Tertile = structure(c(3L, 
            2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 
            1L), levels = c("Tertile 3 (63-70)", "Tertile 2 (55-62)", "Tertile 1 (40-54)"
            ), class = "factor"), Assay = structure(c(1L, 1L, 1L, 1L, 1L, 
            1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), levels = c("ADGRV1", 
            "CD63", "PRDX3"), class = "factor"), BETA = c(0.0536645935253193, 
            0.00934331381571756, -0.0502503436470944, 0.00816704351606194, 
            0.0946005745129161, -0.0749759598409221, -0.0600444465527413, 
            0.0601560795464937, -0.0640637292430954, 0.0219960172083058, 
            -0.148417379449254, -0.0346517428785544, 0.118618573404726, 0.0822558056705285, 
            0.15602529221275, -0.0326073326703257, -0.054029294502272, -0.0346020391169203
            ), SE = c(0.11057661927178, 0.113129029834029, 0.126265725049744, 
            0.0823378264579776, 0.0866601604863716, 0.100479295994743, 0.102816408820442, 
            0.106832143822942, 0.117433994065687, 0.0774019365360251, 0.0821098859062016, 
            0.0932959318751491, 0.102738955735482, 0.108841714570024, 0.118096903894084, 
            0.0781497668089097, 0.0811784837707031, 0.0931611680323888), 
            P_uncorrected = c(0.627642070133785, 0.934211300388773, 0.690846714020907, 
            0.921016957706137, 0.275432313055544, 0.455911706235102, 
            0.559432213622232, 0.573597978019841, 0.585625011701019, 
            0.776350390578681, 0.0711164128651439, 0.710460149142023, 
            0.248707272854769, 0.450125272681661, 0.187040523394829, 
            0.676619698998927, 0.505916944765656, 0.71045895979309), 
            QM_P_trend = c(0.545836080293994, 0.545836080293994, 0.545836080293994, 
            0.7479209500248, 0.7479209500248, 0.7479209500248, 0.867112096895725, 
            0.867112096895725, 0.867112096895725, 0.57166587344432, 0.57166587344432, 
            0.57166587344432, 0.880298751128355, 0.880298751128355, 0.880298751128355, 
            0.949181439539369, 0.949181439539369, 0.949181439539369)), row.names = c(NA, 
            -18L), class = "data.frame")

###################################################################################
# CREATE PLOTS-------------------------------------------------------------------#
###################################################################################

# Packages 
library(tidyverse) # for maipulating data frames



# Amended forest plot function
forestplot_trend <- function (df, name = name, estimate = estimate, se = se, pvalue = NULL, 
                              ptrend = NULL, colour = NULL, shape = NULL, logodds = FALSE, 
                              psignif = 0.05, facet_var = NULL, ci = 0.95, ...) 
{
  # Ensure input is a data frame and logodds is a logical value
  stopifnot(is.data.frame(df))
  stopifnot(is.logical(logodds))
  
  # Convert input column names to quosures for tidy evaluation
  name <- enquo(name)
  estimate <- enquo(estimate)
  se <- enquo(se)
  pvalue <- enquo(pvalue)
  ptrend <- enquo(ptrend)
  facet_var <- enquo(facet_var)
  colour <- enquo(colour)
  shape <- enquo(shape)
  
  # Capture additional arguments passed to the function
  args <- list(...)
  
  # Compute the confidence interval constant (Z-score for normal distribution)
  const <- stats::qnorm(1 - (1 - ci)/2)
  
  # Prepare data frame: factorize `name` column, compute confidence intervals, and format labels
  df <- df %>% 
    dplyr::mutate(
      `:=`(!!name, factor(!!name, levels = !!name %>% unique() %>% rev(), ordered = TRUE)),  # Reverse factor levels
      .xmin = !!estimate - const * !!se,  # Lower CI bound
      .xmax = !!estimate + const * !!se,  # Upper CI bound
      .filled = TRUE,  # Default filled state
      .label = sprintf("%.2f", !!estimate)  # Format estimate values as strings
    )
  
  # If logodds is TRUE, exponentiate the estimates and confidence intervals
  if (logodds) {
    df <- df %>% mutate(
      .xmin = exp(.data$.xmin), 
      .xmax = exp(.data$.xmax), 
      `:=`(!!estimate, exp(!!estimate))
    )
  }
  
  # If p-value is provided, mark significant values based on threshold (psignif)
  if (!rlang::quo_is_null(pvalue)) {
    df <- df %>% dplyr::mutate(.filled = !!pvalue < !!psignif)
  }
  
  # Initialize ggplot with estimate on x-axis and name on y-axis and ptrend values
  g <- ggplot2::ggplot(df, aes(x = !!estimate, y = !!name)) +
    geom_text(
      aes(x = Inf, y = !!name, label = sprintf("%.3f", !!ptrend)),
      hjust = 1.3, vjust = 0.5, family = "Arial"
    ) 
  
  # Apply a forest plot theme and add background elements
  g <- g + theme_forest() + scale_colour_ng_d() + scale_fill_ng_d() + 
    geom_stripes() + 
    geom_vline(xintercept = ifelse(logodds, 1, 0), linetype = "solid", size = 0.4, colour = "black")  # Reference line
  
  # Adjust the x-axis to give more space for the labels
  g <- g + coord_cartesian(xlim = c(min(df[[as_label(estimate)]]) - 0.5, max(df[[as_label(estimate)]]) + 0.5))  # Increase xlim range
  
  # Add effect estimates with confidence intervals
  g <- g + geom_effect(
    ggplot2::aes(
      xmin = .data$.xmin, xmax = .data$.xmax, 
      colour = !!colour, shape = !!shape, filled = .data$.filled
    ), 
    position = ggstance::position_dodgev(height = 0.5)
  ) + 
    ggplot2::scale_shape_manual(values = c(21L, 22L, 23L, 24L, 25L)) + 
    guides(colour = guide_legend(reverse = TRUE), shape = guide_legend(reverse = TRUE))
  
  # Add optional title, subtitle, caption, x-label, and y-label if provided
  if ("title" %in% names(args)) {
    g <- g + labs(title = args$title)
  }
  if ("subtitle" %in% names(args)) {
    g <- g + labs(subtitle = args$subtitle)
  }
  if ("caption" %in% names(args)) {
    g <- g + labs(caption = args$caption)
  }
  if ("xlab" %in% names(args)) {
    g <- g + labs(x = args$xlab)
  }
  if (!"ylab" %in% names(args)) {
    args$ylab <- ""
  }
  g <- g + labs(y = args$ylab) 
  
  # Set axis limits if provided
  if ("xlim" %in% names(args)) {
    g <- g + coord_cartesian(xlim = args$xlim)
  }
  if ("ylim" %in% names(args)) {
    g <- g + ylim(args$ylim)
  }
  
  # Adjust x-axis tick marks if provided and logodds is FALSE
  if ("xtickbreaks" %in% names(args) & !logodds) {
    g <- g + scale_x_continuous(breaks = args$xtickbreaks)
  }
  
  
  return(g)
}


# Create the forest plot function
forest_plot_data <- function(proteins, data, main_comparison) {
  # Define the desired order for Carrier_comparison based on main_comparison
  comparison_levels <- if (main_comparison == "C") {
    c("CvsB", "AvsB")
  } else {
    c("AvsB", "CvsB")
  }
  
  # Filter data for selected proteins
  data <- data %>%
    filter(Assay %in% proteins) 

  # Ensure the Carrier_comparison is a factor with the desired level order
  data$Carrier_comparison <- factor(data$Carrier_comparison, levels = comparison_levels)
  
  # Plot
  forestplot_trend(
    df = data,
    name = Assay,  # Use combined label for y-axis
    estimate = BETA,
    se = SE,
    pvalue = P_uncorrected, 
    ptrend = QM_P_trend, # Add QM_P_trend values
    psignif = 0.01, # fill estimate if significant at this level
    ci = 0.95,
    logodds = FALSE,
    xlab = "Beta (95% confidence intervals)",
    ylab="Analyte",
    colour = Tertile,
    shape = Tertile,
    position = position_dodge(width = 0.7)  # Increase dodge width for better separation
  ) +
    facet_grid(. ~ Carrier_comparison, scales = "free", space = "free") +
    theme(
      text = element_text(family = "Arial"),  # Change "Arial" to your preferred font
      strip.text = element_text(size = 14, color = "black", hjust = 0.5, vjust = 5.5, margin = margin(t = 15, b = 15)),
      axis.text.x = element_text(size = 12, color = "black"),
      axis.text.y = element_text(size = 12, color = "black"),
      axis.title.x = element_text(size = 14, color = "black", vjust = 0.5),
      axis.title.y = element_text(size = 14, color = "black", face = "bold", angle = 0, vjust = 1.03, hjust = -5),  # Rotated y-axis title
      legend.title = element_text(size = 14, color = "black"),
      legend.text = element_text(size = 12, color = "black"),  # Customize legend labels
      legend.key.size = unit(1.5, "lines")
    )
}


# Create list of proteins
proteins <- levels(data$Assay)

# Generate plots
CvsB_forest_plot <- forest_plot_data(proteins, data, main_comparison = "C")


CvsB_forest_plot


Solution

  • I managaged to align the y-axis title using the margin argument instead of hjust. hjust did not work as expected, due to using the argument angle = 0. This is because justification works relative to the text direction.

    To add the p-value labels I then used geom_text.

    Plot produced:

    enter image description here

    Code used to produce the plot:

    ###################################################################################
    # CREATE PLOTS-------------------------------------------------------------------#
    ###################################################################################
    
    # Load necessary libraries
    library(tidyverse)
    
    # Amended forest plot function
    forestplot_trend <- function (df, name = name, estimate = estimate, se = se, pvalue = NULL, 
                                  ptrend = NULL, colour = NULL, shape = NULL, logodds = FALSE, 
                                  psignif = 0.05, facet_var = NULL, ci = 0.95, ...) 
    {
      # Ensure input is a data frame and logodds is a logical value
      stopifnot(is.data.frame(df))
      stopifnot(is.logical(logodds))
      
      # Convert input column names to quosures for tidy evaluation
      name <- enquo(name)
      estimate <- enquo(estimate)
      se <- enquo(se)
      pvalue <- enquo(pvalue)
      ptrend <- enquo(ptrend)
      facet_var <- enquo(facet_var)
      colour <- enquo(colour)
      shape <- enquo(shape)
      
      # Capture additional arguments passed to the function
      args <- list(...)
      
      # Compute the confidence interval constant (Z-score for normal distribution)
      const <- stats::qnorm(1 - (1 - ci)/2)
      
      # Prepare data frame: factorize `name` column, compute confidence intervals, and format labels
      df <- df %>% 
        dplyr::mutate(
          `:=`(!!name, factor(!!name, levels = !!name %>% unique() %>% rev(), ordered = TRUE)),  # Reverse factor levels
          .xmin = !!estimate - const * !!se,  # Lower CI bound
          .xmax = !!estimate + const * !!se,  # Upper CI bound
          .filled = TRUE,  # Default filled state
          .label = sprintf("%.2f", !!estimate)  # Format estimate values as strings
        )
      
      # If logodds is TRUE, exponentiate the estimates and confidence intervals
      if (logodds) {
        df <- df %>% mutate(
          .xmin = exp(.data$.xmin), 
          .xmax = exp(.data$.xmax), 
          `:=`(!!estimate, exp(!!estimate))
        )
      }
      
      # If p-value is provided, mark significant values based on threshold (psignif)
      if (!rlang::quo_is_null(pvalue)) {
        df <- df %>% dplyr::mutate(.filled = !!pvalue < !!psignif)
      }
      
      # Initialize ggplot with estimate on x-axis and name on y-axis and ptrend values
      g <- ggplot2::ggplot(df, aes(x = !!estimate, y = !!name)) 
      
      # Apply a forest plot theme and add background elements
      g <- g + theme_forest() + scale_colour_ng_d() + scale_fill_ng_d() + 
        geom_stripes() + 
        geom_vline(xintercept = ifelse(logodds, 1, 0), linetype = "solid", size = 0.4, colour = "black")  # Reference line
      
      # Adjust the x-axis to give more space for the labels
      g <- g + coord_cartesian(xlim = c(min(df[[as_label(estimate)]]) - 0.5, max(df[[as_label(estimate)]]) + 0.5))  # Increase xlim range
      
      # Add effect estimates with confidence intervals
      g <- g + geom_effect(
        ggplot2::aes(
          xmin = .data$.xmin, xmax = .data$.xmax, 
          colour = !!colour, shape = !!shape, filled = .data$.filled
        ), 
        position = ggstance::position_dodgev(height = 0.5)
      ) + 
        ggplot2::scale_shape_manual(values = c(21L, 22L, 23L, 24L, 25L)) + 
        guides(colour = guide_legend(reverse = TRUE), shape = guide_legend(reverse = TRUE))
      
      # Add optional title, subtitle, caption, x-label, and y-label if provided
      if ("title" %in% names(args)) {
        g <- g + labs(title = args$title)
      }
      if ("subtitle" %in% names(args)) {
        g <- g + labs(subtitle = args$subtitle)
      }
      if ("caption" %in% names(args)) {
        g <- g + labs(caption = args$caption)
      }
      if ("xlab" %in% names(args)) {
        g <- g + labs(x = args$xlab)
      }
      if (!"ylab" %in% names(args)) {
        args$ylab <- ""
      }
      g <- g + labs(y = args$ylab) 
      
      # Set axis limits if provided
      if ("xlim" %in% names(args)) {
        g <- g + coord_cartesian(xlim = args$xlim)
      }
      if ("ylim" %in% names(args)) {
        g <- g + ylim(args$ylim)
      }
      
      # Adjust x-axis tick marks if provided and logodds is FALSE
      if ("xtickbreaks" %in% names(args) & !logodds) {
        g <- g + scale_x_continuous(breaks = args$xtickbreaks)
      }
      
      
      return(g)
    }
    
    
    # Create the forest plot function with added labels
    forest_plot_data <- function(proteins, data, main_comparison) {
      # Define the desired order for Carrier_comparison based on main_comparison
      comparison_levels <- if (main_comparison == "C") {
        c("CvsB", "AvsB")
      } else {
        c("AvsB", "CvsB")
      }
      
      # Filter data for selected proteins
      data <- data %>%
        filter(Assay %in% proteins) 
      
      # Ensure the Carrier_comparison is a factor with the desired level order
      data$Carrier_comparison <- factor(data$Carrier_comparison, levels = comparison_levels)
      
      # Plot
      plot <- forestplot_trend(
        df = data,
        name = Assay,  # Use combined label for y-axis
        estimate = BETA,
        se = SE,
        pvalue = P_uncorrected, 
        ptrend = QM_P_trend, # Add QM_P_trend values
        psignif = 0.01, # fill estimate if significant at this level
        ci = 0.95,
        logodds = FALSE,
        xlab = "Beta (95% confidence intervals)",
        ylab = "Analyte",  
        colour = Tertile,
        shape = Tertile,
        position = position_dodge(width = 0.7)  # Increase dodge width for better separation
      ) +
        facet_grid(. ~ Carrier_comparison, scales = "free", space = "free") +
        theme(
          text = element_text(family = "Arial"),  # Change "Arial" to your preferred font
          strip.text = element_text(size = 16, color = "black", hjust = 0.5, vjust = 5.5, margin = margin(t = 18, b = 15)),
          axis.text.x = element_text(size = 12, color = "black"),
          axis.text.y = element_text(size = 12, color = "black"),
          axis.title.x = element_text(size = 14, color = "black", vjust = 0.5),
          axis.title.y = element_text(size = 14, color = "black", face = "bold", angle = 0, vjust = 1.00, margin = margin(r = -50)),  # Rotated y-axis title
          legend.title = element_text(size = 14, color = "black"),
          legend.text = element_text(size = 12, color = "black"),  # Customize legend labels
          legend.key.size = unit(1.5, "lines")
        )
      
      # Add "p-value" label above p-values in each panel
      plot <- plot + 
        geom_text(
          data = data.frame(Carrier_comparison = comparison_levels, 
                            label = "bolditalic('p') * bold('-value')"),  # Correct expression for bold and italic
          aes(x = Inf, y = Inf, label = label),
          hjust = 1.1, vjust = 1.5, size = 4, family = "Arial", color = "black", 
          parse = TRUE  # Use parse = TRUE to interpret the label as an expression
        )   
      
      # Add QM_P_trend values aligned with Assay values on the y-axis
      plot <- plot + 
        geom_text(
          aes(x = Inf, y = Assay, label = sprintf("%.3f", QM_P_trend)),  # Display QM_P_trend values
          hjust = 1.3, vjust = 0.5, family = "Arial", size = 4, color = "black"
        )
      
      return(plot)
    }
    
    # Create list of proteins
    proteins <- levels(data$Assay)
    
    # Generate plots
    CvsB_forest_plot <- forest_plot_data(proteins, data, main_comparison = "C")
    
    # Display the plot
    CvsB_forest_plot