How to generate a table calculates patient-years at risk& incidence rate per 100 patient-years, grouped by SOC/PT?

I can try to create this kind of complex nested structure table using gt, where each treatment contains multiple pieces of information. However, it seems relatively difficult to achieve this using gtsummary. Below are the code and output.

# Load necessary packages
library(dplyr)
library(tidyr)
library(gt)
library(pharmaverseadam)  # Package containing adae dataset

# Load data
data("adae")
data("adsl")

# Step 1: Calculate exposure time for each patient by treatment group, SOC, and PT
# First, identify all unique combinations of USUBJID, TRT01A, AESOC, AEDECOD
adsl <- pharmaverseadam::adsl %>%
  select(USUBJID, TRTSDT, TRTEDT)

all_combinations <- adae %>%
  select(USUBJID, TRT01A) %>%
  left_join(adsl, by = "USUBJID") %>%
  distinct() %>%
  # Cross join with all SOC/PT combinations
  crossing(
    adae %>% select(AESOC, AEDECOD) %>% distinct()
  )

# Get the actual AEs that occurred
# Modified to handle missing AENDT values
actual_aes <- adae %>%
  select(USUBJID, TRT01A, AESOC, AEDECOD, ASTDT, AENDT) %>%
  # Ensure start dates are valid
  filter(!is.na(ASTDT)) %>%
  # Join with ADSL to get TRTEDT for handling missing AENDT
  left_join(adsl %>% select(USUBJID, TRTEDT), by = "USUBJID") %>%
  # Replace missing AENDT with TRTEDT
  mutate(AENDT = if_else(is.na(AENDT), TRTEDT, AENDT)) %>%
  # Remove TRTEDT column as it's no longer needed
  select(-TRTEDT)

# Calculate exposure time
exposure_data <- all_combinations %>%
  # Left join to identify which combinations actually had AEs
  left_join(
    actual_aes,
    by = c("USUBJID", "TRT01A", "AESOC", "AEDECOD")
  ) %>%
  # Calculate exposure time based on whether AE occurred or not
  mutate(
    had_ae = !is.na(ASTDT),
    exposure_days = case_when(
      had_ae == TRUE ~ as.numeric(difftime(AENDT, ASTDT, units = "days")) + 1,
      had_ae == FALSE ~ as.numeric(difftime(TRTEDT, TRTSDT, units = "days")) + 1 + 112
    ),
    exposure_years = exposure_days / 365.25  # Convert to years
  )

# Step 2: Count AEs by treatment group, SOC, and PT
ae_counts_pt <- exposure_data %>%
  group_by(TRT01A, AESOC, AEDECOD) %>%
  summarise(
    n_with_ae = sum(had_ae, na.rm = TRUE),  # Count of patients with this specific AE
    total_patients = n_distinct(USUBJID),   # Total patients in this treatment group
    total_exposure_years = sum(exposure_years, na.rm = TRUE),  # Total exposure years
    .groups = "drop"
  ) %>%
  mutate(
    percent = 100 * n_with_ae / total_patients,
    # Incidence rate per 100 patient-years
    incidence_rate = 100 * n_with_ae / total_exposure_years
  )

# Step 3: Calculate SOC level statistics
ae_counts_soc <- ae_counts_pt %>%
  group_by(TRT01A, AESOC) %>%
  summarise(
    n_with_ae = sum(n_with_ae),  # Sum of all PTs within this SOC
    total_patients = first(total_patients),  # Same for all PTs within treatment
    total_exposure_years = sum(total_exposure_years),  # Sum exposure years
    .groups = "drop"
  ) %>%
  mutate(
    percent = 100 * n_with_ae / total_patients,
    incidence_rate = 100 * n_with_ae / total_exposure_years
  )

# Step 4: Calculate total counts across all treatments for sorting
# Calculate PT totals across all treatments
pt_total_counts <- ae_counts_pt %>%
  group_by(AESOC, AEDECOD) %>%
  summarise(
    pt_total = sum(n_with_ae),
    .groups = "drop"
  )

# Calculate SOC totals across all treatments
soc_total_counts <- ae_counts_soc %>%
  group_by(AESOC) %>%
  summarise(
    soc_total = sum(n_with_ae),
    .groups = "drop"
  )

# Join with the PT data and sort
ae_sorted_pt <- ae_counts_pt %>%
  left_join(pt_total_counts, by = c("AESOC", "AEDECOD")) %>%
  left_join(soc_total_counts, by = "AESOC") %>%
  # Add more significant indentation to PT (4 spaces)
  mutate(TERM = paste0("    ", AEDECOD))

# Join with the SOC data and sort
ae_sorted_soc <- ae_counts_soc %>%
  left_join(soc_total_counts, by = "AESOC") %>%
  # Use SOC as the term (no indentation)
  mutate(TERM = AESOC)

# Combine PT and SOC data and keep the sorting columns
ae_combined <- bind_rows(
  ae_sorted_soc %>% mutate(level = "SOC"),
  ae_sorted_pt %>% mutate(level = "PT")
)

# Create a sorting key for each SOC based on total count
soc_order <- soc_total_counts %>%
  arrange(desc(soc_total), AESOC) %>%
  mutate(soc_order = row_number())

# Add the SOC order to the combined data
ae_combined <- ae_combined %>%
  left_join(soc_order, by = "AESOC")

# Sort the data:
# 1. First by SOC total (using the soc_order)
# 2. Then by level (SOC before PT)
# 3. For PTs within a SOC, sort by PT total
# 4. If PT totals are equal, sort alphabetically
ae_combined <- ae_combined %>%
  arrange(
    soc_order,  # Sort by SOC total (descending)
    desc(level == "SOC"),  # SOC before PT
    desc(pt_total),  # Sort by PT total count (for PT rows)
    TERM  # If counts are equal, sort alphabetically
  )

# Step 5: Reshape the data to create a nested structure with statistics under each treatment
# First, remove the sorting columns before pivoting
ae_combined_clean <- ae_combined %>%
  select(-pt_total,  -soc_order)

ae_nested <- ae_combined_clean %>%
  # Pivot to create a nested structure
  pivot_wider(
    id_cols = c(AESOC, TERM, level),
    names_from = TRT01A,
    values_from = c(n_with_ae, percent, total_exposure_years, incidence_rate),
    names_sep = "_"  # This creates column names like "n_with_ae_Placebo"
  )

# Get unique treatment groups
treatment_groups <- unique(ae_counts_pt$TRT01A)

# Create column names for each treatment group and statistic
col_names <- list()
for (trt in treatment_groups) {
  col_names[[paste0("n_with_ae_", trt)]] <- "n"
  col_names[[paste0("percent_", trt)]] <- "%"
  col_names[[paste0("total_exposure_years_", trt)]] <- "Time at Risk"
  col_names[[paste0("incidence_rate_", trt)]] <- "Incidence Rate"
}

# Create a gt table with nested structure
ae_table <- ae_nested %>%
  # Remove AESOC column as it's redundant now
  select(-AESOC) %>%
  gt() %>%
  tab_header(
    title = "Adverse Events by System Organ Class and Preferred Term"
  ) %>%
  cols_label(
    TERM = "SOC/Preferred Term"
  ) %>%
  # Add column labels
  cols_label(.list = col_names) %>%
  # Hide the level column
  cols_hide(columns = "level")

# Dynamically add spanners for each treatment group
for (trt in treatment_groups) {
  ae_table <- ae_table %>%
    tab_spanner(
      label = trt,
      columns = starts_with(c(
        paste0("n_with_ae_", trt),
        paste0("percent_", trt),
        paste0("total_exposure_years_", trt),
        paste0("incidence_rate_", trt)
      ))
    )
}

# Continue with formatting
ae_table <- ae_table %>%
  # Format numbers
  fmt_number(
    columns = starts_with("percent_"),
    decimals = 1
  ) %>%
  fmt_number(
    columns = starts_with("total_exposure_years_"),
    decimals = 1
  ) %>%
  fmt_number(
    columns = starts_with("incidence_rate_"),
    decimals = 2
  ) %>%
  # Style the table
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(
      rows = level == "SOC"
    )
  ) %>%
  
  tab_style(
    style = cell_text(indent = px(20)),  
    locations = cells_body(
      columns = TERM,
      rows = level == "PT"
    )
  ) %>%
  tab_options(
    row_group.background.color = "lightgrey"
  ) %>%
  cols_align(
    align = "left",
    columns = TERM
  ) %>%
  cols_align(
    align = "right",
    columns = -TERM)
ae_table

I would like to know how to achieve this using gtsummary. Thank you.

Solution

It's a complex table, but each part of it is straightforward enough.

First note that gtsummary would typically put statistics like n and percent in the same cell. For a table like this, we'll need to build a table each summary statistic then merge them all together.

The general outline to follow here is

Prepare data
Create an analysis result dataset (ARD) for n and p, then the incidence rate and followup time.
Build a table for each statistic using the ARD.
Merge all the tables together

library(cards)
library(gtsummary)
library(tidyverse)

################# 1. Prepare data

adae <- pharmaverseadam::adae |> 
  mutate(TRT01A = factor(TRT01A)) |> 
  filter(AESOC %in% unique(AESOC)[1:2]) |> 
  slice(.by = "AESOC", 1) 
adsl <- pharmaverseadam::adsl |> 
  dplyr::filter(TRT01A != "Screen Failure") |> 
  mutate(TRT01A = factor(TRT01A))

# first some housekeeping, make a data frame with all subjects, soc, and ae
df_all_soc_ae <-
  adsl[c("USUBJID", "TRT01A")] |> 
  mutate(
    data = 
      adae[c("AESOC", "AEDECOD")] |> 
      distinct() |> 
      list()
  ) |> 
  unnest(data)

################# 2. Create an ARD

# Now create the ARD for rates of AE
ard_ae_rates <-
  adae |> 
  ard_stack_hierarchical(
    by = TRT01A,
    variables = c(AESOC, AEDECOD),
    id = USUBJID,
    denominator = pharmaverseadam::adsl
  )

# now create ARD with incidenence rates
ard_incidence_ae <-
  adae |> 
  # counting number of AEs
  summarise(
    .by = c(USUBJID, TRT01A, AESOC, AEDECOD),
    count = length(USUBJID)
  ) |> 
  right_join(
    df_all_soc_ae,
    by = c("USUBJID", "TRT01A", "AESOC", "AEDECOD")
  ) |> 
  mutate(
    # first give a count of zero to those without an AE
    count = coalesce(count, 0L),
    # for simplicicty, assume everyone is followed for half a year
    followup = 0.5
  ) |> 
  cardx::ard_incidence_rate(
    time = "followup",
    count = "count",
    by = TRT01A,
    strata = c(AESOC, AEDECOD),
    unit_label = "years"
  ) |> 
  # keep the incidence rate and the total followup time
  dplyr::filter(stat_name %in% c("estimate", "tot_person_time"))

# now repeat for SOC
ard_incidence_soc <-
  adae |> 
  # counting number of AEs
  summarise(
    .by = c(USUBJID, TRT01A, AESOC),
    count = length(USUBJID)
  ) |> 
  right_join(
    distinct(df_all_soc_ae, USUBJID, TRT01A, AESOC),
    by = c("USUBJID", "TRT01A", "AESOC")
  ) |> 
  mutate(
    # first give a count of zero to those without an AE
    count = coalesce(count, 0L),
    # for simplicicty, assume everyone is followed for half a year
    followup = 0.5
  ) |> 
  cardx::ard_incidence_rate(
    time = "followup",
    count = "count",
    by = TRT01A,
    strata = AESOC,
    unit_label = "years"
  ) |> 
  # keep the incidence rate and the total followup time
  dplyr::filter(stat_name %in% c("estimate", "tot_person_time"))

# next make the incidence ARDs look like the AE rates
ard_incidence_ae2 <-
  ard_incidence_ae |> 
  dplyr::select(-all_ard_variables()) |> 
  rename(variable = group3, variable_level = group3_level)
ard_incidence_soc2 <-
  ard_incidence_soc |> 
    dplyr::select(-all_ard_variables()) |> 
    rename(variable = group2, variable_level = group2_level)

################# 3. Build tables from ARD

tbl_n <-
  tbl_ard_hierarchical(
    ard_ae_rates, 
    by = "TRT01A", 
    variables = c("AESOC", "AEDECOD"),
    statistic = ~"{n}"
  ) |> 
  remove_footnote_header(everything()) |> 
  modify_spanning_header(all_stat_cols() ~ "**{level}**") |> 
  modify_header(all_stat_cols() ~ "n")
tbl_p <-
  tbl_ard_hierarchical(
    ard_ae_rates, 
    by = "TRT01A", 
    variables = c("AESOC", "AEDECOD"),
    statistic = ~"{p}%"
  ) |> 
  remove_footnote_header(everything()) |> 
  modify_spanning_header(all_stat_cols() ~ "**{level}**") |> 
  modify_header(all_stat_cols() ~ "%")
tbl_inc <-
  bind_ard(
    ard_incidence_ae2,
    ard_incidence_soc2
  ) |> 
  tbl_ard_hierarchical(
    by = "TRT01A", 
    variables = c("AESOC", "AEDECOD"),
    statistic = ~"{estimate}"
  ) |> 
  remove_footnote_header(everything()) |> 
  modify_spanning_header(all_stat_cols() ~ "**{level}**") |> 
  modify_header(all_stat_cols() ~ "Incidence per 100PY")
tbl_followup <-
  bind_ard(
    ard_incidence_ae2,
    ard_incidence_soc2
  ) |> 
  tbl_ard_hierarchical(
    by = "TRT01A", 
    variables = c("AESOC", "AEDECOD"),
    statistic = ~"{tot_person_time}"
  ) |> 
  remove_footnote_header(everything()) |> 
  modify_spanning_header(all_stat_cols() ~ "**{level}**") |> 
  modify_header(all_stat_cols() ~ "Tot. Follow-up")

################# 4. Merge to create final table

list(tbl_n, tbl_p, tbl_inc, tbl_followup) |> 
  tbl_merge(tab_spanner = FALSE) |> 
  # reorder columns
  modify_table_body(
    \(x) {
      column_order <- x |> select(all_stat_cols()) |> names() |> sort()
      relocate(x, all_of(column_order), .after = "label")
    }
  ) |> 
  modify_abbreviation("PY = Person-years")