rdplyrmeansummarize

Summarise by group, then build average based on condition


I need to create a table that shows the average values of each variable DT06_1-DT06_19. I only have variables DT06_1-DT06_3, DT06_7, DT06_10-DT06_19.

The averages shall be computed by group. I have four groups shown in insuff_suff_res_cat. Not every participant shows values for each of the four categories. I don't have rows where the participant shows values for a category.

The averages on each DT06* variable shall only be computed where the respective DT01* variable shows the value "1" instead of "0".

My dataset:

structure(list(ID = c("poHYvL", "34HEg4", "poHYvL", "34HEg4", 
"Y0FPVt", "uXaA9j", "nIogJg", "9tAGOE", "Y0FPVt", "uXaA9j", 
"nIogJg", "9tAGOE"), insuff_suff_res_cat = c("not_enough_resources", 
"not_enough_resources", "not_enough_resources_help", "not_enough_resources_help", 
"enough_resources", "enough_resources", "enough_resources", "enough_resources", 
"enough_resources_help", "enough_resources_help", "enough_resources_help", 
"enough_resources_help"), DT01_1 = structure(c(0, 0, 0, 0, 1, 
0, 0, 0, 1, 0, 0, 0), label = "Printed textbooks:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_2 = structure(c(0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0), label = "Digital textbooks:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_3 = structure(c(0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0), label = "Photocopied materials:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_4 = structure(c(0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0), label = "Academic articles and other articles/blogs:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_5 = structure(c(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0), label = "Lecturer slides/lecturer notes/lecture recordings:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_6 = structure(c(0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0), label = "Notes from lectures (your notes or other students’ notes):Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_7 = structure(c(0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0), label = "Printed practice test questions or exam practice papers:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_8 = structure(c(0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0), label = "Practice test questions or exam practice papers on my VLE or LMS (e g , Moodle, Canvas, Blackboard):Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_9 = structure(c(0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0), label = "University VLE or LMS discussion forums:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_10 = structure(c(0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0), label = "Digital study / revision platforms / websites or apps (could include flashcards, test questions, multiple choice questions, content summaries etc):Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_11 = structure(c(0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1), label = "Video (e g , Youtube, TED talks or similar platforms):Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_12 = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "Social Media (e g , TikTok):Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_13 = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0), label = "Online chat forums:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_14 = structure(c(0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0), label = "Chat-GPT or other AI based chatbots (/or open-AI technologies):Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_15 = structure(c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1), label = "Study plan apps/ goal tracking apps:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_16 = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "Memory booster / distraction blocker apps:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_17 = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "Software development platforms / Data analysis software:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_18 = structure(c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0), label = "Tutoring services:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_19 = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "Other:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT06_1 = c(NA, NA, NA, NA, "150", NA, NA, NA, "170", NA, 
NA, NA), DT06_2 = c(NA, NA, NA, NA, "0", NA, "40", NA, "0", NA, 
"40", NA), DT06_3 = c(NA, NA, NA, NA, "0", NA, NA, NA, "0", NA, 
NA, NA), DT06_7 = c(NA, NA, NA, NA, "0", "0", NA, NA, "0", "0", 
NA, NA), DT06_10 = c(NA, NA, NA, NA, "0", "0", NA, NA, "0", "0", 
NA, NA), DT06_11 = c(NA, "0", NA, "0", "0", "0", NA, "60", "0", 
"0", NA, "45"), DT06_12 = c(NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_), 
    DT06_13 = c(NA, NA, NA, NA, "0", NA, NA, NA, "0", NA, NA, 
    NA), DT06_14 = c(NA, NA, NA, NA, "50", "0", NA, NA, "50", 
    "0", NA, NA), DT06_15 = c(NA, NA, NA, NA, NA, NA, NA, "21", 
    NA, NA, NA, "21"), DT06_16 = c(NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_), DT06_17 = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), DT06_18 = c(NA, 
    NA, NA, NA, NA, NA, "60", NA, NA, NA, "60", NA), DT06_19 = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_)), row.names = c(NA, 
-12L), class = "data.frame")

Desired outcome (numbers are invented):

not_enough_resources not_enough_resources_help enough_resources enough_resources_help
DT06_1 50 15 11 33
DT06_2 10 19 31 92
DT06_3 14 21 15 25
DT06_7 20 20 20 27
DT06_10 10 7 20 23
... ... ... ... ...
DT06_19 12 30 12 21

I tried to apply dplyr::summarize() but I can't find a solution to connect it to my conditions.


Solution

  • Using haven, dplyr, and tidyr, here is the workflow to return mean values of insuff_suff_res_cat groups from DT06* columns based on pairwise relationships with DT01* columns:

    1. As the DT06 column suffixes are not consecutive like the DT01 columns, create vector of DT06 suffixes to filter result
    2. Convert all DT* columns to as.numeric() so they can be interpreted by R and pivoted together
    3. Pivot long with all DT01* values in one column and all DT06* values in another column. DT* column suffixes stored in new ID1 column
    4. Filter so only DT01 == "1" remain
    5. Summarise DT06 column by groups defined by insuff_suff_res_cat and ID1
    6. Pivot insuff_suff_res_cat column to wide format
    7. Add any missing DT06 suffixes that may not have been included in the summarise result e.g. DT06_12, DT06_17, and DT06_19 in this example
    8. Filter so only DT06* suffixes remain
    9. Change ID1 values back their original values
    library(dplyr)
    library(tidyr)
    library(haven)
    
    # Create vector of DT06 suffix values e.g. c(1,2,3,7,10,...) for filtering result
    DT06_cols <- sub("^DT06_", "", grep("^DT06", names(df), value = TRUE))
    
    result <- df |>
      mutate(across(starts_with("DT"), as.numeric)) |>
      pivot_longer(cols = starts_with("DT"),
                   names_to = c(".value", "ID1"),
                   names_pattern = "(DT\\d+)_(\\d+)") |>
      filter(DT01 == 1) |>
      summarise(DT06 = round(mean(DT06, na.rm = TRUE), 2),
                .by = c(insuff_suff_res_cat, ID1)) |>
      pivot_wider(id_cols = ID1,
                  names_from = insuff_suff_res_cat,
                  values_from = DT06) |>
      complete(ID1 = DT06_cols) |>
      arrange(as.integer(ID1)) |>
      filter(ID1 %in% DT06_cols) |>
      mutate(ID1 = paste0("DT06_", sprintf("%02d", as.integer(ID1))))
    
    
    
    data.frame(result)
    #        ID1 not_enough_resources not_enough_resources_help enough_resources enough_resources_help
    # 1  DT06_01                   NA                        NA              150                   170
    # 2  DT06_02                   NA                       NaN               20                    20
    # 3  DT06_03                   NA                       NaN                0                     0
    # 4  DT06_07                   NA                        NA                0                     0
    # 5  DT06_10                   NA                        NA                0                     0
    # 6  DT06_11                    0                         0               20                    15
    # 7  DT06_12                   NA                        NA               NA                    NA
    # 8  DT06_13                   NA                        NA                0                     0
    # 9  DT06_14                   NA                        NA               25                    25
    # 10 DT06_15                   NA                        NA               21                    21
    # 11 DT06_16                   NA                        NA               NA                    NA
    # 12 DT06_17                   NA                        NA               NA                    NA
    # 13 DT06_18                   NA                        NA               60                    60
    # 14 DT06_19                   NA                        NA               NA                    NA
    

    The regular expression (regex) "(DT\\d+)_(\\d+)" works by defining two capture groups: