rdplyrmeancountifsummarize

R summarise by group, then build avg. based on condition


I have to create a table that shows average values of each variable DT06_1-DT06_19 (caution: I only have variables DT06_1-DT06_3, DT06_7, DT06_10-DT06_19. The averages shall be computed by group. I have four groups shown in insuff_suff_res_cat (caution: not every participant shows values for each of the four categories. I dont have rows where the participant shows values for a category). And last, the averages on each DT06-variable shall only be computed where the respective DT01-variable shows the value "1" instead of "0". So together: For each group of the variable "insuff_suff_res_cat", please calculate the average value of each DT06-variable (e.g., for DT06_1, for DT06_2 and so on) only if the respective DT01-variable == "1".

my dataset:

structure(list(ID = c("poHYvL", "34HEg4", "poHYvL", "34HEg4", 
"Y0FPVt", "uXaA9j", "nIogJg", "9tAGOE", "Y0FPVt", "uXaA9j", 
"nIogJg", "9tAGOE"), insuff_suff_res_cat = c("not_enough_resources", 
"not_enough_resources", "not_enough_resources_help", "not_enough_resources_help", 
"enough_resources", "enough_resources", "enough_resources", "enough_resources", 
"enough_resources_help", "enough_resources_help", "enough_resources_help", 
"enough_resources_help"), DT01_1 = structure(c(0, 0, 0, 0, 1, 
0, 0, 0, 1, 0, 0, 0), label = "Printed textbooks:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_2 = structure(c(0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0), label = "Digital textbooks:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_3 = structure(c(0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0), label = "Photocopied materials:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_4 = structure(c(0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0), label = "Academic articles and other articles/blogs:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_5 = structure(c(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0), label = "Lecturer slides/lecturer notes/lecture recordings:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_6 = structure(c(0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0), label = "Notes from lectures (your notes or other students’ notes):Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_7 = structure(c(0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0), label = "Printed practice test questions or exam practice papers:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_8 = structure(c(0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0), label = "Practice test questions or exam practice papers on my VLE or LMS (e g , Moodle, Canvas, Blackboard):Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_9 = structure(c(0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0), label = "University VLE or LMS discussion forums:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_10 = structure(c(0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0), label = "Digital study / revision platforms / websites or apps (could include flashcards, test questions, multiple choice questions, content summaries etc):Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_11 = structure(c(0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1), label = "Video (e g , Youtube, TED talks or similar platforms):Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_12 = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "Social Media (e g , TikTok):Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_13 = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0), label = "Online chat forums:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_14 = structure(c(0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0), label = "Chat-GPT or other AI based chatbots (/or open-AI technologies):Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_15 = structure(c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1), label = "Study plan apps/ goal tracking apps:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_16 = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "Memory booster / distraction blocker apps:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_17 = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "Software development platforms / Data analysis software:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_18 = structure(c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0), label = "Tutoring services:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT01_19 = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "Other:Tool / resource usage", format.spss = "F5.0", labels = c(Unchecked = 0, 
Checked = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), DT06_1 = c(NA, NA, NA, NA, "150", NA, NA, NA, "170", NA, 
NA, NA), DT06_2 = c(NA, NA, NA, NA, "0", NA, "40", NA, "0", NA, 
"40", NA), DT06_3 = c(NA, NA, NA, NA, "0", NA, NA, NA, "0", NA, 
NA, NA), DT06_7 = c(NA, NA, NA, NA, "0", "0", NA, NA, "0", "0", 
NA, NA), DT06_10 = c(NA, NA, NA, NA, "0", "0", NA, NA, "0", "0", 
NA, NA), DT06_11 = c(NA, "0", NA, "0", "0", "0", NA, "60", "0", 
"0", NA, "45"), DT06_12 = c(NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_), 
    DT06_13 = c(NA, NA, NA, NA, "0", NA, NA, NA, "0", NA, NA, 
    NA), DT06_14 = c(NA, NA, NA, NA, "50", "0", NA, NA, "50", 
    "0", NA, NA), DT06_15 = c(NA, NA, NA, NA, NA, NA, NA, "21", 
    NA, NA, NA, "21"), DT06_16 = c(NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_), DT06_17 = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), DT06_18 = c(NA, 
    NA, NA, NA, NA, NA, "60", NA, NA, NA, "60", NA), DT06_19 = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_)), row.names = c(NA, 
-12L), class = "data.frame")

the desired outcome (numbers are invented): this pictures shows my desired outcome

I have already tried to apply the summarize command but I really can't find a solution to connect it to my conditions. Any help appreciated.


Solution

  • Using haven, dplyr, and tidyr, here is the workflow to return mean values of insuff_suff_res_cat groups from DT06* columns based on pairwise relationships with DT01* columns:

    1. As the DT06 column suffixes are not consecutive like the DT01 columns, create vector of DT06 suffixes to filter result
    2. Convert all DT* columns to as.numeric() so they can be interpreted by R and pivoted together
    3. Pivot long with all DT01* values in one column and all DT06* values in another column. DT* column suffixes stored in new ID1 column
    4. Filter so only DT01 == "1" remain
    5. Summarise DT06 column by groups defined by insuff_suff_res_cat and ID1
    6. Pivot insuff_suff_res_cat column to wide format
    7. Add any missing DT06 suffixes that may not have been included in the summarise result e.g. DT06_12, DT06_17, and DT06_19 in this example
    8. Filter so only DT06* suffixes remain
    9. Change ID1 values back their original values
    library(dplyr)
    library(tidyr)
    library(haven)
    
    # Create vector of DT06 suffix values e.g. c(1,2,3,7,10,...) for filtering result
    DT06_cols <- sub("^DT06_", "", grep("^DT06", names(df), value = TRUE))
    
    result <- df |>
      mutate(across(starts_with("DT"), as.numeric)) |>
      pivot_longer(cols = starts_with("DT"),
                   names_to = c(".value", "ID1"),
                   names_pattern = "(DT\\d+)_(\\d+)") |>
      filter(DT01 == 1) |>
      summarise(DT06 = round(mean(DT06, na.rm = TRUE), 2),
                .by = c(insuff_suff_res_cat, ID1)) |>
      pivot_wider(id_cols = ID1,
                  names_from = insuff_suff_res_cat,
                  values_from = DT06) |>
      complete(ID1 = DT06_cols) |>
      arrange(as.integer(ID1)) |>
      filter(ID1 %in% DT06_cols) |>
      mutate(ID1 = paste0("DT06_", sprintf("%02d", as.integer(ID1))))
    
    
    
    data.frame(result)
    #        ID1 not_enough_resources not_enough_resources_help enough_resources enough_resources_help
    # 1  DT06_01                   NA                        NA              150                   170
    # 2  DT06_02                   NA                       NaN               20                    20
    # 3  DT06_03                   NA                       NaN                0                     0
    # 4  DT06_07                   NA                        NA                0                     0
    # 5  DT06_10                   NA                        NA                0                     0
    # 6  DT06_11                    0                         0               20                    15
    # 7  DT06_12                   NA                        NA               NA                    NA
    # 8  DT06_13                   NA                        NA                0                     0
    # 9  DT06_14                   NA                        NA               25                    25
    # 10 DT06_15                   NA                        NA               21                    21
    # 11 DT06_16                   NA                        NA               NA                    NA
    # 12 DT06_17                   NA                        NA               NA                    NA
    # 13 DT06_18                   NA                        NA               60                    60
    # 14 DT06_19                   NA                        NA               NA                    NA