rdplyr

How to group by single column and create Python-style list strings from multiple rows and columns


I want to group the data by mcode and create two different types of rows for each group.

Below is sample data.

   Cat1  Cat2  Cat3  mcode key   pcode needed
 1 C1    C2    C31   B3100 TRUE  P001  P001  
 2 C1    C2    C31   B3100 FALSE P002  P002  
 3 C1    C2    C31   B5500 TRUE  P003  P003  
 4 C1    C2    C31   B5500 FALSE P004  NA    
 5 C1    C2    C31   B5500 FALSE P005  NA    
 6 C1    C2    C32   B1000 TRUE  P006  NA    
 7 C1    C2    C32   B1000 FALSE P007  P007  
 8 C1    C2    C32   B1000 FALSE P008  NA    
 9 C1    C2    C32   B1000 FALSE P009  P009  
10 C1    C2    C32   B1000 FALSE P010  P010  

For each group, I want to get category values (Cat1, Cat2, Cat3) from rows where key is TRUE.

Additionally, I need to create Python-style list strings that combine all values from pcode and needed columns separately, excluding NA values.

Please note that key column is TRUE when mcode has different value for the first time.

Below is the expected output.

  mcode Cat1  Cat2  Cat3  type   extended_info                                             
1 B1000 C1    C2    C32   pcode  ['P006','P007','P008','P009','P010']
2 B1000 C1    C2    C32   needed ['P007','P009','P010']              
3 B3100 C1    C2    C31   pcode  ['P001','P002']                     
4 B3100 C1    C2    C31   needed ['P001','P002']                     
5 B5500 C1    C2    C31   pcode  ['P003','P004','P005']              
6 B5500 C1    C2    C31   needed ['P003']   

Here are tribble for reproduce data and expeceted output

df <- tribble(
  ~Cat1, ~Cat2, ~Cat3, ~mcode, ~key,   ~pcode, ~needed,
  "C1",        "C2",        "C31",       "B3100",      TRUE,   "P001",       "P001",
  "C1",        "C2",        "C31",       "B3100",      FALSE,  "P002",       "P002",
  "C1",        "C2",        "C31",       "B5500",      TRUE,   "P003",       "P003",
  "C1",        "C2",        "C31",       "B5500",      FALSE,  "P004",       NA,
  "C1",        "C2",        "C31",       "B5500",      FALSE,  "P005",       NA,
  "C1",        "C2",        "C32",       "B1000",      TRUE,   "P006",       NA,
  "C1",        "C2",        "C32",       "B1000",      FALSE,  "P007",       "P007",
  "C1",        "C2",        "C32",       "B1000",      FALSE,  "P008",       NA,
  "C1",        "C2",        "C32",       "B1000",      FALSE,  "P009",       "P009",
  "C1",        "C2",        "C32",       "B1000",      FALSE,  "P010",       "P010"
)
expected_output <- tribble(
  ~mcode, ~Cat1, ~Cat2, ~Cat3, ~type,   ~extended_info,
  "B1000", "C1", "C2", "C32", "pcode",  "['P006','P007','P008','P009','P010']",
  "B1000", "C1", "C2", "C32", "needed", "['P007','P009','P010']",
  "B3100", "C1", "C2", "C31", "pcode",  "['P001','P002']",
  "B3100", "C1", "C2", "C31", "needed", "['P001','P002']",
  "B5500", "C1", "C2", "C31", "pcode",  "['P003','P004','P005']",
  "B5500", "C1", "C2", "C31", "needed", "['P003']"
)

Solution

  • It looks like you have only one row with key = TRUE for each mcode.

    Something like this should get what you need:

    expected_output <- df %>% 
      summarise(Cat1 = first(Cat1[key]), 
                Cat2 = first(Cat2[key]),
                Cat3 = first(Cat3[key]),
                pcode = list(sort(unique(pcode[!is.na(pcode)]))),
                needed = list(sort(unique(needed[!is.na(needed)]))),
                .by = mcode) %>% 
      pivot_longer(cols = c(pcode, needed), names_to = "type", values_to = "extended_info") %>% 
      arrange(mcode)