r dplyr indexing replace unique

Identify a specific string in a column and count number of unique letters by group in R


I have a dataframe df1

# Example data: 37 trials from two participants (18 rows for p1, 19 for p2).
# The data frame is built in a single statement, and condition / pID carry no
# stray whitespace, so "false" and the participant IDs compare equal as
# intended in the expected output.
df1 <- data.frame(
  material = c(
    # p1
    "FBZOIKS", "FBZOIKS", "FBZOIKS", "FBZOIKS",
    "VNTYALQ", "VNTYALQ", "VNTYALQ",
    "HMRCJXU", "HMRCJXU", "HMRCJXU", "HMRCJXU", "HMRCJXU",
    "CURHJXM",
    "UXJMRCH", "UXJMRCH",
    "XMRCUJH", "XMRCUJH", "XMRCUJH",
    # p2
    "FBZOIKS", "FBZOIKS", "FBZOIKS", "FBZOIKS",
    "VNTYALQ", "VNTYALQ", "VNTYALQ", "VNTYALQ",
    "HMRCJXU", "HMRCJXU", "HMRCJXU", "HMRCJXU",
    "CURHJXM", "CURHJXM",
    "UXJMRCH", "UXJMRCH",
    "XMRCUJH", "XMRCUJH", "XMRCUJH"
  ),
  # "false" marks the first row of a block; every other row is empty.
  condition = c(
    "false", rep("", 11), "false", rep("", 5), # p1: rows 1-18
    "false", rep("", 11), "false", rep("", 6)  # p2: rows 19-37
  ),
  pID = rep(c("p1", "p2"), times = c(18, 19)),
  stringsAsFactors = FALSE
)

and I need to create two columns, block and Nletters_block, both computed within each pID. For block, I need to find the first 'false' in the column condition and assign a value of 1 to every row from there until the next 'false' for that pID. From that next 'false' onwards I need to assign a value of 2, from the one after that a value of 3, and so on.

For Nletters_block, I need to calculate, for each participant and block, the number of unique letters that appear in the column material.

If I could use the dplyr library, it would be even better.

Below is what I would like to obtain:

material  condition pID  block  Nletters_block
FBZOIKS   false     p1   1      21
FBZOIKS             p1   1      21
FBZOIKS             p1   1      21
FBZOIKS             p1   1      21
VNTYALQ             p1   1      21
VNTYALQ             p1   1      21
VNTYALQ             p1   1      21
HMRCJXU             p1   1      21
HMRCJXU             p1   1      21
HMRCJXU             p1   1      21
HMRCJXU             p1   1      21
HMRCJXU             p1   1      21
CURHJXM   false     p1   2      7
UXJMRCH             p1   2      7
UXJMRCH             p1   2      7
XMRCUJH             p1   2      7
XMRCUJH             p1   2      7
XMRCUJH             p1   2      7
FBZOIKS   false     p2   1      21
FBZOIKS             p2   1      21
FBZOIKS             p2   1      21
FBZOIKS             p2   1      21
VNTYALQ             p2   1      21
VNTYALQ             p2   1      21
VNTYALQ             p2   1      21
VNTYALQ             p2   1      21
HMRCJXU             p2   1      21
HMRCJXU             p2   1      21
HMRCJXU             p2   1      21
HMRCJXU             p2   1      21
CURHJXM   false     p2   2      7
CURHJXM             p2   2      7
UXJMRCH             p2   2      7
UXJMRCH             p2   2      7
XMRCUJH             p2   2      7
XMRCUJH             p2   2      7
XMRCUJH             p2   2      7

Solution

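  • First, group by pID to number the blocks; then group by pID and block to count the letters. The unique letters are counted by pasting all material strings of a group together and then splitting the result into single characters.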

    library(dplyr)
    
    # Number the blocks within each participant, then count the distinct
    # letters used in each participant/block combination.
    df1 %>%
      # Strip stray leading/trailing whitespace so values such as " false" or
      # " p1" are treated as "false" / "p1"; this is a no-op on clean data.
      mutate(across(c(condition, pID), trimws)) %>%
      # Each "false" opens a new block, so the running count of "false" rows
      # within a participant is the block number.
      mutate(block = cumsum(condition == "false"), .by = pID) %>%
      # Concatenate every material string of the group, split the result into
      # single characters and count the distinct ones.
      mutate(
        Nletters_block = length(unique(unlist(strsplit(
          paste(material, collapse = ""), ""
        )))),
        .by = c(pID, block)
      )
       material condition pID block Nletters_block
    1   FBZOIKS     false  p1     1             21
    2   FBZOIKS            p1     1             21
    3   FBZOIKS            p1     1             21
    4   FBZOIKS            p1     1             21
    5   VNTYALQ            p1     1             21
    6   VNTYALQ            p1     1             21
    7   VNTYALQ            p1     1             21
    8   HMRCJXU            p1     1             21
    9   HMRCJXU            p1     1             21
    10  HMRCJXU            p1     1             21
    11  HMRCJXU            p1     1             21
    12  HMRCJXU            p1     1             21
    13  CURHJXM     false  p1     2              7
    14  UXJMRCH            p1     2              7
    15  UXJMRCH            p1     2              7
    16  XMRCUJH            p1     2              7
    17  XMRCUJH            p1     2              7
    18  XMRCUJH            p1     2              7
    19  FBZOIKS     false  p2     1             21
    20  FBZOIKS            p2     1             21
    21  FBZOIKS            p2     1             21
    22  FBZOIKS            p2     1             21
    23  VNTYALQ            p2     1             21
    24  VNTYALQ            p2     1             21
    25  VNTYALQ            p2     1             21
    26  VNTYALQ            p2     1             21
    27  HMRCJXU            p2     1             21
    28  HMRCJXU            p2     1             21
    29  HMRCJXU            p2     1             21
    30  HMRCJXU            p2     1             21
    31  CURHJXM     false  p2     2              7
    32  CURHJXM            p2     2              7
    33  UXJMRCH            p2     2              7
    34  UXJMRCH            p2     2              7
    35  XMRCUJH            p2     2              7
    36  XMRCUJH            p2     2              7
    37  XMRCUJH            p2     2              7
    
    Data
    # Reproducible copy of the example data: 37 rows, three character columns.
    # Rows 1-18 belong to p1 and rows 19-37 to p2.
    df1 <- data.frame(
      # Run-length description of the material column, in row order.
      material = rep(
        c("FBZOIKS", "VNTYALQ", "HMRCJXU", "CURHJXM", "UXJMRCH", "XMRCUJH",
          "FBZOIKS", "VNTYALQ", "HMRCJXU", "CURHJXM", "UXJMRCH", "XMRCUJH"),
        times = c(4L, 3L, 5L, 1L, 2L, 3L,
                  4L, 4L, 4L, 2L, 2L, 3L)
      ),
      # Empty everywhere except the four rows that open a block.
      condition = replace(character(37L), c(1L, 13L, 19L, 31L), "false"),
      pID = rep(c("p1", "p2"), times = c(18L, 19L)),
      stringsAsFactors = FALSE
    )