rdplyr

create a dummy variable from a messy data


I have a tibble like this -

# A tibble: 11 × 1
   messycol                   
   <chr>                      
 1 ""                         
 2 ""                         
 3 ""                         
 4 ""                         
 5 ""                         
 6 ""                         
 7 "|15|16|32|33|36|39|40|41|"
 8 "|15|16|32|39|"            
 9 "|10|29|32|38|39|40|"      
10 "|32|39|"                  
11 ""                         

The dput() output is here -

structure(list(messycol = structure(c("", "", "", "", "", "", 
"|15|16|32|33|36|39|40|41|", "|15|16|32|39|", "|10|29|32|38|39|40|", 
"|32|39|", ""), label = "Noteffectiveaccountingrule reason keys", format.sas = "$")), row.names = c(NA, 
-11L), class = c("tbl_df", "tbl", "data.frame"))

I want to create a dummy variable from the messycol like this - if messycol includes either 15 or 16, then 1 otherwise 0. Please note that the blank value in the messycol should be as it is - " "

So, my expected output is like this -

# A tibble: 11 × 2
   messycol                         DUMMY                 
   <chr>                      
 1 ""                               ""                      
 2 ""                               ""                          
 3 ""                               ""                           
 4 ""                               ""                          
 5 ""                               ""                         
 6 ""                               ""                         
 7 "|15|16|32|33|36|39|40|41|"       1
 8 "|15|16|32|39|"                   1     
 9 "|10|29|32|38|39|40|"             0    
10 "|32|39|"                         0              
11 ""                               ""  

Solution

  • One option would be to use string::str_split and a member of the purrr::map family of functions:

    library(tidyverse)
    
    dat |>
      mutate(
        dummy = map_chr(
          str_split(messycol, pattern = "\\|"),
          ~ as.character(+any(.x %in% c(15, 16)))
        ),
        dummy = if_else(messycol == "", messycol, dummy)
      )
    #> # A tibble: 11 × 2
    #>    messycol                    dummy
    #>    <chr>                       <chr>
    #>  1 ""                          ""   
    #>  2 ""                          ""   
    #>  3 ""                          ""   
    #>  4 ""                          ""   
    #>  5 ""                          ""   
    #>  6 ""                          ""   
    #>  7 "|15|16|32|33|36|39|40|41|" "1"  
    #>  8 "|15|16|32|39|"             "1"  
    #>  9 "|10|29|32|38|39|40|"       "0"  
    #> 10 "|32|39|"                   "0"  
    #> 11 ""                          ""
    

    Or using base R you can achieve the same result e.g. like so:

    dat |>
      transform(dummy = vapply(
        strsplit(messycol, split = "\\|"),
        FUN = \(x) as.character(+any(x %in% c(15, 16))),
        FUN.VALUE = character(1)
      )) |>
      transform(dummy = ifelse(messycol == "", messycol, dummy))
    #>                     messycol dummy
    #> 1                                 
    #> 2                                 
    #> 3                                 
    #> 4                                 
    #> 5                                 
    #> 6                                 
    #> 7  |15|16|32|33|36|39|40|41|     1
    #> 8              |15|16|32|39|     1
    #> 9        |10|29|32|38|39|40|     0
    #> 10                   |32|39|     0
    #> 11
    

    Finally, another more verbose option using the tidyverse would be to use separate_longer_delim :

    dat |>
      mutate(row = row_number()) |>
      separate_longer_delim(messycol, delim = "|") |>
      mutate(dummy = as.character(+any(messycol %in% c(15, 16))), .by = row) |>
      summarise(messycol = paste0(messycol, collapse = "|"), .by = c(row, dummy)) |>
      mutate(dummy = if_else(messycol == "", messycol, dummy))
    #> # A tibble: 11 × 3
    #>      row dummy messycol                   
    #>    <int> <chr> <chr>                      
    #>  1     1 ""    ""                         
    #>  2     2 ""    ""                         
    #>  3     3 ""    ""                         
    #>  4     4 ""    ""                         
    #>  5     5 ""    ""                         
    #>  6     6 ""    ""                         
    #>  7     7 "1"   "|15|16|32|33|36|39|40|41|"
    #>  8     8 "1"   "|15|16|32|39|"            
    #>  9     9 "0"   "|10|29|32|38|39|40|"      
    #> 10    10 "0"   "|32|39|"                  
    #> 11    11 ""    ""