I have a tibble like this -
# A tibble: 11 × 1
messycol
<chr>
1 ""
2 ""
3 ""
4 ""
5 ""
6 ""
7 "|15|16|32|33|36|39|40|41|"
8 "|15|16|32|39|"
9 "|10|29|32|38|39|40|"
10 "|32|39|"
11 ""
The dput()
output is here -
structure(list(messycol = structure(c("", "", "", "", "", "",
"|15|16|32|33|36|39|40|41|", "|15|16|32|39|", "|10|29|32|38|39|40|",
"|32|39|", ""), label = "Noteffectiveaccountingrule reason keys", format.sas = "$")), row.names = c(NA,
-11L), class = c("tbl_df", "tbl", "data.frame"))
I want to create a dummy variable from the messycol
like this - if messycol
includes either 15 or 16, then 1 otherwise 0. Please note that the blank value in the messycol
should be as it is - " "
So, my expected output is like this -
# A tibble: 11 × 2
messycol DUMMY
<chr>
1 "" ""
2 "" ""
3 "" ""
4 "" ""
5 "" ""
6 "" ""
7 "|15|16|32|33|36|39|40|41|" 1
8 "|15|16|32|39|" 1
9 "|10|29|32|38|39|40|" 0
10 "|32|39|" 0
11 "" ""
One option would be to use string::str_split
and a member of the purrr::map
family of functions:
library(tidyverse)
dat |>
mutate(
dummy = map_chr(
str_split(messycol, pattern = "\\|"),
~ as.character(+any(.x %in% c(15, 16)))
),
dummy = if_else(messycol == "", messycol, dummy)
)
#> # A tibble: 11 × 2
#> messycol dummy
#> <chr> <chr>
#> 1 "" ""
#> 2 "" ""
#> 3 "" ""
#> 4 "" ""
#> 5 "" ""
#> 6 "" ""
#> 7 "|15|16|32|33|36|39|40|41|" "1"
#> 8 "|15|16|32|39|" "1"
#> 9 "|10|29|32|38|39|40|" "0"
#> 10 "|32|39|" "0"
#> 11 "" ""
Or using base R you can achieve the same result e.g. like so:
dat |>
transform(dummy = vapply(
strsplit(messycol, split = "\\|"),
FUN = \(x) as.character(+any(x %in% c(15, 16))),
FUN.VALUE = character(1)
)) |>
transform(dummy = ifelse(messycol == "", messycol, dummy))
#> messycol dummy
#> 1
#> 2
#> 3
#> 4
#> 5
#> 6
#> 7 |15|16|32|33|36|39|40|41| 1
#> 8 |15|16|32|39| 1
#> 9 |10|29|32|38|39|40| 0
#> 10 |32|39| 0
#> 11
Finally, another more verbose option using the tidyverse
would be to use separate_longer_delim
:
dat |>
mutate(row = row_number()) |>
separate_longer_delim(messycol, delim = "|") |>
mutate(dummy = as.character(+any(messycol %in% c(15, 16))), .by = row) |>
summarise(messycol = paste0(messycol, collapse = "|"), .by = c(row, dummy)) |>
mutate(dummy = if_else(messycol == "", messycol, dummy))
#> # A tibble: 11 × 3
#> row dummy messycol
#> <int> <chr> <chr>
#> 1 1 "" ""
#> 2 2 "" ""
#> 3 3 "" ""
#> 4 4 "" ""
#> 5 5 "" ""
#> 6 6 "" ""
#> 7 7 "1" "|15|16|32|33|36|39|40|41|"
#> 8 8 "1" "|15|16|32|39|"
#> 9 9 "0" "|10|29|32|38|39|40|"
#> 10 10 "0" "|32|39|"
#> 11 11 "" ""