I have the following data frame df
:
Index col_a col_b col_c col_d
1 4 c v g j
2 1 x <NA> z s
3 1, 3 k j n y
4 2 q t o i
5 2, 3 y m w r
6 2 d u x a
7 3, 4 n y k g
8 4 h d <NA> u
9 1 s x j m
df <- structure(list(Index = c("4", "1", "1, 3", "2", "2, 3", "2",
"3, 4", "4", "1"), col_a = c("c", "x", "k", "q", "y", "d", "n",
"h", "s"), col_b = c("v", NA, "j", "t", "m", "u", "y", "d", "x"
), col_c = c("g", "z", "n", "o", "w", "x", "k", NA, "j"), col_d = c("j",
"s", "y", "i", "r", "a", "g", "u", "m")), class = "data.frame", row.names = c(NA,
-9L))
I wish to recode/update the values in the df
to "Yes"/"No" based on the Index
column, where 1
= col_a
, 2
= col_b
and so on.
This is my desired output:
Index col_a col_b col_c col_d
1 4 No No No Yes
2 1 Yes No No No
3 1, 3 Yes No Yes No
4 2 No Yes No No
5 2, 3 No Yes Yes No
6 2 No Yes No No
7 3, 4 No No Yes Yes
8 4 No No No Yes
9 1 Yes No No No
Currently I'm using a lapply
approach to solve the problem, but I wonder if there's any simpler solution in just one or two lines of code.
do.call(rbind, lapply(1:nrow(df), \(x) {
vec <- as.integer(unlist(strsplit(df[x, 1], ","))) + 1
df[x, vec] <- "Yes"
df[x, -c(1, vec)] <- "No"
df[x, ]
}))
A base R
option is to use row/column
indexing should be faster for assignment than with rowwise looping
m1 <- cbind(rep(seq_len(nrow(df)), nchar(gsub("\\D+", "", df$Index))),
scan(text = df$Index, what = numeric(), sep=","))
df[-1][m1] <- "Yes"
df[-1][df[-1] != "Yes"|is.na(df[-1])] <- "No"
-output
> df
Index col_a col_b col_c col_d
1 4 No No No Yes
2 1 Yes No No No
3 1, 3 Yes No Yes No
4 2 No Yes No No
5 2, 3 No Yes Yes No
6 2 No Yes No No
7 3, 4 No No Yes Yes
8 4 No No No Yes
9 1 Yes No No No
Or another option is
i1 <- Reduce(`|`, lapply(read.csv(text = df$Index,
header = FALSE),\(x) col(df[-1]) == x))
df[-1] <- c("No", "Yes")[1+(i1 & !is.na(i1))]
-output
> df
Index col_a col_b col_c col_d
1 4 No No No Yes
2 1 Yes No No No
3 1, 3 Yes No Yes No
4 2 No Yes No No
5 2, 3 No Yes Yes No
6 2 No Yes No No
7 3, 4 No No Yes Yes
8 4 No No No Yes
9 1 Yes No No No
Or with tidyverse
library(dplyr)
library(stringr)
df %>%
mutate(across(starts_with("col_"),
~ case_when(str_detect(Index, as.character( match(cur_column(),
names(df))-1)) ~ "Yes", TRUE ~ "No")))
-output
Index col_a col_b col_c col_d
1 4 No No No Yes
2 1 Yes No No No
3 1, 3 Yes No Yes No
4 2 No Yes No No
5 2, 3 No Yes Yes No
6 2 No Yes No No
7 3, 4 No No Yes Yes
8 4 No No No Yes
9 1 Yes No No No