I would like to paste column names together with their values into a "reason" column. Whether this happens must depend on some condition (an if statement), and the condition can involve a single variable or multiple variables.
Below is a small example showing what the data looks like. I would like to speed up this process while getting the same results as fun2, fun3, and fun4.
To keep this as simple as possible, there is only one kind of rule here: set a value to missing if columns a, b, c, or d have values greater than zero. I kept the rule names, though, because in practice the conditions can differ, e.g. "Rule1" fires when a value is greater than zero and "Rule2" fires when a value is non-missing.
library("data.table")
library("tidytable")
library("glue")
library("stringi")
library("benchr")
dat <- data.table(id = 1:10,
t1 = rnorm(10),
t2 = rnorm(10),
a = c(0, NA, 0, 1, 0, NA, 1, 1, 0, 1),
b = c(0, NA, NA, 0, 1, 0, 1, NA, 1, 1),
c = c(0, NA, 0, NA, 0, 1, NA, 1, 1, 1),
d = c(0, NA, 1, 1, 0, 1, 0, 1, NA, 1),
re = "")
This is how the data looks:
id t1 t2 a b c d re
1 0.6883367 -0.3454049 0 0 0 0 ''
2 -1.0653127 -1.3035077 NA NA NA NA ''
3 0.5210550 0.8489376 0 NA 0 1 ''
4 0.3697369 -0.1135827 1 0 NA 1 ''
5 1.3195759 -1.5431305 0 1 0 0 ''
6 -0.2106836 -0.3421900 NA 0 1 1 ''
7 -0.2258871 -2.1644697 1 1 NA 0 ''
8 -0.7132686 1.7673775 1 NA 1 1 ''
9 0.9467068 1.8188665 0 1 1 NA ''
10 -0.3900479 1.7306935 1 1 1 1 ''
Below is the desired output. The idea is to keep a column with a description of the reason a value has been set to missing. In this example, only the first two individuals keep records for both t1 and t2. Individuals 1, 2, and 3 keep records for t1, while individuals 1, 2, 5, 7, and 9 keep records for t2.
id t1 t2 a b c d re
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1);"
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1);"
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1);"
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1);"
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1);"
First attempt (fun1). It does not produce the expected results because the aux string built with paste() contains whitespace, so the aux == "" comparison inside mutate never matches. All the other functions (fun2, fun3, and fun4) produce the right results.
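To see the problem in isolation (independent of fun1 itself): paste() joins the per-column pieces with a space, so a row where no condition fires produces a string of spaces rather than an empty string.
paste("", "", "")        # "  "
paste("", "", "") == ""  # FALSE, which is why fun1 produces strings like "Rule1:t1( )"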
fun1 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux, "); ", re)),
        !!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 NA 1.43 0 0 0 0 "Rule1:t1( ); "
2 NA 0.733 NA NA NA NA "Rule1:t1( ); "
3 NA NA 0 NA 0 1 "Rule2:t2(d=1); Rule1:t1( ); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1 ); "
5 NA 1.78 0 1 0 0 "Rule1:t1( b=1 ); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1( c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1 ); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1( b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
Function 2 (fun2) uses trimws() to strip the leading and trailing whitespace from aux.
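For example, trimws() collapses the all-empty case back to "", so the aux == "" test behaves as intended:
trimws(paste("", "", ""))     # ""
trimws(paste("", "b=1", ""))  # "b=1"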
fun2 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := trimws(do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux, "); ", re)),
        !!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1); "
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
Function 3 (fun3) uses gsub() with a regular expression to remove all whitespace from aux.
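Note that this also removes the spaces between the name=value pairs, which is visible in the output below:
gsub("\\s+", "", paste("a=1", "", "c=1"))  # "a=1c=1"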
fun3 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := gsub("\\s+", "", do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux, "); ", re)),
        !!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1); "
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1b=1c=1); "
Function 4 (fun4) uses stri_detect() with a regular expression inside mutate to test whether aux contains any letters.
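For example, a row with no hits contains only spaces and therefore no letters:
stri_detect(paste("", "", ""), regex = "[[:alpha:]]")       # FALSE
stri_detect(paste("a=1", "", "c=1"), regex = "[[:alpha:]]") # TRUE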
fun4 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(!stri_detect(aux, regex = "[[:alpha:]]") ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux, "); ", re)),
        !!lhs := !!rlang::parse_expr(glue("case_when.(!stri_detect(aux, regex = '[[:alpha:]]') ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1 ); "
5 NA 1.78 0 1 0 0 "Rule1:t1( b=1 ); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1( c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1 ); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1( b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
Benchmark with more data
n <- 200000
dat <- data.table(id = 1:n,
t1 = rnorm(n),
t2 = rnorm(n),
a = sample(c(0, NA, 1), n, replace = TRUE),
b = sample(c(0, NA, 1), n, replace = TRUE),
c = sample(c(0, NA, 1), n, replace = TRUE),
d = sample(c(0, NA, 1), n, replace = TRUE),
re = "")
benchmark(fun1(dat),
fun2(dat),
fun3(dat),
fun4(dat))
Benchmark summary:
Time units : milliseconds
expr n.eval min lw.qu median mean up.qu max total relative
fun1(dat) 100 642 653 660 668 666 774 66800 1.00
fun2(dat) 100 742 756 763 773 768 874 77300 1.16
fun3(dat) 100 765 779 785 794 791 903 79400 1.19
fun4(dat) 100 743 756 763 777 770 1010 77700 1.16
Does anyone have an idea on how to speed up this process?
Thank you.
Up front, I confess that I have not been able to beat the benchmarking (thanks for the challenge). There might be ways to wring a little more speed out of it, but let me recommend a method that does the same thing (faster with smaller data, about the same with large data) while supporting per-rule functions. It isn't what you asked for directly, but you hinted at different functions for each rule.
(I've updated the code, thanks to @Cole for finding a remnant of my early exploration.)
RULES <- list(
  Rule1 = list(
    rule = "Rule1",
    lhs = "t1",
    rhs = c("a", "b", "c"),
    fun = function(z) !is.na(z) & z > 0
  ),
  Rule2 = list(
    rule = "Rule2",
    lhs = "t2",
    rhs = "d",
    fun = is.na
  )
)
fun9 <- function(dat, RULES = list()) {
  nr <- nrow(dat)
  # RE <- lapply(seq_along(RULES), function(ign) rep("", nr))
  RE <- asplit(matrix("", nrow = length(RULES), ncol = nr), 1)
  for (r in seq_along(RULES)) {
    fun <- RULES[[r]]$fun
    lhs <- RULES[[r]]$lhs
    for (rhs in RULES[[r]]$rhs) {
      lgl <- do.call(fun, list(dat[[rhs]]))
      set(dat, which(lgl), lhs, NA)
      RE[[r]][lgl] <- sprintf("%s %s=1", RE[[r]][lgl], rhs)
    }
    ind <- nzchar(RE[[r]])
    RE[[r]][ind] <- sprintf("%s:%s(%s)", RULES[[r]]$rule, lhs, RE[[r]][ind])
  }
  set(dat, j = "re", value = do.call(paste, c(RE, sep = ";")))
}
The premise of RULES and the use of fun9 should be self-evident.
Benchmarking with small data seems promising:
set.seed(2021)
dat <- data.table(id = 1:10,
t1 = rnorm(10),
t2 = rnorm(10),
a = c(0, NA, 0, 1, 0, NA, 1, 1, 0, 1),
b = c(0, NA, NA, 0, 1, 0, 1, NA, 1, 1),
c = c(0, NA, 0, NA, 0, 1, NA, 1, 1, 1),
d = c(0, NA, 1, 1, 0, 1, 0, 1, NA, 1),
re = "")
fun9(dat, RULES)[]
# id t1 t2 a b c d re
# <int> <num> <num> <num> <num> <num> <num> <char>
# 1: 1 -0.1224600 -1.0822049 0 0 0 0 ;
# 2: 2 0.5524566 NA NA NA NA NA ;Rule2:t2( d=1)
# 3: 3 0.3486495 0.1819954 0 NA 0 1 ;
# 4: 4 NA 1.5085418 1 0 NA 1 Rule1:t1( a=1);
# 5: 5 NA 1.6044701 0 1 0 0 Rule1:t1( b=1);
# 6: 6 NA -1.8414756 NA 0 1 1 Rule1:t1( c=1);
# 7: 7 NA 1.6233102 1 1 NA 0 Rule1:t1( a=1 b=1);
# 8: 8 NA 0.1313890 1 NA 1 1 Rule1:t1( a=1 c=1);
# 9: 9 NA NA 0 1 1 NA Rule1:t1( b=1 c=1);Rule2:t2( d=1)
# 10: 10 NA 1.5133183 1 1 1 1 Rule1:t1( a=1 b=1 c=1);
bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 fun4(dat) 9.52ms 11.1ms 88.5 316KB 2.06 43 1 486ms <NULL> <Rprofmem[,3] [84 x 3]> <bch:tm [44]> <tibble [44 x 3]>
# 2 fun9(dat, RULES) 97.5us 113.5us 7760. 416B 6.24 3731 3 481ms <NULL> <Rprofmem[,3] [2 x 3]> <bch:tm [3,734]> <tibble [3,734 x 3]>
Just from `itr/sec`, this fun9 looks to be a bit faster.
With larger data:
set.seed(2021)
n <- 200000
dat <- data.table(id = 1:n,
t1 = rnorm(n),
t2 = rnorm(n),
a = sample(c(0, NA, 1), n, replace = TRUE),
b = sample(c(0, NA, 1), n, replace = TRUE),
c = sample(c(0, NA, 1), n, replace = TRUE),
d = sample(c(0, NA, 1), n, replace = TRUE),
re = "")
bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 fun4(dat) 1.24s 1.24s 0.806 62.9MB 1.61 1 2 1.24s <NULL> <Rprofmem[,3] [150 x 3]> <bch:tm [1]> <tibble [1 x 3]>
# 2 fun9(dat, RULES) 296.11ms 315.4ms 3.17 53.8MB 4.76 2 3 630.8ms <NULL> <Rprofmem[,3] [70 x 3]> <bch:tm [2]> <tibble [2 x 3]>
While this solution does not use tidytable or its flow, it is faster. The cleanup of re is another step, likely to bring this speed back down to mortal levels :-).
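For what it's worth, one way that cleanup could look (a sketch only; the exact target format of re is an assumption on my part, not part of fun9):
# drop the empty ";" separators left by paste(..., sep = ";") and the space after "("
dat[, re := gsub("\\( +", "(", gsub("^;+|;+$", "", re))]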
Side note: I was trying to use lapply, mget, and other tricks to do things within the data.table data environment, but in the end, using data.table::set (https://stackoverflow.com/a/16846530/3358272) and simple vectors appeared to be the fastest.
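For reference, a minimal illustration of that set() pattern on hypothetical toy data:
library(data.table)
dt <- data.table(t1 = rnorm(5), a = c(1, 0, NA, 1, 0))
# assign by reference, skipping the per-call overhead of [.data.table dispatch
set(dt, i = which(!is.na(dt$a) & dt$a > 0), j = "t1", value = NA_real_)
# the same result via :=, with more overhead when called many times in a loop:
# dt[!is.na(a) & a > 0, t1 := NA_real_]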