I have a code segment that I am trying to optimize to run a bit faster.
df1 <- df %>%
rowwise() %>%
mutate(fisher = fisher.test(matrix(c(counts, nt1_not_t2,
nt2_not_t1, not_t1_or_t2), nrow = 2, ncol = 2))$p.value) %>%
filter(oddsRatio > 1 & fisher < pval) %>%
mutate(test_direction = binom.test(x = forward, n = counts, p = 0.5)$p.value) %>%
filter(test_direction < 0.05)
I've been trying to replace the first mutate/filter pair with
df1 <- df %>%
{.$fisher = fisher.test(matrix(c(.$counts, .$nt1_not_t2,
.$nt2_not_t1, .$not_t1_or_t2), nrow = 2, ncol = 2))$p.value; .} %>%
vctrs::vec_slice(., .$oddsRatio > 1 & .$fisher < pval)
However the matrix, instead of having 4 values, is trying to use the entire dataset.
Warning message:
In matrix(c(.$counts, .$nt1_not_t2, .$nt2_not_t1, .$not_t1_or_t2), :
data length differs from size of matrix: [12184 != 2 x 2]
Not sure if there is some syntax I can add to avoid that, or another more performant way to achieve this.
Any advice would be great
EDIT:: using profvis()
[![profiling][1]][1]
some sample data
df <- structure(list(counts = c(114L, 55L, 57L, 95L, 514L, 65L, 694L,
28L, 148L, 122L, 240L, 38L, 260L, 65L, 40L, 12L, 32L, 134L, 42L,
16L, 37L, 33L, 63L, 16L, 20L, 13L, 12L, 17L, 15L, 26L, 548L,
31L, 467L, 202L, 1696L, 1422L, 219L, 362L, 417L, 1449L, 2142L,
241L, 128L, 1812L, 3281L, 677L, 1006L, 137L, 67L, 161L), forward = c(61L,
37L, 32L, 47L, 233L, 43L, 568L, 15L, 75L, 76L, 149L, 27L, 177L,
34L, 33L, 7L, 22L, 81L, 24L, 8L, 19L, 22L, 55L, 8L, 12L, 8L,
7L, 10L, 12L, 13L, 407L, 17L, 260L, 119L, 861L, 906L, 144L, 199L,
195L, 645L, 1223L, 166L, 73L, 844L, 2727L, 341L, 529L, 86L, 36L,
87L), oddsRatio = c(2.81719639416155, 2.56249627110554, 3.0012284711951,
3.2379086619481, 3.28262910798122, 1.78506192857701, 1.23683314379245,
1.47200829293576, 1.60268857356236, 2.54327837666455, 2.65317932754055,
2.7443152244971, 2.41170031230883, 1.39183344640434, 1.67403290633562,
2.45502917152859, 1.52227974689146, 1.67590893004601, 1.5285951005419,
1.88542317834637, 1.64599293496765, 1.23362495081645, 0.884202671972712,
1.15874072081511, 1.23428571428571, 0.918351141954909, 0.942141312184571,
1.63698770491803, 1.92011125705014, 1.67230461700034, 6.63377209451277,
1.98838889786539, 3.21071805754822, 2.3432554142601, 3.03013725938027,
11.6868669158062, 3.25457032130808, 3.5341881506799, 2.37982532860213,
11.383770310192, 2.98290705092543, 1.43986166989779, 1.32458721885379,
1.70316762338472, 1.20465352503506, 1.32048669611991, 1.38391148677588,
1.70567801561587, 1.11338513259357, 1.18942080378251), not_t1_or_t2 = c(16736L,
17180L, 17230L, 16989L, 12236L, 16869L, 4192L, 17243L, 15631L,
16569L, 15416L, 17344L, 14981L, 16665L, 17139L, 17533L, 17201L,
15903L, 17066L, 16400L, 12543L, 12161L, 4345L, 15346L, 14742L,
15391L, 15681L, 15977L, 16568L, 14576L, 14024L, 14291L, 13801L,
14039L, 11687L, 13734L, 14104L, 13968L, 13703L, 13701L, 10576L,
13757L, 14009L, 9868L, 3491L, 12503L, 11656L, 14064L, 14137L,
13875L), nt1_not_t2 = c(755L, 814L, 812L, 774L, 355L, 804L, 175L,
841L, 721L, 747L, 629L, 831L, 609L, 804L, 829L, 857L, 837L, 735L,
827L, 69L, 48L, 52L, 22L, 69L, 65L, 72L, 73L, 68L, 70L, 59L,
3609L, 4126L, 3690L, 3955L, 2461L, 2735L, 3938L, 3795L, 3740L,
2708L, 2015L, 3916L, 4029L, 2345L, 876L, 3480L, 3151L, 4020L,
4090L, 3996L), nt2_not_t1 = c(897L, 453L, 403L, 644L, 5397L,
764L, 13441L, 390L, 2002L, 1064L, 2217L, 289L, 2652L, 968L, 494L,
100L, 432L, 1730L, 567L, 2017L, 5874L, 6256L, 14072L, 3071L,
3675L, 3026L, 2736L, 2440L, 1849L, 3841L, 321L, 54L, 544L, 306L,
2658L, 611L, 241L, 377L, 642L, 644L, 3769L, 588L, 336L, 4477L,
10854L, 1842L, 2689L, 281L, 208L, 470L)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame")) ```
[1]: https://i.sstatic.net/a1ebG.png
Based on Axeman's comment here a small benchmark using different approaches:
library(dplyr)
library(purrr)
my_fisher <- function(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2) {
fisher.test(
matrix(
c(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2),
nrow = 2,
ncol = 2
)
)$p.value
}
We apply this function using purrr
s pmap
-function and using furrr
s future_pmap
, a parallel version of pmap
:
# purrr
df %>%
mutate(fisher = pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))
# furrr, choose workers based on actual hardware
library(furrr)
plan("future::multisession", workers = 4)
df %>%
mutate(fisher = future_pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))
Benchmarking:
library(microbenchmark)
microbenchmark(
orig = df %>%
rowwise() %>%
mutate(fisher = fisher.test(matrix(c(counts, nt1_not_t2,
nt2_not_t1, not_t1_or_t2), nrow = 2, ncol = 2))$p.value),
purrr = df %>%
mutate(fisher = pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher)),
furrr = df %>%
mutate(fisher = future_pmap_dbl(list(counts, nt1_not_t2, nt2_not_t1, not_t1_or_t2), my_fisher))
)
returns
Unit: milliseconds
expr min lq mean median uq max neval
orig 108.7756 110.3752 116.08041 113.84570 118.3328 224.2154 100
purrr 105.9916 108.0498 114.80299 113.86020 116.7421 224.0461 100
furrr 86.1203 89.5472 93.77818 92.63145 96.1540 122.7134 100
So using furrr
seems to give a small advantage regarding time / speed.