Here is some toy data:
library(data.table)

rowlength <- 100000
set.seed(10)
Nametypes <- c("A1", "B2", "C3", "D4", "E5", "F6", "G7")
DT <- data.table(
  DTName1   = sample(Nametypes, rowlength, replace = TRUE),
  DTName2   = sample(Nametypes, rowlength, replace = TRUE),
  Criteria1 = round(runif(rowlength, min = 0, max = 1)),
  Numeric1  = rnorm(rowlength, mean = 1, sd = 100),
  bins      = round(runif(rowlength, min = 1, max = 1e9)))

# interval boundaries: binL/binU are the lower/upper edges of each bin
values <- c(0, 2, 5, 7, 9, 10, 12, 15, 18, 20, 25, 30, 35, 50, 100, 150, 200, 500, Inf) * 1e5
binL <- values[-length(values)]
binU <- values[-1]
I want to perform exactly the two computations below: one calculates a sum and the other counts rows, both subject to multiple criteria.
Input1 <- "DTName1"
Input2 <- "Numeric1"

sapply(Nametypes, \(j) cbind(sapply(1:19, \(i)
  sum(DT[Criteria1 == 1 & bins >= binL[i] & bins < binU[i] & get(Input1) == j, .(get(Input2))]))))
sapply(Nametypes, \(j) cbind(sapply(1:19, \(i)
  nrow(DT[Criteria1 == 1 & bins >= binL[i] & bins < binU[i] & get(Input1) == j, .(get(Input2))]))))
However, this code is extremely slow. The toy data has only 100K rows, but the real data may scale up to 5 million.
1. I need the values to display as zero if no criteria are met.
2. Please use only the data.table package.
3. Input1 and Input2 can vary since they come from external sources, which is why I am using the get function. I am unsure if this should be avoided; alternatives are welcome.
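Regarding (3), get() can indeed be avoided. The approaches below index .SD by column name; a minimal sketch of that, plus the env argument (which assumes a reasonably recent data.table version):

DT[Criteria1 == 1, sum(.SD[[Input2]])]                  # index a column of .SD by name
DT[Criteria1 == 1, sum(num), env = list(num = Input2)]  # substitute the column name via env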
1) We can use tapply and table as shown below. The first returns a matrix and the second a "table"-class object, each with the intervals as row names.
DT[Criteria1 == 1, tapply(.SD[[Input2]],
.(cut(bins, values, include.lowest = TRUE), factor(.SD[[Input1]], Nametypes)),
sum,
default = 0)]
DT[Criteria1 == 1, table(
cut(bins, values, include.lowest = TRUE),
factor(.SD[[Input1]], Nametypes)) ]
1a) Alternatively, filter once and add the cuts and factor columns up front:
DT2 <- DT[Criteria1 == 1][, c("cuts", "DTName1", "DTName2") :=
.(cut(bins, values, include.lowest = TRUE),
factor(DTName1, levels = Nametypes),
factor(DTName2, levels = Nametypes))]
DT2[, tapply(.SD[[Input2]], .(cuts, .SD[[Input1]]), sum, default = 0)]
## A1 B2 C3 D4 E5 F6 G7
## [0,2e+05] -55.37939 0.00000 152.331295 0.000000 -32.11065 -57.98264 137.752461
## (2e+05,5e+05] -56.37766 132.72732 30.682194 -103.529421 224.21310 119.04061 -6.906409
## (5e+05,7e+05] 19.43140 201.32557 0.000000 -30.504084 0.00000 -121.27830 48.831916
## (7e+05,9e+05] -71.53381 -155.44489 -1.853501 -70.497173 82.48696 0.00000 187.066078
## (9e+05,1e+06] 156.18999 0.00000 -51.555027 0.000000 -100.15647 -129.21000 0.000000
## ...snip...
DT2[, table(cuts, .SD[[Input1]])]
## cuts A1 B2 C3 D4 E5 F6 G7
## [0,2e+05] 3 0 2 0 1 1 1
## (2e+05,5e+05] 1 1 1 1 3 2 1
## (5e+05,7e+05] 2 2 0 1 0 4 1
## (7e+05,9e+05] 1 4 1 1 2 0 2
## (9e+05,1e+06] 1 0 1 0 1 1 0
## ...snip...
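The matrix returned by tapply can be saved and indexed directly by interval label and name type (a small usage sketch, using a value visible in the output above):

m <- DT2[, tapply(.SD[[Input2]], .(cuts, .SD[[Input1]]), sum, default = 0)]
m["(2e+05,5e+05]", "B2"]  # sum of Numeric1 for B2 rows whose bins fall in that interval
## [1] 132.7273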
2) Alternatively, use dcast. DT2 is from (1a). This returns a data.table with a cuts column containing the intervals.
fo <- reformulate(Input1, "cuts")
dcast(DT2, fo, fun.aggregate = sum, value.var = Input2, fill = 0, drop = FALSE)
dcast(DT2, fo, fun.aggregate = length, drop = FALSE)
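If you need the same matrix shape as in (1), the dcast result can be converted with the cuts column as row names (a sketch using data.table's as.matrix method):

res <- dcast(DT2, fo, fun.aggregate = sum, value.var = Input2, fill = 0, drop = FALSE)
as.matrix(res, rownames = "cuts")  # matrix with the intervals as row names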
3) xtabs also works. DT2 is from (1a). This returns an object of class c("xtabs", "table") with cuts being the intervals as row names.
xtabs(reformulate(c("cuts", Input1), Input2), DT2)
xtabs(reformulate(c("cuts", Input1)), DT2)
Comparing the run time of 25 replications of the question's code against each of the alternatives above, all of the alternatives are faster than the code in the question, and 1a is the fastest on this particular test, being nearly two orders of magnitude faster. Although x1a is fastest here, the cld column shows that the differences in timings among x1, x1a, x2 and x3 are not statistically significant, while the difference between any of them and the code in the question is. Based on the timings below I suspect it now takes under a second to process 5 million rows, so you may wish to choose among these based on other considerations.
library(microbenchmark)
microbenchmark(times = 25,
ques = {
sapply(Nametypes, \(j) cbind(sapply(1:19, \(i)
  sum(DT[Criteria1 == 1 & bins >= binL[i] & bins < binU[i] & get(Input1) == j, .(get(Input2))]))))
sapply(Nametypes, \(j) cbind(sapply(1:19, \(i)
  nrow(DT[Criteria1 == 1 & bins >= binL[i] & bins < binU[i] & get(Input1) == j, .(get(Input2))]))))
},
x1 = {
DT[Criteria1 == 1, tapply(.SD[[Input2]],
.(cut(bins, values, include.lowest = TRUE), factor(.SD[[Input1]], Nametypes)),
sum,
default = 0)]
DT[Criteria1 == 1, table(
cut(bins, values, include.lowest = TRUE),
factor(.SD[[Input1]], Nametypes)) ]
},
x1a = {
DT2 <- DT[Criteria1 == 1][, c("cuts", "DTName1", "DTName2") :=
.(cut(bins, values, include.lowest = TRUE),
factor(DTName1, levels = Nametypes),
factor(DTName2, levels = Nametypes))]
DT2[, tapply(.SD[[Input2]], .(cuts, .SD[[Input1]]), sum, default = 0)]
DT2[, table(cuts, .SD[[Input1]])]
},
x2 = {
DT2 <- DT[Criteria1 == 1][, c("cuts", "DTName1", "DTName2") :=
.(cut(bins, values, include.lowest = TRUE),
factor(DTName1, levels = Nametypes),
factor(DTName2, levels = Nametypes))]
fo <- reformulate(Input1, "cuts")
dcast(DT2, fo, fun.aggregate = sum, value.var = Input2, fill = 0, drop = FALSE)
dcast(DT2, fo, fun.aggregate = length, drop = FALSE)
},
x3 = {
DT2 <- DT[Criteria1 == 1][, c("cuts", "DTName1", "DTName2") :=
.(cut(bins, values, include.lowest = TRUE),
factor(DTName1, levels = Nametypes),
factor(DTName2, levels = Nametypes))]
xtabs(reformulate(c("cuts", Input1), Input2), DT2)
xtabs(reformulate(c("cuts", Input1)), DT2)
}
)
giving
Unit: milliseconds
expr min lq mean median uq max neval cld
ques 2434.5281 2481.5263 2546.53712 2533.2777 2579.1233 2717.1869 25 a
x1 32.4013 35.9994 39.55306 38.2676 42.7410 49.6467 25 b
x1a 23.0932 26.1382 29.66336 28.3790 32.8300 38.1734 25 b
x2 43.4468 45.2096 50.95142 47.6681 54.5723 86.3079 25 b
x3 23.6705 27.3961 30.30350 28.8655 33.5297 40.0132 25 b
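If you want to verify the 5-million-row estimate on your own machine, a sketch like the following regenerates the toy data at full scale and times one run of the fastest variant (1a):

rowlength <- 5e6
DT <- data.table(
  DTName1   = sample(Nametypes, rowlength, replace = TRUE),
  DTName2   = sample(Nametypes, rowlength, replace = TRUE),
  Criteria1 = round(runif(rowlength, min = 0, max = 1)),
  Numeric1  = rnorm(rowlength, mean = 1, sd = 100),
  bins      = round(runif(rowlength, min = 1, max = 1e9)))
system.time({
  DT2 <- DT[Criteria1 == 1][, c("cuts", "DTName1", "DTName2") :=
    .(cut(bins, values, include.lowest = TRUE),
      factor(DTName1, levels = Nametypes),
      factor(DTName2, levels = Nametypes))]
  DT2[, tapply(.SD[[Input2]], .(cuts, .SD[[Input1]]), sum, default = 0)]
  DT2[, table(cuts, .SD[[Input1]])]
})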