rdata.table

sum and count of a dataset with specified bins as criteria


This is a toy data.


library(data.table)
rowlength<-100000

set.seed(10)
Nametypes<-c("A1","B2","C3","D4","E5","F6","G7");
DT<-data.table(DTName1 = sample(Nametypes,rowlength,replace = TRUE),DTName2 = sample(Nametypes,rowlength,replace = TRUE),Criteria1=round(runif(rowlength,min = 0,max = 1)),
           Numeric1=rnorm(rowlength,mean = 1,sd = 100),
           bins=round(runif(rowlength,min = 1,max = 1e9)))
values<-c(0,2,5,7,9,10,12,15,18,20,25,30,35,50,100,150,200,500,Inf)*1e5;
binL<-values[-length(values)];binU<-values[-1];

I want to do exactly as mentioned below: one involves calculating a sum, and the other involves counting, both with multiple criteria.

Input1<-"DTName1";
Input2<-"Numeric1";


sapply(Nametypes, \(j){cbind(sapply(1: 19,\(i)sum(DT[Criteria1==1 & bins>= binL[i] & bins < binU[i] & get(Input1)== j, .(get(Input2))])))})
sapply(Nametypes, \(j){cbind(sapply(1: 19,\(i)nrow(DT[Criteria1==1 & bins>= binL[i] & bins < binU[i] & get(Input1)== j, .(get(Input2))])))})

However, my code is extremely inefficient due to slow processing. In my toy data, I have used only 100K entries, but it may scale up to 5 million.

1.I need the values to display as zero if no criteria are met. 2.Please use only the data.table package. 3.Input1 and Input2 can vary since they come from external sources, which is why I am using the get function. I am unsure if this should be avoided. Alternatives are welcome.


Solution

  • 1) We can use tapply and table as shown. This returns a matrix and a "table" class object each with the intervals specified in the row names.

    DT[Criteria1 == 1, tapply(.SD[[Input2]],
        .(cut(bins, values, include.lowest = TRUE), factor(.SD[[Input1]], Nametypes)),
        sum,
        default = 0)]
    
    DT[Criteria1 == 1, table(
      cut(bins, values, include.lowest = TRUE),
      factor(.SD[[Input1]], Nametypes)) ]
    

    1a) or

    DT2 <- DT[Criteria1 == 1][, c("cuts", "DTName1", "DTName2") := 
      .(cut(bins, values, include.lowest = TRUE),
      factor(DTName1, levels = Nametypes),
      factor(DTName2, levels = Nametypes))]
    
     DT2[, tapply(.SD[[Input2]], .(cuts, .SD[[Input1]]), sum, default = 0)]
    
    ##                            A1           B2           C3           D4          E5          F6           G7
    ## [0,2e+05]           -55.37939      0.00000   152.331295     0.000000   -32.11065   -57.98264   137.752461
    ## (2e+05,5e+05]       -56.37766    132.72732    30.682194  -103.529421   224.21310   119.04061    -6.906409
    ## (5e+05,7e+05]        19.43140    201.32557     0.000000   -30.504084     0.00000  -121.27830    48.831916
    ## (7e+05,9e+05]       -71.53381   -155.44489    -1.853501   -70.497173    82.48696     0.00000   187.066078
    ## (9e+05,1e+06]       156.18999      0.00000   -51.555027     0.000000  -100.15647  -129.21000     0.000000
    ## ...snip...
    
    DT2[, table(cuts, .SD[[Input1]])]
                   
    ## cuts                  A1   B2   C3   D4   E5   F6   G7
    ##   [0,2e+05]            3    0    2    0    1    1    1
    ##   (2e+05,5e+05]        1    1    1    1    3    2    1
    ##   (5e+05,7e+05]        2    2    0    1    0    4    1
    ##   (7e+05,9e+05]        1    4    1    1    2    0    2
    ##   (9e+05,1e+06]        1    0    1    0    1    1    0
    ## ...snip...
    

    2) Alternately use dcast. DT2 is from (1a). This returns a data frame with a cuts column containing the intervals.

    fo <- reformulate(Input1, "cuts")
    
    dcast(DT2, fo, fun.aggregate = sum, value.var = Input2, fill = 0, drop = FALSE)
    
    dcast(DT2, fo, fun.aggregate = length, drop = FALSE)
    

    3) xtabs also works. DT2 is from (1a). This returns an object of class c("xtabs", "table") with cuts being the intervals as row names.

    xtabs(reformulate(c("cuts", Input1), Input2), DT2)
    
    xtabs(reformulate(c("cuts", Input1)), DT2)
    

    Peformance

    Comparing the run time of 25 replications of the question's code and each of the alternatives above we see that all the alternatives here are faster than the code in the question and 1a is fastest on this particular test being nearly 2 orders of magnitude faster. Although x1a is fastest on this particlar test the cld column shows that the difference in timings among x1, x1a, x2 and x3 is not statistically significant. The difference between any of them and the code in the question is significant. Based on the timings below I suspect that it now takes under a second to process 5 million rows so you may wish to choose among these based on other considerations.

    library(microbenchmark)
    microbenchmark(times = 25,
      ques = {
        sapply(Nametypes, \(j){cbind(sapply(1:19,\(i)sum(DT[Criteria1==1 & bins>= binL[i] & bins < binU[i] & get(Input1)== j, .(get(Input2))])))})
        sapply(Nametypes, \(j){cbind(sapply(1:19,\(i)nrow(DT[Criteria1==1 & bins>= binL[i] & bins < binU[i] & get(Input1)== j, .(get(Input2))])))})
      },
      x1 = {
        DT[Criteria1 == 1, tapply(.SD[[Input2]],
          .(cut(bins, values, include.lowest = TRUE), factor(.SD[[Input1]], Nametypes)),
          sum,
          default = 0)]
        DT[Criteria1 == 1, table(
          cut(bins, values, include.lowest = TRUE),
          factor(.SD[[Input1]], Nametypes)) ]
      },
      x1a = {
        DT2 <- DT[Criteria1 == 1][, c("cuts", "DTName1", "DTName2") := 
          .(cut(bins, values, include.lowest = TRUE),
          factor(DTName1, levels = Nametypes),
          factor(DTName2, levels = Nametypes))]
        DT2[, tapply(.SD[[Input2]], .(cuts, .SD[[Input1]]), sum, default = 0)]
        DT2[, table(cuts, .SD[[Input1]])]
      },
      x2 = {
        DT2 <- DT[Criteria1 == 1][, c("cuts", "DTName1", "DTName2") := 
          .(cut(bins, values, include.lowest = TRUE),
          factor(DTName1, levels = Nametypes),
          factor(DTName2, levels = Nametypes))]
        fo <- reformulate(Input1, "cuts")
        dcast(DT2, fo, fun.aggregate = sum, value.var = Input2, fill = 0, drop = FALSE)
        dcast(DT2, fo, fun.aggregate = length, drop = FALSE)
      },
      x3 = {
        DT2 <- DT[Criteria1 == 1][, c("cuts", "DTName1", "DTName2") := 
          .(cut(bins, values, include.lowest = TRUE),
          factor(DTName1, levels = Nametypes),
          factor(DTName2, levels = Nametypes))]
        xtabs(reformulate(c("cuts", Input1), Input2), DT2)
        xtabs(reformulate(c("cuts", Input1)), DT2)
      }
    )
    

    giving

    Unit: milliseconds
     expr       min        lq       mean    median        uq       max neval cld
     ques 2434.5281 2481.5263 2546.53712 2533.2777 2579.1233 2717.1869    25  a 
       x1   32.4013   35.9994   39.55306   38.2676   42.7410   49.6467    25   b
      x1a   23.0932   26.1382   29.66336   28.3790   32.8300   38.1734    25   b
       x2   43.4468   45.2096   50.95142   47.6681   54.5723   86.3079    25   b
       x3   23.6705   27.3961   30.30350   28.8655   33.5297   40.0132    25   b