rdatatablewide-column-store

Calculate weights of data table rows of wide-format


I have a very huge data table in wide-format and give you here just an short example:

library(maditr)
library(reshape2)
library(data.table)
dt.test <- data.table(time = as.Date('2009-01-01') + 0:9,
                      s1 = rnorm(10, 5, 1),
                      s2 = rnorm(10, 10, 2),
                      s3 = rnorm(10, 20, 4))
dt.test <- dt.test %>%
           reshape2::melt(id.vars = "time", variable.name = "stock", value.name = "price") %>% 
           maditr::dcast(stock ~ time, fun.aggregate = list, value.var = "price")

> dt.test
#   stock 2009-01-01 2009-01-02 2009-01-03 2009-01-04 2009-01-05 2009-01-06 2009-01-07 2009-01-08 2009-01-09 2009-01-10
#1:    s1   5.139208   3.879785   6.099094   5.304787   4.732387    4.77054   4.877472   4.830193   5.468288   3.121548
#2:    s2   11.70447   7.360979    10.5366   7.525067   9.770211   14.13924   8.356996   10.45796   9.749032   5.034887
#3:    s3   28.17668   10.22208   29.17786   24.41746   17.87546   24.67163   13.13525    14.9941   18.84145   22.07169

Now I need to calculate for each row the weights, which means: each row-entry divided by the mean of the whole row. Now two end formats are possible for me. Either the weights come in the same table or we create a new weights table dt.weights and save the weights there.

Version 1: Saving them in the same table dt.test

Save the weight-row for s1 after row s1, which is called s1W, a weight-row s2W after row s2 and a weight-row s3W after s3 in my data table.

Version 2: Saving them in a new table dt.weights

This should look the same way (wide-format) as before, but replace the stocks by the weights:

# dt.weights
#   stock_w  2009-01-01 2009-01-02 2009-01-03 2009-01-04 2009-01-05 2009-01-06 2009-01-07 2009-01-08 2009-01-09 2009-01-10
#1:      s1  5.139208   3.879785   6.099094   5.304787   4.732387    4.77054   4.877472   4.830193   5.468288   3.121548
#2:      s2  11.70447   7.360979    10.5366   7.525067   9.770211   14.13924   8.356996   10.45796   9.749032   5.034887
#3:      s3  28.17668   10.22208   29.17786   24.41746   17.87546   24.67163   13.13525    14.9941   18.84145   22.07169

How can this work? The wide-format of dt.test and dt.weights shouldn't be changed.

EDIT: Small subset of my data:

structure(list(GPNRPlan = c(630882L, 773297L, 830234L), `2018-01-01-00` = list(
    101.59, 211.02, 27.91), `2018-01-01-01` = list(98.32, 209.58, 
    20.98), `2018-01-01-02` = list(95.51, 205.74, 14.62), `2018-01-01-03` = list(
    93.28, 210.62, 13.96), `2018-01-01-04` = list(93.03, 219.44, 
    13.39), `2018-01-01-05` = list(101.62, 269.02, 11.08), `2018-01-01-06` = list(
    114.8, 275.1, 10.94), `2018-01-01-07` = list(114.08, 285.74, 
    11.17), `2018-01-01-08` = list(105.71, 299.54, 10.6), `2018-01-01-09` = list(
    107.51, 316.7, 10.84), `2018-01-01-10` = list(112.17, 298.06, 
    10.85), `2018-01-01-11` = list(120.32, 259.44, 13.08), `2018-01-01-12` = list(
    119.75, 255.66, 29.68), `2018-01-01-13` = list(115.29, 284.08, 
    51.07), `2018-01-01-14` = list(117.77, 249.38, 50.02), `2018-01-01-15` = list(
    121.11, 266.22, 51.51), `2018-01-01-16` = list(131.71, 258.14, 
    54.03), `2018-01-01-17` = list(147.68, 260.28, 60.09), `2018-01-01-18` = list(
    157.38, 258.74, 60.94), `2018-01-01-19` = list(161.6, 227.96, 
    64.69), `2018-01-01-20` = list(154, 214.5, 52.98), `2018-01-01-21` = list(
    139.59, 218.96, 46.26), `2018-01-01-22` = list(121.03, 214.46, 
    31.93), `2018-01-01-23` = list(104.83, 211.48, 21.57), `2018-01-02-00` = list(
    104.33, 208.66, 18.05), `2018-01-02-01` = list(96.83, 209.98, 
    13.68), `2018-01-02-02` = list(94.7, 214.64, 13.49), `2018-01-02-03` = list(
    93.8, 211.1, 10.61), `2018-01-02-04` = list(93.73, 223.18, 
    10.56), `2018-01-02-05` = list(103.18, 290.62, 10.33), `2018-01-02-06` = list(
    132.27, 302.42, 11.02), `2018-01-02-07` = list(345.6, 409.74, 
    14.39), `2018-01-02-08` = list(564.61, 402.22, 15.35), `2018-01-02-09` = list(
    619.72, 426.02, 36.77), `2018-01-02-10` = list(626.2, 393.6, 
    43.88), `2018-01-02-11` = list(619.47, 383.42, 41.75), `2018-01-02-12` = list(
    577.24, 338.1, 47.78), `2018-01-02-13` = list(495.96, 367.32, 
    46.86), `2018-01-02-14` = list(494.98, 349.86, 50.1), `2018-01-02-15` = list(
    538.48, 339.86, 50.87), `2018-01-02-16` = list(524.3, 315.06, 
    53.49), `2018-01-02-17` = list(475.18, 307, 53.45), `2018-01-02-18` = list(
    366, 294.9, 55.2), `2018-01-02-19` = list(225.15, 243.6, 
    57.24), `2018-01-02-20` = list(182.21, 234.46, 51.21), `2018-01-02-21` = list(
    160.3, 234, 33.39), `2018-01-02-22` = list(136.96, 234.78, 
    24.73), `2018-01-02-23` = list(117.38, 230.74, 21.19), `2018-01-03-00` = list(
    105.54, 231.48, 14.04), `2018-01-03-01` = list(100.61, 237.72, 
    15.86), `2018-01-03-02` = list(97.85, 230.2, 13.49), `2018-01-03-03` = list(
    96.75, 240.52, 10.18), `2018-01-03-04` = list(97, 235.82, 
    10.28), `2018-01-03-05` = list(105.85, 301.14, 9.6), `2018-01-03-06` = list(
    131.66, 317.92, 11.66), `2018-01-03-07` = list(342.72, 401.08, 
    13.97), `2018-01-03-08` = list(563.71, 421.58, 19.96), `2018-01-03-09` = list(
    617.2, 417.5, 39.21), `2018-01-03-10` = list(626.69, 388.46, 
    37.58), `2018-01-03-11` = list(624.09, 379.5, 41.75), `2018-01-03-12` = list(
    581.89, 353.32, 46.09), `2018-01-03-13` = list(500.86, 253.34, 
    45.87), `2018-01-03-14` = list(498.61, 317.54, 42.89), `2018-01-03-15` = list(
    539.27, 351.52, 50.02), `2018-01-03-16` = list(527.84, 320.2, 
    46.89), `2018-01-03-17` = list(477.11, 296.58, 47.37), `2018-01-03-18` = list(
    360.48, 287.28, 49.22), `2018-01-03-19` = list(224.44, 247.82, 
    52.01), `2018-01-03-20` = list(183.9, 242.1, 56.8)), row.names = c(NA, 
-3L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x000001ff1d5e1ef0>, sorted = "GPNRPlan")

What I get with you solution (weights for each row twice):

enter image description here


Solution

  • It is not immediate but it seems to work:

    First step: Build the weights table.

    #Compute row means
    aux <- rowMeans(dt.test[, lapply(.SD, unlist), .SDcols = 2:ncol(dt.test)])
    #Compute weights and add a column with "row names"
    cbind(stock = paste0(dt.test$stock, "W"), 
          dt.test[, lapply(.SD, function(x) unlist(x) / aux), .SDcols = 2:ncol(dt.test)]
          )
       stock 2009-01-01 2009-01-02 2009-01-03 2009-01-04 2009-01-05 2009-01-06 2009-01-07 2009-01-08 2009-01-09
    1:   s1W  0.9517776  0.9545703   1.041051   1.191171    1.32749   0.884822  0.8491691    1.15274  0.7940578
    2:   s2W   1.126405   0.838604  0.9497316   1.033589   1.160884  0.8274315   1.071979  0.9675088   1.267201
    3:   s3W   1.235231  0.7507189   1.044507   1.244315  0.9601964   0.909451   1.101159  0.9570893  0.8158607
       2009-01-10
    1:   0.853152
    2:  0.7566673
    3:  0.9814708
    
    

    And then, if you like, bind all together sorted:

    aux <- rowMeans(dt.test[, lapply(.SD, unlist), .SDcols = 2:ncol(dt.test)])
    
    rbind(dt.test, 
          cbind(stock = paste0(dt.test$stock, "W"), 
                dt.test[, lapply(.SD, function(x) unlist(x) / aux), .SDcols = 2:ncol(dt.test)]
                )
           )[order(as.character(stock))]
       stock 2009-01-01 2009-01-02 2009-01-03 2009-01-04 2009-01-05 2009-01-06 2009-01-07 2009-01-08 2009-01-09
    1:    s1   4.770632    4.78463     5.2181   5.970551    6.65383   4.435028   4.256324   5.777923   3.980087
    2:   s1W  0.9517776  0.9545703   1.041051   1.191171    1.32749   0.884822  0.8491691    1.15274  0.7940578
    3:    s2   11.36418   8.460586    9.58174   10.42776   11.71203   8.347867   10.81508   9.761092   12.78465
    4:   s2W   1.126405   0.838604  0.9497316   1.033589   1.160884  0.8274315   1.071979  0.9675088   1.267201
    5:    s3   25.03878   15.21747   21.17271   25.22293   19.46369   18.43505   22.32109   19.40071   16.53793
    6:   s3W   1.235231  0.7507189   1.044507   1.244315  0.9601964   0.909451   1.101159  0.9570893  0.8158607
       2009-01-10
    1:   4.276287
    2:   0.853152
    3:   7.633934
    4:  0.7566673
    5:   19.89493
    6:  0.9814708