rdata.tableself-join

R data.table update last values of parameters with some dynamics by id, using shift etc


Here's a data.table, with some parameters for each id by some regular quarterly dates. It's originally shuffled randomly, but, at first, let say, it is sorted by fab_date and id.

set.seed(1)
dt_to_fun <- data.table(fab_date = structure(c(18993, 19174, 19358, 19539, 
                            18993, 19174, 19358, 19539, 18993, 19174, 19358, 19539, 18993, 
                            19174, 19358, 19539, 18993, 19174, 19358, 19539), class = "Date"), 
     id = c("n_01", "n_01", "n_01", "n_01", "n_02", "n_02", "n_02", 
                 "n_02", "n_03", "n_03", "n_03", "n_03", "n_04", "n_04", "n_04", 
                 "n_04", "n_05", "n_05", "n_05", "n_05"), 
     param_01 = sample(c(10,20, 30, 40, 50), 20, replace = TRUE), 
     param_02 = sample(c(10,20, 30, 40, 50), 20, replace = TRUE), 
     param_03 = sample(c(10,20, 30, 40, 50), 20, replace = TRUE))

dt_to_fun

       fab_date   id param_01 param_02 param_03
 1: 2022-01-01 n_01       10       50       30
 2: 2022-07-01 n_01       40       20       20
 3: 2023-01-01 n_01       10       20       20
 4: 2023-07-01 n_01       20       10       50
 5: 2022-01-01 n_02       50       40       20
 6: 2022-07-01 n_02       30       10       10
 7: 2023-01-01 n_02       20       40       30
 8: 2023-07-01 n_02       30       30       30
 9: 2022-01-01 n_03       30       20       40
10: 2022-07-01 n_03       10       20       30
11: 2023-01-01 n_03       50       40       10
12: 2023-07-01 n_03       50       40       40
13: 2022-01-01 n_04       20       40       50
14: 2022-07-01 n_04       20       20       10
15: 2023-01-01 n_04       10       40       10
16: 2023-07-01 n_04       50       10       40
17: 2022-01-01 n_05       50       10       50
18: 2022-07-01 n_05       10       40       50
19: 2023-01-01 n_05       10       10       40
20: 2023-07-01 n_05       50       20       50
>

The goal is: for each id replace (last date) parameter value like this: param_01(last date) = param_01(last date) + param_01(-1 period) - param_01(-2 period) For example, for id n_05 last value of param_02 is 20, previous is 10, two times earlier is 40, so the result should be 20 + 10 - 40 = -10. The same for all id's and all param columns. The last date is '2023-07-01', so only parameters in rows with this date should be updated.

I managed to do this calculation, but the way I do it is applied for all dates, but it needs to be done only for the last date of each id. Here is the function:

quarterly_process_fun <- function(dt) {
  param_cols <- c("param_01", "param_02", "param_03")
  dt[, (param_cols) := lapply(
    .SD,
    \(x) (x + data.table::shift(x, n = 1L, fill = x[1L], type = "lag") - data.table::shift(x, n = 2L, fill = x[1L], type = "lag")
    )),
    by = .(id),
    .SDcols = param_cols
  ]
  return(dt)
} 
quarterly_process_fun(dt_to_fun)

and the result:

dt_to_fun
        fab_date   id param_01 param_02 param_03
     1: 2022-01-01 n_01       10       50       30
     2: 2022-07-01 n_01       40       20       20
     3: 2023-01-01 n_01       40      -10       10
     4: 2023-07-01 n_01      -10       10       50
     5: 2022-01-01 n_02       50       40       20
     6: 2022-07-01 n_02       30       10       10
     7: 2023-01-01 n_02        0       10       20
     8: 2023-07-01 n_02       20       60       50
     9: 2022-01-01 n_03       30       20       40
    10: 2022-07-01 n_03       10       20       30
    11: 2023-01-01 n_03       30       40        0
    12: 2023-07-01 n_03       90       60       20
    13: 2022-01-01 n_04       20       40       50
    14: 2022-07-01 n_04       20       20       10
    15: 2023-01-01 n_04       10       20      -30
    16: 2023-07-01 n_04       40       30       40
    17: 2022-01-01 n_05       50       10       50
    18: 2022-07-01 n_05       10       40       50
    19: 2023-01-01 n_05      -30       40       40
    20: 2023-07-01 n_05       50      -10       40

So, how can I adjust it, so that it's only calculates and replaces differences for last dates for each id?

The other question is, if this can be managed on a shuffled data?

    set.seed(1)
    dt_to_fun <- data.table(fab_date = structure(c(18993, 19174, 19358, 19539, 
                                18993, 19174, 19358, 19539, 18993, 19174, 19358, 19539, 18993, 
                                19174, 19358, 19539, 18993, 19174, 19358, 19539), class = "Date"), 
         id = c("n_01", "n_01", "n_01", "n_01", "n_02", "n_02", "n_02", 
                     "n_02", "n_03", "n_03", "n_03", "n_03", "n_04", "n_04", "n_04", 
                     "n_04", "n_05", "n_05", "n_05", "n_05"), 
         param_01 = sample(c(10,20, 30, 40, 50), 20, replace = TRUE), 
         param_02 = sample(c(10,20, 30, 40, 50), 20, replace = TRUE), 
         param_03 = sample(c(10,20, 30, 40, 50), 20, replace = TRUE))
    
#shuffle rows
dt_to_fun <- dt_to_fun[sample(nrow(dt_to_fun)),]
     fab_date   id param_01 param_02 param_03
 1: 2023-07-01 n_03       50       40       40
 2: 2023-07-01 n_04       50       10       40
 3: 2022-01-01 n_01       10       50       30
 4: 2022-01-01 n_04       20       40       50
 5: 2022-01-01 n_02       50       40       20
 6: 2023-01-01 n_04       10       40       10
 7: 2022-07-01 n_02       30       10       10
 8: 2022-07-01 n_05       10       40       50
 9: 2022-01-01 n_03       30       20       40
10: 2023-01-01 n_02       20       40       30
11: 2023-01-01 n_03       50       40       10
12: 2023-01-01 n_01       10       20       20
13: 2022-07-01 n_04       20       20       10
14: 2022-07-01 n_01       40       20       20
15: 2022-07-01 n_03       10       20       30
16: 2023-07-01 n_05       50       20       50
17: 2023-07-01 n_01       20       10       50
18: 2023-07-01 n_02       30       30       30
19: 2023-01-01 n_05       10       10       40
20: 2022-01-01 n_05       50       10       50

Solution

  • quarterly_process_fun <- function(dt) {
      setorder(dt, id, fab_date)
      param_cols <- c("param_01", "param_02", "param_03")
      dt[, (param_cols) := lapply(
        .SD,
        \(x) c(head(x, -1), tail(x, 3) %*% c(-1, 1, 1))),
        by = .(id),
        .SDcols = param_cols
      ]
      return(dt[])
    } 
    
    quarterly_process_fun(dt_to_fun)
    #       fab_date   id param_01 param_02 param_03
    #  1: 2022-01-01 n_01       10       50       30
    #  2: 2022-07-01 n_01       40       20       20
    #  3: 2023-01-01 n_01       10       20       20
    #  4: 2023-07-01 n_01      -10       10       50
    #  5: 2022-01-01 n_02       50       40       20
    #  6: 2022-07-01 n_02       30       10       10
    #  7: 2023-01-01 n_02       20       40       30
    #  8: 2023-07-01 n_02       20       60       50
    #  9: 2022-01-01 n_03       30       20       40
    # 10: 2022-07-01 n_03       10       20       30
    # 11: 2023-01-01 n_03       50       40       10
    # 12: 2023-07-01 n_03       90       60       20
    # 13: 2022-01-01 n_04       20       40       50
    # 14: 2022-07-01 n_04       20       20       10
    # 15: 2023-01-01 n_04       10       40       10
    # 16: 2023-07-01 n_04       40       30       40
    # 17: 2022-01-01 n_05       50       10       50
    # 18: 2022-07-01 n_05       10       40       50
    # 19: 2023-01-01 n_05       10       10       40
    # 20: 2023-07-01 n_05       50      -10       40