rdata.table

How to create a variable based on unique counts within a time interval by multiple time points and grouping variable?


I would like to count the unique number of drugs, defined as the number of unique drug_code dispensations each individual (noted by idnr) have within 1 year prior the index_date + time_from_index. The problem is that the actual data I have is much larger, and therefore I cant do a cartesian join, which if it work could be a solution. Preferably, I would like to do this with data.table or anything else that could be speed efficient.

y.df <- data.frame(
  idnr = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4),
  time_from_index = c(0, 365, 730, 0, 365, 0, 365, 730, 0, 365),
  index_date = as.Date(c('2023-01-01', '2023-01-01', '2023-01-01', '2023-06-01', '2023-06-01', '2023-09-01', '2023-09-01', '2023-09-01', '2024-01-01', '2024-01-01'))
)

   idnr time_from_index index_date
1     1               0 2023-01-01
2     1             365 2023-01-01
3     1             730 2023-01-01
4     2               0 2023-06-01
5     2             365 2023-06-01
6     3               0 2023-09-01
7     3             365 2023-09-01
8     3             730 2023-09-01
9     4               0 2024-01-01
10    4             365 2024-01-01


x.df <- data.frame(
  idnr = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4),
  date_of_dispensing = as.Date(c(
    '2022-12-15', '2022-12-20', '2023-01-05', '2023-01-20', 
    '2023-05-10', '2023-06-01', '2023-06-15', '2023-07-01', 
    '2023-08-10', '2023-09-05', '2023-10-01', '2023-11-01',
    '2023-12-15', '2023-12-20', '2024-01-05', '2024-01-15', '2024-02-01'
  )),
  drug_code = c('A', 'A', 'B', 'C', 'D', 'D', 'E', 'F', 'G', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N')
)

   idnr date_of_dispensing drug_code
1     1         2022-12-15         A
2     1         2022-12-20         A
3     1         2023-01-05         B
4     1         2023-01-20         C
5     2         2023-05-10         D
6     2         2023-06-01         D
7     2         2023-06-15         E
8     2         2023-07-01         F
9     3         2023-08-10         G
10    3         2023-09-05         G
11    3         2023-10-01         H
12    3         2023-11-01         I
13    4         2023-12-15         J
14    4         2023-12-20         K
15    4         2024-01-05         L
16    4         2024-01-15         M
17    4         2024-02-01         N

The expected output would look like:

# A tibble: 10 × 4
    idnr time_from_index index_date no_of_unique_drugs
   <dbl>           <dbl> <date>                  <int>
 1     1               0 2023-01-01                  1
 2     1             365 2023-01-01                  2
 3     1             730 2023-01-01                  0
 4     2               0 2023-06-01                  1
 5     2             365 2023-06-01                  3
 6     3               0 2023-09-01                  1
 7     3             365 2023-09-01                  3
 8     3             730 2023-09-01                  0
 9     4               0 2024-01-01                  2
10     4             365 2024-01-01                  3
``

Solution

  • A dplyr solution:

    right_join(
      x.df,
      y.df |> mutate(end = index_date + time_from_index, start = `year<-`(end, year(end) - 1)),
      join_by(idnr, between(date_of_dispensing, start, end))
    ) |> 
      summarise(
        nr_of_unique_drugs = n_distinct(drug_code, na.rm = TRUE), 
        .by = c(idnr, index_date, time_from_index)
      )
    
       idnr index_date time_from_index nr_of_unique_drugs
    1     1 2023-01-01               0                  1
    2     1 2023-01-01             365                  2
    3     2 2023-06-01               0                  1
    4     2 2023-06-01             365                  3
    5     3 2023-09-01               0                  1
    6     3 2023-09-01             365                  3
    7     4 2024-01-01               0                  2
    8     4 2024-01-01             365                  3
    9     1 2023-01-01             730                  0
    10    3 2023-09-01             730                  0