rloopsdplyrprocessing-efficiencycoding-efficiency

R: Improving efficiency of getting sum of differences in R


Purpose:

past vector is the set of o3.cpts found in the past period. current vector is the set of o3.cpts found in the current period. Each o3.cpt has a set of numbers that are associated with the code (PCs in numbers data frame). I am trying to get the sum of differences of PCs for every pair of past and current o3.cpt. In the original dataset, there are more than 700 PC columns in numbers data frame.

Question:

The below code is what I am currently using, but it is running too slow (I need to repeat this process for more than 700,000 times). Would there be more efficient way to compute these sum_diff?

Example dataset:

  past = sample(1:40, 20, replace=F)
  current = sample(41:80, 20, replace=F)
  
  set.seed(100)
  
  numbers <- 
    data.frame(
      o3.cpt = c(past, current),
      PC1 = runif(length(c(past, current)), min = -3, max = 3),
      PC2 = runif(length(c(past, current)), min = 1, max = 10),
      PC3 = runif(length(c(past, current)), min = -4, max = 2),
      PC4 = runif(length(c(past, current)), min = -9, max = 8),
      PC5 = runif(length(c(past, current)), min =  4, max = 9)
      )
  
  pairs <- 
    expand.grid(
      past,
      current
    )
  

Current code:

if(nrow(pairs) > 0){
    sum_diff <- 
      pairs %>% 
      ddply(c('Var1', 'Var2'), function(j){
        
        p <-  
          numbers %>% 
          filter(o3.cpt == j$Var1[1]) %>% 
          select(starts_with('PC')) %>% 
          t %>% 
          data.frame %>% 
          .$.
        
        c<-  
          numbers %>% 
          filter(o3.cpt == j$Var2[1]) %>% 
          select(starts_with('PC')) %>% 
          t %>% 
          data.frame %>% 
          .$.
        
        data.frame(diff = sum(abs(p - c)))
      
    })
  }

Extended example dataset and the current code

Extended example dataset

d<-
      structure(list(o3.month.date = structure(c(16953, 16922, 16922, 
      16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 
      16953, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 
      16922, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 
      16953, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 
      16922, 16922, 16922, 16922, 16922, 16922, 16953, 16953, 16953, 
      16953, 16953, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 
      16922, 16922, 16922, 16922, 16922, 16922, 16922, 16953, 16953, 
      16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 
      16953, 16953, 16953, 16922, 16922, 16953, 16953, 16953, 16953, 
      16953, 16953, 16953, 16922, 16953, 16953, 16953, 16953, 16953, 
      16953, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 
      16953, 16953, 16922, 16922, 16922, 16953, 16953, 16953, 16953, 
      16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 
      16953, 16953, 16953, 16922, 16922, 16953, 16953, 16953, 16953, 
      16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 
      16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 
      16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 
      16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 
      16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 
      16953, 16953), class = "Date"), o3.month.date.previous = structure(c(16922, 
      16892, 16892, 16892, 16892, 16892, 16892, 16892, 16892, 16892, 
      16892, 16892, 16922, 16892, 16892, 16892, 16892, 16892, 16892, 
      16892, 16892, 16892, 16922, 16922, 16922, 16922, 16922, 16922, 
      16922, 16922, 16922, 16892, 16892, 16892, 16892, 16892, 16892, 
      16892, 16892, 16892, 16892, 16892, 16892, 16892, 16892, 16922, 
      16922, 16922, 16922, 16922, 16892, 16892, 16892, 16892, 16892, 
      16892, 16892, 16892, 16892, 16892, 16892, 16892, 16892, 16892, 
      16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 
      16922, 16922, 16922, 16922, 16922, 16892, 16892, 16922, 16922, 
      16922, 16922, 16922, 16922, 16922, 16892, 16922, 16922, 16922, 
      16922, 16922, 16922, 16892, 16892, 16892, 16892, 16892, 16892, 
      16892, 16892, 16922, 16922, 16892, 16892, 16892, 16922, 16922, 
      16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 
      16922, 16922, 16922, 16922, 16922, 16892, 16892, 16922, 16922, 
      16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 
      16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 
      16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 
      16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 
      16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 
      16922, 16922, 16922, 16922), class = "Date"), 
      aid = c("a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725", 
      "a725", "a725", "a725", "a725", "a725", "a725", "a725"), 
      oid = c("o11816", 
      "o20077", "o20077", "o20077", "o20077", "o20077", "o20077", "o20077", 
      "o20077", "o20077", "o20077", "o23696", "o24697", "o28255", "o29968", 
      "o29968", "o29968", "o29968", "o29968", "o29968", "o29968", "o29968", 
      "o36383", "o36383", "o36383", "o36383", "o36383", "o36383", "o36383", 
      "o36481", "o36481", "o37147", "o37761", "o37761", "o37761", "o37761", 
      "o38480", "o38480", "o38480", "o38480", "o38480", "o38480", "o38480", 
      "o38480", "o38729", "o38904", "o38904", "o38904", "o38904", "o38904", 
      "o42403", "o42403", "o42403", "o42403", "o42403", "o42403", "o42403", 
      "o42403", "o42579", "o43113", "o43255", "o43255", "o43486", "o43486", 
      "o44068", "o44068", "o44068", "o44068", "o44474", "o44474", "o45552", 
      "o47039", "o47039", "o47039", "o47039", "o47039", "o47039", "o47039", 
      "o47082", "o47082", "o47209", "o47209", "o47209", "o47209", "o47209", 
      "o47209", "o48228", "o48509", "o48943", "o48943", "o48943", "o48943", 
      "o48943", "o48943", "o49126", "o49126", "o49126", "o49126", "o49126", 
      "o49126", "o49890", "o50413", "o50505", "o50505", "o51150", "o51835", 
      "o51835", "o52116", "o52116", "o52116", "o52116", "o52116", "o52116", 
      "o52116", "o52116", "o52116", "o52260", "o52260", "o52606", "o52670", 
      "o53341", "o53341", "o53395", "o53969", "o53969", "o55166", "o55166", 
      "o55166", "o55761", "o55761", "o55761", "o55761", "o55761", "o56077", 
      "o57530", "o57574", "o57574", "o57574", "o57688", "o57722", "o57722", 
      "o57862", "o57862", "o57862", "o58501", "o58567", "o58567", "o58567", 
      "o59423", "o59423", "o59843", "o60553", "o60553", "o60553", "o60553", 
      "o60553", "o61995", "o62230", "o62230", "o62230", "o62839", "o63835", 
      "o63835", "o63835", "o64526", "o64526", "o67971", "o67971", "o67971", 
      "o67971", "o68299", "o68299", "o68846", "o68887", "o69530", "o69530"
      ), 
      o3.cpt = c("c3354", "c742", "c750", "c3480", "c565", "c740", 
      "c3462", "c730", "c729", "c562", "c3478", "c3453", "c3354", "c484", 
      "c740", "c750", "c3730", "c3472", "c565", "c562", "c3478", "c729", 
      "c565", "c731", "c740", "c750", "c3478", "c562", "c3766", "c3407", 
      "c3397", "c3468", "c3470", "c3478", "c3468", "c3469", "c717", 
      "c563", "c718", "c750", "c745", "c562", "c3488", "c3487", "c3531", 
      "c1226", "c1413", "c1333", "c1349", "c1414", "c723", "c725", 
      "c742", "c565", "c3730", "c562", "c728", "c708", "c3306", "c3354", 
      "c3438", "c3407", "c3356", "c3407", "c1312", "c1226", "c1216", 
      "c1228", "c1239", "c1234", "c534", "c731", "c562", "c565", "c729", 
      "c740", "c3478", "c750", "c3456", "c3450", "c717", "c563", "c562", 
      "c718", "c745", "c750", "c3501", "c748", "c562", "c3489", "c750", 
      "c745", "c717", "c563", "c562", "c750", "c718", "c3489", "c717", 
      "c745", "c3335", "c3472", "c3472", "c3478", "c1154", "c3424", 
      "c3356", "c731", "c3478", "c729", "c562", "c742", "c739", "c565", 
      "c750", "c728", "c3411", "c3412", "c3377", "c3422", "c3478", 
      "c3480", "c3503", "c3435", "c3406", "c3730", "c3478", "c3480", 
      "c750", "c563", "c562", "c745", "c717", "c3342", "c2276", "c3357", 
      "c3407", "c3730", "c3337", "c3430", "c3407", "c3730", "c3407", 
      "c3357", "c3468", "c3356", "c3407", "c3730", "c3407", "c3430", 
      "c1066", "c750", "c731", "c562", "c742", "c732", "c17", "c1327", 
      "c346", "c402", "c3440", "c3517", "c3730", "c3513", "c2569", 
      "c2565", "c256", "c362", "c343", "c390", "c1060", "c1167", "c289", 
      "c1067", "c133", "c151")), 
      row.names = c(NA, -176L), 
      class = "data.frame")

  sample_cpts <-
    c("c3354", "c742", "c750", "c3480", "c565", "c740", "c3462", 
      "c730", "c729", "c562", "c3478", "c3453", "c484", "c3730", "c3472", 
      "c731", "c3766", "c3407", "c3397", "c3468", "c3470", "c3469", 
      "c717", "c563", "c718", "c745", "c3488", "c3487", "c3531", "c1226", 
      "c1413", "c1333", "c1349", "c1414", "c723", "c725", "c728", "c708", 
      "c3306", "c3438", "c3356", "c1312", "c1216", "c1228", "c1239", 
      "c1234", "c534", "c3456", "c3450", "c3501", "c748", "c3489", 
      "c3335", "c1154", "c3424", "c739", "c3411", "c3412", "c3377", 
      "c3422", "c3503", "c3435", "c3406", "c3342", "c2276", "c3357", 
      "c3337", "c3430", "c1066", "c732", "c17", "c1327", "c346", "c402", 
      "c3440", "c3517", "c3513", "c2569", "c2565", "c256", "c362", 
      "c343", "c390", "c1060", "c1167", "c289", "c1067", "c133", "c151"
    )

  set.seed(100)

 pcs <- 
    data.frame(
      o3.cpt = sample_cpts,
      PC1 = runif(length(sample_cpts), min = -3, max = 3),
      PC2 = runif(length(sample_cpts), min = 1, max = 10),
      PC3 = runif(length(sample_cpts), min = -4, max = 2),
      PC4 = runif(length(sample_cpts), min = -9, max = 8),
      PC5 = runif(length(sample_cpts), min =  4, max = 9)
    )
  

Current code for the extended dataset:

 d1 <-
    d  %>% 
    # filter(aid == 'a725') %>%
    ddply('aid', function(x){
      
      mean_diff <- 
        x  %>% 
        # filter(o3.month.date == '2016-05-01')
        ddply('o3.month.date', function(i){
          
          past <-
              x %>% 
              filter(o3.month.date == i$o3.month.date.previous[1]) %>% 
              # select(o3.cpt, aid, o3.month.date, o3.month.date.previous, PC1, PC2, PC3) %>%
              select(o3.cpt, starts_with('PC')) %>%
              distinct
            
            current <-
              i %>% 
              # select(oid, o3.cpt, aid, o3.month.date, o3.month.date.previous, PC1, PC2, PC3) %>%
              select(o3.cpt, starts_with('PC'))%>%
              distinct
            
            intersect <- intersect(past$o3.cpt, current$o3.cpt)
            
            past <- 
              past %>% 
              filter(!(o3.cpt %in% intersect)) %>%
              .$o3.cpt
            
            current <- 
              current %>% 
              filter(!(o3.cpt %in% intersect)) %>%
              .$o3.cpt
            
            pairs <- 
              expand.grid(
                past,
                current
              )
          
          if(nrow(pairs) == 0){
            output <-
              data.frame(mean_diff = NA)
          }
          
          if(nrow(pairs) > 0){
            
          sum_diff <- 
            pairs %>%
              ddply(c('Var1', 'Var2'), function(j){
                
                p <-
                  pcs %>%
                  filter(o3.cpt == j$Var1[1]) %>%
                  select(starts_with('PC')) %>%
                  t %>%
                  data.frame %>%
                  .$.
                
                c<-
                  pcs %>%
                  filter(o3.cpt == j$Var2[1]) %>%
                  select(starts_with('PC')) %>%
                  t %>%
                  data.frame %>%
                  .$.
                
                data.frame(diff = sum(abs(p - c)))
                
              })
            
            output <-
              sum_diff %>%
              summarise(
                mean_diff = mean(diff, na.rm = T)
              )
          }

          output
          
        })
      
      mean_diff
      
    }, .parallel = T)
  
  d1

Solution

  • This is the pairwise Manhattan distance between the matrices made up of the past and current observations. Rfast::dista is a very fast implementation. The following runs in less than 0.0002 seconds.

    pairs$diff <- c(
      Rfast::dista(
        as.matrix(numbers[match(past, numbers$o3.cpt), -1]),
        as.matrix(numbers[match(current, numbers$o3.cpt), -1]),
        type = "manhattan"
      )
    )
    

    Timing:

    microbenchmark::microbenchmark(
      dista = {pairs$diff <- c(
        dista(
          as.matrix(numbers[match(past, numbers$o3.cpt), -1]),
          as.matrix(numbers[match(current, numbers$o3.cpt), -1]),
          type = "manhattan"
        )
      )}
    )
    #> Unit: microseconds
    #>   expr     min      lq     mean  median       uq     max neval
    #>  dista 177.101 181.401 187.5551 183.301 185.4515 346.101   100