rfunctiongroup-byapplycolumn-sum

create a matrix by seperating an existing matrix into groups(by factor) and applying a function to each sub-matrix


This problem is best addressed through example:

The setup

x1 <- c(1,4,5,6,7,1)
x2 <- c(1,1,2,3,4,1)
x3 <- c(3,4,5,6,7,1)
x4 <- c(1,2,3,5,7,2)
x5 <- c(6,2,3,9,7,2)
x6 <- c(5,2,4,3,2,3)
x7 <- c(6,4,3,1,8,3)



matrix <- t(data.frame(x1,x2,x3,x4,x5,x6,x7))
colnames(matrix)[6] <- "factor"```

my goal is to create a matrix, where matrix elements are calculated based on the column sum of elements in the group, excluding row (i). I then divide this by the number of elements in the group less 1. So 3 objects in group 1 implies a division by (3-1) =2.

for each group, as described by column 6 or "factor", I would like to calculate a corresponding matrix, and then bind these matrices together.

for group 1 the desired matrix is

output1 <- c(1+3/2, 1+4/2, 2+5/2 ...)
output2 <- c(1+3/2, 4+4/2, 5+5/2...)
output3 <- c(1+1/2, 4+1/2, 5+2/2, 6+3/2 ...)

mat_output1 <- rbind(output1, output2, output3)

for group 2 the desired matrix is

output4 <- c(6/1 , 2/1, 3/1, 9/1, 7/1 ...)
output5 <- c(1/1,2/1,3/1,5/1,7/1,2/1)
mat_output2 <- rbind(output4, output5)

and for group 3

output6 <- c(6/1,4/1,3/1,1/1,8/1,3/1)
output7 <- c(5/1,2/1,4/1,3/1,2/1,3/1)
mat_output3 <- rbind(output6, output7)

with desired output of the form:

mat_output <- rbind(mat_output1, mat_output2, mat_output3)

Solution

  • This approach instead adds up the each column at once and then subtracts each i from the column sum.

    #here's a pretty quick base R solution:
    
    do.call(rbind
            ,tapply(seq_len(nrow(matrix))
                    , matrix[, 'factor']
                    , FUN = function(i) sweep(-matrix[i, -length(matrix)]
                                              , 2
                                              , colSums(matrix[i, -length(matrix)]), `+`) / (length(i)-1)
                    )
    )
    
       <NA> <NA> <NA> <NA> <NA> factor
    x1    2  2.5  3.5  4.5  5.5      1
    x2    2  4.0  5.0  6.0  7.0      1
    x3    1  2.5  3.5  4.5  5.5      1
    x4    6  2.0  3.0  9.0  7.0      2
    x5    1  2.0  3.0  5.0  7.0      2
    x6    6  4.0  3.0  1.0  8.0      3
    x7    5  2.0  4.0  3.0  2.0      3
    
    # similar but MUCH slower
    
    do.call(rbind
            , by(matrix[, -6]
                 , matrix[, 6]
                 #, function(x) sweep(-x, 2,colSums(x), FUN = '+') / (nrow(x)-1))
                 , function(x) mapply(`-`, colSums(x), x) / (nrow(x) - 1)) #mapply is faster
    )
    

    Some performance. Note, @Ronak provides a warning so I changed to the dplyr suggested code of mutate_at(vars(-group_cols), ...) instead of mutate_all(...). Also, while the data.table is in @akrun, I was about to edit it in before @akrun posted and it's basically @Ronak's dplyr way that's been translated.

    @G. Fernando's solution takes about 20 seconds so I excluded it from the profile. @akrun's base solution is the fastest. I personally kind of like @Ronak's base the best as it reads well.

    #10,000 rows
    #1,000 groups
    Unit: milliseconds
            expr       min        lq       mean     median        uq       max neval
      dt_version  191.4823  193.7294  201.39367  200.61610  210.0798  211.0581    10
      cole_base2 7688.4689 7948.5534 8159.32689 8224.02570 8358.9145 8560.0802    10
      cole_base3  760.9410  761.6176  789.35789  791.22520  812.1285  822.8938    10
      Ronak_base  378.2914  381.9018  403.30458  403.65600  418.5159  431.2887    10
     Ronak_dplyr 7025.7606 7045.9863 7217.55143 7150.09070 7395.1977 7505.7091    10
      akrun_base   26.3189   27.2791   28.90526   28.03645   29.3622   33.5207    10
    
    #10,000 rows
    #100 groups
    Unit: milliseconds
            expr      min       lq      mean    median       uq      max neval
      dt_version  32.6928  33.4362  36.27415  37.34835  38.8137  39.9793    10
      cole_base2 770.1962 817.3142 847.01249 846.13940 893.4095 900.8028    10
      cole_base3  97.5201 101.1023 108.46434 102.01210 105.9185 165.3160    10
      Ronak_base 115.7445 117.9968 128.06018 124.27730 129.9934 170.3994    10
     Ronak_dplyr 721.4570 734.6108 747.46815 735.65990 756.1121 787.0906    10
      akrun_base  23.3171  24.4940  26.79405  26.55190  29.1286  30.2099    10
    
    library(data.table)
    library(microbenchmark)
    library(dplyr)
    
    n_cols <- 10
    n_rows <- 1E5
    n_row_per_group <- 100
    
    
    set.seed(1)
    matrix <- matrix(sample(100, n_rows*n_cols, replace = T), ncol = n_cols)
    matrix <- cbind(matrix, factor = rep(1:(n_rows / n_row_per_group), each = n_row_per_group))
    df <- data.frame(matrix)
    dt <- as.data.table(df)
    
    do.call(rbind
            , lapply(unique(matrix[,'factor'])
                     , function(x) {
                       sub_mat <- matrix[matrix[, 'factor'] == x,] 
                       sweep(-sub_mat, 2, colSums(sub_mat), '+') / (nrow(sub_mat) - 1)
             })
    )
    
    
    
    microbenchmark(
      # cole_base = { #too slow for lots of little groups
      #   do.call(rbind
      #           ,by(matrix[, -6]
      #               , matrix[, 6]
      #               , function(x) mapply(`-`, colSums(x), x) / (nrow(x) - 1)
      #           )
      #   )
      # },
       dt_version = {
        dt[, lapply(.SD, function(x) (sum(x) - x) / (.N - 1)) , by = 'factor']
      }
      ,cole_base2 = {
        do.call(rbind
                , lapply(unique(matrix[,'factor'])
                         , function(x) {
                           sub_mat <- matrix[matrix[, 'factor'] == x,] 
                           sweep(-sub_mat, 2, colSums(sub_mat), '+') / (nrow(sub_mat) - 1)
                         })
        )
      }
      ,cole_base3 = {
        do.call(rbind
                ,tapply(seq_len(nrow(matrix))
                        , matrix[, 'factor']
                        , FUN = function(i) sweep(-matrix[i, -length(matrix)], 2, colSums(matrix[i, -length(matrix)]), `+`) / (length(i)-1)
                        , simplify = F)
        )
      }
    
      ,Ronak_base = {
    
        lapply(df[-ncol(df)], function(x) 
          ave(x, df$factor, FUN = function(x) (sum(x) - x)/(length(x) - 1)))
      }
      # ,G_fern_base = { #pretty slow, i hardcoded the factor row - it needs fixed slightly
      #   do.call(rbind,
      #           lapply(levels(factor(matrix[,11])),function(x) {
      #             list=as.list(NULL)
      #             index=which(matrix[,11]==x)
      #             for(i in 1:length(index)){
      #               if(length(index)>2){
      #                 list[[i]]=colSums(matrix[index[-i],])
      #               }else{
      #                 list[[i]]=matrix[index[-i],] 
      #               }
      #               list[[i]]=list[[i]][-11]/(length(index)-1)
      #             }
      #             return(do.call(rbind,list))
      #           })
      #   )
      # }
      , Ronak_dplyr = {
        df %>%
          group_by(factor) %>%
          mutate_at(vars(-group_cols()), ~(sum(.)-.)/ (n() - 1))
      }
      , akrun_base = {
        n1 <- tabulate(matrix[, ncol(matrix)])
        m1 <- rowsum(matrix[,-ncol(matrix)], group = matrix[,ncol(matrix)])
        (m1[rep(seq_len(nrow(m1)), n1),] - matrix[, -ncol(matrix)])/rep(n1 - 1, n1)
    
      }
      , times = 10
    )