This problem is best addressed through example:
The setup
x1 <- c(1,4,5,6,7,1)
x2 <- c(1,1,2,3,4,1)
x3 <- c(3,4,5,6,7,1)
x4 <- c(1,2,3,5,7,2)
x5 <- c(6,2,3,9,7,2)
x6 <- c(5,2,4,3,2,3)
x7 <- c(6,4,3,1,8,3)
matrix <- t(data.frame(x1,x2,x3,x4,x5,x6,x7))
colnames(matrix)[6] <- "factor"```
my goal is to create a matrix, where matrix elements are calculated based on the column sum of elements in the group, excluding row (i). I then divide this by the number of elements in the group less 1. So 3 objects in group 1 implies a division by (3-1) =2.
for each group, as described by column 6 or "factor", I would like to calculate a corresponding matrix, and then bind these matrices together.
for group 1 the desired matrix is
output1 <- c(1+3/2, 1+4/2, 2+5/2 ...)
output2 <- c(1+3/2, 4+4/2, 5+5/2...)
output3 <- c(1+1/2, 4+1/2, 5+2/2, 6+3/2 ...)
mat_output1 <- rbind(output1, output2, output3)
for group 2 the desired matrix is
output4 <- c(6/1 , 2/1, 3/1, 9/1, 7/1 ...)
output5 <- c(1/1,2/1,3/1,5/1,7/1,2/1)
mat_output2 <- rbind(output4, output5)
and for group 3
output6 <- c(6/1,4/1,3/1,1/1,8/1,3/1)
output7 <- c(5/1,2/1,4/1,3/1,2/1,3/1)
mat_output3 <- rbind(output6, output7)
with desired output of the form:
mat_output <- rbind(mat_output1, mat_output2, mat_output3)
This approach instead adds up the each column at once and then subtracts each i
from the column sum.
#here's a pretty quick base R solution:
do.call(rbind
,tapply(seq_len(nrow(matrix))
, matrix[, 'factor']
, FUN = function(i) sweep(-matrix[i, -length(matrix)]
, 2
, colSums(matrix[i, -length(matrix)]), `+`) / (length(i)-1)
)
)
<NA> <NA> <NA> <NA> <NA> factor
x1 2 2.5 3.5 4.5 5.5 1
x2 2 4.0 5.0 6.0 7.0 1
x3 1 2.5 3.5 4.5 5.5 1
x4 6 2.0 3.0 9.0 7.0 2
x5 1 2.0 3.0 5.0 7.0 2
x6 6 4.0 3.0 1.0 8.0 3
x7 5 2.0 4.0 3.0 2.0 3
# similar but MUCH slower
do.call(rbind
, by(matrix[, -6]
, matrix[, 6]
#, function(x) sweep(-x, 2,colSums(x), FUN = '+') / (nrow(x)-1))
, function(x) mapply(`-`, colSums(x), x) / (nrow(x) - 1)) #mapply is faster
)
Some performance. Note, @Ronak provides a warning so I changed to the dplyr
suggested code of mutate_at(vars(-group_cols), ...)
instead of mutate_all(...)
. Also, while the data.table
is in @akrun, I was about to edit it in before @akrun posted and it's basically @Ronak's dplyr
way that's been translated.
@G. Fernando's solution takes about 20 seconds so I excluded it from the profile. @akrun's base solution is the fastest. I personally kind of like @Ronak's base the best as it reads well.
#10,000 rows
#1,000 groups
Unit: milliseconds
expr min lq mean median uq max neval
dt_version 191.4823 193.7294 201.39367 200.61610 210.0798 211.0581 10
cole_base2 7688.4689 7948.5534 8159.32689 8224.02570 8358.9145 8560.0802 10
cole_base3 760.9410 761.6176 789.35789 791.22520 812.1285 822.8938 10
Ronak_base 378.2914 381.9018 403.30458 403.65600 418.5159 431.2887 10
Ronak_dplyr 7025.7606 7045.9863 7217.55143 7150.09070 7395.1977 7505.7091 10
akrun_base 26.3189 27.2791 28.90526 28.03645 29.3622 33.5207 10
#10,000 rows
#100 groups
Unit: milliseconds
expr min lq mean median uq max neval
dt_version 32.6928 33.4362 36.27415 37.34835 38.8137 39.9793 10
cole_base2 770.1962 817.3142 847.01249 846.13940 893.4095 900.8028 10
cole_base3 97.5201 101.1023 108.46434 102.01210 105.9185 165.3160 10
Ronak_base 115.7445 117.9968 128.06018 124.27730 129.9934 170.3994 10
Ronak_dplyr 721.4570 734.6108 747.46815 735.65990 756.1121 787.0906 10
akrun_base 23.3171 24.4940 26.79405 26.55190 29.1286 30.2099 10
library(data.table)
library(microbenchmark)
library(dplyr)
n_cols <- 10
n_rows <- 1E5
n_row_per_group <- 100
set.seed(1)
matrix <- matrix(sample(100, n_rows*n_cols, replace = T), ncol = n_cols)
matrix <- cbind(matrix, factor = rep(1:(n_rows / n_row_per_group), each = n_row_per_group))
df <- data.frame(matrix)
dt <- as.data.table(df)
do.call(rbind
, lapply(unique(matrix[,'factor'])
, function(x) {
sub_mat <- matrix[matrix[, 'factor'] == x,]
sweep(-sub_mat, 2, colSums(sub_mat), '+') / (nrow(sub_mat) - 1)
})
)
microbenchmark(
# cole_base = { #too slow for lots of little groups
# do.call(rbind
# ,by(matrix[, -6]
# , matrix[, 6]
# , function(x) mapply(`-`, colSums(x), x) / (nrow(x) - 1)
# )
# )
# },
dt_version = {
dt[, lapply(.SD, function(x) (sum(x) - x) / (.N - 1)) , by = 'factor']
}
,cole_base2 = {
do.call(rbind
, lapply(unique(matrix[,'factor'])
, function(x) {
sub_mat <- matrix[matrix[, 'factor'] == x,]
sweep(-sub_mat, 2, colSums(sub_mat), '+') / (nrow(sub_mat) - 1)
})
)
}
,cole_base3 = {
do.call(rbind
,tapply(seq_len(nrow(matrix))
, matrix[, 'factor']
, FUN = function(i) sweep(-matrix[i, -length(matrix)], 2, colSums(matrix[i, -length(matrix)]), `+`) / (length(i)-1)
, simplify = F)
)
}
,Ronak_base = {
lapply(df[-ncol(df)], function(x)
ave(x, df$factor, FUN = function(x) (sum(x) - x)/(length(x) - 1)))
}
# ,G_fern_base = { #pretty slow, i hardcoded the factor row - it needs fixed slightly
# do.call(rbind,
# lapply(levels(factor(matrix[,11])),function(x) {
# list=as.list(NULL)
# index=which(matrix[,11]==x)
# for(i in 1:length(index)){
# if(length(index)>2){
# list[[i]]=colSums(matrix[index[-i],])
# }else{
# list[[i]]=matrix[index[-i],]
# }
# list[[i]]=list[[i]][-11]/(length(index)-1)
# }
# return(do.call(rbind,list))
# })
# )
# }
, Ronak_dplyr = {
df %>%
group_by(factor) %>%
mutate_at(vars(-group_cols()), ~(sum(.)-.)/ (n() - 1))
}
, akrun_base = {
n1 <- tabulate(matrix[, ncol(matrix)])
m1 <- rowsum(matrix[,-ncol(matrix)], group = matrix[,ncol(matrix)])
(m1[rep(seq_len(nrow(m1)), n1),] - matrix[, -ncol(matrix)])/rep(n1 - 1, n1)
}
, times = 10
)