past
vector is the set of o3.cpt
s found in the past period.
current
vector is the set of o3.cpt
s found in the current period.
Each o3.cpt
has a set of numbers that are associated with the code (PCs in numbers
data frame).
I am trying to get the sum of differences of PCs for every pair of past and current o3.cpt
.
In the original dataset, there are more than 700 PC columns in numbers
data frame.
The below code is what I am currently using, but it is running too slow (I need to repeat this process for more than 700,000 times).
Would there be more efficient way to compute these sum_diff
?
past = sample(1:40, 20, replace=F)
current = sample(41:80, 20, replace=F)
set.seed(100)
numbers <-
data.frame(
o3.cpt = c(past, current),
PC1 = runif(length(c(past, current)), min = -3, max = 3),
PC2 = runif(length(c(past, current)), min = 1, max = 10),
PC3 = runif(length(c(past, current)), min = -4, max = 2),
PC4 = runif(length(c(past, current)), min = -9, max = 8),
PC5 = runif(length(c(past, current)), min = 4, max = 9)
)
pairs <-
expand.grid(
past,
current
)
if(nrow(pairs) > 0){
sum_diff <-
pairs %>%
ddply(c('Var1', 'Var2'), function(j){
p <-
numbers %>%
filter(o3.cpt == j$Var1[1]) %>%
select(starts_with('PC')) %>%
t %>%
data.frame %>%
.$.
c<-
numbers %>%
filter(o3.cpt == j$Var2[1]) %>%
select(starts_with('PC')) %>%
t %>%
data.frame %>%
.$.
data.frame(diff = sum(abs(p - c)))
})
}
d<-
structure(list(o3.month.date = structure(c(16953, 16922, 16922,
16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922,
16953, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922,
16922, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953,
16953, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922,
16922, 16922, 16922, 16922, 16922, 16922, 16953, 16953, 16953,
16953, 16953, 16922, 16922, 16922, 16922, 16922, 16922, 16922,
16922, 16922, 16922, 16922, 16922, 16922, 16922, 16953, 16953,
16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953,
16953, 16953, 16953, 16922, 16922, 16953, 16953, 16953, 16953,
16953, 16953, 16953, 16922, 16953, 16953, 16953, 16953, 16953,
16953, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922,
16953, 16953, 16922, 16922, 16922, 16953, 16953, 16953, 16953,
16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953,
16953, 16953, 16953, 16922, 16922, 16953, 16953, 16953, 16953,
16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953,
16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953,
16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953,
16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953,
16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953, 16953,
16953, 16953), class = "Date"), o3.month.date.previous = structure(c(16922,
16892, 16892, 16892, 16892, 16892, 16892, 16892, 16892, 16892,
16892, 16892, 16922, 16892, 16892, 16892, 16892, 16892, 16892,
16892, 16892, 16892, 16922, 16922, 16922, 16922, 16922, 16922,
16922, 16922, 16922, 16892, 16892, 16892, 16892, 16892, 16892,
16892, 16892, 16892, 16892, 16892, 16892, 16892, 16892, 16922,
16922, 16922, 16922, 16922, 16892, 16892, 16892, 16892, 16892,
16892, 16892, 16892, 16892, 16892, 16892, 16892, 16892, 16892,
16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922,
16922, 16922, 16922, 16922, 16922, 16892, 16892, 16922, 16922,
16922, 16922, 16922, 16922, 16922, 16892, 16922, 16922, 16922,
16922, 16922, 16922, 16892, 16892, 16892, 16892, 16892, 16892,
16892, 16892, 16922, 16922, 16892, 16892, 16892, 16922, 16922,
16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922,
16922, 16922, 16922, 16922, 16922, 16892, 16892, 16922, 16922,
16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922,
16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922,
16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922,
16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922,
16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922, 16922,
16922, 16922, 16922, 16922), class = "Date"),
aid = c("a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725", "a725",
"a725", "a725", "a725", "a725", "a725", "a725", "a725"),
oid = c("o11816",
"o20077", "o20077", "o20077", "o20077", "o20077", "o20077", "o20077",
"o20077", "o20077", "o20077", "o23696", "o24697", "o28255", "o29968",
"o29968", "o29968", "o29968", "o29968", "o29968", "o29968", "o29968",
"o36383", "o36383", "o36383", "o36383", "o36383", "o36383", "o36383",
"o36481", "o36481", "o37147", "o37761", "o37761", "o37761", "o37761",
"o38480", "o38480", "o38480", "o38480", "o38480", "o38480", "o38480",
"o38480", "o38729", "o38904", "o38904", "o38904", "o38904", "o38904",
"o42403", "o42403", "o42403", "o42403", "o42403", "o42403", "o42403",
"o42403", "o42579", "o43113", "o43255", "o43255", "o43486", "o43486",
"o44068", "o44068", "o44068", "o44068", "o44474", "o44474", "o45552",
"o47039", "o47039", "o47039", "o47039", "o47039", "o47039", "o47039",
"o47082", "o47082", "o47209", "o47209", "o47209", "o47209", "o47209",
"o47209", "o48228", "o48509", "o48943", "o48943", "o48943", "o48943",
"o48943", "o48943", "o49126", "o49126", "o49126", "o49126", "o49126",
"o49126", "o49890", "o50413", "o50505", "o50505", "o51150", "o51835",
"o51835", "o52116", "o52116", "o52116", "o52116", "o52116", "o52116",
"o52116", "o52116", "o52116", "o52260", "o52260", "o52606", "o52670",
"o53341", "o53341", "o53395", "o53969", "o53969", "o55166", "o55166",
"o55166", "o55761", "o55761", "o55761", "o55761", "o55761", "o56077",
"o57530", "o57574", "o57574", "o57574", "o57688", "o57722", "o57722",
"o57862", "o57862", "o57862", "o58501", "o58567", "o58567", "o58567",
"o59423", "o59423", "o59843", "o60553", "o60553", "o60553", "o60553",
"o60553", "o61995", "o62230", "o62230", "o62230", "o62839", "o63835",
"o63835", "o63835", "o64526", "o64526", "o67971", "o67971", "o67971",
"o67971", "o68299", "o68299", "o68846", "o68887", "o69530", "o69530"
),
o3.cpt = c("c3354", "c742", "c750", "c3480", "c565", "c740",
"c3462", "c730", "c729", "c562", "c3478", "c3453", "c3354", "c484",
"c740", "c750", "c3730", "c3472", "c565", "c562", "c3478", "c729",
"c565", "c731", "c740", "c750", "c3478", "c562", "c3766", "c3407",
"c3397", "c3468", "c3470", "c3478", "c3468", "c3469", "c717",
"c563", "c718", "c750", "c745", "c562", "c3488", "c3487", "c3531",
"c1226", "c1413", "c1333", "c1349", "c1414", "c723", "c725",
"c742", "c565", "c3730", "c562", "c728", "c708", "c3306", "c3354",
"c3438", "c3407", "c3356", "c3407", "c1312", "c1226", "c1216",
"c1228", "c1239", "c1234", "c534", "c731", "c562", "c565", "c729",
"c740", "c3478", "c750", "c3456", "c3450", "c717", "c563", "c562",
"c718", "c745", "c750", "c3501", "c748", "c562", "c3489", "c750",
"c745", "c717", "c563", "c562", "c750", "c718", "c3489", "c717",
"c745", "c3335", "c3472", "c3472", "c3478", "c1154", "c3424",
"c3356", "c731", "c3478", "c729", "c562", "c742", "c739", "c565",
"c750", "c728", "c3411", "c3412", "c3377", "c3422", "c3478",
"c3480", "c3503", "c3435", "c3406", "c3730", "c3478", "c3480",
"c750", "c563", "c562", "c745", "c717", "c3342", "c2276", "c3357",
"c3407", "c3730", "c3337", "c3430", "c3407", "c3730", "c3407",
"c3357", "c3468", "c3356", "c3407", "c3730", "c3407", "c3430",
"c1066", "c750", "c731", "c562", "c742", "c732", "c17", "c1327",
"c346", "c402", "c3440", "c3517", "c3730", "c3513", "c2569",
"c2565", "c256", "c362", "c343", "c390", "c1060", "c1167", "c289",
"c1067", "c133", "c151")),
row.names = c(NA, -176L),
class = "data.frame")
sample_cpts <-
c("c3354", "c742", "c750", "c3480", "c565", "c740", "c3462",
"c730", "c729", "c562", "c3478", "c3453", "c484", "c3730", "c3472",
"c731", "c3766", "c3407", "c3397", "c3468", "c3470", "c3469",
"c717", "c563", "c718", "c745", "c3488", "c3487", "c3531", "c1226",
"c1413", "c1333", "c1349", "c1414", "c723", "c725", "c728", "c708",
"c3306", "c3438", "c3356", "c1312", "c1216", "c1228", "c1239",
"c1234", "c534", "c3456", "c3450", "c3501", "c748", "c3489",
"c3335", "c1154", "c3424", "c739", "c3411", "c3412", "c3377",
"c3422", "c3503", "c3435", "c3406", "c3342", "c2276", "c3357",
"c3337", "c3430", "c1066", "c732", "c17", "c1327", "c346", "c402",
"c3440", "c3517", "c3513", "c2569", "c2565", "c256", "c362",
"c343", "c390", "c1060", "c1167", "c289", "c1067", "c133", "c151"
)
set.seed(100)
pcs <-
data.frame(
o3.cpt = sample_cpts,
PC1 = runif(length(sample_cpts), min = -3, max = 3),
PC2 = runif(length(sample_cpts), min = 1, max = 10),
PC3 = runif(length(sample_cpts), min = -4, max = 2),
PC4 = runif(length(sample_cpts), min = -9, max = 8),
PC5 = runif(length(sample_cpts), min = 4, max = 9)
)
d1 <-
d %>%
# filter(aid == 'a725') %>%
ddply('aid', function(x){
mean_diff <-
x %>%
# filter(o3.month.date == '2016-05-01')
ddply('o3.month.date', function(i){
past <-
x %>%
filter(o3.month.date == i$o3.month.date.previous[1]) %>%
# select(o3.cpt, aid, o3.month.date, o3.month.date.previous, PC1, PC2, PC3) %>%
select(o3.cpt, starts_with('PC')) %>%
distinct
current <-
i %>%
# select(oid, o3.cpt, aid, o3.month.date, o3.month.date.previous, PC1, PC2, PC3) %>%
select(o3.cpt, starts_with('PC'))%>%
distinct
intersect <- intersect(past$o3.cpt, current$o3.cpt)
past <-
past %>%
filter(!(o3.cpt %in% intersect)) %>%
.$o3.cpt
current <-
current %>%
filter(!(o3.cpt %in% intersect)) %>%
.$o3.cpt
pairs <-
expand.grid(
past,
current
)
if(nrow(pairs) == 0){
output <-
data.frame(mean_diff = NA)
}
if(nrow(pairs) > 0){
sum_diff <-
pairs %>%
ddply(c('Var1', 'Var2'), function(j){
p <-
pcs %>%
filter(o3.cpt == j$Var1[1]) %>%
select(starts_with('PC')) %>%
t %>%
data.frame %>%
.$.
c<-
pcs %>%
filter(o3.cpt == j$Var2[1]) %>%
select(starts_with('PC')) %>%
t %>%
data.frame %>%
.$.
data.frame(diff = sum(abs(p - c)))
})
output <-
sum_diff %>%
summarise(
mean_diff = mean(diff, na.rm = T)
)
}
output
})
mean_diff
}, .parallel = T)
d1
This is the pairwise Manhattan distance between the matrices made up of the past
and current
observations. Rfast::dista
is a very fast implementation. The following runs in less than 0.0002 seconds.
pairs$diff <- c(
Rfast::dista(
as.matrix(numbers[match(past, numbers$o3.cpt), -1]),
as.matrix(numbers[match(current, numbers$o3.cpt), -1]),
type = "manhattan"
)
)
Timing:
microbenchmark::microbenchmark(
dista = {pairs$diff <- c(
dista(
as.matrix(numbers[match(past, numbers$o3.cpt), -1]),
as.matrix(numbers[match(current, numbers$o3.cpt), -1]),
type = "manhattan"
)
)}
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> dista 177.101 181.401 187.5551 183.301 185.4515 346.101 100