Considering groups (`gp`) of n results each (`ngp`), how can I select/subset a given number of results per group (`nesgp`) so that the selected values are spaced as evenly as possible between the minimum and maximum (both necessarily included), flagging them in a new column `selec`?
Edit: Ideally, unselected results should appear as `NA`
in the new `selec`
column, and the selected results should not be duplicated.
> print(dat, n=56)
# A tibble: 56 x 4
gp result ngp nesgp
<chr> <dbl> <dbl> <dbl>
1 CA 1.64 24 15
2 CA 1.69 24 15
3 CA 1.71 24 15
4 CA 1.74 24 15
5 CA 1.78 24 15
6 CA 1.82 24 15
7 CA 1.86 24 15
8 CA 1.9 24 15
9 CA 1.94 24 15
10 CA 1.98 24 15
11 CA 2.6 24 15
12 CA 2.65 24 15
13 CA 2.71 24 15
14 CA 2.76 24 15
15 CA 2.83 24 15
16 CA 2.89 24 15
17 CA 2.94 24 15
18 CA 3 24 15
19 CA 3.22 24 15
20 CA 3.42 24 15
21 CA 3.47 24 15
22 CA 3.68 24 15
23 CA 3.85 24 15
24 CA 4.38 24 15
25 ASAT 9 20 12
26 ASAT 11 20 12
27 ASAT 51 20 12
28 ASAT 61 20 12
29 ASAT 69 20 12
30 ASAT 78 20 12
31 ASAT 89 20 12
32 ASAT 102 20 12
33 ASAT 111 20 12
34 ASAT 120 20 12
35 ASAT 146 20 12
36 ASAT 163 20 12
37 ASAT 189 20 12
38 ASAT 208 20 12
39 ASAT 218 20 12
40 ASAT 304 20 12
41 ASAT 332 20 12
42 ASAT 345 20 12
43 ASAT 362 20 12
44 ASAT 402 20 12
45 ORO 0.56 12 8
46 ORO 0.7 12 8
47 ORO 0.77 12 8
48 ORO 0.78 12 8
49 ORO 0.82 12 8
50 ORO 0.82 12 8
51 ORO 0.92 12 8
52 ORO 0.94 12 8
53 ORO 1.16 12 8
54 ORO 1.46 12 8
55 ORO 1.54 12 8
56 ORO 1.77 12 8
Data
# Example data (dput output): 56 results in three groups ("CA", "ASAT",
# "ORO"), with the group size (ngp) and the desired number of evenly
# spaced selections (nesgp) repeated on every row of the group.
dat <-
structure(list(gp = c("CA", "CA", "CA", "CA", "CA", "CA", "CA",
"CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA",
"CA", "CA", "CA", "CA", "CA", "CA", "ASAT", "ASAT", "ASAT", "ASAT",
"ASAT", "ASAT", "ASAT", "ASAT", "ASAT", "ASAT", "ASAT", "ASAT",
"ASAT", "ASAT", "ASAT", "ASAT", "ASAT", "ASAT", "ASAT", "ASAT",
"ORO", "ORO", "ORO", "ORO", "ORO", "ORO", "ORO", "ORO", "ORO",
"ORO", "ORO", "ORO"), result = c(1.64, 1.69, 1.71, 1.74, 1.78,
1.82, 1.86, 1.9, 1.94, 1.98, 2.6, 2.65, 2.71, 2.76, 2.83, 2.89,
2.94, 3, 3.22, 3.42, 3.47, 3.68, 3.85, 4.38, 9, 11, 51, 61, 69,
78, 89, 102, 111, 120, 146, 163, 189, 208, 218, 304, 332, 345,
362, 402, 0.56, 0.7, 0.77, 0.78, 0.82, 0.82, 0.92, 0.94, 1.16,
1.46, 1.54, 1.77), ngp = c(24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12),
nesgp = c(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -56L))
Thanks for help.
I'm not sure exactly what you mean by "spaced as evenly as possible", but here is an example that randomly samples subsets of points and minimizes the variance of the gaps between them; it could be a good starting point for you:
# First approach: random search.  For each group, repeatedly sample random
# subsets of the interior points (the group min and max are always kept) and
# keep the subset whose gaps between consecutive selected values have the
# smallest variance, i.e. the most even spacing.
par(mfrow = c(length(unique(dat$gp)), 1))
dat$selec <- NA
# for each group,
groups <- unique(dat$gp)
for (g in groups) {
  in_group <- dat$gp == g
  x <- dat$result[in_group]
  minmax_x <- range(x)
  # interior candidates: every value not equal to the min or max
  possible_xs <- x[!(x %in% minmax_x)]
  # run a lot of samples of different possible lengths to test
  r <- replicate(
    20000,
    sort(c(minmax_x,
           sample(possible_xs,
                  size = sample(3:length(possible_xs), 1))))
  )
  # variance of the gaps between consecutive values: 0 would be perfectly even
  spreads <- sapply(r, function(obj) var(diff(obj)))
  minimized_variance_index <- which.min(spreads)
  # BUG FIX: restrict the match to the current group's rows; matching on
  # dat$result alone could also flag equal values belonging to other groups
  dat$selec[in_group & dat$result %in% r[[minimized_variance_index]]] <- 1
  # visualize
  plot(x, rep(1, length(x)), yaxt = "n", ylab = "", xlab = "result",
       main = paste(g, ", spread =",
                    round(var(diff(r[[minimized_variance_index]])), 5)))
  abline(v = r[[minimized_variance_index]])
}
Note that this approach selects fewer points than the numbers you seem to be looking for.
If you would rather first define an ideal evenly spaced target and snap observed values to it, you will need to choose the number of intervals yourself, e.g. num_intervals <- length(x) - 1
Here are the functions that make the coding a little easier
#' Build `num_intervals` evenly spaced reference points spanning the range
#' of `x_values` (both the minimum and the maximum are included).
#'
#' @param x_values Numeric vector of observed values.
#' @param num_intervals Number of evenly spaced points to generate (>= 1).
#' @return A named numeric vector ("interval1", "interval2", ...) of length
#'   `num_intervals` running from min(x_values) to max(x_values).
create_equal_spaced_intervals <- function(x_values, num_intervals) {
  intervals <- seq(from = min(x_values), to = max(x_values),
                   length.out = num_intervals)
  # seq_len() is safe even for num_intervals == 0, unlike 1:num_intervals
  names(intervals) <- paste0("interval", seq_len(num_intervals))
  intervals
}
#' Snap evenly spaced reference points to the nearest observed values.
#'
#' For each x, find its nearest y; keep the set of y's that were chosen by
#' at least one x; then for each surviving y, return the single x closest
#' to it.  Ties resolve to the first match (which.min behavior).
#'
#' @param x_values Numeric vector of observed values.
#' @param y_values Numeric vector of reference points (e.g. from
#'   create_equal_spaced_intervals()).
#' @return The subset of `x_values` closest to the surviving reference points.
snap_closest_x_to_closest_y <- function(x_values, y_values) {
  # index of the minimum in each row / each column of a matrix
  rowMins <- function(a) apply(a, 1, which.min)
  colMins <- function(a) apply(a, 2, which.min)
  absolute_dist_matrix <- abs(outer(x_values, y_values, "-"))
  snapped_Ys <- unique(rowMins(absolute_dist_matrix))
  # BUG FIX: drop = FALSE keeps the matrix shape when only one y survives;
  # without it the subset collapses to a vector and apply() errors
  snapped_Xs <- colMins(absolute_dist_matrix[, snapped_Ys, drop = FALSE])
  x_values[snapped_Xs]
}
corr_of_var_fn <- function(x) round(sd(diff(x))/mean(diff(x)), 4)
And here is how to go about performing the algo
# ANALYSIS BY GP ----
# For every group, try all interval counts from 3 up to the number of
# distinct results, snap the evenly spaced boundaries to the nearest
# observed values, and keep the interval count whose snapped values are
# the most evenly spaced (smallest coefficient of variation of the gaps).
dat_by_gp <- split(dat, dat$gp, drop = TRUE)
spread_results_by_gp <- vector("list", length(dat_by_gp))
for (i in seq_along(dat_by_gp)) {
  subdat <- dat_by_gp[[i]]
  subdat$selec <- NA
  no_dupes <- which(!duplicated(subdat$result))
  vec <- subdat$result[no_dupes]
  n <- length(vec)
  spread_results <- rep(NA, n)
  # identify the best interval count by iterating from 3 to n
  # (the lower bound of 3 is arbitrary and can be changed)
  for (num_intervals in 3:n) {
    intervals <- create_equal_spaced_intervals(vec, num_intervals)
    selec <- snap_closest_x_to_closest_y(x_values = vec, y_values = intervals)
    # measure how even the selected values are
    spread_results[num_intervals] <- corr_of_var_fn(selec)
  }
  # get the MOST EVEN result (the NAs at positions 1:2 are ignored)
  number_of_intervals <- which.min(spread_results)
  selec <- snap_closest_x_to_closest_y(
    vec, create_equal_spaced_intervals(vec, number_of_intervals))
  # flag the selected (non-duplicated) rows of this group
  index <- which(subdat$result[no_dupes] %in% selec)
  subdat$selec[no_dupes][index] <- 1
  spread_results_by_gp[[i]] <- spread_results
  dat_by_gp[[i]] <- subdat
  cat(subdat$gp[1], "Using ", number_of_intervals,
      " intervals which produces a spread of ", spread_results[number_of_intervals],
      "and ", length(selec), "results\n")
}
# BUG FIX: split() orders the pieces alphabetically (ASAT, CA, ORO) while
# dat's rows start with CA, so do.call(rbind, dat_by_gp)$selec would assign
# selec values to the WRONG rows.  unsplit() restores the original order.
dat$selec <- unsplit(lapply(dat_by_gp, `[[`, "selec"), dat$gp)
We can also visualize the results by doing the following
# visualize individually below
#' Visualize, for one group's results, how each candidate interval count
#' (from 3 up to the number of distinct values) snaps to the observed data.
#' Gray solid lines mark the ideal evenly spaced boundaries; blue dashed
#' lines mark the observed values they snapped to.
#'
#' @param x Numeric vector of one group's results.
#' @return Invisibly NULL; called for its plotting side effect.
plot_individual_interval_comparison <- function(x) {
  default_plot_params <- par(no.readonly = TRUE)
  # restore the caller's graphics settings even if plotting fails midway
  on.exit(par(default_plot_params), add = TRUE)
  vec <- x[!duplicated(x)]
  n <- length(vec)
  par(mfrow = c(n - 2, 1), mar = c(0, 6, 0, 0), oma = c(3, 1, 1, 1), las = 2)
  for (num_intervals in 3:n) {
    intervals <- create_equal_spaced_intervals(vec, num_intervals)
    selec <- snap_closest_x_to_closest_y(x_values = vec, y_values = intervals)
    # spacing evenness of this candidate, shown in the margin label
    corr_of_var <- corr_of_var_fn(selec)
    # visualize
    plot(x, rep(1, length(x)), xaxt = "n", yaxt = "n", xlab = "", ylab = "")
    mtext(paste("intervals=", num_intervals, "\n", "spread=", corr_of_var),
          side = 2, line = 1, cex = .6)
    abline(v = intervals, col = "gray", lty = 1, lwd = 1)
    abline(v = selec, col = "blue", lty = 2, lwd = 2)
  }
  invisible(NULL)
}
# making the plots: one per-group interval comparison, then a 1x3 panel of
# the spread curves stored during the main analysis loop
group_names <- c("CA", "ASAT", "ORO")
for (g in group_names) {
  plot_individual_interval_comparison(dat$result[dat$gp == g])
}
par(mfrow = c(1, 3))
for (k in seq_along(group_names)) {
  plot(spread_results_by_gp[[k]], main = group_names[k],
       ylab = "spread", type = "o")
}
You'll notice this approach doesn't give you quite the same visual even spread as the previous approach.