The returned vector must be the same length as the original; must not skip ranks; must give same rank for duplicate values.
Input:
x <- c(1, 5, 10, 10, 50)
Desired Output:
[1] 1 2 3 3 4
Below is close but skips 4, and ties.method = "first"
gives 1 2 3 4 5
. I can't use unique(), because that will return different length vector.
rank(c(1, 5, 10, 10, 50), ties.method = "min")
[1] 1 2 3 3 5
Just for fun, here's a benchmark of the proposed solutions.
dplyr::dense_rank()
has a decisive lead.
factor
/levels
is particularly inefficient for non-integer vectors.
match
/unique
has a slight edge over using factor
, again, putting aside non-integer vectors.
data.table::frank(..., ties.method = "dense")
is not particularly impressive, but it performs well on huge vectors.
## Dense Rank functions
factor_method <- \(x) {
as.numeric(factor(x, levels = unique(sort(x))))
}
dplyr_dense_rank <- \(x) {
dplyr::dense_rank(x)
}
match_unique <- \(x) {
match(x, sort(unique(x)))
}
dt_frank <- \(x){
data.table::frank(x, ties.method = "dense")
}
## Test datasets
set.seed(42)
small_vec <- runif(100, 1, 20)
huge_vec <- runif(100000, 1, 2000)
small_int <- sample(1:20, 100, replace = TRUE)
medium_int <- sample(1:100, 1000, replace = TRUE)
large_int <- sample(1:500, 10000, replace = TRUE)
huge_int <- sample(1:2000, 1000000, replace = TRUE)
mostly_dups <- sample(1:10, 10000, replace = TRUE)
mostly_unique <- sample(1:9500, 10000, replace = TRUE)
sorted <- sort(sample(1:1000, 5000, replace = TRUE))
rev_sorted <- rev(sorted)
## Benchmarks
mb <- \(dat, tms) {
microbenchmark::microbenchmark(
factor_method = factor_method(dat),
match_unique = match_unique(dat),
dplyr_dense_rank = dplyr_dense_rank(dat),
times = tms
)
}
mb1 <- mb(small_vec, 1000)
mb2 <- mb(huge_vec, 20)
mb3 <- mb(small_int, 1000)
mb4 <- mb(medium_int, 500)
mb5 <- mb(large_int, 200)
mb6 <- mb(huge_int, 20)
mb7 <- mb(mostly_dups, 100)
mb8 <- mb(mostly_unique, 100)
mb9 <- mb(sorted, 200)
mb10 <- mb(rev_sorted, 200)
all_benchmarks <- rbind(
data.frame(mb1, dataset = "Small (100)", size = 100),
data.frame(mb2, dataset = "Huge (1M)", size = 1000000),
data.frame(mb3, dataset = "Small Int (100)", size = 100),
data.frame(mb4, dataset = "Medium Int (1K)", size = 1000),
data.frame(mb5, dataset = "Large Int (10K)", size = 10000),
data.frame(mb6, dataset = "Huge Int (1M)", size = 1000000),
data.frame(mb7, dataset = "Mostly Dups (10K)", size = 10000),
data.frame(mb8, dataset = "Mostly Unique (10K)", size = 10000),
data.frame(mb9, dataset = "Sorted (5K)", size = 5000),
data.frame(mb10, dataset = "Rev Sorted Int (5K)", size = 5000)
) |>
transform(dataset =
factor(dataset,
levels = c("Small (100)", "Small Int (100)",
"Medium Int (1K)",
"Large Int (10K)",
"Huge (1M)", "Huge Int (1M)",
"Mostly Dups (10K)", "Mostly Unique (10K)",
"Sorted (5K)", "Rev Sorted Int (5K)")))
## Visualization
library(ggplot2)
ggplot(all_benchmarks, aes(x = dataset, y = time/1000000, fill = expr)) +
geom_violin() +
scale_y_log10() +
labs(
title = "Dense Ranking Methods Performance Comparison",
subtitle = "Execution time in milliseconds (log scale) - Lower is better",
x = "Dataset Type",
y = "Time (milliseconds)",
fill = "Method"
) +
facet_wrap(~dataset, nrow = 1,
scales = "free_x", strip.position="bottom") +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.ticks = element_blank())