Rank values without skipping a rank for values after duplicates

The returned vector must be the same length as the original; must not skip ranks; must give same rank for duplicate values.

Input:

x <- c(1, 5, 10, 10, 50)

Desired Output:

[1] 1 2 3 3 4

Below is close but skips 4, and ties.method = "first" gives 1 2 3 4 5. I can't use unique(), because that will return different length vector.

rank(c(1, 5, 10, 10, 50), ties.method =  "min")
[1] 1 2 3 3 5

Solution

Just for fun, here's a benchmark of the proposed solutions.

dplyr::dense_rank() has a decisive lead.
factor/levels is particularly inefficient for non-integer vectors.
match/unique has a slight edge over using factor, again, putting aside non-integer vectors.
data.table::frank(..., ties.method = "dense") is not particularly impressive, but it performs well on huge vectors.

## Dense Rank functions
factor_method <- \(x) {
  as.numeric(factor(x, levels = unique(sort(x))))
}

dplyr_dense_rank <- \(x) {
  dplyr::dense_rank(x)
}

match_unique <- \(x) {
  match(x, sort(unique(x)))
}

dt_frank <- \(x){
  data.table::frank(x, ties.method = "dense")
}

## Test datasets
set.seed(42)

small_vec <- runif(100, 1, 20)
huge_vec <- runif(100000, 1, 2000)

small_int <- sample(1:20, 100, replace = TRUE)
medium_int <- sample(1:100, 1000, replace = TRUE)
large_int <- sample(1:500, 10000, replace = TRUE)
huge_int <- sample(1:2000, 1000000, replace = TRUE)

mostly_dups <- sample(1:10, 10000, replace = TRUE)
mostly_unique <- sample(1:9500, 10000, replace = TRUE)
sorted <- sort(sample(1:1000, 5000, replace = TRUE))
rev_sorted <- rev(sorted)

## Benchmarks
mb <- \(dat, tms) {
  microbenchmark::microbenchmark(
    factor_method = factor_method(dat),
    match_unique = match_unique(dat),
    dplyr_dense_rank = dplyr_dense_rank(dat),
    times = tms
  )
}

mb1 <- mb(small_vec, 1000)
mb2 <- mb(huge_vec, 20)
mb3 <- mb(small_int, 1000)
mb4 <- mb(medium_int, 500)
mb5 <- mb(large_int, 200)
mb6 <- mb(huge_int, 20)
mb7 <- mb(mostly_dups, 100)
mb8 <- mb(mostly_unique, 100)
mb9 <- mb(sorted, 200)
mb10 <- mb(rev_sorted, 200)

all_benchmarks <- rbind(
  data.frame(mb1, dataset = "Small (100)", size = 100),
  data.frame(mb2, dataset = "Huge (1M)", size = 1000000),
  data.frame(mb3, dataset = "Small Int (100)", size = 100),
  data.frame(mb4, dataset = "Medium Int (1K)", size = 1000),
  data.frame(mb5, dataset = "Large Int (10K)", size = 10000),
  data.frame(mb6, dataset = "Huge Int (1M)", size = 1000000),
  data.frame(mb7, dataset = "Mostly Dups (10K)", size = 10000),
  data.frame(mb8, dataset = "Mostly Unique (10K)", size = 10000),
  data.frame(mb9, dataset = "Sorted (5K)", size = 5000),
  data.frame(mb10, dataset = "Rev Sorted Int (5K)", size = 5000)
) |>
  transform(dataset = 
              factor(dataset,
                     levels = c("Small (100)", "Small Int (100)",
                                "Medium Int (1K)", 
                                "Large Int (10K)",
                                "Huge (1M)", "Huge Int (1M)",
                                "Mostly Dups (10K)", "Mostly Unique (10K)",
                                "Sorted (5K)", "Rev Sorted Int (5K)")))

## Visualization
library(ggplot2)

ggplot(all_benchmarks, aes(x = dataset, y = time/1000000, fill = expr)) +
  geom_violin() +
  scale_y_log10() +
  labs(
    title = "Dense Ranking Methods Performance Comparison",
    subtitle = "Execution time in milliseconds (log scale) - Lower is better",
    x = "Dataset Type",
    y = "Time (milliseconds)",
    fill = "Method"
  ) +
  facet_wrap(~dataset, nrow = 1, 
             scales = "free_x", strip.position="bottom") + 
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks = element_blank())