rrankdense-rank

Rank values without skipping a rank for values after duplicates


The returned vector must be the same length as the original; must not skip ranks; must give same rank for duplicate values.

Input:

x <- c(1, 5, 10, 10, 50)

Desired Output:

[1] 1 2 3 3 4

Below is close but skips 4, and ties.method = "first" gives 1 2 3 4 5. I can't use unique(), because that will return different length vector.

rank(c(1, 5, 10, 10, 50), ties.method =  "min")
[1] 1 2 3 3 5

Solution

  • Just for fun, here's a benchmark of the proposed solutions.

    ## Dense Rank functions
    factor_method <- \(x) {
      as.numeric(factor(x, levels = unique(sort(x))))
    }
    
    dplyr_dense_rank <- \(x) {
      dplyr::dense_rank(x)
    }
    
    match_unique <- \(x) {
      match(x, sort(unique(x)))
    }
    
    dt_frank <- \(x){
      data.table::frank(x, ties.method = "dense")
    }
    
    ## Test datasets
    set.seed(42)
    
    small_vec <- runif(100, 1, 20)
    huge_vec <- runif(100000, 1, 2000)
    
    small_int <- sample(1:20, 100, replace = TRUE)
    medium_int <- sample(1:100, 1000, replace = TRUE)
    large_int <- sample(1:500, 10000, replace = TRUE)
    huge_int <- sample(1:2000, 1000000, replace = TRUE)
    
    mostly_dups <- sample(1:10, 10000, replace = TRUE)
    mostly_unique <- sample(1:9500, 10000, replace = TRUE)
    sorted <- sort(sample(1:1000, 5000, replace = TRUE))
    rev_sorted <- rev(sorted)
    
    ## Benchmarks
    mb <- \(dat, tms) {
      microbenchmark::microbenchmark(
        factor_method = factor_method(dat),
        match_unique = match_unique(dat),
        dplyr_dense_rank = dplyr_dense_rank(dat),
        times = tms
      )
    }
    
    mb1 <- mb(small_vec, 1000)
    mb2 <- mb(huge_vec, 20)
    mb3 <- mb(small_int, 1000)
    mb4 <- mb(medium_int, 500)
    mb5 <- mb(large_int, 200)
    mb6 <- mb(huge_int, 20)
    mb7 <- mb(mostly_dups, 100)
    mb8 <- mb(mostly_unique, 100)
    mb9 <- mb(sorted, 200)
    mb10 <- mb(rev_sorted, 200)
    
    all_benchmarks <- rbind(
      data.frame(mb1, dataset = "Small (100)", size = 100),
      data.frame(mb2, dataset = "Huge (1M)", size = 1000000),
      data.frame(mb3, dataset = "Small Int (100)", size = 100),
      data.frame(mb4, dataset = "Medium Int (1K)", size = 1000),
      data.frame(mb5, dataset = "Large Int (10K)", size = 10000),
      data.frame(mb6, dataset = "Huge Int (1M)", size = 1000000),
      data.frame(mb7, dataset = "Mostly Dups (10K)", size = 10000),
      data.frame(mb8, dataset = "Mostly Unique (10K)", size = 10000),
      data.frame(mb9, dataset = "Sorted (5K)", size = 5000),
      data.frame(mb10, dataset = "Rev Sorted Int (5K)", size = 5000)
    ) |>
      transform(dataset = 
                  factor(dataset,
                         levels = c("Small (100)", "Small Int (100)",
                                    "Medium Int (1K)", 
                                    "Large Int (10K)",
                                    "Huge (1M)", "Huge Int (1M)",
                                    "Mostly Dups (10K)", "Mostly Unique (10K)",
                                    "Sorted (5K)", "Rev Sorted Int (5K)")))
    
    ## Visualization
    library(ggplot2)
    
    ggplot(all_benchmarks, aes(x = dataset, y = time/1000000, fill = expr)) +
      geom_violin() +
      scale_y_log10() +
      labs(
        title = "Dense Ranking Methods Performance Comparison",
        subtitle = "Execution time in milliseconds (log scale) - Lower is better",
        x = "Dataset Type",
        y = "Time (milliseconds)",
        fill = "Method"
      ) +
      facet_wrap(~dataset, nrow = 1, 
                 scales = "free_x", strip.position="bottom") + 
      theme_bw() +
      theme(axis.text.x = element_blank(),
            axis.ticks = element_blank())