rdataframedplyr

Proportionately redistributing values


(The last) follow up question: Adjusting values in a table based on another table, Splitting multiple values among multiple values

I have these tables in R:

myt_with_counts <- data.frame(
 name = c("a", "a", "a", "b", "b", "c", "c", "c", "d", "e", "e"),
 var1 = c(120, 80, 100, 20, 30, 15, 25, 35, 40, 50, 60), 
 var2 = c(90, 60, 70, 25, 35, 20, 30, 40, 45, 55, 65),
 count = c(10, 5, 15, 8, 12, 6, 9, 3, 7, 11, 4)
)

  rel_table <- data.frame(
      name = c("a", "b", "c"),
      proportion = c(0.5, 0.4, 0.1)
    )

My Question: I want to redistribute var1 and var2 values from group "a" to other groups based on specified proportions (50% stays, 40% to "b", 10% to "C"), where the amount removed from each "a" row and added to each receiving row is weighted by a count variable. The total sum of each value column remains unchanged after redistribution.

I really struggled to do this:

redistribute <- function(x, val.cols, name.col, count.col, prop, from) {
   val.col <- match(val.cols, names(x))
   x.names <- x[[name.col]]
   prop.names <- prop[[1]]
   rows <- match(prop.names, x.names)
   
   from_totals <- colSums(x[x.names == from, val.col, drop=FALSE])
   from_counts <- x[x.names == from, count.col]
   total_from_counts <- sum(from_counts)
   
   for(i in seq_along(prop.names)) {
       group_name <- prop.names[i]
       group_prop <- prop[[2]][i]
       group_rows <- x.names == group_name
       
       if(group_name == from) {
           proportion_to_remove <- 1 - group_prop
           amount_to_remove <- from_totals * proportion_to_remove
           for(j in seq_along(val.col)) {
               removal_per_count <- amount_to_remove[j] / total_from_counts
               x[group_rows, val.col[j]] <- x[group_rows, val.col[j]] - from_counts * removal_per_count
           }
       } else {
           group_counts <- x[group_rows, count.col]
           total_group_counts <- sum(group_counts)
           amount_to_add <- from_totals * group_prop
           for(j in seq_along(val.col)) {
               addition_per_count <- amount_to_add[j] / total_group_counts
               x[group_rows, val.col[j]] <- x[group_rows, val.col[j]] + group_counts * addition_per_count
           }
       }
   }
   x
}

result <- redistribute(myt_with_counts, val.cols=c("var1", "var2"), name.col="name", count.col="count", prop=rel_table, from="a")

Solution

  • Again, same concept as the previous question. Only here, instead of dividing by number of rows, we multiply by current count divided by total count.

    library(dplyr)
    library(rlang)
    
    distribute_fn_weighted <- \(dat, idCol, fromRow, toRows, 
                                colVars, props, countCol) {
      all_targets <- c(fromRow, toRows)
      prop_vec <- setNames(props, all_targets)
      source_totals <- dat %>% 
        filter({{ idCol }} == fromRow) %>% 
        summarise(across({{ colVars }}, sum)) %>%
        as.list()
      group_count_totals <- dat %>%
        filter({{ idCol }} %in% all_targets) %>%
        group_by({{ idCol }}) %>%
        summarise(total_count = sum({{ countCol }}), .groups = "drop")
      
      dat %>%
        left_join(group_count_totals, by = as_label(enquo(idCol))) %>%
        mutate(
          across({{ colVars }}, ~ {
            current_id <- {{ idCol }}
            current_count <- {{ countCol }}
            case_when(
              current_id == fromRow ~ 
                .x - (source_totals[[cur_column()]] *
                        (1 - prop_vec[as.character(fromRow)]) * 
                        current_count / total_count),
              current_id %in% toRows ~ .x + 
                (source_totals[[cur_column()]] * 
                   prop_vec[as.character(current_id)] * 
                   current_count / total_count),
              TRUE ~ .x
            )
          })
        ) %>%
        select(-total_count)
    }
    
    distribute_fn_weighted(
      dat = myt_with_counts, 
      idCol = name, 
      fromRow = "a", 
      toRows = c("b", "c"), 
      colVars = c(var1, var2), 
      props = c(0.5, 0.4, 0.1),
      countCol = count
    )
    #>    name var1     var2 count
    #> 1     a   70 53.33333    10
    #> 2     a   55 41.66667     5
    #> 3     a   25 15.00000    15
    #> 4     b   68 60.20000     8
    #> 5     b  102 87.80000    12
    #> 6     c   25 27.33333     6
    #> 7     c   40 41.00000     9
    #> 8     c   40 43.66667     3
    #> 9     d   40 45.00000     7
    #> 10    e   50 55.00000    11
    #> 11    e   60 65.00000     4
    

    Created on 2025-07-25 with reprex v2.1.1