rdataframedplyr

Splitting multiple values among multiple values


This is a follow-up question: Adjusting values in a table based on another table

I have two tables:

myt_repeating <- data.frame(
  name = c("a", "a", "a", "b", "b", "c", "c", "c", "d", "e", "e"),
  var1 = c(120, 80, 100, 20, 30, 15, 25, 35, 40, 50, 60), 
  var2 = c(90, 60, 70, 25, 35, 20, 30, 40, 45, 55, 65),
  var3 = c(110, 70, 90, 30, 40, 25, 35, 45, 50, 60, 70)
)

rel_table <- data.frame(
  name = c("a", "b", "c"),
  proportion = c(0.5, 0.4, 0.1)
)

My Question: For var1 and var2

I could not think of any way to do this without a if loop:

redistribute <- function(x, val.cols, name.col, prop, from) {
   val.col <- match(val.cols, names(x))
   x.names <- x[[name.col]]
   prop.names <- prop[[1]]
   rows <- match(prop.names, x.names)
   
   from_totals <- colSums(x[x.names == from, val.col, drop=FALSE])
   
   for(i in seq_along(prop.names)) {
       group_name <- prop.names[i]
       group_prop <- prop[[2]][i]
       group_rows <- x.names == group_name
       n_rows <- sum(group_rows)
       
       if(group_name == from) {
           x[group_rows, val.col] <- x[group_rows, val.col] * group_prop
       } else {
           add_values <- (from_totals * group_prop) / n_rows
           x[group_rows, val.col] <- x[group_rows, val.col] + 
              matrix(add_values, nrow=n_rows, ncol=length(val.col), byrow=TRUE)
       }
   }
   x
}

result <- redistribute(myt_repeating, val.cols=c("var1", "var2"),
                       name.col="name", prop=rel_table, from="a")

Is there a more concise way to do this?


Solution

  • Same as previous question, here we just need to consider/divide by number of rows in each group.

    library(dplyr)
    library(rlang)
    
    distribute_fn_multi <- function(dat, idCol, fromRow, toRows, colVars, props) {
      all_targets <- c(fromRow, toRows)
      prop_vec <- setNames(props, all_targets)
      source_totals <- dat %>% 
        filter({{ idCol }} == fromRow) %>% 
        summarise(across({{ colVars }}, sum)) %>%
        as.list()
      group_counts <- dat %>%
        filter({{ idCol }} %in% all_targets) %>%
        count({{ idCol }}, name = "n_rows")
      
      dat %>%
        left_join(group_counts, by = as_label(enquo(idCol))) %>%
        mutate(
          across({{ colVars }}, ~ {
            current_id <- {{ idCol }}
            case_when(
              current_id == fromRow ~ source_totals[[cur_column()]] * 
                prop_vec[as.character(fromRow)] / n_rows,
              current_id %in% toRows ~ .x + 
                (source_totals[[cur_column()]] * 
                   prop_vec[as.character(current_id)] / n_rows),
              TRUE ~ .x
            )
          })
        ) %>%
        select(-n_rows)
    }
    
    distribute_fn_multi(
      dat = myt_repeating, 
      idCol = name, 
      fromRow = "a", 
      toRows = c("b", "c"), 
      colVars = c(var1, var2), 
      props = c(0.5, 0.4, 0.1)
    )
    
    # >    name var1     var2 var3
    # > 1     a   50 36.66667  110
    # > 2     a   50 36.66667   70
    # > 3     a   50 36.66667   90
    # > 4     b   80 69.00000   30
    # > 5     b   90 79.00000   40
    # > 6     c   25 27.33333   25
    # > 7     c   35 37.33333   35
    # > 8     c   45 47.33333   45
    # > 9     d   40 45.00000   50
    # > 10    e   50 55.00000   60
    # > 11    e   60 65.00000   70