rdataframefor-looprowsum

rowsum for multiple columns in r


I can take the sum of the target column by the levels in the categorical columns which are in catVariables. However, instead of doing this in a for loop I want to apply this to all categorical columns at once. For loop will make the code run for longer and doing this in a vectorized way will be faster.

# Data
col1 <- c("L", "R", "R", "L", "R", "L", "R", "L")
col2 <- c("R", "R", "R", "L", "L", "R", "L", "R")
col3 <- c("L", "-", "L", "R", "-", "L", "R", "-")
target <- c(1, 0, 0, 1, 1, 0, 1, 0)



dat <- data.frame("col1" = col1, "col2" = col2, "col3" = col3, "target" = target)

dat[sapply(dat, is.character)] <- lapply(dat[sapply(dat, is.character)], as.factor)
catVariables <- names(Filter(is.factor, dat))



# test
col1 <- c("L", "R", "R", "L", "R", "L", "R", "L")
col2 <- c("R", "R", "R", "L", "L", "R", "L", "R")
col3 <- c("L", "-", "L", "R", "-", "L", "R", "-")
target <- c(1, 0, 0, 1, 1, 0, 1, 0)

test_dat <- data.frame("col1" = col1, "col2" = col2, "col3" = col3, "target" = target)



for (col in catVariables){
ratios <- rowsum(dat[["target"]], dat[[col]])/sum(dat[["target"]])
print(ratios)
dat[[col]] <- ratios[match(dat[[col]],names(ratios[,1]))]
test_dat[[col]] <- ratios[match(test_dat[[col]], names(ratios[,1]))]
}

Solution

  • We may use across in dplyr for doing the rowsum on multiple columns

    library(dplyr)
    dat %>% 
      mutate(across(all_of(catVariables), 
         ~ {tmp <- rowsum(target, .x)/sum(target);
      tmp[match(.x, row.names(tmp))]}))
    

    -output

       col1 col2 col3 target
    1  0.5 0.25 0.25      1
    2  0.5 0.25 0.25      0
    3  0.5 0.25 0.25      0
    4  0.5 0.75 0.50      1
    5  0.5 0.75 0.25      1
    6  0.5 0.25 0.25      0
    7  0.5 0.75 0.50      1
    8  0.5 0.25 0.25      0
    

    Or with test_dat/train data ('dat'), an option is to loop over the test_dat, extract the corresponding column from 'dat' using column name (cur_column()) to calculate the rowsum by group, and then match the 'test_dat' column values with the row names of the output to expand the data

    test_dat %>% 
      mutate(across(all_of(catVariables), 
         ~ {tmp <- rowsum(dat[["target"]], dat[[cur_column()]])/sum(dat[["target"]]);
      tmp[match(.x, row.names(tmp))]}))
      col1 col2 col3 target
    1  0.5 0.25 0.25      1
    2  0.5 0.25 0.25      0
    3  0.5 0.25 0.25      0
    4  0.5 0.75 0.50      1
    5  0.5 0.75 0.25      1
    6  0.5 0.25 0.25      0
    7  0.5 0.75 0.50      1
    8  0.5 0.25 0.25      0