rdataframeloopsdplyrpairwise

More efficient program to create columns and sum pairwise comparisons of specific conditions in a large dataset


I have a dataset like this:

dat1 <- read.table(text = "
  nodepair 3 4 5
1    A6_A1 2 5 1
2    A6_A2 2 5 1
3    A6_A3 2 5 1
4    AL_A1 1 0 0
5     D_A6 0 3 0
6     F_A1 1 0 1
7      H_D 0 0 2
8      H_H 0 0 2 
", header = TRUE, check.names = FALSE)

And I need to write a program to efficiently create new columns which sums each pairwise comparison when a zero appears between pairs-- specifically to create the 'b' condition is when a nodepair is unique to the first variable but not the second, and the 'c' condition is when a nodepair is unique to the second and not the first. Here is the desired output:

dat2 <- read.table(text = "
  nodepair 3 4 5 3-4b 3-4c 3-5b 3-5c 4-5b 4-5c
1    A6_A1 2 5 1   NA   NA   NA   NA   NA   NA
2    A6_A2 2 5 1   NA   NA   NA   NA   NA   NA
3    A6_A3 2 5 1   NA   NA   NA   NA   NA   NA
4    AL_A1 1 0 0    1   NA    1   NA    0    0
5     D_A6 0 3 0   NA    3    0    0    3   NA
6     F_A1 1 0 1    1   NA   NA   NA   NA    1
7      H_D 0 0 2    0    0   NA    2   NA    2
8      H_H 0 0 2    0    0   NA    2   NA    2  
", header = TRUE, check.names = FALSE)

This code will work but is certainly less than ideal for my much larger dataset:

dat1 <- dat1 %>%
  mutate('3-4b' = case_when(`4` == 0 ~ as.integer(rowSums(across(c(`3`,`4`)))))) %>%
  mutate('3-4c' = case_when(`3` == 0 ~ as.integer(rowSums(across(c(`3`,`4`)))))) %>%
  mutate('3-5b' = case_when(`5` == 0 ~ as.integer(rowSums(across(c(`3`,`5`)))))) %>%
  mutate('3-5c' = case_when(`3` == 0 ~ as.integer(rowSums(across(c(`3`,`5`)))))) %>%
  
  mutate('4-5b' = case_when(`5` == 0 ~ as.integer(rowSums(across(c(`4`,`5`)))))) %>%
  mutate('4-5c' = case_when(`4` == 0 ~ as.integer(rowSums(across(c(`4`,`5`)))))) 

Solution

  • library(dplyr)
    library(purrr)
    
    t(combn(names(dat1)[-1], 2)) %>% 
      rbind(., .[,c(2,1)]) %>% 
      cbind(rep(c("b", "c"), each = nrow(.)/2)) %>% 
      as.data.frame() %>% 
      mutate(V4 = ifelse(V3 == "b", paste0(V1, "-", V2, V3), 
                                    paste0(V2, "-", V1, V3))) %>% 
      arrange(V4) %>% 
      group_split(row_number(), .keep = FALSE) %>% 
      map(., 
          ~ dat1 %>% transmute(!!.x[[4]] := 
                                 rowSums(cbind(dat1[, .x[[1]]], 
                                               dat1[, .x[[2]]])) * 
                                 ifelse(dat1[, .x[[2]]] == 0, 1, NA))) %>% 
      bind_cols(dat1, .)
      
    
    #>   nodepair 3 4 5 3-4b 3-4c 3-5b 3-5c 4-5b 4-5c
    #> 1    A6_A1 2 5 1   NA   NA   NA   NA   NA   NA
    #> 2    A6_A2 2 5 1   NA   NA   NA   NA   NA   NA
    #> 3    A6_A3 2 5 1   NA   NA   NA   NA   NA   NA
    #> 4    AL_A1 1 0 0    1   NA    1   NA    0    0
    #> 5     D_A6 0 3 0   NA    3    0    0    3   NA
    #> 6     F_A1 1 0 1    1   NA   NA   NA   NA    1
    #> 7      H_D 0 0 2    0    0   NA    2   NA    2
    #> 8      H_H 0 0 2    0    0   NA    2   NA    2
    

    Created on 2024-02-15 with reprex v2.0.2