rdataframemutateacross

Mutate a new variable across multiple variables


I have this following data.frame. Each value in CP variables are in the same format -> hueX:Y-Z.

hueX is always the same in a row.

I would like to create another variable with value equal to

data_trial = data.frame(hue=c(2,8,3,2,5),
          CP_A=c("hue2:6789-99987", "hue8:7854-98743","hue3:60987-123423","hue2:7658-873457","hue5:45658-676549"),
          CP_B=c("hue2:6782-99987", "hue8:7859-98734","hue3:60989-123407","","hue5:45697-676598"),
          CP_C=c("hue2:6785-99989", "hue8:6797-99980","hue3:60995-123434","hue2:7657-8734509","hue5:45667-676500"),
          CP_D=c("", "hue8:6756-99987","hue3:60942-123412","hue2:7650-87345065","hue5:45699-676565"),
          TCK_A=c("Yes", "", "Yes", "Yes", "Yes"))

> data_trial
  hue              CP_A              CP_B              CP_C               CP_D TCK_A
1   2   hue2:6789-99987   hue2:6782-99987   hue2:6785-99989                      Yes
2   8   hue8:7854-98743   hue8:7859-98734   hue8:6797-99980    hue8:6756-99987      
3   3 hue3:60987-123423 hue3:60989-123407 hue3:60995-123434  hue3:60942-123412   Yes
4   2  hue2:7658-873457                   hue2:7657-8734509 hue2:7650-87345065   Yes
5   5 hue5:45658-676549 hue5:45697-676598 hue5:45667-676500  hue5:45699-676565   Yes


output

  hue              CP_A              CP_B              CP_C               CP_D TCK_A             output
1   2   hue2:6789-99987   hue2:6782-99987   hue2:6785-99989                      Yes    hue2:6784-99988
2   8   hue8:7854-98743   hue8:7859-98734   hue8:6797-99980    hue8:6756-99987          hue8:7854-98743
3   3 hue3:60987-123423 hue3:60989-123407 hue3:60995-123434  hue3:60942-123412   Yes  hue3:60975-123418
4   2  hue2:7658-873457                   hue2:7657-8734509 hue2:7650-87345065   Yes hue2:7654-48039787
5   5 hue5:45658-676549 hue5:45697-676598 hue5:45667-676500  hue5:45699-676565   Yes  hue5:45688-676554

What I have tried :


  data_trial %>% 
  separate(CP_B, into=c("hueX_B","Y_BxZ_B"), sep=":") %>%
  separate(CP_C, into=c("hueX_C","Y_CxZ_C"), sep=":") %>%
  separate(CP_D, into=c("hueX_D","Y_DxZ_D"), sep=":") %>%
  separate(Y_BxZ_B, into=c("Y_B", "Z_B"), sep="-") %>%
  separate(Y_CxZ_C, into=c("Y_C", "Z_C"), sep="-") %>%
  separate(Y_DxZ_D, into=c("Y_D", "Z_D"), sep="-") %>%
  mutate(Y_B=as.numeric(Y_B)) %>%
  mutate(Y_C=as.numeric(Y_C)) %>%
  mutate(Y_D=as.numeric(Y_D)) %>%
  mutate(Z_B=as.numeric(Z_B)) %>%
  mutate(Z_C=as.numeric(Z_C)) %>%
  mutate(Z_D=as.numeric(Z_D)) %>%
  rowwise %>%
  mutate(CP_output=ifelse(TCK_A=="Yes", paste0("hue", hue, ":", mean(across(c(Y_B, Y_C, Y_D)), na.rm=TRUE), "-", mean(across(c(Z_B,Z_C,Z_D)), na.rm=TRUE)), CP_A))

# A tibble: 5 × 13
# Rowwise: 
    hue CP_A              hueX_B   Y_B    Z_B hueX_C   Y_C     Z_C hueX_D   Y_D      Z_D TCK_A CP_output      
  <dbl> <chr>             <chr>  <dbl>  <dbl> <chr>  <dbl>   <dbl> <chr>  <dbl>    <dbl> <chr> <chr>          
1     2 hue2:6789-99987   "hue2"  6782  99987 hue2    6785   99989 ""        NA       NA "Yes" hue2:NA-NA     
2     8 hue8:7854-98743   "hue8"  7859  98734 hue8    6797   99980 "hue8"  6756    99987 ""    hue8:7854-98743
3     3 hue3:60987-123423 "hue3" 60989 123407 hue3   60995  123434 "hue3" 60942   123412 "Yes" hue3:NA-NA     
4     2 hue2:7658-873457  ""        NA     NA hue2    7657 8734509 "hue2"  7650 87345065 "Yes" hue2:NA-NA     
5     5 hue5:45658-676549 "hue5" 45697 676598 hue5   45667  676500 "hue5" 45699   676565 "Yes" hue5:NA-NA 
  

The first steps are working but might be simplified. The last step is not working. I have correct result only in the FALSE condition.

I still have difficulties to work accross multiple columns and repeat some identical operations on multiples columns. Any ideas to help me.

EDIT

I have also tried this following code and I get correct means but uncorrect hue value (the contrary compared to my first attempt)


data_trials_2 = data_trial %>% 
  separate(CP_B, into=c("hueX_B","Y_BxZ_B"), sep=":") %>%
  separate(CP_C, into=c("hueX_C","Y_CxZ_C"), sep=":") %>%
  separate(CP_D, into=c("hueX_D","Y_DxZ_D"), sep=":") %>%
  separate(Y_BxZ_B, into=c("Y_B", "Z_B"), sep="-") %>%
  separate(Y_CxZ_C, into=c("Y_C", "Z_C"), sep="-") %>%
  separate(Y_DxZ_D, into=c("Y_D", "Z_D"), sep="-") %>%
  mutate(Y_B=as.numeric(Y_B)) %>%
  mutate(Y_C=as.numeric(Y_C)) %>%
  mutate(Y_D=as.numeric(Y_D)) %>%
  mutate(Z_B=as.numeric(Z_B)) %>%
  mutate(Z_C=as.numeric(Z_C)) %>%
  mutate(Z_D=as.numeric(Z_D))

  data_trials_2$CP_output= paste0("hue", rowwise(data_trials_2[,1]), ":", round(rowMeans(data_trials_2[,c(4,7,10)], na.rm=TRUE)) , "-",    round(rowMeans(data_trials_2[,c(5,8,11)], na.rm=TRUE)))

data_trials_2$CP_output
[1] "huec(2, 8, 3, 2, 5):6783.5-99988"                     
[2] "huec(2, 8, 3, 2, 5):7137.33333333333-99567"           
[3] "huec(2, 8, 3, 2, 5):60975.3333333333-123417.666666667"
[4] "huec(2, 8, 3, 2, 5):7653.5-48039787"                  
[5] "huec(2, 8, 3, 2, 5):45687.6666666667-676554.333333333"

Solution

  • separate from tidyr has been superseded, so I will use separate_wider_delim.

    Once columns are separated, you end up with convenient column names to use tidy-select functions like contains, allowing you to mutate across in place of multiple mutate calls.

    This example doesn't keep the original column data, but you can by setting cols_remove = FALSE in separate_wider_delim.

    Edit Updated to replace rowwise operations with rowMeans.

    library(dplyr)
    library(tidyr)
    
    data_trial %>%
      separate_wider_delim(starts_with('CP'), delim = ':', names = c('hue', 'values'),
                           names_sep = '_', too_few = 'align_end') %>%
      separate_wider_delim(ends_with('_values'), delim = '-', names = c('Y', 'Z'),
                           names_sep = '_', too_few = 'align_end') %>%
      mutate(across(contains('values'), as.numeric)) %>%
      mutate(mean_Y = rowMeans(across(matches("[^A]_values_Y")), na.rm = T),
             mean_Z = rowMeans(across(matches("[^A]_values_Z")), na.rm = T)) %>%
      mutate(output = if_else(TCK_A != 'Yes',
                              paste0('hue',hue,':',CP_A_values_Y,'-',CP_A_values_Z),
                              paste0('hue',hue,':', mean_Y, '-', mean_Z))) %>%
      glimpse()
    #> Rows: 5
    #> Columns: 17
    #> $ hue           <dbl> 2, 8, 3, 2, 5
    #> $ CP_A_hue      <chr> "hue2", "hue8", "hue3", "hue2", "hue5"
    #> $ CP_A_values_Y <dbl> 6789, 7854, 60987, 7658, 45658
    #> $ CP_A_values_Z <dbl> 99987, 98743, 123423, 873457, 676549
    #> $ CP_B_hue      <chr> "hue2", "hue8", "hue3", NA, "hue5"
    #> $ CP_B_values_Y <dbl> 6782, 7859, 60989, NA, 45697
    #> $ CP_B_values_Z <dbl> 99987, 98734, 123407, NA, 676598
    #> $ CP_C_hue      <chr> "hue2", "hue8", "hue3", "hue2", "hue5"
    #> $ CP_C_values_Y <dbl> 6785, 6797, 60995, 7657, 45667
    #> $ CP_C_values_Z <dbl> 99989, 99980, 123434, 8734509, 676500
    #> $ CP_D_hue      <chr> NA, "hue8", "hue3", "hue2", "hue5"
    #> $ CP_D_values_Y <dbl> NA, 6756, 60942, 7650, 45699
    #> $ CP_D_values_Z <dbl> NA, 99987, 123412, 87345065, 676565
    #> $ TCK_A         <chr> "Yes", "", "Yes", "Yes", "Yes"
    #> $ mean_Y        <dbl> 6783.500, 7137.333, 60975.333, 7653.500, 45687.667
    #> $ mean_Z        <dbl> 99988.0, 99567.0, 123417.7, 48039787.0, 676554.3
    #> $ output        <chr> "hue2:6783.5-99988", "hue8:7854-98743", "hue3:60975.3333…
    

    Data

    data_trial <- data.frame(hue=c(2,8,3,2,5),
                            CP_A=c("hue2:6789-99987", "hue8:7854-98743","hue3:60987-123423","hue2:7658-873457","hue5:45658-676549"),
                            CP_B=c("hue2:6782-99987", "hue8:7859-98734","hue3:60989-123407","","hue5:45697-676598"),
                            CP_C=c("hue2:6785-99989", "hue8:6797-99980","hue3:60995-123434","hue2:7657-8734509","hue5:45667-676500"),
                            CP_D=c("", "hue8:6756-99987","hue3:60942-123412","hue2:7650-87345065","hue5:45699-676565"),
                            TCK_A=c("Yes", "", "Yes", "Yes", "Yes"))