I have this following data.frame. Each value in CP variables are in the same format -> hueX:Y-Z.
hueX is always the same in a row.
I would like to create another variable with value equal to
data_trial = data.frame(hue=c(2,8,3,2,5),
CP_A=c("hue2:6789-99987", "hue8:7854-98743","hue3:60987-123423","hue2:7658-873457","hue5:45658-676549"),
CP_B=c("hue2:6782-99987", "hue8:7859-98734","hue3:60989-123407","","hue5:45697-676598"),
CP_C=c("hue2:6785-99989", "hue8:6797-99980","hue3:60995-123434","hue2:7657-8734509","hue5:45667-676500"),
CP_D=c("", "hue8:6756-99987","hue3:60942-123412","hue2:7650-87345065","hue5:45699-676565"),
TCK_A=c("Yes", "", "Yes", "Yes", "Yes"))
> data_trial
hue CP_A CP_B CP_C CP_D TCK_A
1 2 hue2:6789-99987 hue2:6782-99987 hue2:6785-99989 Yes
2 8 hue8:7854-98743 hue8:7859-98734 hue8:6797-99980 hue8:6756-99987
3 3 hue3:60987-123423 hue3:60989-123407 hue3:60995-123434 hue3:60942-123412 Yes
4 2 hue2:7658-873457 hue2:7657-8734509 hue2:7650-87345065 Yes
5 5 hue5:45658-676549 hue5:45697-676598 hue5:45667-676500 hue5:45699-676565 Yes
output
hue CP_A CP_B CP_C CP_D TCK_A output
1 2 hue2:6789-99987 hue2:6782-99987 hue2:6785-99989 Yes hue2:6784-99988
2 8 hue8:7854-98743 hue8:7859-98734 hue8:6797-99980 hue8:6756-99987 hue8:7854-98743
3 3 hue3:60987-123423 hue3:60989-123407 hue3:60995-123434 hue3:60942-123412 Yes hue3:60975-123418
4 2 hue2:7658-873457 hue2:7657-8734509 hue2:7650-87345065 Yes hue2:7654-48039787
5 5 hue5:45658-676549 hue5:45697-676598 hue5:45667-676500 hue5:45699-676565 Yes hue5:45688-676554
What I have tried :
data_trial %>%
separate(CP_B, into=c("hueX_B","Y_BxZ_B"), sep=":") %>%
separate(CP_C, into=c("hueX_C","Y_CxZ_C"), sep=":") %>%
separate(CP_D, into=c("hueX_D","Y_DxZ_D"), sep=":") %>%
separate(Y_BxZ_B, into=c("Y_B", "Z_B"), sep="-") %>%
separate(Y_CxZ_C, into=c("Y_C", "Z_C"), sep="-") %>%
separate(Y_DxZ_D, into=c("Y_D", "Z_D"), sep="-") %>%
mutate(Y_B=as.numeric(Y_B)) %>%
mutate(Y_C=as.numeric(Y_C)) %>%
mutate(Y_D=as.numeric(Y_D)) %>%
mutate(Z_B=as.numeric(Z_B)) %>%
mutate(Z_C=as.numeric(Z_C)) %>%
mutate(Z_D=as.numeric(Z_D)) %>%
rowwise %>%
mutate(CP_output=ifelse(TCK_A=="Yes", paste0("hue", hue, ":", mean(across(c(Y_B, Y_C, Y_D)), na.rm=TRUE), "-", mean(across(c(Z_B,Z_C,Z_D)), na.rm=TRUE)), CP_A))
# A tibble: 5 × 13
# Rowwise:
hue CP_A hueX_B Y_B Z_B hueX_C Y_C Z_C hueX_D Y_D Z_D TCK_A CP_output
<dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr>
1 2 hue2:6789-99987 "hue2" 6782 99987 hue2 6785 99989 "" NA NA "Yes" hue2:NA-NA
2 8 hue8:7854-98743 "hue8" 7859 98734 hue8 6797 99980 "hue8" 6756 99987 "" hue8:7854-98743
3 3 hue3:60987-123423 "hue3" 60989 123407 hue3 60995 123434 "hue3" 60942 123412 "Yes" hue3:NA-NA
4 2 hue2:7658-873457 "" NA NA hue2 7657 8734509 "hue2" 7650 87345065 "Yes" hue2:NA-NA
5 5 hue5:45658-676549 "hue5" 45697 676598 hue5 45667 676500 "hue5" 45699 676565 "Yes" hue5:NA-NA
The first steps are working but might be simplified. The last step is not working. I have correct result only in the FALSE condition.
I still have difficulties to work accross multiple columns and repeat some identical operations on multiples columns. Any ideas to help me.
EDIT
I have also tried this following code and I get correct means but uncorrect hue value (the contrary compared to my first attempt)
data_trials_2 = data_trial %>%
separate(CP_B, into=c("hueX_B","Y_BxZ_B"), sep=":") %>%
separate(CP_C, into=c("hueX_C","Y_CxZ_C"), sep=":") %>%
separate(CP_D, into=c("hueX_D","Y_DxZ_D"), sep=":") %>%
separate(Y_BxZ_B, into=c("Y_B", "Z_B"), sep="-") %>%
separate(Y_CxZ_C, into=c("Y_C", "Z_C"), sep="-") %>%
separate(Y_DxZ_D, into=c("Y_D", "Z_D"), sep="-") %>%
mutate(Y_B=as.numeric(Y_B)) %>%
mutate(Y_C=as.numeric(Y_C)) %>%
mutate(Y_D=as.numeric(Y_D)) %>%
mutate(Z_B=as.numeric(Z_B)) %>%
mutate(Z_C=as.numeric(Z_C)) %>%
mutate(Z_D=as.numeric(Z_D))
data_trials_2$CP_output= paste0("hue", rowwise(data_trials_2[,1]), ":", round(rowMeans(data_trials_2[,c(4,7,10)], na.rm=TRUE)) , "-", round(rowMeans(data_trials_2[,c(5,8,11)], na.rm=TRUE)))
data_trials_2$CP_output
[1] "huec(2, 8, 3, 2, 5):6783.5-99988"
[2] "huec(2, 8, 3, 2, 5):7137.33333333333-99567"
[3] "huec(2, 8, 3, 2, 5):60975.3333333333-123417.666666667"
[4] "huec(2, 8, 3, 2, 5):7653.5-48039787"
[5] "huec(2, 8, 3, 2, 5):45687.6666666667-676554.333333333"
separate
from tidyr
has been superseded, so I will use separate_wider_delim
.
Once columns are separated, you end up with convenient column names to use tidy-select
functions like contains
, allowing you to mutate
across
in place of multiple mutate
calls.
This example doesn't keep the original column data, but you can by setting cols_remove = FALSE
in separate_wider_delim
.
Edit Updated to replace rowwise
operations with rowMeans
.
library(dplyr)
library(tidyr)
data_trial %>%
separate_wider_delim(starts_with('CP'), delim = ':', names = c('hue', 'values'),
names_sep = '_', too_few = 'align_end') %>%
separate_wider_delim(ends_with('_values'), delim = '-', names = c('Y', 'Z'),
names_sep = '_', too_few = 'align_end') %>%
mutate(across(contains('values'), as.numeric)) %>%
mutate(mean_Y = rowMeans(across(matches("[^A]_values_Y")), na.rm = T),
mean_Z = rowMeans(across(matches("[^A]_values_Z")), na.rm = T)) %>%
mutate(output = if_else(TCK_A != 'Yes',
paste0('hue',hue,':',CP_A_values_Y,'-',CP_A_values_Z),
paste0('hue',hue,':', mean_Y, '-', mean_Z))) %>%
glimpse()
#> Rows: 5
#> Columns: 17
#> $ hue <dbl> 2, 8, 3, 2, 5
#> $ CP_A_hue <chr> "hue2", "hue8", "hue3", "hue2", "hue5"
#> $ CP_A_values_Y <dbl> 6789, 7854, 60987, 7658, 45658
#> $ CP_A_values_Z <dbl> 99987, 98743, 123423, 873457, 676549
#> $ CP_B_hue <chr> "hue2", "hue8", "hue3", NA, "hue5"
#> $ CP_B_values_Y <dbl> 6782, 7859, 60989, NA, 45697
#> $ CP_B_values_Z <dbl> 99987, 98734, 123407, NA, 676598
#> $ CP_C_hue <chr> "hue2", "hue8", "hue3", "hue2", "hue5"
#> $ CP_C_values_Y <dbl> 6785, 6797, 60995, 7657, 45667
#> $ CP_C_values_Z <dbl> 99989, 99980, 123434, 8734509, 676500
#> $ CP_D_hue <chr> NA, "hue8", "hue3", "hue2", "hue5"
#> $ CP_D_values_Y <dbl> NA, 6756, 60942, 7650, 45699
#> $ CP_D_values_Z <dbl> NA, 99987, 123412, 87345065, 676565
#> $ TCK_A <chr> "Yes", "", "Yes", "Yes", "Yes"
#> $ mean_Y <dbl> 6783.500, 7137.333, 60975.333, 7653.500, 45687.667
#> $ mean_Z <dbl> 99988.0, 99567.0, 123417.7, 48039787.0, 676554.3
#> $ output <chr> "hue2:6783.5-99988", "hue8:7854-98743", "hue3:60975.3333…
data_trial <- data.frame(hue=c(2,8,3,2,5),
CP_A=c("hue2:6789-99987", "hue8:7854-98743","hue3:60987-123423","hue2:7658-873457","hue5:45658-676549"),
CP_B=c("hue2:6782-99987", "hue8:7859-98734","hue3:60989-123407","","hue5:45697-676598"),
CP_C=c("hue2:6785-99989", "hue8:6797-99980","hue3:60995-123434","hue2:7657-8734509","hue5:45667-676500"),
CP_D=c("", "hue8:6756-99987","hue3:60942-123412","hue2:7650-87345065","hue5:45699-676565"),
TCK_A=c("Yes", "", "Yes", "Yes", "Yes"))