rdataframedplyr

Dataframe reshaping


I have the following data frame, with column containing block identifiers, well identifiers, and concentration values of test divided per column.

df <- structure(list(block = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2), Well = c("A01", 
"A02", "A03", "A04", "A05", "A01", "A02", "A03", "A04", "A05"
), Conc.test1 = c(2, NA, 2, NA, NA, 2, NA, 2, NA, NA), Conc.test2 = c(NA, 
2, NA, 2, NA, NA, 2, NA, 2, NA), Conc.Test3 = c(NA, NA, NA, 2, 
NA, NA, NA, NA, 2, NA), Conc.test4 = c(NA, NA, 2, 2, NA, 2, NA, 
2, 2, NA), Conc.test5 = c(NA, 2, NA, NA, NA, NA, 2, NA, NA, NA
), Conc.test6 = c(2, NA, NA, NA, NA, 2, NA, NA, NA, NA)), row.names = c(NA, 
10L), class = "data.frame")

#    block Well Conc.test1 Conc.test2 Conc.Test3 Conc.test4 Conc.test5 Conc.test6
# 1      1  A01          2         NA         NA         NA         NA          2
# 2      1  A02         NA          2         NA         NA          2         NA
# 3      1  A03          2         NA         NA          2         NA         NA
# 4      1  A04         NA          2          2          2         NA         NA
# 5      1  A05         NA         NA         NA         NA         NA         NA
# 6      2  A01          2         NA         NA          2         NA          2
# 7      2  A02         NA          2         NA         NA          2         NA
# 8      2  A03          2         NA         NA          2         NA         NA
# 9      2  A04         NA          2          2          2         NA         NA
# 10     2  A05         NA         NA         NA         NA         NA         NA

I would like to find per Block, per well the different tests used (and the amount of tests) and the concentration used, in the following format:

structure(list(block = c(1, 1, 1, 1, 2, 2, 2, 2), tests = c(2, 
2, 2, 3, 3, 2, 2, 3), well = c("A01", "A02", "A03", "A04", "A01", 
"A02", "A03", "A04"), C1 = c("test1", "test2", "test1", "test2", 
"test1", "test2", "test1", "test2"), conc1 = c(2, 2, 2, 2, 2, 
2, 2, 2), C2 = c("test6", "test5", "test4", "test3", "test6", 
"test5", "test4", "test3"), con2 = c(2, 2, 2, 2, 2, 2, 2, 2), 
    C3 = c(NA, NA, NA, "test4", "test4", NA, NA, "test4"), conc3 = c(NA, 
    NA, NA, 2, 2, NA, NA, 2)), row.names = c(NA, 8L), class = "data.frame")

#   block tests well    C1 conc1    C2 con2    C3 conc3
# 1     1     2  A01 test1     2 test6    2  <NA>    NA
# 2     1     2  A02 test2     2 test5    2  <NA>    NA
# 3     1     2  A03 test1     2 test4    2  <NA>    NA
# 4     1     3  A04 test2     2 test3    2 test4     2
# 5     2     3  A01 test1     2 test6    2 test4     2
# 6     2     2  A02 test2     2 test5    2  <NA>    NA
# 7     2     2  A03 test1     2 test4    2  <NA>    NA
# 8     2     3  A04 test2     2 test3    2 test4     2

I got this far, which kind of works, but I don't see how scalable this is.

output_df <- df %>%
  pivot_longer(cols = starts_with("Conc"), names_to = "Test", values_to = "Concentration") %>%
  filter(!is.na(Concentration)) %>%
  group_by(block, Well) %>%
  mutate(test_num = row_number()) %>%
  pivot_wider(names_from = test_num, values_from = c(Test, Concentration), 
              names_sep = "") %>%
  ungroup()

output_df <- output_df %>%
  mutate(tests = rowSums(!is.na(select(., starts_with("Test"))))) %>%
  select(block, Well, tests, everything()) %>%
  rename_with(~gsub("Test", "C", .x), starts_with("Test")) %>%
  rename_with(~gsub("Concentration", "conc", .x), starts_with("Concentration"))

Solution

  • In base R we can do

    reshape(df, direction='l', v.names='Conc', timevar='C', 
            varying=grep('Conc\\.', names(df)), 
            times=tolower(sub('Conc\\.', '', grep('Conc\\.', names(df))))) |> 
      na.omit() |>
      transform(tests=ave(Conc, block, Well, FUN=length), 
                id=ave(Conc, block, Well, FUN=seq)) |>
      reshape(direction='w', timevar='id', idvar=c('block','Well'), v.names=c('C', 'Conc')) |>
      `row.names<-`(NULL) |> sort_by(~block+Well)
    

    where the last line is cosmetics. I recommend you to use either upper case or lower case to start a variable name, not both. If performance matters, we can optimise the transform-step.

      block Well tests   C.1 Conc.1   C.2 Conc.2   C.3 Conc.3
    1     1  A01     2 test1      2 test6      2  <NA>     NA
    5     1  A02     2 test2      2 test5      2  <NA>     NA
    2     1  A03     2 test1      2 test4      2  <NA>     NA
    6     1  A04     3 test2      2 test3      2 test4      2
    3     2  A01     3 test1      2 test4      2 test6      2
    7     2  A02     2 test2      2 test5      2  <NA>     NA
    4     2  A03     2 test1      2 test4      2  <NA>     NA
    8     2  A04     3 test2      2 test3      2 test4      2