rdataframedplyr

Dataframe reshaping


I have the following data frame, with column containing block identifiers, well identifiers, and concentration values of test divided per column.

df <- structure(list(block = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2), Well = c("A01", 
"A02", "A03", "A04", "A05", "A01", "A02", "A03", "A04", "A05"
), Conc.test1 = c(2, NA, 2, NA, NA, 2, NA, 2, NA, NA), Conc.test2 = c(NA, 
2, NA, 2, NA, NA, 2, NA, 2, NA), Conc.Test3 = c(NA, NA, NA, 2, 
NA, NA, NA, NA, 2, NA), Conc.test4 = c(NA, NA, 2, 2, NA, 2, NA, 
2, 2, NA), Conc.test5 = c(NA, 2, NA, NA, NA, NA, 2, NA, NA, NA
), Conc.test6 = c(2, NA, NA, NA, NA, 2, NA, NA, NA, NA)), row.names = c(NA, 
10L), class = "data.frame")

#    block Well Conc.test1 Conc.test2 Conc.Test3 Conc.test4 Conc.test5 Conc.test6
# 1      1  A01          2         NA         NA         NA         NA          2
# 2      1  A02         NA          2         NA         NA          2         NA
# 3      1  A03          2         NA         NA          2         NA         NA
# 4      1  A04         NA          2          2          2         NA         NA
# 5      1  A05         NA         NA         NA         NA         NA         NA
# 6      2  A01          2         NA         NA          2         NA          2
# 7      2  A02         NA          2         NA         NA          2         NA
# 8      2  A03          2         NA         NA          2         NA         NA
# 9      2  A04         NA          2          2          2         NA         NA
# 10     2  A05         NA         NA         NA         NA         NA         NA

I would like to find per Block, per well the different tests used (and the amount of tests) and the concentration used, in the following format:

structure(list(block = c(1, 1, 1, 1, 2, 2, 2, 2), tests = c(2, 
2, 2, 3, 3, 2, 2, 3), well = c("A01", "A02", "A03", "A04", "A01", 
"A02", "A03", "A04"), C1 = c("test1", "test2", "test1", "test2", 
"test1", "test2", "test1", "test2"), conc1 = c(2, 2, 2, 2, 2, 
2, 2, 2), C2 = c("test6", "test5", "test4", "test3", "test6", 
"test5", "test4", "test3"), con2 = c(2, 2, 2, 2, 2, 2, 2, 2), 
    C3 = c(NA, NA, NA, "test4", "test4", NA, NA, "test4"), conc3 = c(NA, 
    NA, NA, 2, 2, NA, NA, 2)), row.names = c(NA, 8L), class = "data.frame")

#   block tests well    C1 conc1    C2 con2    C3 conc3
# 1     1     2  A01 test1     2 test6    2  <NA>    NA
# 2     1     2  A02 test2     2 test5    2  <NA>    NA
# 3     1     2  A03 test1     2 test4    2  <NA>    NA
# 4     1     3  A04 test2     2 test3    2 test4     2
# 5     2     3  A01 test1     2 test6    2 test4     2
# 6     2     2  A02 test2     2 test5    2  <NA>    NA
# 7     2     2  A03 test1     2 test4    2  <NA>    NA
# 8     2     3  A04 test2     2 test3    2 test4     2

I got this far, which kind of works, but I don't see how scalable this is.

output_df <- df %>%
  pivot_longer(cols = starts_with("Conc"), names_to = "Test", values_to = "Concentration") %>%
  filter(!is.na(Concentration)) %>%
  group_by(block, Well) %>%
  mutate(test_num = row_number()) %>%
  pivot_wider(names_from = test_num, values_from = c(Test, Concentration), 
              names_sep = "") %>%
  ungroup()

output_df <- output_df %>%
  mutate(tests = rowSums(!is.na(select(., starts_with("Test"))))) %>%
  select(block, Well, tests, everything()) %>%
  rename_with(~gsub("Test", "C", .x), starts_with("Test")) %>%
  rename_with(~gsub("Concentration", "conc", .x), starts_with("Concentration"))

Solution

  • You need pivot_longer at first, creating index column by groups, and then pivot_wider.

    library(dplyr)
    library(tidyr)
    
    df %>%
      pivot_longer(Conc.test1:Conc.test6, names_prefix = "Conc.", names_to = "C",
                   values_to = "conc", values_drop_na = TRUE) %>%
      mutate(id = row_number(), tests = n(), .by = c(block, Well)) %>%
      pivot_wider(names_from = id, values_from = c(C, conc),
                  names_vary = "slowest", names_sep = "")
    
    # # A tibble: 8 × 9
    #   block Well  tests C1    conc1 C2    conc2 C3    conc3
    #   <dbl> <chr> <int> <chr> <dbl> <chr> <dbl> <chr> <dbl>
    # 1     1 A01       2 test1     2 test6     2 NA       NA
    # 2     1 A02       2 test2     2 test5     2 NA       NA
    # 3     1 A03       2 test1     2 test4     2 NA       NA
    # 4     1 A04       3 test2     2 Test3     2 test4     2
    # 5     2 A01       3 test1     2 test4     2 test6     2
    # 6     2 A02       2 test2     2 test5     2 NA       NA
    # 7     2 A03       2 test1     2 test4     2 NA       NA
    # 8     2 A04       3 test2     2 Test3     2 test4     2