rbioinformaticsbioconductor

Error in read.metharray(basenames = files, extended = extended, verbose = verbose, :


I want to keep only files with duplicated substrings before the first underscore. For example, abc_xyz_123.idat and abc_xyz_456.idat are pairs, whereas abc_xyz_123.idat and abc_mno_123.idat are not. The pairs are in random order.

The base variable contains the duplicated values without the _*.idat and the corresponding Sample.ID in sample.sheet must match the Sample.ID in clin.info.

The matching clin.info is then used as an input for missMethyl, which requires a basename column that tells us where the idat files to be read are located. The Basename column contains the filename.substr.

library(missMethyl)

matched.filenames <- sample.sheet$File.Name[match(clin.info$Sample.ID, sample.sheet$Sample.ID)]
filename.substr <- stringr::str_extract(matched.filenames, "[^_]*")
clin.info$Basename <- file.path(paste0(getwd(), "/idat"), filename.substr)
sample.sheet$Basename <- clin.info$Basename[match(clin.info$Sample.ID, sample.sheet$Sample.ID)]
sample.sheet <- sample.sheet[!(is.na(sample.sheet$Basename)) || !(duplicated(sample.sheet$Basename))] # Delete row if Basename is NA or not duplicated
clin.info <- clin.info[clin.info$Sample.ID %in% sample.sheet$Sample.ID,]

Run missMethyl:

rgSet <- read.metharray.exp(targets=clin.info)

Traceback:

Error in read.metharray(basenames = files, extended = extended, verbose = verbose,  : 
  The following specified files do not exist:
[This error message is followed by the names of all the values in Basename]

Example:

> dput(sample.sheet)
structure(list(File.Name = c("e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Red.idat", 
"447ef862-535e-48b5-893d-588203a49eac_noid_Grn.idat", "e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Grn.idat", 
"447ef862-535e-48b5-893d-588203a49eac_noid_Red.idat", "baed8332-9bc2-46d1-954e-275786b88a94_noid_Grn.idat"
), Sample.ID = c("TCGA.BQ.7059.01A", "TCGA.UZ.A9PM.01A", "TCGA.BQ.7059.01A", 
"TCGA.UZ.A9PM.01A", "TCGA.B9.5156.01A")), class = "data.frame", row.names = c(1L, 
4L, 5L, 6L, 7L))

> dput(clin.info)
structure(list(subtype = c("2a", "1a", "1b", "1a", "1a", "1b", 
"2b", "2b", "2a", "1a", "2b", "1b", "1c", "2b", "2a", "2b", "1b", 
"2b", "2b", "1c", "1c", "2b", "1c", "2c", "1a", "1a", "2b", "1c", 
"1a", "1c", "2b", "2a", "2c", "2a", "1a", "1a", "1b", "1a", "1b", 
"1a", "2a", "1a", "1c", "1a", "1c", "2b", "1a", "2a", "1c", "2a", 
"1a", "2b", "2b", "2c", "2b", "2c", "2b", "2a", "2b", "2a", "2b", 
"2a", "2a", "2a", "2b", "2b", "2b", "2c", "2c", "2b", "2b", "2b", 
"2a", "2b", "1c", "2b", "2a", "2b", "2b", "2a", "2b", "2b", "1a", 
"1b", "1c", "1a", "1a", "2a", "1c", "1a", "1a", "1c", "2a", "2b", 
"2a", "1c", "2a", "1a", "1a", "1c", "1a", "2c", "1a", "1b", "2c", 
"2a", "2c", "2b", "1a", "2c", "1a", "1a", "1c", "2b", "1a", "1c", 
"1b", "1c", "2c", "1c", "2b", "1a", "2c", "1c", "2a", "1c", "1c", 
"2a", "2b", "2a", "1c", "1b", "1c", "2a", "1a", "1a", "2b", "2c", 
"2a", "1a", "2b", "1c", "2a", "1a", "2c", "2c", "2a", "1b", "1b", 
"1b", "2b", "1a", "2b", "2b", "1a", "2c", "2c", "2b", "2a", "1c", 
"2b", "2a", "1b", "2b", "1b", "1a", "2b", "1a", "2b", "1a", "1c", 
"1a", "1b", "1c", "1c", "1a", "1b", "2a", "2a", "2b", "1c", "1c", 
"1c", "1c", "1c", "1c", "1c", "1b", "2b", "2a", "2a", "1c", "1a", 
"1a", "1a", "1a", "2b", "1c", "2a"), Sample.ID = c("TCGA.2K.A9WE.01A", 
"TCGA.2Z.A9J1.01A", "TCGA.2Z.A9J3.01A", "TCGA.2Z.A9J5.01A", "TCGA.2Z.A9J6.01A", 
"TCGA.2Z.A9J7.01A", "TCGA.2Z.A9J8.01A", "TCGA.2Z.A9JD.01A", "TCGA.2Z.A9JI.01A", 
"TCGA.2Z.A9JJ.01A", "TCGA.2Z.A9JO.01A", "TCGA.2Z.A9JQ.01A", "TCGA.4A.A93W.01A", 
"TCGA.4A.A93X.01A", "TCGA.4A.A93Y.01A", "TCGA.5P.A9JU.01A", "TCGA.5P.A9JY.01A", 
"TCGA.5P.A9KE.01A", "TCGA.A4.7288.01A", "TCGA.A4.7583.01A", "TCGA.A4.7584.01A", 
"TCGA.A4.7585.01A", "TCGA.A4.7734.01A", "TCGA.A4.7915.01A", "TCGA.A4.7996.01A", 
"TCGA.A4.7997.01A", "TCGA.A4.8098.01A", "TCGA.A4.8311.01A", "TCGA.A4.8517.01A", 
"TCGA.A4.8630.01A", "TCGA.A4.A57E.01A", "TCGA.AL.7173.01A", "TCGA.AL.A5DJ.01A", 
"TCGA.B1.5398.01A", "TCGA.B1.7332.01A", "TCGA.B1.A47M.01A", "TCGA.B1.A47N.01A", 
"TCGA.B1.A47O.01A", "TCGA.B1.A654.01A", "TCGA.B1.A657.01A", "TCGA.B3.8121.01A", 
"TCGA.B3.A6W5.01A", "TCGA.B9.5155.01A", "TCGA.B9.5156.01A", "TCGA.B9.7268.01A", 
"TCGA.B9.A44B.01A", "TCGA.B9.A5W7.01A", "TCGA.B9.A5W8.01A", "TCGA.B9.A5W9.01A", 
"TCGA.B9.A69E.01A", "TCGA.B9.A8YH.01A", "TCGA.B9.A8YI.01A", "TCGA.BQ.5875.01A", 
"TCGA.BQ.5877.01A", "TCGA.BQ.5878.01A", "TCGA.BQ.5879.01A", "TCGA.BQ.5880.01A", 
"TCGA.BQ.5881.01A", "TCGA.BQ.5882.01A", "TCGA.BQ.5883.01A", "TCGA.BQ.5885.01A", 
"TCGA.BQ.5887.01A", "TCGA.BQ.5888.01A", "TCGA.BQ.5889.01A", "TCGA.BQ.5890.01A", 
"TCGA.BQ.5891.01A", "TCGA.BQ.5892.01A", "TCGA.BQ.5893.01A", "TCGA.BQ.5894.01A", 
"TCGA.BQ.7044.01A", "TCGA.BQ.7046.01A", "TCGA.BQ.7048.01A", "TCGA.BQ.7049.01A", 
"TCGA.BQ.7050.01A", "TCGA.BQ.7051.01A", "TCGA.BQ.7053.01A", "TCGA.BQ.7055.01A", 
"TCGA.BQ.7056.01A", "TCGA.BQ.7058.01A", "TCGA.BQ.7059.01A", "TCGA.BQ.7060.01A", 
"TCGA.BQ.7061.01A", "TCGA.BQ.7062.01A", "TCGA.DW.5560.01A", "TCGA.DW.5561.01A", 
"TCGA.DW.7834.01A", "TCGA.DW.7837.01A", "TCGA.DW.7838.01A", "TCGA.DW.7839.01A", 
"TCGA.DW.7840.01A", "TCGA.DW.7841.01A", "TCGA.DW.7842.01A", "TCGA.DW.7963.01B", 
"TCGA.DZ.6131.01A", "TCGA.DZ.6132.01A", "TCGA.DZ.6133.01A", "TCGA.DZ.6134.01A", 
"TCGA.DZ.6135.01A", "TCGA.EV.5901.01A", "TCGA.EV.5902.01A", "TCGA.EV.5903.01A", 
"TCGA.F9.A4JJ.01A", "TCGA.F9.A7Q0.01A", "TCGA.F9.A7VF.01A", "TCGA.F9.A8NY.01A", 
"TCGA.F9.A97G.01A", "TCGA.G7.6789.01A", "TCGA.G7.6790.01A", "TCGA.G7.6792.01A", 
"TCGA.G7.6793.01A", "TCGA.G7.6795.01A", "TCGA.G7.6796.01A", "TCGA.G7.6797.01A", 
"TCGA.G7.7501.01A", "TCGA.G7.7502.01A", "TCGA.G7.A4TM.01A", "TCGA.G7.A8LB.01A", 
"TCGA.G7.A8LC.01A", "TCGA.G7.A8LD.01A", "TCGA.G7.A8LE.01A", "TCGA.GL.6846.01A", 
"TCGA.GL.7773.01A", "TCGA.GL.7966.01A", "TCGA.GL.8500.01A", "TCGA.GL.A4EM.01A", 
"TCGA.GL.A59R.01A", "TCGA.GL.A9DC.01A", "TCGA.GL.A9DD.01A", "TCGA.GL.A9DE.01A", 
"TCGA.HE.A5NF.01A", "TCGA.HE.A5NH.01A", "TCGA.HE.A5NI.01A", "TCGA.HE.A5NJ.01A", 
"TCGA.HE.A5NK.01A", "TCGA.HE.A5NL.01A", "TCGA.IA.A40U.01A", "TCGA.IA.A40X.01A", 
"TCGA.IA.A40Y.01A", "TCGA.IA.A83V.01A", "TCGA.IA.A83W.01A", "TCGA.IZ.8195.01A", 
"TCGA.IZ.8196.01A", "TCGA.IZ.A6M8.01A", "TCGA.J7.6720.01A", "TCGA.J7.8537.01A", 
"TCGA.J7.A8I2.01A", "TCGA.KV.A6GD.01A", "TCGA.KV.A6GE.01A", "TCGA.KV.A74V.01A", 
"TCGA.MH.A55Z.01A", "TCGA.MH.A560.01A", "TCGA.MH.A562.01A", "TCGA.MH.A855.01A", 
"TCGA.P4.A5E6.01A", "TCGA.P4.A5E7.01A", "TCGA.P4.A5E8.01A", "TCGA.P4.A5EA.01A", 
"TCGA.P4.A5EB.01A", "TCGA.P4.A5ED.01A", "TCGA.P4.AAVL.01A", "TCGA.P4.AAVM.01A", 
"TCGA.PJ.A5Z8.01A", "TCGA.PJ.A5Z9.01A", "TCGA.Q2.A5QZ.01A", "TCGA.SX.A71R.01A", 
"TCGA.SX.A71U.01A", "TCGA.SX.A7SM.01A", "TCGA.SX.A7SN.01A", "TCGA.SX.A7SO.01A", 
"TCGA.SX.A7SP.01A", "TCGA.SX.A7SQ.01A", "TCGA.SX.A7SR.01A", "TCGA.SX.A7SS.01A", 
"TCGA.UN.AAZ9.01A", "TCGA.UZ.A9PJ.01A", "TCGA.UZ.A9PK.01A", "TCGA.UZ.A9PL.01A", 
"TCGA.UZ.A9PM.01A", "TCGA.UZ.A9PN.01A", "TCGA.UZ.A9PO.01A", "TCGA.UZ.A9PP.01A", 
"TCGA.UZ.A9PR.01A", "TCGA.UZ.A9PS.01A", "TCGA.UZ.A9PU.01A", "TCGA.UZ.A9PV.01A", 
"TCGA.UZ.A9PX.01A", "TCGA.UZ.A9PZ.01A", "TCGA.UZ.A9Q0.01A", "TCGA.UZ.A9Q1.01A", 
"TCGA.V9.A7HT.01A", "TCGA.WN.A9G9.01A", "TCGA.Y8.A894.01A", "TCGA.Y8.A895.01A", 
"TCGA.Y8.A896.01A", "TCGA.Y8.A897.01A", "TCGA.Y8.A8RY.01A", "TCGA.Y8.A8RZ.01A", 
"TCGA.Y8.A8S0.01A", "TCGA.Y8.A8S1.01A")), class = "data.frame", row.names = c(NA, 
-199L))

Solution

  • In answer to your original question: here is a quick way to extract those prefixes, before the first _, that are duplicates:

    # Load the necessary packages.
    library(dplyr)
    library(stringr)
    
    
    # ...
    # Load the data, as with `sample.sheet`.
    # ...
    
    
    # Assemble a column of duplicate files.
    sample.sheet %>%
      # Add a column with everything (the prefix) before the "_" (the suffix).
      mutate(file_prefix = str_extract(File.Name, "^[^_]*")) %>%
      # Count the occurrences of each prefix.
      group_by(file_prefix) %>%
      summarize(n = n()) %>%
      ungroup() %>%
      # Include only those prefixes with multiple occurrences: duplicates.
      filter(n > 1) %>%
      select(file_prefix)
    

    Given a sample.sheet like your example

    sample.sheet <- structure(
      list(
        File.Name = c(
          "e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Red.idat",
          "447ef862-535e-48b5-893d-588203a49eac_noid_Grn.idat",
          "e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Grn.idat",
          "447ef862-535e-48b5-893d-588203a49eac_noid_Red.idat",
          "baed8332-9bc2-46d1-954e-275786b88a94_noid_Grn.idat"
        ),
        Sample.ID = c(
          "TCGA.BQ.7059.01A",
          "TCGA.UZ.A9PM.01A",
          "TCGA.BQ.7059.01A",
          "TCGA.UZ.A9PM.01A",
          "TCGA.B9.5156.01A"
        )
      ),
      class = "data.frame",
      row.names = c(1L, 4L, 5L, 6L, 7L)
    )
    

    this workflow should yield the following output:

    # A tibble: 2 × 1
      file_prefix                         
      <chr>                               
    1 447ef862-535e-48b5-893d-588203a49eac
    2 e079a20c-6cdc-4b74-afdf-3fee58f0b574