I want to keep only files with duplicated substrings before the first underscore.
For example, abc_xyz_123.idat
and abc_xyz_456.idat
are pairs, whereas abc_xyz_123.idat
and abc_mno_123.idat
are not. The pairs are in random order.
The base
variable contains the duplicated values without the _*.idat
and the corresponding Sample.ID
in sample.sheet
must match the Sample.ID
in clin.info
.
The matching clin.info
is then used as an input for missMethyl
, which requires a basename column that tells us where the idat files to be read are located. The Basename
column contains the filename.substr
.
library(missMethyl)
matched.filenames <- sample.sheet$File.Name[match(clin.info$Sample.ID, sample.sheet$Sample.ID)]
filename.substr <- stringr::str_extract(matched.filenames, "[^_]*")
clin.info$Basename <- file.path(paste0(getwd(), "/idat"), filename.substr)
sample.sheet$Basename <- clin.info$Basename[match(clin.info$Sample.ID, sample.sheet$Sample.ID)]
sample.sheet <- sample.sheet[!(is.na(sample.sheet$Basename)) || !(duplicated(sample.sheet$Basename))] # Delete row if Basename is NA or not duplicated
clin.info <- clin.info[clin.info$Sample.ID %in% sample.sheet$Sample.ID,]
Run missMethyl:
rgSet <- read.metharray.exp(targets=clin.info)
Traceback:
Error in read.metharray(basenames = files, extended = extended, verbose = verbose, :
The following specified files do not exist:
[This error message is followed by the names of all the values in Basename]
Example:
> dput(sample.sheet)
structure(list(File.Name = c("e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Red.idat",
"447ef862-535e-48b5-893d-588203a49eac_noid_Grn.idat", "e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Grn.idat",
"447ef862-535e-48b5-893d-588203a49eac_noid_Red.idat", "baed8332-9bc2-46d1-954e-275786b88a94_noid_Grn.idat"
), Sample.ID = c("TCGA.BQ.7059.01A", "TCGA.UZ.A9PM.01A", "TCGA.BQ.7059.01A",
"TCGA.UZ.A9PM.01A", "TCGA.B9.5156.01A")), class = "data.frame", row.names = c(1L,
4L, 5L, 6L, 7L))
> dput(clin.info)
structure(list(subtype = c("2a", "1a", "1b", "1a", "1a", "1b",
"2b", "2b", "2a", "1a", "2b", "1b", "1c", "2b", "2a", "2b", "1b",
"2b", "2b", "1c", "1c", "2b", "1c", "2c", "1a", "1a", "2b", "1c",
"1a", "1c", "2b", "2a", "2c", "2a", "1a", "1a", "1b", "1a", "1b",
"1a", "2a", "1a", "1c", "1a", "1c", "2b", "1a", "2a", "1c", "2a",
"1a", "2b", "2b", "2c", "2b", "2c", "2b", "2a", "2b", "2a", "2b",
"2a", "2a", "2a", "2b", "2b", "2b", "2c", "2c", "2b", "2b", "2b",
"2a", "2b", "1c", "2b", "2a", "2b", "2b", "2a", "2b", "2b", "1a",
"1b", "1c", "1a", "1a", "2a", "1c", "1a", "1a", "1c", "2a", "2b",
"2a", "1c", "2a", "1a", "1a", "1c", "1a", "2c", "1a", "1b", "2c",
"2a", "2c", "2b", "1a", "2c", "1a", "1a", "1c", "2b", "1a", "1c",
"1b", "1c", "2c", "1c", "2b", "1a", "2c", "1c", "2a", "1c", "1c",
"2a", "2b", "2a", "1c", "1b", "1c", "2a", "1a", "1a", "2b", "2c",
"2a", "1a", "2b", "1c", "2a", "1a", "2c", "2c", "2a", "1b", "1b",
"1b", "2b", "1a", "2b", "2b", "1a", "2c", "2c", "2b", "2a", "1c",
"2b", "2a", "1b", "2b", "1b", "1a", "2b", "1a", "2b", "1a", "1c",
"1a", "1b", "1c", "1c", "1a", "1b", "2a", "2a", "2b", "1c", "1c",
"1c", "1c", "1c", "1c", "1c", "1b", "2b", "2a", "2a", "1c", "1a",
"1a", "1a", "1a", "2b", "1c", "2a"), Sample.ID = c("TCGA.2K.A9WE.01A",
"TCGA.2Z.A9J1.01A", "TCGA.2Z.A9J3.01A", "TCGA.2Z.A9J5.01A", "TCGA.2Z.A9J6.01A",
"TCGA.2Z.A9J7.01A", "TCGA.2Z.A9J8.01A", "TCGA.2Z.A9JD.01A", "TCGA.2Z.A9JI.01A",
"TCGA.2Z.A9JJ.01A", "TCGA.2Z.A9JO.01A", "TCGA.2Z.A9JQ.01A", "TCGA.4A.A93W.01A",
"TCGA.4A.A93X.01A", "TCGA.4A.A93Y.01A", "TCGA.5P.A9JU.01A", "TCGA.5P.A9JY.01A",
"TCGA.5P.A9KE.01A", "TCGA.A4.7288.01A", "TCGA.A4.7583.01A", "TCGA.A4.7584.01A",
"TCGA.A4.7585.01A", "TCGA.A4.7734.01A", "TCGA.A4.7915.01A", "TCGA.A4.7996.01A",
"TCGA.A4.7997.01A", "TCGA.A4.8098.01A", "TCGA.A4.8311.01A", "TCGA.A4.8517.01A",
"TCGA.A4.8630.01A", "TCGA.A4.A57E.01A", "TCGA.AL.7173.01A", "TCGA.AL.A5DJ.01A",
"TCGA.B1.5398.01A", "TCGA.B1.7332.01A", "TCGA.B1.A47M.01A", "TCGA.B1.A47N.01A",
"TCGA.B1.A47O.01A", "TCGA.B1.A654.01A", "TCGA.B1.A657.01A", "TCGA.B3.8121.01A",
"TCGA.B3.A6W5.01A", "TCGA.B9.5155.01A", "TCGA.B9.5156.01A", "TCGA.B9.7268.01A",
"TCGA.B9.A44B.01A", "TCGA.B9.A5W7.01A", "TCGA.B9.A5W8.01A", "TCGA.B9.A5W9.01A",
"TCGA.B9.A69E.01A", "TCGA.B9.A8YH.01A", "TCGA.B9.A8YI.01A", "TCGA.BQ.5875.01A",
"TCGA.BQ.5877.01A", "TCGA.BQ.5878.01A", "TCGA.BQ.5879.01A", "TCGA.BQ.5880.01A",
"TCGA.BQ.5881.01A", "TCGA.BQ.5882.01A", "TCGA.BQ.5883.01A", "TCGA.BQ.5885.01A",
"TCGA.BQ.5887.01A", "TCGA.BQ.5888.01A", "TCGA.BQ.5889.01A", "TCGA.BQ.5890.01A",
"TCGA.BQ.5891.01A", "TCGA.BQ.5892.01A", "TCGA.BQ.5893.01A", "TCGA.BQ.5894.01A",
"TCGA.BQ.7044.01A", "TCGA.BQ.7046.01A", "TCGA.BQ.7048.01A", "TCGA.BQ.7049.01A",
"TCGA.BQ.7050.01A", "TCGA.BQ.7051.01A", "TCGA.BQ.7053.01A", "TCGA.BQ.7055.01A",
"TCGA.BQ.7056.01A", "TCGA.BQ.7058.01A", "TCGA.BQ.7059.01A", "TCGA.BQ.7060.01A",
"TCGA.BQ.7061.01A", "TCGA.BQ.7062.01A", "TCGA.DW.5560.01A", "TCGA.DW.5561.01A",
"TCGA.DW.7834.01A", "TCGA.DW.7837.01A", "TCGA.DW.7838.01A", "TCGA.DW.7839.01A",
"TCGA.DW.7840.01A", "TCGA.DW.7841.01A", "TCGA.DW.7842.01A", "TCGA.DW.7963.01B",
"TCGA.DZ.6131.01A", "TCGA.DZ.6132.01A", "TCGA.DZ.6133.01A", "TCGA.DZ.6134.01A",
"TCGA.DZ.6135.01A", "TCGA.EV.5901.01A", "TCGA.EV.5902.01A", "TCGA.EV.5903.01A",
"TCGA.F9.A4JJ.01A", "TCGA.F9.A7Q0.01A", "TCGA.F9.A7VF.01A", "TCGA.F9.A8NY.01A",
"TCGA.F9.A97G.01A", "TCGA.G7.6789.01A", "TCGA.G7.6790.01A", "TCGA.G7.6792.01A",
"TCGA.G7.6793.01A", "TCGA.G7.6795.01A", "TCGA.G7.6796.01A", "TCGA.G7.6797.01A",
"TCGA.G7.7501.01A", "TCGA.G7.7502.01A", "TCGA.G7.A4TM.01A", "TCGA.G7.A8LB.01A",
"TCGA.G7.A8LC.01A", "TCGA.G7.A8LD.01A", "TCGA.G7.A8LE.01A", "TCGA.GL.6846.01A",
"TCGA.GL.7773.01A", "TCGA.GL.7966.01A", "TCGA.GL.8500.01A", "TCGA.GL.A4EM.01A",
"TCGA.GL.A59R.01A", "TCGA.GL.A9DC.01A", "TCGA.GL.A9DD.01A", "TCGA.GL.A9DE.01A",
"TCGA.HE.A5NF.01A", "TCGA.HE.A5NH.01A", "TCGA.HE.A5NI.01A", "TCGA.HE.A5NJ.01A",
"TCGA.HE.A5NK.01A", "TCGA.HE.A5NL.01A", "TCGA.IA.A40U.01A", "TCGA.IA.A40X.01A",
"TCGA.IA.A40Y.01A", "TCGA.IA.A83V.01A", "TCGA.IA.A83W.01A", "TCGA.IZ.8195.01A",
"TCGA.IZ.8196.01A", "TCGA.IZ.A6M8.01A", "TCGA.J7.6720.01A", "TCGA.J7.8537.01A",
"TCGA.J7.A8I2.01A", "TCGA.KV.A6GD.01A", "TCGA.KV.A6GE.01A", "TCGA.KV.A74V.01A",
"TCGA.MH.A55Z.01A", "TCGA.MH.A560.01A", "TCGA.MH.A562.01A", "TCGA.MH.A855.01A",
"TCGA.P4.A5E6.01A", "TCGA.P4.A5E7.01A", "TCGA.P4.A5E8.01A", "TCGA.P4.A5EA.01A",
"TCGA.P4.A5EB.01A", "TCGA.P4.A5ED.01A", "TCGA.P4.AAVL.01A", "TCGA.P4.AAVM.01A",
"TCGA.PJ.A5Z8.01A", "TCGA.PJ.A5Z9.01A", "TCGA.Q2.A5QZ.01A", "TCGA.SX.A71R.01A",
"TCGA.SX.A71U.01A", "TCGA.SX.A7SM.01A", "TCGA.SX.A7SN.01A", "TCGA.SX.A7SO.01A",
"TCGA.SX.A7SP.01A", "TCGA.SX.A7SQ.01A", "TCGA.SX.A7SR.01A", "TCGA.SX.A7SS.01A",
"TCGA.UN.AAZ9.01A", "TCGA.UZ.A9PJ.01A", "TCGA.UZ.A9PK.01A", "TCGA.UZ.A9PL.01A",
"TCGA.UZ.A9PM.01A", "TCGA.UZ.A9PN.01A", "TCGA.UZ.A9PO.01A", "TCGA.UZ.A9PP.01A",
"TCGA.UZ.A9PR.01A", "TCGA.UZ.A9PS.01A", "TCGA.UZ.A9PU.01A", "TCGA.UZ.A9PV.01A",
"TCGA.UZ.A9PX.01A", "TCGA.UZ.A9PZ.01A", "TCGA.UZ.A9Q0.01A", "TCGA.UZ.A9Q1.01A",
"TCGA.V9.A7HT.01A", "TCGA.WN.A9G9.01A", "TCGA.Y8.A894.01A", "TCGA.Y8.A895.01A",
"TCGA.Y8.A896.01A", "TCGA.Y8.A897.01A", "TCGA.Y8.A8RY.01A", "TCGA.Y8.A8RZ.01A",
"TCGA.Y8.A8S0.01A", "TCGA.Y8.A8S1.01A")), class = "data.frame", row.names = c(NA,
-199L))
In answer to your original question: here is a quick way to extract those prefixes, before the first _
, that are duplicates:
# Load the necessary packages.
library(dplyr)
library(stringr)
# ...
# Load the data, as with `sample.sheet`.
# ...
# Assemble a column of duplicate files.
sample.sheet %>%
# Add a column with everything (the prefix) before the "_" (the suffix).
mutate(file_prefix = str_extract(File.Name, "^[^_]*")) %>%
# Count the occurrences of each prefix.
group_by(file_prefix) %>%
summarize(n = n()) %>%
ungroup() %>%
# Include only those prefixes with multiple occurrences: duplicates.
filter(n > 1) %>%
select(file_prefix)
Given a sample.sheet
like your example
sample.sheet <- structure(
list(
File.Name = c(
"e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Red.idat",
"447ef862-535e-48b5-893d-588203a49eac_noid_Grn.idat",
"e079a20c-6cdc-4b74-afdf-3fee58f0b574_noid_Grn.idat",
"447ef862-535e-48b5-893d-588203a49eac_noid_Red.idat",
"baed8332-9bc2-46d1-954e-275786b88a94_noid_Grn.idat"
),
Sample.ID = c(
"TCGA.BQ.7059.01A",
"TCGA.UZ.A9PM.01A",
"TCGA.BQ.7059.01A",
"TCGA.UZ.A9PM.01A",
"TCGA.B9.5156.01A"
)
),
class = "data.frame",
row.names = c(1L, 4L, 5L, 6L, 7L)
)
this workflow should yield the following output:
# A tibble: 2 × 1
file_prefix
<chr>
1 447ef862-535e-48b5-893d-588203a49eac
2 e079a20c-6cdc-4b74-afdf-3fee58f0b574