rtidyversetidyrvcf-vcardgatk

Parsing FORMAT and sample value fields from VCF file


I am trying to load a vcf file (created using GATK using data.table::fread, and then parse the results of the FORMAT and sample fields, into different columns. The column names are given by the FORMAT field, with each element separated by a :. and the values for each of these columns is given by the sample field, with each corresponding value in the same order as the names in the FORMAT field, and equally separated by :. The issue I'm facing is that, for each VCF file, I have different values for the FORMAT field, at different rows.

For example, I extracted one row for each unique value found in the FORMAT field:

structure(list(
  `#CHROM` = c("rs11218343_chr11_121564878_SORL1_C/T","rs16941239_chr16_86420604_FOXF1_A/T", 
               "rs7384878_chr7_100334426_SPDYE3_C/T", "rs3851179_chr11_86157598_EED_T/C", 
               "rs10868366_chr9_86085145_GOLM1_G/T", "rs11771145_chr7_143413669_EPHA1_A/G", 
               "rs6966331_chr7_37844191_EPDR1_T/C"),
  POS = c(722L, 1468L, 1367L, 1000L, 246L, 261L, 1384L), 
  ID = c(".", ".", ".", ".", ".", ".", "."), 
  REF = c("AGG", "G", "C", "T", "C", "T", "G"), 
  ALT = c(".", "*,GA", "T", "C", ".", "C", "."), 
  QUAL = c("29.91", ".", "40.64", "32690.06", ".", ".", "0.64"), 
  FILTER = c("LowQual", ".", ".", ".", ".", ".", "LowQual"),
  INFO = c("DP=5;MLEAC=.;MLEAF=.;MQ=26.79", ".",
           "AC=1;AF=0.500;AN=2;BaseQRankSum=1.29;DP=20;ExcessHet=0.0000;FS=2.963;MLEAC=1;MLEAF=0.500;MQ=43.87;MQRankSum=-1.058e+00;QD=2.90;ReadPosRankSum=0.697;SOR=1.802",
           "AC=2;AF=1.00;AN=2;DP=1097;ExcessHet=0.0000;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=31.02;SOR=0.776",
           ".", ".", "BaseQRankSum=-9.670e-01;DP=3;ExcessHet=0.00;MLEAC=.;MLEAF=.;MQ=60.00;MQRankSum=0.00;ReadPosRankSum=0.967"),
  FORMAT = c("GT", "GT:AD", "GT:AD:DP:GQ:PGT:PID:PL:PS", "GT:AD:DP:GQ:PL", 
             "GT:AD:DP:RGQ", "GT:AD:PGT:PID:PS", "GT:DP:RGQ"), 
  F1601764_S460_L002 = c("0/0", "./.:0,0,0", "0|1:12,2:14:48:0|1:1344_G_A:48,0,498:1344",
                         "1/1:0,1054:1054:99:32704,3160,0", "0/0:0:0:0", ".|.:0,0:0|1:251_T_C:251",
                         "0/0:3:16")), 
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -7L))
#>                                 #CHROM  POS ID REF  ALT     QUAL  FILTER
#> 1 rs11218343_chr11_121564878_SORL1_C/T  722  . AGG    .    29.91 LowQual
#> 2  rs16941239_chr16_86420604_FOXF1_A/T 1468  .   G *,GA        .       .
#> 3  rs7384878_chr7_100334426_SPDYE3_C/T 1367  .   C    T    40.64       .
#> 4     rs3851179_chr11_86157598_EED_T/C 1000  .   T    C 32690.06       .
#> 5   rs10868366_chr9_86085145_GOLM1_G/T  246  .   C    .        .       .
#> 6  rs11771145_chr7_143413669_EPHA1_A/G  261  .   T    C        .       .
#> 7    rs6966331_chr7_37844191_EPDR1_T/C 1384  .   G    .     0.64 LowQual
#>                                                                                                                                                            INFO
#> 1                                                                                                                                 DP=5;MLEAC=.;MLEAF=.;MQ=26.79
#> 2                                                                                                                                                             .
#> 3 AC=1;AF=0.500;AN=2;BaseQRankSum=1.29;DP=20;ExcessHet=0.0000;FS=2.963;MLEAC=1;MLEAF=0.500;MQ=43.87;MQRankSum=-1.058e+00;QD=2.90;ReadPosRankSum=0.697;SOR=1.802
#> 4                                                            AC=2;AF=1.00;AN=2;DP=1097;ExcessHet=0.0000;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=31.02;SOR=0.776
#> 5                                                                                                                                                             .
#> 6                                                                                                                                                             .
#> 7                                                      BaseQRankSum=-9.670e-01;DP=3;ExcessHet=0.00;MLEAC=.;MLEAF=.;MQ=60.00;MQRankSum=0.00;ReadPosRankSum=0.967
#>                      FORMAT                        F1601764_S460_L002
#> 1                        GT                                       0/0
#> 2                     GT:AD                                 ./.:0,0,0
#> 3 GT:AD:DP:GQ:PGT:PID:PL:PS 0|1:12,2:14:48:0|1:1344_G_A:48,0,498:1344
#> 4            GT:AD:DP:GQ:PL           1/1:0,1054:1054:99:32704,3160,0
#> 5              GT:AD:DP:RGQ                                 0/0:0:0:0
#> 6          GT:AD:PGT:PID:PS                   .|.:0,0:0|1:251_T_C:251
#> 7                 GT:DP:RGQ                                  0/0:3:16

Created on 2023-10-19 with reprex v2.0.2

What I would like to do, is to find a way to go over every row and separate the values in F1601764_S460_L002 to the corresponding columns given by the elements in the FORMAT field (separated by :).

I started by doing this with data.table::fread and tidyr::separate_wider_delim, thinking that all FORMAT field were: GT:AD:DP:RGQ or GT:AD:DP:GQ:PL. But this, obviously, yield problems.

I also tried the vcfR package, but I feel it's too slow for what I need to do.

What I would like to achieve, in the end, is using a tdyr approach to achieve something like this:

structure(list(`#CHROM` = c("rs6966331_chr7_37844191_EPDR1_T/C", 
"rs7384878_chr7_100334426_SPDYE3_C/T", "rs11771145_chr7_143413669_EPHA1_A/G", 
"rs10868366_chr9_86085145_GOLM1_G/T", "rs3851179_chr11_86157598_EED_T/C", 
"rs11218343_chr11_121564878_SORL1_C/T", "rs16941239_chr16_86420604_FOXF1_A/T"
), POS = c(1384L, 1367L, 261L, 246L, 1000L, 722L, 1468L), AD = c(NA, 
"12,2", "0,0", "0", "0,1054", NA, "0,0,0"), DP = c(3L, 14L, NA, 
0L, 1054L, NA, NA), GQ = c(NA, 48L, NA, NA, 99L, NA, NA), GT = c("0/0", 
"0|1", NA, "0/0", "1/1", "0/0", NA), PGT = c(NA, "0|1", "0|1", 
NA, NA, NA, NA), PID = c(NA, "1344_G_A", "251_T_C", NA, NA, NA, 
NA), PL = c(NA, "48,0,498", NA, NA, "32704,3160,0", NA, NA), 
    PS = c(NA, 1344L, 251L, NA, NA, NA, NA), RGQ = c(16L, NA, 
    NA, 0L, NA, NA, NA), SB = c(NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_), GT_alleles = c("G/G", "C|T", ".", "C/C", 
    "C/C", "AGG/AGG", ".")), row.names = c(NA, -7L), class = c("tbl_df", 
"tbl", "data.frame"))
#>                                 #CHROM  POS     AD   DP GQ   GT  PGT      PID
#> 1    rs6966331_chr7_37844191_EPDR1_T/C 1384   <NA>    3 NA  0/0 <NA>     <NA>
#> 2  rs7384878_chr7_100334426_SPDYE3_C/T 1367   12,2   14 48  0|1  0|1 1344_G_A
#> 3  rs11771145_chr7_143413669_EPHA1_A/G  261    0,0   NA NA <NA>  0|1  251_T_C
#> 4   rs10868366_chr9_86085145_GOLM1_G/T  246      0    0 NA  0/0 <NA>     <NA>
#> 5     rs3851179_chr11_86157598_EED_T/C 1000 0,1054 1054 99  1/1 <NA>     <NA>
#> 6 rs11218343_chr11_121564878_SORL1_C/T  722   <NA>   NA NA  0/0 <NA>     <NA>
#> 7  rs16941239_chr16_86420604_FOXF1_A/T 1468  0,0,0   NA NA <NA> <NA>     <NA>
#>             PL   PS RGQ   SB GT_alleles
#> 1         <NA>   NA  16 <NA>        G/G
#> 2     48,0,498 1344  NA <NA>        C|T
#> 3         <NA>  251  NA <NA>          .
#> 4         <NA>   NA   0 <NA>        C/C
#> 5 32704,3160,0   NA  NA <NA>        C/C
#> 6         <NA>   NA  NA <NA>    AGG/AGG
#> 7         <NA>   NA  NA <NA>          .

Created on 2023-10-19 with reprex v2.0.2

I imagine this isn't an easy task, but if someone has some ideas, or already came across a similar issue... your help would be invaluable.


Solution

  • Do you mean something like this?

    library(dplyr)
    library(tidyr) # unnest, pivot_wider
    
    quux %>%
      mutate(across(FORMAT:F1601764_S460_L002, ~ strsplit(., ":"))) %>%
      unnest(c(FORMAT, F1601764_S460_L002)) %>%
      pivot_wider(id_cols = setdiff(names(quux), c("FORMAT", "F1601764_S460_L002")), names_from = FORMAT, values_from = F1601764_S460_L002)
    # # A tibble: 7 × 17
    #   `#CHROM`                               POS ID    REF   ALT   QUAL     FILTER  INFO                                                                                                                                                          GT    AD     DP    GQ    PGT   PID      PL           PS    RGQ  
    #   <chr>                                <int> <chr> <chr> <chr> <chr>    <chr>   <chr>                                                                                                                                                         <chr> <chr>  <chr> <chr> <chr> <chr>    <chr>        <chr> <chr>
    # 1 rs11218343_chr11_121564878_SORL1_C/T   722 .     AGG   .     29.91    LowQual DP=5;MLEAC=.;MLEAF=.;MQ=26.79                                                                                                                                 0/0   NA     NA    NA    NA    NA       NA           NA    NA   
    # 2 rs16941239_chr16_86420604_FOXF1_A/T   1468 .     G     *,GA  .        .       .                                                                                                                                                             ./.   0,0,0  NA    NA    NA    NA       NA           NA    NA   
    # 3 rs7384878_chr7_100334426_SPDYE3_C/T   1367 .     C     T     40.64    .       AC=1;AF=0.500;AN=2;BaseQRankSum=1.29;DP=20;ExcessHet=0.0000;FS=2.963;MLEAC=1;MLEAF=0.500;MQ=43.87;MQRankSum=-1.058e+00;QD=2.90;ReadPosRankSum=0.697;SOR=1.802 0|1   12,2   14    48    0|1   1344_G_A 48,0,498     1344  NA   
    # 4 rs3851179_chr11_86157598_EED_T/C      1000 .     T     C     32690.06 .       AC=2;AF=1.00;AN=2;DP=1097;ExcessHet=0.0000;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=31.02;SOR=0.776                                                            1/1   0,1054 1054  99    NA    NA       32704,3160,0 NA    NA   
    # 5 rs10868366_chr9_86085145_GOLM1_G/T     246 .     C     .     .        .       .                                                                                                                                                             0/0   0      0     NA    NA    NA       NA           NA    0    
    # 6 rs11771145_chr7_143413669_EPHA1_A/G    261 .     T     C     .        .       .                                                                                                                                                             .|.   0,0    NA    NA    0|1   251_T_C  NA           251   NA   
    # 7 rs6966331_chr7_37844191_EPDR1_T/C     1384 .     G     .     0.64     LowQual BaseQRankSum=-9.670e-01;DP=3;ExcessHet=0.00;MLEAC=.;MLEAF=.;MQ=60.00;MQRankSum=0.00;ReadPosRankSum=0.967                                                      0/0   NA     3     NA    NA    NA       NA           NA    16   
    

    Edit: we don't technically need an unnest/pivot operation here, with larger data they might be a little costly. Try this brute-force that only adds columns:

    Map(function(...) as.data.frame(as.list(setNames(...))), strsplit(quux$F1601764_S460_L002, ":"), strsplit(quux$FORMAT, ":")) %>%
      bind_rows() %>%
      bind_cols(quux, .)
    # # A tibble: 7 × 19
    #   `#CHROM`                               POS ID    REF   ALT   QUAL     FILTER  INFO                                                FORMAT F1601764_S460_L002 GT    AD    DP    GQ    PGT   PID   PL    PS    RGQ  
    #   <chr>                                <int> <chr> <chr> <chr> <chr>    <chr>   <chr>                                               <chr>  <chr>              <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
    # 1 rs11218343_chr11_121564878_SORL1_C/T   722 .     AGG   .     29.91    LowQual DP=5;MLEAC=.;MLEAF=.;MQ=26.79                       GT     0/0                0/0   NA    NA    NA    NA    NA    NA    NA    NA   
    # 2 rs16941239_chr16_86420604_FOXF1_A/T   1468 .     G     *,GA  .        .       .                                                   GT:AD  ./.:0,0,0          ./.   0,0,0 NA    NA    NA    NA    NA    NA    NA   
    # 3 rs7384878_chr7_100334426_SPDYE3_C/T   1367 .     C     T     40.64    .       AC=1;AF=0.500;AN=2;BaseQRankSum=1.29;DP=20;ExcessH… GT:AD… 0|1:12,2:14:48:0|… 0|1   12,2  14    48    0|1   1344… 48,0… 1344  NA   
    # 4 rs3851179_chr11_86157598_EED_T/C      1000 .     T     C     32690.06 .       AC=2;AF=1.00;AN=2;DP=1097;ExcessHet=0.0000;FS=0.00… GT:AD… 1/1:0,1054:1054:9… 1/1   0,10… 1054  99    NA    NA    3270… NA    NA   
    # 5 rs10868366_chr9_86085145_GOLM1_G/T     246 .     C     .     .        .       .                                                   GT:AD… 0/0:0:0:0          0/0   0     0     NA    NA    NA    NA    NA    0    
    # 6 rs11771145_chr7_143413669_EPHA1_A/G    261 .     T     C     .        .       .                                                   GT:AD… .|.:0,0:0|1:251_T… .|.   0,0   NA    NA    0|1   251_… NA    251   NA   
    # 7 rs6966331_chr7_37844191_EPDR1_T/C     1384 .     G     .     0.64     LowQual BaseQRankSum=-9.670e-01;DP=3;ExcessHet=0.00;MLEAC=… GT:DP… 0/0:3:16           0/0   NA    3     NA    NA    NA    NA    NA    16   
    

    You can look at the value after bind_rows() to see just the new columns being added.

    You suggested in a comment that you always have the same columns, I don't think that necessarily helps us here: we might use regex to find each of "GT", "DP", etc in FORMAT, but since we don't have all of them every row, we don't know which position they are in ahead of time, making extraction and pairing with the F16* field values a bit onerous. I think using the same strsplit approach as the first, but manually creating the columns, is a better plan for larger data.