rxmlsbml

How to parse xml/sbml with R package xml?


I'm trying to parse information from the sbml/xml file below

https://dl.dropboxusercontent.com/u/10712588/file.xml

from this code

http://search.bioconductor.jp/codes/11172

It seems that I can import the file normally by

doc <- xmlTreeParse(filename,ignoreBlanks = TRUE) 

but I can't recover node attributes by

atrr <- xpathApply(doc, "//species[@id]", xmlGetAttr, "id")

or

xpathApply(doc, "//species", function(n) xmlValue(n[[2]]))

A node of the file follows...

<species id="M_10fthf_m" initialConcentration="1" constant="false" hasOnly
SubstanceUnits="false" name="10-formyltetrahydrofolate(2-)" metaid="_metaM_10fth
f_m" boundaryCondition="false" sboTerm="SBO:0000247" compartment="m">
        <notes>
          <body xmlns="http://www.w3.org/1999/xhtml">
            <p>FORMULA: C20H21N7O7</p>
            <p>CHARGE: -2</p>
            <p>INCHI: InChI=1S/C20H23N7O7/c21-20-25-16-15(18(32)26-20)23-11(7-22
-16)8-27(9-28)12-3-1-10(2-4-12)17(31)24-13(19(33)34)5-6-14(29)30/h1-4,9,11,13,23
H,5-8H2,(H,24,31)(H,29,30)(H,33,34)(H4,21,22,25,26,32)/p-2/t11-,13+/m1/s1</p>
            <p>HEPATONET_1.0_ABBREVIATION: HC00212</p>
            <p>EHMN_ABBREVIATION: C00234</p>
          </body>
        </notes>
        <annotation>
...

I would like to retrieve all information inside species node, anyone know how to do that?


Solution

  • I guess it depends on what you mean when you say you want to "retrieve" all the information in the species nodes, because that retrieved data could be coerced to any number of different formats. The following assumes you want it all in a data frame, where each row is an species node from your XML file and the columns represent different pieces of information.

    When just trying to extract information, I generally find it easier to work with lists than with XML.

    doc <- xmlTreeParse(xml_file, ignoreBlanks = TRUE)
    doc_list <- xmlToList(doc)
    

    Once it's in a list, you can figure out where the species data is stored:

    sapply(x, function(x)unique(names(x)))
    [[1]]
    NULL
    
    [[2]]
    NULL
    
    [[3]]
    NULL
    
    [[4]]
    [1] "species"
    
    [[5]]
    [1] "reaction"
    
    [[6]]
    [1] "metaid"
    
    $.attrs
    [1] "level"   "version"
    

    So you really only want the information in doc_list[[4]]. Take a look at just the first component of doc_list[[4]]:

    str(doc_list[[4]][[1]])
    List of 9
     $       : chr "FORMULA: C20H21N7O7"
     $       : chr "CHARGE: -2"
     $       : chr "HEPATONET_1.0_ABBREVIATION: HC00212"
     $       : chr "EHMN_ABBREVIATION: C00234"
     $       : chr "http://identifiers.org/obo.chebi/CHEBI:57454"
     $       : chr "http://identifiers.org/pubchem.compound/C00234"
     $       : chr "http://identifiers.org/hmdb/HMDB00972"
     $       : Named chr "#_metaM_10fthf_c"
      ..- attr(*, "names")= chr "about"
     $ .attrs: Named chr [1:9] "M_10fthf_c" "1" "false" "false" ...
      ..- attr(*, "names")= chr [1:9] "id" "initialConcentration" "constant" "hasOnlySubstanceUnits" ...
    

    So you have the information contained in the first eight lists, plus the information contained in the attributes.

    Getting the attributes information is easy because it's already named. The following formats the attributes information into a data frame for each node:

    doc_attrs <- lapply(doc_list[[4]], function(x) {
      x <- unlist(x[names(x) == ".attrs"])
      col_names <- gsub(".attrs.", "", names(x))
      x <- data.frame(matrix(x, nrow = 1), stringsAsFactors = FALSE)
      colnames(x) <- col_names
      x
    })
    

    Some nodes didn't appear to have attributes information and so returned empty data frames. That caused problems later so I created data frames of NAs in their place:

    doc_attrs_cols <- unique(unlist(sapply(doc_attrs, colnames)))
    doc_attrs[sapply(doc_attrs, length) == 0] <- 
      lapply(doc_attrs[sapply(doc_attrs, length) == 0], function(x) {
        df <- data.frame(matrix(rep(NA, length(doc_attrs_cols)), nrow = 1))
        colnames(df) <- doc_attrs_cols
        df
      })
    

    When it came to pulling non-attribute data, the names and values of the variables were generally contained within the same string. I originally tried to come up with a regular expression to extract the names, but they're all formatted so differently that I gave up and just identified all the possibilities in this particular data set:

    flags <- c("FORMULA:", "CHARGE:", "HEPATONET_1.0_ABBREVIATION:", 
      "EHMN_ABBREVIATION:", "obo.chebi/CHEBI:", "pubchem.compound/", "hmdb/HMDB",  
      "INCHI: ", "kegg.compound/", "kegg.genes/", "uniprot/", "drugbank/")
    

    Also, sometimes the non-attribute information was kept as just a list of values, as in the node I showed above, while other times it was contained in "notes" and "annotation" sublists, so I had to include an if else statement to make things more consistent.

    doc_info <- lapply(doc_list[[4]], function(x) {
      if(any(names(x) != ".attrs" & names(x) != "")) {
        names(x)[names(x) != ".attrs"] <- ""
        x <- unlist(do.call("c", as.list(x[names(x) != ".attrs"])))
      } else {
      x <- unlist(x[names(x) != ".attrs"])
      }
      x <- gsub("http://identifiers.org/", "", x)
      need_names <- names(x) == ""
      names(x)[need_names] <- gsub(paste0("(", paste0(flags, collapse = "|"), ").+"), "\\1", x[need_names], perl = TRUE)
      #names(x) <- gsub("\\s+", "", names(x))
      x[need_names] <- gsub(paste0("(", paste0(flags, collapse = "|"), ")(.+)"), "\\2", x[need_names], perl = TRUE)
      col_names <- names(x)
      x <- data.frame(matrix(x, nrow = 1), stringsAsFactors = FALSE)
      colnames(x) <- col_names
      x
    })
    

    To get everything together into a data frame, I suggest the plyr package's rbind.fill.

    require(plyr)
    
    doc_info <- do.call("rbind.fill", doc_info)
    doc_attrs <- do.call("rbind.fill", doc_attrs)
    
    doc_all <- cbind(doc_info, doc_attrs)
    
    
    dim(doc_all)
    [1] 3972   22
    
    colnames(doc_all)
     [1] "FORMULA:"                    "CHARGE:"                     "HEPATONET_1.0_ABBREVIATION:" "EHMN_ABBREVIATION:"         
     [5] "obo.chebi/CHEBI:"            "pubchem.compound/"           "hmdb/HMDB"                   "about"                      
     [9] "INCHI: "                     "kegg.compound/"              "kegg.genes/"                 "uniprot/"                   
    [13] "drugbank/"                   "id"                          "initialConcentration"        "constant"                   
    [17] "hasOnlySubstanceUnits"       "name"                        "metaid"                      "boundaryCondition"          
    [21] "sboTerm"                     "compartment"