node.js xml html-parser

parsing xml to extract text of a specific tag using htmlparser2


I am trying out node-htmlparser2 and am stuck right at the very start. I have thousands of xml files like so:

<document … loads of attribs …>
    <foo … loads of attribs …>
        <loads…> … </loads>
        <of…> … </of>
        <other…> … </other>
        <tags…> … </tags>
    </foo>
</document>

I want everything inside <foo></foo> as a single string. My code below works, but it doesn't seem to me to be the right way of doing this:

// Collects the text found inside <foo>…</foo> for each file in `files`.
// NOTE(review): `files` is assumed to be an array of file paths — confirm
// against the code that populates it.
const fs = require('fs');

let isFoo = false; // true while the parser is between <foo> and </foo>
let txt = '';      // text gathered for the file currently being parsed

// Handler return values are not used by the parser, so the handlers only
// update the shared state above; the loop below reads `txt` afterwards.
const p = new htmlparser.Parser({
    onopentag(name) {
        if (name === 'foo') {
            isFoo = true;
        }
    },
    ontext(text) {
        if (isFoo) {
            txt += text;
        }
    },
    onclosetag(tagname) {
        if (tagname === 'foo') {
            isFoo = false;
        }
    },
}, {decodeEntities: true, xmlMode: true});

const data = [];
for (const file of files) {
    // Start every file from a clean slate so text from a previous file
    // (or an unclosed <foo>) cannot leak into this record.
    txt = '';
    isFoo = false;
    p.reset();

    // Feed the file's contents (not its name) to the parser. write() does
    // not return the parsed text; the result is whatever the handlers
    // accumulated in `txt` by the time end() has run.
    p.write(fs.readFileSync(file, 'utf8'));
    p.end();

    data.push({
        filename: file,
        filetext: txt,
    });
}

Is there a better way to work with htmlparser2 without that silly isFoo flag?


Solution

  • Here is one possible way, inspired by the example given on DomHandler's NPM page and by what an (ugly) console.log of h.DomUtils reveals.

    const h = require('htmlparser2');
    const fs = require('fs');
    // Your output. Each record is pushed immediately, but its `filetext`
    // (or `err`) is filled in later, once the asynchronous read and parse
    // for that file have completed.
    const data = [];

    // files is assumed to be populated (an array of file paths).
    // forEach rather than map: the callback is run only for its side effects.
    files.forEach((file) => {
      const record = {
        filename: file
      };
      data.push(record);

      const dh = new h.DomHandler((err, dom) => {
        if (err) {
          record.err = err;
          return;
        }
        // DomUtils has many useful methods, most of them you know already, pick your preferred
        const [e] = h.DomUtils.getElementsByTagName('foo', dom);
        if (!e) {
          // No <foo> in this document: record that instead of letting
          // getText() fail on an undefined node.
          record.err = new Error(`no <foo> element found in ${file}`);
          return;
        }
        // getText: only text nodes, getInnerHTML: everything, inner tags included
        record.filetext = h.DomUtils.getText(e);
      });

      const parser = new h.Parser(dh, {decodeEntities: true, xmlMode: true});
      // Ask for a utf8 string: without an encoding readFile yields a Buffer,
      // whereas the parser works on strings.
      fs.readFile(file, 'utf8', (err, content) => {
        if (err) {
          record.err = err;
          return;
        }
        parser.write(content);
        parser.end();
      });
    });