I'm trying to scrape some PubMed data using cheerio. The following script works fine when every article contains all of the tags, but when an XML tag is missing from one of the articles, the output comes out in the wrong order.
var request = require('request'),
cheerio = require('cheerio');
// Fetch two PubMed records as XML and print, for each article, its PMID,
// creation year, title and abstract (one value per line).
request('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=23545583,23103438',
function(error, response, body) {
  // Report a failed request instead of trying to parse an undefined body.
  if (error) {
    return console.error(error);
  }
  var $ = cheerio.load(body);
  // Look every field up inside its own <PubmedArticle>. Selecting a tag across
  // the whole document and indexing the matches by article number pairs values
  // with the wrong article as soon as one article lacks that tag (or contains
  // it more than once).
  $('PubmedArticle').each(function(i, element) {
    var article = $(element);
    // .first().text() yields '' for a missing tag, so each article always
    // prints exactly four lines and the fields stay aligned.
    console.log(article.find('PMID').first().text());
    console.log(article.find('DateCreated Year').first().text());
    console.log(article.find('ArticleTitle').first().text());
    console.log(article.find('Abstract AbstractText').first().text());
  });
});
In this example, the abstract is printed below the first title instead of the second one, because the first article does not contain an abstract.
In the end, I think I can overcome this by using a different strategy, looking each tag up inside its own article:
var $ = require('cheerio')
var request = require('request')
/**
 * Callback for `request`: parses a PubMed efetch XML response and prints, for
 * every article, its PMID, title, creation year and (only when present) its
 * abstract, one value per line.
 *
 * @param {?Error} err - Error reported by `request`, if any; it is logged and
 *     nothing is parsed.
 * @param {Object} resp - Response object (unused).
 * @param {string} xml - Response body to parse.
 */
function gotXML(err, resp, xml) {
  if (err) return console.error(err);
  var parsedXML = $.load(xml);
  // .each() rather than .map(): the callback only prints, it builds no result.
  parsedXML('PubmedArticle').each(function(i, article) {
    // Wrap the element with the loaded document's own query function so the
    // lookups below are scoped to this article.
    var node = parsedXML(article);
    // .first().text() returns '' when a tag is missing or empty, where
    // [0].children[0].data would throw; it also includes text nested inside
    // inline markup. Lowercase tag names are kept from the original selectors
    // (presumably the parser lowercases tags without xmlMode; confirm).
    console.log(node.find('pmid').first().text());
    console.log(node.find('articletitle').first().text());
    console.log(node.find('datecreated year').first().text());
    var abstractText = node.find('abstracttext');
    if (abstractText.length > 0) {
      console.log(abstractText.first().text());
    }
  });
}
// Full efetch URL (not just a host name) requesting two PubMed records as XML.
var efetchUrl = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=23545583,23103438';
// Start the download; gotXML receives the error, the response and the body.
request(efetchUrl, gotXML);