It has come to my attention during my experimentation that I am unable to parse the text received from request.get with either the htmlparser or htmlparser2 packages.
Compared to https.request, the string returned is not the same in terms of line breaks. I am scraping a human-readable page, and the https.request handler is able to aggregate the data so that the output matches the server response.
But with request.get I am receiving a response that has no line breaks. Therefore I can only assume that I am using a parsing package that is not supported by requestjs, which leads me to ask:
What is the best way to actually parse the HTML received from the request.get and request.post calls?
Thank You.
My apologies, I was using the response instead of the body. Here is the proper way:
var request = require('request');
var htmlparser = require('htmlparser2');
var select = require('soupselect').select
// Address of the page to scrape, passed to request.get at the bottom of the script.
// Left empty in this snippet — presumably to be replaced with the real page URL before running.
var url = '';
/**
 * Callback for `request.get`: hands the downloaded page to the HTML parser.
 *
 * On a request failure the error is reported on stderr and the process
 * exits with status 1.
 *
 * @param {?Error} err - Failure reported by the request; falsy on success.
 * @param {object} response - Response object (not used here; the markup is taken from `body`).
 * @param {string} body - Response body, parsed as HTML.
 */
function httpsHandler(err, response, body) {
  if (err) {
    // Log the underlying error too, so a failed request can actually be diagnosed.
    console.error('to err is human', err);
    process.exit(1);
  }

  const parser = new htmlparser.Parser(htmlHandler);
  parser.parseComplete(body);
}
// Handler given to the parser in httpsHandler. Its callback receives either a
// parse error or the parsed DOM; on success the DOM is passed to extractData.
const htmlHandler = new htmlparser.DefaultHandler((error, dom) => {
  if (error) {
    // Report on stderr, matching how httpsHandler reports request failures.
    console.error('error', error);
    process.exit(1);
  }
  extractData(dom);
});
/**
 * Prints the text found under each table cell matched in the parsed page.
 *
 * For every child node of a matched cell, the node's own `data` is printed
 * when it is truthy; otherwise the `data` of the node's first child is
 * printed. Nodes that have neither text nor a first child (for example an
 * empty element such as `<br>`) are skipped rather than crashing the script.
 *
 * @param {*} dom - Parsed DOM, passed straight to `select`.
 * @returns {void}
 */
function extractData(dom) {
  const cells = select(dom, '#ctl00_LeftColumnMiddle_Table1 table td');

  for (const cell of cells) {
    // `children` may be absent on a node, so fall back to an empty list.
    for (const node of cell.children || []) {
      if (node.data) {
        console.log(node.data);
        continue;
      }

      const firstChild = node.children && node.children[0];
      if (!firstChild) {
        // Nothing to print; indexing into a missing/empty `children` used to throw here.
        continue;
      }
      console.log(firstChild.data);
    }
  }
}
// Entry point: request the page at `url`; httpsHandler receives the error/response/body.
request.get( url , httpsHandler );