[SOLVED] Best way to scrape and parse html in nodejs with request package

Best way to scrape and parse html in nodejs with request package

During my experimentation I found that I am unable to parse the text received from request.get with either the htmlparser or htmlparser2 packages.

Compared to https.request, the string returned is not the same with respect to line breaks. I am scraping a human-readable page, and with https.request my handler is able to aggregate the data so that the output matches the server response.

But with request.get I am receiving a response that has no line breaks. Therefore I can only assume I am using a parsing package that is not supported by request, which leads me to ask:

What is the best way to actually parse the HTML received from the request.get and request.post calls?

Thank You.

Solution

My apologies, I was using the response instead of the body. Here is the proper way:

var request = require('request');
var htmlparser = require('htmlparser2');
var select = require('soupselect').select

// URL of the page to scrape, passed to request.get below.
// It is empty in this snippet; set it before running.
var url =  '';


/**
 * Callback for `request.get`: hands the downloaded body to the HTML parser.
 *
 * On a request error the error is logged and the process exits with code 1.
 *
 * @param {?Error} err - Error reported by the request, if any.
 * @param {Object} response - Response object; unused here, only `body` is parsed.
 * @param {string} body - Response body (presumably the page's HTML text).
 */
function httpsHandler(err, response, body) {
    if (err) {
        // Log the underlying error too, so the failure can be diagnosed.
        console.error('to err is human', err);
        process.exit(1);
    }

    const parser = new htmlparser.Parser(htmlHandler);
    parser.parseComplete(body);
}


// DOM handler given to the parser in httpsHandler. Its callback receives
// either a parse error or the parsed DOM (htmlparser2 handler-callback
// convention). On error it logs and exits with code 1; otherwise it passes
// the DOM to extractData, which prints the scraped values.
var htmlHandler = new htmlparser.DefaultHandler( (error, dom) => {
  if (error){
    // Report on stderr, matching the request-error path in httpsHandler.
    console.error( 'error', error );
    process.exit(1);
  }
  // extractData only prints and returns nothing, so there is no result to keep.
  extractData( dom );
});


/**
 * Prints the text of every child node of each table cell matched in `dom`.
 *
 * For each child node, the node's own `data` is used when it is truthy;
 * otherwise the `data` of the node's first child is used. Nodes that yield
 * no text (no `data` and no first child carrying `data`) are skipped rather
 * than raising a TypeError, and cells without a `children` list are ignored.
 *
 * @param {Array<Object>} dom - Parsed DOM, passed straight to `select`.
 * @returns {undefined} Nothing; the text is written with `console.log`.
 */
function extractData( dom ){

  const cells = select(dom, '#ctl00_LeftColumnMiddle_Table1 table td');

  cells.forEach( ( cell ) => {
    // A cell with no content may have a missing `children` list.
    const nodes = cell.children || [];

    nodes.forEach( ( node ) => {
      let text = node.data;

      if ( !text ) {
        // No data on the node itself: fall back to its first child, if any.
        const firstChild = node.children && node.children[0];
        text = firstChild ? firstChild.data : undefined;
      }

      if ( text !== undefined && text !== null ) {
        console.log( text );
      }
    });
  });
}

//entry point
// Fetch `url`; httpsHandler is the callback and takes (err, response, body).
request.get( url , httpsHandler );