I have seen many web scraping tutorials, but I can't find a pattern that scrapes a site with subpages.
Here is the sequence:
1. Scrape the page that contains the list of items.
2. Get the URL of the subpage from each item.
3. Go to each subpage and scrape the details.
I can find many tutorials that teach how to do step 1, but further down I can't find any good examples. I also tried X-ray, but it doesn't work well because my URL is part of the parent item.
Here is some sample code:
var request = require('request');
var cheerio = require('cheerio');
var urlModule = require('url');

var url = 'https://news.ycombinator.com';

request(url, function (error, response, html) {
  if (!error && response.statusCode == 200) {
    var $ = cheerio.load(html);
    $('span.comhead').each(function (i, element) {
      // Obtain the URL of the news item
      var a = $(this).prev();
      // Resolve relative hrefs (e.g. 'item?id=...') against the base url
      var subUrl = urlModule.resolve(url, a.attr('href'));
      // Go to that news item and obtain the title
      request(subUrl, function (error, response, html) {
        if (!error && response.statusCode == 200) {
          var $ = cheerio.load(html);
          var title = $('title').text();
          console.log(title);
        }
      });
    });
  }
});
But the cheerio each() seems to run sequentially. Is there any way I can scrape the pages in parallel?
Thanks for the help.
You can do this easily with x-ray. See the code below for an example:
var Xray = require('x-ray');
var x = Xray();

var baseUrl = 'https://news.ycombinator.com'; // set base url

x(baseUrl, {                              // scrape base url
  links1: x('a', [{ link: '@href' }])     // store links in an array of { link: ... } objects
})(function (err, obj1) {                 // pass the result to the next step
  obj1.links1.forEach(function (item) {
    // assuming item.link stores '/sample-link-to-crawl-83792',
    x(baseUrl + item.link, {              // append base url to the link and crawl it
      links2: x('a', [{ link: '@href' }])
    })(function (err, obj2) {
      obj2.links2.forEach(function (item2) { // for each link on the subpage
        console.log(item2.link);             // print the link to the console
      });
    });
  });
});
You can continue like this, or simply create a function that returns a promise and pass the scraped URLs to it at any time. Then you wait for the promises to resolve and do whatever you want with the returned data.
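For example, a minimal sketch of that promise approach might look like the following (scrapeLinks is a hypothetical helper name, and it assumes each scraped link is a relative path that can be appended to baseUrl):

var Xray = require('x-ray');
var x = Xray();

var baseUrl = 'https://news.ycombinator.com';

// hypothetical helper: wraps a one-page x-ray scrape in a promise
function scrapeLinks(url) {
  return new Promise(function (resolve, reject) {
    x(url, { links: x('a', [{ link: '@href' }]) })(function (err, obj) {
      if (err) return reject(err);
      resolve(obj.links);
    });
  });
}

scrapeLinks(baseUrl)
  .then(function (items) {
    // fire one scrape per subpage; Promise.all runs them concurrently
    return Promise.all(items.filter(function (item) {
      return item.link; // skip anchors without an href
    }).map(function (item) {
      return scrapeLinks(baseUrl + item.link);
    }));
  })
  .then(function (results) {
    results.forEach(function (links) {
      links.forEach(function (item) {
        console.log(item.link); // print every link found on every subpage
      });
    });
  })
  .catch(function (err) {
    console.error(err);
  });

Since all of the subpage scrapes are started before any of them is awaited, this also covers the parallel part of your question: the pages are fetched concurrently rather than one after another.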