node.js, web-scraping, web-crawler

I cannot get URLs from a website using Node.js


I am trying to scrape URLs from this website using the code below:

// Fetch a topic page, parse the returned HTML with cheerio, and print the
// link URLs collected from the matched elements.
var request = require("request");
// NOTE: the line above ends with ";", so the next two assignments are not part
// of the `var` declaration and create implicit globals.
cheerio = require("cheerio");
urls = [];

// The callback receives the error (if any), the response object, and the body.
request("http://news.sabay.com.kh/topics/sport", function(err, resp, body){
    // Only proceed when there was no error and the status code is 200;
    // every other outcome is ignored without any message.
    if(!err && resp.statusCode ==200){
        var $ = cheerio.load(body);
        // NOTE(review): "h4.title" is passed as a separate second argument here,
        // not as part of the selector string — confirm this is the intended query.
        $(".article","h4.title").each(function(){
            // NOTE(review): assumes `this` exposes an .attr() method inside
            // .each() — verify against the cheerio documentation.
            var url = this.attr("href");
            urls.push(url);
        });
        // Printed after the .each() loop above has finished.
        console.log(urls);
    }
});

But I cannot get the result. When I run it, I get this:

$ node server.js
[]

Solution

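  • First, use a proper CSS selector. The original call passed two arguments, so cheerio treated "h4.title" as the search context instead of as part of the selector, and nothing matched. Write a single descendant selector that ends at the link element: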

    .article h4.title > a
    

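    Then, use the proper field. Inside .each(), this is the raw DOM element rather than a cheerio object, so it has no .attr() method; read the attribute from attribs instead: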

    var url = this.attribs.href
    

    Which gives :

    // Scrape the article links from the sport topic page and print them.
    // Each name gets its own `var` so that nothing leaks into the global scope
    // (a bare `cheerio = ...` would create an implicit global).
    var request = require("request");
    var cheerio = require("cheerio");
    var urls = [];

    // The callback receives the error (if any), the response object, and the body.
    request("http://news.sabay.com.kh/topics/sport", function(err, resp, body){
        // Report failures instead of finishing silently with no output.
        if(err){
            console.error("Request failed:", err);
            return;
        }
        if(resp.statusCode !== 200){
            console.error("Unexpected status code:", resp.statusCode);
            return;
        }
        var $ = cheerio.load(body);
        // Single descendant selector: the <a> that is a direct child of each
        // h4.title found inside an .article element.
        $(".article h4.title > a").each(function(){
            // `this` is the raw DOM element here, so the attribute is read
            // from its `attribs` map rather than through .attr().
            var url = this.attribs.href;
            urls.push(url);
        });
        console.log(urls);
    });
    

    and outputs :

    [ 'http://news.sabay.com.kh/article/546826',
      'http://news.sabay.com.kh/article/546763',
      'http://news.sabay.com.kh/article/546520',
      'http://news.sabay.com.kh/article/546568',
      'http://news.sabay.com.kh/article/546460',
      'http://news.sabay.com.kh/article/546448',
      'http://news.sabay.com.kh/article/545674',
      'http://news.sabay.com.kh/article/546235',
      'http://news.sabay.com.kh/article/545698',
      'http://news.sabay.com.kh/article/546091' ]