javascriptnode.jspuppeteer

I want to get the urls of each home from the attribute content


const puppeteer = require("puppeteer");
const cheerio = require("cheerio");


const url = "https://www.airbnb.co.in/s/Haridwar--Uttarakhand/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&l2_property_type_ids%5B%5D=1&search_type=autocomplete_click&query=Haridwar%2C%20Uttarakhand&place_id=ChIJyVfuuA5HCTkR8_VApnaRRE4&date_picker_type=calendar&source=structured_search_input_header";

async function scrapHomesPage(url)
{
    try
    {
    const browser = await puppeteer.launch({headless:false});
    const page = await browser.newPage();
    
    await page.goto(url);
    
    const html = await page.evaluate(()=> document.body.innerHTML);
    const $ =  cheerio.load(html); 
    
    const homes = $('[itemprop="url"]').map((i, element) => $(element).attr("content")).get();
    console.log(homes);
    }
    catch(err)
    {
        console.error(err);
    }
    
}

scrapHomesPage("https://www.airbnb.co.in/s/Haridwar--Uttarakhand/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&l2_property_type_ids%5B%5D=1&search_type=autocomplete_click&query=Haridwar%2C%20Uttarakhand&place_id=ChIJyVfuuA5HCTkR8_VApnaRRE4&date_picker_type=calendar&source=structured_search_input_header");

I tried to add everything I could to wait for the page to load all the contents. I tried wait for selectors etc. I am always getting an empty array instead I should get an array with all the links of each home listed on the Airbnb site for that particular location.


Solution

  • I don't see any reason to use Cheerio here. It's just another layer of indirection to get the data you want, involving an extra dependency, a whole second parse of the page and the potential for bugs when the page goes out of sync with the HTML snapshot you've created. If you do need to use it, you can use page.content() instead of page.evaluate(() => document.body.innerHTML).

    As for the main problem, you appear to be missing a call to page.waitForSelector:

    const puppeteer = require("puppeteer"); // ^19.0.0
    
    const url = "your url";
    
    let browser;
    (async () => {
      browser = await puppeteer.launch();
      const [page] = await browser.pages();
      await page.goto(url, {waitUntil: "domcontentloaded"});
      await page.waitForSelector('[itemprop="url"]');
      const content = await page.$$eval(
        '[itemprop="url"]',
        els => els.map(el => el.getAttribute("content"))
      );
      console.log(content);
    })()
      .catch(err => console.error(err))
      .finally(() => browser?.close());
    

    Disclosure: I'm the author of the linked blog post.