I am trying to scrape any web page passed into the scrape function, but no matter what timeout I set at page.goto(), I keep getting a timeout error; if I set it to 0, the app just keeps waiting.
const express = require('express');
const cors = require('cors');
const MYPORT = process.env.PORT || 4001;
const app = express();
const puppeteer = require('puppeteer');
app.use(express.json());
app.use(cors());
const scrape = async (url) => {
  var body;
  try {
    const browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox'],
      timeout: 0
    });
    console.log('Browser launched');
    const page = await browser.newPage();
    console.log('Page opened');
    await page.goto(url, { waitUntil: 'load', timeout: 3 * 60000 });
    await page.waitForSelector('body', { waitUntil: 'load' });
    console.log('Link opened');
    await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 3 * 60000 });
    page.$eval('html', bdy => {
      console.log(bdy);
      body = bdy.innerHTML;
    });
    browser.close();
    return body;
  } catch (err) {
    console.log(err);
  }
};
scrape('http://google.com');
Please, what am I doing wrong?
I am using Ubuntu 18.04 on WSL (Windows Subsystem for Linux)
You have overcomplicated the timeout/waitFor part of your script a bit. I advise the following changes:

- headless: false, so you can see on the UI what goes wrong, and you can open the browser console etc.
- The explicit timeouts and the waitUntil option on waitForSelector: I removed them from your script.
- waitUntil: 'domcontentloaded' rather than load or the most strict networkidle2! See what the exact difference between them is in the docs: [link]
- waitForNavigation is not needed after you've already waited for a selector. That was the main reason why your script failed: once <body> appeared in the DOM, you asked puppeteer to wait until a navigation finished, but you didn't navigate at that point: you were already on the page (a short sketch of when waitForNavigation is appropriate follows this list). Keep in mind:

  "page.waitForNavigation resolves when the page navigates to a new URL or reloads. It is useful for when you run code which will indirectly cause the page to navigate." [source]

- page.$eval was lacking its async nature, while it should always be async. Anyway, the innerHTML of <body> can be retrieved more simply with: await page.evaluate(el => el.innerHTML, await page.$('body')).
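For contrast, here is a minimal sketch of a case where waitForNavigation does belong: when you run code that indirectly triggers a navigation, e.g. clicking a link. The 'a.some-link' selector is just a placeholder, not something from your page:

  // start waiting for the navigation before clicking, then await both,
  // otherwise the navigation may finish before waitForNavigation is called
  await Promise.all([
    page.waitForNavigation({ waitUntil: 'domcontentloaded' }),
    page.click('a.some-link') // placeholder selector, adjust for your page
  ]);

With that in mind, the simplified script: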
const scrape = async url => {
  try {
    const browser = await puppeteer.launch({
      headless: false,
      args: ['--no-sandbox']
    })
    console.log('Browser launched')
    const page = await browser.newPage()
    console.log('Page opened')
    await page.goto(url, { waitUntil: 'domcontentloaded' })
    await page.waitForSelector('body')
    console.log('Link opened')
    const body = await page.evaluate(el => el.innerHTML, await page.$('body'))
    console.log(body)
    await browser.close()
    return body
  } catch (err) {
    console.log(err)
  }
}
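If you want to serve the result through the Express app you already set up, one possible wiring is a simple route that calls scrape; the /scrape path and the url query parameter are just assumptions for illustration:

  // hypothetical route: GET /scrape?url=http://google.com returns the page's body HTML
  app.get('/scrape', async (req, res) => {
    const html = await scrape(req.query.url)
    res.send(html)
  })

  app.listen(MYPORT, () => console.log(`Listening on port ${MYPORT}`))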