I have a very simple web scraper app whose sole purpose is to evaluate the JavaScript on a webpage and return the generated HTML. It works fine most of the time. However, it is not handling timeouts in the way I need it to.
The code is called by an AppleScript handler which passes in a timeout value and needs to be specifically signalled when the timeout has elapsed.
I adapted some code from here, and although I don't have much experience (at all) with JavaScript itself, I believe I can see what it's doing. Trouble is, at runtime I'm being told that `TimeoutError` doesn't exist. I assume it's defined somewhere in the puppeteer infrastructure, and I do require puppeteer
(puppeteer stealth, actually, but the problem occurs with straight puppeteer also), so what the heck?
I have an SSCCE which should demonstrate it:
//**************************************** constants
// all error codes that we pass back
const ERROR_CODE_URL_INVALID = 1002;
const ERROR_INTERNAL_SERVER_ERROR = 1003;
const ERROR_TIMEOUT = 1004;
//**************************************** scrape
/**
 * Navigate `page` to `url` and return the HTML of the loaded document.
 *
 * @param {object} page - page object; this function calls its goto(),
 *   waitForResponse(), mouse.click() and evaluate() methods.
 * @param {string} url - address to load.
 * @returns {Promise<string>} outerHTML of the first element matching `*`.
 * @throws {number} ERROR_CODE_URL_INVALID for a 404 response,
 *   ERROR_INTERNAL_SERVER_ERROR for a 500 response, ERROR_TIMEOUT when the
 *   error caught during loading is a TimeoutError.
 * @throws {Error} for any other non-200 status; any caught error that is not
 *   a timeout is rethrown unchanged.
 */
const scrape = async (page, url) => {
  // numeric code to hand back to the caller once loading has been assessed
  let errorContent = null;
  try {
    // deliberately tiny timeout value, to demonstrate problem
    let response = await page.goto(url, { waitUntil: 'load', timeout: 100 });
    if (!response) {
      // sometimes the response is not returned immediately, so we need to wait for it
      response = await page.waitForResponse(() => true, { timeout: 100 });
    }
    const status = response.status();
    if (status === 404) {
      // get ready to throw a 404 below
      errorContent = ERROR_CODE_URL_INVALID;
    } else if (status === 500) {
      // get ready to throw a 500 below
      errorContent = ERROR_INTERNAL_SERVER_ERROR;
    } else if (status !== 200) {
      // throw a real Error down to the catch block; throwing `response.text`
      // would throw the method itself rather than anything descriptive
      throw new Error(`Unexpected HTTP status ${status} for ${url}`);
    }
  } catch (error) {
    // NOTE(review): `TimeoutError` is not declared anywhere in this snippet;
    // this is the line the surrounding discussion reports as failing at runtime.
    if (error instanceof TimeoutError) {
      // get ready to throw a timeout to our caller
      errorContent = ERROR_TIMEOUT;
    } else {
      // rethrow the error we got above
      throw error;
    }
  }
  // now throw whatever error code we settled on above
  if (errorContent !== null) {
    throw errorContent;
  }
  // first hopefully get rid of any annoying popups in the way; awaited so the
  // click has finished (and any failure surfaces here) before the HTML is read
  await page.mouse.click(1, 1);
  // get the page HTML and return it
  const data = await page.evaluate(() => document.querySelector('*').outerHTML);
  return data;
}; // scrape()
//**************************************** main
// Declared outside the async body so the trailing finally() can reach it.
let browser;
(async function main() {
  // puppeteer-extra stands in for puppeteer and layers plugin support
  // on top of the installed puppeteer
  const puppeteer = require('puppeteer-extra');
  // swap in the following line for the one above to run without puppeteer stealth
  //const puppeteer = require('puppeteer');
  // register the stealth plugin with its defaults (all evasion techniques)
  const stealthPlugin = require('puppeteer-extra-plugin-stealth');
  puppeteer.use(stealthPlugin());
  // launch a visible browser and take its first page
  const launchOptions = { headless: false };
  browser = await puppeteer.launch(launchOptions);
  const openPages = await browser.pages();
  const page = openPages[0];
  // scrape the target page and print its HTML
  const html = await scrape(page, "https://www.scrapethissite.com/pages/");
  console.log(html);
})()
  .catch((err) => console.log(err))
  .finally(() => browser?.close());
Edit: following @ggorlen's suggestion in a comment below, I have changed the code to the following, but it also fails, this time with `TypeError: Cannot read properties of undefined (reading 'TimeoutError')`:
//**************************************** constants
// all errors that we pass back
const ERROR_CODE_URL_INVALID = 1002;
const ERROR_INTERNAL_SERVER_ERROR = 1003;
const ERROR_TIMEOUT = 1004;
// module-level so that scrape() can reach the puppeteer module's error classes
let puppeteer;
//**************************************** scrape
/**
 * Navigate `page` to `url` and return the HTML of the loaded document.
 *
 * @param {object} page - page object; this function calls its goto(),
 *   waitForResponse(), mouse.click() and evaluate() methods.
 * @param {string} url - address to load.
 * @returns {Promise<string>} outerHTML of the first element matching `*`.
 * @throws {number} ERROR_CODE_URL_INVALID for a 404 response,
 *   ERROR_INTERNAL_SERVER_ERROR for a 500 response, ERROR_TIMEOUT when the
 *   error caught during loading is a puppeteer.errors.TimeoutError.
 * @throws {Error} for any other non-200 status; any caught error that is not
 *   a timeout is rethrown unchanged.
 */
const scrape = async (page, url) => {
  // numeric code to hand back to the caller once loading has been assessed
  let errorContent = null;
  try {
    let response = await page.goto(url, { waitUntil: 'load', timeout: 100 });
    if (!response) {
      // sometimes the response is not returned immediately, so we need to wait for it
      response = await page.waitForResponse(() => true, { timeout: 100 });
    }
    const status = response.status();
    if (status === 404) {
      // get ready to throw a 404 below
      errorContent = ERROR_CODE_URL_INVALID;
    } else if (status === 500) {
      // get ready to throw a 500 below
      errorContent = ERROR_INTERNAL_SERVER_ERROR;
    } else if (status !== 200) {
      // throw a real Error down to the catch block; throwing `response.text`
      // would throw the method itself rather than anything descriptive
      throw new Error(`Unexpected HTTP status ${status} for ${url}`);
    }
  } catch (error) {
    // NOTE(review): relies on the module assigned to `puppeteer` exposing
    // `errors.TimeoutError`; the surrounding discussion reports that `errors`
    // is undefined at runtime, which makes this lookup throw a TypeError.
    if (error instanceof puppeteer.errors.TimeoutError) {
      // get ready to throw a timeout to our caller
      errorContent = ERROR_TIMEOUT;
    } else {
      // rethrow the error we got above
      throw error;
    }
  }
  // now throw whatever error code we settled on above
  if (errorContent !== null) {
    throw errorContent;
  }
  // first hopefully get rid of any annoying popups in the way; awaited so the
  // click has finished (and any failure surfaces here) before the HTML is read
  await page.mouse.click(1, 1);
  // get the page HTML and return it
  const data = await page.evaluate(() => document.querySelector('*').outerHTML);
  return data;
}; // scrape()
//**************************************** main
// Declared outside the async body so the trailing finally() can reach it.
let browser;
(async function main() {
  // puppeteer-extra stands in for puppeteer and layers plugin support
  // on top of the installed puppeteer; stored in the module-level binding
  // so that scrape() can see it too
  puppeteer = require('puppeteer-extra');
  // swap in the following line for the one above to run without puppeteer stealth
  //const puppeteer = require('puppeteer');
  // register the stealth plugin with its defaults (all evasion techniques)
  const stealthPlugin = require('puppeteer-extra-plugin-stealth');
  puppeteer.use(stealthPlugin());
  // launch a visible browser and take its first page
  const launchOptions = { headless: false };
  browser = await puppeteer.launch(launchOptions);
  const openPages = await browser.pages();
  const page = openPages[0];
  // scrape the target page and print its HTML
  const html = await scrape(page, "https://www.scrapethissite.com/pages/");
  console.log(html);
})()
  .catch((err) => console.log(err))
  .finally(() => browser?.close());
I'd be grateful for help resolving this!
I think you're looking for `puppeteer.TimeoutError`.
In CommonJS:
const puppeteer = require("puppeteer"); // ^23.7.1
// Kept outside the async body so the trailing finally() can close it.
let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  try {
    // short timeout to force a throw
    await page.goto("https://www.example.com", { timeout: 1 });
  } catch (err) {
    // the class is read straight off the required module
    const isTimeout = err instanceof puppeteer.TimeoutError;
    // with the 1 ms timeout above, the timeout label is the one that logs
    const label = isTimeout ? "caught timeout error" : "caught non-timeout error";
    console.error(label, err);
  }
})()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
If you're using modules, you can use:
import puppeteer, { TimeoutError } from "puppeteer";
// Kept outside the async body so the trailing finally() can close it.
let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  try {
    // 1 ms timeout so that navigation throws
    await page.goto("https://www.example.com", { timeout: 1 });
  } catch (err) {
    // TimeoutError here is the named import from the first line
    const isTimeout = err instanceof TimeoutError;
    const label = isTimeout ? "caught timeout error" : "caught non-timeout error";
    console.error(label, err);
  }
})()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
or:
// Namespace import: the package's named exports are read as properties of `puppeteer`.
import * as puppeteer from "puppeteer";
console.log(puppeteer.TimeoutError); // check that it's defined (prints undefined if the export is missing)
`puppeteer.errors.TimeoutError` may have worked in older Puppeteer versions, but I don't see that it exists in the recent v24/v23 versions I tried.