javascript node.js puppeteer

Puppeteer "TimeoutError is not defined"


I have a very simple web scraper app whose sole purpose is to evaluate the javascript on a webpage and return the generated HTML. It works fine most of the time. However, it is not handling timeouts in the way I need it to.

The code is called by an applescript handler which passes in a timeout value and needs to be specifically signalled when the timeout has elapsed.

I adapted some code from here, and although I don't have much experience (at all) with javascript itself I believe I can see what it's doing. Trouble is, at runtime I'm being told that TimeoutError doesn't exist. I assume it's defined somewhere in the puppeteer infrastructure, and I require puppeteer (puppeteer stealth, actually, but the problem occurs with straight puppeteer also), so what the heck?

I have an SSCCE which should demonstrate it:

//**************************************** constants

// all error codes that we pass back
// (scrape() throws these as bare numbers, not as Error objects)
// thrown when the response status is 404
const ERROR_CODE_URL_INVALID = 1002;
// thrown when the response status is 500
const ERROR_INTERNAL_SERVER_ERROR = 1003;
// meant to be thrown when the page load times out
const ERROR_TIMEOUT = 1004;


//**************************************** scrape
// Navigates `page` to `url` and resolves with the outerHTML of the first
// element matched by document.querySelector('*').
//
// page - object providing goto(), waitForResponse(), mouse.click() and
//        evaluate(); main() passes the first entry of browser.pages()
// url  - address handed to page.goto()
//
// Rejects with:
//   ERROR_CODE_URL_INVALID       when the response status is 404
//   ERROR_INTERNAL_SERVER_ERROR  when the response status is 500
//   ERROR_TIMEOUT                intended for timeouts (see the NOTE in the catch block)
//   any other error caught by the try block, rethrown unchanged
const scrape = async (page, url) => {
    let response = null;

    // load the URL and check it's something useful
    // errorContent holds the code to throw once the try/catch has finished,
    // so that the code itself does not pass through the catch handler
    let errorContent = null;
    try {
        // deliberately tiny timeout value, to demonstrate problem
        response = await page.goto(url, { waitUntil: 'load', timeout: 100 });
        if (!response) {
            // sometimes the response is not returned immediately, so we need to wait for it
            // (the predicate returns true for every response it is offered)
            response = await page.waitForResponse(() => true, { timeout: 100 });
        }

        if (response.status() === 200) {
            // success; nothing to do here
        }
        else if (response.status() === 404) {
            // get ready to throw a 404 below
            errorContent = ERROR_CODE_URL_INVALID;
        } else if (response.status() === 500) {
            // get ready to throw a 500 below
            errorContent = ERROR_INTERNAL_SERVER_ERROR;
        } else {
            // throw whatever else we have down to the catch block
            // NOTE(review): response.text is not called here, so the value of
            // the `text` property itself is thrown rather than the body text
            throw response.text;
        }
    } catch (error) {
            // NOTE(review): no binding named TimeoutError is declared in this
            // snippet, so evaluating this condition produces the
            // "TimeoutError is not defined" error the question is about
            if (error instanceof TimeoutError) {
                // get ready to throw a timeout to our caller
                errorContent = ERROR_TIMEOUT;
            }
            else {
                // rethrow the error we got above
                throw error;
            }
    }
    // now throw whatever error we got above
    if (errorContent !== null) {
        throw errorContent;
    }

    // get ready to scrape the page HTML
    // first hopefully get rid of any annoying popups in the way
    // NOTE(review): the result of click() is not awaited; presumably it is a
    // promise, in which case the click may still be pending when evaluate() runs
    page.mouse.click(1, 1);

    // get the page HTML and return it
    const data = await page.evaluate(() => document.querySelector('*').outerHTML);
    return data;
} // scrape()

//**************************************** main
// Browser handle; declared outside main() so the .finally() handler at the
// bottom can reach it.
let browser;
// Entry point: launches a browser through puppeteer-extra with the stealth
// plugin, runs scrape() on one fixed URL and logs the HTML it returns.
(async function main() {
    // puppeteer-extra is a drop-in replacement for puppeteer,
    // it augments the installed puppeteer with plugin functionality
    // (this binding is local to main(), so scrape() cannot see it)
    const puppeteer = require('puppeteer-extra');
    // use the following line instead of the above one if not doing puppeteer stealth
    //const puppeteer = require('puppeteer');

    // add stealth plugin and use defaults (all evasion techniques)
    const stealthPlugin = require('puppeteer-extra-plugin-stealth')
    puppeteer.use(stealthPlugin())

    // start up the browser and get a page
    browser = await puppeteer.launch({headless: false});
    // take the first entry of the list returned by browser.pages()
    const [page] = await browser.pages();
    
    // get the page HTML and return it
    let s = await scrape(page, "https://www.scrapethissite.com/pages/");
    console.log(s);
})()        
// anything main() rejects with, including the numeric codes thrown by
// scrape(), is only logged here
.catch(err => console.log(err))
// optional chaining: browser is still undefined if launch() never completed
.finally(() => browser?.close());

Edit: following @ggorlen's suggestion in a comment below I have changed the code to the following, but it also fails, this time with TypeError: Cannot read properties of undefined (reading 'TimeoutError')

//**************************************** constants

// all errors that we pass back
// (scrape() throws these as bare numbers, not as Error objects)
// thrown when the response status is 404
const ERROR_CODE_URL_INVALID = 1002;
// thrown when the response status is 500
const ERROR_INTERNAL_SERVER_ERROR = 1003;
// meant to be thrown when the page load times out
const ERROR_TIMEOUT = 1004;

// module-level handle read by scrape(); it is only assigned inside main()
let puppeteer;

//**************************************** scrape
// Navigates `page` to `url` and resolves with the outerHTML of the first
// element matched by document.querySelector('*').
//
// page - object providing goto(), waitForResponse(), mouse.click() and
//        evaluate(); main() passes the first entry of browser.pages()
// url  - address handed to page.goto()
//
// Rejects with:
//   ERROR_CODE_URL_INVALID       when the response status is 404
//   ERROR_INTERNAL_SERVER_ERROR  when the response status is 500
//   ERROR_TIMEOUT                intended for timeouts (see the NOTE in the catch block)
//   any other error caught by the try block, rethrown unchanged
const scrape = async (page, url) => {
    
    let response = null;

    // load the URL and check it's something useful
    // errorContent holds the code to throw once the try/catch has finished,
    // so that the code itself does not pass through the catch handler
    let errorContent = null;
    try {
        response = await page.goto(url, { waitUntil: 'load', timeout: 100 });
        if (!response) {
            // sometimes the response is not returned immediately, so we need to wait for it
            // (the predicate returns true for every response it is offered)
            response = await page.waitForResponse(() => true, { timeout: 100 });
        }

        if (response.status() === 200) {
            // success; nothing to do here
        }
        else if (response.status() === 404) {
            // get ready to throw a 404 below
            errorContent = ERROR_CODE_URL_INVALID;
        } else if (response.status() === 500) {
            // get ready to throw a 500 below
            errorContent = ERROR_INTERNAL_SERVER_ERROR;
        } else {
            // throw whatever else we have down to the catch block
            // NOTE(review): response.text is not called here, so the value of
            // the `text` property itself is thrown rather than the body text
            throw response.text;
        }
    } catch (error) {
            // NOTE(review): `puppeteer` is the module-level variable that main()
            // assigns from require('puppeteer-extra'). The question reports
            // "Cannot read properties of undefined (reading 'TimeoutError')"
            // for this line, i.e. puppeteer.errors is undefined on that object
            if (error instanceof puppeteer.errors.TimeoutError) {
                // get ready to throw a timeout to our caller
                errorContent = ERROR_TIMEOUT;
            }
            else {
                // rethrow the error we got above
                throw error;
            }
    }
    // now throw whatever error we got above
    if (errorContent !== null) {
        throw errorContent;
    }

    // get ready to scrape the page HTML
    // first hopefully get rid of any annoying popups in the way
    // NOTE(review): the result of click() is not awaited; presumably it is a
    // promise, in which case the click may still be pending when evaluate() runs
    page.mouse.click(1, 1);

    // get the page HTML and return it
    const data = await page.evaluate(() => document.querySelector('*').outerHTML);
    return data;
} // scrape()

//**************************************** main
// Browser handle; declared outside main() so the .finally() handler at the
// bottom can reach it.
let browser;
// Entry point: stores puppeteer-extra in the module-level `puppeteer`
// variable, launches a browser with the stealth plugin, runs scrape() on one
// fixed URL and logs the HTML it returns.
(async function main() {
    // puppeteer-extra is a drop-in replacement for puppeteer,
    // it augments the installed puppeteer with plugin functionality
    // (no const/let here: this assigns the module-level variable that scrape() reads)
    puppeteer = require('puppeteer-extra');
    // use the following line instead of the above one if not doing puppeteer stealth
    // NOTE(review): swapped in as written, the `const` would create a binding
    // local to main() and leave the module-level `puppeteer` undefined for
    // scrape(); drop the `const` when switching
    //const puppeteer = require('puppeteer');

    // add stealth plugin and use defaults (all evasion techniques)
    const stealthPlugin = require('puppeteer-extra-plugin-stealth')
    puppeteer.use(stealthPlugin())

    // start up the browser and get a page
    browser = await puppeteer.launch({headless: false});
    // take the first entry of the list returned by browser.pages()
    const [page] = await browser.pages();
    
    // get the page HTML and return it
    let s = await scrape(page, "https://www.scrapethissite.com/pages/");
    console.log(s);
})()        
// anything main() rejects with, including the numeric codes thrown by
// scrape(), is only logged here
.catch(err => console.log(err))
// optional chaining: browser is still undefined if launch() never completed
.finally(() => browser?.close());

I'd be grateful for help resolving this!


Solution

  • I think you're looking for puppeteer.TimeoutError in CommonJS:

    const puppeteer = require("puppeteer"); // ^23.7.1

    // Loads the example page with a timeout of 1 so that goto() is expected to
    // throw, then reports whether the caught error is a puppeteer.TimeoutError.
    const probeTimeout = async (page) => {
      try {
        await page.goto("https://www.example.com", {timeout: 1});
      } catch (caught) {
        const isTimeout = caught instanceof puppeteer.TimeoutError;
        // the timeout message is the one expected to be logged
        console.error(isTimeout ? "caught timeout error" : "caught non-timeout error", caught);
      }
    };

    // kept outside run() so the cleanup handler below can reach it
    let browser;
    const run = async () => {
      browser = await puppeteer.launch();
      const [firstPage] = await browser.pages();
      await probeTimeout(firstPage);
    };

    run()
      .catch((failure) => console.error(failure))
      .finally(() => browser?.close());
    

    If you're using modules, you can use:

    import puppeteer, {TimeoutError} from "puppeteer";

    // Loads the example page with a timeout of 1 so that goto() is expected to
    // throw, then reports whether the caught error is a TimeoutError.
    const probeTimeout = async (page) => {
      try {
        await page.goto("https://www.example.com", {timeout: 1});
      } catch (caught) {
        const isTimeout = caught instanceof TimeoutError;
        console.error(isTimeout ? "caught timeout error" : "caught non-timeout error", caught);
      }
    };

    // kept outside run() so the cleanup handler below can reach it
    let browser;
    const run = async () => {
      browser = await puppeteer.launch();
      const [firstPage] = await browser.pages();
      await probeTimeout(firstPage);
    };

    run()
      .catch((failure) => console.error(failure))
      .finally(() => browser?.close());
    

    or:

    import * as puppeteer from "puppeteer";

    // pull the class off the module namespace and print it to confirm it is defined
    const {TimeoutError} = puppeteer;
    console.log(TimeoutError);
    

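    puppeteer.errors.TimeoutError may have worked in older Puppeteer versions, but I don't see that it exists in the recent v24/v23 versions I tried. Note also that in your code `puppeteer` is the object returned by require('puppeteer-extra'), and the TypeError you quote shows that this object has no `errors` property. If you keep puppeteer-extra for launching, you can still take the class from the puppeteer package itself, e.g. const {TimeoutError} = require("puppeteer"), and test with err instanceof TimeoutError.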