Tags: javascript, puppeteer

Puppeteer problems


I have taken over old code from someone who is no longer with the company.

I'm trying to fix his scrapers, since the homepages they were based on have changed their layout dramatically.

Sadly, I still get errors with his code, even after a few changes.

So I changed the IDs it collects from, and the links as well:

Before:

const PATH_VARIATIONS = [
    {
        URL_XPATH_CLASS: 'job', URL_XPATH_ATTRIBUTES: '/header/h2/a/@href', TITLE_XPATH_CLASS: 'job',
        TITLE_XPATH_ATTRIBUTES: '/header/h2/a'
    }
];

After:

const PATH_VARIATIONS = [
    {
        URL_XPATH_CLASS: 'jobs', URL_XPATH_ATTRIBUTES: '/header/h2/a/@href', TITLE_XPATH_CLASS: 'job clicky',
        TITLE_XPATH_ATTRIBUTES: '/header/h2/a'
    }
];

EDIT: I was asked to show how the code runs:

Main.js

I have commented out part of the code, since I'm focusing on one of the crawled pages to start with.

let puppeteer = require('puppeteer');
let jobindexClass = require('./scrapers/jobindex-scraper-1.0.0');
let careerjetClass = require('./scrapers/careerjet-scraper-1.0.0');

async function main() {

    const browser = await puppeteer.launch({
        headless: true,
        defaultViewport: null
    });
    const page = await browser.newPage();
    await page.setExtraHTTPHeaders({ // Handling of correct reading of danish alphabet
        'Accept-Language': 'da-DK,da;q=0.9,en-US;q=0.8,en;q=0.7'
    });

    /*if (process.env.ADVERTS_SCRAPE === undefined || process.env.ADVERTS_SCRAPE === "all" || process.env.ADVERTS_SCRAPE === "jobindex") {

        let scraper = new jobindexClass();
        await run(scraper, browser, page);
        //Print result
        scraper.printDatabaseResult();
    }*/
    if (process.env.ADVERTS_SCRAPE === undefined || process.env.ADVERTS_SCRAPE === "all" || process.env.ADVERTS_SCRAPE === "careerjet") {

        let scraper = new careerjetClass();
        await run(scraper, browser, page);
        //Print result
        scraper.printDatabaseResult();
    }

    // Clean up:
    await browser.close();
}

async function run(scraper, browser, page) {
    await scraper.connectDatabase()
        .catch((error) => {
            console.log("Error at main → connectDatabase(): " + error);
            throw error;
        });

    await scraper.initializeDatabase()
        .catch((error) => {
            console.log("Error at main → initializeDatabase(): " + error);
        });

    //<editor-fold desc="TestArea for interface">
    await scraper.beginScraping(page, browser, 1, 3)
        .catch((error) => {
            console.log("Error at main → beginScraping(): " + error);

        });

    await scraper.disconnectDatabase()
        .catch((error) => {
            console.log("Error at main → disconnectDatabase(): " + error);
            throw error;
        });
}

main().then((result) => {
    console.log("Successful termination: " + result);
}, (error) => {
    console.log("Failed termination: " + error);
});

Careerjet-scraper-1.0.0.js

let ScraperInterface = require('./jobscraper-interface-1.0.0');


const TARGET_WEBSITE = 'https://www.careerjet.dk';
const REGION_NAMES = new Map([
    ['bornholm', '/jobs?s=&l=Bornholm&nw=1&p='],
    ['storkoebenhavn', '/jobs?s=&l=Storkøbenhavn&nw=1&p='],
    ['region-sjaelland', '/jobs?s=&l=Sjælland&nw=1&p='],
    ['region-nordjylland', '/jobs?s=&l=Nordjylland&nw=1&p='],
    ['region-midtjylland', '/jobs?s=&l=Midtjylland&nw=1&p='],
    ['sydjylland', '/jobs?s=&l=Syddanmark&nw=1&p='],
]);

const PATH_VARIATIONS = [
    {
        URL_XPATH_CLASS: 'job clicky', URL_XPATH_ATTRIBUTES: '/header/h2/a/@href', TITLE_XPATH_CLASS: 'job clicky',
        TITLE_XPATH_ATTRIBUTES: '/header/h2/a'
    }
];
const TOTAL_ADVERTS_SELECTOR = '//*[@id="rightcol"]/div[1]/nobr/table/tbody/tr/td/span/nobr';
const TOTAL_ADVERTS_REGEX = /af (.*?) jobs/g;
const PAGE_TIMEOUT = 15000;

/**
 * Class representing the algorithm for careerjet.dk
 * @class
 * @implements {JobscraperTemplate}
 */
class CareerjetScraper extends ScraperInterface {
    constructor() {
        super(TARGET_WEBSITE, REGION_NAMES, PATH_VARIATIONS, TOTAL_ADVERTS_SELECTOR, TOTAL_ADVERTS_REGEX, PAGE_TIMEOUT);
    }

    getPageExtension(pageNo) {
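        // The region URLs end in "&p=" (see REGION_NAMES above) and Careerjet's
        // pages are 1-indexed, so the 0-based page counter is shifted by one here.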
        return `${pageNo + 1}`;
    }

    /**
     * @inheritDoc
     */
    async scrapePage(page, title, url, companyUrl, index, pageNum) {
        let formattedUrl = (TARGET_WEBSITE + url);
        let errorResult = undefined;
        console.time("runTime page number " + pageNum + " annonce " + index);

        try {
            await page.goto(formattedUrl, {
                timeout: this.PAGE_TIMEOUT
            })
                .catch((error) => {
                    throw new Error("page.goto(): " + error);
                });

            // Filter the object and extract body as raw text.
            let bodyHTML = undefined;
            await Promise.race([
                page.evaluate(() => document.body.outerHTML),
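                // NOTE: page.waitFor() below no longer exists in recent Puppeteer
                // releases; on an updated library it would need a plain delay,
                // e.g. new Promise(r => setTimeout(r, ms)).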
                page.waitFor(this.PAGE_TIMEOUT)
            ])
                .then((value) => {
                    if (typeof value === "string") {
                        bodyHTML = value
                    } else {
                        throw new Error("newPage.evaluate() TIMEOUT")
                    }
                })
                .catch((error) => {
                    throw new Error("newPage.evaluate() ERROR: " + error)
                });

            // Insert or update annonce to database:
            await this.insertAnnonce(title, bodyHTML, formattedUrl)
                .catch((error) => {
                    throw new Error("insertAnnonce(" + formattedUrl + "): " + error)
                });

        } catch (error) {
            errorResult = error;
        }

        if (errorResult) {
            this.errorTotalCounter++;
            console.log("Error at scrapePage(" + formattedUrl + ") → " + errorResult);
        }

        console.timeEnd("runTime page number " + pageNum + " annonce " + index);
    }

    /**
     * Extracts the text containing the innerHTML which holds the number of pages in the region.
     *
     * @since       1.0.0
     * @access      private
     *
     * @param page
     * @param listLength
     * @returns {Promise<number>}
     */
    async getNumPages(page, listLength) {
        try {
            // Log the current URL of the page
            console.log("Current URL: " + page.url());

            // Initialize the maximum page number
            let maxPage = 0;
            let currentPageNumber = 1;
            const baseUrl = page.url();

            while (true) {
                // Update the URL to the current page number
                const url = baseUrl + currentPageNumber;
                await page.goto(url, { waitUntil: 'networkidle2' });

                // Collecting num of pages element text
                let pageRefs = await page.$$('ul[data-page="' + currentPageNumber + '"]');
                if (pageRefs.length === 0) {
                    console.log("No elements found using CSS selector 'ul[data-page=\"" + currentPageNumber + "\"]'.");
                    break;
                } else {
                    console.log("Elements found for page number: " + currentPageNumber);
                    // Extracting the data-page attribute value
                    let currentPage = await page.evaluate(element => element.getAttribute('data-page'), pageRefs[0])
                        .catch((error) => {
                            throw new Error("page.evaluate() → " + error);
                        });

                    console.log("Current page: " + currentPage);
                    maxPage = parseInt(currentPage, 10);
                    if (isNaN(maxPage)) {
                        throw new Error("Failed to parse current page number from data-page attribute: " + currentPage);
                    }
                }

                // Make sure to increment the currentPageNumber
                currentPageNumber++;
            }


            /*
            // Further processing if needed
            // For example, you can parse the currentPage to an integer
            let currentPageNumber = parseInt(currentPage, 10);
            if (isNaN(currentPageNumber)) {
                throw new Error("Failed to parse current page number from data-page attribute: " + currentPage);
            }*/

            return maxPage;
        } catch (error) {
            console.log("Error at getNumPages(" + page + ") → " + error);
            console.error("getNumPages() → " + error);
            throw error;
        }
    }
}

module.exports = CareerjetScraper;

jobscraper-interface-1.0.0.js

beginScraping function

/**
     * Entry-point method used by main-module for access to the scraper.
     *
     * @since       1.0.0
     * @access      public
     *
     * @param {Object}              page                    Represents a tab in Chromium browser.
     * @param {Object}              browser                 The Chromium browser object.
     * @param {int}                 pageLimit               Limit on how many pages is queued at a time.
     * @param {int}                 poolLimit               Size of pool for simultaneous running pages.
     *
     * @returns {Promise<void>}
     */
    async beginScraping(page, browser, pageLimit, poolLimit) {
        this.PAGE_LIMIT = pageLimit;
        this.PAGE_POOL = new Pagepool(browser, poolLimit);
        try {
            for (let [key, value] of this.REGION_NAMES) {
                console.log(key.toString());
                currentRegionObject = await ORM.FindRegionID(key.toString());
                currentRegionID = currentRegionObject[0].region_id;

                console.log(`BEGINNING SCRAPING IN REGION: ${key}`);
                const REGION_PAGE_SELECTOR = `${this.TARGET_WEBSITE}${value}`;
                console.log("REGION_PAGE_SELECTOR: " + REGION_PAGE_SELECTOR);


                await page.goto(REGION_PAGE_SELECTOR, {
                    timeout: this.PAGE_TIMEOUT
                })
                    .catch((error) => {
                        throw new Error("Error at beginScraping → page.goto(): " + error);
                    });

                const NUM_PAGES = await this.getNumPages(page, ADVERTS_PER_PAGE);
                console.log(NUM_PAGES + " PAGES");

                for (let pageNumber = 0; pageNumber < NUM_PAGES; pageNumber += this.PAGE_LIMIT) {
                    await this.scrapeRegion(page, browser, REGION_PAGE_SELECTOR, pageNumber, pageNumber
                        + this.PAGE_LIMIT).catch((error) => {
                            console.log("Error at scrapeRegion → " + error);
                        });
                }
            }
        } catch (error) {
            console.log("Error at beginScraping → " + error);
        }
    }

    getPageExtension(pageNo) {
        throw new Error("Missing getPageExtension implementation");
    }

scrapeRegion function

/**
     * Scrapes the region provided by REGION_PAGE_SELECTOR argument.
     *
     * @since       1.0.0
     * @access      private
     *
     * @param {Object}              page                    Page tab created in browser.
     * @param {Object}              browser                 Browser created in main.
     * @param {String}              REGION_PAGE_SELECTOR    Generic XPath to the website handle that contains
     *                                                      all advertisement lists.
     * @param {int}                 fromPage                Current page number.
     * @param {int}                 toPage                  Upper limit for parallel scraper.
     *
     * @returns {Promise<String>}                           a string to indicate if any errors have been thrown.
     */
    async scrapeRegion(page, browser, REGION_PAGE_SELECTOR, fromPage, toPage) {
        return new Promise((resolve, reject) => {
            let resolveCounter = 0, rejectCounter = 0;
            let result = '';

            // Utility method to limit the amount of simultaneous running pages.
            let settlePromise = () => {
                if (resolveCounter + rejectCounter === (toPage - fromPage))
                    if (rejectCounter > 0)
                        reject(result);
                    else
                        resolve();
            };

            for (let index = fromPage; index < toPage; index++) {
                console.log('BEGINNING SCRAPING ON PAGE: ' + (index + 1));
                const PAGE_SELECTOR = REGION_PAGE_SELECTOR.concat(`${this.getPageExtension(index)}`);
                console.log("PAGE_SELECTOR: " + PAGE_SELECTOR);

                this.getCurrentPageURLTitles(page, PAGE_SELECTOR)
                    .then((pageURLsAndTitles) => {
                        this.scrapePageList(browser, pageURLsAndTitles, index)
                            .catch((error) => {
                                rejectCounter++;
                                result += `Error at scrapeRegion → scrapePageList(${page},'${PAGE_SELECTOR}'): ${error.toString()}`;
                                settlePromise();
                            })
                            .then(() => {
                                resolveCounter++;
                                settlePromise();
                            })
                    })
                    .catch((error) => {
                        rejectCounter++;
                        result += `Error at scrapeRegion → getCurrentPageURLTitles(${page},'${PAGE_SELECTOR}'): ${error.toString()}`;
                        settlePromise();
                    });
            }
        });
    }

getCurrentPageURLTitles function

/**
     * Gets a list of title/url pairs.
     *
     * @since       1.0.0
     * @access      private
     *
     * @param {Object}              page                    Current page to extract titles and urls from.
     * @param {String}              PAGE_SELECTOR           Formatted URL to the page containing the advertisement list.
     *
     * @returns {Promise<{PAGE_TITLES: Array, PAGE_URLS: Array}>} - Lists with titles and urls.
     */
    async getCurrentPageURLTitles(page, PAGE_SELECTOR) {
        await page.goto(PAGE_SELECTOR, {
            timeout: this.PAGE_TIMEOUT
        })
            .catch((value) => {
                throw new Error("page.goto() → " + value);
            });

        let counter = 0;
        let titles = [], urls = [], companies = [];

        while (titles.length === 0 && counter < this.PATH_VARIATIONS.length) {
            let currentObject = this.PATH_VARIATIONS[counter];
            let candidateObj;
            if (currentObject.COMPANY_XPATH_CLASS === undefined) {
                candidateObj = await this.tryPathVariationOnPage(page, currentObject.TITLE_XPATH_CLASS,
                    currentObject.TITLE_XPATH_ATTRIBUTES, currentObject.URL_XPATH_CLASS, currentObject.URL_XPATH_ATTRIBUTES);
            } else {
                candidateObj = await this.tryPathVariationOnPage(page, currentObject.TITLE_XPATH_CLASS,
                    currentObject.TITLE_XPATH_ATTRIBUTES, currentObject.URL_XPATH_CLASS, currentObject.URL_XPATH_ATTRIBUTES,
                    currentObject.COMPANY_XPATH_CLASS, currentObject.COMPANY_XPATH_ATTRIBUTES);
                companies = candidateObj.companyUrls;
            }

            titles = candidateObj.titleList;
            urls = candidateObj.urlList;

            counter++;
        }

        if (titles.length === 0) {
            throw new Error("No valid path found!");
        }

        return { PAGE_TITLES: titles, PAGE_URLS: urls, PAGE_COMPANY_URLS: companies };
    }

tryPathVariationOnPage function, which is where my code seems to fail

/**
     * Tries the path variations defined in PATH_VARIATIONS on the current page.
     *
     * @since       1.0.0
     * @access      private
     *
     * @param {Object}              page                    The current page the scraper has reached.
     * @param {String}              titleClass              XPath to the general element in which we are searching.
     * @param {String}              titleAttributes         XPath to the specific children of titleClass XPath.
     * @param {String}              urlClass                XPath to the element where the text representation of url is kept.
     * @param {String}              urlAttributes           XPath to the specific child which keeps the text
     *
     * @returns {Promise<{titleList: Array, urlList: Array}>}
     */
    async tryPathVariationOnPage(page, titleClass, titleAttributes, urlClass, urlAttributes, companyClass, companyAttributes) {
            let titles = [], urls = [], company = [];
            try {
                // Sets the XPath to the elements.
                let xPathTitleStr = `article[class="${titleClass}"]${titleAttributes}`;
                //let xPathTitleStr = `//[@id="result_list_box"]/div/div[2]/div[2]/div/a[2]/b`
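                // This page.$x() call is what throws "page.$x is not a function"
                // on the updated Puppeteer version; see the Solution below.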
                let xpathTitleData = await page.$x(xPathTitleStr)
                    .catch((error) => {
                        throw new Error("page.$x(): " + error);
                    });
                console.log(xpathTitleData);
                console.log("I got through the company title");
                // Extract the title from the selected elements
                for (let element of xpathTitleData) {
                    let title = await page.evaluate(el => el.getAttribute('title'), element);
                    titles.push(title);
                }

                let xpathCompany, xpathCompanyData;

                if (companyClass !== undefined) {
                    xpathCompany = `//li[contains(@class, "${companyClass}")]${companyAttributes}`;
                    xpathCompanyData = await page.$x(xpathCompany)
                        .catch((error) => {
                            throw new Error("page.$x(): " + error)
                        })
                }

                let xPathUrlStr = `//*[contains(@class, "${urlClass}")]${urlAttributes}`;
                //let xPathUrlStr = `//[@id="result_list_box"]/div/div[2]/div[2]/div/a[2]`
                let xpathUrlData = await page.$x(xPathUrlStr)
                    .catch((error) => {
                        throw new Error("page.$x(): " + error);
                    });

                // Runs through all advertisements with XPath on current page.
                for (let i = 0; i < xpathTitleData.length; i++) {
                    // Retrieving elements from specific advertisement.
                    let xpathTitleTextContent = await xpathTitleData[i].getProperty('textContent')
                        .catch((error) => {
                            throw new Error("xpathTitleData.getProperty(): " + error);
                        });
                    let xpathUrlTextContent = await xpathUrlData[i].getProperty('textContent')
                        .catch((error) => {
                            throw new Error("xpathUrlData.getProperty(): " + error);
                        });

                    // Extracting the text values from gathered elements.
                    let titleText = await xpathTitleTextContent.jsonValue()
                        .catch((error) => {
                            throw new Error("xpathTitleTextContent.getProperty(): " + error);
                        });
                    titleText = titleText.trim();
                    let urlText = await xpathUrlTextContent.jsonValue()
                        .catch((error) => {
                            throw new Error("xpathUrlTextContent.getProperty(): " + error);
                        });

                    // If one property is empty, the advertisement is invalid.
                    if (titleText.length !== 0 && urlText.length !== 0) {
                        titles.push(titleText);
                        urls.push(urlText);
                        //company.push("https://www.jobindex.dk" + companyText)
                    }
                }
                // Run through company data for all ads on current page.
                if (xpathCompanyData !== undefined) {
                    for (let i = 0; i < xpathCompanyData.length; i++) {

                        let xpathCompanyTextContent = await xpathCompanyData[i].getProperty('textContent')
                            .catch((error) => {
                                throw new Error("xpathCompanyData.getProperty(): " + error)
                            })
                        let companyText = await xpathCompanyTextContent.jsonValue()
                            .catch((error) => {
                                throw new Error("xpathCompanyTextContent.getProperty(): " + error);
                            })
                        company.push("https://www.jobindex.dk" + companyText)
                    }
                }

                return { titleList: titles, urlList: urls, companyUrls: company };
            } catch (error) {
                console.log("Error at getPageTitlesAndUrls() → " + error)
            }
        }

Specifically this is the part that is failing:

try {
    // Sets the XPath to the elements.
    let xPathTitleStr = `//*[contains(@class, "${titleClass}")]${titleAttributes}`;
    //let xPathTitleStr = `//[@id="result_list_box"]/div/div[2]/div[2]/div/a[2]/b`
    let xpathTitleData = await page.$x(xPathTitleStr)
        .catch((error) => {
            throw new Error("page.$x(): " + error);
        });
    console.log(xpathTitleData);
    console.log("I got through the company title");

The errors I get are:

BEGINNING SCRAPING ON PAGE: 1
PAGE_SELECTOR: https://www.careerjet.dk/jobs?s=&l=Bornholm&nw=1&p=1
Error at getPageTitlesAndUrls() → TypeError: page.$x is not a function
CandidateObj: xpath is undefined undefined
CandidateObj is undefined

Sadly, the department I work for has no programmers and I'm an intern, so I have no one on premises who can actually help.


Solution

  • The answer to the question, which I found after hours of headache-inducing bugfixing, is quite simple.

    When the original programmer wrote the code, page.$x was a standard part of the library. That is no longer the case, and since I updated all the libraries because of critical vulnerabilities, that functionality is now broken.

    I fixed it by rewriting the code to use page.$$eval with an element selector and then mapping the data to what I needed:

    // Define and populate elements
    let titleSelector = `.${titleClass} ${titleAttributes}`;
    console.log('Title Selector:', titleSelector);
    let elements = await page.$$eval(titleSelector, elements => elements
        .map(el => ({
            title: el.textContent.trim(),
            url: el.href
        })));