I have taken over old code from someone who is no longer with the company.
And I'm trying to fix his scrapers, since the homepages they were based on have changed their layout dramatically.
Sadly, I still get errors from his code, even after making a few changes.
So I changed the IDs it collects from, and changed the links as well:
Before:
// Path variations: XPath building blocks used to locate advert URLs and titles.
const PATH_VARIATIONS = [
    {
        URL_XPATH_CLASS: 'job',
        URL_XPATH_ATTRIBUTES: '/header/h2/a/@href',
        TITLE_XPATH_CLASS: 'job',
        TITLE_XPATH_ATTRIBUTES: '/header/h2/a',
    },
];
After
// Updated path variations after the site's layout change.
const PATH_VARIATIONS = [
    {
        URL_XPATH_CLASS: 'jobs',
        URL_XPATH_ATTRIBUTES: '/header/h2/a/@href',
        TITLE_XPATH_CLASS: 'job clicky',
        TITLE_XPATH_ATTRIBUTES: '/header/h2/a',
    },
];
EDIT: I was asked to show how the code runs:
Main.js
I have commented out part of the code, since I'm focusing on one of the crawled pages to start with.
let puppeteer = require('puppeteer');
let jobindexClass = require('./scrapers/jobindex-scraper-1.0.0');
let careerjetClass = require('./scrapers/careerjet-scraper-1.0.0');
/**
 * Entry point: launches a headless Chromium, prepares a tab with a Danish
 * Accept-Language header, runs the enabled scrapers, and always closes the
 * browser again afterwards.
 *
 * Fix: browser.close() is async and was neither awaited nor guaranteed to
 * run when a scraper threw, which leaked Chromium processes.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const browser = await puppeteer.launch({
        headless: true,
        defaultViewport: null
    });
    try {
        const page = await browser.newPage();
        await page.setExtraHTTPHeaders({ // Handling of correct reading of danish alphabet
            'Accept-Language': 'da-DK,da;q=0.9,en-US;q=0.8,en;q=0.7'
        });
        // Jobindex scraper temporarily disabled while debugging careerjet.
        /*if (process.env.ADVERTS_SCRAPE === undefined || process.env.ADVERTS_SCRAPE === "all" || process.env.ADVERTS_SCRAPE === "jobindex") {
            let scraper = new jobindexClass();
            await run(scraper, browser, page);
            //Print result
            scraper.printDatabaseResult();
        }*/
        if (process.env.ADVERTS_SCRAPE === undefined || process.env.ADVERTS_SCRAPE === "all" || process.env.ADVERTS_SCRAPE === "careerjet") {
            let scraper = new careerjetClass();
            await run(scraper, browser, page);
            //Print result
            scraper.printDatabaseResult();
        }
    } finally {
        // Clean up: release the browser even when a scraper throws.
        await browser.close();
    }
}
/**
 * Runs a single scraper end-to-end: connect → initialize → scrape → disconnect.
 * Failures in initializeDatabase/beginScraping are logged and swallowed so the
 * database connection is still closed afterwards; connect/disconnect failures
 * propagate to the caller.
 *
 * @param {Object} scraper Scraper instance implementing the scraper interface.
 * @param {Object} browser The Chromium browser object.
 * @param {Object} page    Tab used for navigation.
 * @returns {Promise<void>}
 */
async function run(scraper, browser, page) {
    try {
        await scraper.connectDatabase();
    } catch (error) {
        console.log("Error at main → connectDatabase(): " + error);
        throw error;
    }
    try {
        await scraper.initializeDatabase();
    } catch (error) {
        console.log("Error at main → initializeDatabase(): " + error);
    }
    //<editor-fold desc="TestArea for interface">
    try {
        await scraper.beginScraping(page, browser, 1, 3);
    } catch (error) {
        console.log("Error at main → beginScraping(): " + error);
    }
    try {
        await scraper.disconnectDatabase();
    } catch (error) {
        console.log("Error at main → disconnectDatabase(): " + error);
        throw error;
    }
}
// Kick off the scrape and report how the program terminated.
(async () => {
    try {
        const result = await main();
        console.log("Successful termination: " + result);
    } catch (error) {
        console.log("Failed termination: " + error);
    }
})();
Careerjet-scraper-1.0.0.js
let ScraperInterface = require('./jobscraper-interface-1.0.0');
// Base address of the site being scraped.
const TARGET_WEBSITE = 'https://www.careerjet.dk';

// Region key → search-path template; the page number is appended after "p=".
const REGION_NAMES = new Map(Object.entries({
    'bornholm': '/jobs?s=&l=Bornholm&nw=1&p=',
    'storkoebenhavn': '/jobs?s=&l=Storkøbenhavn&nw=1&p=',
    'region-sjaelland': '/jobs?s=&l=Sjælland&nw=1&p=',
    'region-nordjylland': '/jobs?s=&l=Nordjylland&nw=1&p=',
    'region-midtjylland': '/jobs?s=&l=Midtjylland&nw=1&p=',
    'sydjylland': '/jobs?s=&l=Syddanmark&nw=1&p=',
}));

// XPath building blocks used to locate advert URLs and titles on a result page.
const PATH_VARIATIONS = [
    {
        URL_XPATH_CLASS: 'job clicky',
        URL_XPATH_ATTRIBUTES: '/header/h2/a/@href',
        TITLE_XPATH_CLASS: 'job clicky',
        TITLE_XPATH_ATTRIBUTES: '/header/h2/a',
    },
];

// XPath to the element whose text contains the total advert count.
const TOTAL_ADVERTS_SELECTOR = '//*[@id="rightcol"]/div[1]/nobr/table/tbody/tr/td/span/nobr';
// NOTE(review): the /g flag makes .test()/.exec() stateful via lastIndex —
// verify the consumer uses String.prototype.match or resets lastIndex.
const TOTAL_ADVERTS_REGEX = /af (.*?) jobs/g;
// Navigation timeout in milliseconds.
const PAGE_TIMEOUT = 15000;
/**
 * Class representing the algorithm for careerjet.dk
 *
 * Fixes in this revision:
 *  - page.waitFor() was removed from newer Puppeteer releases (the library
 *    was recently upgraded); the timeout in scrapePage() now uses a plain
 *    setTimeout promise with identical behaviour.
 *  - getNumPages() walked the pagination without any upper bound; a site
 *    layout change could have turned it into an endless crawl, so the walk
 *    is now capped.
 *
 * @class
 * @implements {JobscraperTemplate}
 */
class CareerjetScraper extends ScraperInterface {
    constructor() {
        super(TARGET_WEBSITE, REGION_NAMES, PATH_VARIATIONS, TOTAL_ADVERTS_SELECTOR, TOTAL_ADVERTS_REGEX, PAGE_TIMEOUT);
    }

    /**
     * Careerjet pages are 1-based while the scraper loop counts from 0.
     *
     * @param {int} pageNo zero-based page index.
     * @returns {String} page number fragment appended to the region URL.
     */
    getPageExtension(pageNo) {
        return `${pageNo + 1}`;
    }

    /**
     * @inheritDoc
     */
    async scrapePage(page, title, url, companyUrl, index, pageNum) {
        let formattedUrl = (TARGET_WEBSITE + url);
        let errorResult = undefined;
        console.time("runTime page number " + pageNum + " annonce " + index);
        try {
            await page.goto(formattedUrl, {
                timeout: this.PAGE_TIMEOUT
            })
                .catch((error) => {
                    throw new Error("page.goto(): " + error);
                });
            // Filter the object and extract body as raw text.
            let bodyHTML = undefined;
            // page.waitFor() no longer exists in newer Puppeteer; a setTimeout
            // promise provides the same timeout arm for the race.
            await Promise.race([
                page.evaluate(() => document.body.outerHTML),
                new Promise((resolve) => setTimeout(resolve, this.PAGE_TIMEOUT))
            ])
                .then((value) => {
                    if (typeof value === "string") {
                        bodyHTML = value;
                    } else {
                        // The timeout arm resolved first (with undefined).
                        throw new Error("newPage.evaluate() TIMEOUT");
                    }
                })
                .catch((error) => {
                    throw new Error("newPage.evaluate() ERROR: " + error);
                });
            // Insert or update annonce to database:
            await this.insertAnnonce(title, bodyHTML, formattedUrl)
                .catch((error) => {
                    throw new Error("insertAnnonce(" + formattedUrl + "): " + error);
                });
        } catch (error) {
            errorResult = error;
        }
        if (errorResult) {
            this.errorTotalCounter++;
            console.log("Error at scrapePage(" + formattedUrl + ") → " + errorResult);
        }
        console.timeEnd("runTime page number " + pageNum + " annonce " + index);
    }

    /**
     * Walks the result pagination (…p=1, p=2, …) until a page without a
     * matching pagination element appears, and returns the highest data-page
     * value seen.
     *
     * @since 1.0.0
     * @access private
     *
     * @param page       Puppeteer page already located on the region URL.
     * @param listLength Unused here; kept for interface compatibility.
     * @returns {Promise<number>} highest page number found (0 when none).
     */
    async getNumPages(page, listLength) {
        // Hard upper bound so a selector change on the site cannot turn this
        // walk into an endless crawl.
        const MAX_PAGES_TO_PROBE = 500;
        try {
            console.log("Current URL: " + page.url());
            let maxPage = 0;
            let currentPageNumber = 1;
            const baseUrl = page.url();
            while (currentPageNumber <= MAX_PAGES_TO_PROBE) {
                // Navigate to "<region url><page number>".
                const url = baseUrl + currentPageNumber;
                await page.goto(url, { waitUntil: 'networkidle2' });
                // The pagination widget carries the page number in data-page.
                let pageRefs = await page.$$('ul[data-page="' + currentPageNumber + '"]');
                if (pageRefs.length === 0) {
                    console.log("No elements found using CSS selector 'ul[data-page=\"" + currentPageNumber + "\"]'.");
                    break;
                }
                console.log("Elements found for page number: " + currentPageNumber);
                // Extracting the data-page attribute value.
                let currentPage = await page.evaluate(element => element.getAttribute('data-page'), pageRefs[0])
                    .catch((error) => {
                        throw new Error("page.evaluate() → " + error);
                    });
                console.log("Current page: " + currentPage);
                maxPage = parseInt(currentPage, 10);
                if (isNaN(maxPage)) {
                    throw new Error("Failed to parse current page number from data-page attribute: " + currentPage);
                }
                currentPageNumber++;
            }
            return maxPage;
        } catch (error) {
            console.log("Error at getNumPages(" + page + ") → " + error);
            throw error;
        }
    }
}

module.exports = CareerjetScraper;
jobscraper-interface-1.0.0.js
BeginScraping function
/**
 * Entry-point method used by main-module for access to the scraper.
 *
 * Iterates over every configured region, resolves its database region id,
 * navigates to the region's landing page, and scrapes it in chunks of
 * pageLimit pages. Errors are logged and swallowed here so one failing
 * region does not abort the whole run.
 *
 * @since 1.0.0
 * @access public
 *
 * @param {Object} page Represents a tab in Chromium browser.
 * @param {Object} browser The Chromium browser object.
 * @param {int} pageLimit Limit on how many pages is queued at a time.
 * @param {int} poolLimit Size of pool for simultaneous running pages.
 *
 * @returns {Promise<void>}
 */
async beginScraping(page, browser, pageLimit, poolLimit) {
this.PAGE_LIMIT = pageLimit;
// Pagepool is defined elsewhere in this file/module (not visible here).
this.PAGE_POOL = new Pagepool(browser, poolLimit);
try {
for (let [key, value] of this.REGION_NAMES) {
console.log(key.toString());
// NOTE(review): currentRegionObject/currentRegionID are assigned without
// let/const. If they are not declared at module level elsewhere in this
// file they become implicit globals — verify. Other methods may read
// currentRegionID, so do not blindly make them local to this method.
currentRegionObject = await ORM.FindRegionID(key.toString());
currentRegionID = currentRegionObject[0].region_id;
console.log(`BEGINNING SCRAPING IN REGION: ${key}`);
// Region landing URL; page numbers are appended later via getPageExtension().
const REGION_PAGE_SELECTOR = `${this.TARGET_WEBSITE}${value}`;
console.log("REGION_PAGE_SELECTOR: " + REGION_PAGE_SELECTOR);
await page.goto(REGION_PAGE_SELECTOR, {
timeout: this.PAGE_TIMEOUT
})
.catch((error) => {
throw new Error("Error at beginScraping → page.goto(): " + error);
});
// ADVERTS_PER_PAGE comes from module scope (not visible in this excerpt).
const NUM_PAGES = await this.getNumPages(page, ADVERTS_PER_PAGE);
console.log(NUM_PAGES + " PAGES");
// Scrape the region in windows of PAGE_LIMIT pages at a time.
for (let pageNumber = 0; pageNumber < NUM_PAGES; pageNumber += this.PAGE_LIMIT) {
await this.scrapeRegion(page, browser, REGION_PAGE_SELECTOR, pageNumber, pageNumber
+ this.PAGE_LIMIT).catch((error) => {
console.log("Error at scrapeRegion → " + error);
});
}
}
} catch (error) {
console.log("Error at beginScraping → " + error);
}
}
getPageExtension(pageNo) {
throw new Error("Missing getPageExtension implementation");
}
Scraperegion
/**
* Scrapes the region provided by REGION_PAGE_SELECTOR argument.
*
* @since 1.0.0
* @access private
*
* @param {Object} page Page tab created in browser.
* @param {Object} browser Browser created in main.
* @param {String} REGION_PAGE_SELECTOR Generic XPath to website handle that contains all.
* Advertisement lists.
* @param {int} fromPage Current page number.
* @param {int} toPage Upper limit for parallel scraper.
*
* @returns {Promise<String>} a string to indicate if any errors have been thrown.
*/
async scrapeRegion(page, browser, REGION_PAGE_SELECTOR, fromPage, toPage) {
return new Promise((resolve, reject) => {
let resolveCounter = 0, rejectCounter = 0;
let result = '';
// Utility method to limit the amount of simultaneous running pages.
let settlePromise = () => {
if (resolveCounter + rejectCounter === (toPage - fromPage))
if (rejectCounter > 0)
reject(result);
else
resolve();
};
for (let index = fromPage; index < toPage; index++) {
console.log('BEGINNING SCRAPING ON PAGE: ' + (index + 1));
const PAGE_SELECTOR = REGION_PAGE_SELECTOR.concat(`${this.getPageExtension(index)}`);
console.log("PAGE_SELECTOR: " + PAGE_SELECTOR);
this.getCurrentPageURLTitles(page, PAGE_SELECTOR)
.then((pageURLsAndTitles) => {
this.scrapePageList(browser, pageURLsAndTitles, index)
.catch((error) => {
rejectCounter++;
result += `Error at scrapeRegion → scrapePageList(${page},'${PAGE_SELECTOR}'): ${error.toString()}`;
settlePromise();
})
.then(() => {
resolveCounter++;
settlePromise();
})
})
.catch((error) => {
rejectCounter++;
result += `Error at scrapeRegion → getCurrentPageURLTitles(${page},'${PAGE_SELECTOR}'): ${error.toString()}`;
settlePromise();
});
}
});
}
getcurrentpageurltitles
/**
* Gets a list of title/url pairs.
*
* @since 1.0.0
* @access private
*
* @param {Object} page Current page to extract titles and urls from.
* @param {String} PAGE_SELECTOR Formatted url to the page conataining the advertisement list.
*
* @returns {Promise<{PAGE_TITLES: Array, PAGE_URLS: Array}>} - Lists with titles and urls.
*/
async getCurrentPageURLTitles(page, PAGE_SELECTOR) {
await page.goto(PAGE_SELECTOR, {
timeout: this.PAGE_TIMEOUT
})
.catch((value) => {
throw new Error("page.goto() → " + value);
});
let counter = 0;
let titles = [], urls = [], companies = [];
while (titles.length === 0 && counter < this.PATH_VARIATIONS.length) {
let currentObject = this.PATH_VARIATIONS[counter];
let candidateObj;
if(currentObject.COMPANY_XPATH_CLASS === undefined){
candidateObj = await this.tryPathVariationOnPage(page,currentObject.TITLE_XPATH_CLASS,currentObject.TITLE_XPATH_ATTRIBUTES,currentObject.URL_XPATH_CLASS,currentObject.URL_XPATH_ATTRIBUTES);
} else {
candidateObj = await this.tryPathVariationOnPage(page, currentObject.TITLE_XPATH_CLASS,
currentObject.TITLE_XPATH_ATTRIBUTES, currentObject.URL_XPATH_CLASS, currentObject.URL_XPATH_ATTRIBUTES, currentObject.COMPANY_XPATH_CLASS, currentObject.COMPANY_XPATH_ATTRIBUTES);
companies = candidateObj.companyUrls
}
titles = candidateObj.titleList;
urls = candidateObj.urlList;
counter++;
}
if (titles.length === 0) {
throw new Error("No valid path found!");
}
return { PAGE_TITLES: titles, PAGE_URLS: urls, PAGE_COMPANY_URLS: companies };
}
tryPathVariationOnPage, which is where my code seems to fail:
/**
* Tries the path variations defined in PATH_VARIATIONS on the current page.
*
* @since 1.0.0
* @access private
*
* @param {Object} page The current page the scraper has reached.
* @param {String} titleClass XPath to the general element in which we are searching.
* @param {String} titleAttributes XPath to the specific children of titleClass XPath.
* @param {String} urlClass XPath to the element where the text representation of url is kept.
* @param {String} urlAttributes XPath to the specific child which keeps the text
*
* @returns {Promise<{titleList: Array, urlList: Array}>}
*/
async tryPathVariationOnPage(page, titleClass, titleAttributes, urlClass, urlAttributes, companyClass, companyAttributes) {
let titles = [], urls = [], company = [];
try {
// Sets the XPath to the elements.
let xPathTitleStr = `article[class="${titleClass}"]${titleAttributes}`;
//let xPathTitleStr = `//[@id="result_list_box"]/div/div[2]/div[2]/div/a[2]/b`
let xpathTitleData = await page.$x(xPathTitleStr)
.catch((error) => {
throw new Error("page.$x(): " + error);
});
console.log(xpathTitleData);
console - log("I got through the company title");
// Extract the title from the selected elements
for (let element of xpathTitleData) {
let title = await page.evaluate(el => el.getAttribute('title'), element);
titles.push(title);
}
let xpathCompany, xpathCompanyData;
if (companyClass !== undefined) {
xpathCompany = `//li[contains(@class, "${companyClass}")]${companyAttributes}`;
xpathCompanyData = await page.$x(xpathCompany)
.catch((error) => {
throw new Error("page.$x(): " + error)
})
}
let xPathUrlStr = `//*[contains(@class, "${urlClass}")]${urlAttributes}`;
//let xPathUrlStr = `//[@id="result_list_box"]/div/div[2]/div[2]/div/a[2]`
let xpathUrlData = await page.$x(xPathUrlStr)
.catch((error) => {
throw new Error("page.$x(): " + error);
});
// Runs through all advertisements with XPath on current page.
for (let i = 0; i < xpathTitleData.length; i++) {
// Retrieving elements from specific advertisement.
let xpathTitleTextContent = await xpathTitleData[i].getProperty('textContent')
.catch((error) => {
throw new Error("xpathTitleData.getProperty(): " + error);
});
let xpathUrlTextContent = await xpathUrlData[i].getProperty('textContent')
.catch((error) => {
throw new Error("xpathUrlData.getProperty(): " + error);
});
// Extracting the text values from gathered elements.
let titleText = await xpathTitleTextContent.jsonValue()
.catch((error) => {
throw new Error("xpathTitleTextContent.getProperty(): " + error);
});
titleText = titleText.trim();
let urlText = await xpathUrlTextContent.jsonValue()
.catch((error) => {
throw new Error("xpathUrlTextContent.getProperty(): " + error);
});
// If one property is empty, the advertisement is invalid.
if (titleText.length !== 0 && urlText !== 0) {
titles.push(titleText);
urls.push(urlText);
//company.push("https://www.jobindex.dk" + companyText)
}
}
// Run through company data for all ads on current page.
if (xpathCompanyData !== undefined) {
for (let i = 0; i < xpathCompanyData.length; i++) {
let xpathCompanyTextContent = await xpathCompanyData[i].getProperty('textContent')
.catch((error) => {
throw new Error("xpathCompanyData.getProperty(): " + error)
})
let companyText = await xpathCompanyTextContent.jsonValue()
.catch((error) => {
throw new Error("xpathCompanyTextContent.getProperty(): " + error);
})
company.push("https://www.jobindex.dk" + companyText)
}
}
return { titleList: titles, urlList: urls, companyUrls: company };
} catch (error) {
console.log("Error at getPageTitlesAndUrls() → " + error)
}
}
Specifically this is the part that is failing:
// NOTE(review): excerpt quoted from tryPathVariationOnPage — incomplete
// here (the try block's matching catch is not shown).
try {
// Sets the XPath to the elements.
let xPathTitleStr = `//*[contains(@class, "${titleClass}")]${titleAttributes}`;
//let xPathTitleStr = `//[@id="result_list_box"]/div/div[2]/div[2]/div/a[2]/b`
// page.$x() was removed from newer Puppeteer releases — this is the source
// of the reported "TypeError: page.$x is not a function".
let xpathTitleData = await page.$x(xPathTitleStr)
.catch((error) => {
throw new Error("page.$x(): " + error);
});
console.log(xpathTitleData);
// FIXME: `console-log` parses as the subtraction `console - log` and throws
// a ReferenceError at runtime; it should be console.log(...).
console-log("I got through the company title");
The errors I get are:
BEGINNING SCRAPING ON PAGE: 1 PAGE_SELECTOR: https://www.careerjet.dk/jobs?s=&l=Bornholm&nw=1&p=1 Error at getPageTitlesAndUrls() → TypeError: page.$x is not a function CandidateObj: xpath is undefined undefined CandidateObj is undefined
Sadly, the department I work for has no programmers, and I'm an intern, so I have no one on site who can actually help.
So here is the answer to the question, which I found out after hours of painful, headache-inducing bug fixing.
It is quite simple: when the original programmer wrote the code, page.$x was a standard part of the library. That is no longer the case, and since I updated all the libraries due to critical vulnerabilities, that functionality is now broken.
I fixed it by rewriting the code to use page.$$eval with an element selector, and then mapping the data to what I needed:
// Define and populate elements
// NOTE(review): titleClass may contain several classes ("job clicky");
// `.${titleClass}` then yields ".job clicky" — a descendant selector, not an
// AND-match (that would be ".job.clicky"). Also, titleAttributes must now be
// a CSS selector here, not the old XPath fragment ("/header/h2/a") — verify
// both inputs before reusing this snippet.
let titleSelector = `.${titleClass} ${titleAttributes}`;
console.log('Title Selector:', titleSelector);
// $$eval runs in the browser context and returns plain serialisable data.
let elements = await page.$$eval(titleSelector, elements => elements
.map(el => ({
title: el.textContent.trim(),
url: el.href
})));