I'm using Node.js and Puppeteer to scrape some data. However, Puppeteer keeps timing out. My code is as follows:
const puppeteer = require("puppeteer");

(async () => {
  // Launch a locally installed Chrome rather than the bundled Chromium.
  const browser = await puppeteer.launch({
    executablePath: '/usr/bin/google-chrome-stable',
  });
  const page = await browser.newPage();

  const url = "https://erecruit.wsib.on.ca/PSHREXTP/start.html";
  const job_selector = "table#tdgbrHRS_CE_JO_EXT_I\\$0 tbody tr";
  // works fine for the following webpage
  //const url = "https://career17.sapsf.com/career?company=universitc&career_ns=job_listing_summary&navBarLevel=JOB_SEARCH";
  //const job_selector = "tr.jobResultItem";

  await page.goto(url);
  await page.waitForSelector(job_selector);

  // Collect the outer HTML of every matching row inside the page context.
  const jobs = await page.evaluate((selector) => {
    const rows = document.querySelectorAll(selector);
    return Array.from(rows).map((row) => row.outerHTML);
  }, job_selector);

  console.log(JSON.stringify(jobs));
  await browser.close();
})();
I tried using a longer timeout of 60 seconds, but I got the same error.
I tried capturing a screenshot before the waitForSelector
call, and only the top banner is shown, which indicates that the data has not loaded.
As shown in the code, it works fine with another webpage and selector.
What am I missing here?
The issue with your code is that you're trying to fetch the data directly from the top-level HTML document, but this page loads its dynamic content inside an iframe, so the job rows never appear in the main document (which is why waitForSelector times out). You need to fetch the data from the frame, like this:
const puppeteer = require("puppeteer");
const fs = require("fs");

(async () => {
  const browser = await puppeteer.launch({ executablePath: "your_chrome_path" });
  const page = await browser.newPage();

  const url = "https://erecruit.wsib.on.ca/PSHREXTP/start.html";
  const job_selector = "table#tdgbrHRS_CE_JO_EXT_I\\$0 tbody tr";

  try {
    console.log(`Navigating to ${url}...`);
    await page.goto(url, { waitUntil: "domcontentloaded", timeout: 60000 });

    // The PeopleSoft page renders the job list inside an iframe named
    // "TargetContent"; the rows never appear in the top-level document.
    // NOTE: Puppeteer's Frame has no public `id` property, so match on
    // name()/url() only. waitForFrame throws on timeout rather than
    // returning null, so no null-check is needed afterwards.
    const frame = await page.waitForFrame(
      (f) => f.name() === "TargetContent" || f.url().includes("TargetContent"),
      { timeout: 60000 }
    );
    console.log("Frame found, extracting content...");

    // The frame can attach before its dynamic content finishes loading,
    // so wait for the rows inside the frame before evaluating.
    await frame.waitForSelector(job_selector, { timeout: 60000 });

    // Pull date/title/jobNumber/location out of each row; missing cells
    // fall back to "N/A".
    const jobs = await frame.evaluate((selector) => {
      const rows = document.querySelectorAll(selector);
      return Array.from(rows).map((row) => {
        const text = (sel) => {
          const el = row.querySelector(sel);
          return el ? el.textContent.trim() : "N/A";
        };
        return {
          date: text("td span[id^='OPENED']"),
          title: text("td a"),
          jobNumber: text("td span[id^='JOBNUMBER']"),
          location: text("td span[id^='HRS_LOCATION_DESCR']"),
        };
      });
    }, job_selector);

    // fs.writeFileSync is synchronous and takes no callback — the callback
    // in the original code was silently ignored. Any write error is caught
    // by the surrounding try/catch instead.
    fs.writeFileSync("extracted_jobs.json", JSON.stringify(jobs, null, 2));
    console.log("Job details successfully extracted and saved to extracted_jobs.json");
  } catch (error) {
    console.error("Error during page load or content extraction:", error);
  } finally {
    await browser.close();
  }
})();
Try this code, and please upvote the answer if it works for you.