I'm trying to crawl a webpage that has an `<h3>` tag nested under an `<a>` tag. I can get the `<a>` tag just fine, but when I try to read the `innerText` of the `<h3>` I get an undefined value.
This is the code I'm using to crawl it:
// Puppeteer provides the browser that webScraping launches and drives.
const puppeteer = require('puppeteer');
// Address of the page to scrape; passed to webScraping at the bottom of the script.
const pageURL = "https://producthunt.com";
/**
 * Opens `pageURL` in a Puppeteer-launched browser and collects one record
 * per element matching `main ul li`.
 *
 * Each record holds the text and `href` of the first `<a>` in the list item
 * and the `innerText` of the first `<h3>` in it.
 *
 * Behaviour to be aware of: `querySelector` yields `null` when nothing
 * matches, so a list item without an `<a>` or without an `<h3>` makes the
 * property access throw inside `page.evaluate`. That error is caught below,
 * logged, and the function then resolves to an empty object.
 *
 * @param {string} pageURL - Address of the page to open.
 * @returns {Promise<Object>} `{ amount, publishedNews }` on success, `{}`
 *   when navigation or extraction throws.
 */
const webScraping = async pageURL => {
  // NOTE(review): `arges` looks like a typo for `args` - kept exactly as written.
  const launchOptions = { headless: false, arges: ["--no-sandbox"] };
  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();
  let dataObj = {};

  try {
    await page.goto(pageURL, { waitUntil: 'networkidle2' });

    // The callback is executed by the page, so it only touches the DOM and
    // values it creates itself.
    const publishedNews = await page.evaluate(() => {
      const entries = Array.from(document.querySelectorAll("main ul li"));
      return entries.map(entry => {
        const anchor = entry.querySelector("a");
        const text = anchor.textContent;
        // Read but not included in the record, same as the original code.
        const anchorInnerText = anchor.innerText;
        const url = anchor.getAttribute('href');
        // Throws when the list item has no <h3>.
        const title = entry.querySelector("h3").innerText;
        console.log(title);
        return { title, text, url };
      });
    });

    dataObj = { amount: publishedNews.length, publishedNews };
  } catch (e) {
    console.log(e);
  }

  console.log(dataObj);
  browser.close();
  return dataObj;
};
// Start the scrape; a rejection that escapes webScraping is printed via console.error.
webScraping(pageURL).catch(console.error);
The `console.log` output looks fine, but Puppeteer throws:
Cannot read property 'innerText' of null
Your solution is mostly working, but you're not checking whether the `h3` lookup returned `null`. `querySelector` returns `null` when nothing matches, so any list item without an `<h3>` makes the `.innerText` access throw. Add an `if` check before reading the `innerText` property, or use the code below.
// Puppeteer provides the browser that webScraping launches and drives.
const puppeteer = require('puppeteer');
// Address of the page to scrape; passed to webScraping at the bottom of the script.
const pageURL = "https://producthunt.com";
/**
 * Opens `pageURL` in a Puppeteer-launched browser and collects one record
 * per link found in the elements matching `main ul li`.
 *
 * For every list item that contains an `<a>`, the record holds the link's
 * `textContent`, its `href` attribute and the `innerText` of the `<h3>`
 * nested inside the link (`null` when the link has no `<h3>`). List items
 * without a link are skipped instead of aborting the whole scrape.
 *
 * The browser is always closed before the returned promise settles.
 *
 * @param {string} pageURL - Address of the page to open.
 * @returns {Promise<Object>} `{ amount, list }` on success, or `{}` when
 *   navigation or extraction throws (the error is logged, not rethrown).
 */
const webScraping = async pageURL => {
  const browser = await puppeteer.launch({
    headless: false,
    // The launch option is spelled `args`; under the misspelled key `arges`
    // the flag was never handed to the browser.
    args: ["--no-sandbox"]
  });

  // try/finally so the browser is released even if opening the page fails.
  try {
    const page = await browser.newPage();
    let dataObj = {};

    try {
      await page.goto(pageURL, { waitUntil: 'networkidle2' });

      // The callback is executed by the page, so it only touches the DOM and
      // values it creates itself.
      const publishedNews = await page.evaluate(() => {
        const newsList = [];
        for (const listItem of document.querySelectorAll("main ul li")) {
          const aTag = listItem.querySelector("a");
          // querySelector returns null when nothing matches: a list item
          // without a link has nothing to report, so skip it.
          if (!aTag) continue;

          // Not every link wraps an <h3>; report a null title in that case.
          const heading = aTag.querySelector("h3");
          const title = heading ? heading.innerText : null;
          console.log(title);

          newsList.push({
            title,
            text: aTag.textContent,
            url: aTag.getAttribute('href')
          });
        }
        return newsList;
      });

      // The scraped array is exposed under the `list` key.
      dataObj = {
        amount: publishedNews.length,
        list: publishedNews
      };
    } catch (e) {
      console.log(e);
    }

    console.log({receivedData: dataObj});
    return dataObj;
  } finally {
    // Awaited so callers do not resume while the browser is still shutting down.
    await browser.close();
  }
};
// Start the scrape; a rejection that escapes webScraping is printed via console.error.
webScraping(pageURL).catch(console.error);
Let me know if this fixes your problem!