web-scraping web-crawler puppeteer domcrawler

Puppeteer element is console.log'able but returns undefined in Puppeteer


I'm trying to crawl a webpage that has an h3 tag under an a tag. I'm getting the a tag just fine, but when I try to get the innerText of the h3 I get an undefined value.

This is how I'm trying to crawl it:

const puppeteer = require('puppeteer');
const pageURL = "https://producthunt.com";

const webScraping = async pageURL => {
    const browser = await puppeteer.launch({
        headless: false,
        args: ["--no-sandbox"]
    });
    const page = await browser.newPage();
    let dataObj = {};

    try {
        await page.goto(pageURL, { waitUntil: 'networkidle2' });

        const publishedNews = await page.evaluate(() => {
            const newsDOM = document.querySelectorAll("main ul li");

            let newsList = [];
            newsDOM.forEach(linkElement => {
                const text = linkElement.querySelector("a").textContent;
                const innerText = linkElement.querySelector("a").innerText;
                const url = linkElement.querySelector("a").getAttribute('href');

                const title = linkElement.querySelector("h3").innerText;
                console.log(title);

                newsList.push({
                    title,
                    text,
                    url
                });
            });
            return newsList;
        });

        dataObj = {
            amount: publishedNews.length,
            publishedNews
        };

    } catch (e) {
        console.log(e);
    }

    console.log(dataObj);
    browser.close();
    return dataObj;
};

webScraping(pageURL).catch(console.error);

The console.log works fine, but Puppeteer throws:

Cannot read property 'innerText' of null
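
For reference, a quick sketch to check in the page's DevTools console whether every matched a tag actually contains an h3 (using the same selector as above):

document.querySelectorAll("main ul li").forEach(linkElement => {
    const aTag = linkElement.querySelector("a");
    // log any list item whose <a> has no nested <h3>
    if (aTag && !aTag.querySelector("h3")) console.log("no h3 in this item:", linkElement);
});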

Solution

  • Your code is mostly working, but you're not checking whether the h3 element is null before reading its innerText. Add an if statement before accessing innerText, or use the code below; a more compact optional-chaining variant is sketched at the end of this answer.

    const puppeteer = require('puppeteer');
    const pageURL = "https://producthunt.com";
    
    const webScraping = async pageURL => {
        const browser = await puppeteer.launch({
            headless: false,
            args: ["--no-sandbox"]
        });
        const page = await browser.newPage();
        let dataObj = {};
    
        try {
            await page.goto(pageURL, { waitUntil: 'networkidle2' });
    
            const publishedNews = await page.evaluate(() => {
                let newsList = [];
                const newsDOM = document.querySelectorAll("main ul li");
    
                newsDOM.forEach(linkElement => {
                    const aTag = linkElement.querySelector("a");
    
                    const text = aTag.textContent;
                    const innerText = aTag.innerText;
                    const url = aTag.getAttribute('href');
    
                    let title = aTag.querySelector("h3");
                    // there may be some <a> without an h3, control
                    // the null pointer exception here, accessing only
                    // if title is not 'null'.
                    if (title) title = title.innerText;
    
                    console.log(title);
    
                    // changed the object structure to add a key for each attr
                    newsList.push({
                        title: title,
                        text: text,
                        url: url
                    });
                });
    
                return newsList;
            });
    
            // changed the object structure to add a key for the array
            dataObj = {
                amount: publishedNews.length,
                list: publishedNews
            };
    
        } catch (e) {
            console.log(e);
        }
    
        console.log({receivedData: dataObj});
        await browser.close();
        return dataObj;
    };
    
    webScraping(pageURL).catch(console.error);
    
    

    Let me know if this fixes your problem!
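
    As a more compact alternative (a sketch, assuming your Node and bundled Chromium versions support optional chaining), the whole page.evaluate block above can be replaced with a null-safe version:

    const publishedNews = await page.evaluate(() =>
        Array.from(document.querySelectorAll("main ul li"), linkElement => {
            const aTag = linkElement.querySelector("a");
            return {
                // ?. short-circuits when the <a> or the <h3> is missing,
                // and ?? turns the resulting undefined into an explicit value
                title: aTag?.querySelector("h3")?.innerText ?? null,
                text: aTag?.textContent ?? "",
                url: aTag?.getAttribute("href") ?? null
            };
        })
    );

    Items without an h3 end up with title: null instead of throwing.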