javascript puppeteer imdb

IMDb "load more" button with Puppeteer (Node.js)


I'm trying to parse IMDb movie connections (https://www.imdb.com/title/tt0090887/movieconnections/), but clicking the "more" button doesn't load the additional entries in each category (Featured in, Followed by, ...). Puppeteer's click function doesn't seem to work, because the button triggers some kind of JavaScript function.

enter image description here

const puppeteer = require('puppeteer');

// Question's attempt (the snippet is cut off here; the function is never closed).
// Opens the IMDb connections page in a visible browser window and issues one click.
const scrape = async function () {
    // headless: false launches the browser with a visible window.
    const browser = await puppeteer.launch({ headless: false });

    const page = await browser.newPage();
    await page.goto('https://www.imdb.com/title/tt0090887/movieconnections/');
    // NOTE(review): page.click() targets only the first element matching the
    // selector, and nothing here waits for the new entries to appear afterwards.
    await page.click('.ipc-see-more__button');

The button I want to click is inside a span, and it contains the text "more". I think that when I press the button it runs a JavaScript function to show more content, but not asynchronously:

  <span class="ipc-see-more sc-4d3dda93-0 fMZdeF single-page-see-more-button-followed_by">
          <button class="ipc-btn ipc-btn--single-padding ipc-btn--center-align-content ipc-btn--default-height ipc-btn--core-base ipc-btn--theme-base ipc-btn--on-accent2 ipc-text-button ipc-see-more__button" role="button" tabindex="0" aria-disabled="false">
            <span class="ipc-btn__text">
              <span class="ipc-see-more__text">2 more</span>
            </span>
            <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" class="ipc-icon ipc-icon--expand-more ipc-btn__icon ipc-btn__icon--post" viewBox="0 0 24 24" fill="currentColor" role="presentation">
              <path opacity=".87" fill="none" d="M24 24H0V0h24v24z"></path>
              <path d="M15.88 9.29L12 13.17 8.12 9.29a.996.996 0 1 0-1.41 1.41l4.59 4.59c.39.39 1.02.39 1.41 0l4.59-4.59a.996.996 0 0 0 0-1.41c-.39-.38-1.03-.39-1.42 0z"></path>
            </svg>
          </button>
        </span>

Solution

  • This is a bit of a complex scrape operation, because you need to click each button, then wait until the results arrive. You can do this by monitoring requests, or wait until lengths increase for each section with a 'more' button.

    Here's a quick sketch of how you can do the latter (waiting until the lengths increase). It works but could use some cleanup, which is left as an exercise:

    const puppeteer = require("puppeteer"); // ^22.6.0

    // Scrapes every section of the page at `url`: expands the sections that
    // are collapsed behind a 'more' button, waits for the extra entries to
    // show up, then prints each section's title and items as JSON.
    const url = "<Your URL>";

    // The three page-side passes below (measure, wait, scrape) all index into
    // the same list of sections, so they must share one selector for the
    // recorded [length, index] pairs to stay aligned.
    const sectionSelector = ".ipc-page-grid .ipc-page-section";

    // Declared outside the async function so the trailing finally() can close
    // the browser even when the scrape throws part-way through.
    let browser;
    (async () => {
      browser = await puppeteer.launch();
      const [page] = await browser.pages();
      const ua =
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36";
      await page.setUserAgent(ua);

      // Performance optimization: block unnecessary requests.
      // A request continues only if it is either same-site or one of the
      // allowed resource types, AND is not a google/amazon/beacon URL, AND is
      // not an "xhr" resource; everything else is aborted.
      await page.setRequestInterception(true);
      const allowedResources = ["script", "other", "fetch"];
      page.on("request", req => {
        if (
          (req.url().startsWith("https://www.imdb.com") ||
            allowedResources.includes(req.resourceType())) &&
          !/google|amazon|beacon/.test(req.url()) &&
          req.resourceType() !== "xhr"
        ) {
          req.continue();
        } else {
          req.abort();
        }
      });

      await page.goto(url, {waitUntil: "domcontentloaded"});

      // Retrieve the sections with 'more' buttons as [length, index] pairs,
      // where length is the number of <p> entries before expanding and index
      // is the section's position in the sectionSelector result list.
      const lengths = await page.$$eval(sectionSelector, els =>
        els
          .map((e, i) => [e, i])
          .filter(([e]) => e.querySelector(".ipc-see-more__text"))
          .map(([e, i]) => [e.querySelectorAll("p").length, i])
      );

      // Click all of the 'more' buttons. The click is dispatched in the page
      // on the inner text element rather than through page.click().
      await page.$$eval(".ipc-see-more__text", els =>
        els.forEach(el => el.click())
      );

      // Wait until the lengths of each 'more' section increase. Sections that
      // had no 'more' button have no recorded length and pass immediately.
      await page.waitForFunction(
        (lengths, selector) => {
          const initialByIndex = new Map(
            lengths.map(([length, index]) => [index, length])
          );
          return [...document.querySelectorAll(selector)].every((el, i) => {
            const initial = initialByIndex.get(i);
            return (
              initial === undefined ||
              initial < el.querySelectorAll("p").length
            );
          });
        },
        {},
        lengths,
        sectionSelector
      );

      // Scrape the data. Sections without any <p> entries are dropped.
      const data = await page.$$eval(sectionSelector, els =>
        els
          .map(el => ({
            title: el.querySelector(".ipc-title")?.textContent.trim(),
            items: [...el.querySelectorAll("p")].map(e => ({
              // A paragraph without a link or without any child nodes must
              // not abort the whole scrape; fall back to null / "".
              href: e.querySelector("a")?.href ?? null,
              // The trailing node of each entry holds the parenthesized
              // "(… year)" text, as seen in the sample output.
              year: e.lastChild?.textContent.trim() ?? "",
            })),
          }))
          .filter(e => e.items.length)
      );
      console.log(JSON.stringify(data, null, 2));
    })()
      .catch(err => console.error(err))
      .finally(() => browser?.close());
    

    Partial output:

    [
      {
        "title": "Edited into",
        "items": [
          {
            "href": "https://www.imdb.com/title/tt0101627?ref_=ttcnn",
            "year": "(1991)"
          },
          {
            "href": "https://www.imdb.com/title/tt3233580?ref_=ttcnn",
            "year": "(2013)"
          }
        ]
      },
      {
        "title": "Featured in",
        "items": [
          {
            "href": "https://www.imdb.com/title/tt14701700?ref_=ttcnn",
            "year": "(TV Episode 1986)"
          },
          {
            "href": "https://www.imdb.com/title/tt1577448?ref_=ttcnn",
            "year": "(TV Episode 1986)"
          },
          {
            "href": "https://www.imdb.com/title/tt0093629?ref_=ttcnn",
            "year": "(1987)"
          },
          {
            "href": "https://www.imdb.com/title/tt6079512?ref_=ttcnn",
            "year": "(TV Episode 1989)"
          },
          {
            "href": "https://www.imdb.com/title/tt0116289?ref_=ttcnn",
            "year": "(1996)"
          },
          {
            "href": "https://www.imdb.com/title/tt0834914?ref_=ttcnn",
            "year": "(Video 2006)"
          },
          {
            "href": "https://www.imdb.com/title/tt1748981?ref_=ttcnn",
            "year": "(TV Episode 2010)"
          },
          {
            "href": "https://www.imdb.com/title/tt4213530?ref_=ttcnn",
            "year": "(TV Episode 2014)"
          },
    // ...