python web-scraping playwright playwright-python

Playwright Python can't find HTML tag which shows up in debugger and in a print statement


I am trying to scrape a product detail page, but I am not able to find the tag I need when the code runs. When I print the parent tag, I can see the h2 tag I want, and when I enter the debugger I can also get what I want.

import time

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright


def playwright_get_soup(url, selector_to_wait_for=None, wait_after_page_load=None):
    """Load *url* in Chromium via Playwright and return the page as soup.

    Args:
        url: Address of the page to open.
        selector_to_wait_for: Optional selector; when given, block until a
            matching element appears (timeout of 15000 ms).
        wait_after_page_load: Optional number of seconds to sleep once the
            "load" state has been reached.

    Returns:
        A BeautifulSoup tree built from the page content with "html.parser".

    Raises:
        Whatever Playwright raises from navigation, from waiting on
        selector_to_wait_for (for example on timeout), or from reading
        the page content.
    """
    with sync_playwright() as this_playwright:
        browser = this_playwright.chromium.launch()
        try:
            page = browser.new_page()
            page.goto(url)
            try:
                page.wait_for_load_state("load")
                if wait_after_page_load:
                    time.sleep(wait_after_page_load)
            except Exception:
                # Best effort: a failed load-state wait should not abort the
                # scrape. Catching Exception (not a bare except) still lets
                # KeyboardInterrupt and SystemExit propagate.
                pass

            if selector_to_wait_for:
                page.wait_for_selector(selector_to_wait_for, timeout=15000)

            # Build the soup before the browser is closed in `finally`.
            return BeautifulSoup(page.content(), "html.parser")
        finally:
            # Close the browser on every path, including when a wait raises.
            browser.close()


def parse_product_detail_page(soup):
    """Look up the product name/ID header on a product detail page.

    Finds the "primary_block" div, then the "item-box" div inside it, then
    an h2 inside that, and tries to split the header on the first "#".

    Args:
        soup: Parsed page; only its `.find` method is used here
            (presumably the BeautifulSoup tree from the fetch helper).

    NOTE(review): there is no return statement as written, so callers
    receive None.
    """
    parent_block = soup.find("div", class_="primary_block")
    name_and_id_box = parent_block.find("div", class_="item-box")

    print(name_and_id_box) # the h2 tag is visible here

    # NOTE(review): the class string below ends with a space as written;
    # possibly why no h2 is matched here — confirm.
    name_and_id_header = name_and_id_box.find("h2", class_="col-xs-6 ")

    # import ipdb; ipdb.set_trace() # the h2 tag is also visible here

    id_and_raw_name = name_and_id_header.split("#", maxsplit=1) # this is where the program errors out


def scrape_product_detail_page(product_detail_url):
    """Fetch a product detail page and hand it to the parser.

    Args:
        product_detail_url: URL of the product detail page to scrape.

    Returns:
        Whatever parse_product_detail_page returns for the fetched page,
        or None when fetching the page fails.
    """
    try:
        # playwright_get_soup is the fetch helper defined in this file; wait
        # for the ".item-box" element that the parser looks up.
        soup = playwright_get_soup(product_detail_url, selector_to_wait_for=".item-box")
    except Exception:
        # Fetching is best effort: any failure yields None. Exception (not a
        # bare except) still lets KeyboardInterrupt and SystemExit through.
        return None
    return parse_product_detail_page(soup)


# Example run: scrape a single product detail page and keep the parsed result.
result = scrape_product_detail_page("https://www.innovation-line.com/four-color-photoimage-products/ventoux-210d-polyester-drawstring-cinch-pack-backpack-907.html")

I would appreciate some help determining why name_and_id_header keeps showing up as None. Thank you.


Solution

  • There is a trailing space in the class string you pass to BeautifulSoup's find, so the h2 is not matched and find returns None:

    name_and_id_box.find("h2", class_="col-xs-6 ")
    

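    It should be "col-xs-6" (no trailing space). Also call .get_text() on the result, because find returns a Tag rather than a string and .split needs a string: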

    name_and_id_box.find("h2", class_="col-xs-6").get_text()
    

    or simply, because it is the only <h2> there:

    name_and_id_box.h2.get_text()