I am trying to scrape a page product detail page
but I am not able to find the tag when the code runs. I print the parent tag out, and I see the h2
tag I want, and also when I enter the debugger I can get what I want.
import time
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
def playwright_get_soup(url, selector_to_wait_for=None, wait_after_page_load=None):
with sync_playwright() as this_playwright:
browser = this_playwright.chromium.launch()
page = browser.new_page()
page.goto(url)
try:
page.wait_for_load_state("load")
if wait_after_page_load:
time.sleep(wait_after_page_load)
except:
pass
if selector_to_wait_for:
page.wait_for_selector(selector_to_wait_for, timeout=15000)
soup = BeautifulSoup(page.content(), "html.parser")
browser.close()
return soup
def parse_product_detail_page(soup):
parent_block = soup.find("div", class_="primary_block")
name_and_id_box = parent_block.find("div", class_="item-box")
print(name_and_id_box) # the h2 tag is visible here
name_and_id_header = name_and_id_box.find("h2", class_="col-xs-6 ")
# import ipdb; ipdb.set_trace() # the h2 tag is also visible here
id_and_raw_name = name_and_id_header.split("#", maxsplit=1) # this is where the program errors out
def scrape_product_detail_page(product_detail_url):
try:
soup = playwright_url_to_soup(product_detail_url, selector_to_wait_for=".item-box")
except:
return None
parsed_data = parse_product_detail_page(soup)
return parsed_data
result = scrape_product_detail_page("https://www.innovation-line.com/four-color-photoimage-products/ventoux-210d-polyester-drawstring-cinch-pack-backpack-907.html")
I would appreciate some help determining why name_and_id_header
keeps showing up as none. Thank you
There is a whitespace in your BeautifulSoup class selection:
name_and_id_box.find("h2", class_="col-xs-6 ")
should be "col-xs-6":
name_and_id_box.find("h2", class_="col-xs-6").get_text()
or simply, because it is the only <h2>
there:
name_and_id_box.h2.get_text()