I have a Python script that scrapes an AJAX page (https://whatson.bfi.org.uk/Online/default.asp) using Selenium with Chrome. I followed the suggested syntax but still get nothing. My Selenium version is 4.32.0 and my Chrome version is 136.0.7103.114. My code looks like this:
from selenium import webdriver
from selenium.webdriver import ChromeOptions
def scrape_multiple_events_from_page(url):
    """Load *url* in headless Chrome and return the rendered page HTML.

    Waits up to 10 seconds for an element with class ``Highlight`` to be
    present before reading the page source.

    Args:
        url: Address of the page to load.

    Returns:
        The page source, or ``None`` if ``some_condition`` is falsy, the
        wait timed out, or a WebDriver error occurred.
    """
    # NOTE(review): `some_condition` is a placeholder that is not defined in
    # this snippet; the surrounding code has to provide it.
    if not some_condition:
        return None

    options = ChromeOptions()
    options.add_argument("--headless=new")  # Problem here
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument('--window-size=1920,1080')

    driver = None
    html = None
    try:
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        # wait for the preview page to be loaded
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "Highlight"))
        )
        html = driver.page_source
    except TimeoutException:
        print(f"Timed out waiting for content on {url}")
    except WebDriverException as e:
        print(f"Selenium WebDriver error on {url}: {e}")
    finally:
        # `driver` is still None when Chrome failed to start; calling quit()
        # unconditionally would raise and mask the original error.
        if driver is not None:
            driver.quit()
    return html
It works fine when I run it without headless mode. However, when I enable headless mode nothing works and a TimeoutException is thrown instead. I read through similar questions here and both the Chrome Developer and Selenium docs, and tried --headless=new and options.headless = True, but still had no luck. What solutions should I try to make it work? Thanks.
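Add a regular browser user agent to ChromeOptions() to avoid bot detection in headless mode. By default, headless Chrome identifies itself as "HeadlessChrome" in its user-agent string, which a site can use to refuse automated browsers; that is the likely reason the awaited element never appears and the wait times out. See the code below: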
# Send a regular desktop browser user agent to avoid detection.
ua_switch = "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
options.add_argument(ua_switch)
Full working code:
from selenium import webdriver
from selenium.common import TimeoutException, WebDriverException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def scrape_multiple_events_from_page(url):
    """Load *url* in headless Chrome and return the rendered page HTML.

    A desktop browser user agent is set on the Chrome options to avoid
    detection. The function waits up to 10 seconds for an element with
    class ``Highlight`` to be present before reading the page source.

    Args:
        url: Address of the page to load.

    Returns:
        The page source, or ``None`` if the wait timed out or a WebDriver
        error occurred (a message is printed in either case).
    """
    options = ChromeOptions()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument('--window-size=1920,1080')
    # Use a user-agent to avoid detection
    options.add_argument(
        "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )

    driver = None
    html = None
    try:
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        # wait for the preview page to be loaded
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "Highlight"))
        )
        html = driver.page_source
    except TimeoutException:
        print(f"Timed out waiting for content on {url}")
    except WebDriverException as e:
        print(f"Selenium WebDriver error on {url}: {e}")
    finally:
        # `driver` is still None when Chrome failed to start; calling quit()
        # unconditionally would raise and mask the original error.
        if driver is not None:
            driver.quit()
    return html
if __name__ == "__main__":
    # Only launch the browser when run as a script, not when imported.
    scrape_multiple_events_from_page("https://whatson.bfi.org.uk/Online/default.asp")