I am doing web scraping using selenium
in python with the following code:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions
def get_all_search_details(URL):
SEARCH_RESULTS = {}
options = Options()
options.headless = True
options.add_argument("--remote-debugging-port=9222") #
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-extensions")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(URL)
print(f"Scraping {driver.current_url}")
try:
medias = WebDriverWait(driver,timeout=5,).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'result-row')))
except exceptions.StaleElementReferenceException as e:
print(f">> {type(e).__name__}: {e.args}")
return
except exceptions.NoSuchElementException as e:
print(f">> {type(e).__name__}: {e.args}")
return
except exceptions.TimeoutException as e:
print(f">> {type(e).__name__}: {e.args}")
return
except exceptions.WebDriverException as e:
print(f">> {type(e).__name__}: {e.args}")
return
except exceptions.SessionNotCreatedException as e:
print(f">> {type(e).__name__}: {e.args}")
return
except Exception as e:
print(f">> {type(e).__name__} line {e.__traceback__.tb_lineno} of {__file__}: {e.args}")
return
except:
print(f">> General Exception: {URL}")
return
for media_idx, media_elem in enumerate(medias):
outer_html = media_elem.get_attribute('outerHTML')
result = scrap_newspaper(outer_html) # some external functions
SEARCH_RESULTS[f"result_{media_idx}"] = result
return SEARCH_RESULTS
if __name__ == '__main__':
in_url = "https://digi.kansalliskirjasto.fi/clippings?query=isokyr%C3%B6&categoryId=12&orderBy=RELEVANCE&page=3&resultMode=THUMB"
my_res = get_all_search_details(in_url)
I applied several try except
mentioned in documentation to ensure I would not get trapped in selenium exceptions, however, here is the error I obtained:
Traceback (most recent call last):
File "nationalbiblioteket_logs.py", line 277, in <module>
run()
File "nationalbiblioteket_logs.py", line 264, in run
all_queries(file_=get_query_log(QUERY=args.query),
File "nationalbiblioteket_logs.py", line 219, in all_queries
df = pd.DataFrame( df.apply( check_urls, axis=1, ) )
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/pandas/core/frame.py", line 8740, in apply
return op.apply()
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/pandas/core/apply.py", line 688, in apply
return self.apply_standard()
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/pandas/core/apply.py", line 812, in apply_standard
results, res_index = self.apply_series_generator()
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/pandas/core/apply.py", line 828, in apply_series_generator
results[i] = self.f(v)
File "nationalbiblioteket_logs.py", line 218, in <lambda>
check_urls = lambda INPUT_DF: analyze_(INPUT_DF)
File "nationalbiblioteket_logs.py", line 201, in analyze_
df["search_results"] = get_all_search_details(in_url)
File "/home/xenial/WS_Farid/DARIAH-FI/url_scraping.py", line 68, in get_all_search_details
outer_html = media_elem.get_attribute('outerHTML')
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/selenium/webdriver/remote/webelement.py", line 174, in get_attribute
self, name)
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 494, in execute_script
'args': converted_args})['value']
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 429, in execute
self.error_handler.check_response(response)
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/selenium/webdriver/remote/errorhandler.py", line 243, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: headless chrome=110.0.5481.30)
What am I doing wrong in my python script which causes such exception? I want to return None
and get out of function in case such exception occurs.
Here are some more details regarding libraries I use:
>>> selenium.__version__
'4.5.0'
>>> webdriver_manager.__version__
'3.8.4'
Put the For loop inside of the try block to catch the StaleElementReferenceException and return none, since the Exception occurs at the line
outer_html = media_elem.get_attribute('outerHTML')
.
A StaleElementReferenceException occurs when you try to do something with an element that isn't attached to the page you're currently on because for example the page was refreshed, and due to that the reference to that element is "stale".