First, I'm very new to coding, and it's likely that one or both of these issues has an easy fix, so don't feel like any suggestion is too basic. I've done a fair amount of research, but I think my code either lacks some basic aspect and/or the situation is specific enough that it warrants a specific response. I'm trying to create a program that will iterate through each 'A' player on Baseball-Reference and gather data. Eventually, I'd like the program to iterate through each additional letter page (e.g. B, C, D, etc.). I'm using Visual Studio Code.
I'm encountering two problems repeatedly:

1. After scraping data and navigating back to the previous page (the page with player_links), I usually get a StaleElementReferenceException within two to five players of starting the iteration. I've tried adding exception handling, with varied results; I chose not to include that version here for the sake of simplicity. I believe that, at least part of the time, this error is caused by the second issue.

2. An overlay pops up sporadically on both the player_links page and the individual player_link pages. The code I'm using to close the overlay doesn't seem to work properly: I still run into the same StaleElementReferenceException, and the overlay rarely closes. I have included the ad/overlay HTML after my code below:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import StaleElementReferenceException
options = Options()
options.headless = True
options.add_argument('--window-size=1366,599')
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--log-level=3')
#options.add_argument('--headless')
DRIVER_PATH = r"C:\Users\brand\Desktop\Virtual Code Studio Files\chromedriver.exe"
service = Service(executable_path=DRIVER_PATH)
driver = webdriver.Chrome(service=service, options=options)
actions = ActionChains(driver)
def text(e: WebElement) -> str:
    if r := e.text:
        return r
    return e.get_attribute("textContent") or "n/a"
driver.get('https://www.baseball-reference.com/players/')
def wait(driver):
    return WebDriverWait(driver, 30)
def scrape_player_links(driver, url):
    driver.get(url)
    div = wait(driver).until(EC.presence_of_element_located((By.ID, "div_alphabet")))
    letter_links = wait(div).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a")))
    driver.execute_script("arguments[0].click();", letter_links)
    div2 = wait(driver).until(EC.presence_of_element_located((By.ID, "div_players_")))
    player_links = wait(div2).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[href]")))
    for player_link in player_links:
        driver.execute_script("""
            // Try to hide the overlay by setting its display property to 'none'
            var overlay = document.getElementById('#ad_top');
            if (overlay) {
                overlay.style.display = 'none';
            }
        """)
        player_links = wait(div2).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[href]")))
        wait(player_links).until(EC.element_to_be_clickable(player_link))
        driver.execute_script("arguments[0].click();", player_link)
        driver.execute_script("""
            var overlay = document.getElementById('#ad_top');
            if (overlay) {
                overlay.style.display = 'none';
            }
        """)
        div3 = wait(driver).until(EC.presence_of_element_located((By.ID, "meta")))
        name = wait(div3).until(EC.presence_of_element_located((By.CSS_SELECTOR, "h1 span")))
        location = div3.find_element(By.XPATH, "//p[.//a]").text
        location = location.split(' in ')[-1]
        location = location.replace(' us', '')
        #state = location.split(', ')[-1]
        a_players = {'name': name, 'location': location}
        driver.back()
        print(a_players)
scrape_player_links(driver, 'https://www.baseball-reference.com/players/')
driver.quit()
desktop_norm_ads = [
    '<div id="ad_top" class="SITEID"><div class="ad_copy"><div class="ad_headline">Is SITENAME your happy place? Stathead is for you.</div><div class="ad_subhed"><ul class="ad_bullet"><li>Access the SITENAME database</li><li>Affordable</li><li>Discovery Tools</li><li>Ad-Free Viewing</li></ul></div></div><div id="right_button"><a class="button ad_button" href="https://stathead.com/sport/SPORT/?utm_medium=sr_xsite&utm_source=SITEID&utm_campaign=UTM_CAMPAIGN&utm_content=bttn_desktop_cta_happyplace">Get your first month FREE</a></div></div>',
    '<div id="ad_top" class="SITEID"><div class="ad_copy"><div class="ad_headline">We built Stathead for users like you</div><div class="ad_subhed">Stathead is your all access pass to the SITENAME database. A search engine to easily answer all your statistical questions, plus ad-free viewing.</div></div><div id="right_button"><a class="button ad_button" href="https://stathead.com/sport/SPORT/?utm_medium=sr_xsite&utm_source=SITEID&utm_campaign=UTM_CAMPAIGN&utm_content=bttn_desktop_cta_userslikeyou">Get your first month FREE</a></div></div>',
    '<div id="ad_top" class="SITEID"><div class="ad_copy"><div class="ad_headline">Get Ad-Free viewing with Stathead</div><div class="ad_subhed">and go inside the SITENAME database. Our sports search engine easily answers all your statistical questions.</div></div><div id="right_button"><a class="button ad_button" href="https://stathead.com/sport/SPORT/?utm_medium=sr_xsite&utm_source=SITEID&utm_campaign=UTM_CAMPAIGN&utm_content=bttn_desktop_cta_adfree">Get your first month FREE</a></div></div>',
    '<div id="ad_top" class="SITEID"><div class="ad_copy"><div class="ad_headline">Stathead is your all-access ticket to the SITENAME database</div><div class="ad_subhed">A sports search engine to easily answer all your statistical questions</div></div><div id="right_button"><a class="button ad_button" href="https://stathead.com/sport/SPORT/?utm_medium=sr_xsite&utm_source=SITEID&utm_campaign=UTM_CAMPAIGN&utm_content=bttn_desktop_cta_userslikeyou">Get your first month FREE</a></div></div>',
]
You are facing the stale element exception because you located the elements in a list and then navigated away from the page. When you return to the page, the DOM is rebuilt from scratch, so the element references you collected earlier no longer point at anything valid.
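To make the failure mode concrete, here is a minimal sketch (it assumes the driver setup and imports from your snippet; the selector mirrors the one used in the code below):

links = driver.find_elements(By.CSS_SELECTOR, "div#div_players_ a[href]")
links[0].click()   # the browser leaves the page; those DOM nodes are discarded
driver.back()      # the page is re-rendered from scratch on return...
links[1].click()   # ...so touching any old reference raises StaleElementReferenceException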
In your code you are clicking via a JS script, and a JavaScript click goes through even when something is drawn on top of the target, so the overlay and pop-up ad shouldn't actually be blocking your clicks.
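As a side note, the overlay-hiding JavaScript itself has a bug: document.getElementById() takes a bare id, so getElementById('#ad_top') never matches anything; the '#' prefix belongs to CSS-selector APIs like querySelector. If you ever do need to hide the banner, a corrected version would look like this:

driver.execute_script("""
    // querySelector uses CSS syntax, so the '#' prefix is correct here
    // (equivalently: document.getElementById('ad_top'))
    var overlay = document.querySelector('#ad_top');
    if (overlay) {
        overlay.style.display = 'none';
    }
""")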
I've edited your code a little to make it work. First I collect all the player profile links as plain URL strings, and then navigate to each page directly with driver.get(). This way the overlay doesn't affect your data scraping, and you don't have to worry about stale elements, since a string can't go stale.

Also, you don't have to create a new WebDriverWait every time you need to wait; you can reuse the same wait. Check the following code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.remote.webelement import WebElement

options = Options()
options.add_argument('--window-size=1366,599')
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--log-level=3')
options.add_argument('--headless')  # the flag alone is enough; the Options.headless setter is deprecated in Selenium 4

DRIVER_PATH = r"C:\Users\brand\Desktop\Virtual Code Studio Files\chromedriver.exe"
service = Service(executable_path=DRIVER_PATH)
driver = webdriver.Chrome(service=service, options=options)
actions = ActionChains(driver)

def text(e: WebElement) -> str:
    if r := e.text:
        return r
    return e.get_attribute("textContent") or "n/a"

def wait(driver):
    return WebDriverWait(driver, 30)

def scrape_player_links(driver, url):
    driver.get(url)
    s_wait = wait(driver)  # one reusable wait for the whole function
    # Collect the letter-page URLs as plain strings before navigating anywhere.
    letters = s_wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div#div_alphabet li > a')))
    letter_links = []
    for letter in letters:
        letter_links.append(letter.get_attribute('href'))
    for letter_link in letter_links:
        driver.get(letter_link)
        # Likewise collect the player profile URLs for this letter up front.
        players = s_wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div#div_players_ a[href]')))
        player_links = []
        for player in players:
            player_links.append(player.get_attribute('href'))
        for player_link in player_links:
            driver.get(player_link)
            name = s_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#meta h1 span"))).text
            locations = driver.find_elements(By.XPATH, "//div[@id='meta']/div/p/span[not(@id)][a]")
            if len(locations) > 0:
                location = locations[0].text
                location = location.split(' in ')[-1]
                location = location.replace(' us', '')
            else:
                location = ''
            a_players = {'name': name, 'location': location}
            print(a_players)

scrape_player_links(driver, 'https://www.baseball-reference.com/players/')
driver.quit()
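One optional tweak while you're testing (my suggestion, not required for the fix): since you only care about the 'A' players for now, slice the collected lists inside scrape_player_links so each run stays short, e.g. change the two loop headers to:

for letter_link in letter_links[:1]:    # only the first letter page ('A') while testing
for player_link in player_links[:5]:    # spot-check a few players per page

Remove the slices once everything works and it will crawl every letter page, which matches your eventual goal.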