python selenium-webdriver xpath

How to scrape a results page whose URL is the same as the main page URL in Python?


The website URL below opens a form in which we only need to select a fiscal year and click Search to get the data for that year, but the search results are displayed at the same URL as the form:

https://cfpub.epa.gov/compliance/criminal_prosecution/index.cfm

I have written the code below, manually hard-coding the XPath of the 2023 option:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Asker's attempt: open the search form, locate the hard-coded fiscal-year
# <option>, click Search, then read the current URL.
b = webdriver.Chrome()
# NOTE(review): `url` is not defined anywhere in this snippet; presumably it is
# the EPA criminal-prosecution URL quoted above.
b.get(url)
# Fixed 10-second pause rather than waiting on a page condition.
time.sleep(10)
# Absolute XPath to the 42nd <option> of the <select> in the form's eighth
# table row (per the question text, the 2023 entry).
total_article_xpath = "//*[@id=\"main-content\"]/div[2]/div[1]/div/div/form/table/tbody/tr[8]/td/div/div[2]/select/option[42]"
# Wait up to 10 seconds for that <option> to be present in the DOM.
element = WebDriverWait(b, 10).until(EC.presence_of_element_located((By.XPATH, total_article_xpath)))
time.sleep(10)
# Prints the WebElement object itself, not its text.
print(element)
# The XPath starts with `//`, so this lookup searches from the document root
# rather than relative to `element`. `getdetails` is not used afterwards.
getdetails = element.find_element(By.XPATH, total_article_xpath)
button_val = "//*[@id=\"searchButton\"]"
# NOTE(review): nothing above selects or clicks the located <option>, so the
# search is submitted without this snippet changing the dropdown value.
b.find_element(By.XPATH, button_val).click()
# Prints the driver object, not the page content.
print(b)
# URL of the page after the click; per the question, this is the same as the
# form URL.
vals = b.current_url

How can we navigate to and scrape the fiscal-year results page when its URL is the same as the main page URL? Any help would be appreciated.


Solution

  • You need to make a selection from the Fiscal Year dropdown, then click the Search button. You can then proceed to scrape the resulting page as normal.

    Here's an example:

    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait, Select
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from selenium.webdriver import ChromeOptions
    
    # Search form URL. Per the question, the results are displayed at this
    # same URL after the form is submitted.
    URL = "https://cfpub.epa.gov/compliance/criminal_prosecution/index.cfm"
    # Fiscal Year <select> element inside the search form.
    SELECT_XPATH = "//*[@id='main-content']/div[2]/div[1]/div/div/form/table/tbody/tr[8]/td/div/div[2]/select"
    # id attribute of the Search button.
    SEARCH_ID = "searchButton"
    # <tbody> of the table located after the search has been submitted.
    TBODY_XPATH = "//*[@id='main-content']/div[2]/div[1]/div/div/table/tbody/tr[1]/td[2]/table/tbody"
    # Visible text of the dropdown option to select.
    FISCAL_YEAR = "2000"
    
    options = ChromeOptions()
    options.add_argument("--headless")
    
    # Pass the options by keyword: in Selenium 4 releases before 4.10 the first
    # positional parameter of Chrome() is the driver executable path, not the
    # options object. The keyword form works on every Selenium 4 release.
    # The `with` block quits the browser on exit, including on errors.
    with webdriver.Chrome(options=options) as driver:
        driver.get(URL)
        # Shared explicit wait: each `until` call polls for up to 5 seconds.
        wait = WebDriverWait(driver, 5)
        s = Select(wait.until(EC.presence_of_element_located((By.XPATH, SELECT_XPATH))))
        # Check the requested year against the dropdown's visible option texts
        # so an unavailable year is reported instead of raising.
        if FISCAL_YEAR in {option.text for option in s.options}:
            s.select_by_visible_text(FISCAL_YEAR)
            wait.until(EC.element_to_be_clickable((By.ID, SEARCH_ID))).click()
            # The results replace the form at the same URL, so wait for the
            # results table body to appear rather than for a URL change.
            tbody = wait.until(EC.presence_of_element_located((By.XPATH, TBODY_XPATH)))
            # Keep every other matching cell; presumably the skipped cells hold
            # a second column that is not wanted here (TODO confirm on the page).
            for td in tbody.find_elements(By.CSS_SELECTOR, "td.valign-top")[::2]:
                print(td.text)
        else:
            print(f"{FISCAL_YEAR} is not an available option")
    

    Output:

    Allen Sinclair
    BP Exploration-Alaska (BPXA)
    Ben Shafsky
    Doyon Drilling, Inc.
    Michael Krupa