The website URL below opens a form in which we only need to select a fiscal year and click Search to get the data for that year, but the search results open at the same URL as the form:
https://cfpub.epa.gov/compliance/criminal_prosecution/index.cfm
I have written the code below, manually hard-coding the XPath of the 2023 option in the fiscal-year dropdown:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Open the search form, pick one fiscal year via a hard-coded XPath, and run the search.
url = "https://cfpub.epa.gov/compliance/criminal_prosecution/index.cfm"
# XPath of a single <option> inside the fiscal-year dropdown; the index is position-based,
# so it must be updated by hand whenever the list of years changes.
total_article_xpath = "//*[@id=\"main-content\"]/div[2]/div[1]/div/div/form/table/tbody/tr[8]/td/div/div[2]/select/option[42]"
button_val = "//*[@id=\"searchButton\"]"

b = webdriver.Chrome()
try:
    b.get(url)
    # Explicit waits return as soon as the element is ready instead of always sleeping.
    wait = WebDriverWait(b, 10)
    element = wait.until(EC.presence_of_element_located((By.XPATH, total_article_xpath)))
    print(element)
    # Locating the <option> does not select it; clicking it does.
    element.click()
    wait.until(EC.element_to_be_clickable((By.XPATH, button_val))).click()
    print(b)
    # NOTE(review): the results are reported to load at the same address as the form,
    # so this value is expected to equal `url` - scrape the page from `b` directly.
    vals = b.current_url
finally:
    # Always shut the browser down, even if a wait times out.
    b.quit()
How can we navigate to and scrape the fiscal-year results page, whose URL is the same as the main page's URL? Any help would be appreciated.
You need to make a selection from the Fiscal Year dropdown and then click the Search button. The results load in the same browser session at the same URL, so you can then proceed to scrape the resulting page as normal.
Here's an example:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver import ChromeOptions
# Search the form for one fiscal year and print every second matching cell of the results.
URL = "https://cfpub.epa.gov/compliance/criminal_prosecution/index.cfm"
# Locator for the fiscal-year <select> on the search form.
SELECT_XPATH = "//*[@id='main-content']/div[2]/div[1]/div/div/form/table/tbody/tr[8]/td/div/div[2]/select"
SEARCH_ID = "searchButton"
# Locator for the table body on the results page.
TBODY_XPATH = "//*[@id='main-content']/div[2]/div[1]/div/div/table/tbody/tr[1]/td[2]/table/tbody"
FISCAL_YEAR = "2000"

options = ChromeOptions()
options.add_argument("--headless")

# Pass options by keyword: the first positional parameter of webdriver.Chrome
# is not `options` in every Selenium release.
with webdriver.Chrome(options=options) as driver:
    driver.get(URL)
    wait = WebDriverWait(driver, 5)
    s = Select(wait.until(EC.presence_of_element_located((By.XPATH, SELECT_XPATH))))
    # Check the dropdown first so an unavailable year produces a message, not an exception.
    if FISCAL_YEAR in {option.text for option in s.options}:
        s.select_by_visible_text(FISCAL_YEAR)
        wait.until(EC.element_to_be_clickable((By.ID, SEARCH_ID))).click()
        # Waiting for the results table confirms the search page has been replaced.
        tbody = wait.until(EC.presence_of_element_located((By.XPATH, TBODY_XPATH)))
        # Every second matching cell is printed; presumably the cells alternate
        # between a name and another column - confirm against the page markup.
        for td in tbody.find_elements(By.CSS_SELECTOR, "td.valign-top")[::2]:
            print(td.text)
    else:
        print(f"{FISCAL_YEAR} is not an available option")
Output:
Allen Sinclair
BP Exploration-Alaska (BPXA)
Ben Shafsky
Doyon Drilling, Inc.
Michael Krupa