python selenium-webdriver

Extract data from a webpage using Selenium with Python


from selenium.webdriver import ChromeOptions
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Listing URL for CBP media releases; the date-range and body filters in the
# query string are left empty.
URL = "https://www.cbp.gov/newsroom/media-releases/all?field_date_release_value%5Bmin%5D=&field_date_release_value%5Bmax%5D=&field_newsroom_type_target_id_1=54&body_value="
# Absolute XPath to the <select> that is wrapped in `Select` below to pick the year.
SELECT_XPATH = "/html/body/div[1]/main/div/div[2]/div/div/div[4]/article/div/div[4]/div/div/div/div/div[1]/form/div/div[2]/div[1]/div[2]/div/select"
# NOTE(review): SEARCH_ID is never referenced below; the click wait further down
# hard-codes a different ID string instead.
SEARCH_ID = "edit-submit-newsroom--5CcqMDJL-LM"

driver = webdriver.Chrome()
fiscal_yr = '2023'
driver.get(URL)
# NOTE(review): `time` is not among the imports shown with this snippet; unless it
# is imported elsewhere this raises NameError.
time.sleep(5)
wait = WebDriverWait(driver, 20)
# Block (up to 20 s) until the <select> is present in the DOM, then wrap it.
s = Select(wait.until(EC.presence_of_element_located((By.XPATH, SELECT_XPATH))))
fiscal_yr = str(fiscal_yr)
# Only proceed when the requested year is one of the option labels.
if fiscal_yr in {option.text for option in s.options}:
    print(fiscal_yr)
    # NOTE(review): `logging` is likewise not among the imports shown with this snippet.
    logging.info("fiscal_year: {}".format(fiscal_yr))
    s.select_by_visible_text(fiscal_yr)
    # This is the line reported as failing. The ID used here differs from SEARCH_ID
    # above; the suffix looks auto-generated and presumably changes between page
    # loads -- verify against the live page.
    search_element = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "edit-submit-newsroom--uZGWwDKmBW8")))
    time.sleep(10)
    search_element.click()
    # `global` at module scope has no effect; page1 is already a module-level name.
    global page1
    page_xpath = "/html/body/div[1]/main/div/div[2]/div/div/div[4]/article/div/div[4]/div/div/div/div/div[2]"
    # find_elements returns a list, so an empty (falsy) result skips the block
    # instead of raising.
    if driver.find_elements(By.XPATH, page_xpath):
        page_nums = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, page_xpath)))
        # NOTE(review): page_xpath is absolute (starts at /html), so this nested lookup
        # presumably resolves from the document root rather than relative to page_nums.
        page1 = page_nums.find_element(By.XPATH, page_xpath).text
        print(page1)

I am getting an error on this line: search_element = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "edit-submit-newsroom--uZGWwDKmBW8")))

Any help getting the data from the above URL after selecting the year would be appreciated.


Solution

  • search_element = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "edit-submit-newsroom--uZGWwDKmBW8")))
    

    The ID locator value in the line above is incorrect: there is no element with the ID edit-submit-newsroom--uZGWwDKmBW8.

    Change the code as follows:

    search_element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "edit-submit-newsroom")))
    

    Refer to the refactored code below, which makes these changes:

    1. Used explicit waits throughout and removed the time.sleep() calls
    2. Replaced the long, brittle absolute XPaths with short, readable relative locators

    Code:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import Select
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    # Listing URL for CBP media releases; the date-range and body filters in the
    # query string are left empty.
    URL = "https://www.cbp.gov/newsroom/media-releases/all?field_date_release_value%5Bmin%5D=&field_date_release_value%5Bmax%5D=&field_newsroom_type_target_id_1=54&body_value="
    # Relative XPath keyed on the element id instead of a position-dependent absolute path.
    SELECT_XPATH = "//select[@id='edit-field-date-release-value']"

    driver = webdriver.Chrome()
    # try/finally guarantees the browser and chromedriver process are shut down
    # even when one of the explicit waits below times out.
    try:
        fiscal_yr = '2023'
        driver.get(URL)
        driver.maximize_window()
        # One shared explicit wait (10 s timeout) replaces every fixed sleep.
        wait = WebDriverWait(driver, 10)
        s = Select(wait.until(EC.presence_of_element_located((By.XPATH, SELECT_XPATH))))
        # Option labels are strings, so normalise in case an int year is supplied.
        fiscal_yr = str(fiscal_yr)
        # Only proceed when the requested year is one of the option labels.
        if fiscal_yr in {option.text for option in s.options}:
            print(fiscal_yr)
            s.select_by_visible_text(fiscal_yr)
            wait.until(EC.element_to_be_clickable((By.ID, "edit-submit-newsroom"))).click()
            page_className = "view-content"
            # wait.until raises TimeoutException on failure, so no truthiness check
            # on its result is needed before reading the content.
            wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME, page_className)))
            page1 = driver.find_element(By.CLASS_NAME, page_className).text
            print(page1)
    finally:
        driver.quit()
    

    Result:

    2023
    Dec 26
    2023
    CBP officers seize 1,018 pounds of methamphetamine, 165 pounds of cocaine valued at $10.2 million at World Trade Bridge | Local Media Release
    LAREDO, Texas—U.S. Customs and Border Protection, Office of Field Operations officers assigned to the World Trade Bridge seized hard narcotics that totaled…
    Dec 22
    2023
    CBP Dallas Discovers Harmful Stinkbug | Local Media Release
    CBP Dallas discovers rare stinkbug harmful to plants
    DALLAS – CBP agriculture specialists at Dallas/Fort Worth International Airport intercepted…
    Dec 21
    2023
    CBP in Partnership with Ontario International Airport Announces the Opening of New Global Entry Enrollment Center | Local Media Release
    LOS ANGELES— U.S. Customs and Border Protection (CBP) in partnership with Ontario International Airport (ONT) announced the grand opening of the Ontario…
    Dec 20
    2023
    U.S. Border Patrol, San Diego Sector warns migrants; cold weather is coming | Local Media Release
    SAN DIEGO — The San Diego Sector Border Patrol is cautioning migrants of the dangers posed by inclement weather that will soon impact San Diego County…
    Dec 20
    2023
    CBP Officers Intercept 110 Pounds of Ketamine in U.K. Man’s Baggage at Detroit Metro Airport | Local Media Release
    ROMULUS, Mich. – U.S. Customs and Border Protection’s (CBP) Office of Field Operations intercepted 110 pounds of ketamine in a traveler’s baggage at Detroit…
    Dec 19
    2023
    Air and Marine Operations Southeast Region crews apprehend 1,086 migrants in December | Local Media Release
    MIAMI— U.S. Customs and Border Protection Air and Marine Operations Southeast Region crews worked with U.S. Coast Guard, Homeland Security Task Force-…
    Dec 18
    2023
    Cincinnati CBP Seizes $6.9 Million in Counterfeit High-end Jewelry | Local Media Release
    CINCINNATI—From December 1-7, U.S. Customs and Border Protection (CBP) officers in Cincinnati were vigilant intercepting 11 shipments containing counterfeit…
    Dec 18
    2023
    CBP JFK Global Entry Enrollment Center In-Person Appointment Availability | Local Media Release
    JAMAICA, N.Y. — U. S. Customs and Border Protection at John F. Kennedy International Airport’s Global Entry Enrollment Center has just…
    Dec 18
    2023
    CBP Arrests Romanian Woman at the Lewiston Bridge for Felony Warrants of Credit Card Fraud | Local Media Release
    LEWISTON, N.Y. – U.S. Customs and Border Protection (CBP) officers at the Port of Buffalo, Lewiston Bridge border-crossing, arrested a 29-year-old female…
    Dec 18
    2023
    CBP officers seize fentanyl and methamphetamine at the Ysleta port of entry | Local Media Release
    EL PASO, Texas - U.S. Customs and Border Protection officers working at the Ysleta port of entry intercepted a combined 123 pounds of…
    
    Process finished with exit code 0
    

    UPDATE: If you want to retrieve all the hrefs from the page, use the code below in place of the two commented-out lines:

    # page1 = driver.find_element(By.CLASS_NAME, page_className).text
    # print(page1)
    # Gather every anchor nested under the results container, then print each
    # anchor's href attribute, one per line.
    result_links = driver.find_elements(By.XPATH, "//div[@class='view-content']//a")
    for url in (link.get_attribute("href") for link in result_links):
        print(url)