python xml selenium-webdriver beautifulsoup

Parse XML file using selenium and bs4?


I am trying to parse an XML file using the following code:

import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Build the Chrome options used for this session.
options = Options()
# options.add_argument('--headless=new')  
options.add_argument("start-maximized")
options.add_argument('--log-level=3')  
options.add_experimental_option("prefs", {"profile.default_content_setting_values.notifications": 1})    
# NOTE(review): "excludeSwitches" is set twice in a row; presumably the second
# call replaces the first value, leaving only 'enable-logging' — confirm
# against Selenium's Options.add_experimental_option.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled') 
# Start Chrome with a default Service and create a wait helper with a
# 10-second timeout.
srv=Service()
driver = webdriver.Chrome (service=srv, options=options)    
# driver.minimize_window()
waitWD = WebDriverWait (driver, 10)  

# Open the organization page.
wLink = "https://projects.propublica.org/nonprofits/organizations/830370609"
driver.get(wLink) 
# Wait until the first link whose text is exactly "XML" is clickable, then
# click it through JavaScript.
driver.execute_script("arguments[0].click();", waitWD.until(EC.element_to_be_clickable((By.XPATH, '(//a[text()="XML"])[1]'))))  
# Switch to the second window handle (index 1) — presumably the tab opened by
# the click above.
driver.switch_to.window(driver.window_handles[1])    
# Fixed 3-second pause before reading the URL and page source.
time.sleep(3) 
print(driver.current_url)
# Parse the browser's page source with the 'lxml' parser and look up the
# first "PhoneNum" element; this is the lookup that prints None in the run
# shown below.
soup = BeautifulSoup (driver.page_source, 'lxml')   
worker = soup.find("PhoneNum")
print(worker)

But as you can see in the result, I am, for example, not able to find the element "PhoneNum":

(selenium) C:\DEV\Fiverr2025\TRY\austibn>python test.py
https://pp-990-xml.s3.us-east-1.amazonaws.com/202403189349311780_public.xml?response-content-disposition=inline&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIA266MJEJYTM5WAG5Y%2F20250423%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250423T152903Z&X-Amz-Expires=1800&X-Amz-SignedHeaders=host&X-Amz-Signature=9743a63b41a906fac65c397a2bba7208938ca5b865f1e5a33c4f711769c815a4
None

How can I parse the XML file from this site?


Solution

  • Fixes:

    1. Use requests.get() to fetch the XML directly (faster and more reliable than Selenium for raw XML).

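    2. Parse with BeautifulSoup(..., 'xml') (not 'lxml', which is for HTML). The HTML parsers convert tag names to lowercase, so soup.find("PhoneNum") never matches and returns None.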

    3. Close Selenium after getting the URL (since it's no longer needed).

    4. Check if the tag exists before accessing .text.

    soup.find("PhoneNum") returns only the first phone number, so I use find_all() instead to return all matching elements.

    The following code saves the XML data to an XML file. If you don't need it, you can delete this part:

    # Optional: write the raw response bytes to a local file ("wb" = binary
    # write mode) and report the file name.
    with open("propublica_data.xml", "wb") as f:
        f.write(response.content)
    print("XML saved to 'propublica_data.xml'")
    

    You also utilized time.sleep(3), which is generally not recommended for production code. A more robust approach would be to use the line below instead (please note, I have not modified the time.sleep in your original code):

    # Waiting for '//*' is satisfied by the root element of any document, even
    # a blank tab, so it would return immediately. Wait instead until the tab
    # has navigated to the XML file itself.
    waitWD.until(EC.url_contains(".xml"))
    

    The full code with corrections:

    import time
    import requests
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By

    # Chrome is only used to click the "XML" link and read the signed download
    # URL of the tab it opens; the XML itself is fetched with requests.
    options = Options()
    options.add_argument("start-maximized")
    options.add_argument('--log-level=3')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_argument('--disable-blink-features=AutomationControlled')

    srv = Service()
    driver = webdriver.Chrome(service=srv, options=options)
    waitWD = WebDriverWait(driver, 10)

    url = "https://projects.propublica.org/nonprofits/organizations/830370609"

    # try/finally guarantees the browser is closed even if a wait times out
    # or the click fails.
    try:
        driver.get(url)

        xml_button = waitWD.until(EC.element_to_be_clickable((By.XPATH, '(//a[text()="XML"])[1]')))
        driver.execute_script("arguments[0].click();", xml_button)

        # The link opens a new tab; wait for it to exist so that
        # window_handles[1] cannot raise IndexError.
        waitWD.until(EC.number_of_windows_to_be(2))
        driver.switch_to.window(driver.window_handles[1])
        # Kept from the original code: gives the new tab time to reach its
        # final URL before it is read.
        time.sleep(3)
        xml_url = driver.current_url
    finally:
        driver.quit()

    # A timeout prevents the script from hanging forever on a stalled
    # connection.
    response = requests.get(xml_url, timeout=30)
    if response.status_code != 200:
        # SystemExit with a message prints it to stderr and exits with a
        # non-zero status, so callers can detect the failure.
        raise SystemExit(f"Failed to download XML (HTTP {response.status_code})")

    # The 'xml' parser keeps tag names case-sensitive, so "PhoneNum" matches.
    soup = BeautifulSoup(response.content, 'xml')
    phone_numbers = soup.find_all('PhoneNum')

    if phone_numbers:
        print(f"Found {len(phone_numbers)} phone numbers:")
        for idx, phone in enumerate(phone_numbers, start=1):
            print(f"{idx}. {phone.text.strip()}")
    else:
        print("No <PhoneNum> tags found in the XML.")

    # Optional: keep a local copy of the downloaded XML.
    with open("propublica_data.xml", "wb") as f:
        f.write(response.content)
    print("XML saved to 'propublica_data.xml'")
    

    Output:

    Found 4 phone numbers:
    1. 6023146022
    2. 6022687502
    3. 6028812483
    4. 6023146022
    XML saved to 'propublica_data.xml'