python selenium-webdriver selenium-chromedriver data-mining

Selenium - Failed to download document


I am currently working on a web scraper, and each time I try to click a certain link button, or to get its href, I get absolutely nothing. However, I must point out that when I visit the website myself, the link I need to click works and the data is accessible, but when I use my web scraper it doesn't. Why?

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
import urllib.request
import os

# Page that is opened by main() and searched for the PDF link.
WEBSITE_URL = 'https://www.i-de.es/conexion-red-electrica/produccion-energia/mapa-capacidad-acceso'
# XPath of the first button clicked by main(); judging by the id, presumably
# the OneTrust cookie-consent "accept" button.
BUTTON_COOKIE_XPATH = '//*[@id="onetrust-accept-btn-handler"]'
# XPath of the second button clicked by main(); presumably the button that
# dismisses a notice ("aviso") modal.
BUTTON_AVISO_XPATH = '//*[@id="MapaCapaciadaModalButton"]/span[1]'
# XPath of the anchor element whose href is read by get_pdf_url().
BUTTON_PDF_XPATH = '//*[@id="portlet_com_liferay_journal_content_web_portlet_JournalContentPortlet_INSTANCE_aVVDHaAKM4S6"]/div/div/div/div/div/p/a'
# Placeholder directory: used as Chrome's default download directory in
# setup_driver() and as the target directory passed to download_pdf().
DOWNLOAD_PATH = '/path/to/download/directory'
# Placeholder for a Chrome user-data directory. NOTE(review): main() calls
# setup_driver() without arguments, so this value is not used in the code shown.
PROFILE_PATH = 'my personal path to my chrome profile'

def setup_driver(profile_path: str = None) -> webdriver.Chrome:
    """Build a Chrome WebDriver with download preferences applied.

    Args:
        profile_path: Optional Chrome user-data directory. When truthy, it is
            passed to Chrome as ``user-data-dir=<profile_path>``.

    Returns:
        A new ``webdriver.Chrome`` instance created with the configured options.
    """
    # Preferences that send downloads to DOWNLOAD_PATH without prompting.
    download_prefs = {
        "download.default_directory": DOWNLOAD_PATH,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    }

    options = Options()
    if profile_path:
        options.add_argument(f"user-data-dir={profile_path}")
    options.add_experimental_option("prefs", download_prefs)

    return webdriver.Chrome(options=options)

def wait_and_click(driver: webdriver.Chrome, by: By, value: str):
    """Click the element located by ``(by, value)`` once it is clickable.

    Args:
        driver: The WebDriver used to locate the element.
        by: Locator strategy (e.g. ``By.XPATH``).
        value: Locator expression matching the element to click.

    The wait lasts at most 10 seconds; any error raised by the wait itself
    propagates to the caller.
    """
    locator = (by, value)
    clickable = EC.element_to_be_clickable(locator)
    WebDriverWait(driver, 10).until(clickable).click()

def get_pdf_url(driver: webdriver.Chrome) -> str:
    """Return the ``href`` of the PDF link on the currently loaded page.

    Args:
        driver: WebDriver whose current page contains the PDF anchor.

    Returns:
        The value of the anchor's ``href`` attribute.

    Raises:
        ValueError: If the anchor is present but its ``href`` is empty or missing.
    """
    locator = (By.XPATH, BUTTON_PDF_XPATH)
    # Presence (not clickability) is enough: only the attribute is read.
    anchor = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(locator)
    )
    href = anchor.get_attribute('href')
    if href:
        return href
    raise ValueError("Failed to retrieve the PDF URL")

def download_pdf(url: str, download_path: str) -> str:
    """Download *url* into *download_path* as ``downloaded_file.pdf``.

    Args:
        url: Address of the PDF to fetch.
        download_path: Directory that receives the file. It is created if it
            does not exist; an empty string means the current directory.

    Returns:
        The path of the saved file.

    Raises:
        FileNotFoundError: If the file is missing after the download call.
        urllib.error.URLError: If the URL cannot be retrieved.
    """
    # os.makedirs('') raises, and an empty path already means "current
    # directory" for os.path.join, so only create non-empty directories.
    if download_path:
        os.makedirs(download_path, exist_ok=True)

    local_pdf_path = os.path.join(download_path, "downloaded_file.pdf")

    # urlretrieve blocks until the transfer has finished, so no sleep is
    # needed before checking the result.
    urllib.request.urlretrieve(url, local_pdf_path)

    if not os.path.isfile(local_pdf_path):
        raise FileNotFoundError("PDF file was not found after downloading")
    return local_pdf_path

def main():
    """Open the site, dismiss its two overlay buttons, and download the PDF.

    The driver is always shut down, even if a step fails.
    """
    browser = setup_driver()

    try:
        browser.get(WEBSITE_URL)
        # Fixed pause after navigation, before the overlay buttons are clicked.
        sleep(10)
        # The overlay buttons are clicked in this order: cookie, then aviso.
        for xpath in (BUTTON_COOKIE_XPATH, BUTTON_AVISO_XPATH):
            wait_and_click(browser, By.XPATH, xpath)
        saved_to = download_pdf(get_pdf_url(browser), DOWNLOAD_PATH)
        print(f"PDF downloaded to: {saved_to}")
    finally:
        browser.quit()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

As you can see, it's not a really big scraper; I only want to get the one file whose link is identified by 'BUTTON_PDF_XPATH'.

So I tried things to fix it, such as using my Chrome profile with the web scraper. This sometimes resulted in the error ERR_HTTP2_PROTOCOL_ERROR, in infinite loading until it timed out, or, in some cases, the website loaded but nothing could be clicked (all the XPaths work, I can assure you).

I also tried to slow down the scraper with some sleep() calls, but that just made me wait for nothing. I even tried to click the link directly, but it just kept making me leave.

Finally, I wanted to try an argument such as options.add_argument('--disable-http2') to address the ERR_HTTP2_PROTOCOL_ERROR, but I don't know how to use it.


Solution

  • You can get the PDF link from the static HTML; there is no need for Selenium:

    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin
    import os
      
    def extract_pdf_link(url):
        """Return the absolute URL of the first PDF link found on *url*.

        Args:
            url: Page to fetch and search for an anchor whose ``href``
                contains ``.pdf/``.

        Returns:
            The link's ``href`` resolved against *url*.

        Raises:
            requests.HTTPError: If the page request returns an error status.
            ValueError: If the page contains no matching link.
        """
        # A timeout keeps the call from hanging indefinitely on a stalled server.
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        link = soup.select_one('a[href*=".pdf/"]')
        # select_one returns None when nothing matches; fail with a clear
        # message instead of an AttributeError on None.
        if link is None:
            raise ValueError(f"No PDF link found on {url}")
        return urljoin(url, link.get('href'))
    
    
    def download_pdf(url, download_path):
        """Fetch *url* and save it as ``downloaded_file.pdf`` in *download_path*.

        Args:
            url: Address of the PDF to download.
            download_path: Directory that receives the file; an empty string
                means the current directory.

        Returns:
            The path of the saved file.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        local_pdf_path = os.path.join(download_path, "downloaded_file.pdf")
        # A timeout keeps the call from hanging indefinitely on a stalled server.
        response = requests.get(url, headers=HEADERS, timeout=30)
        # Without this check an HTTP error page would be written to disk as
        # if it were the PDF.
        response.raise_for_status()

        with open(local_pdf_path, 'wb') as f:
            f.write(response.content)

        return local_pdf_path
    
    
    # Page that is fetched and searched for the PDF link.
    WEBSITE_URL = 'https://www.i-de.es/conexion-red-electrica/produccion-energia/mapa-capacidad-acceso'
    # Target directory for the download; the empty string makes os.path.join
    # return a bare file name, i.e. a path relative to the current directory.
    DOWNLOAD_PATH = ''
    # Browser-like User-Agent sent with both requests.
    HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'}
    
    # Script entry: locate the PDF link on the page, then download the file.
    pdf_url = extract_pdf_link(WEBSITE_URL)
    downloaded_pdf_path = download_pdf(pdf_url, DOWNLOAD_PATH)