python, selenium-webdriver, web-scraping, pdf, beautifulsoup

How to reliably download 1969 “Gazzetta Ufficiale” PDFs (Italian Official Gazette) with Python?


I’m trying to programmatically download the full “pubblicazione completa non certificata” (complete, non-certified publication) PDFs of the Italian Gazzetta Ufficiale – Serie Generale for 1969 (for an academic article). The site offers a 1946–1985 “Formato grafico PDF” (graphic PDF format) search and an archive index page.

What I’ve tried

  1. Selenium: navigate to the year picker and click 1969. On my machine it often times out: the year link/input is either not present, hidden in an iframe, or overlaid by a banner. I tried switching frames and even injecting the year via JS, but it’s brittle and unreliable.

  2. Requests + BeautifulSoup on the “year grid” page: in some HTML copies (from another session) I can see direct links like <a class="download_pdf" href="/do/gazzetta/downloadPdf?...">Download PDF</a>, but in my live session those anchors are not there, so scraping returns 0 links.

  3. Manually building the download URL from the archive list (date & issue number), e.g. /do/gazzetta/downloadPdf?dataPubblicazioneGazzetta=19690102&numeroGazzetta=1&tipoSerie=SG&tipoSupplemento=GU&numeroSupplemento=0&progressivo=0&edizione=0&estensione=pdf. This returns HTML, not a PDF. When saved with a “.pdf” extension, Acrobat says the file is damaged; the file actually contains an HTML message like “Il pdf selezionato non è stato trovato” (“The selected PDF was not found”). A sketch of this URL construction, plus a check that catches these HTML error pages, follows this list.

  4. Requests on each detail page: fetch the detail URL and look for either a.download_pdf or anchors whose text contains “Scarica il PDF” (“Download the PDF”) or “pubblicazione completa non certificata”. For 1969 I consistently find no such link on the page, so I can’t discover a valid downloadPdf URL at runtime.
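
Here is how I build the URL in attempt 3, together with a check that would have caught the HTML error pages before saving them (a minimal sketch; build_download_url and save_pdf_if_valid are hypothetical helper names, and the query parameters are the ones from the URL above):

from urllib.parse import urlencode

BASE = "https://www.gazzettaufficiale.it"

def build_download_url(date_yyyymmdd: str, issue: int) -> str:
    # Query parameters copied from the downloadPdf URL in attempt 3.
    params = {
        "dataPubblicazioneGazzetta": date_yyyymmdd,  # e.g. "19690102"
        "numeroGazzetta": issue,
        "tipoSerie": "SG",
        "tipoSupplemento": "GU",
        "numeroSupplemento": 0,
        "progressivo": 0,
        "edizione": 0,
        "estensione": "pdf",
    }
    return f"{BASE}/do/gazzetta/downloadPdf?{urlencode(params)}"

def save_pdf_if_valid(resp, path: str) -> bool:
    # resp is a requests.Response. A real PDF starts with the magic
    # bytes %PDF; the server's error page is HTML.
    ctype = resp.headers.get("Content-Type", "").lower()
    if resp.content[:4] != b"%PDF" or "html" in ctype:
        return False  # HTML error page: do not save it as .pdf
    with open(path, "wb") as f:
        f.write(resp.content)
    return True

This keeps me from saving error pages with a “.pdf” extension, but of course it does not make the server return the PDF.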

Minimal reproducible example (requests + bs4)

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BASE = "https://www.gazzettaufficiale.it"
YEAR = 1969
YEAR_URL = f"{BASE}/ricercaArchivioCompleto/serie_generale/{YEAR}"

s = requests.Session()
s.headers.update({
    "User-Agent": "Mozilla/5.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": BASE,
})

# 1) Collect detail pages (date + issue number)
r = s.get(YEAR_URL, timeout=60)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
details = []
for a in soup.find_all("a", href=True):
    href = a["href"]
    if ("/gazzetta/serie_generale/caricaDettaglio" in href
        and "dataPubblicazioneGazzetta=" in href
        and "numeroGazzetta=" in href):
        details.append(urljoin(BASE, href))

print("Detail pages found:", len(details))
print("Sample:", details[:3])

# 2) For one detail page, try to discover a real "download PDF" link
detail_url = details[0]
r = s.get(detail_url, timeout=60, headers={"Referer": YEAR_URL})
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

# Try common selectors / texts
dl = (soup.select_one('a.download_pdf[href]')
      or soup.select_one('a[href*="/do/gazzetta/downloadPdf"]'))
if not dl:
    for a in soup.find_all("a", href=True):
        if "scarica il pdf" in (a.get_text() or "").lower():
            dl = a
            break

print("Download link found on detail page?", bool(dl))
if dl:
    print("Download href:", urljoin(BASE, dl["href"]))

Output I get:

Detail pages found: 264
Sample: ['https://www.gazzettaufficiale.it/gazzetta/serie_generale/caricaDettaglio?dataPubblicazioneGazzetta=1969-01-02&numeroGazzetta=1', ...]
Download link found on detail page? False

When I instead build the downloadPdf URL from the query params and try to download it, the response is HTML, not a PDF. Earlier I inadvertently saved those HTML responses with a “.pdf” extension, which left me with 300+ “corrupted PDFs” that Acrobat refuses to open.

Any guidance or a working minimal example would be greatly appreciated. Thanks!


Solution

  • A solution with Selenium. Set dl_dir to the directory where the downloaded PDFs should be stored.

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.common.keys import Keys
    import time
    from lxml import html
    
    url = "https://www.gazzettaufficiale.it/ricerca/pdf/foglio_ordinario2/2/0/0?reset=true"
    dl_dir = "/home/lmc/tmp/test-ws/gaz"
    
    options = webdriver.ChromeOptions()
    options.add_argument("start-maximized")
    options.add_argument("window-size=2880x1620")
    options.add_argument("--headless")
    options.set_capability("pageLoadStrategy", "normal")
    options.add_argument("--enable-javascript")
    prefs = {
        # Skip images and stylesheets to speed up page loads.
        "profile.managed_default_content_settings.images": 2,
        "permissions.default.stylesheet": 2,
        # Download PDFs straight into dl_dir without a save-as prompt.
        "download.default_directory": dl_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
    }
    options.add_experimental_option("prefs", prefs)
    
    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(30)
    driver.get(url)
    
    try:
    
        # Open the year <select> and step down its option list to 1969.
        select_element = driver.find_element(By.ID, 'annoPubblicazione')
        select_element.click()
        time.sleep(2)

        actions = ActionChains(driver)
        for i in range(17):
            actions.send_keys(Keys.ARROW_DOWN)  # 17 presses reach 1969 in this select's option order
        actions.send_keys(Keys.ENTER)  # confirm the highlighted year
        actions.perform()
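
        # Alternatively, Selenium's Select helper avoids counting arrow-key
        # presses (a sketch; it assumes "1969" is the option's visible text):
        #   from selenium.webdriver.support.ui import Select
        #   Select(select_element).select_by_visible_text("1969")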
    
        # Submit the search form.
        submit = driver.find_element(By.XPATH, '//input[@name="cerca"]')
        submit.click()
    
        time.sleep(2)
        pageSource = driver.page_source
    
        doc = html.fromstring(pageSource)
        dl_list = doc.xpath('//a[@class="download_pdf"]/@href')
        print(f"by lxml {dl_list[0]}")
        print(f"dl: {len(dl_list)}, type: {type(dl_list[0])}")
        for gz in dl_list:
            durl = f"https://www.gazzettaufficiale.it{str(gz)}"
            print(f"Downloading: {durl}")
            driver.get(durl)
            # give the download time to finish before moving on
            time.sleep(8)
    #    wait_for_download_completion(dl_dir, 600)
    
    except Exception as e:
        print(f"Scrape/download failed: {e}")
        raise
    finally:
        driver.quit()
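
    Note: download behavior in headless Chrome is version-dependent; older builds silently refuse downloads in classic --headless mode. If nothing appears in dl_dir, try options.add_argument("--headless=new") or run the script headful first.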
    

    The following method could be used to check for complete downloads instead of a fixed sleep, but I haven't tested it much:

    def wait_for_download_completion(download_dir, timeout=60, check_interval=1):
        import glob
        import time
        # Chrome names in-progress downloads *.crdownload; wait until
        # none remain or the timeout expires.
        start_time = time.time()
        while glob.glob(f'{download_dir}/*.crdownload') and time.time() - start_time < timeout:
            time.sleep(check_interval)
        print(f"Download complete in {time.time() - start_time:.2f}s.")