python, selenium-webdriver, web-scraping, pdf, beautifulsoup

How to reliably download 1969 “Gazzetta Ufficiale” PDFs (Italian Official Gazette) with Python?


I’m trying to programmatically download the full “pubblicazione completa non certificata” (complete, non-certified publication) PDFs of the Italian Gazzetta Ufficiale – Serie Generale for 1969 (for an academic article). The site offers a 1946–1985 “Formato grafico PDF” (graphic PDF format) search and an archive index page.

What I’ve tried

  1. Selenium: navigate to the year picker and click 1969. On my machine it often times out: the year link/input is either not present, hidden in an iframe, or overlaid by a banner. I tried switching frames and even injecting the year via JS, but it’s brittle and unreliable.

  2. Requests + BeautifulSoup on the “year grid” page: in some HTML copies (from another session) I can see direct links like <a class="download_pdf" href="/do/gazzetta/downloadPdf?...">Download PDF</a>, but in my live session those anchors are not there, so scraping returns 0 links.

  3. Manually building the download URL from the archive list (date & issue number), e.g. /do/gazzetta/downloadPdf?dataPubblicazioneGazzetta=19690102&numeroGazzetta=1&tipoSerie=SG&tipoSupplemento=GU&numeroSupplemento=0&progressivo=0&edizione=0&estensione=pdf. This returns HTML, not a PDF. When saved with a “.pdf” extension, Acrobat says the file is damaged; the file actually contains an HTML message like “Il pdf selezionato non è stato trovato” (“The selected PDF was not found”). A sketch of this URL construction, plus a check that catches these HTML error pages, follows this list.

  4. Requests on each detail page: fetch the detail URL and look for either a.download_pdf or anchors whose text contains “Scarica il PDF” (“Download the PDF”) or “pubblicazione completa non certificata”. For 1969 I consistently find no such link on the page, so I can’t discover a valid downloadPdf URL at runtime.
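
Here is how I build the URL in attempt 3, together with a check that would have caught the HTML error pages before saving them (a minimal sketch; build_download_url and save_pdf_if_valid are hypothetical helper names, and the query parameters are the ones from the URL above):

from urllib.parse import urlencode

BASE = "https://www.gazzettaufficiale.it"

def build_download_url(date_yyyymmdd: str, issue: int) -> str:
    # Query parameters copied from the downloadPdf URL in attempt 3.
    params = {
        "dataPubblicazioneGazzetta": date_yyyymmdd,  # e.g. "19690102"
        "numeroGazzetta": issue,
        "tipoSerie": "SG",
        "tipoSupplemento": "GU",
        "numeroSupplemento": 0,
        "progressivo": 0,
        "edizione": 0,
        "estensione": "pdf",
    }
    return f"{BASE}/do/gazzetta/downloadPdf?{urlencode(params)}"

def save_pdf_if_valid(resp, path: str) -> bool:
    # resp is a requests.Response. A real PDF starts with the magic
    # bytes %PDF; the server's error page is HTML.
    ctype = resp.headers.get("Content-Type", "").lower()
    if resp.content[:4] != b"%PDF" or "html" in ctype:
        return False  # HTML error page: do not save it as .pdf
    with open(path, "wb") as f:
        f.write(resp.content)
    return True

This keeps me from saving error pages with a “.pdf” extension, but of course it does not make the server return the PDF.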

Minimal reproducible example (requests + bs4)

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BASE = "https://www.gazzettaufficiale.it"
YEAR = 1969
YEAR_URL = f"{BASE}/ricercaArchivioCompleto/serie_generale/{YEAR}"

s = requests.Session()
s.headers.update({
    "User-Agent": "Mozilla/5.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": BASE,
})

# 1) Collect detail pages (date + issue number)
r = s.get(YEAR_URL, timeout=60)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
details = []
for a in soup.find_all("a", href=True):
    href = a["href"]
    if ("/gazzetta/serie_generale/caricaDettaglio" in href
        and "dataPubblicazioneGazzetta=" in href
        and "numeroGazzetta=" in href):
        details.append(urljoin(BASE, href))

print("Detail pages found:", len(details))
print("Sample:", details[:3])

# 2) For one detail page, try to discover a real "download PDF" link
detail_url = details[0]
r = s.get(detail_url, timeout=60, headers={"Referer": YEAR_URL})
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

# Try common selectors / texts
dl = (soup.select_one('a.download_pdf[href]')
      or soup.select_one('a[href*="/do/gazzetta/downloadPdf"]'))
if not dl:
    for a in soup.find_all("a", href=True):
        if "scarica il pdf" in (a.get_text() or "").lower():
            dl = a
            break

print("Download link found on detail page?", bool(dl))
if dl:
    print("Download href:", urljoin(BASE, dl["href"]))

Output I get:

Detail pages found: 264
Sample: ['https://www.gazzettaufficiale.it/gazzetta/serie_generale/caricaDettaglio?dataPubblicazioneGazzetta=1969-01-02&numeroGazzetta=1', ...]
Download link found on detail page? False

When I instead build the downloadPdf URL from the query params and try to download it, the response is HTML, not a PDF. Earlier I inadvertently saved those HTML responses with a “.pdf” extension, which left me with 300+ “corrupted PDFs” that Acrobat refuses to open.

Any guidance or a working minimal example would be greatly appreciated. Thanks!


Solution

  • A solution with Selenium. Set dl_dir to the directory where the downloaded PDFs should be stored.

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.common.keys import Keys
    import time
    from lxml import html
    
    url = "https://www.gazzettaufficiale.it/ricerca/pdf/foglio_ordinario2/2/0/0?reset=true"
    dl_dir = "/home/lmc/tmp/test-ws/gaz"
    
    options = webdriver.ChromeOptions()
    options.add_argument("start-maximized")
    options.add_argument("window-size=2880x1620")
    options.add_argument("--headless")
    options.set_capability("pageLoadStrategy", "normal")
    options.add_argument("--enable-javascript")
    prefs = {
        # Skip images and stylesheets to speed up page loads.
        "profile.managed_default_content_settings.images": 2,
        "permissions.default.stylesheet": 2,
        # Download PDFs straight into dl_dir without a save-as prompt.
        "download.default_directory": dl_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
    }
    options.add_experimental_option("prefs", prefs)
    
    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(30)
    driver.get(url)
    
    try:
    
        # Open the year <select> and step down its option list to 1969.
        select_element = driver.find_element(By.ID, 'annoPubblicazione')
        select_element.click()
        time.sleep(2)

        actions = ActionChains(driver)
        for i in range(17):
            actions.send_keys(Keys.ARROW_DOWN)  # 17 presses reach 1969 in this select's option order
        actions.send_keys(Keys.ENTER)  # confirm the highlighted year
        actions.perform()
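
        # Alternatively, Selenium's Select helper avoids counting arrow-key
        # presses (a sketch; it assumes "1969" is the option's visible text):
        #   from selenium.webdriver.support.ui import Select
        #   Select(select_element).select_by_visible_text("1969")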
    
        # Submit the search form.
        submit = driver.find_element(By.XPATH, '//input[@name="cerca"]')
        submit.click()
    
        time.sleep(2)
        pageSource = driver.page_source
    
        doc = html.fromstring(pageSource)
        dl_list = doc.xpath('//a[@class="download_pdf"]/@href')
        print(f"by lxml {dl_list[0]}")
        print(f"dl: {len(dl_list)}, type: {type(dl_list[0])}")
        for gz in dl_list:
            durl = f"https://www.gazzettaufficiale.it{str(gz)}"
            print(f"Downloading: {durl}")
            driver.get(durl)
            # give the download time to finish before moving on
            time.sleep(8)
    #    wait_for_download_completion(dl_dir, 600)
    
    except Exception as e:
        print(f"Scrape/download failed: {e}")
        raise
    finally:
        driver.quit()
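
    Note: download behavior in headless Chrome is version-dependent; older builds silently refuse downloads in classic --headless mode. If nothing appears in dl_dir, try options.add_argument("--headless=new") or run the script headful first.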
    

    The following method could be used to check for complete downloads instead of a fixed sleep, but I haven't tested it much:

    def wait_for_download_completion(download_dir, timeout=60, check_interval=1):
        import glob
        import time
        # Chrome names in-progress downloads *.crdownload; wait until
        # none remain or the timeout expires.
        start_time = time.time()
        while glob.glob(f'{download_dir}/*.crdownload') and time.time() - start_time < timeout:
            time.sleep(check_interval)
        print(f"Download complete in {time.time() - start_time:.2f}s.")