I’m trying to programmatically download the full “pubblicazione completa non certificata” (complete, non-certified publication) PDFs of the Italian Gazzetta Ufficiale – Serie Generale for 1969 (for an academic article). The site has a 1946–1985 “Formato grafico PDF” search and an archive index:
https://www.gazzettaufficiale.it/ricerca/pdf/foglio_ordinario2/2/0/0?reset=true
https://www.gazzettaufficiale.it/ricercaArchivioCompleto/serie_generale/1969
.../gazzetta/serie_generale/caricaDettaglio?dataPubblicazioneGazzetta=1969-01-02&numeroGazzetta=1
Selenium: navigate to the year picker and click 1969. On my machine it often times out: the year link/input is either not present, hidden in an iframe, or overlaid by a banner. I tried switching frames and even injecting the year via JS, but it’s brittle and unreliable.
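Roughly what that attempt looked like (a minimal sketch, not my exact script; the annoPubblicazione ID and treating the year picker as a plain <select> with “1969” as visible option text are assumptions):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
try:
    driver.get("https://www.gazzettaufficiale.it/ricerca/pdf/foglio_ordinario2/2/0/0?reset=true")
    # wait explicitly for the year picker instead of relying on implicit timing
    year_select = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.ID, "annoPubblicazione")))
    try:
        Select(year_select).select_by_visible_text("1969")
    except Exception:
        # fallback: set the value via JS when the element is hidden or overlaid
        driver.execute_script(
            "arguments[0].value = '1969';"
            "arguments[0].dispatchEvent(new Event('change'));",
            year_select)
finally:
    driver.quit()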
Requests + BeautifulSoup on the “year grid” page: in some HTML copies (from another session) I can see direct links like
<a class="download_pdf" href="/do/gazzetta/downloadPdf?...">Download PDF</a>
—but in my live session those anchors are not there, so scraping returns 0 links.
Manually building the download URL from the archive list (date & issue number), e.g.:
/do/gazzetta/downloadPdf?dataPubblicazioneGazzetta=19690102&numeroGazzetta=1&tipoSerie=SG&tipoSupplemento=GU&numeroSupplemento=0&progressivo=0&edizione=0&estensione=pdf
This returns HTML, not a PDF. When saved with “.pdf”, Acrobat says the file is damaged; the file actually contains an HTML message like: “Il pdf selezionato non è stato trovato” (“The selected PDF was not found”).
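A quick way to catch this failure mode (a minimal sketch; it only assumes that a genuine PDF starts with the %PDF magic bytes) is to inspect the response before writing anything with a “.pdf” extension:

import requests

def looks_like_pdf(resp):
    # A genuine PDF starts with the magic bytes "%PDF"; the site's error page is HTML.
    ctype = resp.headers.get("Content-Type", "")
    return resp.content[:4] == b"%PDF" or "application/pdf" in ctype

url = ("https://www.gazzettaufficiale.it/do/gazzetta/downloadPdf"
       "?dataPubblicazioneGazzetta=19690102&numeroGazzetta=1&tipoSerie=SG"
       "&tipoSupplemento=GU&numeroSupplemento=0&progressivo=0&edizione=0&estensione=pdf")
r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=60)
if looks_like_pdf(r):
    with open("19690102_1.pdf", "wb") as f:
        f.write(r.content)
else:
    print("Not a PDF:", r.headers.get("Content-Type"), r.text[:100])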
Requests on each detail page: fetch the detail URL and look for either a.download_pdf or anchors containing “Scarica il PDF” / “pubblicazione completa non certificata”. For 1969 I consistently find no such link on the page, so I can’t discover a valid downloadPdf URL at runtime.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs
BASE = "https://www.gazzettaufficiale.it"
YEAR = 1969
YEAR_URL = f"{BASE}/ricercaArchivioCompleto/serie_generale/{YEAR}"
s = requests.Session()
s.headers.update({
    "User-Agent": "Mozilla/5.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": BASE,
})
# 1) Collect detail pages (date + issue number)
r = s.get(YEAR_URL, timeout=60)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
details = []
for a in soup.find_all("a", href=True):
    href = a["href"]
    if ("/gazzetta/serie_generale/caricaDettaglio" in href
            and "dataPubblicazioneGazzetta=" in href
            and "numeroGazzetta=" in href):
        details.append(urljoin(BASE, href))
print("Detail pages found:", len(details))
print("Sample:", details[:3])
# 2) For one detail page, try to discover a real "download PDF" link
detail_url = details[0]
r = s.get(detail_url, timeout=60, headers={"Referer": YEAR_URL})
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
# Try common selectors / texts
dl = (soup.select_one('a.download_pdf[href]')
      or soup.select_one('a[href*="/do/gazzetta/downloadPdf"]'))
if not dl:
    for a in soup.find_all("a", href=True):
        if "scarica il pdf" in (a.get_text() or "").lower():
            dl = a
            break

print("Download link found on detail page?", bool(dl))
if dl:
    print("Download href:", urljoin(BASE, dl["href"]))
Output I get:
Detail pages found: 264
Sample: [https://www.gazzettaufficiale.it/gazzetta/serie_generale/caricaDettaglio?dataPubblicazioneGazzetta=1969-01-02&numeroGazzetta=1, ...]
Download link found on detail page? False
When I instead build the downloadPdf URL from the query params and try to download it, the response is HTML, not a PDF. Earlier I inadvertently saved those HTML responses as “.pdf”, which left me with 300+ “corrupted PDFs” that Acrobat refuses to open.
Any guidance or a working minimal example would be greatly appreciated. Thanks!
A solution with Selenium. Set dl_dir to the directory where the PDFs must be stored. A fixed time.sleep(8) gives each download time to finish, but it might need adjustment if downloads are incomplete.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import time
from lxml import html
url = "https://www.gazzettaufficiale.it/ricerca/pdf/foglio_ordinario2/2/0/0?reset=true"
dl_dir = "/home/lmc/tmp/test-ws/gaz"
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument("window-size=2880x1620")
#options.add_experimental_option("excludeSwitches", ["enable-automation"])
#options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--headless")
#options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
options.set_capability("pageLoadStrategy", "normal")
options.add_argument("--enable-javascript")
prefs = {"profile.managed_default_content_settings.images": 2, "permissions.default.stylesheet": 2,
         "download.default_directory": dl_dir,
         'download.prompt_for_download': False,
         'download.directory_upgrade': True,}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(30)
driver.get(url)
try:
    # open the year <select> and walk down to 1969 with the keyboard
    select_element = driver.find_element(By.ID, 'annoPubblicazione')
    select_element.click()
    time.sleep(2)

    actions = ActionChains(driver)
    for i in range(17):
        actions.send_keys(Keys.ARROW_DOWN)  # Navigate down the year list
    actions.send_keys(Keys.ENTER)  # Select the highlighted year (1969)
    actions.perform()

    submit = driver.find_element(By.XPATH, '//input[@name="cerca"]')
    submit.click()
    time.sleep(2)

    # parse the result page and collect every "download_pdf" link
    pageSource = driver.page_source
    doc = html.fromstring(pageSource)
    dl_list = doc.xpath('//a[@class="download_pdf"]/@href')
    print(f"by lxml {dl_list[0]}")
    print(f"dl: {len(dl_list)}, type: {type(dl_list[0])}")

    for gz in dl_list:
        durl = f"https://www.gazzettaufficiale.it{str(gz)}"
        print(f"Downloading: {durl}")
        driver.get(durl)
        # give the download time to finish
        time.sleep(8)
        # wait_for_download_completion(dl_dir, 600)
except Exception as e:
    print("Invalid URL")
    raise e
finally:
    driver.quit()
The following helper could be used to check for complete downloads instead of a fixed sleep, but I haven't tested it much:
def wait_for_download_completion(download_dir, timeout=60, check_interval=1):
    import glob
    start_time = time.time()
    # Chrome keeps in-progress downloads as *.crdownload; wait until none remain
    while glob.glob(f'{download_dir}/*.crdownload') and time.time() - start_time < timeout:
        time.sleep(check_interval)
    print(f"Download complete after {time.time() - start_time:.2f}s.")