Tags: python, selenium-webdriver, web-scraping, pdf, python-requests

How to download protected PDF (ViewDocument) using Selenium or requests?


I'm trying to download a protected PDF from the New York State Courts NYSCEF website using Python. The URL looks like this:

https://iapps.courts.state.ny.us/nyscef/ViewDocument?docIndex=cdHe_PLUS_DaUdFKcTLzBtSo6zw==

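When I try to use requests.get(), or even navigate to the page with Selenium, the document never loads: requests always returns a 403 status, and Selenium never finds the PDF's <embed> element.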

Here’s what I’ve tried:

Using requests:

import requests

# Document URL (the docIndex value is elided in this snippet).
url = "https://iapps.courts.state.ny.us/nyscef/ViewDocument?docIndex=..."

# Browser-like headers sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://iapps.courts.state.ny.us/nyscef/"
}

# requests applies no timeout unless one is given, so without it a stalled
# connection would block this script indefinitely.
response = requests.get(url, headers=headers, timeout=30)
print(response.status_code)  # Always 403

And using SeleniumBase:

from seleniumbase import SB

# Open the document in a visible browser and report where the PDF <embed>
# element points, if the page has one.
with SB(headless=False) as sb:
    sb.open(url)
    sb.wait(5)
    try:
        pdf_embed = sb.find_element("embed")
        embed_src = pdf_embed.get_attribute("src")
        print(embed_src)
    except Exception as err:
        # Reached when the lookup (or reading its attribute) raises.
        print("❌ No embed tag found", err)

Nothing works.

Full code for reference:

from seleniumbase import SB
import requests
import os
import time

def download_pdf_with_selenium_and_requests():
    """Download the NYSCEF document PDF by reusing a browser session.

    Opens the document URL in a headless SeleniumBase browser, reads the
    ``src`` of the page's ``<embed>`` element, copies the browser's cookies
    into a ``requests.Session`` and fetches that URL. The body is written to
    ``./downloads/NYSCEF_Document.pdf`` only when the response is a 200 with
    a PDF content type.

    Returns:
        None. Progress and every failure are reported with ``print``.
    """
    # Target document URL
    doc_url = "https://iapps.courts.state.ny.us/nyscef/ViewDocument?docIndex=cdHe_PLUS_DaUdFKcTLzBtSo6zw=="

    # requests applies no timeout unless one is given; bound the wait so a
    # stalled connection cannot hang the script.
    request_timeout = 30

    # Setup download directory
    download_dir = os.path.join(os.getcwd(), "downloads")
    os.makedirs(download_dir, exist_ok=True)
    filename = os.path.join(download_dir, "NYSCEF_Document.pdf")

    with SB(headless=True) as sb:
        # Step 1: Navigate to the document page (using browser session)
        sb.open(doc_url)
        time.sleep(5)  # Wait for any redirects/cookies to be set

        # Step 2: Grab the actual PDF <embed src>
        try:
            embed = sb.find_element("embed")
            pdf_url = embed.get_attribute("src")
            print(f"Found PDF URL: {pdf_url}")
        except Exception as e:
            print(f"No <embed> tag found: {e}")
            return

        # An <embed> without a src would otherwise reach session.get(None).
        if not pdf_url:
            print("The <embed> tag has no src attribute; nothing to download.")
            return

        # Step 3: Copy cookies from the Selenium session, keeping each
        # cookie's domain and path so it is only sent where the browser
        # would send it (a bare name/value cookie is sent to every host).
        session = requests.Session()
        for cookie in sb.driver.get_cookies():
            session.cookies.set(
                cookie["name"],
                cookie["value"],
                domain=cookie.get("domain", ""),
                path=cookie.get("path", "/"),
            )

        # Step 4: Download PDF using requests with cookies.
        # Send the browser's own User-Agent rather than a made-up one.
        # NOTE(review): presumably the site ties the session to the
        # User-Agent that obtained the cookies - confirm against the server.
        headers = {
            "User-Agent": sb.execute_script("return navigator.userAgent;"),
            "Referer": doc_url,
        }

        try:
            response = session.get(
                pdf_url, headers=headers, timeout=request_timeout
            )
        except requests.RequestException as e:
            # Covers timeouts, connection errors and malformed URLs.
            print(f"PDF download failed: {e}")
            return

        content_type = response.headers.get("Content-Type", "")
        if response.status_code == 200 and "application/pdf" in content_type:
            with open(filename, "wb") as f:
                f.write(response.content)
            print(f"PDF saved as: {filename}")
        else:
            print(f"PDF download failed. Status: {response.status_code}")
            print(f"Content-Type: {response.headers.get('Content-Type')}")
            print(f"Final URL: {response.url}")

# Run the download only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    download_pdf_with_selenium_and_requests()

Response:

No <embed> tag found: Message: 
 Element {embed} was not present after 10 seconds!

Solution

  • With SeleniumBase, you can do the following to download that file to the ./downloaded_files/ folder:

    from seleniumbase import SB
    
    # Document to fetch.
    url = "https://iapps.courts.state.ny.us/nyscef/ViewDocument?docIndex=cdHe_PLUS_DaUdFKcTLzBtSo6zw=="

    # Launch the browser with the options this answer relies on and load the
    # document through CDP mode.
    with SB(uc=True, test=True, external_pdf=True) as sb:
        sb.activate_cdp_mode(url)
        # Presumably gives the download time to finish before the browser
        # closes - adjust if the file arrives slower or faster.
        sb.sleep(10)