python, selenium-webdriver, pdf, web-scraping, http-status-code-403

Python web scraping - Bulk downloading linked files from the SEC AAER site, 403 Forbidden error


I've been trying to download 300 linked files from the SEC's AAER site. Most of the links are PDFs, but some are web pages that I would need to save to PDF instead of downloading directly. I'm teaching myself some Python web scraping and this didn't seem like too hard a task, but I haven't been able to get past the 403 error when downloading.

This code works fine to scrape the links to the files and the four-digit AAER number I'd like to use to name the files:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
import requests

# Set up Chrome options to allow direct PDF download (for the download step)
download_path = "C:/Users/taylo/Downloads/sec_aaer_downloads"
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": download_path,  # Specify your preferred download directory
    "download.prompt_for_download": False,  # Disable download prompt
    "plugins.always_open_pdf_externally": True,  # Automatically open PDF in browser
    "safebrowsing.enabled": False,  # Disable Chrome’s safe browsing check that can block downloads
    "profile.default_content_settings.popups": 0  # Disable popups
})

# Set up the webdriver with options
driver = webdriver.Chrome(executable_path="C:/chromedriver/chromedriver", options=chrome_options)

# URLs for pages 1, 2, and 3
urls = [
    "https://www.sec.gov/enforcement-litigation/accounting-auditing-enforcement-releases?page=0",
    "https://www.sec.gov/enforcement-litigation/accounting-auditing-enforcement-releases?page=1",
    "https://www.sec.gov/enforcement-litigation/accounting-auditing-enforcement-releases?page=2"
]

# Initialize an empty list to store the URLs and AAER numbers
pdf_data = []

# Loop through each URL (pages 1, 2, and 3)
for url in urls:
    print(f"Scraping URL: {url}...")
    driver.get(url)

    # Wait for the table rows containing links to be loaded
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="block-uswds-sec-content"]/div/div/div[3]/div/table/tbody/tr[1]')))
    
    # Extract the link and AAER number from each row on the current page
    rows = driver.find_elements(By.XPATH, '//*[@id="block-uswds-sec-content"]/div/div/div[3]/div/table/tbody/tr')
    for row in rows:
        try:
            # Extract the link from the first column (PDF link)
            link_element = row.find_element(By.XPATH, './/td[2]/div[1]/a')
            link_href = link_element.get_attribute('href')
            
            # Extract the AAER number from the second column
            aaer_text_element = row.find_element(By.XPATH, './/td[2]/div[2]/span[2]')
            aaer_text = aaer_text_element.text
            aaer_number = aaer_text.split("AAER-")[1].split()[0]  # Extract the number after AAER-

            # Store the data in a list of dictionaries
            pdf_data.append({'link': link_href, 'aaer_number': aaer_number})
        except Exception as e:
            print(f"Error extracting data from row: {e}")

# Print the scraped data (optional for verification)
for entry in pdf_data:
    print(f"Link: {entry['link']}, AAER Number: {entry['aaer_number']}")

But when I try to do something like this, I can't get the downloads to go through:

import os
import time
import requests

# Set the download path
download_path = "C:/Users/taylo/Downloads/sec_aaer_downloads"
os.makedirs(download_path, exist_ok=True)

# Loop through each entry in the pdf_data list
for entry in pdf_data:
    try:
        # Extract the PDF link and AAER number
        link_href = entry['link']
        aaer_number = entry['aaer_number']

        # Send a GET request to download the PDF
        pdf_response = requests.get(link_href, stream=True, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        })

        # Check if the request was successful
        if pdf_response.status_code == 200:
            # Save the PDF to the download folder, using the AAER number as the filename
            pdf_file_path = os.path.join(download_path, f"{aaer_number}.pdf")
            with open(pdf_file_path, "wb") as pdf_file:
                for chunk in pdf_response.iter_content(chunk_size=8192):
                    pdf_file.write(chunk)
            print(f"Downloaded: {aaer_number}.pdf")
        else:
            print(f"Failed to download the file from {link_href}, status code: {pdf_response.status_code}")
    
    except Exception as e:
        print(f"Error downloading the PDF for AAER {aaer_number}: {e}")

At this point it would have been faster to download the files manually, but I want to know what I'm doing wrong. I've tried setting the User-Agent header and simulating a user click with Selenium. Thanks for any advice you may have!


Solution

  • After copying all the headers from the request your browser sends when you manually open the link containing the PDF:


            pdf_response = requests.get(link_href, headers={
                "Host": "www.sec.gov",
                "User-Agent": "YOUR_USER_AGENT",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br, zstd",
                "Connection": "keep-alive",
                "Cookie": "YOUR_COOKIE",
                "Upgrade-Insecure-Requests": "1",
                "Sec-Fetch-Dest": "document",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-Site": "none",
                "Sec-Fetch-User": "?1",
                "Priority": "u=0, i",
                "Pragma": "no-cache",
            })
    

    I was able to download the files.


    You also need to remove the stream=True argument from the requests.get() call.

    This answers why the 403 Forbidden status code occurs: you need all of these headers to access the URLs.
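
    For reference, here is a minimal sketch of the whole download loop with headers along those lines, assuming the pdf_data list built in the question. The User-Agent value is a placeholder: the SEC's guidance on automated access asks for a descriptive User-Agent that includes contact information and a modest request rate. If a slimmed-down header set like this still returns 403, add the remaining headers (including the Cookie) exactly as copied from the browser.

        import os
        import time
        import requests

        download_path = "C:/Users/taylo/Downloads/sec_aaer_downloads"
        os.makedirs(download_path, exist_ok=True)

        # One session so the same headers (and any cookies the server sets)
        # are reused on every request
        session = requests.Session()
        session.headers.update({
            # Placeholder - replace with your own name and contact address
            "User-Agent": "Your Name your.email@example.com",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        })

        for entry in pdf_data:  # pdf_data comes from the scraping step in the question
            link_href = entry['link']
            aaer_number = entry['aaer_number']
            response = session.get(link_href)
            if response.status_code == 200:
                pdf_file_path = os.path.join(download_path, f"{aaer_number}.pdf")
                with open(pdf_file_path, "wb") as pdf_file:
                    pdf_file.write(response.content)
                print(f"Downloaded: {aaer_number}.pdf")
            else:
                print(f"Failed to download {link_href}, status code: {response.status_code}")
            time.sleep(0.5)  # keep the request rate low; sec.gov throttles heavy automated traffic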

    Hope this helps!
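
    As an aside, for the links in the list that are HTML pages rather than PDFs, one option is to render them to PDF through Chrome's DevTools protocol from the Selenium session that did the scraping. This is only a sketch, assuming Selenium 4 (which exposes execute_cdp_cmd) and a headless Chrome session, where Page.printToPDF is supported; driver, download_path, and the entry fields are the ones from the question, and save_page_as_pdf is a hypothetical helper name.

        import os
        import base64

        def save_page_as_pdf(driver, url, out_path):
            """Load a page in the existing Chrome session and print it to a PDF file."""
            driver.get(url)
            # Page.printToPDF returns the rendered page as base64-encoded PDF data
            result = driver.execute_cdp_cmd("Page.printToPDF", {"printBackground": True})
            with open(out_path, "wb") as f:
                f.write(base64.b64decode(result["data"]))

        # Example usage with one scraped entry:
        # save_page_as_pdf(driver, entry['link'],
        #                  os.path.join(download_path, f"{entry['aaer_number']}.pdf"))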