asp.net, web-scraping, beautifulsoup, request

Request breaks down with field change (scraping)


I am trying to download some data (in an efficient way), however I encountered an unexpected problem.

Here is the code that works just fine:

import requests
import os
from bs4 import BeautifulSoup

# Browser-like request headers. Accept-Encoding is intentionally not set:
# requests then advertises only the encodings it is able to decode, whereas
# claiming 'br'/'zstd' without the optional decoders installed can leave the
# response body undecoded.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
}

url = 'https://profiles.doe.mass.edu/statereport/ssdr.aspx'

# Use one session for the GET and the POST (a Session keeps cookies between
# requests).
session = requests.Session()
response = session.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")

# Hidden state fields read from the page and echoed back in the form below.
viewstate = soup.select("#__VIEWSTATE")[0]['value']
viewstategenerator = soup.select("#__VIEWSTATEGENERATOR")[0]['value']
eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']

subgroup = 'WH'
offense = 'ALL' # <-- PROBLEM if changed to '2' or '17' or any other offense type
year = '2013'

form_data = {
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstategenerator,
    '__EVENTVALIDATION': eventvalidation,
    'ctl00$ContentPlaceHolder1$ddReportType': 'SCHOOL',
    'ctl00$ContentPlaceHolder1$ddYear': year,
    'ctl00$ContentPlaceHolder1$ddOffense': offense,
    'ctl00$ContentPlaceHolder1$ddStudentGroup': subgroup,
    # requests form-encodes the values itself, so the button label is given
    # with a literal space; 'View+Report' would be sent as 'View%2BReport'.
    'ctl00$ContentPlaceHolder1$btnViewReport': 'View Report',
    'ctl00$ContentPlaceHolder1$hfExport': 'Excel',
}

# stream=True defers downloading the body; the with-block releases the
# connection once the body has been written out.
with session.post(url, data=form_data, headers=headers, stream=True) as response:
    content_type = response.headers.get('Content-Type', '')

    if ('application/vnd.ms-excel' in content_type
            or 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' in content_type):

        filename = subgroup+'.xlsx'
        file_path = os.path.join(os.getcwd(), filename)

        # Write the spreadsheet in chunks instead of loading it into memory.
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        print(f"Excel file successfully downloaded to: {file_path}")
    else:
        # Keep whatever the server returned so the failure can be inspected.
        print("Did not receive an Excel file. Response content type:", response.headers.get('Content-Type'))
        with open('response.html', 'wb') as f:
            f.write(response.content)
        print("Response saved to response.html for debugging")

The problem is that as soon as I change the `offense` variable to '2', '3', etc. (consistent with the request I see in the browser) to change the offense type, the request fails to return the Excel file. It does work for later years (2018, 2019), when the coding changed and offense types are referred to using something like 'BULLY' and so on. I tried to debug with the if-else at the end, but it just gives some generic error (while still working just fine through the web interface).

Thanks, any help is MUCH appreciated!


Solution

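  • The web page uses ASP.NET Web Forms, and the dropdown behaviour depends not just on the posted values but also on intermediate postbacks: when a dropdown changes, the page submits itself with __EVENTTARGET (and __EVENTARGUMENT) identifying that control, and the server responds with refreshed dropdown options and fresh hidden state fields that it checks on the next request.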

    Your script is posting directly without simulating the intermediate step that happens when you manually select '7' in the browser — namely:

    1. You select Report Type = SCHOOL.
    2. The page posts back and refreshes the available Offense values.
    3. Then you select Offense = 7, for example, and click "View Report".

    In your script, you're skipping that first refresh.

    import requests
    import os
    from bs4 import BeautifulSoup
    
    # Browser-like headers sent with every request. Accept-Encoding is left to
    # requests, which advertises only the encodings it is able to decode;
    # claiming 'br'/'zstd' without the optional decoders installed can leave
    # the response body undecoded.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    url = 'https://profiles.doe.mass.edu/statereport/ssdr.aspx'

    # One session for all requests (a Session keeps cookies between requests).
    session = requests.Session()

    # initial page
    # The timeout keeps the script from hanging if the server stops answering;
    # raise_for_status() fails here rather than later while parsing an error page.
    response = session.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    
    def extract_state(soup):
        """Collect the hidden state fields from a parsed page.

        Args:
            soup: Parsed page exposing ``select_one`` (here, a BeautifulSoup
                document).

        Returns:
            A dict mapping '__VIEWSTATE', '__VIEWSTATEGENERATOR' and
            '__EVENTVALIDATION' to the ``value`` of the element with that id.

        Raises:
            ValueError: If any of the fields is absent or has no ``value``;
                the message names every missing field.
        """
        field_names = ('__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION')
        state = {}
        missing = []
        for name in field_names:
            element = soup.select_one(f"#{name}")
            value = element.get('value') if element is not None else None
            if value is None:
                missing.append(name)
            else:
                state[name] = value
        if missing:
            # Report all absent fields at once instead of failing on the first
            # one with an opaque "'NoneType' object is not subscriptable".
            raise ValueError(
                "Hidden form field(s) not found in page: " + ", ".join(missing)
            )
        return state
    
    # Report parameters, defined once so that both form posts and the output
    # filename always agree.
    report_type = 'SCHOOL'
    year = '2013'
    subgroup = 'WH'
    offense = '7'

    state_data = extract_state(soup)

    # POST back to set ReportType=SCHOOL
    # __EVENTTARGET names the dropdown, as if its selection had just changed.
    report_type_post = {
        **state_data,
        '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$ddReportType',
        '__EVENTARGUMENT': '',
        'ctl00$ContentPlaceHolder1$ddReportType': report_type,
        'ctl00$ContentPlaceHolder1$ddYear': year,
        'ctl00$ContentPlaceHolder1$ddStudentGroup': subgroup,
    }

    response = session.post(url, data=report_type_post, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    # Hidden state fields must come from the refreshed page, not the first one.
    state_data = extract_state(soup)

    #  full report request
    final_form = {
        **state_data,
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__LASTFOCUS': '',
        'ctl00$ContentPlaceHolder1$ddReportType': report_type,
        'ctl00$ContentPlaceHolder1$ddYear': year,
        'ctl00$ContentPlaceHolder1$ddOffense': offense,
        'ctl00$ContentPlaceHolder1$ddStudentGroup': subgroup,
        'ctl00$ContentPlaceHolder1$btnViewReport': 'View Report',
        'ctl00$ContentPlaceHolder1$hfExport': 'Excel',
    }

    # stream=True defers downloading the body; the with-block releases the
    # connection afterwards. No raise_for_status() here, so that an error page
    # still reaches the debugging branch below.
    with session.post(url, data=final_form, headers=headers, stream=True, timeout=30) as response:
        content_type = response.headers.get('Content-Type', '')

        if ('application/vnd.ms-excel' in content_type
                or 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' in content_type):

            filename = f'{subgroup}_{year}_Offense{offense}.xlsx'
            file_path = os.path.join(os.getcwd(), filename)

            # Write the spreadsheet in chunks instead of loading it into memory.
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            print(f"Excel file successfully downloaded to: {file_path}")

        else:
            # Keep whatever the server returned so the failure can be inspected.
            print("Did not receive an Excel file. Content-Type:", response.headers.get('Content-Type'))
            with open('response.html', 'wb') as f:
                f.write(response.content)
            print("Saved response to 'response.html' for debugging.")