I am trying to download some data (in an efficient way), however I encountered an unexpected problem.
Here is the code that works just fine:
import requests
import os
from bs4 import BeautifulSoup
# Browser-like headers sent with both the GET and the POST below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1'
}
url = 'https://profiles.doe.mass.edu/statereport/ssdr.aspx'


def hidden_value(page, field_id):
    """Return the ``value`` attribute of the first element with id *field_id*."""
    return page.select("#" + field_id)[0]['value']


# Fetch the form once and read the hidden fields it carries.
session = requests.Session()
landing = session.get(url, headers=headers)
landing_soup = BeautifulSoup(landing.content, "html.parser")
viewstate = hidden_value(landing_soup, "__VIEWSTATE")
viewstategenerator = hidden_value(landing_soup, "__VIEWSTATEGENERATOR")
eventvalidation = hidden_value(landing_soup, "__EVENTVALIDATION")
# Read from the page but not sent in the form below.
eventtarget = hidden_value(landing_soup, "__EVENTTARGET")

# Report selection.
subgroup = 'WH'
offense = 'ALL' # <-- PROBLEM if changed to '2' or '17' or any other offense type
year = '2013'

payload = {
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstategenerator,
    '__EVENTVALIDATION': eventvalidation,
    'ctl00$ContentPlaceHolder1$ddReportType': 'SCHOOL',
    'ctl00$ContentPlaceHolder1$ddYear': year,
    'ctl00$ContentPlaceHolder1$ddOffense': offense,
    'ctl00$ContentPlaceHolder1$ddStudentGroup': subgroup,
    'ctl00$ContentPlaceHolder1$btnViewReport': 'View+Report',
    'ctl00$ContentPlaceHolder1$hfExport': "Excel"
}
response = session.post(url, data=payload, headers=headers, stream=True)

# Content types that are treated as a successful Excel download.
excel_types = (
    'application/vnd.ms-excel',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
)
content_type = response.headers.get('Content-Type', '')
got_excel = any(kind in content_type for kind in excel_types)

if not got_excel:
    # Keep whatever came back so it can be inspected by hand.
    print("Did not receive an Excel file. Response content type:", response.headers.get('Content-Type'))
    with open('response.html', 'wb') as debug_file:
        debug_file.write(response.content)
    print("Response saved to response.html for debugging")
else:
    filename = subgroup+'.xlsx'
    file_path = os.path.join(os.getcwd(), filename)
    with open(file_path, 'wb') as out_file:
        # Stream the body to disk in 8 KiB chunks.
        out_file.writelines(response.iter_content(chunk_size=8192))
    print(f"Excel file successfully downloaded to: {file_path}")
The problem is that as soon as I change the `offense` variable to '2', '3', etc. (consistent with the request I see in the browser) to select a different offense type, the request fails to return the Excel file. It does work for later years (2018, 2019), when the coding changed and offense types are referred to with codes like 'BULLY'. I tried to debug with the if-else at the end, but the response is just some generic error (while everything still works fine through the web interface).
Thanks, any help is MUCH appreciated!
The web page uses ASP.NET Web Forms, and the dropdown behaviour depends not just on the posted values, but on JavaScript events that populate or validate fields server-side — triggered when a dropdown change sets the `__EVENTTARGET` or `__EVENTARGUMENT` values.
Your script is posting directly without simulating the intermediate step that happens when you manually select '7' in the browser — namely:
You select Report Type = SCHOOL
The page posts back and refreshes available Offense values
Then you select Offense = 7 for example, and click "View Report"
In your script, you're skipping that first refresh.
import requests
import os
from bs4 import BeautifulSoup
# NOTE(review): this value is not referenced anywhere in the visible script;
# confirm whether it is still needed.
google_drive_bc = os.environ.get('google_drive_bc')

# Browser-like headers sent with every request.
# 'Accept-Encoding' is intentionally not set here: requests fills it in with
# the encodings it can actually decode. Advertising 'br'/'zstd' by hand risks
# an undecodable body when the optional decoder packages are not installed.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}
url = 'https://profiles.doe.mass.edu/statereport/ssdr.aspx'
session = requests.Session()

# initial page
# A timeout keeps the script from hanging forever on an unresponsive server.
response = session.get(url, headers=headers, timeout=30)
# Fail here with a clear HTTP error rather than later while parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
def extract_state(soup):
    """Collect the hidden form-state fields from a parsed page.

    Args:
        soup: Parsed page exposing ``select_one(css_selector)``; each matched
            element must support ``element['value']``.

    Returns:
        dict mapping ``__VIEWSTATE``, ``__VIEWSTATEGENERATOR`` and
        ``__EVENTVALIDATION`` to the ``value`` of the element with that id.

    Raises:
        ValueError: if one of the hidden fields is not present in the page.
    """
    state = {}
    for field in ('__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION'):
        element = soup.select_one(f"#{field}")
        if element is None:
            # Name the missing field instead of failing with an opaque
            # "'NoneType' object is not subscriptable".
            raise ValueError(f"hidden form field {field!r} not found in page")
        state[field] = element['value']
    return state
state_data = extract_state(soup)

# Report parameters, defined once so both postbacks and the output filename
# always agree when a different report is requested.
report_type = 'SCHOOL'
year = '2013'
subgroup = 'WH'
offense = '7'
request_timeout = 30  # seconds; avoids hanging forever on an unresponsive server

# POST back to set ReportType=SCHOOL
report_type_post = {
    **state_data,
    '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$ddReportType',
    '__EVENTARGUMENT': '',
    'ctl00$ContentPlaceHolder1$ddReportType': report_type,
    'ctl00$ContentPlaceHolder1$ddYear': year,
    'ctl00$ContentPlaceHolder1$ddStudentGroup': subgroup,
}
response = session.post(url, data=report_type_post, headers=headers, timeout=request_timeout)
# Stop with a clear HTTP error instead of parsing an error page for form state.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
# The refreshed page carries new state values; the next POST must send those.
state_data = extract_state(soup)

# full report request
final_form = {
    **state_data,
    '__EVENTTARGET': '',
    '__EVENTARGUMENT': '',
    '__LASTFOCUS': '',
    'ctl00$ContentPlaceHolder1$ddReportType': report_type,
    'ctl00$ContentPlaceHolder1$ddYear': year,
    'ctl00$ContentPlaceHolder1$ddOffense': offense,
    'ctl00$ContentPlaceHolder1$ddStudentGroup': subgroup,
    'ctl00$ContentPlaceHolder1$btnViewReport': 'View Report',
    'ctl00$ContentPlaceHolder1$hfExport': 'Excel'
}

# Content types that are treated as a successful Excel download.
excel_types = (
    'application/vnd.ms-excel',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
)

# The context manager releases the streamed connection on every path.
# The status is deliberately not checked here, so an error page still gets
# saved to response.html for inspection.
with session.post(url, data=final_form, headers=headers, stream=True,
                  timeout=request_timeout) as response:
    content_type = response.headers.get('Content-Type', '')
    if any(mime in content_type for mime in excel_types):
        filename = f'{subgroup}_{year}_Offense{offense}.xlsx'
        file_path = os.path.join(os.getcwd(), filename)
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Excel file successfully downloaded to: {file_path}")
    else:
        print("Did not receive an Excel file. Content-Type:", response.headers.get('Content-Type'))
        with open('response.html', 'wb') as f:
            f.write(response.content)
        print("Saved response to 'response.html' for debugging.")