I am trying to scrape data from the web page "https://www.cocorahs.org/ViewData/StationPrecipSummary.aspx" using html_session(). I need the "Date" and "Precip In" values in two separate columns of a table for Station 1: MD-BL-13.
Can someone help me out?
I found this code on Stack Overflow, but it gives me the error "NoneType object has no attribute html" on the second-to-last line (df = pd.read_html(...)). What should I change in the code below?
import requests
from requests_html import HTMLSession
import pandas as pd
# Question snippet: GET the CoCoRaHS summary form, POST it back with a station
# and date range, then parse the result table with pandas.
# NOTE(review): indentation was lost in this paste, so the extent of the
# `with` body cannot be determined from the text shown here.
with HTMLSession() as s:
# Fetch the form page so the hidden __VIEWSTATE input can be read from it.
r = s.get('https://www.cocorahs.org/ViewData/StationPrecipSummary.aspx')
hiddens = r.html.find('input[name=__VIEWSTATE]', first=True).attrs.get('value')
# Form fields sent back to the same URL.
payload = {
'__EVENTTARGET': '',
# NOTE(review): this key has ONE leading underscore, while the input read
# above is named '__VIEWSTATE' (two underscores) - likely a typo; confirm.
'_VIEWSTATE': hiddens,
'obsSwitcher:ddlObsUnits': 'usunits',
'tbStation1': 'MD-BL-13',
'ucDateRangeFilter:dcStartDate': '8/1/2019',
'ucDateRangeFilter_dcStartDate_p': '2019-8-1-0-0-0-0',
'ucDateRangeFilter:dcEndDate': '8/10/2019',
'ucDateRangeFilter_dcEndDate_p': '2019-8-10-0-0-0-0',
'btnSubmit': 'Get Summary'
}
r = s.post('https://www.cocorahs.org/ViewData/StationPrecipSummary.aspx', data=payload)
# Look up the first element matching the CSS selector 'table.Grid'.
table = r.html.find('table.Grid', first=True)
# NOTE(review): the reported "NoneType object has no attribute html" error is
# raised on the next line, which means `table` is None here, i.e. no
# 'table.Grid' element was present in the POST response.
df = pd.read_html(table.html, header=0)[0]
print(df)
Welcome to SO!
The problem is that you are not submitting enough data to the server, so the server treats this as an invalid request and sends back the home page instead of the results. You can verify this by saving the response to a file and opening it in a browser.
So I opened the developer tools and copied the exact request that the web page sends. Here is my solution, using bs4 and requests. Suggestion: use datetime to format the start and end dates automatically.
from datetime import date
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
URL = "https://www.cocorahs.org/ViewData/StationPrecipSummary.aspx"

# Hidden inputs read from the GET response and echoed back in the POST.
HIDDEN_FIELDS = ("__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION")


def _us_date(day: date) -> str:
    """Format *day* as M/D/YYYY without zero padding, e.g. ``8/1/2019``."""
    return f"{day.month}/{day.day}/{day.year}"


def _hidden_value(soup, name: str) -> str:
    """Return the ``value`` of the hidden input called *name*.

    Raises:
        RuntimeError: if the page has no such input carrying a value.
    """
    tag = soup.find("input", {"name": name, "value": True})
    if tag is None:
        raise RuntimeError(f"hidden input {name!r} not found")
    return tag["value"]


def build_payload(hidden: dict, station: str, start: date, end: date) -> dict:
    """Build the form data for the "Get Summary" POST request.

    Args:
        hidden: Mapping holding the values of every name in HIDDEN_FIELDS.
        station: Station number for the first station box, e.g. "MD-BL-13".
        start: First day of the requested range.
        end: Last day of the requested range.

    Returns:
        A dict of form field names to string values, ready for ``data=``.

    Raises:
        ValueError: if *end* is earlier than *start*.
        KeyError: if *hidden* lacks one of the HIDDEN_FIELDS.
    """
    if end < start:
        raise ValueError("end date must not be earlier than start date")
    return {
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__LASTFOCUS": "",
        "VAM_Group": "",
        "__VIEWSTATE": hidden["__VIEWSTATE"],
        "VAM_JSE": "1",
        "__VIEWSTATEGENERATOR": hidden["__VIEWSTATEGENERATOR"],
        "__EVENTVALIDATION": hidden["__EVENTVALIDATION"],
        "obsSwitcher:ddlObsUnits": "usunits",
        "tbStation1": station,
        "tbStation2": "",
        "tbStation3": "",
        # Each date is sent twice: a display field and an ISO hidden field.
        "ucDateRangeFilter:dcStartDate:di": _us_date(start),
        "ucDateRangeFilter:dcStartDate:hfDate": start.isoformat(),
        "ucDateRangeFilter:dcEndDate:di": _us_date(end),
        "ucDateRangeFilter:dcEndDate:hfDate": end.isoformat(),
        # Plain text: requests form-encodes the space itself. A literal
        # "Get+Summary" would be re-encoded and reach the server with a "+".
        "btnSubmit": "Get Summary",
    }


def fetch_precip_summary(station: str, start: date, end: date, timeout: float = 30):
    """Download the precipitation summary table as a pandas DataFrame.

    Performs a GET to collect the hidden form fields, then a POST with the
    station and date range, and parses ``table#dgReports`` from the result.

    Raises:
        requests.HTTPError: if either request returns an error status.
        RuntimeError: if a hidden field or the result table is missing.
        ValueError: if *end* is earlier than *start*.
    """
    with requests.Session() as session:
        response = session.get(URL, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        hidden = {name: _hidden_value(soup, name) for name in HIDDEN_FIELDS}

        response = session.post(
            URL,
            data=build_payload(hidden, station, start, end),
            timeout=timeout,
        )
        response.raise_for_status()

    table = BeautifulSoup(response.content, "html.parser").find("table", id="dgReports")
    if table is None:
        raise RuntimeError("table#dgReports not found")
    return pd.read_html(StringIO(str(table)))[0]


if __name__ == "__main__":
    df = fetch_precip_summary("MD-BL-13", date(2019, 8, 1), date(2019, 8, 10))
    print(df)