I am trying to scrape search results from the website: https://promedmail.org/promed-posts/
I have tried BeautifulSoup, MechanicalSoup and mechanize, but so far I have been unable to scrape the search results.
import re
from mechanize import Browser,urlopen
# Open the ProMED posts page with mechanize, fill in the 'full_search' form
# with the query 'US', submit it and read the raw response body.
browser = Browser()
browser.set_handle_robots(False)  # do not fetch/obey robots.txt
browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
browser.open("https://promedmail.org/promed-posts")

# Select the search form by its id attribute.
for form in browser.forms():
    # .get() avoids a KeyError on forms that carry no id attribute.
    if form.attrs.get('id') == 'full_search':
        browser.form = form
        break
else:
    # Fail with a clear message instead of an obscure error further down.
    raise LookupError("no form with id 'full_search' found on the page")

browser['search'] = 'US'
response = browser.submit()
# NOTE(review): the page appears to load its results through a separate
# request, so this body may not contain them — confirm in the browser's
# network tab.
content = response.read()
The content does not contain the search results for the query "US". Any idea what I am doing wrong here?
Since you mention bs4, you can mimic the POST request the page makes. Extract the JSON item that contains the HTML the page would have been updated with (it contains the results), parse that into a BeautifulSoup object, then reconstruct the results table as a DataFrame:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# Replay the POST request sent to admin-ajax.php for a search, parse the HTML
# fragment returned under the 'results' key of the JSON reply, and rebuild
# the list of posts as a DataFrame with Date / Title / Link columns.
headers = {'user-agent': 'Mozilla/5.0'}

# The form fields are sent as indexed name/value pairs.
data = {
    'action': 'get_promed_search_content',
    'query[0][name]': 'kwby1',
    'query[0][value]': 'summary',
    'query[1][name]': 'search',
    'query[1][value]': 'US',  # the search term
    'query[2][name]': 'date1',
    # 'query[2][value]' is deliberately not sent
    # (presumably "no date bound" — TODO confirm).
    'query[3][name]': 'date2',
    # 'query[3][value]' is deliberately not sent (see date1 above).
    'query[4][name]': 'feed_id',
    'query[4][value]': '1',
}

resp = requests.post(
    'https://promedmail.org/wp-admin/admin-ajax.php',
    headers=headers,
    data=data,
    timeout=30,  # never hang forever on an unresponsive server
)
resp.raise_for_status()  # surface HTTP errors before trying to decode JSON
r = resp.json()

soup = bs(r['results'], 'lxml')

# One row per <li>: the first text node after the tag (the date), the anchor
# text (the title) and a post URL built from the anchor's id attribute with
# the literal 'id' removed.
rows = [
    (
        i.find_next(text=True),
        i.a.text,
        f"https://promedmail.org/promed-post/?id={i.a['id'].replace('id', '')}",
    )
    for i in soup.select('li')
    if i.a is not None  # skip list items that carry no link
]
df = pd.DataFrame(rows, columns=['Date', 'Title', 'Link'])
print(df)