I've finally figured out that BS4 no longer uses "markup massaging" as was the case in BS3. But I still need an analogous way to dispose of the unwanted document.write calls. You would do the following in BS3, but how do you do it in BS4?
# Javascript code in this page generates HTML markup
# that isn't parsed correctly by BeautifulSoup.
# To avoid this problem, all document.write fragments are removed
my_massage = copy(BeautifulSoup.MARKUP_MASSAGE)
my_massage.extend([
    # Replace each document.write match with an empty string
    (re.compile(u"document.write(.+);"), lambda match: ""),
    # Replace everything from alt=" through the matched "> with a bare >
    (re.compile(u'alt=".+">'), lambda match: ">"),
])
Also, since the BS4 BeautifulSoup constructor no longer supports the markupMassage argument, where in my program should I take care of the document.write problem? I'm assuming that is the problem because I'm just trying to print out the table markup, and I'm getting a thread exception when I run windmill.
This is what my code looks like:
#!/usr/bin/env python
# Generated by the windmill services transformer
#from windmill.authoring import WindmillTestClient
from bs4 import BeautifulSoup
import re, urlparse
from copy import copy
from windmill.authoring import setup_module, WindmillTestClient
from windmill.conf import global_settings
import sys
# NOTE(review): the flag set here is START_CHROME, but the original inline
# comment said "This makes it use Firefox" - confirm which browser is intended.
global_settings.START_CHROME = True
# Hand this module object to windmill's setup_module.
setup_module(sys.modules[__name__])
def get_table_info(client):
    """
    Wait for the trades table, then return the "#trades" selection.

    The page text is fetched through ``client``, parsed with BeautifulSoup,
    and the result of the "#trades" CSS selection is returned.
    """
    # Block until the table with id "trades" is present (timeout value: 40000)
    client.waits.forElement(timeout=40000,
                            xpath=u"//table[@id='trades']")
    page = client.commands.getPageText()
    # Both the status flag and the page text must be truthy to continue
    assert page['status']
    assert page['result']
    # Parse the page text and select by element id
    return BeautifulSoup(page['result']).select("#trades")
def test_scrape():
    """
    Open the trader page in a Windmill client and print the table info.

    The value printed is whatever get_table_info() returns for that page.
    """
    # Open the trader page
    client = WindmillTestClient(__name__)
    client.open(url='http://www.zulutrade.com/trader/128391')
    table_info = get_table_info(client)
    # Single-argument parenthesized form prints the same text whether
    # print is a statement (Python 2) or a function (Python 3)
    print(table_info)
# NOTE(review): called unconditionally at module level, so the scrape runs
# whenever this module is executed or imported - confirm this is intended.
test_scrape()
You do not need to tell BeautifulSoup how to massage the markup — you can modify it yourself before feeding it to the BeautifulSoup
constructor:
# Clean the raw page text first, then hand the result to BeautifulSoup.
html = response['result']
for pattern, replacement in (
    (r'document.write(.+);', ''),  # remove document.write fragments
    (r'alt=".+">', '>'),           # collapse alt="...\"> down to a bare >
):
    html = re.sub(pattern, replacement, html)
soup = BeautifulSoup(html)