I am trying to learn dynamic web scraping with PyQt5. The tutorials I was following were written for PyQt4, so some of the libraries are different in Qt5.
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEnginePage
import bs4 as bs
import urllib.request
# Question's original attempt, kept exactly as posted (the indentation was lost
# in this paste) because the traceback further down refers to it.
class Client(QWebEnginePage):
# NOTE(review): the method is spelled `_init_` with single underscores, so it
# is not the constructor. Client(url) therefore hands `url` directly to
# QWebEnginePage, which is consistent with the TypeError reported below.
def _init_(self, url):
self.app=QApplication(sys.argv)
# NOTE(review): QWebPage is not among the imports shown above; presumably a
# leftover from the PyQt4 tutorial this was adapted from.
QWebPage._init_(self)
# Quit the event loop once the page reports that loading has finished.
self.loadFinished.connect(self.on_page_load)
# NOTE(review): the answer below points out that QWebEnginePage has no
# mainFrame() method.
self.mainFrame().load(QUrl(url))
self.app.exec_()
def on_page_load(self):
self.app.quit()
# Driver for the question's Client class, kept exactly as posted.
url='https://pythonprogramming.net/parsememcparseface/'
# This is the call that raises the TypeError shown in the traceback below.
client_response=Client(url)
# NOTE(review): mainFrame() is not available on QWebEnginePage (see the
# answer below).
source=client_response.mainFrame().toHtml()
#sauce=urllib.request.urlopen('https://pythonprogramming.net/sitemap.xml').read()
# NOTE(review): `sauce` is only assigned in the commented-out line above, while
# the page HTML is stored in `source`; as written `sauce` is undefined here.
soup=bs.BeautifulSoup(sauce,'xml')
js_test=soup.find('p',class_='jstest')
print(js_test.text)
The following error is shown:
Traceback (most recent call last):
File "jsp.py", line 19, in <module>
client_response=Client(url)
TypeError: arguments did not match any overloaded call:
QWebEnginePage(parent: QObject = None): argument 1 has unexpected type 'str'
QWebEnginePage(QWebEngineProfile, parent: QObject = None): argument 1 has unexpected type 'str'
Someone help me!
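Your code has several errors:
- The constructor must be named `__init__` (double underscores). With `_init_`, Python never calls your method, so `Client(url)` passes the URL string straight to `QWebEnginePage`, which produces the TypeError you see.
- `QWebEnginePage` does not have a `mainFrame()` method; you now have to load the URL directly on the page.
- The `toHtml()` function is no longer synchronous, so it asks for a callback to receive the HTML; with my modification it behaves synchronously again.

Code: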
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl, pyqtSignal, QEventLoop
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class Client(QWebEnginePage):
    """Load a URL in a QWebEnginePage and expose its HTML synchronously.

    Constructing the object starts the Qt event loop and returns only after
    the page has emitted ``loadFinished``. ``get_html()`` then wraps the
    asynchronous ``toHtml()`` call in a local event loop so the caller gets
    the markup back as a plain return value.

    Attributes:
        app: The QApplication used to run the event loop.
        load_ok: Success flag reported by ``loadFinished`` (False until then).
        html: Last HTML delivered by ``toHtml()``; empty string until then.
    """

    # Emitted by store_html() once the toHtml() callback has delivered the HTML.
    toHtmlFinished = pyqtSignal()

    def __init__(self, url):
        """Start loading ``url`` and block until loading has finished.

        Args:
            url: Address of the page to load, as a string.
        """
        # Reuse an existing application object if the caller already made one;
        # only a single QApplication may exist per process.
        app = QApplication.instance() or QApplication(sys.argv)
        super().__init__()
        self.app = app
        # Define the result attributes up front so they can always be read.
        self.html = ""
        self.load_ok = False
        self.loadFinished.connect(self.on_page_load)
        self.load(QUrl(url))
        self.app.exec_()

    def on_page_load(self, ok=True):
        """Record the load result and leave the event loop started in __init__.

        Args:
            ok: Success flag passed by the ``loadFinished`` signal.
        """
        self.load_ok = bool(ok)
        self.app.quit()

    def store_html(self, html):
        """Callback for ``toHtml()``: keep the HTML and signal completion."""
        self.html = html
        self.toHtmlFinished.emit()

    def get_html(self):
        """Return the page's HTML, waiting for the asynchronous callback.

        Returns:
            The HTML string handed to ``store_html`` by ``toHtml()``.
        """
        loop = QEventLoop()
        # Connect before issuing the request so the completion signal cannot
        # be missed.
        self.toHtmlFinished.connect(loop.quit)
        self.toHtml(self.store_html)
        loop.exec_()
        return self.html
# Page to fetch.
url = 'https://pythonprogramming.net/parsememcparseface/'

# Building the client runs the event loop until the page has finished loading.
client_response = Client(url)

# Retrieve the page's HTML as a plain string and print it.
source = client_response.get_html()
print(source)
References: