I'm trying to scrape a page where I want to wait until a string has been detected in a script
element before returning the page's HTML.
Here's my MRE scraper:
from scrapy import Request, Spider
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageMethod
class FlashscoreSpider(Spider):
    """Minimal reproducible example: asks Playwright to wait for a <script>
    element containing the match id 'WKM03Vff' before handing the response
    to parse().

    NOTE(review): the wait_for_selector call below times out — presumably
    because contains(text(), ...) only tests the first text node of each
    <script> while the id sits in a later node; confirm against the page.
    """

    name = "flashscore"
    custom_settings = {
        # Route both http and https downloads through scrapy-playwright.
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        # Asyncio-based reactor, as used alongside the scrapy-playwright handler.
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7",
    }

    def start_requests(self):
        """Yield the single match-summary request, rendered via Playwright."""
        yield Request(
            url="https://www.flashscore.com/match/WKM03Vff/#/match-summary/match-summary",
            meta=dict(
                dont_redirect=True,
                playwright=True,
                playwright_page_methods=[
                    # Wait up to 5 s for a <script> whose text contains the
                    # match id; this is the call that raises TimeoutError.
                    PageMethod(
                        method="wait_for_selector",
                        selector="//script[contains(text(), 'WKM03Vff')]",
                        timeout=5000,
                    ),
                ],
            ),
            callback=self.parse,
        )

    def parse(self, response):
        """Callback: only reached if the selector wait succeeds."""
        print("I've loaded the page ready to parse!!!")
if __name__ == "__main__":
    # Run the spider standalone, outside a Scrapy project.
    crawler_process = CrawlerProcess()
    crawler_process.crawl(FlashscoreSpider)
    crawler_process.start()
This results in the following error:
playwright._impl._api_types.TimeoutError: Timeout 5000ms exceeded.
My understanding is that this is because a script element can contain multiple text nodes,
and the XPath's contains(text(), ...) only checks the first one. As the string I'm looking
for sits in a later text node, the selector never matches and I get the TimeoutError.
This answer gives a neat solution; however, Playwright's selector engine doesn't support XPath 2.0, so when I use:
"string-join(//script/text()[normalize-space()], ' ')"
I get the following error:
playwright._impl._api_types.Error: Unexpected token "string-join(" while parsing selector "string-join(//script/text()[normalize-space()], ' ')"
There is an alternative given in the comments to the answer but my worry there is a changing number of text nodes.
From some fairly intensive googling I don't think there is a robust XPath solution. However, is there a CSS equivalent? I've tried:
"script:has-text('WKM03Vff')"
However, that results in a TimeoutError exception again.
As I mentioned in the comments, script tags typically do not require waiting for any amount of time because they do not require being rendered.
You should simply be able to immediately access their contents from within the parse method.
For example:
from scrapy import Request, Spider
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageMethod
class FlashscoreSpider(Spider):
    """Fetch the match page via Playwright and read the <script> contents
    directly inside parse() — no selector wait is needed, since script tags
    are present in the raw HTML without rendering.
    """

    name = "flashscore"
    custom_settings = {
        "ROBOTSTXT_OBEY": False,
        # Send both schemes through scrapy-playwright's download handler.
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7",
    }

    def start_requests(self):
        """Yield the single match-summary request, handled by Playwright."""
        request_meta = {
            "dont_redirect": True,
            "playwright": True,
            "playwright_include_page": True,
        }
        yield Request(
            url="https://www.flashscore.com/match/WKM03Vff/#/match-summary/match-summary",
            meta=request_meta,
            callback=self.parse,
        )

    def parse(self, response):
        """Extract the <script> element containing the match id with plain XPath."""
        print(response.xpath("//script[contains(text(), 'WKM03Vff')]"))
        print(response.xpath("//script[contains(text(), 'WKM03Vff')]/text()").get())
        print("I've loaded the page ready to parse!!!")
if __name__ == "__main__":
    # Run the spider standalone, outside a Scrapy project.
    crawler_process = CrawlerProcess()
    crawler_process.crawl(FlashscoreSpider)
    crawler_process.start()
2023-09-13 00:07:02 [scrapy-playwright] DEBUG: [Context=default] Request: <GET
https://cdn.cookielaw.org/scripttemplates/202210.1.0/assets/otCommonStyles.css>
(resource type: fetch, referrer: https://www.flashscore.com/)
[<Selector query="//script[contains(text(), 'WKM03Vff')]"
data='<script>\n\t\t\twindow.environment = {"ev...'>]
window.environment = {"event_id_c":"WKM03Vff",
"eventStageTranslations":{"1":" ","45":"To finish","42":"Awaiting
updates","2":"Live","17": "Set 1","18":"Set 2","19":"Set 3","20":"Set
4","21":"Set 5","47":"Set 1 - Tiebreak","48":"Set 2 - Tiebreak","49":"Set 3 -
Tiebreak","50":"Set 4 - Tiebreak","51":"Set 5 - Tiebreak","46":"Break
Time","3":"Finished",....p10:100","port":443,"sslEnabled":true,"namespace":"\/f
s\/fs3_","projectId":2,"enabled":false},"project_id":2};
I've loaded the page ready to parse!!!
2023-09-13 00:07:02 [scrapy.core.engine] INFO: Closing spider (finished)