python, web-scraping, scrapy, google-crawlers, scrapy-playwright

Scrapy-playwright scraper does not return 'page' or 'playwright_page' in response's meta


I am stuck on the scraper portion of my project. I kept troubleshooting errors, and my latest approach is at least not crashing and burning. However, the response.meta I am getting is, for whatever reason, not returning a Playwright page.


The functionality I am after is rather simple: scrape results from Google. However, I need to automate this, preferably with a headless browser, and be able to pass in some user-defined parameters, including the URL and how many results to scrape before stopping.

Here is the main portion of my scraper, i.e. imports and spider definition:

from scrapy.crawler import CrawlerProcess
import scrapy


class GoogleSpider(scrapy.Spider):
    name = 'google_spider'
    allowed_domains = ['www.google.com']
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 3,
        'COOKIES_ENABLED': False,
        'PLAYWRIGHT_BROWSER_TYPE': 'chromium',
        'MIDDLEWARES': {
            'scrapy_playwright.middleware.PlaywrightMiddleware': 800,
        },
    }

    def __init__(self, domain, stop, user_agent, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.domain = domain
        self.stop = int(stop)
        self.custom_settings['USER_AGENT'] = user_agent
        self.start_urls = [f'https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3A{self.domain}%2F%2A+after%3A2023-03-27']
        self.urls_collected = []

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        return super().from_crawler(crawler, *args, **kwargs)


    def start_requests(self):
        yield scrapy.Request(self.start_urls[0], meta={"playwright": True,
                                                       "playwright_include_page": True})

    async def parse(self, response):
        print(f"\n\nRESPONSE STATUS: {response.status}, RESPONSE URL: {response.url}\n\n")
        print(f"RESPONSE META KEYS: {response.meta.keys()}\n\n")
        page = response.meta['page']
        current_urls_length = 0

        while True:
            locator = page.locator('.yuRUbf>a')
            urls = await locator.evaluate_all('nodes => nodes.map(n => n.href)')
            
            new_urls = [url for url in urls if self.domain in url and url not in self.urls_collected]

            self.urls_collected.extend(new_urls)

            if len(self.urls_collected) >= self.stop:
                self.urls_collected = self.urls_collected[:self.stop]
                break

            if len(urls) > current_urls_length:
                current_urls_length = len(urls)
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.waitForTimeout(1000)
            else:
                break

        self.logger.info(f'Collected {len(self.urls_collected)} URLs:')
        for url in self.urls_collected:
            self.logger.info(url)

And the latest execution file:

from scrapy.crawler import CrawlerProcess
from spiders.googlespider import GoogleSpider

def main(domain, stop, user_agent):
    process = CrawlerProcess()
    process.crawl(GoogleSpider, domain=domain, stop=stop, user_agent=user_agent)
    process.start()

if __name__ == '__main__':
    domain = 'jobs.lever.co'
    stop = 25
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    user_agent2 = "Opera/9.80 (Windows NT 5.1; U; MRA 5.5 (build 02842); ru) Presto/2.7.62 Version/11.00"
    user_agent3 = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)"
    main(domain=domain, stop=stop, user_agent=user_agent3)

And the logs:

2023-04-07 09:01:17 [scrapy.utils.log] INFO: Scrapy 2.8.0 started (bot: scrapybot)
2023-04-07 09:01:17 [scrapy.utils.log] INFO: Versions: lxml 4.9.2.0, libxml2 2.9.4, cssselect 1.2.0, parsel 1.7.0, w3lib 2.1.1, Twisted 22.10.0, Python 3.11.2 (v3.11.2:878ead1ac1, Feb  7 2023, 10:02:41) [Clang 13.0.0 (clang-1300.0.29.30)], pyOpenSSL 23.1.1 (OpenSSL 3.1.0 14 Mar 2023), cryptography 40.0.1, Platform macOS-12.6.4-x86_64-i386-64bit
2023-04-07 09:01:17 [scrapy.crawler] INFO: Overridden settings:
{'CONCURRENT_REQUESTS': 1, 'COOKIES_ENABLED': False, 'DOWNLOAD_DELAY': 3}
2023-04-07 09:01:17 [py.warnings] WARNING: /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/scrapy/utils/request.py:232: ScrapyDeprecationWarning: '2.6' is a deprecated value for the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting.

It is also the default value. In other words, it is normal to get this warning if you have not defined a value for the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting. This is so for backward compatibility reasons, but it will change in a future version of Scrapy.

See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-04-07 09:01:17 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-04-07 09:01:17 [scrapy.extensions.telnet] INFO: Telnet Password: f1350e3a3455ff22
2023-04-07 09:01:17 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2023-04-07 09:01:18 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2023-04-07 09:01:18 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2023-04-07 09:01:18 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2023-04-07 09:01:18 [scrapy.core.engine] INFO: Spider opened
2023-04-07 09:01:18 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2023-04-07 09:01:18 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6024
2023-04-07 09:01:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3Ajobs.lever.co%2F%2A+after%3A2023-03-27> (referer: None)


RESPONSE STATUS: 200, RESPONSE URL: https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3Ajobs.lever.co%2F%2A+after%3A2023-03-27


RESPONSE META KEYS: dict_keys(['playwright', 'playwright_include_page', 'download_timeout', 'download_slot', 'download_latency'])


2023-04-07 09:01:18 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3Ajobs.lever.co%2F%2A+after%3A2023-03-27> (referer: None)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/twisted/internet/defer.py", line 1697, in _inlineCallbacks
    result = context.run(gen.send, result)
  File "/Users/reesh/Projects/qj/app/gs/gs/spiders/googlespider.py", line 37, in parse
    page = response.meta['page']
KeyError: 'page'
2023-04-07 09:01:19 [scrapy.core.engine] INFO: Closing spider (finished)
2023-04-07 09:01:19 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 507,
 'downloader/request_count': 1,
 'downloader/request_method_count/GET': 1,
 'downloader/response_bytes': 17104,
 'downloader/response_count': 1,
 'downloader/response_status_count/200': 1,
 'elapsed_time_seconds': 0.874591,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 4, 7, 16, 1, 19, 103146),
 'httpcompression/response_bytes': 53816,
 'httpcompression/response_count': 1,
 'log_count/DEBUG': 2,
 'log_count/ERROR': 1,
 'log_count/INFO': 10,
 'log_count/WARNING': 1,
 'memusage/max': 61571072,
 'memusage/startup': 61571072,
 'response_received_count': 1,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'spider_exceptions/KeyError': 1,
 'start_time': datetime.datetime(2023, 4, 7, 16, 1, 18, 228555)}
2023-04-07 09:01:19 [scrapy.core.engine] INFO: Spider closed (finished)

So response.meta is completely missing the "playwright_page" or "page" entry, and that's where my spider stops working. In fact, I'm not sure anything after that line works.

Truth be told, I am not married to using scrapy-playwright; it was simply the first solution I found to handle Google's newish infinite-scroll interface. I truly don't mind going back to the drawing board and starting fresh, as long as my scraper works as intended.

Please weigh in; I am open to any and all suggestions!


Solution

  • What you are shown in the browser is not always the same as what you might receive when using a headless browser.

    When in doubt, it's a good idea to write the entire contents of the page to an HTML file and then inspect it with a code editor or your browser, so you can see exactly what the page you are actually receiving in your response objects contains.
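
    A minimal sketch of that idea inside your parse callback might look like this (the output file name here is just for illustration):

        async def parse(self, response):
            # Dump the raw HTML the headless browser actually returned, so it
            # can be opened in an editor or browser and compared with what
            # Google shows when you browse it normally.
            with open("google_response.html", "w", encoding="utf-8") as f:
                f.write(response.text)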

    The first thing is that your custom settings need to be adjusted.

    You need to register the http and https download handlers (along with the asyncio Twisted reactor) when using scrapy-playwright, like so:

        custom_settings = {
            "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
            'CONCURRENT_REQUESTS': 1,
            'DOWNLOAD_DELAY': 3,
            'COOKIES_ENABLED': False,
            'PLAYWRIGHT_BROWSER_TYPE': 'chromium',
            "DOWNLOAD_HANDLERS": {
                "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
                "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            }
        }
    

    Also, the key you need to look up in response.meta is 'playwright_page', which should fix your issue with not receiving the page.
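
    In other words, something along these lines (note that with 'playwright_include_page' set, scrapy-playwright leaves the page open and expects you to close it yourself when you are done):

        async def parse(self, response):
            # The Playwright page object is stored under 'playwright_page', not 'page'
            page = response.meta["playwright_page"]
            ...  # interact with the page here
            await page.close()  # with playwright_include_page you are responsible for closing it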

    Finally, if you follow my first piece of advice, you will see that your HTML selectors might not exist in the actual page you receive from the headless browser. In my case there is no infinite scroll; instead there is a "Next" link at the bottom of each page that needs to be clicked. Also, the class selectors are all different from the ones shown in the browser.

    The example below worked for me, but it might not work the same for you; however, using the process described above, you should be able to get the results you are looking for.

    import scrapy
    from scrapy.crawler import CrawlerProcess
    from scrapy.selector import Selector
    
    class GoogleSpider(scrapy.Spider):
        name = 'google_spider'
        allowed_domains = ['www.google.com']
        custom_settings = {
            "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
            "DOWNLOAD_HANDLERS": {
                "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
                "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            }
        }
    
        def __init__(self, domain, stop, user_agent, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.domain = domain
            self.stop = int(stop)
            self.custom_settings['USER_AGENT'] = user_agent
            self.start_urls = [f'https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3A{self.domain}%2F%2A+after%3A2023-03-27']
            self.urls_collected = []
    
        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            return super().from_crawler(crawler, *args, **kwargs)
    
    
        def start_requests(self):
            yield scrapy.Request(self.start_urls[0], meta={"playwright": True,
                                                           "playwright_include_page": True})
    
        async def get_page_info(self, page):
            for i in range(10):
                val = page.viewport_size["height"]
                await page.mouse.wheel(0, val)
                await page.wait_for_timeout(1000)
            text = await page.content()
            selector = Selector(text=text)
            urls = []
            for row in selector.xpath("//div[contains(@class, 'kCrYT')]"):
                text = row.xpath(".//h3//text()").get()
                url = row.xpath(".//a/@href").get()
                if url:
                    urls.append({text: url})
                    print(urls)
            self.urls_collected += urls
            return urls 
        
        
        async def parse(self, response):
            page = response.meta['playwright_page']
            urls = await self.get_page_info(page)       
            found = True
            while found:
                try:
                    element = page.get_by_text("Next")
                    print(element, "parsing next page")
                    await element.click()
                    more_urls = await self.get_page_info(page)
                    urls += more_urls
                except Exception:
                    found = False
            return urls
     
    
    def main(domain, stop, user_agent):
        process = CrawlerProcess()
        process.crawl(GoogleSpider, domain=domain, stop=stop, user_agent=user_agent)
        process.start()
    
    if __name__ == '__main__':
        domain = 'jobs.lever.co'
        stop = 25
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        user_agent2 = "Opera/9.80 (Windows NT 5.1; U; MRA 5.5 (build 02842); ru) Presto/2.7.62 Version/11.00"
        user_agent3 = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)"
        main(domain=domain, stop=stop, user_agent=user_agent3)