I am stuck on the scraper portion of my project. I have kept troubleshooting errors, and my latest approach at least isn't crashing and burning. However, for whatever reason the response.meta I get back does not contain a Playwright page.
Hardware/setup:
The functionality I am after is rather simple: scrape results from Google. However, I need to automate this, preferably with a headless browser, and be able to pass in some user-defined parameters, including the URL and how many results to scrape before stopping.
Here is the main portion of my scraper, i.e. imports and spider definition:
from scrapy.crawler import CrawlerProcess
import scrapy


class GoogleSpider(scrapy.Spider):
    name = 'google_spider'
    allowed_domains = ['www.google.com']
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 3,
        'COOKIES_ENABLED': False,
        'PLAYWRIGHT_BROWSER_TYPE': 'chromium',
        'MIDDLEWARES': {
            'scrapy_playwright.middleware.PlaywrightMiddleware': 800,
        },
    }

    def __init__(self, domain, stop, user_agent, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.domain = domain
        self.stop = int(stop)
        self.custom_settings['USER_AGENT'] = user_agent
        self.start_urls = [f'https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3A{self.domain}%2F%2A+after%3A2023-03-27']
        self.urls_collected = []

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        return super().from_crawler(crawler, *args, **kwargs)

    def start_requests(self):
        yield scrapy.Request(self.start_urls[0], meta={"playwright": True,
                                                       "playwright_include_page": True})

    async def parse(self, response):
        print(f"\n\nRESPONSE STATUS: {response.status}, RESPONSE URL: {response.url}\n\n")
        print(f"RESPONSE META KEYS: {response.meta.keys()}\n\n")
        page = response.meta['page']
        current_urls_length = 0
        while True:
            locator = page.locator('.yuRUbf>a')
            urls = await locator.evaluate_all('nodes => nodes.map(n => n.href)')
            new_urls = [url for url in urls if self.domain in url and url not in self.urls_collected]
            self.urls_collected.extend(new_urls)
            if len(self.urls_collected) >= self.stop:
                self.urls_collected = self.urls_collected[:self.stop]
                break
            if len(urls) > current_urls_length:
                current_urls_length = len(urls)
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.waitForTimeout(1000)
            else:
                break

        self.logger.info(f'Collected {len(self.urls_collected)} URLs:')
        for url in self.urls_collected:
            self.logger.info(url)
And the latest execution file:
from scrapy.crawler import CrawlerProcess
from spiders.googlespider import GoogleSpider


def main(domain, stop, user_agent):
    process = CrawlerProcess()
    process.crawl(GoogleSpider, domain=domain, stop=stop, user_agent=user_agent)
    process.start()


if __name__ == '__main__':
    domain = 'jobs.lever.co'
    stop = 25
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    user_agent2 = "Opera/9.80 (Windows NT 5.1; U; MRA 5.5 (build 02842); ru) Presto/2.7.62 Version/11.00"
    user_agent3 = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)"
    main(domain=domain, stop=stop, user_agent=user_agent3)
And the logs:
2023-04-07 09:01:17 [scrapy.utils.log] INFO: Scrapy 2.8.0 started (bot: scrapybot)
2023-04-07 09:01:17 [scrapy.utils.log] INFO: Versions: lxml 4.9.2.0, libxml2 2.9.4, cssselect 1.2.0, parsel 1.7.0, w3lib 2.1.1, Twisted 22.10.0, Python 3.11.2 (v3.11.2:878ead1ac1, Feb 7 2023, 10:02:41) [Clang 13.0.0 (clang-1300.0.29.30)], pyOpenSSL 23.1.1 (OpenSSL 3.1.0 14 Mar 2023), cryptography 40.0.1, Platform macOS-12.6.4-x86_64-i386-64bit
2023-04-07 09:01:17 [scrapy.crawler] INFO: Overridden settings:
{'CONCURRENT_REQUESTS': 1, 'COOKIES_ENABLED': False, 'DOWNLOAD_DELAY': 3}
2023-04-07 09:01:17 [py.warnings] WARNING: /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/scrapy/utils/request.py:232: ScrapyDeprecationWarning: '2.6' is a deprecated value for the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting.
It is also the default value. In other words, it is normal to get this warning if you have not defined a value for the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting. This is so for backward compatibility reasons, but it will change in a future version of Scrapy.
See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
return cls(crawler)
2023-04-07 09:01:17 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-04-07 09:01:17 [scrapy.extensions.telnet] INFO: Telnet Password: f1350e3a3455ff22
2023-04-07 09:01:17 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats']
2023-04-07 09:01:18 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2023-04-07 09:01:18 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2023-04-07 09:01:18 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2023-04-07 09:01:18 [scrapy.core.engine] INFO: Spider opened
2023-04-07 09:01:18 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2023-04-07 09:01:18 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6024
2023-04-07 09:01:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3Ajobs.lever.co%2F%2A+after%3A2023-03-27> (referer: None)
RESPONSE STATUS: 200, RESPONSE URL: https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3Ajobs.lever.co%2F%2A+after%3A2023-03-27
RESPONSE META KEYS: dict_keys(['playwright', 'playwright_include_page', 'download_timeout', 'download_slot', 'download_latency'])
2023-04-07 09:01:18 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3Ajobs.lever.co%2F%2A+after%3A2023-03-27> (referer: None)
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/twisted/internet/defer.py", line 1697, in _inlineCallbacks
result = context.run(gen.send, result)
File "/Users/reesh/Projects/qj/app/gs/gs/spiders/googlespider.py", line 37, in parse
page = response.meta['page']
KeyError: 'page'
2023-04-07 09:01:19 [scrapy.core.engine] INFO: Closing spider (finished)
2023-04-07 09:01:19 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 507,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 17104,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.874591,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2023, 4, 7, 16, 1, 19, 103146),
'httpcompression/response_bytes': 53816,
'httpcompression/response_count': 1,
'log_count/DEBUG': 2,
'log_count/ERROR': 1,
'log_count/INFO': 10,
'log_count/WARNING': 1,
'memusage/max': 61571072,
'memusage/startup': 61571072,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'spider_exceptions/KeyError': 1,
'start_time': datetime.datetime(2023, 4, 7, 16, 1, 18, 228555)}
2023-04-07 09:01:19 [scrapy.core.engine] INFO: Spider closed (finished)
So response.meta is completely missing the "playwright_page" (or "page") entry, and that's where my spider stops working. I'm not sure anything after that line runs at all.
Truth be told, I am not married to scrapy-playwright; it was simply the first solution I found for handling Google's new-ish infinite-scroll interface. I truly don't mind going back to the drawing board and starting fresh, as long as my scraper works as intended.
Please weigh in, I am open to any and all suggestions!
What you are shown in the browser is not always the same as what you receive when using a headless browser.
When in doubt, it's a good idea to write the entire contents of the page to an HTML file and then inspect it with a code editor or your browser, so you can see exactly what page your response objects are actually receiving.
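For example, here is a minimal sketch of a parse callback that just dumps what the headless browser rendered; the filename is arbitrary, and it assumes the request was made with "playwright" and "playwright_include_page" in its meta:

async def parse(self, response):
    # Grab the Playwright page that scrapy-playwright attaches to the request meta
    page = response.meta["playwright_page"]
    # page.content() returns the rendered DOM, which may differ from response.text
    html = await page.content()
    with open("rendered_page.html", "w", encoding="utf-8") as f:
        f.write(html)
    await page.close()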
First, your custom settings need to be adjusted. scrapy-playwright works through download handlers rather than a middleware, so you need to install its http and https download handlers (and run Scrapy on the asyncio reactor) when using it, like so:
custom_settings = {
    "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    'CONCURRENT_REQUESTS': 1,
    'DOWNLOAD_DELAY': 3,
    'COOKIES_ENABLED': False,
    'PLAYWRIGHT_BROWSER_TYPE': 'chromium',
    "DOWNLOAD_HANDLERS": {
        "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    }
}
Also, the key you need to look up in response.meta is 'playwright_page', which should fix your issue with not receiving the page.
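In other words, the start of your parse callback would look something like this (closing the page when you are done with it is optional, but it avoids leaking browser pages):

async def parse(self, response):
    page = response.meta["playwright_page"]   # not response.meta["page"]
    # ... interact with the page here ...
    await page.close()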
Finally, if you follow my first piece of advice, you will see that your HTML selectors might not exist in the actual page you receive from the headless browser. In my case there is no infinite scroll; instead there is a "Next" link at the bottom of each page that needs to be clicked. The class selectors are also all different from the ones shown in the browser.
The example below worked for me; it might not work unchanged for you, but by using the process described above you should be able to get the results you are looking for.
import re
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector


class GoogleSpider(scrapy.Spider):
    name = 'google_spider'
    allowed_domains = ['www.google.com']
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        }
    }

    def __init__(self, domain, stop, user_agent, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.domain = domain
        self.stop = int(stop)
        self.custom_settings['USER_AGENT'] = user_agent
        self.start_urls = [f'https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3A{self.domain}%2F%2A+after%3A2023-03-27']
        self.urls_collected = []

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        return super().from_crawler(crawler, *args, **kwargs)

    def start_requests(self):
        yield scrapy.Request(self.start_urls[0], meta={"playwright": True,
                                                       "playwright_include_page": True})

    async def get_page_info(self, page):
        # scroll down a few screens so everything on the current page has rendered
        for i in range(10):
            val = page.viewport_size["height"]
            await page.mouse.wheel(0, val)
            await page.wait_for_timeout(1000)
        # parse the rendered DOM with scrapy's Selector instead of the raw response
        text = await page.content()
        selector = Selector(text=text)
        urls = []
        for row in selector.xpath("//div[contains(@class, 'kCrYT')]"):
            text = row.xpath(".//h3//text()").get()
            url = row.xpath(".//a/@href").get()
            if url:
                urls.append({text: url})
        print(urls)
        self.urls_collected += urls
        return urls

    async def parse(self, response):
        page = response.meta['playwright_page']
        urls = await self.get_page_info(page)
        found = True
        while found:
            try:
                # keep clicking the "Next" link until it can no longer be found
                element = page.get_by_text("Next")
                print(element, "parsing next page")
                await element.click()
                more_urls = await self.get_page_info(page)
                urls += more_urls
            except:
                found = False
        return urls
def main(domain, stop, user_agent):
    process = CrawlerProcess()
    process.crawl(GoogleSpider, domain=domain, stop=stop, user_agent=user_agent)
    process.start()


if __name__ == '__main__':
    domain = 'jobs.lever.co'
    stop = 25
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    user_agent2 = "Opera/9.80 (Windows NT 5.1; U; MRA 5.5 (build 02842); ru) Presto/2.7.62 Version/11.00"
    user_agent3 = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)"
    main(domain=domain, stop=stop, user_agent=user_agent3)