python, web-scraping, logging, scrapy, web-crawler

Add the spider's name to each line of the log


I am looking for a way to prefix each log line produced by Scrapy with the name of the spider that generated it. Until now, I was launching each spider synchronously in a loop, so it was easy to track which spider generated which log. But I have recently refactored my code so that it either accepts a list of spiders as an argument or launches them all at once through CrawlerProcess(). The result is that they run asynchronously, so the logs are all mixed up.

I have thought about adding something like [%(name)s] to the LOG_FORMAT setting, but the name produced is that of the module that emitted the message (scrapy.core.engine, scrapy.utils.log, etc.), not the spider's name.
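
For illustration, here is roughly what I tried (this is essentially Scrapy's default LOG_FORMAT, which is why the module name already appears in brackets):

    # settings.py
    # %(name)s is the stdlib logging placeholder for the logger's name, which
    # Scrapy sets to the emitting module, not to the spider
    LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'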

I also tried creating an extension that modifies the crawler's settings by retrieving spider.name and adding it to the LOG_FORMAT constant, but as far as I can tell, changing the settings while the crawler is running has no effect (and I haven't found a clean way of doing it anyway, since they are immutable at that point).

Any help would be greatly appreciated! Thank you.


Solution

  • You need to create a custom log formatter and set it as the project's log formatter.

    Basically, you extend Scrapy's LogFormatter class and prepend the spider's name to each message template.

    main2.py:

    from scrapy import logformatter
    import logging
    import os
    from twisted.python.failure import Failure
    from scrapy.utils.request import referer_str
    
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    
    
    SCRAPEDMSG = "Scraped from %(src)s" + os.linesep + "%(item)s"
    # DROPPEDMSG = "Dropped: %(exception)s" + os.linesep + "%(item)s"
    CRAWLEDMSG = "Crawled (%(status)s) %(request)s%(request_flags)s (referer: %(referer)s)%(response_flags)s"
    # ITEMERRORMSG = "Error processing %(item)s"
    # SPIDERERRORMSG = "Spider error processing %(request)s (referer: %(referer)s)"
    # DOWNLOADERRORMSG_SHORT = "Error downloading %(request)s"
    # DOWNLOADERRORMSG_LONG = "Error downloading %(request)s: %(errmsg)s"
    
    
    class ExampleLogFormatter(logformatter.LogFormatter):
        def crawled(self, request, response, spider):
            request_flags = f' {request.flags}' if request.flags else ''
            response_flags = f' {response.flags}' if response.flags else ''
            return {
                'level': logging.DEBUG,
                'msg': f'{spider.name} {CRAWLEDMSG}',
                'args': {
                    'status': response.status,
                    'request': request,
                    'request_flags': request_flags,
                    'referer': referer_str(request),
                    'response_flags': response_flags,
                    # backward compatibility with Scrapy log formatters older than version 1.4
                    'flags': response_flags
                }
            }
    
        def scraped(self, item, response, spider):
            if isinstance(response, Failure):
                src = response.getErrorMessage()
            else:
                src = response
            return {
                'level': logging.DEBUG,
                'msg': f'{spider.name} {SCRAPEDMSG}',
                'args': {
                    'src': src,
                    'item': item,
                }
            }
    
    
    if __name__ == "__main__":
        spider = 'example_spider'
        settings = get_project_settings()
        settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        settings['LOG_FORMATTER'] = 'tempbuffer.main2.ExampleLogFormatter'
        process = CrawlerProcess(settings)
        process.crawl(spider)
        process.start()
    

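    Alternatively, instead of assigning it in code as above, the formatter can be enabled once in the project's settings.py (the dotted path assumes the project module is named tempbuffer, as in this example):

    LOG_FORMATTER = 'tempbuffer.main2.ExampleLogFormatter'
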
    spider.py:

    import scrapy
    
    
    class ExampleSpider(scrapy.Spider):
        name = 'example_spider'
        allowed_domains = ['scrapingclub.com']
        start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    
        def parse(self, response):
            item = dict()
            item['title'] = response.xpath('//h3/text()').get()
            item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
            yield item
    

    Output:

    [scrapy.core.engine] DEBUG: example_spider Crawled (200) <GET https://scrapingclub.com/exercise/detail_basic/> (referer: None)
    [scrapy.core.scraper] DEBUG: example_spider Scraped from <200 https://scrapingclub.com/exercise/detail_basic/>
    {'title': 'Long-sleeved Jersey Top', 'price': '$12.99'}
    
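    The remaining LogFormatter hooks follow the same pattern; the commented-out constants at the top of main2.py are their default message templates. For example, a sketch of a dropped() override (the signature matches Scrapy's LogFormatter.dropped) that could be added to ExampleLogFormatter:

    DROPPEDMSG = "Dropped: %(exception)s" + os.linesep + "%(item)s"


    class ExampleLogFormatter(logformatter.LogFormatter):
        # ... crawled() and scraped() as above ...

        def dropped(self, item, exception, response, spider):
            # WARNING is the level Scrapy itself uses for dropped items
            return {
                'level': logging.WARNING,
                'msg': f'{spider.name} {DROPPEDMSG}',
                'args': {
                    'exception': exception,
                    'item': item,
                }
            }
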

    Update:

    A custom LogFormatter only affects the messages it handles (crawled, scraped, dropped, and so on), not every log line. A working alternative that does not rely on a project-global formatter is to configure the logging format from within the spider class itself:

    import logging
    import scrapy
    from scrapy.utils.log import configure_logging
    
    
    class ExampleSpider(scrapy.Spider):
        name = 'example_spider'
        allowed_domains = ['scrapingclub.com']
        start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    
        # runs at class-definition time: "name" is the class attribute above,
        # and basicConfig() configures the root logger for the whole process
        configure_logging(install_root_handler=False)
        logging.basicConfig(level=logging.DEBUG, format=name + ': %(levelname)s: %(message)s')
    
        def parse(self, response):
            item = dict()
            item['title'] = response.xpath('//h3/text()').get()
            item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
            yield item
    

    Update 2: Finally, a fully working solution, one that keeps the prefixes correct even when several spiders run in the same process.

    main2.py:

    import logging
    import scrapy
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    
    
    # create a logging filter that copies the spider's name onto each record
    class ContentFilter(logging.Filter):
        def filter(self, record):
            record.spider_name = ''
            # Scrapy attaches the spider to many of its records via "extra"
            if hasattr(record, 'spider'):
                record.spider_name = record.spider.name

            return True
    
            # record.spider.name was enough for my tests, but maybe you'll need this:
            # record.spider_name = ''
            # if hasattr(record, 'crawler'):
            #     record.spider_name = record.crawler.spidercls.name
            # elif hasattr(record, 'spider'):
            #     record.spider_name = record.spider.name
            # return True
    
    
    # Extend scrapy.Spider to install the new format and filter on the root handlers
    class Spider(scrapy.Spider):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # the new format with "spider_name" variable:
            formatter = logging.Formatter('[%(spider_name)s]: %(levelname)s: %(message)s')
    
            # add the new format and filter to all the handlers
            for handler in logging.root.handlers:
                handler.formatter = formatter
                handler.addFilter(ContentFilter())
    
    
    if __name__ == "__main__":
        spider1 = 'example_spider'
        spider2 = 'example_spider2'
        settings = get_project_settings()
        settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    
        process = CrawlerProcess(settings)
        process.crawl(spider1)
        process.crawl(spider2)
        process.start()
    

    spider.py:

    from tempbuffer.main2 import Spider
    
    
    # use the extended "Spider" class
    class ExampleSpider(Spider):
        name = 'example_spider'
        allowed_domains = ['scrapingclub.com']
        start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    
        def parse(self, response):
            item = dict()
            item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
            yield item
    
    
    # use the extended "Spider" class
    class ExampleSpider2(Spider):
        name = 'example_spider2'
        allowed_domains = ['scrapingclub.com']
        start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    
        def parse(self, response):
            item = dict()
            item['title'] = response.xpath('//h3/text()').get()
            yield item
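
    This works because Scrapy attaches the spider object to its own log records through the logging "extra" mechanism, and self.logger inside a spider is a LoggerAdapter that does the same. The prefix therefore also applies to your own log calls; a minimal sketch (the third spider and its message are illustrative):

    from tempbuffer.main2 import Spider


    class ExampleSpider3(Spider):
        name = 'example_spider3'
        allowed_domains = ['scrapingclub.com']
        start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

        def parse(self, response):
            # self.logger carries {'spider': self}, so ContentFilter fills in
            # %(spider_name)s for this record as well
            self.logger.debug('parsing %s', response.url)
            yield {'title': response.xpath('//h3/text()').get()}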