python, web-scraping, logging, scrapy, web-crawler

Add the spider's name to each line of the log


I am looking for a way to prefix each log line produced by Scrapy with the name of the spider that generated it. Until now, I was launching each spider synchronously in a loop, so it was easy to track which spider generated which log. But I have recently refactored my code so that it either accepts a list of spiders as an argument or launches them all at once through CrawlerProcess(). The result is that they run asynchronously, so the logs are all mixed up.

I have thought about adding something like [%(name)s] to the LOG_FORMAT setting, but the name produced is that of the module that emitted the message (scrapy.core.engine, scrapy.utils.log, etc.), not the spider's name.
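
For illustration, here is roughly what I tried (this is essentially Scrapy's default LOG_FORMAT, which is why the module name already appears in brackets):

    # settings.py
    # %(name)s is the stdlib logging placeholder for the logger's name, which
    # Scrapy sets to the emitting module, not to the spider
    LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'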

I also tried creating an extension that modifies the crawler's settings by retrieving spider.name and adding it to the LOG_FORMAT constant, but as far as I can tell, changing the settings while the crawler is running has no effect (and I haven't found a clean way of doing it anyway, since they are immutable at that point).

Any help would be greatly appreciated! Thank you.


Solution

  • You need to create a custom log formatter and set it as the project's log formatter.

    Basically, you extend Scrapy's LogFormatter class and prepend the spider's name to each message template.

    main2.py:

    from scrapy import logformatter
    import logging
    import os
    from twisted.python.failure import Failure
    from scrapy.utils.request import referer_str
    
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    
    
    SCRAPEDMSG = "Scraped from %(src)s" + os.linesep + "%(item)s"
    # DROPPEDMSG = "Dropped: %(exception)s" + os.linesep + "%(item)s"
    CRAWLEDMSG = "Crawled (%(status)s) %(request)s%(request_flags)s (referer: %(referer)s)%(response_flags)s"
    # ITEMERRORMSG = "Error processing %(item)s"
    # SPIDERERRORMSG = "Spider error processing %(request)s (referer: %(referer)s)"
    # DOWNLOADERRORMSG_SHORT = "Error downloading %(request)s"
    # DOWNLOADERRORMSG_LONG = "Error downloading %(request)s: %(errmsg)s"
    
    
    class ExampleLogFormatter(logformatter.LogFormatter):
        def crawled(self, request, response, spider):
            request_flags = f' {request.flags}' if request.flags else ''
            response_flags = f' {response.flags}' if response.flags else ''
            return {
                'level': logging.DEBUG,
                'msg': f'{spider.name} {CRAWLEDMSG}',
                'args': {
                    'status': response.status,
                    'request': request,
                    'request_flags': request_flags,
                    'referer': referer_str(request),
                    'response_flags': response_flags,
                    # backward compatibility with Scrapy log formatters older than version 1.4
                    'flags': response_flags
                }
            }
    
        def scraped(self, item, response, spider):
            if isinstance(response, Failure):
                src = response.getErrorMessage()
            else:
                src = response
            return {
                'level': logging.DEBUG,
                'msg': f'{spider.name} {SCRAPEDMSG}',
                'args': {
                    'src': src,
                    'item': item,
                }
            }
    
    
    if __name__ == "__main__":
        spider = 'example_spider'
        settings = get_project_settings()
        settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        settings['LOG_FORMATTER'] = 'tempbuffer.main2.ExampleLogFormatter'
        process = CrawlerProcess(settings)
        process.crawl(spider)
        process.start()
    

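    Alternatively, instead of assigning it in code as above, the formatter can be enabled once in the project's settings.py (the dotted path assumes the project module is named tempbuffer, as in this example):

    LOG_FORMATTER = 'tempbuffer.main2.ExampleLogFormatter'
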
    spider.py:

    import scrapy
    
    
    class ExampleSpider(scrapy.Spider):
        name = 'example_spider'
        allowed_domains = ['scrapingclub.com']
        start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    
        def parse(self, response):
            item = dict()
            item['title'] = response.xpath('//h3/text()').get()
            item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
            yield item
    

    Output:

    [scrapy.core.engine] DEBUG: example_spider Crawled (200) <GET https://scrapingclub.com/exercise/detail_basic/> (referer: None)
    [scrapy.core.scraper] DEBUG: example_spider Scraped from <200 https://scrapingclub.com/exercise/detail_basic/>
    {'title': 'Long-sleeved Jersey Top', 'price': '$12.99'}
    
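    The remaining LogFormatter hooks follow the same pattern; the commented-out constants at the top of main2.py are their default message templates. For example, a sketch of a dropped() override (the signature matches Scrapy's LogFormatter.dropped) that could be added to ExampleLogFormatter:

    DROPPEDMSG = "Dropped: %(exception)s" + os.linesep + "%(item)s"


    class ExampleLogFormatter(logformatter.LogFormatter):
        # ... crawled() and scraped() as above ...

        def dropped(self, item, exception, response, spider):
            # WARNING is the level Scrapy itself uses for dropped items
            return {
                'level': logging.WARNING,
                'msg': f'{spider.name} {DROPPEDMSG}',
                'args': {
                    'exception': exception,
                    'item': item,
                }
            }
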

    Update:

    A custom LogFormatter only affects the messages it handles (crawled, scraped, dropped, and so on), not every log line. A working alternative that does not rely on a project-global formatter is to configure the logging format from within the spider class itself:

    import logging
    import scrapy
    from scrapy.utils.log import configure_logging
    
    
    class ExampleSpider(scrapy.Spider):
        name = 'example_spider'
        allowed_domains = ['scrapingclub.com']
        start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    
        # runs at class-definition time: "name" is the class attribute above,
        # and basicConfig() configures the root logger for the whole process
        configure_logging(install_root_handler=False)
        logging.basicConfig(level=logging.DEBUG, format=name + ': %(levelname)s: %(message)s')
    
        def parse(self, response):
            item = dict()
            item['title'] = response.xpath('//h3/text()').get()
            item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
            yield item
    

    Update 2: Finally, a fully working solution, one that keeps the prefixes correct even when several spiders run in the same process.

    main2.py:

    import logging
    import scrapy
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    
    
    # create a logging filter that copies the spider's name onto each record
    class ContentFilter(logging.Filter):
        def filter(self, record):
            record.spider_name = ''
            # Scrapy attaches the spider to many of its records via "extra"
            if hasattr(record, 'spider'):
                record.spider_name = record.spider.name

            return True
    
            # record.spider.name was enough for my tests, but maybe you'll need this:
            # record.spider_name = ''
            # if hasattr(record, 'crawler'):
            #     record.spider_name = record.crawler.spidercls.name
            # elif hasattr(record, 'spider'):
            #     record.spider_name = record.spider.name
            # return True
    
    
    # Extend scrapy.Spider to install the new format and filter on the root handlers
    class Spider(scrapy.Spider):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # the new format with "spider_name" variable:
            formatter = logging.Formatter('[%(spider_name)s]: %(levelname)s: %(message)s')
    
            # add the new format and filter to all the handlers
            for handler in logging.root.handlers:
                handler.formatter = formatter
                handler.addFilter(ContentFilter())
    
    
    if __name__ == "__main__":
        spider1 = 'example_spider'
        spider2 = 'example_spider2'
        settings = get_project_settings()
        settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    
        process = CrawlerProcess(settings)
        process.crawl(spider1)
        process.crawl(spider2)
        process.start()
    

    spider.py:

    from tempbuffer.main2 import Spider
    
    
    # use the extended "Spider" class
    class ExampleSpider(Spider):
        name = 'example_spider'
        allowed_domains = ['scrapingclub.com']
        start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    
        def parse(self, response):
            item = dict()
            item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
            yield item
    
    
    # use the extended "Spider" class
    class ExampleSpider2(Spider):
        name = 'example_spider2'
        allowed_domains = ['scrapingclub.com']
        start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    
        def parse(self, response):
            item = dict()
            item['title'] = response.xpath('//h3/text()').get()
            yield item
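
    This works because Scrapy attaches the spider object to its own log records through the logging "extra" mechanism, and self.logger inside a spider is a LoggerAdapter that does the same. The prefix therefore also applies to your own log calls; a minimal sketch (the third spider and its message are illustrative):

    from tempbuffer.main2 import Spider


    class ExampleSpider3(Spider):
        name = 'example_spider3'
        allowed_domains = ['scrapingclub.com']
        start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

        def parse(self, response):
            # self.logger carries {'spider': self}, so ContentFilter fills in
            # %(spider_name)s for this record as well
            self.logger.debug('parsing %s', response.url)
            yield {'title': response.xpath('//h3/text()').get()}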