python-3.x, web-scraping, scrapy, scrapinghub

scrapy passing custom_settings to spider from script using CrawlerProcess.crawl()


I am trying to programmatically call a spider from a script. I am unable to override the settings through the constructor when using CrawlerProcess. Let me illustrate this with the default spider for scraping quotes from the official Scrapy site (last code snippet at the official Scrapy quotes example spider).

from scrapy import Spider, Request


class QuotesSpider(Spider):

    name = "quotes"

    def __init__(self, somestring, *args, **kwargs):
        super(QuotesSpider, self).__init__(*args, **kwargs)
        self.somestring = somestring
        self.custom_settings = kwargs


    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

Here is the script through which I try to run the quotes spider

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings

def main():

    proc = CrawlerProcess(get_project_settings())

    custom_settings_spider = \
    {
        'FEED_URI': 'quotes.csv',
        'LOG_FILE': 'quotes.log'
    }
    proc.crawl('quotes', 'dummyinput', **custom_settings_spider)
    proc.start()

Solution

  • Scrapy's Settings object behaves much like a Python dict, so you can update it before passing it to CrawlerProcess:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    from scrapy.settings import Settings
    
    def main():
    
        s = get_project_settings()
        s.update({
            'FEED_URI': 'quotes.csv',
            'LOG_FILE': 'quotes.log'
        })
        proc = CrawlerProcess(s)
    
        proc.crawl('quotes', 'dummyinput')  # the overrides are already applied via `s` above
        proc.start()
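
    As a side note on the custom_settings part of the question: Scrapy only honours custom_settings when it is defined as a class attribute, because the spider class's update_settings() is applied when the Crawler is created, before the spider's __init__ ever runs; assigning self.custom_settings inside the constructor (as in the question) therefore comes too late to affect the crawler's settings. A minimal sketch of the class-attribute form, reusing the FEED_URI value from the question (LOG_FILE is left out here because logging is configured once for the whole process, which is exactly what the CrawlerRunner variation below works around):

    import scrapy

    class QuotesSpider(scrapy.Spider):
        name = "quotes"

        # read from the spider class when the crawler is created,
        # before __init__ is called
        custom_settings = {
            'FEED_URI': 'quotes.csv',
        }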
    

    Edit following OP's comments:

    Here's a variation using CrawlerRunner, creating a new CrawlerRunner for each crawl and re-configuring logging on each iteration so that each run writes to a different log file:

    import logging
    from twisted.internet import reactor, defer
    
    import scrapy
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging, _get_handler
    from scrapy.utils.project import get_project_settings
    
    
    class QuotesSpider(scrapy.Spider):
        name = "quotes"
    
        def start_requests(self):
            page = getattr(self, 'page', 1)
            yield scrapy.Request('http://quotes.toscrape.com/page/{}/'.format(page),
                                 self.parse)
    
        def parse(self, response):
            for quote in response.css('div.quote'):
                yield {
                    'text': quote.css('span.text::text').extract_first(),
                    'author': quote.css('small.author::text').extract_first(),
                    'tags': quote.css('div.tags a.tag::text').extract(),
                }
    
    
    @defer.inlineCallbacks
    def crawl():
        s = get_project_settings()
        for i in range(1, 4):
            s.update({
                'FEED_URI': 'quotes%03d.csv' % i,
                'LOG_FILE': 'quotes%03d.log' % i
            })
    
            # manually configure logging for LOG_FILE
            configure_logging(settings=s, install_root_handler=False)
            logging.root.setLevel(logging.NOTSET)
            handler = _get_handler(s)
            logging.root.addHandler(handler)
    
            runner = CrawlerRunner(s)
            yield runner.crawl(QuotesSpider, page=i)
    
            # reset root handler
            logging.root.removeHandler(handler)
        reactor.stop()
    
    crawl()
    reactor.run() # the script will block here until the last crawl call is finished
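
    Note that _get_handler is a private Scrapy helper, so it may move or change between releases. If you would rather stay on public APIs, the per-iteration log file can be attached with the standard logging module instead; a minimal sketch assuming that approach (the helper names attach_log/detach_log and the log format are made up for illustration):

    import logging

    def attach_log(path):
        """Attach a file handler to the root logger and return it for later removal."""
        handler = logging.FileHandler(path, encoding='utf-8')
        handler.setFormatter(logging.Formatter(
            '%(asctime)s [%(name)s] %(levelname)s: %(message)s'))
        logging.root.addHandler(handler)
        return handler

    def detach_log(handler):
        """Remove and close a handler previously returned by attach_log()."""
        logging.root.removeHandler(handler)
        handler.close()

    In the loop above, handler = attach_log('quotes%03d.log' % i) before runner.crawl() and detach_log(handler) afterwards would stand in for the _get_handler() / addHandler() / removeHandler() lines.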