I am trying to programmatically call a spider through a script, but I am unable to override the settings through the constructor using CrawlerProcess. Let me illustrate this with the default spider for scraping quotes from the official Scrapy site (last code snippet of the official Scrapy quotes example spider):
from scrapy import Spider, Request

class QuotesSpider(Spider):
    name = "quotes"

    def __init__(self, somestring, *args, **kwargs):
        super(QuotesSpider, self).__init__(*args, **kwargs)
        self.somestring = somestring
        self.custom_settings = kwargs

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
Here is the script through which I try to run the quotes spider:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings

def main():
    proc = CrawlerProcess(get_project_settings())

    custom_settings_spider = {
        'FEED_URI': 'quotes.csv',
        'LOG_FILE': 'quotes.log'
    }

    proc.crawl('quotes', 'dummyinput', **custom_settings_spider)
    proc.start()

if __name__ == '__main__':
    main()
Scrapy Settings are a bit like Python dicts, so you can update the settings object before passing it to CrawlerProcess:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def main():
    # override project settings before creating the process
    s = get_project_settings()
    s.update({
        'FEED_URI': 'quotes.csv',
        'LOG_FILE': 'quotes.log'
    })
    proc = CrawlerProcess(s)

    proc.crawl('quotes', 'dummyinput')
    proc.start()

if __name__ == '__main__':
    main()
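As an aside, CrawlerProcess also accepts a plain dict of settings and wraps it in a Settings object for you, so if you don't need the project settings at all you can skip get_project_settings entirely. A minimal sketch under that assumption (the import path for QuotesSpider is hypothetical; note that when you bypass the project settings you should pass the spider class itself, since resolving the name 'quotes' needs the project's SPIDER_MODULES setting):

from scrapy.crawler import CrawlerProcess

from myproject.spiders.quotes import QuotesSpider  # adjust to wherever your spider lives

def main():
    # a plain dict is converted to a Settings object by CrawlerProcess
    proc = CrawlerProcess({
        'FEED_URI': 'quotes.csv',
        'LOG_FILE': 'quotes.log',
    })
    # pass the spider class directly; 'dummyinput' fills the somestring argument
    proc.crawl(QuotesSpider, 'dummyinput')
    proc.start()

if __name__ == '__main__':
    main()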
Edit following OP's comments:
Here's a variation using CrawlerRunner, creating a new CrawlerRunner for each crawl and re-configuring logging at each iteration so that every run writes to a different file:
import logging

from twisted.internet import reactor, defer

import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging, _get_handler
from scrapy.utils.project import get_project_settings


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        page = getattr(self, 'page', 1)
        yield scrapy.Request('http://quotes.toscrape.com/page/{}/'.format(page),
                             self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }


@defer.inlineCallbacks
def crawl():
    s = get_project_settings()
    for i in range(1, 4):
        s.update({
            'FEED_URI': 'quotes%03d.csv' % i,
            'LOG_FILE': 'quotes%03d.log' % i
        })

        # manually configure logging for LOG_FILE
        configure_logging(settings=s, install_root_handler=False)
        logging.root.setLevel(logging.NOTSET)
        handler = _get_handler(s)
        logging.root.addHandler(handler)

        runner = CrawlerRunner(s)
        yield runner.crawl(QuotesSpider, page=i)

        # reset root handler
        logging.root.removeHandler(handler)

    reactor.stop()


crawl()
reactor.run()  # the script will block here until the last crawl call is finished
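Two notes on this variation: CrawlerProcess starts and stops the Twisted reactor itself, and the reactor can only be started once per process, which is why running several crawls in one script is done here with CrawlerRunner and a manually managed reactor. Also, _get_handler is a private Scrapy helper (note the leading underscore), so it may change between Scrapy versions.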