Tags: python, scrapy

Scrapy Spider: Stuck with callback not firing


I am trying to scrape a GitHub repo.

I want to extract all XML file URLs in the level1 folder of every repo and, ideally, also extract information from the XML files themselves.

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


repo_rule = Rule(
    LinkExtractor(
        restrict_xpaths="//a[@itemprop='name codeRepository']",
        restrict_text=r"ELTeC-.+"
    )
)

pagination_rule = Rule(
    LinkExtractor(restrict_xpaths="//a[@class='next_page']")
)

level_rule = Rule(
    LinkExtractor(allow=r"/level1"),
    follow=True,
    callback="parse_level"
)


class ELTecSpider(CrawlSpider):
    """Scrapy CrawlSpider for crawling the ELTec repo."""

    name = "eltec"
    start_urls = ["https://github.com/orgs/COST-ELTeC/repositories"]

    rules = [
        repo_rule,
        pagination_rule,
        level_rule,
    ]

    def parse_level(self, response):
        print("INFO: ", response.url)



process = CrawlerProcess(
    settings={
        "FEEDS": {
            "items.json": {
                "format": "json",
                "overwrite": True
            },
        },
    }
)

process.crawl(ELTecSpider)
process.start()

The above extracts the responses for all level1 folders, but I am stuck at this point. My plan was to go into every level1 URL using callbacks, like so:

def parse_level(self, response):
    yield scrapy.Request(response.url, callback=self.parse_docs)

def parse_docs(self, response):
    docs_urls = response.xpath("//a[@class='Link--primary']")

    for url in docs_urls:
        print("INFO: ", url)

But apparently the callback never even fires.

What am I doing wrong?


Solution

  • Scrapy remembers visited pages and skips requests for URLs it has already scraped.
    This way it doesn't waste time fetching the same page again, and it also prevents crawling loops.

    When you yield scrapy.Request(response.url, ...) you are requesting the same URL again, so Scrapy filters the request out and parse_docs is never called.

    If you really need to scrape the same page again, you can pass

    Request(..., dont_filter=True)
    

    (Doc: scrapy.http.Request)
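
    Applied to your spider, a minimal sketch of that would be (this only makes the callback fire; it does not solve the JavaScript problem described further below):

    def parse_level(self, response):
        # dont_filter=True bypasses the duplicate filter, so the same URL is fetched again
        yield scrapy.Request(response.url, callback=self.parse_docs, dont_filter=True)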


    Instead of scheduling a new request, I would rather call parse_docs() directly and return its result:

    return self.parse_docs(response)
    

    There is another problem inside parse_docs(): the XPath doesn't find any elements, so the for loop never runs a print(). You should add an extra print() at the beginning of parse_docs() to see when it is executed.

    The XPath may not find class='Link--primary' because this page uses JavaScript to add elements. You may need Selenium and the scrapy-selenium module to control a real web browser that can run JavaScript. Scrapy can also work with JavaScript through Splash and scrapy-splash.

    (Doc: Selecting dynamically-loaded content)


    GitHub also has an API that may let you get this information without scraping at all.
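
    For example, here is a rough sketch using the requests library and the REST API's /orgs/{org}/repos and /repos/{owner}/{repo}/contents/{path} endpoints (the ELTeC-specific names and the level1 path are taken from your question; check the API docs and mind the unauthenticated rate limit):

    import requests

    ORG = "COST-ELTeC"

    # list all repositories of the organisation (paginated, 100 per page)
    repos = []
    page = 1
    while True:
        chunk = requests.get(
            f"https://api.github.com/orgs/{ORG}/repos",
            params={"per_page": 100, "page": page},
        ).json()
        if not chunk:
            break
        repos += chunk
        page += 1

    for repo in repos:
        name = repo["name"]
        if not name.startswith("ELTeC-"):
            continue
        # list the files in the repo's level1 folder (skip repos that don't have one)
        resp = requests.get(f"https://api.github.com/repos/{ORG}/{name}/contents/level1")
        if resp.status_code != 200:
            continue
        for entry in resp.json():
            if entry["name"].endswith(".xml"):
                print(entry["name"], entry["download_url"])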


    EDIT:

    Here is full working code that uses scrapy-selenium and Selenium 3.

    It does not work with Selenium 4, because scrapy-selenium has not been updated since 2020.

    pip install scrapy-selenium
    pip install 'selenium<4' 
    
    import scrapy
    from scrapy.crawler import CrawlerProcess
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
    
    from scrapy_selenium import SeleniumRequest
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    
    #from shutil import which   # to run `which('firefox')` or `which('chrome')` in settings
    
    
    repo_rule = Rule(
        LinkExtractor(
            restrict_xpaths="//a[@itemprop='name codeRepository']",
            restrict_text=r"ELTeC-.+"
        )
    )
    
    pagination_rule = Rule(
        LinkExtractor(restrict_xpaths="//a[@class='next_page']")
    )
    
    level_rule = Rule(
        LinkExtractor(allow=r"/level1"),
        follow=True,
        callback="parse_level"
    )
    
    
    class ELTecSpider(CrawlSpider):
        """Scrapy CrawlSpider for crawling the ELTec repo."""
    
        name = "eltec"
        start_urls = [
            "https://github.com/orgs/COST-ELTeC/repositories", 
            "https://github.com/COST-ELTeC/ELTeC-lit/tree/master/level1"
        ]
    
        rules = [
            repo_rule,
            pagination_rule,
            level_rule,
        ]
    
        def parse_level(self, response):
            print("\n>>> PARSE LEVEL:", response.url)
            #yield scrapy.Request(response.url, callback=self.parse_docs, dont_filter=True)
            yield SeleniumRequest(url=response.url, callback=self.parse_docs, dont_filter=True, 
                    wait_time=10,
                    #wait_until=EC.element_to_be_clickable((By.CLASS_NAME, 'Link--primary'))
                    wait_until=EC.presence_of_element_located((By.CLASS_NAME, 'Link--primary'))
                    )
            
        def parse_docs(self, response):
            print("\n>>> PARSE DOC:", response.url)
            
            docs_urls = response.selector.xpath("//a[@class='Link--primary']")
            #print("\n>>> LEN:", len(docs_urls))
            
            for url in docs_urls:
                text = url.xpath('.//text()').get()
                href = url.xpath('.//@href').get()
                #print("\n>>> INFO:", href, text)
                yield {"text": text, "url": href}
                
                
    process = CrawlerProcess(
        settings={
            "FEEDS": {
                "items.json": {
                    "format": "json",
                    "overwrite": True
                },
            },
            
            'SELENIUM_DRIVER_NAME': 'firefox',  # or 'chrome'
            'SELENIUM_DRIVER_EXECUTABLE_PATH': '/home/furas/bin/geckodriver',  # or which('geckodriver'), which('chromedrive')
            'SELENIUM_DRIVER_ARGUMENTS': ['-headless'],  # '--headless' if using `chrome` instead of `firefox`
            #'SELENIUM_DRIVER_ARGUMENTS': [],  # needs at least empty list
    
            'DOWNLOADER_MIDDLEWARES': {'scrapy_selenium.SeleniumMiddleware': 800},    
        }   
    )
    
    process.crawl(ELTecSpider)
    process.start()
    

    Result from items.json

    [
    {"text": "LIT00001_seinius_kuprelis.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00001_seinius_kuprelis.xml"},
    {"text": "LIT00001_seinius_kuprelis.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00001_seinius_kuprelis.xml"},
    {"text": "LIT00002_pietaris_algimantas.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00002_pietaris_algimantas.xml"},
    {"text": "LIT00002_pietaris_algimantas.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00002_pietaris_algimantas.xml"},
    {"text": "LIT00004_dobilas_bludas.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00004_dobilas_bludas.xml"},
    {"text": "LIT00004_dobilas_bludas.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00004_dobilas_bludas.xml"},
    {"text": "LIT00005_daukantas_zemaiciu_pasakos.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00005_daukantas_zemaiciu_pasakos.xml"},
    {"text": "LIT00005_daukantas_zemaiciu_pasakos.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00005_daukantas_zemaiciu_pasakos.xml"},
    {"text": "LIT00006_kudirka_virsininkai.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00006_kudirka_virsininkai.xml"},