python, web-scraping, scrapy, scrapinghub, splash-js-render

Scrapy does not fetch markup on response.css


I've built a simple scrapy spider running on scrapinghub:

class ExtractionSpider(scrapy.Spider):
    """Crawl offer listings, follow each offer's detail page via Splash,
    and paginate through the listing until no "next" link remains."""

    name = "extraction"
    allowed_domains = ['domain']
    start_urls = ['http://somedomainstart']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

    def parse(self, response):
        """Extract offer links from a listing page and schedule detail requests."""
        # Direct offer links on the listing page.
        offer_links = response.css('a.offer-details__title-link::attr(href)').extract()
        print(offer_links)
        for href in offer_links:
            yield SplashRequest(url=response.urljoin(href), callback=self.parse_details)

        # Offers that appear once per region.
        region_links = response.css('a.offer-regions__label::attr(href)').extract()
        print(region_links)        
        for href in region_links:
            yield SplashRequest(url=response.urljoin(href), callback=self.parse_details)

        # Pagination: re-enter parse() on the next listing page, if any.
        next_href = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
        if next_href:
            yield SplashRequest(url=response.urljoin(next_href), callback=self.parse)

    def parse_details(self, response):
        """Emit one item per offer detail page."""
        yield {
            'title': response.css('#jobTitle').extract_first(),
            'content': response.css('#description').extract_first(),
            'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
            'address': response.css('span[itemprop="address"]').extract_first(),
        }

The problem I am facing is that the `response.css` call for `multiple_locs_urls` returns an empty array, even though I can see the matching elements in the markup on the browser side.

I checked with scrapy shell and scrapy shell does not see the markup. I guess this is due to the markup being rendered through javascript when the page is loaded.

I added Splash, but it does not seem to be applied to the response. How can I make Scrapy wait to run the CSS query until the page has fully loaded (i.e., after the JavaScript has rendered)?


Solution

  • See the source code for the page: view-source:pracuj.pl/praca/polska;ct,1. There is no element with class "offer-regions__label" in the HTML code.

    This code will always return an empty list:

    multiple_locs_urls = response.css('a.offer-regions__label::attr(href)')
    

    But as explained here https://stackoverflow.com/a/17697329/9913319:

    Many times when crawling we run into problems where content that is rendered on the page is generated with Javascript and therefore scrapy is unable to crawl for it.

    In this case you can use Selenium. I changed your code and checked it and it works:

    class ExtractionSpider(scrapy.Spider):
        """Crawl offer listings, rendering pages with Selenium so that
        JavaScript-generated content is visible, and follow detail pages.

        Fixes versus the previous revision:
        - Removed a duplicated ``self.driver.get(response.url)`` that reloaded
          the page right after collecting elements; accessing the previously
          found elements after a reload risks a StaleElementReferenceException
          and wastes a full page load.
        - Added ``closed()`` so the WebDriver is quit when the spider finishes;
          otherwise the browser process leaks.
        """

        name = "extraction"
        allowed_domains = ['domain']
        start_urls = ['http://somedomainstart']
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

        def __init__(self, **kwargs):
            """Create the spider and start a Firefox WebDriver instance."""
            super().__init__(**kwargs)

            profile = webdriver.FirefoxProfile("pathToFirefoxProfile")
            firefox_binary = "pathToFirefoxBinary"  # Must be the developer edition!!!
            # self.driver = webdriver.Firefox()
            self.driver = webdriver.Firefox(profile, firefox_binary=firefox_binary)

        def closed(self, reason):
            """Scrapy calls this when the spider closes; quit the browser to
            avoid leaking the Firefox process."""
            self.driver.quit()

        def parse(self, response):
            """Render the listing page with Selenium, log the JS-generated
            links, then schedule detail-page requests."""
            # Load the page in the real browser so JavaScript runs.
            self.driver.get(response.url)

            # These elements exist only after JS rendering, so they must be
            # read through the driver, not through `response`.
            elements = self.driver.find_elements_by_css_selector("a.offer-details__title-link")
            for element in elements:
                print("****")
                print(str(element.get_attribute("href")))
                print(str(element.text))

            # your old code below

            urls = response.css('a.offer-details__title-link::attr(href)').extract()

            print(urls)
            for url in urls:
                url = response.urljoin(url)
                yield SplashRequest(url=url, callback=self.parse_details)

            # NOTE(review): `response` is the raw (un-rendered) HTML, so this
            # selector still returns [] for JS-generated content — use the
            # driver (as above) if these links are needed.
            multiple_locs_urls = response.css('a.offer-regions__label::attr(href)').extract()
            print(multiple_locs_urls)
            for url in multiple_locs_urls:
                url = response.urljoin(url)
                yield SplashRequest(url=url, callback=self.parse_details)

            next_page_url = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
            if next_page_url:
                next_page_url = response.urljoin(next_page_url)
                yield SplashRequest(url=next_page_url, callback=self.parse)

        def parse_details(self, response):
            """Emit one item per offer detail page."""
            yield {
                'title': response.css('#jobTitle').extract_first(),
                'content': response.css('#description').extract_first(),
                'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
                'address': response.css('span[itemprop="address"]').extract_first(),
            }