javascript · selenium-webdriver · click · web-crawler · scrapy

Selenium Click() not working with scrapy spider


I am trying to scrape links to product pages from a listing page using a Scrapy spider. The page shows the first 10 machines and has a 'show all machines' button that calls some JavaScript. The JavaScript is reasonably complicated (i.e. I can't just look at the function and see the URL that the button points to). I'm trying to use the Selenium webdriver to simulate a click on the button, but it isn't working for some reason: when I scrape the product links I only get the first 10, not the complete list.

Can anybody tell me why it doesn't work?

The page I'm trying to scrape is http://www.ncservice.com/en/second-hand-milling-machines

The spider is:

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request, FormRequest
from scrapy import log
from scrapy.exceptions import DropItem
from scrapy import signals
from mtispider.items import MachineItem

import urlparse
import time
import MySQLdb
import unicodedata
import re
from mtispider import tools
from selenium import webdriver


class MachineSpider(CrawlSpider):
    name = 'nc-spider'
    allowed_domains = ['ncservice.com']

    def start_requests(self):
        requests = list(super(MachineSpider, self).start_requests())
        requests.append(Request('http://www.ncservice.com/en/second-hand-milling-machines',
                                callback=self.parsencmilllist))
        return requests

    def parsencmilllist(self, response):
        hxs = HtmlXPathSelector(response)
        driver = webdriver.Firefox()
        driver.get(response.url)
        try:
            driver.FindElement(By.Id("mas-resultados-fresadoras")).Click()
        except:
            log.msg("Couldnt get all the machines", level=log.INFO)
        ncmachs = hxs.select('//div[@id="resultados"]//a/@href').extract()
        for ncmach in ncmachs:
            yield Request(ncmach,
                          meta={'type': 'Milling'},
                          callback=self.parsencmachine)
        driver.quit()

    def parsencmachine(self, response):
        # scrape the machine
        return item

Thanks!


Solution

  • Two things are going wrong here. First, driver.FindElement(By.Id("mas-resultados-fresadoras")).Click() is the C#/Java flavour of the Selenium API; in the Python bindings the call is driver.find_element_by_id("mas-resultados-fresadoras").click(), so the original line raises an AttributeError that the bare except silently swallows. Second, even with a working click, the links are being extracted from the response Scrapy passed into the callback, which never sees the JavaScript-rendered content. The main fix is to initialize your Selector from the webdriver's page_source instead:

    from scrapy.contrib.spiders import CrawlSpider
    from scrapy.http import Request
    from scrapy import Selector
    
    from selenium import webdriver
    
    class MachineSpider(CrawlSpider):
        name = 'nc-spider'
        allowed_domains = ['ncservice.com']
    
        def start_requests(self):
            yield Request('http://www.ncservice.com/en/second-hand-milling-machines',
                          callback=self.parsencmilllist)
    
        def parsencmilllist(self, response):
            driver = webdriver.Firefox()
    
            driver.get(response.url)
            driver.find_element_by_id("mas-resultados-fresadoras").click()
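            # Note: if the extra results load asynchronously, page_source may
            # be read before the DOM has updated; see the explicit-wait
            # sketch after this code block.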
    
            sel = Selector(text=driver.page_source)
            driver.quit()
    
            links = sel.xpath('//div[@id="resultados"]//a/@href').extract()
            for link in links:
                yield Request(link,
                              meta={'type': 'Milling'},
                              callback=self.parsencmachine)
    
        def parsencmachine(self, response):
            print response.url
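
  • A caveat worth noting: the extra results are injected by JavaScript, so reading page_source immediately after click() can race the DOM update. Below is a minimal sketch of an explicit wait; the 10-second timeout and the "more than 10 links" condition are illustrative assumptions based on the page initially showing 10 machines:

    from selenium.webdriver.support.ui import WebDriverWait

    driver.find_element_by_id("mas-resultados-fresadoras").click()

    # Poll for up to 10 seconds (assumed timeout) until more than the
    # initial 10 result links are present; then page_source is safe to read.
    WebDriverWait(driver, 10).until(
        lambda d: len(d.find_elements_by_xpath('//div[@id="resultados"]//a')) > 10
    )

    sel = Selector(text=driver.page_source)

  • The extracted hrefs may also be relative; if they are, join them against the page URL (e.g. with urlparse.urljoin(response.url, link)) before yielding each Request.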