python, web-scraping, conditional-statements, scrapy

Scrapy: send a condition from start_requests(self) to parse()


I'm scraping a website whose pages have different rows based on the type of item being scraped. I have a working scraper, shown in the first code block below, but I would like to take a type from the database and send it from start_requests(self) to the parse function. There are 11 different types; each has a different number of rows in one table on part of the page, while the rows in the other tables on the page are the same across types. My attempt is shown in the second code block.

How do I take the type from the database in start_requests and send it to parse?

First code block:

# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["nevermind.com"]
    start_urls = []

    def start_requests(self):
        # Get InfoID from the database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID FROM dbo.infostage")

        rows = self.cursor.fetchall()

        for row in rows:
            url = 'http://www.nevermind.com/info/'
            yield self.make_requests_from_url(url + row[0])

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input item path

        itemPool = []

        InfoID = response.url
        id = InfoID[29:len(InfoID)-1]

        for info in infodata:
            item = infoItem()

            # Details
            item['id'] = id  # response.url
            item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
            item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
            item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
            item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
            item['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()

            itemPool.append(item)
            yield item

Second code block:
This does not work, and I'm not sure how to get it working. Do I need a global list, or a new function?

# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["nevermind.com"]
    start_urls = []

    def start_requests(self):
        # Get InfoID and type from the database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID, type FROM dbo.infostage")

        rows = self.cursor.fetchall()

        for row in rows:
            url = 'http://www.nevermind.com/info/'
            type = row[1]  # how do I send this value to the parse function?
            yield self.make_requests_from_url(url + row[0])

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input base path

        itemPool = []

        InfoID = response.url
        id = InfoID[29:len(InfoID)-1]

        for info in infodata:
            item = infoItem()

            # Details
            item['id'] = id  # response.url

            # Here I need a condition that comes from start_requests().
            # If the condition is met, scrape the first set of fields, else the second.
            # This is where I would like to use the type: I have 11 different types,
            # each with a different number of rows for one table on part of the page,
            # while the rows in the other tables on the page are the same across types.
            if type == 'type1':
                # Type 1
                item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
                item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
                item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
                item['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
                item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            else:
                item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
                item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()

            itemPool.append(item)
            yield item


Thank you all for your help and insight!


Solution

  • You can use request.meta to pass the type along with each request:

    def make_requests_from_url(self, url, type, callback):
        # requires "import scrapy" at the top of the spider module
        request = scrapy.Request(url, callback)
        request.meta['type'] = type
        return request

    In parse you can then access the type via response.meta['type'].
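
    For completeness, here is a minimal sketch of how the two pieces fit together. It builds the Request directly in start_requests instead of overriding make_requests_from_url (which is deprecated in newer Scrapy versions), and it reuses the connection string, table, and URL from the question as placeholders:

    import scrapy
    import pyodbc


    class InfoSpider(scrapy.Spider):
        name = "info"
        allowed_domains = ["nevermind.com"]

        def start_requests(self):
            conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
            cursor = conn.cursor()
            cursor.execute("SELECT InfoID, type FROM dbo.infostage")

            for info_id, info_type in cursor.fetchall():
                yield scrapy.Request(
                    'http://www.nevermind.com/info/' + str(info_id),
                    callback=self.parse,
                    meta={'type': info_type},  # rides along with the request
                )

        def parse(self, response):
            info_type = response.meta['type']  # read the value back here
            if info_type == 'type1':
                pass  # scrape the six type1 rows here
            else:
                pass  # scrape the shared rows here

    On Scrapy 1.7 and later the same value can instead be passed with cb_kwargs, e.g. scrapy.Request(url, callback=self.parse, cb_kwargs={'info_type': info_type}) together with def parse(self, response, info_type):, while meta works on all versions.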