Tags: python, html, scrapy, request, screen-scraping

pagination, next page with scrapy


The "Next page" button doesn't change the URL when it is pressed, so I can't get pagination to work with Scrapy.

'''

import scrapy

class LegonSpider(scrapy.Spider):
    """Spider for the "Find a Post" search on mylegion.org (question's version).

    Flow: load the search page, submit the search form, follow every result
    to its detail page, then try to request the next result page.
    """
    name = "legon"

    def start_requests(self):
        """Fetch the search page so that its form can be submitted in ``parse``."""
        yield scrapy.Request(
            url="https://mylegion.org/PersonifyEbusiness/Find-a-Post",
            callback=self.parse
        )

    def parse(self, response):
        """Submit the search form ('aspnetForm') with fixed search criteria."""
        # Select distance and country
        # (a department value is sent as well; presumably a department code
        # understood by the site -- TODO confirm what '00000000001L' selects)
        yield scrapy.FormRequest.from_response(
            response,
            formid='aspnetForm',
            formdata={'dnn$ctr2802$DNNWebControlContainer$ctl00$DistanceList': '100',
                      '@IP_COUNTRY': 'USA',
                      '@IP_DEPARTMENT': '00000000001L'},
            callback=self.parse_post_page
        )
    def parse_post_page(self, response):
        """Yield one detail-page request per search result, then the next page."""
        # Extract and yield requests for post detail pages
        post_elements = response.xpath("//div[@class='membership-dir-result-item']")
        for post_element in post_elements:
            # NOTE(review): .get() returns None when nothing matches, in which
            # case .strip() raises AttributeError.
            post_num = post_element.xpath(".//div[contains(@class,'POST_NAME')]/text()").get().strip()
            post_link = post_element.xpath("./a/@href").get()
            # The post name is carried to the detail callback through request meta.
            yield response.follow(post_link, callback=self.parse_post_detail, meta={'post_num': post_num})


        # NOTE(review): this XPath starts with a single '/', so it only matches
        # an <input> that is the document's root element. For an HTML page the
        # selection is empty and the branch below never runs; '//input[...]'
        # searches the whole document.
        next_page_button = response.xpath("/input[@id='dnn_ctr2802_DNNWebControlContainer_ctl00_Next']")
        if next_page_button:

            # Build the form data for the next-page submission.
            # NOTE(review): this emulates a __doPostBack call via __EVENTTARGET;
            # according to the answer further down, the site instead expects the
            # button's own name/value pair ('...$Next': 'Next') in the form data.
            formdata = {
                '__EVENTTARGET': 'dnn$ctr2802$DNNWebControlContainer$ctl00$Next',
                '__EVENTARGUMENT': ''
                }
            yield scrapy.FormRequest.from_response(response, formdata=formdata, callback=self.parse_post_page)
        
    def parse_post_detail(self,response):
        """Extract the fields of one post detail page and yield them as a dict."""
        # The first and second text nodes found under the 'Leadership' blocks.
        leader1 = response.xpath("(//div[contains(@class,'Leadership')]/div[2]/text())[1]").get()
        leader2 = response.xpath("(//div[contains(@class,'Leadership')]/div[2]/text())[2]").get()
        address = response.xpath("//div[contains(@class,'Address')]/div[2]/text()").get()
        typ = response.xpath("//div[contains(@class,'Type')]/div[2]/text()").get()

        # Any field whose XPath matches nothing is emitted as None.
        yield {
            "post_num": response.meta['post_num'],
            "leader1": leader1,
            "leader2": leader2,
            "address": address,
            "type" : typ

        }
        

I think Scrapy never actually reaches the next page: it keeps requesting the base URL, which does not change at all when I press "Next page" or when I run a new search.


Solution

  • When I checked the responses I saw that I get the same page over and over.

    If we use BurpSuite to inspect the requests and compare them, we can see this part (screenshot: BurpSuite Comparer):

    On the right-hand side you can see the value "Next", but if we inspect the form data in the response we can see that this value is missing. We just need to add it:

    import scrapy
    
    
    class LegonSpider(scrapy.Spider):
        """Spider for the "Find a Post" search on mylegion.org.

        Flow: load the search page, submit the search form, follow every
        result to its detail page, and page through the result list by
        re-submitting the form with the "Next" button's name/value pair.
        """

        name = "legon"

        def start_requests(self):
            """Fetch the search page so that its form can be submitted in ``parse``."""
            yield scrapy.Request(
                url="https://mylegion.org/PersonifyEbusiness/Find-a-Post",
                callback=self.parse
            )

        def parse(self, response):
            """Submit the search form ('aspnetForm') with fixed search criteria."""
            # Select distance and country (a department value is sent as well).
            yield scrapy.FormRequest.from_response(
                response,
                formid='aspnetForm',
                formdata={'dnn$ctr2802$DNNWebControlContainer$ctl00$DistanceList': '100',
                          '@IP_COUNTRY': 'USA',
                          '@IP_DEPARTMENT': '00000000001L'},
                callback=self.parse_post_page
            )

        def parse_post_page(self, response):
            """Yield one detail-page request per search result, then the next page.

            Results without a detail link are skipped, and a missing post name
            becomes an empty string, so one malformed entry cannot abort the
            callback before the next-page request is yielded.
            """
            post_elements = response.xpath("//div[@class='membership-dir-result-item']")
            for post_element in post_elements:
                post_link = post_element.xpath("./a/@href").get()
                if not post_link:
                    # response.follow() raises ValueError for a None URL.
                    self.logger.debug("Skipping result without a detail link on %s", response.url)
                    continue
                # default='' keeps .strip() from failing on a missing name node.
                post_num = post_element.xpath(
                    ".//div[contains(@class,'POST_NAME')]/text()"
                ).get(default='').strip()
                yield response.follow(post_link, callback=self.parse_post_detail, meta={'post_num': post_num})

            next_page_button = response.xpath("//input[@id='dnn_ctr2802_DNNWebControlContainer_ctl00_Next']")
            if next_page_button:
                # The page URL never changes; the server picks the next page only
                # when the submitted form contains the button's own name/value pair.
                form_data = {'dnn$ctr2802$DNNWebControlContainer$ctl00$Next': 'Next'}
                yield scrapy.FormRequest.from_response(response, formdata=form_data, callback=self.parse_post_page)

        def parse_post_detail(self, response):
            """Extract the fields of one post detail page and yield them as a dict.

            Any field whose XPath matches nothing is emitted as ``None``.
            """
            # The first and second text nodes found under the 'Leadership' blocks.
            leader1 = response.xpath("(//div[contains(@class,'Leadership')]/div[2]/text())[1]").get()
            leader2 = response.xpath("(//div[contains(@class,'Leadership')]/div[2]/text())[2]").get()
            address = response.xpath("//div[contains(@class,'Address')]/div[2]/text()").get()
            typ = response.xpath("//div[contains(@class,'Type')]/div[2]/text()").get()

            yield {
                "post_num": response.meta['post_num'],
                "leader1": leader1,
                "leader2": leader2,
                "address": address,
                "type": typ
            }
    

    See the differences between my form data and yours.

    BTW, you missed a "/" in the selector of next_page_button: it starts with "/input" instead of "//input", so it never matches anything.