pythonweb-scrapingscrapyscrapy-shell

Scrapy crawl loop for next page


Hello I am trying to get into word scrapers and crawlers however I don't understand why my code is not going to the next page and looping.

import scrapy 
from scrapy import*

    import scrapy 
from scrapy import*

class SpiderSpider(scrapy.Spider):
    name = 'spider'
    start_urls = ['https://www.thehousedirectory.com/category/interior-designers-architects/london-interior-designers/']
            
     
    def parse(self, response):

        allbuyers = response.xpath('//div[@class="company-details"]')

        for buyers in allbuyers:

            name = buyers.xpath('.//div/a/h2/text()').extract_first()
            email = buyers.xpath('.//p/a[contains(text(),"@")]/text()').extract_first()
            
            yield{

                'Name' : name,
                'Email' : email,

            }  
        
        next_url = response.css('#main > div > nav > a.next.page-numbers')

        if next_url:
            print("test")
            url = response.xpath("href").extract()
            yield scrapy.Request(url, self.parse)

Solution

  • What you did to get the next page doesn't really make any sense. To be specific, this line I meant url = response.xpath("href").extract()

    Here is the modified version of your spider:

    class HouseDirectorySpider(scrapy.Spider):
        name = 'thehousedirectory'
        start_urls = ['https://www.thehousedirectory.com/category/interior-designers-architects/london-interior-designers/']
                
        def parse(self, response):
            for buyers in response.xpath('//*[@class="company-details"]'):
                yield {
                    'Name' : buyers.xpath('.//*[@class="heading"]/a/h2/text()').get(),
                    'Email' : buyers.xpath('.//p/a[starts-with(@href,"mailto:")]/text()').get(),
                }  
            
            next_url = response.css('.custom-pagination > a.next:contains("Next Page")')
            if next_url:
                url = next_url.css("::attr(href)").get()
                yield scrapy.Request(url,callback=self.parse)