web-scraping · scrapy · scrapy-splash

Scraping a dynamic Amazon page with scrolling


I am trying to scrape the products in Amazon's Best Sellers top 100 for a particular category. For example -

https://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/ref=zg_bs_nav_0

The 100 products are divided into two pages with 50 products on each page.

Earlier, the page was static and all 50 products used to appear on the page at once. However, the page is now dynamic, and I need to scroll down to see all 50 products on the page.

I was using Scrapy to scrape the page earlier. I would really appreciate it if you could help me out with this. Thanks!

Adding my code below -

import scrapy
from scrapy_splash import SplashRequest

class BsrNewSpider(scrapy.Spider):
    name = 'bsr_new'
    allowed_domains = ['www.amazon.in']
    #start_urls = ['https://www.amazon.in/gp/bestsellers/kitchen/ref=zg_bs_nav_0']

    script = '''
        function main(splash, args)
            splash.private_mode_enabled = false
            url = args.url
            assert(splash:go(url))
            assert(splash:wait(0.5))
            return splash:html()
        end
    '''

    def start_requests(self):
        url = 'https://www.amazon.in/gp/bestsellers/kitchen/ref=zg_bs_nav_0'
        yield SplashRequest(url, callback=self.parse, endpoint="execute", args={
            'lua_source': self.script
        })

    def parse(self, response):
        for rev in response.xpath("//div[@id='gridItemRoot']"):
            yield {
                'Segment': "Home",  # enter the name of the segment here
                #'Sub-segment': segment,
                'ASIN': rev.xpath(".//div/div[@class='zg-grid-general-faceout']/div/a[@class='a-link-normal']/@href").re(r'\S*/dp/(\S+)_\S+')[0][:10],
                'Rank': rev.xpath(".//span[@class='zg-bdg-text']/text()").get(),
                'Name': rev.xpath("normalize-space(.//a[@class='a-link-normal']/span/div/text())").get(),
                'No. of Ratings': rev.xpath(".//span[contains(@class,'a-size-small')]/text()").get(),
                'Rating': rev.xpath(".//span[@class='a-icon-alt']/text()").get(),
                'Price': rev.xpath(".//span[@class='a-size-base a-color-price']//text()").get()
            }

        # follow the pagination link once per page, outside the item loop
        next_page = response.xpath("//a[text()='Next page']/@href").get()
        if next_page:
            url = response.urljoin(next_page)
            yield SplashRequest(url, callback=self.parse, endpoint="execute", args={
                'lua_source': self.script
            })

Regards, Sreejan


Solution

  • Here is an alternate approach that does not need Splash.

    All 50 products' ASINs are embedded in the first page itself. You can extract those ASINs and build all 50 product URLs.
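
    Concretely, the grid container carries a data-client-recs-list attribute whose value is a JSON array, and each entry's id field is the product's ASIN. The shape is roughly the following (these ASINs are made up for illustration):

        [{"id": "B01ABCD123", ...}, {"id": "B09WXYZ456", ...}]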

    import scrapy
    import json

    class AmazonSpider(scrapy.Spider):
        name = 'amazon'
        custom_settings = {
            'DEFAULT_REQUEST_HEADERS': {}  # Important: an empty dict clears Scrapy's default headers
        }
        start_urls = ['https://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/ref=zg_bs_pg_1?_encoding=UTF8&pg=1']

        def parse(self, response):
            # JSON list of the 50 products; each entry's id is an ASIN
            raw_data = response.css('[data-client-recs-list]::attr(data-client-recs-list)').get()
            data = json.loads(raw_data)
            for item in data:
                url = 'https://www.amazon.com/dp/{}'.format(item['id'])
                yield scrapy.Request(url, callback=self.parse_item)

        def parse_item(self, response):
            ...
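
    parse_item is left as a stub above. Here is a minimal sketch of what it could look like; the selectors are assumptions about Amazon's product-page markup (productTitle is the usual id of the title node, a-icon-alt the rating text span) and may need adjusting:

        def parse_item(self, response):
            yield {
                # the ASIN is the 10-character token after /dp/ in the URL
                'ASIN': response.url.split('/dp/')[-1][:10],
                'Name': (response.css('#productTitle::text').get() or '').strip(),
                'Rating': response.css('span.a-icon-alt::text').get(),
            }

    Also note that this start URL covers only the first 50 products; the second page of the list should follow the same pattern, so adding the pg=2 variant of the URL to start_urls ought to cover all 100.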
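
  • If you would rather keep the Splash setup from the question, the Lua script can be extended to scroll before returning the HTML. A minimal sketch, assuming the grid lazy-loads items as the window scrolls (splash:runjs and splash:wait are standard Splash calls; the step count and delays are guesses to tune):

        script = '''
            function main(splash, args)
                splash.private_mode_enabled = false
                assert(splash:go(args.url))
                assert(splash:wait(1))
                -- scroll down in steps so lazily loaded tiles get time to render
                for i = 1, 10 do
                    splash:runjs("window.scrollBy(0, document.body.scrollHeight)")
                    assert(splash:wait(0.5))
                end
                return splash:html()
            end
        '''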