I am trying to scrape the products in Amazon's Best Sellers Top 100 for a particular category. For example:
https://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/ref=zg_bs_nav_0
The 100 products are divided into two pages with 50 products on each page.
Earlier, the page was static and all 50 products used to appear in the initial HTML. Now the page is dynamic, and I need to scroll down before all 50 products show up.
I was using Scrapy to scrape the page earlier. I would really appreciate any help with this. Thanks!
Adding my code below:
import scrapy
from scrapy_splash import SplashRequest


class BsrNewSpider(scrapy.Spider):
    name = 'bsr_new'
    allowed_domains = ['www.amazon.in']
    # start_urls = ['https://www.amazon.in/gp/bestsellers/kitchen/ref=zg_bs_nav_0']

    # Lua script run by Splash: load the page, wait briefly, return the HTML
    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        url = args.url
        assert(splash:go(url))
        assert(splash:wait(0.5))
        return splash:html()
    end
    '''

    def start_requests(self):
        url = 'https://www.amazon.in/gp/bestsellers/kitchen/ref=zg_bs_nav_0'
        yield SplashRequest(url, callback=self.parse, endpoint='execute',
                            args={'lua_source': self.script})

    def parse(self, response):
        for rev in response.xpath("//div[@id='gridItemRoot']"):
            yield {
                'Segment': "Home",  # Enter name of the segment here
                # 'Sub-segment': segment,
                'ASIN': rev.xpath(".//div/div[@class='zg-grid-general-faceout']/div/a[@class='a-link-normal']/@href").re(r'\S*/dp/(\S+)_\S+')[0][:10],
                'Rank': rev.xpath(".//span[@class='zg-bdg-text']/text()").get(),
                'Name': rev.xpath("normalize-space(.//a[@class='a-link-normal']/span/div/text())").get(),
                'No. of Ratings': rev.xpath(".//span[contains(@class,'a-size-small')]/text()").get(),
                'Rating': rev.xpath(".//span[@class='a-icon-alt']/text()").get(),
                'Price': rev.xpath(".//span[@class='a-size-base a-color-price']//text()").get()
            }

        next_page = response.xpath("//a[text()='Next page']/@href").get()
        if next_page:
            url = response.urljoin(next_page)
            yield SplashRequest(url, callback=self.parse, endpoint='execute',
                                args={'lua_source': self.script})
Regards, Sreejan
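If you want to stay with Splash, one way to get all 50 items to render is to have the Lua script scroll to the bottom a few times before returning the HTML. Here is a minimal sketch of a replacement script, assuming a standard Splash instance; the scroll count and wait times are guesses you may need to tune:

    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        assert(splash:go(args.url))
        assert(splash:wait(1))
        -- scroll repeatedly so the lazy-loaded product tiles render
        for _ = 1, 8 do
            splash:runjs('window.scrollTo(0, document.body.scrollHeight)')
            assert(splash:wait(0.5))
        end
        return splash:html()
    end
    '''

The rest of the spider stays the same; only lua_source changes.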
Here is an alternate approach that does not need Splash.
The ASINs of all 50 products are embedded in the first page itself, inside the data-client-recs-list attribute. You can extract those ASINs and build the 50 product URLs directly.
import scrapy
import json


class AmazonSpider(scrapy.Spider):
    name = 'amazon'
    custom_settings = {
        # Important: clear Scrapy's default headers, which make the
        # request easier to flag as a bot
        'DEFAULT_REQUEST_HEADERS': {}
    }
    start_urls = ['https://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/ref=zg_bs_pg_1?_encoding=UTF8&pg=1']

    def parse(self, response):
        # The full list of 50 ASINs is stored as JSON in this attribute
        raw_data = response.css('[data-client-recs-list]::attr(data-client-recs-list)').get()
        data = json.loads(raw_data)
        for item in data:
            url = 'https://www.amazon.com/dp/{}'.format(item['id'])
            yield scrapy.Request(url, callback=self.parse_item)

    def parse_item(self, response):
        ...
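For completeness, here is a sketch of what parse_item could yield, replacing the stub above. The selectors are assumptions about Amazon's product-page markup (productTitle and acrPopover are commonly seen ids, not guaranteed ones), so verify them against the live HTML:

    def parse_item(self, response):
        yield {
            # ASIN comes from the URL we built in parse()
            'ASIN': response.url.split('/dp/')[-1],
            # Assumed selectors; check them against the live page
            'Name': response.xpath('normalize-space(//span[@id="productTitle"]/text())').get(),
            'Rating': response.xpath('//span[@id="acrPopover"]/@title').get(),
        }

To cover ranks 51-100, add the pg=2 URL (the same path with ref=zg_bs_pg_2 and pg=2) to start_urls; that page's data-client-recs-list should carry the second set of 50 ASINs.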