I am trying to click on the load-more button until it disappears and all products are loaded. Then I want to click on all individual products to scrape the data I need from each product's own page.
I have tried multiple ways of scrolling down and rearranged the code and syntax a few times using ChatGPT and Gemini. However, I still get an empty JSON file back.
import scrapy
import datetime
import re
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageMethod
from scrapy.selector import Selector


class LidlSpider(scrapy.Spider):
    name = 'lidl_snacks'
    allowed_domains = ['sortiment.lidl.ch']
    custom_settings = {
        'ROBOTSTXT_OBEY': False
    }

    start_urls = [
        'https://sortiment.lidl.ch/de/sussigkeiten-snacks#/',  # 246 Produkte
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                dont_filter=True,
                callback=self.parse,
                meta={
                    'url': url,
                    'playwright': True,
                    'playwright_include_page': True,
                    'playwright_page_methods': [
                        PageMethod('wait_for_selector', 'div.product-item-info'),
                        PageMethod("wait_for_selector", "button.primary.amscroll-load-button-new"),
                    ]
                }
            )

    async def scroll_to_bottom(self, page):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")

    async def parse(self, response):
        page = response.meta["playwright_page"]
        pagination_buttons = page.locator("button.primary.amscroll-load-button-new")  # Adjust the selector as needed
        if pagination_buttons:
            buttons = await pagination_buttons.all()
            for button in buttons:
                await button.click()  # Trigger pagination action
                await page.wait_for_navigation()
                await self.scroll_to_bottom(page)  # Optional scroll down on the new page

        # Extract product information after pagination click
        content = await page.content()
        sel = Selector(text=content)
        produkte = sel.css('div.product-item-info')
        for produkt in produkte:
            produkt_url = produkt.css('a.product-item-link::attr(href)').get()
            yield response.follow(produkt_url, callback=self.parse_produkt, meta={'url': response.meta['url']})

    def parse_produkt(self, response):
        mini_dict = {
            'retailer': self.name,
            'datetime': datetime.date.today(),
            'categorie': None,
            'id': None,  # response.css('div.col-left>p::text').get().split()[1],
            'brand': str(response.css('p.brand-name::text').get()),
            'detail': str(response.css('span.base::text').get()),
            'actual_price': response.css('strong.pricefield__price::attr(content)').get(),
            'quantity': None,
            'regular_price': None,
            'price_per_unit': None,
        }
        yield mini_dict


if __name__ == "__main__":  # __main__ was only created for debug purposes
    process = CrawlerProcess()
    process.crawl(LidlSpider)
    process.start()
There are a couple of problems I can see:

The page first shows a cookie-consent banner, and you have to click its Zustimmen (Agree) button before you can click anything else. So add the following to your code:

popup = 'div#onetrust-banner-sdk'
if await page.is_visible(popup, timeout=5000):
    await page.locator('button#onetrust-accept-btn-handler').click()
    await page.wait_for_selector(popup, state='hidden')
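This goes at the top of parse(), right after you pull the Playwright page out of response.meta, as shown in the full code below.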
The page.wait_for_navigation() call gives an error, as there is no such method on Playwright's Python page object, so you can replace it with page.wait_for_load_state("domcontentloaded").
There is a single Weitere Produkte laden (Load More Products) button, which you need to click multiple times until it goes away, so pagination_buttons in your code returns a single button that gets clicked only once:
pagination_buttons = page.locator("button.primary.amscroll-load-button-new")
buttons = await pagination_buttons.all()
for button in buttons:
    await button.click()  # Trigger pagination action
    await page.wait_for_load_state("domcontentloaded")  # Wait for new page to load
    await self.scroll_to_bottom(page)  # Optional scroll down on the new page
You can fix that by replacing the above with:
while True:
    try:
        show_more_button = page.locator("button.primary.amscroll-load-button-new")
        if show_more_button:
            await show_more_button.click()
            await page.wait_for_load_state("domcontentloaded", timeout=5000)
            await self.scroll_to_bottom(page)
        else:
            break
    except Exception:
        break
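A side note on why this loop terminates: a Playwright Locator object is always truthy, so the else branch never actually runs; instead, once the button is gone, the click() call times out and the except clause breaks out of the loop. If you would rather check for the button explicitly than rely on that exception, a rough sketch (not tested against this site) could be:

# Sketch (untested): stop as soon as the load-more button is no longer visible,
# rather than waiting for click() to time out.
while await page.locator("button.primary.amscroll-load-button-new").is_visible():
    await page.locator("button.primary.amscroll-load-button-new").click()
    await page.wait_for_load_state("domcontentloaded", timeout=5000)
    await self.scroll_to_bottom(page)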
Here is the full code:
import datetime

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageMethod
from scrapy.selector import Selector


class LidlSpider(scrapy.Spider):
    name = 'lidl_snacks'
    allowed_domains = ['sortiment.lidl.ch']
    custom_settings = {
        'ROBOTSTXT_OBEY': False
    }

    start_urls = [
        'https://sortiment.lidl.ch/de/kaffee-tee',  # 72 products
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                dont_filter=True,
                callback=self.parse,
                meta={
                    'url': url,
                    'playwright': True,
                    'playwright_include_page': True,
                    'playwright_page_methods': [
                        PageMethod('wait_for_load_state', "domcontentloaded"),
                    ]
                }
            )

    async def scroll_to_bottom(self, page):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")

    async def parse(self, response):
        page = response.meta["playwright_page"]
        # await page.screenshot(path="popup.png")
        popup = 'div#onetrust-banner-sdk'
        if await page.is_visible(popup, timeout=5000):
            await page.locator('button#onetrust-accept-btn-handler').click()
            await page.wait_for_selector(popup, state='hidden')
        # await page.screenshot(path="popup_clicked_check.png", full_page=True)

        # count = 0
        while True:
            try:
                show_more_button = page.locator("button.primary.amscroll-load-button-new")
                if show_more_button:
                    await show_more_button.click()
                    await page.wait_for_load_state("domcontentloaded", timeout=5000)  # Wait for new page to load
                    await self.scroll_to_bottom(page)  # Optional scroll down on the new page
                    # await page.screenshot(path=f"page_scrolled_{count}.png", full_page=True)
                    # count += 1
                else:
                    break
            except Exception:
                break

        # Extract product information after pagination click
        content = await page.content()
        sel = Selector(text=content)
        produkte = sel.css('div.product-item-info')
        for produkt in produkte:
            produkt_url = produkt.css('a.product-item-link::attr(href)').get()
            yield response.follow(produkt_url, callback=self.parse_produkt, meta={'url': response.meta['url']})

    def parse_produkt(self, response):
        mini_dict = {
            'retailer': self.name,
            'datetime': datetime.date.today(),
            'categorie': None,
            'id': None,  # response.css('div.col-left>p::text').get().split()[1],
            'brand': str(response.css('p.brand-name::text').get()),
            'detail': str(response.css('span.base::text').get()),
            'actual_price': response.css('strong.pricefield__price::attr(content)').get(),
            'quantity': None,
            'regular_price': None,
            'price_per_unit': None,
        }
        yield mini_dict


if __name__ == "__main__":  # __main__ was only created for debug purposes
    process = CrawlerProcess()
    process.crawl(LidlSpider)
    process.start()
Note(s):
I replaced /sussigkeiten-snacks#/ with /kaffee-tee, as that page has fewer products to scrape.
Run scrapy crawl lidl_snacks -O snacks.json to see what it returns.
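If the output file is still empty, it is also worth checking that scrapy-playwright is actually enabled for the run; the meta={'playwright': True} flag only takes effect when the download handlers are configured. Assuming they are not already in your project's settings.py, a minimal sketch (setting names from the scrapy-playwright README):

# Enable scrapy-playwright (in settings.py or the spider's custom_settings)
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"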