I tried to get some data from a react based website, but when I use CrawlSpider I can't parse other pages. For Example I can parse my first URL with splash and other urls will parse regularly without dynamic content.
this is my code:
class PageSpider(CrawlSpider):
host = 'hooshmandsazeh.com'
protocol = 'https'
root_domain = 'hooshmandsazeh.com'
name = 'page'
allowed_domains = [host]
#start_urls = [f'{protocol}://{host}',]
def start_requests(self):
url = f'{self.protocol}://{self.host}'
yield SplashRequest(url, dont_process_response=True, args={'wait': 1}, meta={'real_url': url})
custom_settings = {
#'DEPTH_LIMIT': 9,
}
rules = (
# Rule(LinkExtractor(allow=('node_\d+\.htm',)), follow=True),
Rule(LinkExtractor(allow=(host),deny=('\.webp', '\.js', '\.css', '\.jpg', '\.png'),unique=True),
callback='parse',
follow=True,
process_request='splash_request'
),
)
def splash_request(self, request):
request.meta['real_url'] = request.url
print("Aliii",request.meta['real_url'])
return request
def _requests_to_follow(self, response):
if not isinstance(response, HtmlResponse):
return
seen = set()
newresponse = response.replace(url=response.meta.get('real_url'))
for n, rule in enumerate(self._rules):
links = [lnk for lnk in rule.link_extractor.extract_links(newresponse)
if lnk not in seen]
if links and rule.process_links:
links = rule.process_links(links)
for link in links:
seen.add(link)
r = self._build_request(n, link)
yield rule.process_request(r)
def parse(self,response):
if len(LinkExtractor(deny = self.host).extract_links(response)) > 0:
loader = ItemLoader(item=PageLevelItem(), response=response)
loader.add_value('page_source_url', response.url)
yield loader.load_item()
Check below code worked for me:
def splash_request(self, request):
# request = request.replace(url=RENDER_HTML_URL + request.url)
request.meta['real_url'] = request.url
return SplashRequest(request.meta['real_url'], dont_process_response=True, args={'wait': 0}, meta={'real_url': request.meta['real_url']})