I am building my first scrapy and am trying to retrieve the urls in the sitemap. The code works, but it seems that scrapy is also already crawling the website urls themselves. I do not want that. How do I get only the urls in the sitemap? Thanks a lot for your advice:
import scrapy
from scrapy.crawler import CrawlerProcess
class SitemapSpider(scrapy.spiders.SitemapSpider):
name = "sitemap_spider"
def __init__(self, sitemap_url, *args, **kwargs):
super(SitemapSpider, self).__init__(*args, **kwargs)
self.sitemap_urls = [sitemap_url]
self.extracted_urls = []
def parse(self, response):
print(response.url )
yield None
def run_sitemap_scraper(sitemap_url):
# Run the scraper
process = CrawlerProcess()
process.crawl(SitemapSpider, sitemap_url=sitemap_url)
process.start()
# Example usage
run_sitemap_scraper("https://ferienparkguide.de/sitemap_index.xml")
Have a look at scrapy.spiders.SitemapSpider._parse_sitemap