After starting the spider, it freezes at the stage where the item pipelines are enabled. There are no errors: the scrapy-playwright script simply stops at startup, before the first request is even sent. Python 3.13, scrapy-playwright 0.0.43, Scrapy 2.13.2, Playwright 1.54.
I've turned off my firewall and checked the settings for possible issues (reduced CONCURRENT_REQUESTS to 1, tried switching off headless mode). There is no problem with the database connection in PostgreSQLPipeline; I tested it in a separate script. I also switched off the proxy and the ScrapeOps rotating user-agent middleware.
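For context, the settings I experimented with look roughly like this (the pipeline's module path here is illustrative, not my exact one):

    # settings.py (excerpt): values as I had them while testing
    CONCURRENT_REQUESTS = 1

    # Tried running with a visible browser window as well
    PLAYWRIGHT_LAUNCH_OPTIONS = {"headless": False}

    ITEM_PIPELINES = {
        "myproject.pipelines.PostgreSQLPipeline": 300,  # assumed project path
    }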
Here is my code, carspider.py (the relevant spider methods):
import asyncio
import logging
from urllib.parse import urljoin

import scrapy

from ..items import CarItem  # assumed import path for the item class

logger = logging.getLogger(__name__)


# Methods of the spider class (class definition and start_urls omitted)
def start_requests(self):
    """Start requests with error handling"""
    for url in self.start_urls:
        yield scrapy.Request(
            url=url,
            callback=self.parse
        )

async def parse(self, response):
    """Parse the car listing page and extract car URLs"""
    logger.info(f"Parsing list page: {response.url}")

    # Extract car listings
    cars = response.css('section.ticket-item')
    logger.info(f"Found {len(cars)} cars on page")

    # Process each car listing
    for car in cars:
        try:
            car_url = car.css('a.m-link-ticket::attr(href), a.address::attr(href)').get()
            if not car_url:
                logger.debug("No car URL found in listing")
                continue

            # Convert relative URLs to absolute
            car_url = urljoin(response.url, car_url)

            # Skip new car listings
            if 'newauto' in car_url:
                logger.debug(f'Skipping new car listing: {car_url}')
                continue

            logger.debug(f'Yielding request for car: {car_url}')
            # Create request for car details page
            yield scrapy.Request(
                url=car_url,
                callback=self.parse_car_page,
                errback=self.handle_error,
                meta={
                    'playwright': True,
                    'playwright_include_page': True
                }
            )
        except Exception as e:
            logger.error(f"Error processing car listing: {str(e)}", exc_info=True)
            continue

    # Handle pagination
    try:
        next_page = response.css('a.js-next.page-link::attr(href), a.page-link.js-next::attr(href)').get()
        if next_page:
            next_page_url = urljoin(response.url, next_page)
            logger.info(f"Found next page: {next_page_url}")
            # Add delay before processing next page
            await asyncio.sleep(2)
            yield scrapy.Request(
                url=next_page_url,
                callback=self.parse,
                errback=self.handle_error,
                meta={'playwright': False}
            )
        else:
            logger.info('No more pages found')
    except Exception as e:
        logger.error(f"Error handling pagination: {str(e)}", exc_info=True)

async def handle_error(self, failure):
    """Handle errors in request callbacks"""
    try:
        # Log the error
        logger.error(f"Request failed: {failure.value}")
        # failure.type is the exception class wrapped by the Twisted Failure
        logger.debug(f"Failure type: {failure.type.__name__}")

        # Log the request that failed
        if hasattr(failure, 'request'):
            logger.debug(f"Failed URL: {failure.request.url}")
            logger.debug(f"Request meta: {failure.request.meta}")

        # Close any open Playwright pages
        if hasattr(failure, 'request') and 'playwright_page' in failure.request.meta:
            try:
                page = failure.request.meta['playwright_page']
                if page:
                    await page.close()
                    logger.debug("Closed Playwright page after error")
            except Exception as e:
                logger.error(f"Error closing page in error handler: {str(e)}")

        # Log traceback if available
        if hasattr(failure, 'getTraceback'):
            logger.debug(f"Traceback: {failure.getTraceback()}")
    except Exception as e:
        logger.error(f"Error in handle_error: {str(e)}")

    # Return None to prevent further processing of the failed request
    return None

async def parse_car_page(self, response):
    """Parse individual car page and extract details"""
    logger.info(f'Processing car page: {response.url}')
    page = None
    phone_numbers = []
    try:
        # Get Playwright page from response
        page = response.meta.get('playwright_page')
        if not page:
            logger.error("No Playwright page found in response.meta")
            return

        # Set page timeout (set_default_timeout is synchronous in the async API)
        page.set_default_timeout(30000)  # 30 seconds

        # Handle cookie banner if present
        await self._handle_cookie_banner(page)

        # Extract phone numbers
        phone_numbers = await self._extract_phone_numbers(page)

        # Create item
        item = CarItem()
        item['url'] = response.url

        # Extract basic info using CSS selectors
        item['title'] = response.css('h1.head::text').get()
        item['price_usd'] = response.css('div.price_value strong::text, div.price_value--additional span.i-block span::text').get()
        item['odometer'] = response.css('div.base-information span.size18::text').get()
        item['username'] = response.css('div.seller_info_name.bold a.sellerPro::text, h4.seller_info_name a::text, div.seller_info_name::text').get()
        item['image_url'] = response.css('div.photo-620x465 picture img::attr(src)').get()
        item['image_count'] = response.css('span.count span.mhide::text').get()
        item['car_number'] = response.css('span.state-num::text').get()
        item['car_vin'] = response.css('span.label-vin::text, div.t-check span.vin-code::text').get()
        item['phone_number'] = phone_numbers

        yield item
    except Exception as e:
        logger.error(f"Error processing car page {response.url}: {str(e)}", exc_info=True)
    finally:
        # Always close the page when done
        if page:
            try:
                await page.close()
                logger.debug("Closed Playwright page")
            except Exception as e:
                logger.error(f"Error closing Playwright page: {str(e)}")
scrapy-playwright requires the asyncio reactor. If it isn't installed, Playwright never boots and Scrapy appears to hang right after "Enabled item pipelines…". In settings.py:
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# make sure the handler is actually enabled
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
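To confirm the reactor actually took effect, check the first lines of the crawl log for "Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor". If you launch the spider from a script rather than via scrapy crawl, make sure the project settings (and thus TWISTED_REACTOR) are actually loaded. A minimal sketch, where "carspider" is a placeholder for your spider's actual name:

    # run.py: CrawlerProcess installs the reactor declared in TWISTED_REACTOR
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl("carspider")  # placeholder: use your spider's name
    process.start()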
You may also try upgrading scrapy-playwright to 0.0.44; the changelog mentions a fix for a hang on Windows.
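For example:

    pip install --upgrade scrapy-playwright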