windows, scrapy, python-asyncio, playwright, scrapy-playwright

Scrapy Playwright freezes after initialization ([scrapy.middleware] INFO: Enabled item pipelines: ['carscraper.pipelines.PostgreSQLPipeline'])


After starting the spider, it freezes at the stage where the item pipelines are enabled. There are no errors; the scrapy-playwright script simply stops at the very beginning, before the first request is even sent. Python 3.13, scrapy-playwright 0.0.43, Scrapy 2.13.2, Playwright 1.54.

I've turned off my firewall and checked the settings for possible issues (CONCURRENT_REQUESTS reduced to 1, tried switching off headless mode). There is no problem with the database connection in PostgreSQLPipeline; I tested it with a separate script. I also switched off the proxy and the ScrapeOps rotating user agent.
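
For reference, a rough sketch of the settings that were toggled (simplified; these are the standard Scrapy / scrapy-playwright option names, the real settings.py contains more):

    # settings.py (simplified excerpt of the options mentioned above)
    CONCURRENT_REQUESTS = 1

    # scrapy-playwright launch options; headless was toggled during debugging
    PLAYWRIGHT_LAUNCH_OPTIONS = {
        "headless": False,
    }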

Here is my code (carspider.py):

    def start_requests(self):
        """Start requests with error handling"""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    async def parse(self, response):
        """Parse the car listing page and extract car URLs"""
        logger.info(f"Parsing list page: {response.url}")
        
        # Extract car listings
        cars = response.css('section.ticket-item')
        logger.info(f"Found {len(cars)} cars on page")
        
        # Process each car listing
        for car in cars:
            try:
                car_url = car.css('a.m-link-ticket::attr(href), a.address::attr(href)').get()
                if not car_url:
                    logger.debug("No car URL found in listing")
                    continue
                
                # Convert relative URLs to absolute
                car_url = urljoin(response.url, car_url)
                
                # Skip new car listings
                if 'newauto' in car_url:
                    logger.debug(f'Skipping new car listing: {car_url}')
                    continue
                
                logger.debug(f'Yielding request for car: {car_url}')
                
                # Create request for car details page
                yield scrapy.Request(
                    url=car_url,
                    callback=self.parse_car_page,
                    errback=self.handle_error,
                    meta={
                        'playwright': True,
                        'playwright_include_page': True
                    }
                )

            except Exception as e:
                logger.error(f"Error processing car listing: {str(e)}", exc_info=True)
                continue
        
        # Handle pagination
        try:
            next_page = response.css('a.js-next.page-link::attr(href), a.page-link.js-next::attr(href)').get()
            if next_page:
                next_page_url = urljoin(response.url, next_page)
                logger.info(f"Found next page: {next_page_url}")
                
                # Add delay before processing next page
                await asyncio.sleep(2)
                
                yield scrapy.Request(
                    url=next_page_url,
                    callback=self.parse,
                    errback=self.handle_error,
                    meta={'playwright': False}
                )
            else:
                logger.info('No more pages found')
                
        except Exception as e:
            logger.error(f"Error handling pagination: {str(e)}", exc_info=True)
    
    async def handle_error(self, failure):
        """Handle errors in request callbacks"""
        try:
            # Log the error
            logger.error(f"Request failed: {failure.value}")
            logger.debug(f"Failure type: {type(failure).__name__}")
            
            # Log the request that failed
            if hasattr(failure, 'request'):
                logger.debug(f"Failed URL: {failure.request.url}")
                logger.debug(f"Request meta: {failure.request.meta}")
            
            # Close any open Playwright pages
            if hasattr(failure, 'request') and 'playwright_page' in failure.request.meta:
                try:
                    page = failure.request.meta['playwright_page']
                    if page:
                        await page.close()
                        logger.debug("Closed Playwright page after error")
                except Exception as e:
                    logger.error(f"Error closing page in error handler: {str(e)}")
            
            # Log traceback if available
            if hasattr(failure, 'getTraceback'):
                logger.debug(f"Traceback: {failure.getTraceback()}")
                
        except Exception as e:
            logger.error(f"Error in handle_error: {str(e)}")
            
        # Return None to prevent further processing of the failed request
        return None

    async def parse_car_page(self, response):
        """Parse individual car page and extract details"""
        logger.info(f'Processing car page: {response.url}')
        page = None
        phone_numbers = []
        
        try:
            # Get Playwright page from response
            page = response.meta.get('playwright_page')
            if not page:
                logger.error("No Playwright page found in response.meta")
                return
                
            # Set page timeout
            page.set_default_timeout(30000)  # 30 seconds
            
            # Handle cookie banner if present
            await self._handle_cookie_banner(page)
            
            # Extract phone numbers
            phone_numbers = await self._extract_phone_numbers(page)
            
            # Create item
            item = CarItem()
            item['url'] = response.url
            
            # Extract basic info using CSS selectors
            item['title'] = response.css('h1.head::text').get()
            item['price_usd'] = response.css('div.price_value strong::text, div.price_value--additional span.i-block span::text').get()
            item['odometer'] = response.css('div.base-information span.size18::text').get()
            item['username'] = response.css('div.seller_info_name.bold a.sellerPro::text, h4.seller_info_name a::text, div.seller_info_name::text').get()
            item['image_url'] = response.css('div.photo-620x465 picture img::attr(src)').get()
            item['image_count'] = response.css('span.count span.mhide::text').get()
            item['car_number'] = response.css('span.state-num::text').get()
            item['car_vin'] = response.css('span.label-vin::text, div.t-check span.vin-code::text').get()
            item['phone_number'] = phone_numbers
            
            yield item
            
        except Exception as e:
            logger.error(f"Error processing car page {response.url}: {str(e)}", exc_info=True)
            
        finally:
            # Always close the page when done
            if page:
                try:
                    await page.close()
                    logger.debug("Closed Playwright page")
                except Exception as e:
                    logger.error(f"Error closing Playwright page: {str(e)}")

Solution

  • scrapy-playwright requires the asyncio Twisted reactor. If it isn't set in the project settings, Playwright never starts and Scrapy appears to hang right after "Enabled item pipelines…".

    TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
    
    # make sure the handler is actually enabled
    DOWNLOAD_HANDLERS = {
        "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    }
    

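    If the crawl is launched from a standalone script rather than via scrapy crawl, the reactor setting also has to reach Scrapy before the reactor gets installed. A minimal sketch, assuming the project settings already contain the two options above and that the spider's name attribute is "carspider":

        # run_spider.py - hypothetical launcher script
        from scrapy.crawler import CrawlerProcess
        from scrapy.utils.project import get_project_settings

        # CrawlerProcess picks up TWISTED_REACTOR from the project settings
        # and installs the asyncio reactor before the crawl starts
        process = CrawlerProcess(get_project_settings())
        process.crawl("carspider")  # assumed spider name
        process.start()
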
    You may also try upgrading scrapy-playwright to 0.0.44; the changelog mentions a fix for getting stuck on Windows.
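
    To confirm which versions are actually active in the environment that runs the spider (a virtualenv can easily differ from the global install), a quick check along these lines can help:

        # hypothetical version check - run it with the same interpreter that runs scrapy
        import importlib.metadata

        for dist in ("scrapy", "scrapy-playwright", "playwright"):
            print(dist, importlib.metadata.version(dist))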