Tags: python, pyppeteer

Using pyppeteer in a continuous scraping mode


Every example and use case I've seen uses pyppeteer with the browser opened and closed immediately, e.g.:

import asyncio
from pyppeteer import launch

async def main():
    browser = await launch()
    page = await browser.newPage()
    await page.goto('http://someurl')
    content = await page.content()
    cookieslist = await page.cookies()
    cookiejar = createCookieJar(cookieslist)  # my own helper (not shown)
    await browser.close()
 
asyncio.get_event_loop().run_until_complete(main())

What happens if you want to keep the browser open and continuously scrape data? That's easy to do with Selenium, but with pyppeteer it doesn't seem to work without asyncio. The other option is to save the session, re-open the browser on a schedule, and scrape again, but that feels like a very inefficient way to do it. Has anyone tried this?
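
To make the goal concrete, here is roughly what I'm after (the URL and the 60-second interval are just placeholders): one browser that stays open while the scrape runs in a loop.

import asyncio
from pyppeteer import launch

async def scrape_forever():
    browser = await launch()
    page = await browser.newPage()
    try:
        # keep one browser/tab alive and re-scrape on an interval
        while True:
            await page.goto('http://someurl')
            content = await page.content()
            print(len(content))  # do something with the scraped content
            await asyncio.sleep(60)
    finally:
        await browser.close()

asyncio.get_event_loop().run_until_complete(scrape_forever())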


Solution

  • You can use asyncio.Queue and continuously pump your URLs into the queue while a pool of workers, each with its own tab, consumes them:

    import asyncio
    import traceback
    
    from contextlib import suppress
    
    from pyppeteer import launch
    
    WORKERS = 10
    URLS = [
        "http://airbnb.com",
        "http://amazon.co.uk",
        "http://amazon.com",
        "http://bing.com",
        "http://djangoproject.com",
        "http://envato.com",
        "http://facebook.com",
        "http://github.com",
        "http://google.co.uk",
        "http://google.com",
        "http://google.es",
        "http://google.fr",
        "http://heroku.com",
        "http://instagram.com",
        "http://linkedin.com",
        "http://live.com",
        "http://netflix.com",
        "http://rubyonrails.org",
        "http://shopify.com",
        "http://stackoverflow.com",
        "http://trello.com",
        "http://wordpress.com",
        "http://yahoo.com",
        "http://yandex.ru",
        "http://yiiframework.com",
        "http://youtube.com",
    ]
    
    
    async def worker(q, browser):
        # One tab per worker
        page = await browser.newPage()
    
        # Exit the loop quietly when this task is cancelled at shutdown
        with suppress(asyncio.CancelledError):
            while True:
                url = await q.get()
    
                try:
                    # 10-second navigation timeout (value is in milliseconds)
                    await page.goto(url, {"timeout": 10000})
                    html = await page.content()
                except Exception:
                    traceback.print_exc()
                else:
                    print(f"{url}: {len(html)}")
                finally:
                    q.task_done()
    
        await page.close()
    
    
    async def main():
        q = asyncio.Queue()
        browser = await launch(headless=True, args=["--no-sandbox"])
    
        tasks = []
    
        # Spawn a fixed pool of worker tasks, each driving its own tab
        for _ in range(WORKERS):
            tasks.append(asyncio.create_task(worker(q, browser)))
    
        # Feed the queue; idle workers pick URLs up immediately
        for url in URLS:
            await q.put(url)
    
        # Block until every queued URL has been processed
        await q.join()
    
        # Cancel the now-idle workers and wait for them to close their tabs
        for task in tasks:
            task.cancel()

        await asyncio.gather(*tasks, return_exceptions=True)
    
        await browser.close()
    
    
    if __name__ == "__main__":
        asyncio.run(main())
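
    The script above drains a fixed list of URLs and then shuts everything down. If you want the browser to stay open and scrape continuously, you can keep the same worker pool and, instead of filling the queue once and waiting on q.join(), run a producer that re-enqueues the URLs on a schedule. A minimal sketch, reusing worker(), WORKERS, and URLS from the snippet above (the 60-second interval is just a placeholder):

    async def producer(q, urls, interval=60):
        # Periodically refill the queue so the workers never run out of work
        while True:
            for url in urls:
                await q.put(url)
            await asyncio.sleep(interval)


    async def main_forever():
        q = asyncio.Queue()
        browser = await launch(headless=True, args=["--no-sandbox"])

        workers = [asyncio.create_task(worker(q, browser)) for _ in range(WORKERS)]
        feeder = asyncio.create_task(producer(q, URLS))

        try:
            # Runs until interrupted; the single browser stays open the whole time
            await asyncio.gather(feeder, *workers)
        finally:
            await browser.close()


    # Run with: asyncio.run(main_forever())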