Every example and use case I've seen for pyppeteer opens the browser and closes it again immediately, e.g.:

import asyncio
from pyppeteer import launch
async def main():
    browser = await launch()
    page = await browser.newPage()
    await page.goto('http://someurl')
    content = await page.content()
    cookieslist = await page.cookies()
    cookiejar = createCookieJar(cookieslist)
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())
What happens if you want to keep the browser open and continuously scrape data? That's easily done with Selenium, but with pyppeteer it doesn't seem to work without asyncio. The other way to make it work is to save the session, re-open the browser on a schedule, and scrape, but that feels like a very inefficient approach. Has anyone tried this?
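Roughly, the scheduled re-open approach I mean would look something like this (assuming pyppeteer's userDataDir launch option is enough to persist the session between runs; the profile path and the 60-second interval are just placeholders):

import asyncio
from pyppeteer import launch

PROFILE_DIR = "./pyppeteer_profile"  # hypothetical profile dir; keeps cookies between launches

async def scrape_once():
    # Re-open the browser on every run, reusing the saved profile
    browser = await launch(userDataDir=PROFILE_DIR)
    page = await browser.newPage()
    await page.goto('http://someurl')
    content = await page.content()
    await browser.close()
    return content

async def main():
    # "On a schedule": scrape, sleep, repeat
    while True:
        await scrape_once()
        await asyncio.sleep(60)

asyncio.get_event_loop().run_until_complete(main())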
You can use asyncio.Queue and continuously pump your data into the queue:
import asyncio
import traceback
from contextlib import suppress
from pyppeteer import launch
WORKERS = 10
URLS = [
    "http://airbnb.com",
    "http://amazon.co.uk",
    "http://amazon.com",
    "http://bing.com",
    "http://djangoproject.com",
    "http://envato.com",
    "http://facebook.com",
    "http://github.com",
    "http://google.co.uk",
    "http://google.com",
    "http://google.es",
    "http://google.fr",
    "http://heroku.com",
    "http://instagram.com",
    "http://linkedin.com",
    "http://live.com",
    "http://netflix.com",
    "http://rubyonrails.org",
    "http://shopify.com",
    "http://stackoverflow.com",
    "http://trello.com",
    "http://wordpress.com",
    "http://yahoo.com",
    "http://yandex.ru",
    "http://yiiframework.com",
    "http://youtube.com",
]
async def worker(q, browser):
    # One tab per worker
    page = await browser.newPage()

    with suppress(asyncio.CancelledError):
        while True:
            url = await q.get()
            try:
                await page.goto(url, {"timeout": 10000})
                html = await page.content()
            except Exception:
                traceback.print_exc()
            else:
                print(f"{url}: {len(html)}")
            finally:
                q.task_done()

    await page.close()
async def main():
    q = asyncio.Queue()
    browser = await launch(headless=True, args=["--no-sandbox"])

    tasks = []
    for _ in range(WORKERS):
        tasks.append(asyncio.create_task(worker(q, browser)))

    for url in URLS:
        await q.put(url)
    await q.join()

    for task in tasks:
        task.cancel()
    await asyncio.gather(*tasks, return_exceptions=True)

    await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
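If you want the browser to stay open indefinitely instead of exiting once the list is done, you can replace the one-shot fill-and-join in main() with a producer task that keeps feeding the same queue. A minimal sketch reusing worker, URLS and WORKERS from above (the 60-second pause between rounds is just a placeholder):

async def producer(q):
    # Keep feeding the same queue forever, so the workers and the browser stay alive
    while True:
        for url in URLS:
            await q.put(url)
        await q.join()           # wait until this batch has been scraped
        await asyncio.sleep(60)  # pause before the next round

async def main():
    q = asyncio.Queue()
    browser = await launch(headless=True, args=["--no-sandbox"])
    tasks = [asyncio.create_task(worker(q, browser)) for _ in range(WORKERS)]
    try:
        await producer(q)        # never returns; stop with Ctrl+C
    finally:
        for task in tasks:
            task.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        await browser.close()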