Tags: python, playwright, playwright-python

Error in Python Playwright: playwright._impl._api_types.Error: net::ERR_ABORTED at ... waiting until "load" error Python Playwright


I've been struggling for a while to download a PDF in Python Playwright that is rendered by a PHP web page (the real page is not included in this example because it contains sensitive code; I've linked to a sample PDF instead).

Here's my code so far, using the JavaScript code from https://github.com/microsoft/playwright/issues/3509 as example:

from playwright.async_api import Playwright, async_playwright, expect
import asyncio
import os
import json

# Working locations for the test run, all rooted at ./pwtest under the
# current working directory.
tmp_dir = './pwtest/'
user_dir = os.path.join(os.getcwd(),"pwtest","user_dir")
print("User dir: ", user_dir)
downloads_path = os.path.join(os.getcwd(),"pwtest","downloads")
print("Downloads path: ", downloads_path)
storage_state_path = "./pwtest/"

# Written to <user_dir>/Default/Preferences before the browser is launched.
# NOTE(review): the intent is for Chromium to download PDFs instead of opening
# them in its built-in viewer - confirm against the Chromium version in use.
default_preferences = {
    "plugins": {
        "always_open_pdf_externally": True
    }
}

# os.makedirs creates any missing parent folders in one call, and with
# exist_ok=True it is a no-op when the folder already exists. Genuine failures
# (e.g. missing permissions) now raise instead of being hidden by a bare
# "except:" that guessed the folder already existed.
profile_dir = os.path.join(user_dir, "Default")
os.makedirs(downloads_path, exist_ok=True)
os.makedirs(profile_dir, exist_ok=True)

with open(os.path.join(profile_dir, "Preferences"), "w") as f:
    json.dump(default_preferences, f)


async def run(playwright: Playwright) -> None:
    """Try to download the sample PDF through a persistent Chromium context.

    NOTE: this is the version from the question; it is kept as posted because
    it reproduces the error shown in the log further down.

    Args:
        playwright: The Playwright object yielded by ``async_playwright()``.
    """
    # Launch Chromium with the profile stored in user_dir (the folder where the
    # Preferences file was written above); downloads are accepted.
    browser = await playwright.chromium.launch_persistent_context(user_dir, accept_downloads=True, headless=False, slow_mo=1000)
    browser.set_default_timeout(10000)
    page = await browser.new_page()

    # Start waiting for the download
    file_name = "test_d.pdf"
    async with page.expect_download() as download_info:
        # NOTE: according to the error log in this post, this navigation is what
        # fails with net::ERR_ABORTED while waiting until "load", so the lines
        # below it in this block are not reached.
        await page.goto("https://www.africau.edu/images/default/sample.pdf", timeout= 5000)
        await page.wait_for_timeout(200)
        print("Saving file to ", downloads_path, file_name)
        # Wait for the download to start
        download = await download_info.value
        # Wait for the download process to complete
        print(await download.path())
        # Save downloaded file somewhere
        await download.save_as(os.path.join(downloads_path, file_name))

    await browser.close()

async def main() -> None:
    """Open an async Playwright session and pass it to ``run``."""
    session = async_playwright()
    async with session as pw:
        await run(pw)

# Script entry point: drive the coroutine to completion on a new event loop.
asyncio.run(main())

Help will be appreciated.

I keep getting the following error, whether I use the sync or the async API. Another option is probably to intercept the blob transfer, but I don't know how that is done. Please advise.

playwright._impl._api_types.Error: net::ERR_ABORTED at https://www.africau.edu/images/default/sample.pdf
=========================== logs ===========================
navigating to "https://www.africau.edu/images/default/sample.pdf", waiting until "load"
============================================================

Solution

  • This is how I fixed it in the end, as explained in my comment on the original post. This is probably not the best way to do it, but it worked. Please comment if you can improve on the use of try/except for the download portion of the PDF.

    from playwright.async_api import Playwright, async_playwright, expect
    import asyncio
    import os
    import json
    
    # Working locations for the test run, all rooted at ./pwtest under the
    # current working directory.
    tmp_dir = './pwtest/'
    user_dir = os.path.join(os.getcwd(),"pwtest","user_dir")
    print("User dir: ", user_dir)
    downloads_path = os.path.join(os.getcwd(),"pwtest","downloads")
    print("Downloads path: ", downloads_path)
    storage_state_path = "./pwtest/"

    # Written to <user_dir>/Default/Preferences before the browser is launched.
    # NOTE(review): the intent is for Chromium to download PDFs instead of
    # opening them in its built-in viewer - confirm against the Chromium
    # version in use.
    default_preferences = {
        "plugins": {
            "always_open_pdf_externally": True
        }
    }

    # os.makedirs creates any missing parent folders in one call, and with
    # exist_ok=True it is a no-op when the folder already exists. Genuine
    # failures (e.g. missing permissions) now raise instead of being hidden by
    # a bare "except:" that guessed the folder already existed.
    profile_dir = os.path.join(user_dir, "Default")
    os.makedirs(downloads_path, exist_ok=True)
    os.makedirs(profile_dir, exist_ok=True)

    with open(os.path.join(profile_dir, "Preferences"), "w") as f:
        json.dump(default_preferences, f)
    
    
    async def run(playwright: Playwright) -> None:
        """Download the sample PDF through a persistent Chromium context.

        Navigating straight to a URL that turns into a download makes
        ``page.goto`` fail with ``net::ERR_ABORTED`` (the error reported in the
        question). That specific failure is expected and ignored here; the
        download itself is captured by ``page.expect_download``.

        Args:
            playwright: The Playwright object yielded by ``async_playwright()``.

        Raises:
            playwright.async_api.Error: For any navigation failure other than
                ``net::ERR_ABORTED``, or when no download starts within the
                default timeout.
        """
        # Local import keeps this function self-contained; the package is the
        # same one the file already imports from.
        from playwright.async_api import Error as PlaywrightError

        file_name = "test_d.pdf"
        # launch_persistent_context returns a browser context bound to the
        # profile in user_dir, hence the variable name.
        context = await playwright.chromium.launch_persistent_context(user_dir, accept_downloads=True, headless=False, slow_mo=1000)
        try:
            context.set_default_timeout(10000)
            page = await context.new_page()

            # Start waiting for the download before triggering it.
            async with page.expect_download() as download_info:
                try:
                    await page.goto("https://www.africau.edu/images/default/sample.pdf", timeout= 0)
                except PlaywrightError as err:
                    # Only the aborted navigation caused by the download is
                    # expected; anything else is a real failure. A bare
                    # "except:" would also swallow task cancellation.
                    if "net::ERR_ABORTED" not in str(err):
                        raise

            # The download is handled here, not inside the except branch, so it
            # is saved whether or not the navigation raised.
            download = await download_info.value
            print("Saving file to ", downloads_path, file_name)
            # Wait for the download process to complete
            print(await download.path())
            # Save downloaded file somewhere
            await download.save_as(os.path.join(downloads_path, file_name))
        finally:
            # Always shut the browser down, even when the download fails.
            await context.close()
    
    async def main() -> None:
        """Start Playwright, run the download scenario, then shut Playwright down."""
        async with async_playwright() as playwright:
            await run(playwright)

    # Guard the entry point so importing this module does not launch a browser.
    if __name__ == "__main__":
        asyncio.run(main())