python, async-await, concurrency, python-asyncio, aiohttp

aiohttp shows 403 Forbidden but requests.get gives 200 OK for the same URL


I'm using aiohttp to asynchronously download images from different sites by their image URLs. Previously I used requests.get to do the same thing synchronously. I can download the images successfully with requests.get, but the same URL throws a 403 Forbidden error when I try to download it with aiohttp. I've tried to find out what the issue could be, but I haven't had any success so far. requests.get doesn't need any extra headers to get the image. The URL matters here, because only URLs from this site run into the 403 error.

The requests.get version:

import requests
from io import BytesIO

def download_image(url: str):
    
    ## is_url is just a small function which returns True if url is valid
    if not is_url(url):
        return None

    VALID_MIME_TYPES = {
        "image/jpeg": ".jpeg",
        "image/png": ".png",
        "image/jpg": ".jpg",
        "image/gif": ".gif",
        "image/tiff": ".tiff",
        "image/webp": ".webp",
        "image/apng": ".apng",
        "image/svg+xml": ".svg",
        "application/octet-stream": get_file_extension_from_url(url=url)  
        # get_file_extension_from_url used to get image's type from the URL
    }

    response = requests.get(url)    # Worked successfully and downloads the image
    mimetype = response.headers.get("Content-Type", "").lower()
    
    if mimetype in VALID_MIME_TYPES:
        # build the file name (the mapped extension already includes the dot)
        file_name = f"cimage{VALID_MIME_TYPES[mimetype]}"
        content = response.content

        # wrap the bytes in a BytesIO stream
        return BytesIO(content), file_name, mimetype
    else:
        return None

image_url = "https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format%2Ccompress&q=75&sharp=10&rect=0%2C15%2C1200%2C600&s=13645e838fd09f2552c8f8500410abec"
image_output = download_image(image_url)

The aiohttp version:

import aiohttp, asyncio
from io import BytesIO

async def download_image(url: str, session: aiohttp.ClientSession):
    """
    Args:
        url (str): image url
        session (aiohttp.ClientSession): Using a common aiohttp session for speedup
    """
    if not is_url(url):
        return None

    VALID_MIME_TYPES = {
        "image/jpeg": ".jpeg",
        "image/png": ".png",
        "image/jpg": ".jpg",
        "image/gif": ".gif",
        "image/tiff": ".tiff",
        "image/webp": ".webp",
        "image/apng": ".apng",
        "image/svg+xml": ".svg",
        "application/octet-stream": get_file_extension_from_url(url=url)
    }
    res = await session.request(method="GET", url=url)

    mimetype = res.headers.get("Content-Type", "").lower()

    if mimetype in VALID_MIME_TYPES:
        file_name = f"cimage.{VALID_MIME_TYPES[mimetype]}"
        content = await res.read()
        return BytesIO(content), file_name, mimetype
    else:
        return None

if __name__ == "__main__":
    image_url = "https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format%2Ccompress&q=75&sharp=10&rect=0%2C15%2C1200%2C600&s=13645e838fd09f2552c8f8500410abec"
    image_urls = [image_url] * 1 # I just increase the number for testing 

    async def main():
        async with aiohttp.ClientSession(trust_env=True) as session:
            tasks = []
            for url in image_urls:
                task = asyncio.create_task(download_image(url=url, session=session))
                tasks.append(task)
                
            # returns a list of (BytesIO, file_name, mimetype) tuples
            images = await asyncio.gather(*tasks)


    asyncio.run(main())

Here's the printed output of res and its headers after the aiohttp request completes:

<ClientResponse(https://img.evbuc.com/https:%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format,compress&q=75&sharp=10&rect=0,15,1200,600&s=13645e838fd09f2552c8f8500410abec) [403 Forbidden]>
<CIMultiDictProxy('Content-Type': 'text/plain', 'Content-Length': '14', 'Connection': 'keep-alive', 'Cache-Control': 'public, max-age=5', 'Server': 'imgix', 'x-imgix-id': 'e3fb1d9c2f4cdf79dc45ca6fa20455560bdc05a5', 'x-imgix-proxy-status': '403', 'x-imgix-proxy-reason': '', 'X-Imgix-Render-Farm': '01.140360', 'Date': 'Wed, 18 Oct 2023 20:11:20 GMT', 'Accept-Ranges': 'bytes', 'Access-Control-Allow-Origin': '*', 'Timing-Allow-Origin': '*', 'Cross-Origin-Resource-Policy': 'cross-origin', 'X-Content-Type-Options': 'nosniff', 'X-Served-By': 'cache-sjc10076-SJC, cache-bom4734-BOM', 'X-Cache': 'Error from cloudfront', 'Via': '1.1 9e8c29342ff6f7610166562f3559cbe4.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'BOM78-P1', 'X-Amz-Cf-Id': 'cFMR0YKkz5pLrgzH-IkmYd0JTYqgZPT-wDKbTdxDiOr_ZJH_v3xLeg==', 'Age': '0')>

What is the issue in my situation?

I hope to find a solution. Thanks.

Just to be clear, this aiohttp code works on other URLs; I'm only facing this strange issue with this type of URL.


Solution

  • aiohttp normalizes (re-encodes) the URL before sending the request, which requests does not do, so the requests version succeeds. You can disable this behavior by wrapping the URL in yarl.URL with encoded=True (aiohttp uses yarl for URL processing):

    import asyncio
    import aiohttp
    import yarl
    
    
    async def download_image(url: str, session: aiohttp.ClientSession):
        """
        Args:
            url (str): image url
            session (aiohttp.ClientSession): Using a common aiohttp session for speedup
        """
    
        # encoded=True tells yarl not to re-encode the already percent-encoded URL,
        # so aiohttp sends it to the server exactly as given
        url = yarl.URL(url, encoded=True)
        res = await session.request(method="GET", url=url)
    
        print(res)
        print()
    
    
    if __name__ == "__main__":
        image_url = "https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format%2Ccompress&q=75&sharp=10&rect=0%2C15%2C1200%2C600&s=13645e838fd09f2552c8f8500410abec"
        image_urls = [image_url] * 1  # I just increase the number for testing
    
        async def main():
            async with aiohttp.ClientSession(trust_env=True) as session:
                tasks = []
                for url in image_urls:
                    task = asyncio.create_task(download_image(url=url, session=session))
                    tasks.append(task)
    
                images = await asyncio.gather(*tasks)
    
        asyncio.run(main())
    

    Prints:

    <ClientResponse(https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format%2Ccompress&q=75&sharp=10&rect=0%2C15%2C1200%2C600&s=13645e838fd09f2552c8f8500410abec) [200 OK]>
    <CIMultiDictProxy('Content-Type': 'image/jpeg', 'Content-Length': '94266', 'Connection': 'keep-alive', 'Last-Modified': 'Wed, 20 Sep 2023 13:07:41 GMT', 'Cache-Control': 'public, max-age=315360001', 'Server': 'imgix', 'x-imgix-id': 'ec36b5116879ca860a0352578065a6c96481160e', 'X-Imgix-Render-Farm': '01.140360', 'Date': 'Wed, 18 Oct 2023 21:05:54 GMT', 'Accept-Ranges': 'bytes', 'Access-Control-Allow-Origin': '*', 'Timing-Allow-Origin': '*', 'Cross-Origin-Resource-Policy': 'cross-origin', 'X-Content-Type-Options': 'nosniff', 'X-Served-By': 'cache-sjc10040-SJC, cache-fra-eddf8230087-FRA', 'X-Cache': 'Miss from cloudfront', 'Via': '1.1 41b7bdf4fb536a6c72b9f49d9b6affe8.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'PRG50-C1', 'X-Amz-Cf-Id': 'iYy-EXyB519KYFu_luKo9bAMnMvANxcrHEj6-Sps0LYWJ5cx60Rbvg==', 'Age': '2447892')>
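
    For completeness, here's a sketch of how the fix could be folded back into the original download_image coroutine. This is only an illustration: is_url is stubbed out (the real helper isn't shown in the question), the MIME table is trimmed, and the URL is the same signed imgix URL from the question:

    import asyncio
    from io import BytesIO

    import aiohttp
    import yarl


    def is_url(url: str) -> bool:
        # Stand-in for the question's validator, kept trivial for a self-contained example.
        return url.startswith(("http://", "https://"))


    async def download_image(url: str, session: aiohttp.ClientSession):
        if not is_url(url):
            return None

        VALID_MIME_TYPES = {
            "image/jpeg": ".jpeg",
            "image/png": ".png",
            "image/gif": ".gif",
            "image/webp": ".webp",
        }

        # encoded=True makes yarl (and therefore aiohttp) send the URL exactly as given.
        # Without it, sequences such as %3A and %2C are decoded, as visible in the
        # 403 response URL above.
        raw_url = yarl.URL(url, encoded=True)

        async with session.get(raw_url) as res:
            mimetype = res.headers.get("Content-Type", "").split(";")[0].strip().lower()
            if mimetype not in VALID_MIME_TYPES:
                return None
            file_name = f"cimage{VALID_MIME_TYPES[mimetype]}"
            content = await res.read()

        return BytesIO(content), file_name, mimetype


    async def main():
        image_url = "https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format%2Ccompress&q=75&sharp=10&rect=0%2C15%2C1200%2C600&s=13645e838fd09f2552c8f8500410abec"
        async with aiohttp.ClientSession(trust_env=True) as session:
            result = await download_image(image_url, session)
            if result:
                data, file_name, mimetype = result
                print(file_name, mimetype, len(data.getvalue()), "bytes")


    if __name__ == "__main__":
        asyncio.run(main())

    Using session.get(...) as an async context manager also ensures the response is released back to the connection pool once the body has been read.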