python, async-await, concurrency, python-asyncio

Using Asyncio for downloading big files: 10 times slower than threading


The goal is to download images from the URLs asynchronously.

Here is the code for threading, which works fine:

import requests
import os
import time
import concurrent.futures as futures


def download_image(url, folder):
    os.makedirs(folder, exist_ok=True)
    filename = url.split('/')[-1]
    filepath = os.path.join(folder, filename)
    
    response = requests.get(url)
    with open(filepath, "wb") as f:
        f.write(response.content)


urls = [
    "https://images-assets.nasa.gov/image/PIA03149/PIA03149~orig.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/3/37/African_Bush_Elephant.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/9/97/The_Earth_seen_from_Apollo_17.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/2/29/%22Arte_Colonial%22.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/d/d2/%22CUT%22_June_1929_04.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/8/82/%22CUT%22_June_1929_05.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/b/b1/%22Council_of_the_Gods%22_in_Galleria_Borghese_%28Rome%29_ceiling.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/71/%22East_front_of_Court_House_and_Planter%27s_House.%22.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/b/b6/%22Greater_Germany%22._Major_administrative_divisions._July_1._1944._100_kms_-_btv1b531213280.jpg",
] * 2

start_time = time.time()
with futures.ThreadPoolExecutor(max_workers=3) as executor:
    executor.map(download_image, urls, ["2a_threadpool"] * len(urls))
end_time = time.time()
print(f"ThreadPoolExecutor download time: {end_time - start_time:.2f} seconds")

Here is the code for asyncio, which is very slow:

import asyncio
import os
import time
import aiohttp
import aiofiles


async def download_image_async(url, folder, session):
    async with session.get(url) as response:
        data = await response.read()
        filename = url.split('/')[-1]
        filepath = os.path.join(folder, filename)

        async with aiofiles.open(filepath, "wb") as f:
            await f.write(data)


async def main():
    os.makedirs("3_asyncio", exist_ok=True)
    async with aiohttp.ClientSession() as session:
        tasks = [download_image_async(url, "3_asyncio", session) for url in urls]
        await asyncio.gather(*tasks)

if __name__ == "__main__":
    start_time = time.time()
    asyncio.run(main())
    end_time = time.time()
    print(f"Asyncio download time: {end_time - start_time:.2f} seconds")

The images are quite large (more than 5 MB), and I can't find any mistakes in my code. Are there any possible improvements to the code? Or are large files a bottleneck? If so, why?


Solution

  • After testing your code, I noticed that neither of your examples downloads the files completely: the files are written to disk, but they can't be opened. The most likely cause is that Wikimedia rejects requests that lack a proper User-Agent header, so what gets saved is an error response rather than image data; that's why the code below sends one.
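
    A quick way to confirm this is to look at the first bytes of each saved file: a real JPEG always starts with the bytes FF D8 FF, so anything else (typically an HTML error page saved under a .jpg name) means the download failed. A minimal check, assuming the folder name from your threading example:

    import os

    folder = "2a_threadpool"
    for name in os.listdir(folder):
        with open(os.path.join(folder, name), "rb") as f:
            magic = f.read(3)
        # a valid JPEG begins with FF D8 FF; anything else is a failed download
        print(name, "ok" if magic == b"\xff\xd8\xff" else "corrupt")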

    I also think that if you limit the thread pool to three workers, you should cap the asyncio version at three concurrent downloads as well; otherwise the two measurements aren't comparable.

    With that in place, both versions have a similar execution time in my comparison, which makes sense: at the same concurrency level the downloads are bound by network bandwidth, not by the threading or asyncio machinery.

    import requests
    import os
    import time
    from concurrent import futures
    
    def download_image(url, target):
        try:
            with requests.get(url, stream=True, headers={'User-Agent': 'Downloader/1.0'}) as resp:
                resp.raise_for_status()
                with open(target, 'wb') as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        if chunk:  # empty chunks are keep-alives, not end-of-stream; don't abort on them
                            f.write(chunk)
        except Exception as e:
            print(f"Error retrieving {url}: {e}")
    
    def url2dest(folder, url):
        filename = url.split('/')[-1]
        filepath = os.path.join(folder, filename)
        return filepath
    
    def main():
        urls = [
            "https://images-assets.nasa.gov/image/PIA03149/PIA03149~orig.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/3/37/African_Bush_Elephant.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/9/97/The_Earth_seen_from_Apollo_17.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/2/29/%22Arte_Colonial%22.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/d/d2/%22CUT%22_June_1929_04.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/8/82/%22CUT%22_June_1929_05.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/b/b1/%22Council_of_the_Gods%22_in_Galleria_Borghese_%28Rome%29_ceiling.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/7/71/%22East_front_of_Court_House_and_Planter%27s_House.%22.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/b/b6/%22Greater_Germany%22._Major_administrative_divisions._July_1._1944._100_kms_-_btv1b531213280.jpg",
        ] * 2
        folder = '2a_threadpool'
        os.makedirs(folder, exist_ok=True)
        with futures.ThreadPoolExecutor(max_workers=3) as executor:
            executor.map(lambda u: download_image(u, url2dest(folder, u)), urls)
    
    
    if __name__ == '__main__':
        bgn_time = time.time()
        main()
        end_time = time.time()
        print(f"ThreadPoolExecutor download time: {end_time - bgn_time:.2f} seconds")
    
    from functools import wraps
    import aiofiles
    import aiohttp
    import asyncio 
    import os
    import time
    
    def limit_concurrency(limit=10):
        # one semaphore shared by every call to the decorated coroutine
        sem = asyncio.Semaphore(limit)
        def executor(func):
            @wraps(func)
            async def wrapper(*args, **kwargs):
                async with sem:
                    return await func(*args, **kwargs)
            return wrapper
        return executor
    
    @limit_concurrency(limit=3)
    async def download(session: aiohttp.ClientSession, url: str, filename: str):
        try:
            async with session.get(url, headers={'User-Agent': 'Downloader/1.0'}) as response:
                response.raise_for_status()  # raise instead of assert, so failures surface as errors
                async with aiofiles.open(filename, mode='wb') as fp:
                    # stream the body in chunks so a 5 MB+ image is never fully buffered in memory
                    async for chunk in response.content.iter_chunked(8192):
                        await fp.write(chunk)
        except Exception as e:
            print(f"Error retrieving {url}: {e}")
    
    def url2dest(folder, url):
        filename = url.split('/')[-1]
        filepath = os.path.join(folder, filename)
        return filepath
    
    async def main():
        urls = [
            "https://images-assets.nasa.gov/image/PIA03149/PIA03149~orig.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/3/37/African_Bush_Elephant.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/9/97/The_Earth_seen_from_Apollo_17.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/2/29/%22Arte_Colonial%22.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/d/d2/%22CUT%22_June_1929_04.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/8/82/%22CUT%22_June_1929_05.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/b/b1/%22Council_of_the_Gods%22_in_Galleria_Borghese_%28Rome%29_ceiling.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/7/71/%22East_front_of_Court_House_and_Planter%27s_House.%22.jpg",
            "https://upload.wikimedia.org/wikipedia/commons/b/b6/%22Greater_Germany%22._Major_administrative_divisions._July_1._1944._100_kms_-_btv1b531213280.jpg",
        ] * 2
        folder = '3_asyncio'
        os.makedirs(folder, exist_ok=True)
        async with aiohttp.ClientSession() as session:
            tasks = [download(session, url, url2dest(folder, url)) for url in urls]
            await asyncio.gather(*tasks)
    
    if __name__ == '__main__':
        bgn_time = time.time()
        asyncio.run(main())
        end_time = time.time()
        print(f"Asyncio download time: {end_time - bgn_time:.2f} seconds")