The goal is to download images from the URLs asynchronously.
Here is the code for threading, which works fine:
import requests
import threading
import os
import time
import concurrent.futures as futures
def download_image(url, folder):
os.makedirs(folder, exist_ok=True)
filename = url.split('/')[-1]
filepath = os.path.join(folder, filename)
response = requests.get(url)
with open(filepath, "wb") as f:
f.write(response.content)
urls = [
"https://images-assets.nasa.gov/image/PIA03149/PIA03149~orig.jpg",
"https://upload.wikimedia.org/wikipedia/commons/3/37/African_Bush_Elephant.jpg",
"https://upload.wikimedia.org/wikipedia/commons/9/97/The_Earth_seen_from_Apollo_17.jpg",
"https://upload.wikimedia.org/wikipedia/commons/2/29/%22Arte_Colonial%22.jpg",
"https://upload.wikimedia.org/wikipedia/commons/d/d2/%22CUT%22_June_1929_04.jpg",
"https://upload.wikimedia.org/wikipedia/commons/8/82/%22CUT%22_June_1929_05.jpg",
"https://upload.wikimedia.org/wikipedia/commons/b/b1/%22Council_of_the_Gods%22_in_Galleria_Borghese_%28Rome%29_ceiling.jpg",
"https://upload.wikimedia.org/wikipedia/commons/7/71/%22East_front_of_Court_House_and_Planter%27s_House.%22.jpg",
"https://upload.wikimedia.org/wikipedia/commons/b/b6/%22Greater_Germany%22._Major_administrative_divisions._July_1._1944._100_kms_-_btv1b531213280.jpg",
] * 2
start_time = time.time()
with futures.ThreadPoolExecutor(max_workers=3) as executor:
executor.map(download_image, urls, ["2a_threadpool"] * len(urls))
end_time = time.time()
print(f"ThreadPoolExecutor download time: {end_time - start_time:.2f} seconds")
Here is the code for asyncio, which is very slow:
import requests
import threading
import asyncio
import os
import time
import aiohttp
import aiofiles
async def download_image_async(url, folder, session):
async with session.get(url) as response:
data = await response.read()
filename = url.split('/')[-1]
filepath = os.path.join(folder, filename)
async with aiofiles.open(filepath, "wb") as f:
await f.write(data)
async def main():
async with aiohttp.ClientSession() as session:
tasks = [download_image_async(url, "3_asyncio", session) for url in urls]
await asyncio.gather(*tasks)
if __name__ == "__main__":
start_time = time.time()
asyncio.run(main())
end_time = time.time()
print(f"Asyncio download time: {end_time - start_time:.2f} seconds")
The images are quite large (more than 5 MB), and I can't find any mistakes in my code. Are there any possible improvements to the code? Or are large files a bottleneck? If so, why?
After testing your code, I noticed that your examples aren't downloading the complete files due to a download error. The files are there, but they can't be opened.
I also think that if you limit the number of threads to three, you should also reduce the number of concurrent calls within the second example.
In my comparison, both versions have a similar execution time.
import requests
import os
import time
from concurrent import futures
def download_image(url, target):
try:
with requests.get(url, stream=True, headers={'User-Agent': 'Downloader/1.0'}) as resp:
resp.raise_for_status()
with open(target, 'wb') as f:
for chunk in resp.iter_content(chunk_size=8192):
if not chunk: break
f.write(chunk)
except Exception as e:
print(f"Error retrieving {url}: {e}")
def url2dest(folder, url):
filename = url.split('/')[-1]
filepath = os.path.join(folder, filename)
return filepath
def main():
urls = [
"https://images-assets.nasa.gov/image/PIA03149/PIA03149~orig.jpg",
"https://upload.wikimedia.org/wikipedia/commons/3/37/African_Bush_Elephant.jpg",
"https://upload.wikimedia.org/wikipedia/commons/9/97/The_Earth_seen_from_Apollo_17.jpg",
"https://upload.wikimedia.org/wikipedia/commons/2/29/%22Arte_Colonial%22.jpg",
"https://upload.wikimedia.org/wikipedia/commons/d/d2/%22CUT%22_June_1929_04.jpg",
"https://upload.wikimedia.org/wikipedia/commons/8/82/%22CUT%22_June_1929_05.jpg",
"https://upload.wikimedia.org/wikipedia/commons/b/b1/%22Council_of_the_Gods%22_in_Galleria_Borghese_%28Rome%29_ceiling.jpg",
"https://upload.wikimedia.org/wikipedia/commons/7/71/%22East_front_of_Court_House_and_Planter%27s_House.%22.jpg",
"https://upload.wikimedia.org/wikipedia/commons/b/b6/%22Greater_Germany%22._Major_administrative_divisions._July_1._1944._100_kms_-_btv1b531213280.jpg",
] * 2
folder = '2a_threadpool'
os.makedirs(folder, exist_ok=True)
with futures.ThreadPoolExecutor(max_workers=3) as executor:
executor.map(lambda u: download_image(u, url2dest(folder, u)), urls)
if __name__ == '__main__':
bgn_time = time.time()
main()
end_time = time.time()
print(f"ThreadPoolExecutor download time: {end_time - bgn_time:.2f} seconds")
from functools import wraps
import aiofiles
import aiohttp
import asyncio
import os
import time
def limit_concurrency(limit=10):
sem = asyncio.Semaphore(limit)
def executor(func):
@wraps(func)
async def wrapper(*args, **kwargs):
async with sem:
return await func(*args, **kwargs)
return wrapper
return executor
@limit_concurrency(limit=3)
async def download(session: aiohttp.ClientSession, url: str, filename: str):
try:
async with session.get(url, headers={'User-Agent': 'Downloader/1.0'}) as response:
assert response.status == 200
async with aiofiles.open(filename, mode='wb') as fp:
await fp.write(await response.read())
except Exception as e:
print(f"Error retrieving {url}: {e}")
def url2dest(folder, url):
filename = url.split('/')[-1]
filepath = os.path.join(folder, filename)
return filepath
async def main():
urls = [
"https://images-assets.nasa.gov/image/PIA03149/PIA03149~orig.jpg",
"https://upload.wikimedia.org/wikipedia/commons/3/37/African_Bush_Elephant.jpg",
"https://upload.wikimedia.org/wikipedia/commons/9/97/The_Earth_seen_from_Apollo_17.jpg",
"https://upload.wikimedia.org/wikipedia/commons/2/29/%22Arte_Colonial%22.jpg",
"https://upload.wikimedia.org/wikipedia/commons/d/d2/%22CUT%22_June_1929_04.jpg",
"https://upload.wikimedia.org/wikipedia/commons/8/82/%22CUT%22_June_1929_05.jpg",
"https://upload.wikimedia.org/wikipedia/commons/b/b1/%22Council_of_the_Gods%22_in_Galleria_Borghese_%28Rome%29_ceiling.jpg",
"https://upload.wikimedia.org/wikipedia/commons/7/71/%22East_front_of_Court_House_and_Planter%27s_House.%22.jpg",
"https://upload.wikimedia.org/wikipedia/commons/b/b6/%22Greater_Germany%22._Major_administrative_divisions._July_1._1944._100_kms_-_btv1b531213280.jpg",
] * 2
folder = '3_asyncio'
os.makedirs(folder, exist_ok=True)
async with aiohttp.ClientSession() as session:
tasks = [download(session, url, url2dest(folder, url)) for url in urls]
await asyncio.gather(*tasks)
if __name__ == '__main__':
bgn_time = time.time()
asyncio.run(main())
end_time = time.time()
print(f"Asyncio download time: {end_time - bgn_time:.2f} seconds")