As you see, I'm trying to make an multiprocess downloader. It works well untill I open the final file: it's broken. I check it but can't find any mistake. But It's probably the header download header is wrong and the Range value error. this is the code
class MultiprocessDownload:
def __init__(self, url, path, filename, thread_num):
self.url = url
self.path = path
self.filename = filename
self.thread_num = thread_num
self.threads = []
self.head = requests.head(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'}).headers
self.length = int(self.head.get('Content-Length', False))
print(self.length)
self.proc = []
self.lock = []
for i in range(self.thread_num):
self.proc.append(None)
self.lock.append(None)
if self.length == False: raise Exception('Thik file does not support multiprocess download')
num = self.length // self.thread_num
last = -1
for i in range(1, thread_num + 1):
self.threads.append([last + 1, num * i])
last = num * i
self.threads[-1:][0][1] += self.length % self.thread_num
print(self.threads)
def thread(self, num):
self.lock[num] = _thread.allocate_lock()
with self.lock[num]:
header = {'Range': f'bytes=' + str(self.threads[num][0]) + '-' + str(self.threads[num][1])}
print(header)
self.proc[num] = 0
req = requests.get(self.url, headers=header, stream = True)
blk_size = self.threads[num][1] - self.threads[num][0]
file = open('dl_block_' + str(num), 'wb')
i = 0
for chunk in req.iter_content(chunk_size=512):
if chunk:
file.write(chunk)
i += 1
self.proc[num] = i * 512 / blk_size
file.close()
return 0
def getDownloadInfo(self):
info = []
total = 0
for i in range(self.thread_num):
if None in self.proc: return 1
info.append(str(self.proc[i] * 100) + '%')
total += self.proc[i]
info.append(str(total / self.thread_num * 100) + '%') #[*threads_info, total_info]
return info
def run(self):
for i in range(self.thread_num):
print(i)
_thread.start_new_thread(self.thread, (i,))
time.sleep(10)
locked = 1
print(self.lock)
while locked:
print(self.getDownloadInfo())
locked = 0
for n in range(self.thread_num):
if None in self.lock:
locked = 1
continue
if self.lock[n].locked():
locked += 1
print(locked)
time.sleep(1)
for num in range(self.thread_num):
blk = open('dl_block_' + str(num), 'rb')
target = open(self.path + self.filename, 'ab')
target.write(blk.read())
target.close()
blk.close()
os.remove('dl_block_' + str(num))
print('file downloaded as', self.path + self.filename)
Some of its output: [[0, 48967091], [48967092, 97934182], [97934183, 146901273], [146901274, 195868364], [195868365, 244835455], [244835456, 293802546], [293802547, 342769637], [342769638, 391736728], [391736729, 440703819], [440703820, 489670910], [489670911, 538638001], [538638002, 587605092], [587605093, 636572183], [636572184, 685539274], [685539275, 734506365], [734506366, 783473471]] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 {'Range': 'bytes=48967092-97934182'} {'Range': 'bytes=244835456-293802546'}{'Range': 'bytes=195868365-244835455'} {'Range': 'bytes=489670911-538638001'}{'Range': 'bytes=391736729-440703819'}{'Range': 'bytes=342769638-391736728'}{'Range': 'bytes=0-48967091'}{'Range': 'bytes=97934183-146901273'} {'Range': 'bytes=146901274-195868364'}{'Range': 'bytes=440703820-489670910'}{'Range': 'bytes=293802547-342769637'}
{'Range': 'bytes=538638002-587605092'}{'Range': 'bytes=587605093-636572183'} {'Range': 'bytes=685539275-734506365'}
{'Range': 'bytes=636572184-685539274'}{'Range': 'bytes=734506366-783473471'}
the thread's range is quite strange but I can't find where's wrong.
It is expected that the program print out the range not in order, depending on which thread is started first rather than depending on which thread is create first.
You should change your code like this:
target = open(self.path + self.filename, 'wb')
for num in range(self.thread_num):
blk = open('dl_block_' + str(num), 'rb')
target.write(blk.read())
blk.close()
os.remove('dl_block_' + str(num))
target.close()
This change will create a target file when you first run it or rewrite the target file from the second running.
The code seems ugly although it works. More advice: