I'd like to use the Python Requests library to GET a file from a URL and use it as a multipart-encoded file in a POST request. The catch is that the file could be very large (50MB-2GB) and I don't want to load it into memory. (Context here.)
Following examples in the docs (multipart, stream down and stream up) I cooked up something like this:
with requests.get(big_file_url, stream=True) as f:
    requests.post(upload_url, files={'file': ('filename', f.content)})
but I'm not sure I'm doing it right. It is in fact throwing this error (redacted from the traceback):
with requests.get(big_file_url, stream=True) as f:
AttributeError: __exit__
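For what it's worth, dropping the with block makes the AttributeError go away (presumably the Response object in my requests version doesn't implement the context-manager protocol), but as far as I can tell requests still builds the whole multipart body in memory, so that alone doesn't solve my problem. A rough sketch of that variant, using the same placeholder names as above:

import requests

# Same idea without the context manager; big_file_url and upload_url are
# the same placeholders as above. r.raw is the underlying urllib3 stream,
# so the GET itself doesn't pull the file into memory up front.
r = requests.get(big_file_url, stream=True)
try:
    requests.post(upload_url, files={'file': ('filename', r.raw)})
finally:
    r.close()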
Any suggestions?
There actually is an issue about that on Kenneth Reitz's GitHub repo. I had the same problem (although I'm just uploading a local file), and I worked around it with a wrapper class that holds a list of streams corresponding to the different parts of the request. It exposes a read() method that iterates through the list and reads each part in turn, and it also computes the values needed for the headers (boundary and content-length):
# coding=utf-8
from __future__ import unicode_literals
from mimetools import choose_boundary
from requests.packages.urllib3.filepost import iter_fields, get_content_type
from io import BytesIO
import codecs

writer = codecs.lookup('utf-8')[3]


class MultipartUploadWrapper(object):

    def __init__(self, files):
        """
        Initializer

        :param files:
            A dictionary of files to upload, of the form {'file': ('filename', <file object>)}
        :type files:
            dict
        """
        super(MultipartUploadWrapper, self).__init__()
        self._cursor = 0
        self._body_parts = None
        self.content_type_header = None
        self.content_length_header = None
        self.create_request_parts(files)

    def create_request_parts(self, files):
        request_list = []
        boundary = choose_boundary()
        content_length = 0

        boundary_string = b'--%s\r\n' % (boundary)
        for fieldname, value in iter_fields(files):
            content_length += len(boundary_string)

            if isinstance(value, tuple):
                filename, data = value
                content_disposition_string = (
                    ('Content-Disposition: form-data; name="%s"; filename="%s"\r\n'
                     % (fieldname, filename))
                    + ('Content-Type: %s\r\n\r\n' % (get_content_type(filename))))
            else:
                data = value
                content_disposition_string = (
                    ('Content-Disposition: form-data; name="%s"\r\n' % (fieldname))
                    + 'Content-Type: text/plain\r\n\r\n')

            request_list.append(BytesIO(str(boundary_string + content_disposition_string)))
            content_length += len(content_disposition_string)

            if hasattr(data, 'read'):
                data_stream = data
            else:
                data_stream = BytesIO(str(data))

            # Measure the part's size without reading it into memory
            data_stream.seek(0, 2)
            data_size = data_stream.tell()
            data_stream.seek(0)

            request_list.append(data_stream)
            content_length += data_size

            end_string = b'\r\n'
            request_list.append(BytesIO(end_string))
            content_length += len(end_string)

        # Closing boundary
        request_list.append(BytesIO(b'--%s--\r\n' % (boundary)))
        content_length += len(boundary_string)

        # There's a bug in httplib.py that generates a UnicodeDecodeError on binary uploads if
        # there are *any* unicode strings passed into headers as part of the requests call.
        # For this reason all strings are explicitly converted to non-unicode at this point.
        self.content_type_header = {b'Content-Type': b'multipart/form-data; boundary=%s' % boundary}
        self.content_length_header = {b'Content-Length': str(content_length)}
        self._body_parts = request_list

    def read(self, chunk_size=0):
        remaining_to_read = chunk_size
        output_array = []
        while remaining_to_read > 0:
            body_part = self._body_parts[self._cursor]
            current_piece = body_part.read(remaining_to_read)
            length_read = len(current_piece)
            output_array.append(current_piece)
            if length_read < remaining_to_read:
                # we finished this piece but haven't read enough, moving on to the next one
                remaining_to_read -= length_read
                if self._cursor == len(self._body_parts) - 1:
                    break
                else:
                    self._cursor += 1
            else:
                break
        return b''.join(output_array)
So instead of passing a 'files' keyword argument, you pass this object as the 'data' argument to your request.
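For illustration, a rough usage sketch (the upload URL and file name here are made up, and I'm merging the two header dicts the wrapper exposes into the request headers myself):

import requests

# Hypothetical call site: 'upload_url' and 'big_file.bin' are placeholders.
with open('big_file.bin', 'rb') as f:
    wrapper = MultipartUploadWrapper({'file': ('big_file.bin', f)})
    headers = {}
    headers.update(wrapper.content_type_header)
    headers.update(wrapper.content_length_header)
    # httplib notices the wrapper's read() method and streams the body in chunks
    requests.post(upload_url, data=wrapper, headers=headers)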
I've cleaned up the code.