python, python-2.7, python-requests

Using Python Requests to 'bridge' a file without loading into memory?


I'd like to use the Python Requests library to GET a file from a URL and use it as a multipart-encoded file in a POST request. The catch is that the file could be very large (50MB-2GB) and I don't want to load it into memory. (Context here.)

Following examples in the docs (multipart, stream down and stream up) I cooked up something like this:

    # NOTE(review): the traceback quoted below shows this `with` statement
    # failing with AttributeError: __exit__, i.e. the object returned by
    # requests.get() here does not support the context-manager protocol.
    with requests.get(big_file_url, stream=True) as f:
        # NOTE(review): per the Requests documentation, accessing `.content`
        # reads the whole response body into memory, which would defeat
        # stream=True -- confirm against the installed version.
        requests.post(upload_url, files={'file': ('filename', f.content)})

but I'm not sure I'm doing it right. It is in fact throwing this error (excerpted from the traceback):

    with requests.get(big_file_url, stream=True) as f:
    AttributeError: __exit__

Any suggestions?


Solution

  • There actually is an issue about that on Kenneth Reitz's GitHub repo. I had the same problem (although I'm just uploading a local file), so I added a wrapper class that holds a list of streams corresponding to the different parts of the request. It has a read() method that iterates through the list and reads each part in turn, and it also computes the values needed for the headers (boundary and content-length):

    # coding=utf-8
    
    from __future__ import unicode_literals
    from mimetools import choose_boundary
    from requests.packages.urllib3.filepost import iter_fields, get_content_type
    from io import BytesIO
    import codecs
    
    # StreamWriter class registered for the UTF-8 codec (the same object that
    # sits at index 3 of the tuple returned by codecs.lookup).
    writer = codecs.getwriter('utf-8')
    
    class MultipartUploadWrapper(object):
        """File-like object that streams a multipart/form-data request body.

        The body is held as an ordered list of streams (boundary + part
        headers, the payload streams themselves, and line terminators).
        :meth:`read` walks that list, so payload files are read lazily
        instead of being concatenated in memory.

        After construction, ``content_type_header`` and
        ``content_length_header`` each hold a single-entry dict with the
        matching request header.
        """

        def __init__(self, files):
            """
            Initializer

            :param files:
                A dictionary of files to upload, of the form {'file': ('filename', <file object>)}
            :type files:
                Dict
            """
            super(MultipartUploadWrapper, self).__init__()
            self._cursor = 0
            self._body_parts = None
            self.content_type_header = None
            self.content_length_header = None
            self.create_request_parts(files)

        def create_request_parts(self, files):
            """
            Build the list of body streams and the matching headers.

            :param files:
                Fields to send.  A tuple value is treated as
                ``(filename, data)``; any other value is sent as a plain
                ``text/plain`` field.  ``data`` may be an object with a
                ``read`` method (it must also support ``seek``/``tell`` and
                is rewound to offset 0) or a value that is converted to bytes.
            """
            boundary = self._to_bytes(choose_boundary())
            self._assemble(iter_fields(files), boundary)

        def _assemble(self, fields, boundary):
            """Assemble body streams and headers from (name, value) pairs."""
            opening = b'--' + boundary + b'\r\n'
            # The closing delimiter carries two extra dashes, so its length
            # must be counted separately from the per-part delimiter.
            closing = b'--' + boundary + b'--\r\n'
            part_end = b'\r\n'

            parts = []
            content_length = 0
            for fieldname, value in fields:
                header, data = self._part_header(fieldname, value)
                preamble = opening + header
                parts.append(BytesIO(preamble))
                # Count encoded bytes, not unicode characters.
                content_length += len(preamble)

                if hasattr(data, 'read'):
                    stream = data
                else:
                    stream = BytesIO(self._to_bytes(data))
                content_length += self._rewind_and_measure(stream)
                parts.append(stream)

                parts.append(BytesIO(part_end))
                content_length += len(part_end)

            parts.append(BytesIO(closing))
            content_length += len(closing)

            # There's a bug in httplib.py that generates a UnicodeDecodeError on binary uploads if
            # there are *any* unicode strings passed into headers as part of the requests call.
            # For this reason all strings are explicitly converted to non-unicode at this point.
            self.content_type_header = {b'Content-Type': b'multipart/form-data; boundary=' + boundary}
            self.content_length_header = {b'Content-Length': str(content_length)}
            self._body_parts = parts
            # A freshly built body is always read from its first part.
            self._cursor = 0

        def _part_header(self, fieldname, value):
            """Return ``(header_bytes, data)`` for one form field."""
            name = self._to_bytes(fieldname)
            if isinstance(value, tuple):
                filename, data = value
                header = (b'Content-Disposition: form-data; name="' + name
                          + b'"; filename="' + self._to_bytes(filename) + b'"\r\n'
                          + b'Content-Type: ' + self._to_bytes(get_content_type(filename))
                          + b'\r\n\r\n')
            else:
                data = value
                header = (b'Content-Disposition: form-data; name="' + name + b'"\r\n'
                          + b'Content-Type: text/plain\r\n\r\n')
            return header, data

        @staticmethod
        def _to_bytes(value):
            """Return *value* as a byte string (text is encoded as UTF-8)."""
            if isinstance(value, bytes):
                return value
            if isinstance(value, type(u'')):
                return value.encode('utf-8')
            # Anything else goes through str() first, as the original code did.
            return MultipartUploadWrapper._to_bytes(str(value))

        @staticmethod
        def _rewind_and_measure(stream):
            """Return the total size of *stream* and leave it at offset 0."""
            stream.seek(0, 2)
            size = stream.tell()
            stream.seek(0)
            return size

        def read(self, chunk_size=0):
            """
            Read up to ``chunk_size`` bytes of the request body.

            :param chunk_size:
                Maximum number of bytes to return.  With the default of 0
                nothing is read and an empty byte string is returned.
            :return:
                The next bytes of the body; empty once every part is exhausted.
            """
            remaining = chunk_size
            pieces = []
            parts = self._body_parts
            while remaining > 0 and self._cursor < len(parts):
                piece = parts[self._cursor].read(remaining)
                if not piece:
                    # Only an empty read means the part is exhausted; a short
                    # read may just be a stream returning less than requested.
                    self._cursor += 1
                    continue
                pieces.append(piece)
                remaining -= len(piece)
            return b''.join(pieces)
    

    So instead of passing a 'files' keyword argument, you pass this object as the 'data' argument of your request call, together with the Content-Type and Content-Length headers it computed.

    Edit

    I've cleaned up the code