python windows tarfile multipartfile

Extract all part-files using python tarfile of format tar.gz.part*


On a remote server, due to some limitations, I generated a tar archive split into 2000 MB pieces using the following command:

tar -cvzf - tdd*20210914*.csv | split -b 2000M - archives/20210914.tar.gz.part

Now I have a list of files: [20210914.tar.gz.partaa, 20210914.tar.gz.partab, 20210914.tar.gz.partac], and I need to extract all the part-files on a Windows machine, using Python.

Using tar.extractall():

def extract(infile: str, path: str) -> None:
    """Extract every member of a gzip-compressed tar archive into ``path``.

    Args:
        infile: Path to a complete ``.tar.gz`` archive. A single piece
            produced by ``split`` is not a complete archive: the first
            piece ends mid-stream (``EOFError``) and later pieces do not
            start with a gzip header (``tarfile.ReadError``).
        path: Directory the members are extracted into.

    Raises:
        EOFError: The compressed stream ends before its end-of-stream marker.
        tarfile.ReadError: ``infile`` is not a gzip-compressed tar archive.
    """
    # The context manager closes the archive even when extraction fails
    # part-way (e.g. on a truncated file), so the file handle is not leaked.
    with tarfile.open(infile, "r:gz") as tar:
        tar.extractall(path=path)

# Only the first of the three part-files is passed in here.
extract(infile="20210914.tar.gz.partaa", path="tmp")

However, I am getting "EOFError: Compressed file ended before the end-of-stream marker was reached", which is expected since (I suppose) there are two more files that need to be extracted.

My question: how to modify the function to read all files, and extract them in the same directory?

I've tried to directly pass the second file into the function, but the following error is raised:

OSError                                   Traceback (most recent call last)
~\.conda\envs\python37\lib\tarfile.py in gzopen(cls, name, mode, fileobj, compresslevel, **kwargs)
   1643         try:
-> 1644             t = cls.taropen(name, mode, fileobj, **kwargs)
   1645         except OSError:

~\.conda\envs\python37\lib\tarfile.py in taropen(cls, name, mode, fileobj, **kwargs)
   1620             raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
-> 1621         return cls(name, mode, fileobj, **kwargs)
   1622 

~\.conda\envs\python37\lib\tarfile.py in __init__(self, name, mode, fileobj, format, tarinfo, dereference, ignore_zeros, encoding, errors, pax_headers, debug, errorlevel, copybufsize)
   1483                 self.firstmember = None
-> 1484                 self.firstmember = self.next()
   1485 

~\.conda\envs\python37\lib\tarfile.py in next(self)
   2286             try:
-> 2287                 tarinfo = self.tarinfo.fromtarfile(self)
   2288             except EOFHeaderError as e:

~\.conda\envs\python37\lib\tarfile.py in fromtarfile(cls, tarfile)
   1093         
-> 1094         buf = tarfile.fileobj.read(BLOCKSIZE)
   1095         obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)

~\.conda\envs\python37\lib\gzip.py in read(self, size)
    286             raise OSError(errno.EBADF, "read() on write-only GzipFile object")
--> 287         return self._buffer.read(size)
    288 

~\.conda\envs\python37\lib\_compression.py in readinto(self, b)
     67         with memoryview(b) as view, view.cast("B") as byte_view:
---> 68             data = self.read(len(byte_view))
     69             byte_view[:len(data)] = data

~\.conda\envs\python37\lib\gzip.py in read(self, size)
    473                 self._init_read()
--> 474                 if not self._read_gzip_header():
    475                     self._size = self._pos

~\.conda\envs\python37\lib\gzip.py in _read_gzip_header(self)
    421         if magic != b'\037\213':
--> 422             raise OSError('Not a gzipped file (%r)' % magic)
    423 

OSError: Not a gzipped file (b'|\x19')

During handling of the above exception, another exception occurred:

ReadError                                 Traceback (most recent call last)
<ipython-input-77-29d5169be949> in <module>
----> 1 extract("20210914.tar.gz.partab", path = "tmp") # where file is first file

<ipython-input-75-60cd4e78bf4e> in extract(infile, path, chunk, **kwargs)
      1 def extract(infile : str, path : str, chunk : int = 2000, **kwargs):
----> 2     tar = tarfile.open(infile, "r:gz")
      3     tar.extractall(path = path)
      4     tar.close()

~\.conda\envs\python37\lib\tarfile.py in open(cls, name, mode, fileobj, bufsize, **kwargs)
   1589             else:
   1590                 raise CompressionError("unknown compression type %r" % comptype)
-> 1591             return func(name, filemode, fileobj, **kwargs)
   1592 
   1593         elif "|" in mode:

~\.conda\envs\python37\lib\tarfile.py in gzopen(cls, name, mode, fileobj, compresslevel, **kwargs)
   1646             fileobj.close()
   1647             if mode == 'r':
-> 1648                 raise ReadError("not a gzip file")
   1649             raise
   1650         except:

ReadError: not a gzip file

Solution

  • split does what its name says: it splits a file into parts. You should first concatenate all the pieces you have, then treat the result as a normal *.tar.gz file. You can concatenate them using Python as follows; create a file concater.py:

    import shutil
    import sys


    def concat_parts(part_names, out_name='total.tar.gz'):
        """Join split archive pieces back into a single file.

        Args:
            part_names: Paths of the pieces, in the order they were split
                (their bytes are appended in exactly this order).
            out_name: Path of the merged file to create or overwrite.
        """
        with open(out_name, 'wb') as merged:
            for part_name in part_names:
                with open(part_name, 'rb') as part:
                    # Stream in chunks: each piece can be ~2000 MB, so
                    # reading it whole with read() would exhaust memory.
                    shutil.copyfileobj(part, merged)


    if __name__ == '__main__':
        # sys.argv[0] is the script name; the rest are the part-files.
        if len(sys.argv) < 2:
            sys.exit('usage: python concater.py PART [PART ...]')
        concat_parts(sys.argv[1:])
    

    then do

    python concater.py 20210914.tar.gz.partaa 20210914.tar.gz.partab 20210914.tar.gz.partac
    

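    which should create total.tar.gz, which can then be treated as a single, ordinary *.tar.gz file. Pass the parts in their original order (aa, ab, ac), because the bytes are appended in the order given. sys.argv holds the current script name followed by the command-line arguments, so I skip the first element (i.e. the script name).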