On a remote server, due to some limitations, I generated a gzipped tar archive split into 2000 MB parts, using the following command:
tar -cvzf - tdd*20210914*.csv | split -b 2000M - archives/20210914.tar.gz.part
Now I have a list of files: [20210914.tar.gz.partaa, 20210914.tar.gz.partab, 20210914.tar.gz.partac], and I need to extract all the part files on a Windows machine, using Python.
Using tar.extractall():
def extract(infile: str, path: str) -> None:
    """Extract a gzip-compressed tar archive into a directory.

    Args:
        infile: Path to a complete ``.tar.gz`` archive. A single piece of a
            split archive is not a valid archive on its own and will fail
            with ``EOFError`` or ``tarfile.ReadError``.
        path: Directory into which all archive members are extracted.
    """
    # The context manager closes the archive even when extraction raises,
    # so the file handle is not left open after a failed extract.
    with tarfile.open(infile, "r:gz") as tar:
        tar.extractall(path=path)
extract("20210914.tar.gz.partaa", path = "tmp") # the file passed in is the first of the part files
However, I am getting EOFError: Compressed file ended before the end-of-stream marker was reached
which is expected as (I suppose) there are two more files that need to be extracted.
My question: how to modify the function to read all files, and extract them in the same directory?
I've tried to directly pass the second file into the function, but the following error is raised:
OSError Traceback (most recent call last)
~\.conda\envs\python37\lib\tarfile.py in gzopen(cls, name, mode, fileobj, compresslevel, **kwargs)
1643 try:
-> 1644 t = cls.taropen(name, mode, fileobj, **kwargs)
1645 except OSError:
~\.conda\envs\python37\lib\tarfile.py in taropen(cls, name, mode, fileobj, **kwargs)
1620 raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
-> 1621 return cls(name, mode, fileobj, **kwargs)
1622
~\.conda\envs\python37\lib\tarfile.py in __init__(self, name, mode, fileobj, format, tarinfo, dereference, ignore_zeros, encoding, errors, pax_headers, debug, errorlevel, copybufsize)
1483 self.firstmember = None
-> 1484 self.firstmember = self.next()
1485
~\.conda\envs\python37\lib\tarfile.py in next(self)
2286 try:
-> 2287 tarinfo = self.tarinfo.fromtarfile(self)
2288 except EOFHeaderError as e:
~\.conda\envs\python37\lib\tarfile.py in fromtarfile(cls, tarfile)
1093
-> 1094 buf = tarfile.fileobj.read(BLOCKSIZE)
1095 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
~\.conda\envs\python37\lib\gzip.py in read(self, size)
286 raise OSError(errno.EBADF, "read() on write-only GzipFile object")
--> 287 return self._buffer.read(size)
288
~\.conda\envs\python37\lib\_compression.py in readinto(self, b)
67 with memoryview(b) as view, view.cast("B") as byte_view:
---> 68 data = self.read(len(byte_view))
69 byte_view[:len(data)] = data
~\.conda\envs\python37\lib\gzip.py in read(self, size)
473 self._init_read()
--> 474 if not self._read_gzip_header():
475 self._size = self._pos
~\.conda\envs\python37\lib\gzip.py in _read_gzip_header(self)
421 if magic != b'\037\213':
--> 422 raise OSError('Not a gzipped file (%r)' % magic)
423
OSError: Not a gzipped file (b'|\x19')
During handling of the above exception, another exception occurred:
ReadError Traceback (most recent call last)
<ipython-input-77-29d5169be949> in <module>
----> 1 extract("20210914.tar.gz.partab", path = "tmp") # where file is first file
<ipython-input-75-60cd4e78bf4e> in extract(infile, path, chunk, **kwargs)
1 def extract(infile : str, path : str, chunk : int = 2000, **kwargs):
----> 2 tar = tarfile.open(infile, "r:gz")
3 tar.extractall(path = path)
4 tar.close()
~\.conda\envs\python37\lib\tarfile.py in open(cls, name, mode, fileobj, bufsize, **kwargs)
1589 else:
1590 raise CompressionError("unknown compression type %r" % comptype)
-> 1591 return func(name, filemode, fileobj, **kwargs)
1592
1593 elif "|" in mode:
~\.conda\envs\python37\lib\tarfile.py in gzopen(cls, name, mode, fileobj, compresslevel, **kwargs)
1646 fileobj.close()
1647 if mode == 'r':
-> 1648 raise ReadError("not a gzip file")
1649 raise
1650 except:
ReadError: not a gzip file
split does what its name says: it splits a file into parts. You should first concatenate all the pieces you have, then treat the result as a normal *.tar.gz file. You can concatenate them using Python as follows. Create a file concater.py:
import shutil
import sys


def concatenate(part_names, target='total.tar.gz'):
    """Join split pieces, in the order given, into a single file.

    Args:
        part_names: Paths of the pieces, in the order they must be joined.
        target: Path of the combined file to create (overwritten if present).
    """
    with open(target, 'wb') as joined:
        for part_name in part_names:
            with open(part_name, 'rb') as part:
                # Copy in chunks rather than read() the whole piece at once,
                # so a multi-gigabyte part never has to fit in memory.
                shutil.copyfileobj(part, joined)


if __name__ == '__main__':
    # sys.argv[0] is the script name; the pieces to join follow it.
    concatenate(sys.argv[1:])
then do
python concater.py 20210914.tar.gz.partaa 20210914.tar.gz.partab 20210914.tar.gz.partac
which should create total.tar.gz
which is to be treated as just a single *.tar.gz file. sys.argv holds the current script name followed by the command-line arguments, so I skip the first element (i.e. the script name).