python tarfile ndjson

Process an ndjson file inside a tar file line by line, without decompressing it


I have Python code to process a big .ndjson file in a line-by-line manner:

import json

# Paths handed to open() below: the .ndjson input to read and the output file
# to write. Left empty here as placeholders.
inputFileDir = ""
outputFileDir = ""

def processHtml(html):
    """Placeholder transformation: ignores ``html`` and always returns 1."""
    return 1

# Stream the input one line at a time so the whole file is never held in memory.
with open(inputFileDir, "r", encoding="utf-8") as source, \
        open(outputFileDir, "w", encoding="utf-8") as sink:
    for line in source:
        # Each input line is one standalone JSON document.
        processed = processHtml(json.loads(line))
        # Write the result back as one JSON document per output line.
        json.dump(processed, sink)
        sink.write("\n")

In fact, the original is a .tar.gz file that contains that single big .ndjson file.

Is it possible to process the .ndjson file inside this .tar.gz file line by line, without first decompressing the .tar.gz file?


Solution

  • You can use tarfile.open() with mode "r:gz" to read directly from the .tar.gz file, without calling gzip.open() first.

    Let's say you have a file data.tar.gz that contains the file data.ndjson.

    import tarfile
    import json
    
    with tarfile.open("data.tar.gz", "r:gz") as archive:
        # extractfile() hands back a binary file-like object for the member;
        # iterating it yields one line (as bytes) at a time.
        for raw_line in archive.extractfile("data.ndjson"):
            html = json.loads(raw_line)
            print("html:", html)
            # html = process_html(html)
    

    Here is the full code I used for testing.

    The first part generates the file data.tar.gz containing data.ndjson; the second part reads it back line by line.

    import io
    import json
    import tarfile
    
    # --- create ---
    
    # Build the ndjson payload first: one JSON object per line, joined with "\n".
    lines = [json.dumps({"number": x, "text": "a" * x}) for x in range(10)]
    data = "\n".join(lines).encode()  # tar members are written from bytes

    with tarfile.open("data.tar.gz", "w:gz") as tar:
        # Use the public TarInfo constructor for the member header. The size
        # must be set by hand because the content comes from memory, not disk.
        tar_info = tarfile.TarInfo("data.ndjson")
        tar_info.size = len(data)

        # BytesIO(data) starts at offset 0, so no separate write()/seek(0)
        # is needed before handing it to addfile().
        tar.addfile(tar_info, io.BytesIO(data))
    
    # --- extract ---
    
    with tarfile.open("data.tar.gz", "r:gz") as archive:
        # Other ways to inspect the archive:
        #   archive.list()   prints a listing straight to sys.stdout,
        #                    so it needs no print()
        #   for entry in archive: print(entry.name, entry.size)
        #   archive.next()   is an alternative way to obtain a member

        member = archive.getmember("data.ndjson")
        print("name:", member.name)
        print("size:", member.size)

        # extractfile() also accepts the member name directly:
        #   archive.extractfile("data.ndjson")
        for raw_line in archive.extractfile(member):
            # raw_line is bytes; json.loads() takes it as-is
            # (use raw_line.decode() when a str is needed, e.g. for printing).
            record = json.loads(raw_line)
            print("json_data:", record)