I have Python code that processes a big .ndjson file line by line:
import json
# Path of the input .ndjson file, opened for reading below (empty placeholder).
inputFileDir = ""
# Path of the output file that receives one processed JSON value per line.
outputFileDir = ""
def processHtml(html):
    """Placeholder for the per-record processing step.

    Args:
        html: The object decoded from one line of the input file.

    Returns:
        The value to serialize to the output file; this stub always
        returns 1 regardless of its input.
    """
    return 1
# Iterate over the input file object so only one line is handled at a time,
# and write each processed record back out as one JSON value per line.
with open(inputFileDir, "r", encoding="utf-8") as f, open(outputFileDir, "w", encoding="utf-8") as g:
    for jsonLine in f:
        if not jsonLine.strip():
            # json.loads() raises on blank lines (e.g. a stray empty line at
            # the end of the file), so skip them.
            continue
        html = json.loads(jsonLine)
        html = processHtml(html)
        json.dump(html, g)
        g.write("\n")
In fact, the original is a .tar.gz archive that contains that single big .ndjson file.
Is it possible to process the .ndjson file inside this .tar.gz archive line by line, without having to decompress the .tar.gz file first?
You can use `tarfile.open()` with mode `"r:gz"` to read directly from the .tar.gz archive; there is no need to call `gzip.open()` yourself.
Let's say you have a file data.tar.gz which contains the file data.ndjson:
import tarfile
import json
# Read data.ndjson straight out of data.tar.gz without unpacking it to disk.
with tarfile.open("data.tar.gz", "r:gz") as tar:
    data_file = tar.extractfile("data.ndjson")
    if data_file is None:
        # extractfile() returns None for members that are not regular
        # files or links (e.g. directories); fail with a clear message
        # instead of "NoneType is not iterable".
        raise ValueError("data.ndjson is not a regular file in the archive")
    # Iterating the returned file object yields one line (as bytes) at a
    # time; json.loads() accepts bytes directly.
    for json_line in data_file:
        html = json.loads(json_line)
        print("html:", html)
        # html = process_html(html)
Here is the full code which I used for testing.
The first part generates the file data.tar.gz containing data.ndjson; the second part reads it back line by line.
import io
import json
import tarfile
# --- create ---

# Build the NDJSON payload: one JSON object per line, encoded to bytes
# because the archive stores raw bytes.
lines = [json.dumps({"number": x, "text": "a" * x}) for x in range(10)]
data = "\n".join(lines).encode()

with tarfile.open("data.tar.gz", "w:gz") as tar:
    # In-memory file object holding the payload, positioned at the start.
    data_file = io.BytesIO(data)

    tar_info = tarfile.TarInfo("data.ndjson")
    # addfile() reads exactly `size` bytes from the file object, so the
    # size has to be set explicitly.
    tar_info.size = len(data)

    tar.addfile(tar_info, data_file)
# --- extract ---

with tarfile.open("data.tar.gz", "r:gz") as tar:
    # Other ways to inspect the archive:
    #   tar.list()  # prints directly to sys.stdout, so no print() is needed
    #   for tar_info in tar:
    #       print(tar_info.name, tar_info.size)
    #   tar_info = tar.next()

    tar_info = tar.getmember("data.ndjson")
    print("name:", tar_info.name)
    print("size:", tar_info.size)

    # extractfile() also accepts the member name directly:
    #   data_file = tar.extractfile("data.ndjson")
    data_file = tar.extractfile(tar_info)

    # Each line is read as bytes (use json_line.decode() to see the text);
    # json.loads() accepts bytes directly.
    for json_line in data_file:
        json_data = json.loads(json_line)
        print("json_data:", json_data)