I have four numpy arrays, each with about N = 5 million rows. Their shapes and dtypes are: a is (N, 3) float32, b is (N, 3) float32, c is (N, 4) uint8, and d is (N, 4) uint8.
I want to pack them into a single binary file, preserving the same row ordering.
This is what I have currently:
from io import BytesIO

buffer = BytesIO()
for row in zip(a, b, c, d):
    buffer.write(row[0].tobytes())
    buffer.write(row[1].tobytes())
    buffer.write(row[2].tobytes())
    buffer.write(row[3].tobytes())

with open(output_path, "wb") as f:
    f.write(buffer.getvalue())
Is there a more time-efficient method than looping over all N rows like this?
Edit: The binary file is consumed from different languages (e.g. JS, C++), and the reason I need to serialize it row by row (a[i], b[i], c[i], d[i]) is that I have to adhere to the file format (.splat).
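As a quick illustration of the resulting per-row layout (using the shapes listed above), each output row is just the concatenation of one row from each array:

import numpy as np

# Per-row layout of the output file (shapes as listed above):
#   a[i]: 3 x float32 -> 12 bytes
#   b[i]: 3 x float32 -> 12 bytes
#   c[i]: 4 x uint8   ->  4 bytes
#   d[i]: 4 x uint8   ->  4 bytes
row_bytes = 2 * 3 * np.dtype(np.float32).itemsize + 2 * 4 * np.dtype(np.uint8).itemsize
print(row_bytes)  # 32 bytes per row, N rows written back to back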
This is easy to implement with structured arrays: define a compound dtype whose fields mirror the per-row layout, copy the four arrays into the fields, and write the whole array out in one go.
def structured_save(path, a, b, c, d):
    arr = np.empty(
        len(a),
        dtype=np.dtype(
            [
                ("a", a.dtype, a.shape[1:]),
                ("b", b.dtype, b.shape[1:]),
                ("c", c.dtype, c.shape[1:]),
                ("d", d.dtype, d.shape[1:]),
            ]
        ),
    )
    arr["a"] = a
    arr["b"] = b
    arr["c"] = c
    arr["d"] = d
    arr.tofile(path)  # faster than f.write(arr.tobytes())
This is simple, but arr is a copy of a, b, c and d, and requires an additional 160 MB (32 bytes × 5 million rows) of memory. With such a large allocation, the impact on execution speed cannot be ignored.
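For reference, the size of that temporary copy can be checked directly from the structured dtype (a quick check, using the same shapes as in the benchmark below):

import numpy as np

# Structured dtype matching a, b: (N, 3) float32 and c, d: (N, 4) uint8.
dt = np.dtype([
    ("a", np.float32, (3,)),
    ("b", np.float32, (3,)),
    ("c", np.uint8, (4,)),
    ("d", np.uint8, (4,)),
])
print(dt.itemsize)              # 32 bytes per row
print(dt.itemsize * 5_000_000)  # 160_000_000 bytes, i.e. ~160 MB for the copy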
To address this issue, we can write in chunks, reusing one small structured buffer.
def chunked_structured_save(path, a, b, c, d, chunk_size=100_000):
    arr = np.empty(
        chunk_size,
        dtype=np.dtype(
            [
                ("a", a.dtype, a.shape[1:]),
                ("b", b.dtype, b.shape[1:]),
                ("c", c.dtype, c.shape[1:]),
                ("d", d.dtype, d.shape[1:]),
            ]
        ),
    )
    with open(path, "wb") as f:
        for i in range(0, len(a), chunk_size):
            n_elem = len(a[i : i + chunk_size])
            arr["a"][:n_elem] = a[i : i + chunk_size]
            arr["b"][:n_elem] = b[i : i + chunk_size]
            arr["c"][:n_elem] = c[i : i + chunk_size]
            arr["d"][:n_elem] = d[i : i + chunk_size]
            arr[:n_elem].tofile(f)
It is a little more complicated, but it is superior in both memory usage and execution speed.
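If you would rather avoid both the temporary copy and the explicit chunking loop, another option (a sketch along the same lines, not benchmarked here; the function name is just for illustration) is to memory-map the output file with the same structured dtype and assign the fields directly, letting the OS flush the pages to disk:

import numpy as np

def memmap_structured_save(path, a, b, c, d):
    # Sketch: write through a memory-mapped structured array instead of an in-memory copy.
    dtype = np.dtype(
        [
            ("a", a.dtype, a.shape[1:]),
            ("b", b.dtype, b.shape[1:]),
            ("c", c.dtype, c.shape[1:]),
            ("d", d.dtype, d.shape[1:]),
        ]
    )
    out = np.memmap(path, dtype=dtype, mode="w+", shape=(len(a),))
    out["a"] = a
    out["b"] = b
    out["c"] = c
    out["d"] = d
    out.flush()  # push everything to disk before the memmap is released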
Benchmark:
import time
from io import BytesIO
from pathlib import Path

import numpy as np


def baseline(path, a, b, c, d):
    buffer = BytesIO()
    for row in zip(a, b, c, d):
        buffer.write(row[0].tobytes())
        buffer.write(row[1].tobytes())
        buffer.write(row[2].tobytes())
        buffer.write(row[3].tobytes())
    with open(path, "wb") as f:
        f.write(buffer.getvalue())


def structured_save(path, a, b, c, d):
    arr = np.empty(
        len(a),
        dtype=np.dtype(
            [
                ("a", a.dtype, a.shape[1:]),
                ("b", b.dtype, b.shape[1:]),
                ("c", c.dtype, c.shape[1:]),
                ("d", d.dtype, d.shape[1:]),
            ]
        ),
    )
    arr["a"] = a
    arr["b"] = b
    arr["c"] = c
    arr["d"] = d
    arr.tofile(path)


def chunked_structured_save(path, a, b, c, d, chunk_size=100_000):
    arr = np.empty(
        chunk_size,
        dtype=np.dtype(
            [
                ("a", a.dtype, a.shape[1:]),
                ("b", b.dtype, b.shape[1:]),
                ("c", c.dtype, c.shape[1:]),
                ("d", d.dtype, d.shape[1:]),
            ]
        ),
    )
    with open(path, "wb") as f:
        for i in range(0, len(a), chunk_size):
            n_elem = len(a[i : i + chunk_size])
            arr["a"][:n_elem] = a[i : i + chunk_size]
            arr["b"][:n_elem] = b[i : i + chunk_size]
            arr["c"][:n_elem] = c[i : i + chunk_size]
            arr["d"][:n_elem] = d[i : i + chunk_size]
            arr[:n_elem].tofile(f)


def main():
    n = 5_000_000
    a = np.arange(n * 3).reshape(n, 3).astype(np.float32)
    b = np.arange(n * 3).reshape(n, 3).astype(np.float32)
    c = np.arange(n * 4).reshape(n, 4).astype(np.uint8)
    d = np.arange(n * 4).reshape(n, 4).astype(np.uint8)

    candidates = [
        baseline,
        structured_save,
        chunked_structured_save,
    ]
    name_len = max(len(f.__name__) for f in candidates)

    path = Path("temp.bin")
    baseline(path, a, b, c, d)
    expected = path.read_bytes()

    start = time.perf_counter()
    np.savez(path, a=a, b=b, c=c, d=d)
    print(f"{'np.savez (reference)':{name_len}}: {time.perf_counter() - start}")

    for f in candidates:
        started = time.perf_counter()
        f(path, a, b, c, d)
        elapsed = time.perf_counter() - started
        print(f"{f.__name__:{name_len}}: {elapsed}")
        assert path.read_bytes() == expected, f"{f.__name__} failed"


if __name__ == "__main__":
    main()
Result:
np.savez (reference)   : 0.8915687190001336
baseline               : 5.309623991999615
structured_save        : 0.2205286160005926
chunked_structured_save: 0.18220391599970753
Note that the result will vary greatly depending on the environment.
Here is the result on my other machine:
np.savez (reference)   : 0.1376536000170745
baseline               : 3.5804199000122026
structured_save        : 0.4771533999883104
chunked_structured_save: 0.13709589999052696
It should also be noted that none of the above solutions take endianness into account.
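If the consumers expect a fixed byte order (JS typed arrays, for example, use the platform's byte order, which is little-endian on virtually all current hardware), you can make it explicit by spelling out the field dtypes instead of reusing a.dtype. A minimal sketch, assuming float32/uint8 inputs as in the benchmark:

import numpy as np

# Force little-endian float32 fields ("<f4"); uint8 ("u1") has no byte order.
le_dtype = np.dtype(
    [
        ("a", "<f4", (3,)),
        ("b", "<f4", (3,)),
        ("c", "u1", (4,)),
        ("d", "u1", (4,)),
    ]
)
# Assigning the source arrays into a structured array of this dtype
# (arr = np.empty(len(a), dtype=le_dtype); arr["a"] = a; ...) converts and
# byte-swaps as needed, so the file layout no longer depends on the host.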