Tags: python, numpy, packing

What is the most efficient way to pack various numpy arrays into a binary file


I have four numpy arrays with about N = 5 million rows each. Their shapes and dtypes are:

a: (N, 3) float32
b: (N, 3) float32
c: (N, 4) uint8
d: (N, 4) uint8

I want to pack them into a single binary file, preserving the row ordering.

This is what I have currently:

from io import BytesIO

buffer = BytesIO()
# interleave the rows: write a[i], b[i], c[i], d[i] for each i
for row in zip(a, b, c, d):
    buffer.write(row[0].tobytes())
    buffer.write(row[1].tobytes())
    buffer.write(row[2].tobytes())
    buffer.write(row[3].tobytes())

with open(output_path, "wb") as f:
    f.write(buffer.getvalue())

Is there a more time-efficient way to do this than looping over all N rows as above?

Edit: The binary file is used across different languages (e.g. JS, C++), and I need to serialize the data row by row (a[i], b[i], c[i], d[i]) to adhere to the .splat file format.
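For context, with the shapes above a single packed row is 32 bytes. Below is a minimal sketch of the per-row layout using the struct module (the field order and sizes follow the shapes listed above, not the official .splat specification):

import struct

# 3 x float32, 3 x float32, 4 x uint8, 4 x uint8 per row, little-endian
row_format = "<3f3f4B4B"
print(struct.calcsize(row_format))  # 32 bytes per row

# Packing one row (values converted to plain Python numbers for struct):
packed_row = struct.pack(
    row_format, *map(float, a[0]), *map(float, b[0]), *map(int, c[0]), *map(int, d[0])
)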


Solution

  • This is easy to implement with structured arrays.

    def structured_save(path, a, b, c, d):
        arr = np.empty(
            len(a),
            dtype=np.dtype(
                [
                    ("a", a.dtype, a.shape[1:]),
                    ("b", b.dtype, b.shape[1:]),
                    ("c", c.dtype, c.shape[1:]),
                    ("d", d.dtype, d.shape[1:]),
                ]
            ),
        )
        arr["a"] = a
        arr["b"] = b
        arr["c"] = c
        arr["d"] = d
        arr.tofile(path)  # faster than f.write(arr.tobytes())
    

    This is simple, but arr is a full copy of a, b, c, and d, which requires an additional 160 MB of memory (32 bytes × 5 million rows). With such a large allocation, the impact on execution speed cannot be ignored.
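
    To double-check where the 32 bytes per row come from, the itemsize of the structured dtype can be inspected directly. A small sketch using the same field layout as structured_save (shapes and dtypes taken from the benchmark below):

    import numpy as np

    row_dtype = np.dtype(
        [
            ("a", np.float32, (3,)),
            ("b", np.float32, (3,)),
            ("c", np.uint8, (4,)),
            ("d", np.uint8, (4,)),
        ]
    )
    print(row_dtype.itemsize)  # 32 bytes per row -> ~160 MB for 5 million rows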

    To address this issue, we can do chunked writing.

    def chunked_structured_save(path, a, b, c, d, chunk_size=100_000):
        arr = np.empty(
            chunk_size,
            dtype=np.dtype(
                [
                    ("a", a.dtype, a.shape[1:]),
                    ("b", b.dtype, b.shape[1:]),
                    ("c", c.dtype, c.shape[1:]),
                    ("d", d.dtype, d.shape[1:]),
                ]
            ),
        )
    
        with open(path, "wb") as f:
            for i in range(0, len(a), chunk_size):
                n_elem = len(a[i: i + chunk_size])
                arr["a"][:n_elem] = a[i: i + chunk_size]
                arr["b"][:n_elem] = b[i: i + chunk_size]
                arr["c"][:n_elem] = c[i: i + chunk_size]
                arr["d"][:n_elem] = d[i: i + chunk_size]
                arr[:n_elem].tofile(f)
    

    It is a little more complicated, but it is superior in both memory usage and execution speed.
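
    As a side note, the same structured dtype also makes it easy to read the file back in Python for verification. A minimal sketch, assuming the shapes and dtypes used in the benchmark below (other consumers simply read the fixed 32-byte records):

    import numpy as np

    def structured_load(path, count=-1):
        # Must match the field layout used when writing
        row_dtype = np.dtype(
            [
                ("a", np.float32, (3,)),
                ("b", np.float32, (3,)),
                ("c", np.uint8, (4,)),
                ("d", np.uint8, (4,)),
            ]
        )
        arr = np.fromfile(path, dtype=row_dtype, count=count)
        return arr["a"], arr["b"], arr["c"], arr["d"]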

    Benchmark:

    import time
    from io import BytesIO
    from pathlib import Path
    
    import numpy as np
    
    
    def baseline(path, a, b, c, d):
        buffer = BytesIO()
        for row in zip(a, b, c, d):
            buffer.write(row[0].tobytes())
            buffer.write(row[1].tobytes())
            buffer.write(row[2].tobytes())
            buffer.write(row[3].tobytes())
    
        with open(path, "wb") as f:
            f.write(buffer.getvalue())
    
    
    def structured_save(path, a, b, c, d):
        arr = np.empty(
            len(a),
            dtype=np.dtype(
                [
                    ("a", a.dtype, a.shape[1:]),
                    ("b", b.dtype, b.shape[1:]),
                    ("c", c.dtype, c.shape[1:]),
                    ("d", d.dtype, d.shape[1:]),
                ]
            ),
        )
        arr["a"] = a
        arr["b"] = b
        arr["c"] = c
        arr["d"] = d
        arr.tofile(path)
    
    
    def chunked_structured_save(path, a, b, c, d, chunk_size=100_000):
        arr = np.empty(
            chunk_size,
            dtype=np.dtype(
                [
                    ("a", a.dtype, a.shape[1:]),
                    ("b", b.dtype, b.shape[1:]),
                    ("c", c.dtype, c.shape[1:]),
                    ("d", d.dtype, d.shape[1:]),
                ]
            ),
        )
    
        with open(path, "wb") as f:
            for i in range(0, len(a), chunk_size):
                n_elem = len(a[i: i + chunk_size])
                arr["a"][:n_elem] = a[i: i + chunk_size]
                arr["b"][:n_elem] = b[i: i + chunk_size]
                arr["c"][:n_elem] = c[i: i + chunk_size]
                arr["d"][:n_elem] = d[i: i + chunk_size]
                arr[:n_elem].tofile(f)
    
    
    def main():
        n = 5_000_000
        a = np.arange((n * 3)).reshape(n, 3).astype(np.float32)
        b = np.arange((n * 3)).reshape(n, 3).astype(np.float32)
        c = np.arange((n * 4)).reshape(n, 4).astype(np.uint8)
        d = np.arange((n * 4)).reshape(n, 4).astype(np.uint8)
    
        candidates = [
            baseline,
            structured_save,
            chunked_structured_save,
        ]
        name_len = max(len(f.__name__) for f in candidates)
    
        path = Path("temp.bin")
        baseline(path, a, b, c, d)
        expected = path.read_bytes()
    
        start = time.perf_counter()
        np.savez(path, a=a, b=b, c=c, d=d)
        print(f"{'np.savez (reference)':{name_len}}: {time.perf_counter() - start}")
    
        for f in candidates:
            started = time.perf_counter()
            f(path, a, b, c, d)
            elapsed = time.perf_counter() - started
            print(f"{f.__name__:{name_len}}: {elapsed}")
            assert path.read_bytes() == expected, f"{f.__name__} failed"
    
    
    if __name__ == "__main__":
        main()
    

    Result:

    np.savez (reference)   : 0.8915687190001336
    baseline               : 5.309623991999615
    structured_save        : 0.2205286160005926
    chunked_structured_save: 0.18220391599970753
    

    Note that the result will vary greatly depending on the environment.

    Here is the result on my other machine:

    np.savez (reference)   : 0.1376536000170745
    baseline               : 3.5804199000122026
    structured_save        : 0.4771533999883104
    chunked_structured_save: 0.13709589999052696
    

    It should also be noted that none of the above solutions take endianness into account.
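
    If the consumers expect a specific byte order, it can be made explicit in the dtype with byte-order prefixes; on a little-endian machine the native dtypes above already produce little-endian output, but spelling it out keeps the files portable. A small sketch forcing little-endian floats (">" would force big-endian instead):

    import numpy as np

    le_dtype = np.dtype(
        [
            ("a", "<f4", (3,)),  # explicitly little-endian float32
            ("b", "<f4", (3,)),
            ("c", "u1", (4,)),   # single bytes have no byte order
            ("d", "u1", (4,)),
        ]
    )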