python file-io h5py kedro fsspec

Unable to Save Data to HDF5 File using fsspec in Python


I'm trying to save data (3D numpy array) to an HDF5 file using fsspec in Python, but I'm encountering issues and am unable to write the data to the file successfully. The bigger picture is that I am trying to amend this dataset class to load/write video data. Is it possible to use h5py and fsspec simultaneously, and is it even advisable?

The root issue seems to be this spec. But can I work around it or am I doing something wrong?

My first idea was to save the data as follows:

import numpy as np
import fsspec
import h5py


# Attempt 1: pass the fsspec file object directly to h5py as the HDF5 target.
data = np.random.rand(100, 100)

with fsspec.open("./file", mode="wb") as fs_file:
    # The traceback below is raised when this inner `with` exits: closing the
    # h5py file flushes it, and that flush ends in a truncate call which the
    # file object rejects (UnsupportedOperation: truncate).
    with h5py.File(fs_file, mode="w") as h5_file:
        h5_file.create_dataset("video", data=data)

-----------------------------------------------------------------
UnsupportedOperation                      Traceback (most recent call last)
Cell In[8], line 4
      1 data = np.random.rand(100, 100)
      3 with fsspec.open("./file", mode="wb") as fs_file:
----> 4     with h5py.File(fs_file, mode="w") as h5_file:
      5         h5_file.create_dataset("video", data=data)

File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()

File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()

File ~/miniconda3/envs/kedro-environment/lib/python3.10/site-packages/h5py/_hl/files.py:604, in File.__exit__(self, *args)
    601 @with_phil
    602 def __exit__(self, *args):
    603     if self.id:
--> 604         self.close()

File ~/miniconda3/envs/kedro-environment/lib/python3.10/site-packages/h5py/_hl/files.py:586, in File.close(self)
    580 if self.id.valid:
    581     # We have to explicitly murder all open objects related to the file
    582
    583     # Close file-resident objects first, then the files.
    584     # Otherwise we get errors in MPI mode.
    585     self.id._close_open_objects(h5f.OBJ_LOCAL | ~h5f.OBJ_FILE)
...
File h5py/h5fd.pyx:185, in h5py.h5fd.H5FD_fileobj_flush()

File h5py/h5fd.pyx:180, in h5py.h5fd.H5FD_fileobj_truncate()
UnsupportedOperation: truncate

My second idea was to first get the binary representation of the h5 file through an in-memory file but something is wrong and I am unsure if I can get a binary representation via h5py:

import h5py
import numpy as np
import fsspec

# Attempt 2: build the HDF5 file with the in-memory "core" driver, then write
# its raw file image out through fsspec.
dataset = np.random.rand(100, 100)

# Create an in-memory HDF5 file
with h5py.File("in_memory_file.h5", driver="core", backing_store=False, mode='w') as h5file:
    # Create the dataset within the in-memory file
    h5file.create_dataset("video", data=dataset)

    # Save the binary representation to a file
    with fsspec.open("binary_representation.h5", "wb") as file:
        # NOTE(review): the image is taken while h5file is still open and
        # without a prior h5file.flush(); this is possibly why the reopen
        # below fails -- confirm by flushing before get_file_image().
        file.write(h5file.id.get_file_image())
    
    # Explicit close inside the `with` block; the context manager would also
    # close the file on exit, so this call is redundant.
    h5file.close()
    
# Now try to open the saved binary file
# (this open is the line that raises the OSError shown below)
with h5py.File("binary_representation.h5", "r") as h5file:
    dataset = h5file["video"]

    # Perform any desired operations with the dataset

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Cell In[1], line 20
     17     h5file.close()
     19 # Now try to open the saved binary file
---> 20 with h5py.File("binary_representation.h5", "r") as h5file:
     21     # Access the dataset
     22     dataset = h5file["video"]
     24     # Perform any desired operations with the dataset

File ~/miniconda3/envs/kedro-environment/lib/python3.10/site-packages/h5py/_hl/files.py:567, in File.__init__(self, name, mode, driver, libver, userblock_size, swmr, rdcc_nslots, rdcc_nbytes, rdcc_w0, track_order, fs_strategy, fs_persist, fs_threshold, fs_page_size, page_buf_size, min_meta_keep, min_raw_keep, locking, alignment_threshold, alignment_interval, meta_block_size, **kwds)
    558     fapl = make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0,
    559                      locking, page_buf_size, min_meta_keep, min_raw_keep,
    560                      alignment_threshold=alignment_threshold,
    561                      alignment_interval=alignment_interval,
    562                      meta_block_size=meta_block_size,
    563                      **kwds)
    564     fcpl = make_fcpl(track_order=track_order, fs_strategy=fs_strategy,
    565                      fs_persist=fs_persist, fs_threshold=fs_threshold,
    566                      fs_page_size=fs_page_size)
--> 567     fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
    569 if isinstance(libver, tuple):
    570     self._libver = libver

File ~/miniconda3/envs/kedro-environment/lib/python3.10/site-packages/h5py/_hl/files.py:231, in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)
...
File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()

File h5py/h5f.pyx:106, in h5py.h5f.open()

OSError: Unable to open file (bad object header version number)

Currently I have it implemented as:

# Current workaround: open the HDF5 file on top of the fsspec file object
# without a context manager. `save_path` and `data` come from surrounding code
# that is not shown here.
with fsspec.open(save_path, mode='wb') as fs_file:
    # h5_file is never closed inside this block; the ignored
    # "truncate of closed file" error quoted below is reported from h5py's
    # ObjectID.__dealloc__.
    h5_file = h5py.File(fs_file, mode="w")
    h5_file.create_dataset("video", data=data)

which works (kind of) but gives me an ignored error every time, which I am very unsure about:

ValueError: truncate of closed file
Exception ignored in: 'h5py._objects.ObjectID.__dealloc__'
Traceback (most recent call last):
  File "h5py/_objects.pyx", line 201, in h5py._objects.ObjectID.__dealloc__
  File "h5py/h5fd.pyx", line 180, in h5py.h5fd.H5FD_fileobj_truncate
ValueError: truncate of closed file

I would appreciate any insights or suggestions on how to resolve this issue and successfully save the data. Is there an alternative approach or additional steps I need to take to ensure the data is written correctly?

Thank you in advance for your help!


Solution

  • Meanwhile, I figured out how to save the binary representation of h5py.File, which was described here:

    import h5py
    import numpy as np
    import fsspec
    import io
    
    # Create a sample dataset
    dataset = np.random.rand(100, 100)

    # Let h5py build the complete HDF5 file in an in-memory buffer first.
    # h5py needs to truncate its target when the file is closed; io.BytesIO
    # supports truncate(), so this close succeeds.
    bio = io.BytesIO()
    with h5py.File(bio, "w") as f:
        f.create_dataset("video", data=dataset)

    # Read the buffer only after the h5py file has been closed, so that
    # everything h5py writes on close is included.
    data = bio.getvalue()  # data is a regular Python bytes object.

    # fsspec now only has to perform a plain write of the finished bytes.
    with fsspec.open("bytes_io.h5", "wb") as file:
        file.write(data)

    # Now try to open the saved binary file
    # NOTE(review): this reopens by local path; for a non-local fsspec target
    # a file object from fsspec.open(..., "rb") could presumably be passed to
    # h5py.File instead -- confirm against the h5py file-like object docs.
    with h5py.File("bytes_io.h5", "r") as h5file:
        # Copy the values into memory with [()]: a bare h5file["video"] handle
        # becomes unusable as soon as this block closes the file.
        dataset = h5file["video"][()]