I'm trying to save data (a 3D numpy array) to an HDF5 file using fsspec in Python, but I have been unable to write the data to the file successfully. The bigger picture is that I am trying to amend this dataset class to load/write video data. Is it possible to use h5py and fsspec together, and is it even advisable?
The root issue seems to be this spec. But can I work around it or am I doing something wrong?
My first idea was to save the data as follows:
import numpy as np
import fsspec
import h5py
# Random sample array standing in for the real data.
data = np.random.rand(100, 100)
# Open the target through fsspec in binary-write mode and hand the resulting
# file object to h5py as the backing file.
with fsspec.open("./file", mode="wb") as fs_file:
    # The traceback below is raised while leaving this inner block: closing
    # the h5py file ends in a truncate call on fs_file, which is reported as
    # an unsupported operation.
    with h5py.File(fs_file, mode="w") as h5_file:
        h5_file.create_dataset("video", data=data)
-----------------------------------------------------------------
UnsupportedOperation                      Traceback (most recent call last)
Cell In[8], line 4
      1 data = np.random.rand(100, 100)
      3 with fsspec.open("./file", mode="wb") as fs_file:
----> 4     with h5py.File(fs_file, mode="w") as h5_file:
      5         h5_file.create_dataset("video", data=data)
File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()
File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()
File ~/miniconda3/envs/kedro-environment/lib/python3.10/site-packages/h5py/_hl/files.py:604, in File.__exit__(self, *args)
    601 @with_phil
    602 def __exit__(self, *args):
    603     if self.id:
--> 604         self.close()
File ~/miniconda3/envs/kedro-environment/lib/python3.10/site-packages/h5py/_hl/files.py:586, in File.close(self)
    580 if self.id.valid:
    581     # We have to explicitly murder all open objects related to the file
    582
    583     # Close file-resident objects first, then the files.
    584     # Otherwise we get errors in MPI mode.
    585     self.id._close_open_objects(h5f.OBJ_LOCAL | ~h5f.OBJ_FILE)
...
File h5py/h5fd.pyx:185, in h5py.h5fd.H5FD_fileobj_flush()
File h5py/h5fd.pyx:180, in h5py.h5fd.H5FD_fileobj_truncate()
UnsupportedOperation: truncate
My second idea was to first get the binary representation of the h5 file through an in-memory file but something is wrong and I am unsure if I can get a binary representation via h5py:
import h5py
import numpy as np
import fsspec
# Random sample array standing in for the real data.
dataset = np.random.rand(100, 100)
# Create an in-memory HDF5 file
with h5py.File("in_memory_file.h5", driver="core", backing_store=False, mode='w') as h5file:
    # Create the dataset within the in-memory file
    h5file.create_dataset("video", data=dataset)
    # Save the binary representation to a file
    # NOTE(review): nesting reconstructed from a paste that lost its
    # indentation. The image is requested while h5file is still open and
    # before any explicit flush -- presumably why the saved copy cannot be
    # reopened below; confirm against the h5py documentation.
    with fsspec.open("binary_representation.h5", "wb") as file:
        file.write(h5file.id.get_file_image())
    h5file.close()
# Now try to open the saved binary file
with h5py.File("binary_representation.h5", "r") as h5file:
    dataset = h5file["video"]
    # Perform any desired operations with the dataset
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
Cell In[1], line 20
17 h5file.close()
19 # Now try to open the saved binary file
---> 20 with h5py.File("binary_representation.h5", "r") as h5file:
21 # Access the dataset
22 dataset = h5file["video"]
24 # Perform any desired operations with the dataset
File ~/miniconda3/envs/kedro-environment/lib/python3.10/site-packages/h5py/_hl/files.py:567, in File.__init__(self, name, mode, driver, libver, userblock_size, swmr, rdcc_nslots, rdcc_nbytes, rdcc_w0, track_order, fs_strategy, fs_persist, fs_threshold, fs_page_size, page_buf_size, min_meta_keep, min_raw_keep, locking, alignment_threshold, alignment_interval, meta_block_size, **kwds)
558 fapl = make_fapl(driver, libver, rdcc_nslots, rdcc_nbytes, rdcc_w0,
559 locking, page_buf_size, min_meta_keep, min_raw_keep,
560 alignment_threshold=alignment_threshold,
561 alignment_interval=alignment_interval,
562 meta_block_size=meta_block_size,
563 **kwds)
564 fcpl = make_fcpl(track_order=track_order, fs_strategy=fs_strategy,
565 fs_persist=fs_persist, fs_threshold=fs_threshold,
566 fs_page_size=fs_page_size)
--> 567 fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
569 if isinstance(libver, tuple):
570 self._libver = libver
File ~/miniconda3/envs/kedro-environment/lib/python3.10/site-packages/h5py/_hl/files.py:231, in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)
...
File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()
File h5py/h5f.pyx:106, in h5py.h5f.open()
OSError: Unable to open file (bad object header version number)
Currently I have it implemented as:
# Write `data` as the "video" dataset of an HDF5 file whose backing file
# object is opened through fsspec at `save_path`.
with fsspec.open(save_path, mode='wb') as fs_file:
    h5_file = h5py.File(fs_file, mode="w")
    h5_file.create_dataset("video", data=data)
    # NOTE(review): h5_file is never closed explicitly, so it is presumably
    # finalized only after fs_file has already been closed -- which would
    # match the ignored "truncate of closed file" error reported below;
    # confirm.
which works (kind of) but gives me an ignored error every time, which I am very unsure about:
ValueError: truncate of closed file
Exception ignored in: 'h5py._objects.ObjectID.__dealloc__'
Traceback (most recent call last):
File "h5py/_objects.pyx", line 201, in h5py._objects.ObjectID.__dealloc__
File "h5py/h5fd.pyx", line 180, in h5py.h5fd.H5FD_fileobj_truncate
ValueError: truncate of closed file
I would appreciate any insights or suggestions on how to resolve this issue and successfully save the data. Is there an alternative approach or additional steps I need to take to ensure the data is written correctly?
Thank you in advance for your help!
Meanwhile, I figured out how to save the binary representation of an h5py.File, which was described here:
import h5py
import numpy as np
import fsspec
import io
# Sample array to store.
dataset = np.random.rand(100, 100)

# Let h5py build the complete HDF5 file inside an in-memory buffer.
buffer = io.BytesIO()
with h5py.File(buffer, 'w') as h5_out:
    h5_out.create_dataset("video", data=dataset)

# Once the h5py file is closed, the buffer holds the finished file as bytes.
data = buffer.getvalue()

# Ship those bytes to the destination through fsspec.
with fsspec.open("bytes_io.h5", "wb") as sink:
    sink.write(data)

# Check the result by reopening the written file directly with h5py.
with h5py.File("bytes_io.h5", "r") as h5_in:
    # Look up the stored dataset by name.
    dataset = h5_in["video"]