First of all, my thanks for the space.
I came here to look for help because I need to extract data from multiple .HDF5 files that are in a folder and put them into a single and unique Pandas dataframe. The columns are ['H1', 'L1', 'frequency_Hz'].
I shared the files on my Google Drive so they are easy to access: https://drive.google.com/drive/folders/1GwocMZeqZGyikZYgwvGvnnsVNn0B7aNW?usp=sharing
My code is below with my programming logic, but it is returning the error below. I appreciate any help!
# Imports
import glob
import h5py
import numpy as np
import pandas as pd
# Create a list to store the DataFrames of each HDF5 file:
dfs = []
# Get the list of training HDF5 files using glob:
arquivos_hdf5_treino = glob.glob('../Data/OriginalDatasets/train/*.hdf5')
# View the file list:
arquivos_hdf5_treino
['../Data/OriginalDatasets/train\\001121a05.hdf5',
'../Data/OriginalDatasets/train\\00a6db666.hdf5',
'../Data/OriginalDatasets/train\\00f36a6ac.hdf5',
'../Data/OriginalDatasets/train\\0197bacf8.hdf5',
'../Data/OriginalDatasets/train\\01b8b67f3.hdf5',
'../Data/OriginalDatasets/train\\01dba9731.hdf5',
'../Data/OriginalDatasets/train\\02887d232.hdf5',
'../Data/OriginalDatasets/train\\02c8f43f3.hdf5',
'../Data/OriginalDatasets/train\\0367dc82c.hdf5',
'../Data/OriginalDatasets/train\\0517ef7fe.hdf5',
'../Data/OriginalDatasets/train\\05c0675fe.hdf5',
'../Data/OriginalDatasets/train\\05cdc0769.hdf5',
'../Data/OriginalDatasets/train\\05f0aef12.hdf5',
'../Data/OriginalDatasets/train\\067b3fb4b.hdf5',
'../Data/OriginalDatasets/train\\06e321c6e.hdf5',
'../Data/OriginalDatasets/train\\08a060dad.hdf5',
'../Data/OriginalDatasets/train\\08c444d66.hdf5',
'../Data/OriginalDatasets/train\\0920a4276.hdf5',
'../Data/OriginalDatasets/train\\09531cde3.hdf5',
'../Data/OriginalDatasets/train\\097370861.hdf5',
'../Data/OriginalDatasets/train\\09e55aeba.hdf5',
'../Data/OriginalDatasets/train\\09ecddbba.hdf5',
'../Data/OriginalDatasets/train\\0ba188c57.hdf5',
'../Data/OriginalDatasets/train\\0bc8216f2.hdf5',
'../Data/OriginalDatasets/train\\0c55d030c.hdf5',
'../Data/OriginalDatasets/train\\0d0ad0b19.hdf5',
'../Data/OriginalDatasets/train\\0dc4c8ed0.hdf5',
'../Data/OriginalDatasets/train\\0e39a18bf.hdf5',
'../Data/OriginalDatasets/train\\0e60d4893.hdf5',
'../Data/OriginalDatasets/train\\0e66d0460.hdf5',
'../Data/OriginalDatasets/train\\0eb30f7c4.hdf5',
'../Data/OriginalDatasets/train\\0ebe28dd5.hdf5',
'../Data/OriginalDatasets/train\\0f53d8b96.hdf5',
'../Data/OriginalDatasets/train\\10dfa2ed6.hdf5',
'../Data/OriginalDatasets/train\\10eaa1cb2.hdf5',
'../Data/OriginalDatasets/train\\1185806d8.hdf5',
'../Data/OriginalDatasets/train\\119610501.hdf5',
'../Data/OriginalDatasets/train\\123594dc7.hdf5',
'../Data/OriginalDatasets/train\\1282f6c1f.hdf5',
'../Data/OriginalDatasets/train\\12f0fd6fd.hdf5',
'../Data/OriginalDatasets/train\\12f9824fa.hdf5',
'../Data/OriginalDatasets/train\\13a23148f.hdf5',
'../Data/OriginalDatasets/train\\13df1746e.hdf5',
'../Data/OriginalDatasets/train\\147cc5f92.hdf5',
'../Data/OriginalDatasets/train\\1510f75f9.hdf5',
'../Data/OriginalDatasets/train\\1523dcd0c.hdf5',
'../Data/OriginalDatasets/train\\1607fd753.hdf5',
'../Data/OriginalDatasets/train\\1748ad051.hdf5',
'../Data/OriginalDatasets/train\\177d1a100.hdf5',
'../Data/OriginalDatasets/train\\1796d0836.hdf5']
# Initializing the count of the number of hdf5 files:
numArquivo = 1
# Iterating over Training hdf5 files and extracting data:
for arquivo_hdf5 in arquivos_hdf5_treino:
with h5py.File(arquivo_hdf5, 'r') as arquivo:
# Printing the count of the number of hdf5 files on the screen:
print(f'Arquivo {numArquivo}')
# Creating the key list in HDF5 files:
keyList = list(arquivo.keys())[0]
# Creating the list of variables in HDF5 files:
varList = list(arquivo[keyList])
# Printing all datasets, also known as "keys":
print(f'Chave em {arquivo_hdf5}: {keyList}')
# Iterating over the datasets in the file:
for key in arquivo.keys():
# Printing the variables in the keys on the screen:
print(f'Variáveis na chave {key}: {varList}')
# Extracting the datasets:
dados = arquivo[key][:]
# Printing the dataset to the screen:
print(f'Dados no conjunto de dados {key}: {dados}')
# Converting data to a Pandas DataFrame:
df = pd.DataFrame(dados)
# Adding the DataFrame to the list:
dfs.append(df)
# Printing a blank line on the screen:
print()
# Incrementing the number of files:
numArquivo += 1
# Concatenating DataFrames into a single DataFrame:
resultado_final = pd.concat(dfs, ignore_index=True)
# Viewing the first lines:
print(resultado_final.head())
Arquivo 1
Chave em ../Data/OriginalDatasets/train\001121a05.hdf5: 001121a05
Variáveis na chave 001121a05: ['H1', 'L1', 'frequency_Hz']
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[7], line 27
24 print(f'Variáveis na chave {key}: {varList}')
26 # Extraindo os conjuntos de dados:
---> 27 dados = arquivo[key][:]
29 # Imprimindo na tela o conjunto de dados:
30 print(f'Dados no conjunto de dados {key}: {dados}')
File h5py\_objects.pyx:54, in h5py._objects.with_phil.wrapper()
File h5py\_objects.pyx:55, in h5py._objects.with_phil.wrapper()
File c:\Opt\Anaconda3\Lib\site-packages\h5py\_hl\group.py:330, in Group.__getitem__(self, name)
328 oid = h5o.open(self.id, self._e(name), lapl=self._lapl)
329 else:
--> 330 raise TypeError("Accessing a group is done with bytes or str, "
331 " not {}".format(type(name)))
333 otype = h5i.get_type(oid)
334 if otype == h5i.GROUP:
TypeError: Accessing a group is done with bytes or str, not <class 'slice'>
Let's start with the file schema. (You have to understand the schema first. Then you can read the data correctly.) Be careful with keys. h5py uses dictionary syntax for H5 objects. So a key can be a Dataset or a Group. Each key is the object name and it's value is the object. If you don't know the object type, you can test it with isinstance()
.
You have a slight misunderstanding of the file schema. I checked 3 files, and each has this schema:
001121a05
for 001121a05.hdf5
)H1
is a Group, L1
is a
Group and frequency_Hz
is a Dataset.H1
and L1
Groups have 2 datasets each: named SFTs
and
timestamps_GPS
.There are at 2-3 issues to address:
arquivo.keys()
, you are looping on the root level
objects (only a group named 001121a05
). That's why you get an error
about TypeError: Accessing a group
.arquivo[keyList].keys()
, you will get another error when you try to read
H1
and L1
as datasets (because they are groups).H1
and L1
.I modified your code to read the data in frequency_Hz
and load to a dataframe. It should get you pointed in the right direction. If you want the H1
and L1
data, you will need to load [H1][SFTs]
and [L1][SFTs]
as appropriate.
Also, I made a few other minor changes to simplify the code (For example, I used glob.iglob()
and enumerate()
). Also, I modified some variable names to clarify their meaning.
# Create a list to store the DataFrames of each HDF5 file:
dfs = []
# Iterating over Training hdf5 files and extracting data:
for Arquivo_cnt, arquivo_hdf5 in enumerate(glob.iglob('*.hdf5')):
with h5py.File(arquivo_hdf5, 'r') as arquivo:
# Printing the count of the number of hdf5 files on the screen:
print(f'Arquivo {Arquivo_cnt+1}')
# Get the root level key list and print:
rootkey = list(arquivo.keys())[0]
print(f'Chave em {arquivo_hdf5}: {rootkey}')
# Get a list of objects below the root level and print:
varList = list(arquivo[rootkey].keys())
print(f'Variáveis na chave {rootkey}: {varList}')
# Iterating over the objects under rootkey (groups and datasets):
for key in arquivo[rootkey].keys():
print(f'For object name {key}, object type: {type(arquivo[rootkey][key])}')
# only process datasets; skip groups
if isinstance(arquivo[rootkey][key], h5py.Dataset):
# Extract the dataset to an np.array:
dados = arquivo[rootkey][key][:]
# Print the data to the screen:
print(f'Dados no conjunto de dados {key}: \n{dados}')
# Load data to a Pandas DataFrame:
df = pd.DataFrame(dados)
# Add the DataFrame to the list:
dfs.append(df)
# Printing a blank line on the screen:
print()
# Concatenating DataFrames into a single DataFrame:
resultado_final = pd.concat(dfs, ignore_index=True)
# Viewing the first lines:
print(resultado_final.head())