I am working with a dataset in the .mat format. The dataset contains nested arrays, and I need to use the data in CSV format.
What is the most effective way to convert this nested .mat dataset into CSV? Any guidance would be greatly appreciated. My dataset link: https://ora.ox.ac.uk/objects/uuid:03ba4b01-cfed-46d3-9b1a-7d4a7bdf6fac/files/m5ac36a1e2073852e4f1f7dee647909a7
import numpy as np
import pandas as pd
import scipy.io as sio
# Load the .mat file; the result is a dict keyed by MATLAB variable name
# (plus '__header__', '__version__' and '__globals__' metadata entries).
mat = sio.loadmat('Oxford_Battery_Degradation_Dataset_1.mat')
# Bare expression: presumably run in a notebook/REPL, where it echoes the dict.
mat
My output:
{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Mon Jun 05 11:16:25 2017',
'__version__': '1.0',
'__globals__': [],
'Cell1': array([[(array([[(array([[(array([[735954.85896553],
[735954.8589771 ],
[735954.85898867],
...,
[735954.8995558 ],
dtype=[('t', 'O'), ('v', 'O'), ('q', 'O'), ('T', 'O')])) ]],
dtype=[('C1ch', 'O'), ('C1dc', 'O'), ('OCVch', 'O'), ('OCVdc', 'O')])) ]],
dtype=[('cyc0000', 'O'), ('cyc0100', 'O'), ('cyc0300', 'O'), ('cyc0400', 'O'), ('cyc0500', 'O'), ('cyc0600', 'O'), ('cyc0700', 'O'), ('cyc0800', 'O'), ('cyc0900', 'O'), ('cyc1000', 'O'), ('cyc1100', 'O'), ('cyc1200', 'O'), ('cyc1300', 'O'), ('cyc1400', 'O'), ('cyc1600', 'O'), ('cyc1800', 'O'), ('cyc1900', 'O'), ('cyc2000', 'O'), ('cyc2100', 'O'), ('cyc2200', 'O'), ('cyc2300', 'O'), ('cyc2400', 'O'), ('cyc2500', 'O'), ('cyc2600', 'O'), ('cyc2700', 'O'), ('cyc2800', 'O'), ('cyc2900', 'O'), ('cyc3000', 'O'), ('cyc3100', 'O'), ('cyc3200', 'O'), ('cyc3300', 'O'), ('cyc3500', 'O'), ('cyc3600', 'O'), ('cyc3700', 'O'), ('cyc3800', 'O'), ('cyc3900', 'O'), ('cyc4000', 'O'), ('cyc4100', 'O'), ('cyc4200', 'O'), ('cyc4300', 'O'), ('cyc4400', 'O'), ('cyc4500', 'O'), ('cyc4600', 'O'), ('cyc4800', 'O'), ('cyc5000', 'O'), ('cyc5100', 'O'), ('cyc5200', 'O'), ('cyc5300', 'O'), ('cyc5400', 'O'), ('cyc5500', 'O'), ('cyc5600', 'O'), ('cyc5700', 'O'), ('cyc5800', 'O'), ('cyc5900', 'O'), ('cyc6000', 'O'), ('cyc6100', 'O'), ('cyc6200', 'O'), ('cyc6300', 'O'), ('cyc6400', 'O'), ('cyc6500', 'O'), ('cyc6600', 'O'), ('cyc6700', 'O'), ('cyc6800', 'O'), ('cyc6900', 'O'), ('cyc7000', 'O'), ('cyc7100', 'O'), ('cyc7200', 'O'), ('cyc7300', 'O'), ('cyc7400', 'O'), ('cyc7500', 'O'), ('cyc7600', 'O'), ('cyc7700', 'O'), ('cyc7800', 'O'), ('cyc7900', 'O'), ('cyc8000', 'O'), ('cyc8100', 'O')])}
In fact, I should have eight datasets in this format, where the columns are associated with 't', 'v', 'q', and 'T' within the arrays. There is a sample representing the expected result for one cell dataset:
# Empty frame illustrating the desired column layout for a single cell
# (the 't', 'v', 'q' and 'T' fields of the .mat structs, respectively).
cell8= pd.DataFrame(columns=['Time','Voltage','Capacity','Temperature'])
# Bare expression: presumably run in a notebook/REPL, where it echoes the frame.
cell8
I'm not sure you realize the volume of data you have here. I have code that can extract the data, but there are just over 61 million data items here. Printed as a CSV file, that comes out to about 2.5 gigabytes.
import numpy as np
import scipy.io as sio
# Load the .mat file into a dict keyed by MATLAB variable name.
# NOTE(review): the traversal below assumes loadmat's default options
# (structs returned as structured arrays, no squeezing) - confirm if changed.
mat = sio.loadmat('Oxford_Battery_Degradation_Dataset_1.mat')
def dive(names, cell):
    """Print every numeric sample found under ``cell`` as one CSV line.

    Recursively walks the nested structure that ``scipy.io.loadmat`` returns
    for MATLAB structs. Each printed line is the path of names leading to the
    sample followed by the sample itself, comma separated, e.g.
    ``Cell1,cyc0000,C1ch,t,735954.8589655256``.

    The kind of node is decided from its dtype rather than from its length,
    so short measurement arrays are handled the same way as long ones.

    Args:
        names: List of path components collected so far. It is never
            mutated; deeper levels receive an extended copy.
        cell: One node of the loaded data: a structured ndarray (the array
            wrapper around a MATLAB struct), a single structured record, or
            a plain numeric ndarray holding the samples.

    Returns:
        None. Output is written to stdout with ``print``.
    """
    field_names = cell.dtype.names

    if field_names is None:
        # Plain (non-struct) array: this is the data itself. Flatten it so
        # column vectors and row vectors are both emitted one value per line.
        # The path prefix is identical for every sample, so build it once.
        prefix = ','.join(names + [''])
        for value in cell.ravel():
            print(prefix + str(value))
    elif isinstance(cell, np.ndarray):
        # Array of struct records (a 1x1 wrapper in the loaded file): descend
        # into each record without adding a path component.
        for record in cell.ravel():
            dive(names, record)
    else:
        # A single struct record: every field becomes one more path level.
        for field in field_names:
            dive(names + [field], cell[field])
# Walk the eight top-level variables Cell1..Cell8 and hand each one to dive(),
# seeding the path with the variable's own name.
for cno in range(1, 9):
    name = f'Cell{cno}'
    dive([name], mat[name])
The start of this file looks like:
Cell1,cyc0000,C1ch,t,735954.8589655256
Cell1,cyc0000,C1ch,t,735954.8589770996
Cell1,cyc0000,C1ch,t,735954.8589886738
Cell1,cyc0000,C1ch,t,735954.8590002478
Cell1,cyc0000,C1ch,t,735954.8590118219
Cell1,cyc0000,C1ch,t,735954.859023396
Cell1,cyc0000,C1ch,t,735954.85903497
Cell1,cyc0000,C1ch,t,735954.8590465442
Cell1,cyc0000,C1ch,t,735954.8590581182
Cell1,cyc0000,C1ch,t,735954.8590696923
Cell1,cyc0000,C1ch,t,735954.8590812663
Cell1,cyc0000,C1ch,t,735954.8590928405
Cell1,cyc0000,C1ch,t,735954.8591044145
Cell1,cyc0000,C1ch,t,735954.8591159886
Cell1,cyc0000,C1ch,t,735954.8591275626
Cell1,cyc0000,C1ch,t,735954.8591391367
Cell1,cyc0000,C1ch,t,735954.8591507107
Cell1,cyc0000,C1ch,t,735954.8591622849
Cell1,cyc0000,C1ch,t,735954.8591738589
Cell1,cyc0000,C1ch,t,735954.859185433
Cell1,cyc0000,C1ch,t,735954.8591970071
Cell1,cyc0000,C1ch,t,735954.8592085812
Cell1,cyc0000,C1ch,t,735954.8592201553
Cell1,cyc0000,C1ch,t,735954.8592317293
Cell1,cyc0000,C1ch,t,735954.8592433034
Cell1,cyc0000,C1ch,t,735954.8592548774
Cell1,cyc0000,C1ch,t,735954.8592664516
Cell1,cyc0000,C1ch,t,735954.8592780256
Cell1,cyc0000,C1ch,t,735954.8592895997
Cell1,cyc0000,C1ch,t,735954.8593011737
Cell1,cyc0000,C1ch,t,735954.8593127478
Cell1,cyc0000,C1ch,t,735954.8593243218
Cell1,cyc0000,C1ch,t,735954.859335896
Cell1,cyc0000,C1ch,t,735954.8593474701
Cell1,cyc0000,C1ch,t,735954.8593590441
Cell1,cyc0000,C1ch,t,735954.8593706182
Cell1,cyc0000,C1ch,t,735954.8593821923
...
The first column runs Cell1 through Cell8. The second column has between 70 and 80 entries: cyc0000, cyc0100, etc. The third column has 4 entries: C1ch, C1dc, OCVch, OCVdc. The fourth column has 4 entries: t, v, q, T. You can't run the numbers across, because the size of the last dimension varies considerably, from 2,500 to 10,000 entries.
Here is code that converts the mat file into a set of nested dicts. You can see on the last line how to access this. Maybe this will work for your purposes.
import numpy as np
import scipy.io as sio
# Load the .mat file into a dict keyed by MATLAB variable name.
# NOTE(review): the traversal below assumes loadmat's default options
# (structs returned as structured arrays, no squeezing) - confirm if changed.
mat = sio.loadmat('Oxford_Battery_Degradation_Dataset_1.mat')
def dive(cell):
    """Convert a loaded MATLAB struct tree into nested Python dicts.

    Recursively walks the nested structure that ``scipy.io.loadmat`` returns
    for MATLAB structs. Struct records become dicts keyed by field name, and
    the plain numeric arrays at the bottom are returned unchanged.

    The kind of node is decided from its dtype rather than from its length,
    so short measurement arrays are handled the same way as long ones.

    Args:
        cell: One node of the loaded data: a structured ndarray (the array
            wrapper around a MATLAB struct), a single structured record, or
            a plain numeric ndarray holding the samples.

    Returns:
        The input array itself for a plain numeric array; a dict mapping
        field name to converted value for a struct record; for a structured
        array, the converted record when it holds exactly one record,
        otherwise a list with one converted entry per record.
    """
    field_names = cell.dtype.names

    if field_names is None:
        # Plain (non-struct) array: this is the data itself.
        return cell

    if isinstance(cell, np.ndarray):
        # Array of struct records. The loaded file wraps every struct in a
        # 1x1 array, which is unwrapped so callers can index by field name
        # directly; genuine multi-record arrays are kept as a list.
        records = [dive(record) for record in cell.ravel()]
        return records[0] if len(records) == 1 else records

    # A single struct record: one dict entry per field.
    return {field: dive(cell[field]) for field in field_names}
# Convert each of the eight top-level variables Cell1..Cell8 and collect the
# results in one dict keyed by variable name.
master = {
    f'Cell{number}': dive(mat[f'Cell{number}'])
    for number in range(1, 9)
}

# Example lookup: cell -> cycle -> measurement type -> channel.
print(master['Cell3']['cyc2100']['OCVch']['v'])