I have torch tensors that I need to save to disk, as they are large and will consume all of my memory if I keep them in RAM.
I am new to h5py and I am having trouble figuring out how to build the dataset efficiently; the process is VERY slow.
Below is an MWE that I intend to turn into a loop.
import numpy as np
import h5py
data = np.random.random((13, 8, 512, 768))
f = h5py.File(r'C:\Users\Andrew\Desktop\test_h5\xd.h5', 'w')
dset = f.create_dataset('embeds', shape=(13, 8, 512, 768),
                        maxshape=(None, 8, 512, 768), chunks=(13, 8, 512, 768),
                        dtype=np.float16)
# add first chunk of rows
dset[0:13] = data
# Resize the dataset to accommodate the next chunk of rows
dset.resize(26, axis=0)
# Write the next chunk
dset[13:] = np.random.random((13, 8, 512, 768))
# close the write handle, then re-open and check data
f.close()
with h5py.File(r'C:\Users\Andrew\Desktop\test_h5\xd.h5', 'r') as f:
    print(f['embeds'][0:26].shape)
    print(f['embeds'][0:26])
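For context, this is roughly the loop I have in mind, sketched with placeholders (the number of batches and the torch.rand tensors stand in for my real tensors, which I would convert with .numpy()):

import numpy as np
import h5py
import torch

n_batches = 4    # placeholder: however many batches I actually generate
batch_rows = 13

with h5py.File(r'C:\Users\Andrew\Desktop\test_h5\xd.h5', 'w') as f:
    dset = f.create_dataset('embeds', shape=(batch_rows, 8, 512, 768),
                            maxshape=(None, 8, 512, 768),
                            chunks=(batch_rows, 8, 512, 768),
                            dtype=np.float16)
    for cnt in range(n_batches):
        # placeholder: in the real code each batch is a torch tensor from my model
        batch = torch.rand(batch_rows, 8, 512, 768).numpy().astype(np.float16)
        if cnt > 0:
            # grow axis 0 by one batch before writing the new rows
            dset.resize((cnt + 1) * batch_rows, axis=0)
        dset[cnt * batch_rows:(cnt + 1) * batch_rows] = batch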
Edit:
I am now having issues figuring out how to ensure that the last appended data really is the last generated data; consider the following:
import numpy as np
import h5py
data = np.random.random((13, 8, 512, 768)).astype(np.float32)
batch_size = 8
with h5py.File('SO_65606675.h5', 'w') as f:
    # create empty data set
    dset = f.create_dataset('embeds', shape=(13, 16, 512, 768),
                            maxshape=(13, None, 512, 768), chunks=(13, 8, 512, 768),
                            dtype=np.float32)
    for cnt in range(2):
        # add chunk of rows
        start = cnt*batch_size
        dset[:, start:start+batch_size, :, :] = data[:, :, :, :]
        # Create attribute with last_index value
        dset.attrs['last_index'] = (cnt+1)*batch_size
# check data
with h5py.File('SO_65606675.h5', 'r') as f:
    print(f['embeds'].attrs['last_index'])
    print(f['embeds'].shape)
    x = f['embeds'][:, 8:16, :, :]       # get last entry
    print(np.array_equal(x, data))       # passes
Edit 2: I think I had an error above and this works; I will check my "real" data.
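For completeness, this is the pattern I plan to try on the real data, sketched with placeholders (the batch count and the torch.rand tensors stand in for my actual embeddings); axis 1 only grows once the preallocated space is used up, and last_index records how much of the dataset is valid:

import numpy as np
import h5py
import torch

batch_size = 8
n_batches = 3    # placeholder: in the real run this comes from my data loader

with h5py.File('SO_65606675.h5', 'w') as f:
    dset = f.create_dataset('embeds', shape=(13, batch_size, 512, 768),
                            maxshape=(13, None, 512, 768),
                            chunks=(13, batch_size, 512, 768),
                            dtype=np.float32)
    for cnt in range(n_batches):
        # placeholder: the real batch is a torch tensor coming out of the model
        batch = torch.rand(13, batch_size, 512, 768).numpy()
        end = (cnt + 1) * batch_size
        if end > dset.shape[1]:
            # grow axis 1 only when the preallocated space is exhausted
            dset.resize(end, axis=1)
        dset[:, end - batch_size:end, :, :] = batch
        dset.attrs['last_index'] = end

# read back only the slice that was actually written
with h5py.File('SO_65606675.h5', 'r') as f:
    last = f['embeds'].attrs['last_index']
    print(f['embeds'][:, :last, :, :].shape)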