Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 18 additions & 16 deletions src/datasets/packaged_modules/hdf5/hdf5.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,9 @@ def _split_generators(self, dl_manager):
# Infer features from first file
if self.info.features is None:
for first_file in itertools.chain.from_iterable(files):
with h5py.File(first_file, "r") as h5:
self.info.features = _recursive_infer_features(h5)
with open(first_file, "rb") as f:
with h5py.File(f, "r") as h5:
self.info.features = _recursive_infer_features(h5)
break
splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files}))
return splits
Expand All @@ -73,22 +74,23 @@ def _generate_tables(self, files):
batch_size_cfg = self.config.batch_size
for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
try:
with h5py.File(file, "r") as h5:
# Infer features and lengths from first file
if self.info.features is None:
self.info.features = _recursive_infer_features(h5)
num_rows = _check_dataset_lengths(h5, self.info.features)
if num_rows is None:
logger.warning(f"File {file} contains no data, skipping...")
continue
effective_batch = batch_size_cfg or self._writer_batch_size or num_rows
for start in range(0, num_rows, effective_batch):
end = min(start + effective_batch, num_rows)
pa_table = _recursive_load_arrays(h5, self.info.features, start, end)
if pa_table is None:
with open(file, "rb") as f:
with h5py.File(f, "r") as h5:
# Infer features and lengths from first file
if self.info.features is None:
self.info.features = _recursive_infer_features(h5)
num_rows = _check_dataset_lengths(h5, self.info.features)
if num_rows is None:
logger.warning(f"File {file} contains no data, skipping...")
continue
yield f"{file_idx}_{start}", cast_table_to_features(pa_table, self.info.features)
effective_batch = batch_size_cfg or self._writer_batch_size or num_rows
for start in range(0, num_rows, effective_batch):
end = min(start + effective_batch, num_rows)
pa_table = _recursive_load_arrays(h5, self.info.features, start, end)
if pa_table is None:
logger.warning(f"File {file} contains no data, skipping...")
continue
yield f"{file_idx}_{start}", cast_table_to_features(pa_table, self.info.features)
except ValueError as e:
logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
raise
Expand Down
Loading