Skip to content

Commit 5975031

Browse files
authored
Fix metadata file resolution when inferred pattern is ** (#6449)
* Fix metadata file resolution when inferred pattern is `**`
* Remove pdb call
* Nit
1 parent 1731d5a commit 5975031

File tree

2 files changed

+19
-2
lines changed

2 files changed

+19
-2
lines changed

src/datasets/load.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -869,7 +869,7 @@ def get_module(self) -> DatasetModule:
869869
data_files = data_files.filter_extensions(_MODULE_TO_EXTENSIONS[module_name])
870870
# Collect metadata files if the module supports them
871871
supports_metadata = module_name in _MODULE_SUPPORTS_METADATA
872-
if self.data_files is None and supports_metadata and patterns != DEFAULT_PATTERNS_ALL:
872+
if self.data_files is None and supports_metadata:
873873
try:
874874
metadata_patterns = get_metadata_patterns(base_path)
875875
except FileNotFoundError:
@@ -1059,7 +1059,7 @@ def get_module(self) -> DatasetModule:
10591059
data_files = data_files.filter_extensions(_MODULE_TO_EXTENSIONS[module_name])
10601060
# Collect metadata files if the module supports them
10611061
supports_metadata = module_name in _MODULE_SUPPORTS_METADATA
1062-
if self.data_files is None and supports_metadata and patterns != DEFAULT_PATTERNS_ALL:
1062+
if self.data_files is None and supports_metadata:
10631063
try:
10641064
metadata_patterns = get_metadata_patterns(base_path)
10651065
except FileNotFoundError:

tests/test_load.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ def _generate_examples(self, filepath, **kwargs):
8686
SAMPLE_DATASET_IDENTIFIER2 = "hf-internal-testing/dataset_with_data_files" # only has data files
8787
SAMPLE_DATASET_IDENTIFIER3 = "hf-internal-testing/multi_dir_dataset" # has multiple data directories
8888
SAMPLE_DATASET_IDENTIFIER4 = "hf-internal-testing/imagefolder_with_metadata" # imagefolder with a metadata file outside of the train/test directories
89+
SAMPLE_DATASET_IDENTIFIER5 = "hf-internal-testing/imagefolder_with_metadata_no_splits" # imagefolder with a metadata file and no default split names in data files
8990
SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER = "hf-internal-testing/_dummy"
9091
SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST = "_dummy"
9192
SAMPLE_DATASET_NO_CONFIGS_IN_METADATA = "hf-internal-testing/audiofolder_no_configs_in_metadata"
@@ -630,6 +631,22 @@ def test_HubDatasetModuleFactoryWithoutScript_with_metadata(self):
630631
for data_file in module_factory_result.builder_kwargs["data_files"]["test"]
631632
)
632633

634+
factory = HubDatasetModuleFactoryWithoutScript(
635+
SAMPLE_DATASET_IDENTIFIER5, download_config=self.download_config
636+
)
637+
module_factory_result = factory.get_module()
638+
assert importlib.import_module(module_factory_result.module_path) is not None
639+
assert module_factory_result.builder_kwargs["base_path"].startswith(config.HF_ENDPOINT)
640+
assert (
641+
module_factory_result.builder_kwargs["data_files"] is not None
642+
and len(module_factory_result.builder_kwargs["data_files"]) == 1
643+
and len(module_factory_result.builder_kwargs["data_files"]["train"]) > 0
644+
)
645+
assert any(
646+
Path(data_file).name == "metadata.jsonl"
647+
for data_file in module_factory_result.builder_kwargs["data_files"]["train"]
648+
)
649+
633650
@pytest.mark.integration
634651
def test_HubDatasetModuleFactoryWithoutScript_with_one_default_config_in_metadata(self):
635652
factory = HubDatasetModuleFactoryWithoutScript(

0 commit comments

Comments
 (0)