Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,7 @@ def get_module(self) -> DatasetModule:
data_files = data_files.filter_extensions(_MODULE_TO_EXTENSIONS[module_name])
# Collect metadata files if the module supports them
supports_metadata = module_name in _MODULE_SUPPORTS_METADATA
- if self.data_files is None and supports_metadata and patterns != DEFAULT_PATTERNS_ALL:
+ if self.data_files is None and supports_metadata:
try:
metadata_patterns = get_metadata_patterns(base_path)
except FileNotFoundError:
Expand Down Expand Up @@ -1059,7 +1059,7 @@ def get_module(self) -> DatasetModule:
data_files = data_files.filter_extensions(_MODULE_TO_EXTENSIONS[module_name])
# Collect metadata files if the module supports them
supports_metadata = module_name in _MODULE_SUPPORTS_METADATA
- if self.data_files is None and supports_metadata and patterns != DEFAULT_PATTERNS_ALL:
+ if self.data_files is None and supports_metadata:
try:
metadata_patterns = get_metadata_patterns(base_path)
except FileNotFoundError:
Expand Down
17 changes: 17 additions & 0 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def _generate_examples(self, filepath, **kwargs):
SAMPLE_DATASET_IDENTIFIER2 = "hf-internal-testing/dataset_with_data_files" # only has data files
SAMPLE_DATASET_IDENTIFIER3 = "hf-internal-testing/multi_dir_dataset" # has multiple data directories
SAMPLE_DATASET_IDENTIFIER4 = "hf-internal-testing/imagefolder_with_metadata" # imagefolder with a metadata file outside of the train/test directories
SAMPLE_DATASET_IDENTIFIER5 = "hf-internal-testing/imagefolder_with_metadata_no_splits" # imagefolder with a metadata file and no default split names in data files
SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER = "hf-internal-testing/_dummy"
SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST = "_dummy"
SAMPLE_DATASET_NO_CONFIGS_IN_METADATA = "hf-internal-testing/audiofolder_no_configs_in_metadata"
Expand Down Expand Up @@ -630,6 +631,22 @@ def test_HubDatasetModuleFactoryWithoutScript_with_metadata(self):
for data_file in module_factory_result.builder_kwargs["data_files"]["test"]
)

factory = HubDatasetModuleFactoryWithoutScript(
SAMPLE_DATASET_IDENTIFIER5, download_config=self.download_config
)
module_factory_result = factory.get_module()
assert importlib.import_module(module_factory_result.module_path) is not None
assert module_factory_result.builder_kwargs["base_path"].startswith(config.HF_ENDPOINT)
assert (
module_factory_result.builder_kwargs["data_files"] is not None
and len(module_factory_result.builder_kwargs["data_files"]) == 1
and len(module_factory_result.builder_kwargs["data_files"]["train"]) > 0
)
assert any(
Path(data_file).name == "metadata.jsonl"
for data_file in module_factory_result.builder_kwargs["data_files"]["train"]
)

@pytest.mark.integration
def test_HubDatasetModuleFactoryWithoutScript_with_one_default_config_in_metadata(self):
factory = HubDatasetModuleFactoryWithoutScript(
Expand Down