Skip to content

Commit c3ddbd0

Browse files
committed
fix modify files
1 parent 4608447 commit c3ddbd0

File tree

2 files changed

+26
-4
lines changed

2 files changed

+26
-4
lines changed

src/datasets/load.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -592,15 +592,15 @@ def infer_module_for_data_files(
         return module_name, default_builder_kwargs
 
 
-def update_hash_for_cache(hash: str, metadata_configs: MetadataConfigs, dataset_infos: DatasetInfosDict) -> str:
+def update_hash_for_cache(hash: str, **kwargs: Dict[str, Union[MetadataConfigs, DataFilesDict, DataFilesDict]]) -> str:
     """
     Used to update hash of packaged modules which is used for creating unique cache directories to reflect
     different config parameters which are passed in metadata from readme.
     """
     m = Hasher()
     m.update(hash)
-    m.update(metadata_configs)
-    m.update(dataset_infos)
+    for obj in kwargs.values():
+        m.update(obj)
     return m.hexdigest()
605605

606606

@@ -2231,15 +2231,25 @@ def load_dataset_builder(
             error_msg += f'\nFor example `data_files={{"train": "path/to/data/train/*.{example_extensions[0]}"}}`'
         raise ValueError(error_msg)
 
+    hash = dataset_module.hash
     builder_cls = get_dataset_builder_class(dataset_module, dataset_name=dataset_name)
+    if len(builder_cls.builder_configs) > 1:
+        builder_config = builder_cls.builder_configs.get(config_name or builder_cls.DEFAULT_CONFIG_NAME)
+    elif len(builder_cls.builder_configs) == 1:
+        builder_config = builder_cls.BUILDER_CONFIGS[0]
+    else:
+        builder_config = None
+    if builder_config and builder_config.data_files is not None:
+        builder_config._resolve_data_files(base_path=builder_kwargs["base_path"], download_config=download_config)
+        hash = update_hash_for_cache(hash, data_files=builder_config.data_files)
     # Instantiate the dataset builder
     builder_instance: DatasetBuilder = builder_cls(
         cache_dir=cache_dir,
         dataset_name=dataset_name,
         config_name=config_name,
         data_dir=data_dir,
         data_files=data_files,
-        hash=dataset_module.hash,
+        hash=hash,
         info=info,
         features=features,
         token=token,

tests/test_load.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1503,6 +1503,18 @@ def test_load_dataset_then_move_then_reload(dataset_loading_script_dir, data_dir
     assert dataset._fingerprint != fingerprint1
 
 
+def test_load_dataset_builder_then_edit_then_load_again(tmp_path: Path):
+    dataset_dir = tmp_path / "test_load_dataset_then_edit_then_load_again"
+    dataset_dir.mkdir()
+    with open(dataset_dir / "train.txt", "w") as f:
+        f.write("Hello there")
+    dataset_builder = load_dataset_builder(str(dataset_dir))
+    with open(dataset_dir / "train.txt", "w") as f:
+        f.write("General Kenobi !")
+    edited_dataset_builder = load_dataset_builder(str(dataset_dir))
+    assert dataset_builder.cache_dir != edited_dataset_builder.cache_dir
+
+
 def test_load_dataset_readonly(dataset_loading_script_dir, dataset_loading_script_dir_readonly, data_dir, tmp_path):
     cache_dir1 = tmp_path / "cache1"
     cache_dir2 = tmp_path / "cache2"

0 commit comments

Comments
 (0)