Skip to content

Commit 5bbbf1b

Browse files
Validate config name and data_files in packaged modules (#6915)
* Make configs call super post_init in packaged modules
* Update hash in test
* Add tests
* Add tests for BuilderConfig
* Fix syntax
* Use old hash for 2.15 cache reload

---------

Co-authored-by: Quentin Lhoest <[email protected]>
1 parent 6548e0e commit 5bbbf1b

File tree

26 files changed

+226
-12
lines changed

26 files changed

+226
-12
lines changed

src/datasets/builder.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -494,7 +494,7 @@ def _check_legacy_cache2(self, dataset_module: "DatasetModule") -> Optional[str]
494494
and not is_remote_url(self._cache_dir_root)
495495
and not (set(self.config_kwargs) - {"data_files", "data_dir"})
496496
):
497-
from .packaged_modules import _PACKAGED_DATASETS_MODULES
497+
from .packaged_modules import _PACKAGED_DATASETS_MODULES_2_15_HASHES
498498
from .utils._dill import Pickler
499499

500500
def update_hash_with_config_parameters(hash: str, config_parameters: dict) -> str:
@@ -516,7 +516,7 @@ def update_hash_with_config_parameters(hash: str, config_parameters: dict) -> st
516516
namespace = self.repo_id.split("/")[0] if self.repo_id and self.repo_id.count("/") > 0 else None
517517
with patch.object(Pickler, "_legacy_no_dict_keys_sorting", True):
518518
config_id = self.config.name + "-" + Hasher.hash({"data_files": self.config.data_files})
519-
hash = _PACKAGED_DATASETS_MODULES.get(self.name, "missing")[1]
519+
hash = _PACKAGED_DATASETS_MODULES_2_15_HASHES.get(self.name, "missing")
520520
if (
521521
dataset_module.builder_configs_parameters.metadata_configs
522522
and self.config.name in dataset_module.builder_configs_parameters.metadata_configs

src/datasets/packaged_modules/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,18 @@ def _hash_python_lines(lines: List[str]) -> str:
4343
"webdataset": (webdataset.__name__, _hash_python_lines(inspect.getsource(webdataset).splitlines())),
4444
}
4545

46+
# Fixed hashes of the packaged modules, kept so the legacy-cache check can reload caches written by datasets 2.15
47+
_PACKAGED_DATASETS_MODULES_2_15_HASHES = {
48+
"csv": "eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d",
49+
"json": "8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96",
50+
"pandas": "3ac4ffc4563c796122ef66899b9485a3f1a977553e2d2a8a318c72b8cc6f2202",
51+
"parquet": "ca31c69184d9832faed373922c2acccec0b13a0bb5bbbe19371385c3ff26f1d1",
52+
"arrow": "74f69db2c14c2860059d39860b1f400a03d11bf7fb5a8258ca38c501c878c137",
53+
"text": "c4a140d10f020282918b5dd1b8a49f0104729c6177f60a6b49ec2a365ec69f34",
54+
"imagefolder": "7b7ce5247a942be131d49ad4f3de5866083399a0f250901bd8dc202f8c5f7ce5",
55+
"audiofolder": "d3c1655c66c8f72e4efb5c79e952975fa6e2ce538473a6890241ddbddee9071c",
56+
}
57+
4658
# Used to infer the module to use based on the data files extensions
4759
_EXTENSION_TO_MODULE: Dict[str, Tuple[str, dict]] = {
4860
".csv": ("csv", {}),

src/datasets/packaged_modules/arrow/arrow.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ class ArrowConfig(datasets.BuilderConfig):
1717

1818
features: Optional[datasets.Features] = None
1919

20+
def __post_init__(self):
21+
super().__post_init__()
22+
2023

2124
class Arrow(datasets.ArrowBasedBuilder):
2225
BUILDER_CONFIG_CLASS = ArrowConfig

src/datasets/packaged_modules/audiofolder/audiofolder.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ class AudioFolderConfig(folder_based_builder.FolderBasedBuilderConfig):
1515
drop_labels: bool = None
1616
drop_metadata: bool = None
1717

18+
def __post_init__(self):
19+
super().__post_init__()
20+
1821

1922
class AudioFolder(folder_based_builder.FolderBasedBuilder):
2023
BASE_FEATURE = datasets.Audio

src/datasets/packaged_modules/csv/csv.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ class CsvConfig(datasets.BuilderConfig):
6868
date_format: Optional[str] = None
6969

7070
def __post_init__(self):
71+
super().__post_init__()
7172
if self.delimiter is not None:
7273
self.sep = self.delimiter
7374
if self.column_names is not None:

src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ class FolderBasedBuilderConfig(datasets.BuilderConfig):
2828
drop_labels: bool = None
2929
drop_metadata: bool = None
3030

31+
def __post_init__(self):
32+
super().__post_init__()
33+
3134

3235
class FolderBasedBuilder(datasets.GeneratorBasedBuilder):
3336
"""

src/datasets/packaged_modules/generator/generator.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ class GeneratorConfig(datasets.BuilderConfig):
1111
features: Optional[datasets.Features] = None
1212

1313
def __post_init__(self):
14-
assert self.generator is not None, "generator must be specified"
14+
super().__post_init__()
15+
if self.generator is None:
16+
raise ValueError("generator must be specified")
1517

1618
if self.gen_kwargs is None:
1719
self.gen_kwargs = {}

src/datasets/packaged_modules/imagefolder/imagefolder.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ class ImageFolderConfig(folder_based_builder.FolderBasedBuilderConfig):
1515
drop_labels: bool = None
1616
drop_metadata: bool = None
1717

18+
def __post_init__(self):
19+
super().__post_init__()
20+
1821

1922
class ImageFolder(folder_based_builder.FolderBasedBuilder):
2023
BASE_FEATURE = datasets.Image

src/datasets/packaged_modules/json/json.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ class JsonConfig(datasets.BuilderConfig):
4444
chunksize: int = 10 << 20 # 10MB
4545
newlines_in_values: Optional[bool] = None
4646

47+
def __post_init__(self):
48+
super().__post_init__()
49+
4750

4851
class Json(datasets.ArrowBasedBuilder):
4952
BUILDER_CONFIG_CLASS = JsonConfig

src/datasets/packaged_modules/pandas/pandas.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ class PandasConfig(datasets.BuilderConfig):
1616

1717
features: Optional[datasets.Features] = None
1818

19+
def __post_init__(self):
20+
super().__post_init__()
21+
1922

2023
class Pandas(datasets.ArrowBasedBuilder):
2124
BUILDER_CONFIG_CLASS = PandasConfig

0 commit comments

Comments
 (0)