Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@
"multiprocess",
# to save datasets locally or on any filesystem
# minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143
"fsspec[http]>=2023.1.0,<=2023.10.0",
"fsspec[http]>=2023.1.0,<=2024.2.0",
# for data streaming via http
"aiohttp",
# To get datasets from the Datasets Hub on huggingface.co
Expand Down
19 changes: 13 additions & 6 deletions src/datasets/data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,17 @@ class EmptyDatasetError(FileNotFoundError):
NON_WORDS_CHARS = "-._ 0-9"
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
else:
elif config.FSSPEC_VERSION < version.parse("2023.12.0"):
KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**/*[{sep}/]{keyword}[{sep}/]**"]
else:
KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = [
"**/{keyword}[{sep}]*",
"**/{keyword}/**",
"**/*[{sep}]{keyword}[{sep}]*",
"**/*[{sep}]{keyword}[{sep}]*/**",
"**/{keyword}[{sep}]*/**",
"**/*[{sep}]{keyword}/**",
]

DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = {
Expand Down Expand Up @@ -303,11 +312,9 @@ def resolve_pattern(
- data/* to match all the files inside "data"
- data/** to match all the files inside "data" and its subdirectories

The patterns are resolved using the fsspec glob.

glob.glob, Path.glob, Path.match or fnmatch do not support ** with a prefix/suffix other than a forward slash /.
For instance, this means **.json is the same as *.json. On the contrary, the fsspec glob has no limits regarding the ** prefix/suffix,
resulting in **.json being equivalent to **/*.json.
The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0, this glob behaves like
Python's glob.glob, Path.glob, Path.match and fnmatch: ** is not supported with a prefix/suffix
other than a forward slash /. For instance, **.json is the same as *.json; use **/*.json to also match files in subdirectories.

More generally:
- '*' matches any character except a forward-slash (to match just the file or directory name)
Expand Down
23 changes: 13 additions & 10 deletions tests/test_data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,8 +415,6 @@ def test_DataFilesDict_from_patterns_in_dataset_repository(
("**", 4, None, "train"),
("**", 4, "data", "train"),
("**", 2, "data/subdir", "train"),
("**train*", 1, "data/subdir", "train"),
("**test*", 1, "data/subdir", "test"),
("**", 0, "data/subdir2", "train"),
],
)
Expand Down Expand Up @@ -452,14 +450,6 @@ def test_DataFilesDict_from_patterns_in_dataset_repository_hashing(hub_dataset_r
data_files2 = DataFilesDict(sorted(data_files1.items(), reverse=True))
assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

patterns2 = {"train": ["data/**train.txt"], "test": ["data/**test.txt"]}
data_files2 = DataFilesDict.from_patterns(patterns2, hub_dataset_repo_path)
assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

patterns2 = {"train": ["data/**train.txt"], "test": ["data/**train.txt"]}
data_files2 = DataFilesDict.from_patterns(patterns2, hub_dataset_repo_path)
assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

# the tmpfs used to mock the hub repo is based on a local directory
# therefore os.stat is used to get the mtime of the data files
with patch("os.stat", return_value=os.stat(__file__)):
Expand Down Expand Up @@ -609,6 +599,18 @@ def ls(self, path, detail=True, refresh=True, **kwargs):
{"test": "data/my_test_file.txt"},
{"validation": "my_validation_dir/dataset.txt"},
{"validation": "data/my_validation_file.txt"},
{"train": "train_dir/dataset.txt"},
{"train": "data/train_file.txt"},
{"test": "test_dir/dataset.txt"},
{"test": "data/test_file.txt"},
{"validation": "validation_dir/dataset.txt"},
{"validation": "data/validation_file.txt"},
{"train": "my_train/dataset.txt"},
{"train": "data/my_train.txt"},
{"test": "my_test/dataset.txt"},
{"test": "data/my_test.txt"},
{"validation": "my_validation/dataset.txt"},
{"validation": "data/my_validation.txt"},
# With test<>eval aliases
{"test": "eval.txt"},
{"test": "data/eval.txt"},
Expand All @@ -631,6 +633,7 @@ def ls(self, path, detail=True, refresh=True, **kwargs):
{"test": "my-test-file.txt"},
{"test": "my_test_file.txt"},
{"test": "my test file.txt"},
{"test": "my-test_file.txt"},
{"test": "test00001.txt"},
],
)
Expand Down