Skip to content

Commit ba71e92

Browse files
pmrowla and lhoestq authored
fsspec: support fsspec>=2023.12.0 glob changes (#6687)
* data_files: support fsspec 2023.12.0 glob
* fsspec: unpin version upper bound
* fsspec: pin max version to <=2024.2.0
* data_files: remove unsupported fsspec-specific ** globbing
* data_files: update resolve_pattern ** behavior docstring
* fix split case with either prefix or suffix

---------

Co-authored-by: Quentin Lhoest <[email protected]>
Co-authored-by: Quentin Lhoest <[email protected]>
1 parent b02be21 commit ba71e92

File tree

3 files changed

+27
-17
lines changed

3 files changed

+27
-17
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@
131131
"multiprocess",
132132
# to save datasets locally or on any filesystem
133133
# minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143
134-
"fsspec[http]>=2023.1.0,<=2023.10.0",
134+
"fsspec[http]>=2023.1.0,<=2024.2.0",
135135
# for data streaming via http
136136
"aiohttp",
137137
# To get datasets from the Datasets Hub on huggingface.co

src/datasets/data_files.py

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -47,8 +47,17 @@ class EmptyDatasetError(FileNotFoundError):
4747
NON_WORDS_CHARS = "-._ 0-9"
4848
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
4949
KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
50-
else:
50+
elif config.FSSPEC_VERSION < version.parse("2023.12.0"):
5151
KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**/*[{sep}/]{keyword}[{sep}/]**"]
52+
else:
53+
KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = [
54+
"**/{keyword}[{sep}]*",
55+
"**/{keyword}/**",
56+
"**/*[{sep}]{keyword}[{sep}]*",
57+
"**/*[{sep}]{keyword}[{sep}]*/**",
58+
"**/{keyword}[{sep}]*/**",
59+
"**/*[{sep}]{keyword}/**",
60+
]
5261

5362
DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
5463
DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = {
@@ -303,11 +312,9 @@ def resolve_pattern(
303312
- data/* to match all the files inside "data"
304313
- data/** to match all the files inside "data" and its subdirectories
305314
306-
The patterns are resolved using the fsspec glob.
307-
308-
glob.glob, Path.glob, Path.match or fnmatch do not support ** with a prefix/suffix other than a forward slash /.
309-
For instance, this means **.json is the same as *.json. On the contrary, the fsspec glob has no limits regarding the ** prefix/suffix,
310-
resulting in **.json being equivalent to **/*.json.
315+
The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
316+
Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
317+
other than a forward slash /.
311318
312319
More generally:
313320
- '*' matches any character except a forward-slash (to match just the file or directory name)

tests/test_data_files.py

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -415,8 +415,6 @@ def test_DataFilesDict_from_patterns_in_dataset_repository(
415415
("**", 4, None, "train"),
416416
("**", 4, "data", "train"),
417417
("**", 2, "data/subdir", "train"),
418-
("**train*", 1, "data/subdir", "train"),
419-
("**test*", 1, "data/subdir", "test"),
420418
("**", 0, "data/subdir2", "train"),
421419
],
422420
)
@@ -452,14 +450,6 @@ def test_DataFilesDict_from_patterns_in_dataset_repository_hashing(hub_dataset_r
452450
data_files2 = DataFilesDict(sorted(data_files1.items(), reverse=True))
453451
assert Hasher.hash(data_files1) == Hasher.hash(data_files2)
454452

455-
patterns2 = {"train": ["data/**train.txt"], "test": ["data/**test.txt"]}
456-
data_files2 = DataFilesDict.from_patterns(patterns2, hub_dataset_repo_path)
457-
assert Hasher.hash(data_files1) == Hasher.hash(data_files2)
458-
459-
patterns2 = {"train": ["data/**train.txt"], "test": ["data/**train.txt"]}
460-
data_files2 = DataFilesDict.from_patterns(patterns2, hub_dataset_repo_path)
461-
assert Hasher.hash(data_files1) != Hasher.hash(data_files2)
462-
463453
# the tmpfs used to mock the hub repo is based on a local directory
464454
# therefore os.stat is used to get the mtime of the data files
465455
with patch("os.stat", return_value=os.stat(__file__)):
@@ -609,6 +599,18 @@ def ls(self, path, detail=True, refresh=True, **kwargs):
609599
{"test": "data/my_test_file.txt"},
610600
{"validation": "my_validation_dir/dataset.txt"},
611601
{"validation": "data/my_validation_file.txt"},
602+
{"train": "train_dir/dataset.txt"},
603+
{"train": "data/train_file.txt"},
604+
{"test": "test_dir/dataset.txt"},
605+
{"test": "data/test_file.txt"},
606+
{"validation": "validation_dir/dataset.txt"},
607+
{"validation": "data/validation_file.txt"},
608+
{"train": "my_train/dataset.txt"},
609+
{"train": "data/my_train.txt"},
610+
{"test": "my_test/dataset.txt"},
611+
{"test": "data/my_test.txt"},
612+
{"validation": "my_validation/dataset.txt"},
613+
{"validation": "data/my_validation.txt"},
612614
# With test<>eval aliases
613615
{"test": "eval.txt"},
614616
{"test": "data/eval.txt"},
@@ -631,6 +633,7 @@ def ls(self, path, detail=True, refresh=True, **kwargs):
631633
{"test": "my-test-file.txt"},
632634
{"test": "my_test_file.txt"},
633635
{"test": "my test file.txt"},
636+
{"test": "my-test_file.txt"},
634637
{"test": "test00001.txt"},
635638
],
636639
)

0 commit comments

Comments
 (0)