fsspec: support fsspec>=2023.12.0 glob changes (#6687)

pmrowla · lhoestq · web-flow · commit ba71e92c59c9 · 2024-02-29T16:12:17.000+01:00
* data_files: support fsspec 2023.12.0 glob

* fsspec: unpin version upper bound

* fsspec: pin max version to <=2024.2.0

* data_files: remove unsupported fsspec-specific ** globbing

* data_files: update resolve_pattern ** behavior docstring

* fix split case with either prefix or suffix

---------

Co-authored-by: Quentin Lhoest <lhoest.q@gmail.com>
Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
diff --git a/setup.py b/setup.py
@@ -131,7 +131,7 @@
     "multiprocess",
     # to save datasets locally or on any filesystem
     # minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143
-    "fsspec[http]>=2023.1.0,<=2023.10.0",
+    "fsspec[http]>=2023.1.0,<=2024.2.0",
     # for data streaming via http
     "aiohttp",
     # To get datasets from the Datasets Hub on huggingface.co
diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
@@ -47,8 +47,17 @@ class EmptyDatasetError(FileNotFoundError):
 NON_WORDS_CHARS = "-._ 0-9"
 if config.FSSPEC_VERSION < version.parse("2023.9.0"):
     KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
-else:
+elif config.FSSPEC_VERSION < version.parse("2023.12.0"):
     KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**/*[{sep}/]{keyword}[{sep}/]**"]
+else:
+    KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = [
+        "**/{keyword}[{sep}]*",
+        "**/{keyword}/**",
+        "**/*[{sep}]{keyword}[{sep}]*",
+        "**/*[{sep}]{keyword}[{sep}]*/**",
+        "**/{keyword}[{sep}]*/**",
+        "**/*[{sep}]{keyword}/**",
+    ]
 
 DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
 DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = {
@@ -303,11 +312,9 @@ def resolve_pattern(
     - data/* to match all the files inside "data"
     - data/** to match all the files inside "data" and its subdirectories
 
-    The patterns are resolved using the fsspec glob.
-
-    glob.glob, Path.glob, Path.match or fnmatch do not support ** with a prefix/suffix other than a forward slash /.
-    For instance, this means **.json is the same as *.json. On the contrary, the fsspec glob has no limits regarding the ** prefix/suffix,
-    resulting in **.json being equivalent to **/*.json.
+    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
+    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
+    other than a forward slash /.
 
     More generally:
     - '*' matches any character except a forward-slash (to match just the file or directory name)
diff --git a/tests/test_data_files.py b/tests/test_data_files.py
@@ -415,8 +415,6 @@ def test_DataFilesDict_from_patterns_in_dataset_repository(
         ("**", 4, None, "train"),
         ("**", 4, "data", "train"),
         ("**", 2, "data/subdir", "train"),
-        ("**train*", 1, "data/subdir", "train"),
-        ("**test*", 1, "data/subdir", "test"),
         ("**", 0, "data/subdir2", "train"),
     ],
 )
@@ -452,14 +450,6 @@ def test_DataFilesDict_from_patterns_in_dataset_repository_hashing(hub_dataset_r
     data_files2 = DataFilesDict(sorted(data_files1.items(), reverse=True))
     assert Hasher.hash(data_files1) == Hasher.hash(data_files2)
 
-    patterns2 = {"train": ["data/**train.txt"], "test": ["data/**test.txt"]}
-    data_files2 = DataFilesDict.from_patterns(patterns2, hub_dataset_repo_path)
-    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)
-
-    patterns2 = {"train": ["data/**train.txt"], "test": ["data/**train.txt"]}
-    data_files2 = DataFilesDict.from_patterns(patterns2, hub_dataset_repo_path)
-    assert Hasher.hash(data_files1) != Hasher.hash(data_files2)
-
     # the tmpfs used to mock the hub repo is based on a local directory
     # therefore os.stat is used to get the mtime of the data files
     with patch("os.stat", return_value=os.stat(__file__)):
@@ -609,6 +599,18 @@ def ls(self, path, detail=True, refresh=True, **kwargs):
         {"test": "data/my_test_file.txt"},
         {"validation": "my_validation_dir/dataset.txt"},
         {"validation": "data/my_validation_file.txt"},
+        {"train": "train_dir/dataset.txt"},
+        {"train": "data/train_file.txt"},
+        {"test": "test_dir/dataset.txt"},
+        {"test": "data/test_file.txt"},
+        {"validation": "validation_dir/dataset.txt"},
+        {"validation": "data/validation_file.txt"},
+        {"train": "my_train/dataset.txt"},
+        {"train": "data/my_train.txt"},
+        {"test": "my_test/dataset.txt"},
+        {"test": "data/my_test.txt"},
+        {"validation": "my_validation/dataset.txt"},
+        {"validation": "data/my_validation.txt"},
         # With test<>eval aliases
         {"test": "eval.txt"},
         {"test": "data/eval.txt"},
@@ -631,6 +633,7 @@ def ls(self, path, detail=True, refresh=True, **kwargs):
         {"test": "my-test-file.txt"},
         {"test": "my_test_file.txt"},
         {"test": "my test file.txt"},
+        {"test": "my-test_file.txt"},
         {"test": "test00001.txt"},
     ],
 )