Skip to content

Commit 21f84d0

Browse files
mariosasko and lhoestq
committed
Add support for fsspec>=2023.9.0 (#6244)
* Add support for `fsspec>=2023.9.0` * Fixes * Style * Fix mock fs for files in nested directories * Nit * More fixes * Nit * Remove print * Update tests/test_data_files.py Co-authored-by: Quentin Lhoest <[email protected]> * Address some more comments --------- Co-authored-by: Quentin Lhoest <[email protected]>
1 parent de305f7 commit 21f84d0

File tree

4 files changed

+46
-43
lines changed

4 files changed

+46
-43
lines changed

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,8 @@
125125
# for better multiprocessing
126126
"multiprocess",
127127
# to save datasets locally or on any filesystem
128-
# minimum 2021.11.1 so that BlockSizeError is fixed: see https://github.com/fsspec/filesystem_spec/pull/830
129-
"fsspec[http]>=2021.11.1",
128+
# minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143
129+
"fsspec[http]>=2023.1.0",
130130
# for data streaming via http
131131
"aiohttp",
132132
# To get datasets from the Datasets Hub on huggingface.co

src/datasets/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636

3737
# Imports
3838
DILL_VERSION = version.parse(importlib.metadata.version("dill"))
39+
FSSPEC_VERSION = version.parse(importlib.metadata.version("fsspec"))
3940
PANDAS_VERSION = version.parse(importlib.metadata.version("pandas"))
4041
PYARROW_VERSION = version.parse(importlib.metadata.version("pyarrow"))
4142

src/datasets/data_files.py

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from fsspec import get_fs_token_paths
99
from fsspec.implementations.http import HTTPFileSystem
1010
from huggingface_hub import HfFileSystem
11+
from packaging import version
1112
from tqdm.contrib.concurrent import thread_map
1213

1314
from . import config
@@ -41,23 +42,17 @@ class EmptyDatasetError(FileNotFoundError):
4142
Split.TEST: ["test", "testing", "eval", "evaluation"],
4243
}
4344
NON_WORDS_CHARS = "-._ 0-9"
44-
KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
45-
KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
45+
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
46+
KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
47+
else:
48+
KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**/*[{sep}/]{keyword}[{sep}/]**"]
4649

4750
DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
48-
DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
51+
DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = {
4952
split: [
5053
pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
5154
for keyword in SPLIT_KEYWORDS[split]
52-
for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
53-
]
54-
for split in DEFAULT_SPLITS
55-
}
56-
DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
57-
split: [
58-
pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
59-
for keyword in SPLIT_KEYWORDS[split]
60-
for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
55+
for pattern in KEYWORDS_IN_PATH_NAME_BASE_PATTERNS
6156
]
6257
for split in DEFAULT_SPLITS
6358
}
@@ -68,16 +63,21 @@ class EmptyDatasetError(FileNotFoundError):
6863

6964
ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]
7065
ALL_DEFAULT_PATTERNS = [
71-
DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,
72-
DEFAULT_PATTERNS_SPLIT_IN_FILENAME,
66+
DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME,
7367
DEFAULT_PATTERNS_ALL,
7468
]
75-
METADATA_PATTERNS = [
76-
"metadata.csv",
77-
"**/metadata.csv",
78-
"metadata.jsonl",
79-
"**/metadata.jsonl",
80-
] # metadata file for ImageFolder and AudioFolder
69+
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
70+
METADATA_PATTERNS = [
71+
"metadata.csv",
72+
"**/metadata.csv",
73+
"metadata.jsonl",
74+
"**/metadata.jsonl",
75+
] # metadata file for ImageFolder and AudioFolder
76+
else:
77+
METADATA_PATTERNS = [
78+
"**/metadata.csv",
79+
"**/metadata.jsonl",
80+
] # metadata file for ImageFolder and AudioFolder
8181
WILDCARD_CHARACTERS = "*[]"
8282
FILES_TO_IGNORE = [
8383
"README.md",
@@ -296,10 +296,10 @@ def resolve_pattern(
296296
- data/** to match all the files inside "data" and its subdirectories
297297
298298
The patterns are resolved using the fsspec glob.
299-
Here are some behaviors specific to fsspec glob that are different from glob.glob, Path.glob, Path.match or fnmatch:
300-
- '*' matches only first level items
301-
- '**' matches all items
302-
- '**/*' matches all at least second level items
299+
300+
glob.glob, Path.glob, Path.match or fnmatch do not support ** with a prefix/suffix other than a forward slash /.
301+
For instance, this means **.json is the same as *.json. On the contrary, the fsspec glob has no limits regarding the ** prefix/suffix,
302+
resulting in **.json being equivalent to **/*.json.
303303
304304
More generally:
305305
- '*' matches any character except a forward-slash (to match just the file or directory name)
@@ -416,7 +416,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
416416
417417
Output:
418418
419-
{"train": ["**train*"], "test": ["**test*"]}
419+
{'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
420+
'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
420421
421422
Input:
422423
@@ -434,7 +435,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
434435
435436
Output:
436437
437-
{"train": ["**train*/**"], "test": ["**test*/**"]}
438+
{'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
439+
'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
438440
439441
Input:
440442
@@ -451,11 +453,9 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
451453
452454
Output:
453455
454-
{
455-
"train": ["data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
456-
"test": ["data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
457-
"random": ["data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
458-
}
456+
{'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
457+
'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
458+
'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}
459459
460460
In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
461461
"""

tests/test_data_files.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import os
2-
from pathlib import Path, PurePath
2+
from pathlib import Path
33
from typing import List
44
from unittest.mock import patch
55

@@ -480,15 +480,16 @@ def mock_fs(file_paths: List[str]):
480480
Example:
481481
482482
```py
483-
>>> fs = mock_fs(["data/train.txt", "data.test.txt"])
483+
>>> DummyTestFS = mock_fs(["data/train.txt", "data.test.txt"])
484+
>>> fs = DummyTestFS()
484485
>>> assert fsspec.get_filesystem_class("mock").__name__ == "DummyTestFS"
485486
>>> assert type(fs).__name__ == "DummyTestFS"
486487
>>> print(fs.glob("**"))
487488
["data", "data/train.txt", "data.test.txt"]
488489
```
489490
"""
490491

491-
dir_paths = {file_path.rsplit("/")[0] for file_path in file_paths if "/" in file_path}
492+
dir_paths = {file_path.rsplit("/", 1)[0] for file_path in file_paths if "/" in file_path}
492493
fs_contents = [{"name": dir_path, "type": "directory"} for dir_path in dir_paths] + [
493494
{"name": file_path, "type": "file", "size": 10} for file_path in file_paths
494495
]
@@ -606,16 +607,17 @@ def resolver(pattern):
606607
["metadata.jsonl"],
607608
["metadata.csv"],
608609
# nested metadata files
609-
["data/metadata.jsonl", "data/train/metadata.jsonl"],
610-
["data/metadata.csv", "data/train/metadata.csv"],
610+
["metadata.jsonl", "data/metadata.jsonl"],
611+
["metadata.csv", "data/metadata.csv"],
611612
],
612613
)
613614
def test_get_metadata_files_patterns(metadata_files):
615+
DummyTestFS = mock_fs(metadata_files)
616+
fs = DummyTestFS()
617+
614618
def resolver(pattern):
615-
return [PurePath(path) for path in set(metadata_files) if PurePath(path).match(pattern)]
619+
return [file_path for file_path in fs.glob(pattern) if fs.isfile(file_path)]
616620

617621
patterns = _get_metadata_files_patterns(resolver)
618-
matched = [path for path in metadata_files for pattern in patterns if PurePath(path).match(pattern)]
619-
# Use set to remove the difference between in behavior between PurePath.match and mathcing via fsspec.glob
620-
assert len(set(matched)) == len(metadata_files)
621-
assert sorted(set(matched)) == sorted(metadata_files)
622+
matched = [file_path for pattern in patterns for file_path in resolver(pattern)]
623+
assert sorted(matched) == sorted(metadata_files)

0 commit comments

Comments
 (0)