Skip to content

Commit 21f84d0

Browse files
mariosasko and lhoestq
committed
Add support for fsspec>=2023.9.0 (#6244)
* Add support for `fsspec>=2023.9.0` * Fixes * Style * Fix mock fs for files in nested directories * Nit * More fixes * Nit * Remove print * Update tests/test_data_files.py Co-authored-by: Quentin Lhoest <[email protected]> * Address some more comments --------- Co-authored-by: Quentin Lhoest <[email protected]>
1 parent de305f7 commit 21f84d0

File tree

4 files changed

+46
-43
lines changed

4 files changed

+46
-43
lines changed

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,8 @@
125125
# for better multiprocessing
126126
"multiprocess",
127127
# to save datasets locally or on any filesystem
128-
# minimum 2021.11.1 so that BlockSizeError is fixed: see https://github.com/fsspec/filesystem_spec/pull/830
129-
"fsspec[http]>=2021.11.1",
128+
# minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143
129+
"fsspec[http]>=2023.1.0",
130130
# for data streaming via http
131131
"aiohttp",
132132
# To get datasets from the Datasets Hub on huggingface.co

src/datasets/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636

3737
# Imports
3838
DILL_VERSION = version.parse(importlib.metadata.version("dill"))
39+
FSSPEC_VERSION = version.parse(importlib.metadata.version("fsspec"))
3940
PANDAS_VERSION = version.parse(importlib.metadata.version("pandas"))
4041
PYARROW_VERSION = version.parse(importlib.metadata.version("pyarrow"))
4142

src/datasets/data_files.py

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from fsspec import get_fs_token_paths
99
from fsspec.implementations.http import HTTPFileSystem
1010
from huggingface_hub import HfFileSystem
11+
from packaging import version
1112
from tqdm.contrib.concurrent import thread_map
1213

1314
from . import config
@@ -41,23 +42,17 @@ class EmptyDatasetError(FileNotFoundError):
4142
Split.TEST: ["test", "testing", "eval", "evaluation"],
4243
}
4344
NON_WORDS_CHARS = "-._ 0-9"
44-
KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
45-
KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
45+
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
46+
KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
47+
else:
48+
KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**/*[{sep}/]{keyword}[{sep}/]**"]
4649

4750
DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
48-
DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
51+
DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = {
4952
split: [
5053
pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
5154
for keyword in SPLIT_KEYWORDS[split]
52-
for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
53-
]
54-
for split in DEFAULT_SPLITS
55-
}
56-
DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
57-
split: [
58-
pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
59-
for keyword in SPLIT_KEYWORDS[split]
60-
for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
55+
for pattern in KEYWORDS_IN_PATH_NAME_BASE_PATTERNS
6156
]
6257
for split in DEFAULT_SPLITS
6358
}
@@ -68,16 +63,21 @@ class EmptyDatasetError(FileNotFoundError):
6863

6964
ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]
7065
ALL_DEFAULT_PATTERNS = [
71-
DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,
72-
DEFAULT_PATTERNS_SPLIT_IN_FILENAME,
66+
DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME,
7367
DEFAULT_PATTERNS_ALL,
7468
]
75-
METADATA_PATTERNS = [
76-
"metadata.csv",
77-
"**/metadata.csv",
78-
"metadata.jsonl",
79-
"**/metadata.jsonl",
80-
] # metadata file for ImageFolder and AudioFolder
69+
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
70+
METADATA_PATTERNS = [
71+
"metadata.csv",
72+
"**/metadata.csv",
73+
"metadata.jsonl",
74+
"**/metadata.jsonl",
75+
] # metadata file for ImageFolder and AudioFolder
76+
else:
77+
METADATA_PATTERNS = [
78+
"**/metadata.csv",
79+
"**/metadata.jsonl",
80+
] # metadata file for ImageFolder and AudioFolder
8181
WILDCARD_CHARACTERS = "*[]"
8282
FILES_TO_IGNORE = [
8383
"README.md",
@@ -296,10 +296,10 @@ def resolve_pattern(
296296
- data/** to match all the files inside "data" and its subdirectories
297297
298298
The patterns are resolved using the fsspec glob.
299-
Here are some behaviors specific to fsspec glob that are different from glob.glob, Path.glob, Path.match or fnmatch:
300-
- '*' matches only first level items
301-
- '**' matches all items
302-
- '**/*' matches all at least second level items
299+
300+
glob.glob, Path.glob, Path.match or fnmatch do not support ** with a prefix/suffix other than a forward slash /.
301+
For instance, this means **.json is the same as *.json. On the contrary, the fsspec glob has no limits regarding the ** prefix/suffix,
302+
resulting in **.json being equivalent to **/*.json.
303303
304304
More generally:
305305
- '*' matches any character except a forward-slash (to match just the file or directory name)
@@ -416,7 +416,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
416416
417417
Output:
418418
419-
{"train": ["**train*"], "test": ["**test*"]}
419+
{'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
420+
'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
420421
421422
Input:
422423
@@ -434,7 +435,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
434435
435436
Output:
436437
437-
{"train": ["**train*/**"], "test": ["**test*/**"]}
438+
{'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
439+
'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
438440
439441
Input:
440442
@@ -451,11 +453,9 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
451453
452454
Output:
453455
454-
{
455-
"train": ["data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
456-
"test": ["data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
457-
"random": ["data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
458-
}
456+
{'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
457+
'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
458+
'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}
459459
460460
In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
461461
"""

tests/test_data_files.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import os
2-
from pathlib import Path, PurePath
2+
from pathlib import Path
33
from typing import List
44
from unittest.mock import patch
55

@@ -480,15 +480,16 @@ def mock_fs(file_paths: List[str]):
480480
Example:
481481
482482
```py
483-
>>> fs = mock_fs(["data/train.txt", "data.test.txt"])
483+
>>> DummyTestFS = mock_fs(["data/train.txt", "data.test.txt"])
484+
>>> fs = DummyTestFS()
484485
>>> assert fsspec.get_filesystem_class("mock").__name__ == "DummyTestFS"
485486
>>> assert type(fs).__name__ == "DummyTestFS"
486487
>>> print(fs.glob("**"))
487488
["data", "data/train.txt", "data.test.txt"]
488489
```
489490
"""
490491

491-
dir_paths = {file_path.rsplit("/")[0] for file_path in file_paths if "/" in file_path}
492+
dir_paths = {file_path.rsplit("/", 1)[0] for file_path in file_paths if "/" in file_path}
492493
fs_contents = [{"name": dir_path, "type": "directory"} for dir_path in dir_paths] + [
493494
{"name": file_path, "type": "file", "size": 10} for file_path in file_paths
494495
]
@@ -606,16 +607,17 @@ def resolver(pattern):
606607
["metadata.jsonl"],
607608
["metadata.csv"],
608609
# nested metadata files
609-
["data/metadata.jsonl", "data/train/metadata.jsonl"],
610-
["data/metadata.csv", "data/train/metadata.csv"],
610+
["metadata.jsonl", "data/metadata.jsonl"],
611+
["metadata.csv", "data/metadata.csv"],
611612
],
612613
)
613614
def test_get_metadata_files_patterns(metadata_files):
615+
DummyTestFS = mock_fs(metadata_files)
616+
fs = DummyTestFS()
617+
614618
def resolver(pattern):
615-
return [PurePath(path) for path in set(metadata_files) if PurePath(path).match(pattern)]
619+
return [file_path for file_path in fs.glob(pattern) if fs.isfile(file_path)]
616620

617621
patterns = _get_metadata_files_patterns(resolver)
618-
matched = [path for path in metadata_files for pattern in patterns if PurePath(path).match(pattern)]
619-
# Use set to remove the difference between in behavior between PurePath.match and mathcing via fsspec.glob
620-
assert len(set(matched)) == len(metadata_files)
621-
assert sorted(set(matched)) == sorted(metadata_files)
622+
matched = [file_path for pattern in patterns for file_path in resolver(pattern)]
623+
assert sorted(matched) == sorted(metadata_files)

0 commit comments

Comments
 (0)