Commit 5b53cf3

mariosasko and lhoestq committed
Add support for fsspec>=2023.9.0 (#6244)
* Add support for `fsspec>=2023.9.0`
* Fixes
* Style
* Fix mock fs for files in nested directories
* Nit
* More fixes
* Nit
* Remove print
* Update tests/test_data_files.py

Co-authored-by: Quentin Lhoest <[email protected]>

* Address some more comments

---------

Co-authored-by: Quentin Lhoest <[email protected]>
1 parent 9fc46da commit 5b53cf3

File tree: 4 files changed (+45, -42 lines)


setup.py
Lines changed: 1 addition & 1 deletion

@@ -126,7 +126,7 @@
     "multiprocess",
     # to save datasets locally or on any filesystem
     # minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143
-    "fsspec[http]>=2023.1.0,<2023.9.0",  # Temporary pin
+    "fsspec[http]>=2023.1.0",
     # for data streaming via http
     "aiohttp",
     # To get datasets from the Datasets Hub on huggingface.co
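
The relaxed requirement can be sanity-checked against the current environment. A minimal sketch (not part of the commit), assuming `packaging` and an installed `fsspec`:

```py
# Hedged sketch (not from the commit): confirm the installed fsspec matches
# the relaxed requirement once the temporary "<2023.9.0" pin is dropped.
import importlib.metadata

from packaging.specifiers import SpecifierSet
from packaging.version import Version

requirement = SpecifierSet(">=2023.1.0")  # upper bound removed by this commit
installed = Version(importlib.metadata.version("fsspec"))

assert installed in requirement, f"fsspec {installed} does not satisfy {requirement}"
print(f"fsspec {installed} satisfies {requirement}")
```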

src/datasets/config.py
Lines changed: 1 addition & 0 deletions

@@ -36,6 +36,7 @@

 # Imports
 DILL_VERSION = version.parse(importlib.metadata.version("dill"))
+FSSPEC_VERSION = version.parse(importlib.metadata.version("fsspec"))
 PANDAS_VERSION = version.parse(importlib.metadata.version("pandas"))
 PYARROW_VERSION = version.parse(importlib.metadata.version("pyarrow"))
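
`FSSPEC_VERSION` is what lets the rest of the library branch on the installed fsspec at import time. A minimal sketch of the same idiom in isolation, assuming the `2023.9.0` threshold used in `data_files.py` below:

```py
# Minimal sketch of the version-gating idiom added here: parse the installed
# fsspec version once, then branch on it when choosing behavior.
import importlib.metadata

from packaging import version

FSSPEC_VERSION = version.parse(importlib.metadata.version("fsspec"))

if FSSPEC_VERSION < version.parse("2023.9.0"):
    print("using pre-2023.9.0 glob patterns")
else:
    print("using 2023.9.0+ glob patterns")
```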

src/datasets/data_files.py
Lines changed: 31 additions & 31 deletions

@@ -9,6 +9,7 @@
 from fsspec import get_fs_token_paths
 from fsspec.implementations.http import HTTPFileSystem
 from huggingface_hub import HfFileSystem
+from packaging import version
 from tqdm.contrib.concurrent import thread_map

 from . import config
@@ -42,23 +43,17 @@ class EmptyDatasetError(FileNotFoundError):
     Split.TEST: ["test", "testing", "eval", "evaluation"],
 }
 NON_WORDS_CHARS = "-._ 0-9"
-KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
-KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
+if config.FSSPEC_VERSION < version.parse("2023.9.0"):
+    KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
+else:
+    KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**/*[{sep}/]{keyword}[{sep}/]**"]

 DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
-DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
+DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = {
     split: [
         pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
         for keyword in SPLIT_KEYWORDS[split]
-        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
-    ]
-    for split in DEFAULT_SPLITS
-}
-DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
-    split: [
-        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
-        for keyword in SPLIT_KEYWORDS[split]
-        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
+        for pattern in KEYWORDS_IN_PATH_NAME_BASE_PATTERNS
     ]
     for split in DEFAULT_SPLITS
 }
@@ -69,16 +64,21 @@ class EmptyDatasetError(FileNotFoundError):

 ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]
 ALL_DEFAULT_PATTERNS = [
-    DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,
-    DEFAULT_PATTERNS_SPLIT_IN_FILENAME,
+    DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME,
     DEFAULT_PATTERNS_ALL,
 ]
-METADATA_PATTERNS = [
-    "metadata.csv",
-    "**/metadata.csv",
-    "metadata.jsonl",
-    "**/metadata.jsonl",
-]  # metadata file for ImageFolder and AudioFolder
+if config.FSSPEC_VERSION < version.parse("2023.9.0"):
+    METADATA_PATTERNS = [
+        "metadata.csv",
+        "**/metadata.csv",
+        "metadata.jsonl",
+        "**/metadata.jsonl",
+    ]  # metadata file for ImageFolder and AudioFolder
+else:
+    METADATA_PATTERNS = [
+        "**/metadata.csv",
+        "**/metadata.jsonl",
+    ]  # metadata file for ImageFolder and AudioFolder
 WILDCARD_CHARACTERS = "*[]"
 FILES_TO_IGNORE = [
     "README.md",
@@ -297,10 +297,10 @@ def resolve_pattern(
     - data/** to match all the files inside "data" and its subdirectories

     The patterns are resolved using the fsspec glob.
-    Here are some behaviors specific to fsspec glob that are different from glob.glob, Path.glob, Path.match or fnmatch:
-    - '*' matches only first level items
-    - '**' matches all items
-    - '**/*' matches all at least second level items
+
+    glob.glob, Path.glob, Path.match or fnmatch do not support ** with a prefix/suffix other than a forward slash /.
+    For instance, this means **.json is the same as *.json. On the contrary, the fsspec glob has no limits regarding the ** prefix/suffix,
+    resulting in **.json being equivalent to **/*.json.

     More generally:
     - '*' matches any character except a forward-slash (to match just the file or directory name)
@@ -417,7 +417,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]

     Output:

-        {"train": ["**train*"], "test": ["**test*"]}
+        {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
+         'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}

     Input:

@@ -435,7 +436,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]

     Output:

-        {"train": ["**train*/**"], "test": ["**test*/**"]}
+        {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
+         'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}

     Input:

@@ -452,11 +454,9 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]

     Output:

-        {
-            "train": ["data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
-            "test": ["data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
-            "random": ["data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
-        }
+        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
+         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
+         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

     In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
     """

tests/test_data_files.py
Lines changed: 12 additions & 10 deletions

@@ -1,6 +1,6 @@
 import copy
 import os
-from pathlib import Path, PurePath
+from pathlib import Path
 from typing import List
 from unittest.mock import patch

@@ -493,15 +493,16 @@ def mock_fs(file_paths: List[str]):
     Example:

     ```py
-    >>> fs = mock_fs(["data/train.txt", "data.test.txt"])
+    >>> DummyTestFS = mock_fs(["data/train.txt", "data.test.txt"])
+    >>> fs = DummyTestFS()
     >>> assert fsspec.get_filesystem_class("mock").__name__ == "DummyTestFS"
     >>> assert type(fs).__name__ == "DummyTestFS"
     >>> print(fs.glob("**"))
     ["data", "data/train.txt", "data.test.txt"]
     ```
     """

-    dir_paths = {file_path.rsplit("/")[0] for file_path in file_paths if "/" in file_path}
+    dir_paths = {file_path.rsplit("/", 1)[0] for file_path in file_paths if "/" in file_path}
     fs_contents = [{"name": dir_path, "type": "directory"} for dir_path in dir_paths] + [
         {"name": file_path, "type": "file", "size": 10} for file_path in file_paths
     ]
@@ -619,16 +620,17 @@ def resolver(pattern):
         ["metadata.jsonl"],
         ["metadata.csv"],
         # nested metadata files
-        ["data/metadata.jsonl", "data/train/metadata.jsonl"],
-        ["data/metadata.csv", "data/train/metadata.csv"],
+        ["metadata.jsonl", "data/metadata.jsonl"],
+        ["metadata.csv", "data/metadata.csv"],
     ],
 )
 def test_get_metadata_files_patterns(metadata_files):
+    DummyTestFS = mock_fs(metadata_files)
+    fs = DummyTestFS()
+
     def resolver(pattern):
-        return [PurePath(path) for path in set(metadata_files) if PurePath(path).match(pattern)]
+        return [file_path for file_path in fs.glob(pattern) if fs.isfile(file_path)]

     patterns = _get_metadata_files_patterns(resolver)
-    matched = [path for path in metadata_files for pattern in patterns if PurePath(path).match(pattern)]
-    # Use set to remove the difference between in behavior between PurePath.match and mathcing via fsspec.glob
-    assert len(set(matched)) == len(metadata_files)
-    assert sorted(set(matched)) == sorted(metadata_files)
+    matched = [file_path for pattern in patterns for file_path in resolver(pattern)]
+    assert sorted(matched) == sorted(metadata_files)
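
The rewritten test resolves patterns with `fs.glob` on a mock filesystem instead of matching paths with `PurePath.match`. A hedged sketch of the same resolver idea, built on fsspec's stock in-memory filesystem rather than the project's `mock_fs` helper:

```py
# Hedged sketch (not the project's mock_fs): the same glob-based resolver idea,
# using fsspec's in-memory filesystem instead of a custom DummyTestFS.
import fsspec

fs = fsspec.filesystem("memory")
for file_path in ["/metadata.jsonl", "/data/metadata.jsonl"]:
    fs.touch(file_path)


def resolver(pattern):
    # Keep only files, mirroring the resolver in the updated test above.
    return [path for path in fs.glob(pattern) if fs.isfile(path)]


# Exact matches depend on the installed fsspec's glob semantics, which is the
# behavior difference this commit accounts for.
print(resolver("/**/metadata.jsonl"))
```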

0 commit comments
