88from fsspec import get_fs_token_paths
99from fsspec .implementations .http import HTTPFileSystem
1010from huggingface_hub import HfFileSystem
11+ from packaging import version
1112from tqdm .contrib .concurrent import thread_map
1213
1314from . import config
@@ -41,23 +42,17 @@ class EmptyDatasetError(FileNotFoundError):
4142 Split .TEST : ["test" , "testing" , "eval" , "evaluation" ],
4243}
4344NON_WORDS_CHARS = "-._ 0-9"
44- KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*" , "{keyword}[{sep}]*" ]
45- KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**" , "**[{sep}/]{keyword}[{sep}/]**" ]
45+ if config .FSSPEC_VERSION < version .parse ("2023.9.0" ):
46+ KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**" , "**[{sep}/]{keyword}[{sep}/]**" ]
47+ else :
48+ KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**" , "**/*[{sep}/]{keyword}[{sep}/]**" ]
4649
4750DEFAULT_SPLITS = [Split .TRAIN , Split .VALIDATION , Split .TEST ]
48- DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
51+ DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = {
4952 split : [
5053 pattern .format (keyword = keyword , sep = NON_WORDS_CHARS )
5154 for keyword in SPLIT_KEYWORDS [split ]
52- for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
53- ]
54- for split in DEFAULT_SPLITS
55- }
56- DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
57- split : [
58- pattern .format (keyword = keyword , sep = NON_WORDS_CHARS )
59- for keyword in SPLIT_KEYWORDS [split ]
60- for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
55+ for pattern in KEYWORDS_IN_PATH_NAME_BASE_PATTERNS
6156 ]
6257 for split in DEFAULT_SPLITS
6358}
@@ -68,16 +63,21 @@ class EmptyDatasetError(FileNotFoundError):
6863
6964ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED ]
7065ALL_DEFAULT_PATTERNS = [
71- DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME ,
72- DEFAULT_PATTERNS_SPLIT_IN_FILENAME ,
66+ DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME ,
7367 DEFAULT_PATTERNS_ALL ,
7468]
75- METADATA_PATTERNS = [
76- "metadata.csv" ,
77- "**/metadata.csv" ,
78- "metadata.jsonl" ,
79- "**/metadata.jsonl" ,
80- ] # metadata file for ImageFolder and AudioFolder
69+ if config .FSSPEC_VERSION < version .parse ("2023.9.0" ):
70+ METADATA_PATTERNS = [
71+ "metadata.csv" ,
72+ "**/metadata.csv" ,
73+ "metadata.jsonl" ,
74+ "**/metadata.jsonl" ,
75+ ] # metadata file for ImageFolder and AudioFolder
76+ else :
77+ METADATA_PATTERNS = [
78+ "**/metadata.csv" ,
79+ "**/metadata.jsonl" ,
80+ ] # metadata file for ImageFolder and AudioFolder
8181WILDCARD_CHARACTERS = "*[]"
8282FILES_TO_IGNORE = [
8383 "README.md" ,
@@ -296,10 +296,10 @@ def resolve_pattern(
296296 - data/** to match all the files inside "data" and its subdirectories
297297
298298 The patterns are resolved using the fsspec glob.
299- Here are some behaviors specific to fsspec glob that are different from glob.glob, Path.glob, Path.match or fnmatch:
300- - '*' matches only first level items
301- - '**' matches all items
302- - '**/*' matches all at least second level items
299+
300+ glob.glob, Path.glob, Path.match and fnmatch do not support ** with a prefix/suffix other than a forward slash /.
301+ For instance, with those functions **.json is the same as *.json. In contrast, the fsspec glob places no restriction on the ** prefix/suffix,
302+ so **.json is equivalent to **/*.json.
303303
304304 More generally:
305305 - '*' matches any character except a forward-slash (to match just the file or directory name)
@@ -416,7 +416,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
416416
417417 Output:
418418
419- {"train": ["**train*"], "test": ["**test*"]}
419+ {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
420+ 'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
420421
421422 Input:
422423
@@ -434,7 +435,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
434435
435436 Output:
436437
437- {"train": ["**train*/**"], "test": ["**test*/**"]}
438+ {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
439+ 'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
438440
439441 Input:
440442
@@ -451,11 +453,9 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
451453
452454 Output:
453455
454- {
455- "train": ["data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
456- "test": ["data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
457- "random": ["data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
458- }
456+ {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
457+ 'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
458+ 'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}
459459
460460 It first tests whether SPLIT_PATTERN_SHARDED works; otherwise it tests the patterns in ALL_DEFAULT_PATTERNS, in order.
461461 """
0 commit comments