99from fsspec import get_fs_token_paths
1010from fsspec .implementations .http import HTTPFileSystem
1111from huggingface_hub import HfFileSystem
12+ from packaging import version
1213from tqdm .contrib .concurrent import thread_map
1314
1415from . import config
@@ -42,23 +43,17 @@ class EmptyDatasetError(FileNotFoundError):
4243 Split .TEST : ["test" , "testing" , "eval" , "evaluation" ],
4344}
4445NON_WORDS_CHARS = "-._ 0-9"
45- KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*" , "{keyword}[{sep}]*" ]
46- KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**" , "**[{sep}/]{keyword}[{sep}/]**" ]
46+ if config .FSSPEC_VERSION < version .parse ("2023.9.0" ):
47+ KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**" , "**[{sep}/]{keyword}[{sep}/]**" ]
48+ else :
49+ KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**" , "**/*[{sep}/]{keyword}[{sep}/]**" ]
4750
4851DEFAULT_SPLITS = [Split .TRAIN , Split .VALIDATION , Split .TEST ]
49- DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
52+ DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = {
5053 split : [
5154 pattern .format (keyword = keyword , sep = NON_WORDS_CHARS )
5255 for keyword in SPLIT_KEYWORDS [split ]
53- for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
54- ]
55- for split in DEFAULT_SPLITS
56- }
57- DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
58- split : [
59- pattern .format (keyword = keyword , sep = NON_WORDS_CHARS )
60- for keyword in SPLIT_KEYWORDS [split ]
61- for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
56+ for pattern in KEYWORDS_IN_PATH_NAME_BASE_PATTERNS
6257 ]
6358 for split in DEFAULT_SPLITS
6459}
@@ -69,16 +64,21 @@ class EmptyDatasetError(FileNotFoundError):
6964
7065ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED ]
7166ALL_DEFAULT_PATTERNS = [
72- DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME ,
73- DEFAULT_PATTERNS_SPLIT_IN_FILENAME ,
67+ DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME ,
7468 DEFAULT_PATTERNS_ALL ,
7569]
76- METADATA_PATTERNS = [
77- "metadata.csv" ,
78- "**/metadata.csv" ,
79- "metadata.jsonl" ,
80- "**/metadata.jsonl" ,
81- ] # metadata file for ImageFolder and AudioFolder
70+ if config .FSSPEC_VERSION < version .parse ("2023.9.0" ):
71+ METADATA_PATTERNS = [
72+ "metadata.csv" ,
73+ "**/metadata.csv" ,
74+ "metadata.jsonl" ,
75+ "**/metadata.jsonl" ,
76+ ] # metadata file for ImageFolder and AudioFolder
77+ else :
78+ METADATA_PATTERNS = [
79+ "**/metadata.csv" ,
80+ "**/metadata.jsonl" ,
81+ ] # metadata file for ImageFolder and AudioFolder
8282WILDCARD_CHARACTERS = "*[]"
8383FILES_TO_IGNORE = [
8484 "README.md" ,
@@ -297,10 +297,10 @@ def resolve_pattern(
297297 - data/** to match all the files inside "data" and its subdirectories
298298
299299 The patterns are resolved using the fsspec glob.
300- Here are some behaviors specific to fsspec glob that are different from glob.glob, Path.glob, Path.match or fnmatch:
301- - '*' matches only first level items
302- - '**' matches all items
303- - '**/*' matches all at least second level items
300+
301+ glob.glob, Path.glob, Path.match and fnmatch do not support ** with a prefix/suffix other than a forward slash /.
302+ For instance, with those functions **.json is the same as *.json. In contrast, the fsspec glob places no restriction on the ** prefix/suffix,
303+ so **.json is equivalent to **/*.json.
304304
305305 More generally:
306306 - '*' matches any character except a forward-slash (to match just the file or directory name)
@@ -417,7 +417,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
417417
418418 Output:
419419
420- {"train": ["**train*"], "test": ["**test*"]}
420+ {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
421+ 'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
421422
422423 Input:
423424
@@ -435,7 +436,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
435436
436437 Output:
437438
438- {"train": ["**train*/**"], "test": ["**test*/**"]}
439+ {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
440+ 'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
439441
440442 Input:
441443
@@ -452,11 +454,9 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
452454
453455 Output:
454456
455- {
456- "train": ["data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
457- "test": ["data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
458- "random": ["data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9].*"],
459- }
457+ {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
458+ 'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
459+ 'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}
460460
461461 It first tests whether SPLIT_PATTERN_SHARDED works; otherwise, it tests the patterns in ALL_DEFAULT_PATTERNS, in order.
462462 """
0 commit comments