66from typing import Callable , Dict , List , Optional , Set , Tuple , Union
77
88import huggingface_hub
9- from fsspec import get_fs_token_paths
9+ from fsspec . core import url_to_fs
1010from fsspec .implementations .http import HTTPFileSystem
1111from huggingface_hub import HfFileSystem
1212from packaging import version
@@ -46,36 +46,57 @@ class EmptyDatasetError(FileNotFoundError):
4646}
4747NON_WORDS_CHARS = "-._ 0-9"
4848if config .FSSPEC_VERSION < version .parse ("2023.9.0" ):
49- KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**" , "**[{sep}/]{keyword}[{sep}/]**" ]
49+ KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*" , "{keyword}[{sep}]*" ]
50+ KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
51+ "{keyword}/**" ,
52+ "{keyword}[{sep}]*/**" ,
53+ "**[{sep}/]{keyword}/**" ,
54+ "**[{sep}/]{keyword}[{sep}]*/**" ,
55+ ]
5056elif config .FSSPEC_VERSION < version .parse ("2023.12.0" ):
51- KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**" , "**/*[{sep}/]{keyword}[{sep}/]**" ]
57+ KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/*[{sep}/]{keyword}[{sep}]*" , "{keyword}[{sep}]*" ]
58+ KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
59+ "{keyword}/**/*" ,
60+ "{keyword}[{sep}]*/**/*" ,
61+ "**/*[{sep}/]{keyword}/**/*" ,
62+ "**/*[{sep}/]{keyword}[{sep}]*/**/*" ,
63+ ]
5264else :
53- KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = [
54- "**/{keyword}[{sep}]*" ,
65+ KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/{keyword}[{sep}]*" , "**/*[{sep}]{keyword}[{sep}]*" ]
66+ KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
5567 "**/{keyword}/**" ,
56- "**/*[{sep}]{keyword}[{sep}]*" ,
57- "**/*[{sep}]{keyword}[{sep}]*/**" ,
5868 "**/{keyword}[{sep}]*/**" ,
5969 "**/*[{sep}]{keyword}/**" ,
70+ "**/*[{sep}]{keyword}[{sep}]*/**" ,
6071 ]
6172
6273DEFAULT_SPLITS = [Split .TRAIN , Split .VALIDATION , Split .TEST ]
63- DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = {
74+ DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
6475 split : [
6576 pattern .format (keyword = keyword , sep = NON_WORDS_CHARS )
6677 for keyword in SPLIT_KEYWORDS [split ]
67- for pattern in KEYWORDS_IN_PATH_NAME_BASE_PATTERNS
78+ for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
6879 ]
6980 for split in DEFAULT_SPLITS
7081}
82+ DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
83+ split : [
84+ pattern .format (keyword = keyword , sep = NON_WORDS_CHARS )
85+ for keyword in SPLIT_KEYWORDS [split ]
86+ for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
87+ ]
88+ for split in DEFAULT_SPLITS
89+ }
90+
7191
7292DEFAULT_PATTERNS_ALL = {
7393 Split .TRAIN : ["**" ],
7494}
7595
7696ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED ]
7797ALL_DEFAULT_PATTERNS = [
78- DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME ,
98+ DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME ,
99+ DEFAULT_PATTERNS_SPLIT_IN_FILENAME ,
79100 DEFAULT_PATTERNS_ALL ,
80101]
81102if config .FSSPEC_VERSION < version .parse ("2023.9.0" ):
@@ -351,7 +372,7 @@ def resolve_pattern(
351372 else :
352373 base_path = ""
353374 pattern , storage_options = _prepare_path_and_storage_options (pattern , download_config = download_config )
354- fs , _ , _ = get_fs_token_paths (pattern , storage_options = storage_options )
375+ fs , * _ = url_to_fs (pattern , ** storage_options )
355376 fs_base_path = base_path .split ("::" )[0 ].split ("://" )[- 1 ] or fs .root_marker
356377 fs_pattern = pattern .split ("::" )[0 ].split ("://" )[- 1 ]
357378 files_to_ignore = set (FILES_TO_IGNORE ) - {xbasename (pattern )}
@@ -409,7 +430,7 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
409430
410431 Output:
411432
412- {"train": ["**"]}
433+ {'train': ['**']}
413434
414435 Input:
415436
@@ -435,8 +456,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
435456
436457 Output:
437458
438- {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
439- 'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
459+ {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
460+ 'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}
440461
441462 Input:
442463
@@ -454,8 +475,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
454475
455476 Output:
456477
457- {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
458- 'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
478+ {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
479+ 'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}
459480
460481 Input:
461482
@@ -504,7 +525,7 @@ def _get_single_origin_metadata(
504525 download_config : Optional [DownloadConfig ] = None ,
505526) -> Tuple [str ]:
506527 data_file , storage_options = _prepare_path_and_storage_options (data_file , download_config = download_config )
507- fs , _ , _ = get_fs_token_paths (data_file , storage_options = storage_options )
528+ fs , * _ = url_to_fs (data_file , ** storage_options )
508529 if isinstance (fs , HfFileSystem ):
509530 resolved_path = fs .resolve_path (data_file )
510531 return (resolved_path .repo_id , resolved_path .revision )
0 commit comments