diff --git a/setup.py b/setup.py index 6508452a4f0..45a892c19d6 100644 --- a/setup.py +++ b/setup.py @@ -131,7 +131,7 @@ "multiprocess", # to save datasets locally or on any filesystem # minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143 - "fsspec[http]>=2023.1.0,<=2023.10.0", + "fsspec[http]>=2023.1.0,<=2024.2.0", # for data streaming via http "aiohttp", # To get datasets from the Datasets Hub on huggingface.co diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 1a4d447b48d..752145413db 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -47,8 +47,17 @@ class EmptyDatasetError(FileNotFoundError): NON_WORDS_CHARS = "-._ 0-9" if config.FSSPEC_VERSION < version.parse("2023.9.0"): KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"] -else: +elif config.FSSPEC_VERSION < version.parse("2023.12.0"): KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**/*[{sep}/]{keyword}[{sep}/]**"] +else: + KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = [ + "**/{keyword}[{sep}]*", + "**/{keyword}/**", + "**/*[{sep}]{keyword}[{sep}]*", + "**/*[{sep}]{keyword}[{sep}]*/**", + "**/{keyword}[{sep}]*/**", + "**/*[{sep}]{keyword}/**", + ] DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST] DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = { @@ -303,11 +312,9 @@ def resolve_pattern( - data/* to match all the files inside "data" - data/** to match all the files inside "data" and its subdirectories - The patterns are resolved using the fsspec glob. - - glob.glob, Path.glob, Path.match or fnmatch do not support ** with a prefix/suffix other than a forward slash /. - For instance, this means **.json is the same as *.json. On the contrary, the fsspec glob has no limits regarding the ** prefix/suffix, - resulting in **.json being equivalent to **/*.json. + The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to + Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix + other than a forward slash /. More generally: - '*' matches any character except a forward-slash (to match just the file or directory name) diff --git a/tests/test_data_files.py b/tests/test_data_files.py index 9ac8945c931..0b4ef65ce2a 100644 --- a/tests/test_data_files.py +++ b/tests/test_data_files.py @@ -415,8 +415,6 @@ def test_DataFilesDict_from_patterns_in_dataset_repository( ("**", 4, None, "train"), ("**", 4, "data", "train"), ("**", 2, "data/subdir", "train"), - ("**train*", 1, "data/subdir", "train"), - ("**test*", 1, "data/subdir", "test"), ("**", 0, "data/subdir2", "train"), ], ) @@ -452,14 +450,6 @@ def test_DataFilesDict_from_patterns_in_dataset_repository_hashing(hub_dataset_r data_files2 = DataFilesDict(sorted(data_files1.items(), reverse=True)) assert Hasher.hash(data_files1) == Hasher.hash(data_files2) - patterns2 = {"train": ["data/**train.txt"], "test": ["data/**test.txt"]} - data_files2 = DataFilesDict.from_patterns(patterns2, hub_dataset_repo_path) - assert Hasher.hash(data_files1) == Hasher.hash(data_files2) - - patterns2 = {"train": ["data/**train.txt"], "test": ["data/**train.txt"]} - data_files2 = DataFilesDict.from_patterns(patterns2, hub_dataset_repo_path) - assert Hasher.hash(data_files1) != Hasher.hash(data_files2) - # the tmpfs used to mock the hub repo is based on a local directory # therefore os.stat is used to get the mtime of the data files with patch("os.stat", return_value=os.stat(__file__)): @@ -609,6 +599,18 @@ def ls(self, path, detail=True, refresh=True, **kwargs): {"test": "data/my_test_file.txt"}, {"validation": "my_validation_dir/dataset.txt"}, {"validation": "data/my_validation_file.txt"}, + {"train": "train_dir/dataset.txt"}, + {"train": "data/train_file.txt"}, + {"test": "test_dir/dataset.txt"}, + {"test": "data/test_file.txt"}, + {"validation": "validation_dir/dataset.txt"}, + {"validation": "data/validation_file.txt"}, + {"train": "my_train/dataset.txt"}, + {"train": "data/my_train.txt"}, + {"test": "my_test/dataset.txt"}, + {"test": "data/my_test.txt"}, + {"validation": "my_validation/dataset.txt"}, + {"validation": "data/my_validation.txt"}, # With test<>eval aliases {"test": "eval.txt"}, {"test": "data/eval.txt"}, @@ -631,6 +633,7 @@ def ls(self, path, detail=True, refresh=True, **kwargs): {"test": "my-test-file.txt"}, {"test": "my_test_file.txt"}, {"test": "my test file.txt"}, + {"test": "my-test_file.txt"}, {"test": "test00001.txt"}, ], )