From d963a0e60643293a73bd73dbe5f25a25a2239bb2 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 5 Jul 2022 15:18:27 +0200 Subject: [PATCH 01/11] only match separated split names --- src/datasets/data_files.py | 26 +++++++--- tests/test_data_files.py | 97 +++++++++++++++++++++++++++++++------- 2 files changed, 101 insertions(+), 22 deletions(-) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 915c603d56c..e1c153f513f 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -27,15 +27,29 @@ class Url(str): SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*" DEFAULT_PATTERNS_SPLIT_IN_FILENAME = { - str(Split.TRAIN): ["**train*"], - str(Split.TEST): ["**test*", "**eval*"], - str(Split.VALIDATION): ["**dev*", "**valid*"], + str(Split.TRAIN): ["**[-._/]train[-._]*", "train[-._]*", "**[-._/]training[-._]*", "training[-._]*"], + str(Split.TEST): ["**[-._/]test[-._]*", "test[-._]*", "**[-._/]eval[-._]*", "eval[-._]*"], + str(Split.VALIDATION): [ + "**[-._/]dev[-._]*", + "dev[-._]*", + "**[-._/]valid[-._]*", + "valid[-._]*", + "**[-._/]validation[-._]*", + "validation[-._]*", + ], } DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = { - str(Split.TRAIN): ["**train*/**"], - str(Split.TEST): ["**test*/**", "**eval*/**"], - str(Split.VALIDATION): ["**dev*/**", "**valid*/**"], + str(Split.TRAIN): ["train[-._/]**", "**[-._/]train[-._/]**", "training[-._/]**", "**[-._/]training[-._/]**"], + str(Split.TEST): ["test[-._/]**", "**[-._/]test[-._/]**", "eval[-._/]**", "**[-._/]eval[-._/]**"], + str(Split.VALIDATION): [ + "dev[-._/]**", + "**[-._/]dev[-._/]**", + "valid[-._/]**", + "**[-._/]valid[-._/]**", + "validation[-._/]**", + "**[-._/]validation[-._/]**", + ], } DEFAULT_PATTERNS_ALL = { diff --git a/tests/test_data_files.py b/tests/test_data_files.py index b66fe1960fb..89b0ea70eb0 100644 --- a/tests/test_data_files.py +++ b/tests/test_data_files.py @@ -1,10 +1,12 @@ import os -from itertools import chain +from contextlib import contextmanager from pathlib import Path, PurePath +from typing import List from unittest.mock import patch import fsspec import pytest +from fsspec.spec import AbstractBufferedFile, AbstractFileSystem from huggingface_hub.hf_api import DatasetInfo from datasets.data_files import ( @@ -491,6 +493,69 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file): assert Hasher.hash(data_files1) != Hasher.hash(data_files2) +@contextmanager +def mock_fs(file_paths: List[str]): + """context manager to set up a mock:// filesystem in sfspec containing the provided files""" + + dir_paths = {file_path.rsplit("/")[0] for file_path in file_paths if "/" in file_path} + fs_contents = [{"name": dir_path, "type": "directory"} for dir_path in dir_paths] + [ + {"name": file_path, "type": "file", "size": 10} for file_path in file_paths + ] + + class DummyTestFS(AbstractFileSystem): + protocol = "mock" + _file_class = AbstractBufferedFile + _fs_contents = fs_contents + + def __getitem__(self, name): + for item in self._fs_contents: + if item["name"] == name: + return item + raise IndexError(f"{name} not found!") + + def ls(self, path, detail=True, refresh=True, **kwargs): + if kwargs.pop("strip_proto", True): + path = self._strip_protocol(path) + + files = not refresh and self._ls_from_cache(path) + if not files: + files = [file for file in self._fs_contents if path == self._parent(file["name"])] + files.sort(key=lambda file: file["name"]) + self.dircache[path.rstrip("/")] = files + + if detail: + return files + return [file["name"] for file in files] + + @classmethod + def get_test_paths(cls, start_with=""): + """Helper to return directory and file paths with no details""" + all = [file["name"] for file in cls._fs_contents if file["name"].startswith(start_with)] + return all + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + return self._file_class( + self, + path, + mode, + block_size, + autocommit, + cache_options=cache_options, + **kwargs, + ) + + with patch.dict(fsspec.registry.target, {"mock": DummyTestFS}): + yield DummyTestFS() + + @pytest.mark.parametrize( "data_file_per_split", [ @@ -541,25 +606,25 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file): {"validation": "dev/dataset.txt"}, # With other extensions {"train": "train.parquet", "test": "test.parquet", "validation": "valid.parquet"}, + # With "dev" or "eval" without separators + {"train": "developers_list.txt"}, + {"train": "data/seqeval_results.txt"}, ], ) def test_get_data_files_patterns(data_file_per_split): data_file_per_split = {k: v if isinstance(v, list) else [v] for k, v in data_file_per_split.items()} - - def resolver(pattern): - return [PurePath(path) for path in chain(*data_file_per_split.values()) if PurePath(path).match(pattern)] - - patterns_per_split = _get_data_files_patterns(resolver) - assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys()) - for split, patterns in patterns_per_split.items(): - matched = [ - path - for path in chain(*data_file_per_split.values()) - for pattern in patterns - if PurePath(path).match(pattern) - ] - assert len(matched) == len(data_file_per_split[split]) - assert matched == data_file_per_split[split] + with mock_fs( + [file_path for split_file_paths in data_file_per_split.values() for file_path in split_file_paths] + ) as fs: + + def resolver(pattern): + return [PurePath(file_path) for file_path in fs.glob(pattern) if fs.isfile(file_path)] + + patterns_per_split = _get_data_files_patterns(resolver) + assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys()) + for split, patterns in patterns_per_split.items(): + matched = [str(file_path) for pattern in patterns for file_path in resolver(pattern)] + assert matched == data_file_per_split[split] @pytest.mark.parametrize( From 159649a9b22dbf34b546228f199a05bb3d6cf4fd Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 5 Jul 2022 16:08:47 +0200 Subject: [PATCH 02/11] docs --- docs/source/repository_structure.mdx | 38 ++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx index 871879f0323..a53e9771dde 100644 --- a/docs/source/repository_structure.mdx +++ b/docs/source/repository_structure.mdx @@ -22,10 +22,13 @@ my_dataset_repository/ ## Splits and file names -🤗 Datasets automatically infer a dataset's train, validation, and test splits from the file names. Files that contain *train* in their names are considered part of the train split. The same idea applies to the test and validation split: +🤗 Datasets automatically infer a dataset's train, validation, and test splits from the file names. -- All the files that contain *test* in their names are considered part of the test split. -- All the files that contain *valid* in their names are considered part of the validation split. +Files that contain *train* in their names are considered part of the train split, e.g. `train.csv`, `my_train_file.csv`, etc. +The same idea applies to the test and validation split: + +- All the files that contain *test* in their names are considered part of the test split, e.g. `test.csv`, `my_test_file.csv` +- All the files that contain *validation* in their names are considered part of the validation split, e.g. `validation.csv`, `my_validation_file.csv` Here is an example where all the files are placed into a directory named `data`: @@ -35,9 +38,11 @@ my_dataset_repository/ └── data/ ├── train.csv ├── test.csv - └── valid.csv + └── validation.csv ``` +Note that if a file contains *test* but is embedded in another word (e.g. `contest.csv`), it's not counted as a test file. + ## Multiple files per split If one of your splits comprises several files, 🤗 Datasets can still infer whether it is the train, validation, and test split from the file name. @@ -58,7 +63,8 @@ Make sure all the files of your `train` set have *train* in their names (same fo Even if you add a prefix or suffix to `train` in the file name (like `my_train_file_00001.csv` for example), 🤗 Datasets can still infer the appropriate split. -For convenience, you can also place your data files into different directories. In this case, the split name is inferred from the directory name. +For convenience, you can also place your data files into different directories. +In this case, the split name is inferred from the directory name. ``` my_dataset_repository/ @@ -80,6 +86,28 @@ Eventually, you'll also be able to structure your repository to specify differen +## Split names keywords + +Train/validation/test splits are sometimes called train/dev/test, or sometimes train & eval sets. +These other names are also supported. +In particular, these keywords are equivalent: + +- train, training +- validation, valid, dev +- test, eval + +Therefore this is also a valid repository: + +``` +my_dataset_repository/ +├── README.md +└── data/ + ├── training.csv + ├── eval.csv + └── valid.csv +``` + + ## Custom split names If you have other data files in addition to the traditional train, validation, and test sets, you must use a different structure. From 78938562de214cda2cc7bccc0c6446c5bf16db8a Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 5 Jul 2022 16:15:30 +0200 Subject: [PATCH 03/11] add space separator --- src/datasets/data_files.py | 32 ++++++++++++++++---------------- tests/test_data_files.py | 5 +++++ 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index e1c153f513f..121f8685c42 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -27,28 +27,28 @@ class Url(str): SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*" DEFAULT_PATTERNS_SPLIT_IN_FILENAME = { - str(Split.TRAIN): ["**[-._/]train[-._]*", "train[-._]*", "**[-._/]training[-._]*", "training[-._]*"], - str(Split.TEST): ["**[-._/]test[-._]*", "test[-._]*", "**[-._/]eval[-._]*", "eval[-._]*"], + str(Split.TRAIN): ["**[-._ /]train[-._ ]*", "train[-._ ]*", "**[-._ /]training[-._ ]*", "training[-._ ]*"], + str(Split.TEST): ["**[-._ /]test[-._ ]*", "test[-._ ]*", "**[-._ /]eval[-._ ]*", "eval[-._ ]*"], str(Split.VALIDATION): [ - "**[-._/]dev[-._]*", - "dev[-._]*", - "**[-._/]valid[-._]*", - "valid[-._]*", - "**[-._/]validation[-._]*", - "validation[-._]*", + "**[-._ /]dev[-._ ]*", + "dev[-._ ]*", + "**[-._ /]valid[-._ ]*", + "valid[-._ ]*", + "**[-._ /]validation[-._ ]*", + "validation[-._ ]*", ], } DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = { - str(Split.TRAIN): ["train[-._/]**", "**[-._/]train[-._/]**", "training[-._/]**", "**[-._/]training[-._/]**"], - str(Split.TEST): ["test[-._/]**", "**[-._/]test[-._/]**", "eval[-._/]**", "**[-._/]eval[-._/]**"], + str(Split.TRAIN): ["train[-._ /]**", "**[-._ /]train[-._ /]**", "training[-._ /]**", "**[-._ /]training[-._ /]**"], + str(Split.TEST): ["test[-._ /]**", "**[-._ /]test[-._ /]**", "eval[-._ /]**", "**[-._ /]eval[-._ /]**"], str(Split.VALIDATION): [ - "dev[-._/]**", - "**[-._/]dev[-._/]**", - "valid[-._/]**", - "**[-._/]valid[-._/]**", - "validation[-._/]**", - "**[-._/]validation[-._/]**", + "dev[-._ /]**", + "**[-._ /]dev[-._ /]**", + "valid[-._ /]**", + "**[-._ /]valid[-._ /]**", + "validation[-._ /]**", + "**[-._ /]validation[-._ /]**", ], } diff --git a/tests/test_data_files.py b/tests/test_data_files.py index 89b0ea70eb0..e22fabffd8c 100644 --- a/tests/test_data_files.py +++ b/tests/test_data_files.py @@ -609,6 +609,11 @@ def _open( # With "dev" or "eval" without separators {"train": "developers_list.txt"}, {"train": "data/seqeval_results.txt"}, + # With supported separators + {"test": "my.test.file.txt"}, + {"test": "my-test-file.txt"}, + {"test": "my_test_file.txt"}, + {"test": "my test file.txt"}, ], ) def test_get_data_files_patterns(data_file_per_split): From 31bf1bb58e20ce704db07af126ecd1ab69e2fc1a Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 5 Jul 2022 17:04:39 +0200 Subject: [PATCH 04/11] fix win --- tests/test_data_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_data_files.py b/tests/test_data_files.py index e22fabffd8c..81b91f66080 100644 --- a/tests/test_data_files.py +++ b/tests/test_data_files.py @@ -628,7 +628,7 @@ def resolver(pattern): patterns_per_split = _get_data_files_patterns(resolver) assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys()) for split, patterns in patterns_per_split.items(): - matched = [str(file_path) for pattern in patterns for file_path in resolver(pattern)] + matched = [file_path.as_posix() for pattern in patterns for file_path in resolver(pattern)] assert matched == data_file_per_split[split] From 93f45262d6b009d7dddbd3540d45094b49b66a78 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 8 Jul 2022 15:11:52 +0200 Subject: [PATCH 05/11] add testing --- docs/source/repository_structure.mdx | 2 +- src/datasets/data_files.py | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx index a53e9771dde..cbe50be3403 100644 --- a/docs/source/repository_structure.mdx +++ b/docs/source/repository_structure.mdx @@ -94,7 +94,7 @@ In particular, these keywords are equivalent: - train, training - validation, valid, dev -- test, eval +- test, testing, eval Therefore this is also a valid repository: diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 6ed9174a53c..1e0da25e2b5 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -28,7 +28,14 @@ class Url(str): DEFAULT_PATTERNS_SPLIT_IN_FILENAME = { str(Split.TRAIN): ["**[-._ /]train[-._ ]*", "train[-._ ]*", "**[-._ /]training[-._ ]*", "training[-._ ]*"], - str(Split.TEST): ["**[-._ /]test[-._ ]*", "test[-._ ]*", "**[-._ /]eval[-._ ]*", "eval[-._ ]*"], + str(Split.TEST): [ + "**[-._ /]test[-._ ]*", + "test[-._ ]*", + "**[-._ /]testing[-._ ]*", + "testing[-._ ]*", + "**[-._ /]eval[-._ ]*", + "eval[-._ ]*", + ], str(Split.VALIDATION): [ "**[-._ /]dev[-._ ]*", "dev[-._ ]*", @@ -41,7 +48,14 @@ class Url(str): DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = { str(Split.TRAIN): ["train[-._ /]**", "**[-._ /]train[-._ /]**", "training[-._ /]**", "**[-._ /]training[-._ /]**"], - str(Split.TEST): ["test[-._ /]**", "**[-._ /]test[-._ /]**", "eval[-._ /]**", "**[-._ /]eval[-._ /]**"], + str(Split.TEST): [ + "test[-._ /]**", + "**[-._ /]test[-._ /]**", + "testing[-._ /]**", + "**[-._ /]testing[-._ /]**", + "eval[-._ /]**", + "**[-._ /]eval[-._ /]**", + ], str(Split.VALIDATION): [ "dev[-._ /]**", "**[-._ /]dev[-._ /]**", From e6adffbea00264bd0873e36b572cb7014f5feee8 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 8 Jul 2022 15:34:05 +0200 Subject: [PATCH 06/11] add evaluation --- docs/source/repository_structure.mdx | 2 +- src/datasets/data_files.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx index cbe50be3403..fdf64bd9910 100644 --- a/docs/source/repository_structure.mdx +++ b/docs/source/repository_structure.mdx @@ -94,7 +94,7 @@ In particular, these keywords are equivalent: - train, training - validation, valid, dev -- test, testing, eval +- test, testing, eval, evaluation Therefore this is also a valid repository: diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 1e0da25e2b5..d3102fb7c68 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -35,6 +35,8 @@ class Url(str): "testing[-._ ]*", "**[-._ /]eval[-._ ]*", "eval[-._ ]*", + "**[-._ /]evaluation[-._ ]*", + "evaluation[-._ ]*", ], str(Split.VALIDATION): [ "**[-._ /]dev[-._ ]*", @@ -55,6 +57,8 @@ class Url(str): "**[-._ /]testing[-._ /]**", "eval[-._ /]**", "**[-._ /]eval[-._ /]**", + "evaluation[-._ /]**", + "**[-._ /]evaluation[-._ /]**", ], str(Split.VALIDATION): [ "dev[-._ /]**", From 9393386464439e4340c88352ef4fdbc8ef2e6ae0 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 18 Jul 2022 11:49:45 +0200 Subject: [PATCH 07/11] suggestions in doc --- docs/source/repository_structure.mdx | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx index fdf64bd9910..e9e46ff5217 100644 --- a/docs/source/repository_structure.mdx +++ b/docs/source/repository_structure.mdx @@ -24,11 +24,11 @@ my_dataset_repository/ 🤗 Datasets automatically infer a dataset's train, validation, and test splits from the file names. -Files that contain *train* in their names are considered part of the train split, e.g. `train.csv`, `my_train_file.csv`, etc. -The same idea applies to the test and validation split: +All the files that contain a split name in their names (delimited by non-word characters, see below) are considered part of that split: -- All the files that contain *test* in their names are considered part of the test split, e.g. `test.csv`, `my_test_file.csv` -- All the files that contain *validation* in their names are considered part of the validation split, e.g. `validation.csv`, `my_validation_file.csv` +- train split: `train.csv`, `my_train_file.csv`, `train1.csv` +- validation split: `validation.csv`, `my_validation_file.csv`, `validation1.csv` +- test split: `test.csv`, `my_test_file.csv`, `test1.csv` Here is an example where all the files are placed into a directory named `data`: @@ -41,7 +41,9 @@ my_dataset_repository/ └── validation.csv ``` -Note that if a file contains *test* but is embedded in another word (e.g. `contest.csv`), it's not counted as a test file. +Note that if a file contains *test* but is embedded in another word (e.g. `testfile.csv`), it's not counted as a test file. +It must be delimited by non-word characters, e.g. `test_file.csv`. +Supported delimiters are underscores, dashes, spaces, dots and numbers. ## Multiple files per split @@ -88,7 +90,7 @@ Eventually, you'll also be able to structure your repository to specify differen ## Split names keywords -Train/validation/test splits are sometimes called train/dev/test, or sometimes train & eval sets. +Validation splits are sometimes called "dev", and test splits are called "eval". These other names are also supported. In particular, these keywords are equivalent: From 8efbb7866ba6fbbd722e72745ecc92e064b18721 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 18 Jul 2022 12:09:47 +0200 Subject: [PATCH 08/11] use list comprehension + support numbers --- src/datasets/data_files.py | 59 +++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index d3102fb7c68..50dddf1f0fa 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -26,47 +26,46 @@ class Url(str): SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*" +TRAIN_KEYWORDS = ["train", "training"] +TEST_KEYWORDS = ["test", "testing", "eval", "evaluation"] +VALIDATION_KEYWORDS = ["validation", "valid", "dev"] +NON_WORDS_CHARS = "-._ 0-9" +KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"] +KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"] + DEFAULT_PATTERNS_SPLIT_IN_FILENAME = { - str(Split.TRAIN): ["**[-._ /]train[-._ ]*", "train[-._ ]*", "**[-._ /]training[-._ ]*", "training[-._ ]*"], + str(Split.TRAIN): [ + pattern.format(keyword=keyword, sep=NON_WORDS_CHARS) + for keyword in TRAIN_KEYWORDS + for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS + ], str(Split.TEST): [ - "**[-._ /]test[-._ ]*", - "test[-._ ]*", - "**[-._ /]testing[-._ ]*", - "testing[-._ ]*", - "**[-._ /]eval[-._ ]*", - "eval[-._ ]*", - "**[-._ /]evaluation[-._ ]*", - "evaluation[-._ ]*", + pattern.format(keyword=keyword, sep=NON_WORDS_CHARS) + for keyword in TEST_KEYWORDS + for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS ], str(Split.VALIDATION): [ - "**[-._ /]dev[-._ ]*", - "dev[-._ ]*", - "**[-._ /]valid[-._ ]*", - "valid[-._ ]*", - "**[-._ /]validation[-._ ]*", - "validation[-._ ]*", + pattern.format(keyword=keyword, sep=NON_WORDS_CHARS) + for keyword in VALIDATION_KEYWORDS + for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS ], } DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = { - str(Split.TRAIN): ["train[-._ /]**", "**[-._ /]train[-._ /]**", "training[-._ /]**", "**[-._ /]training[-._ /]**"], + str(Split.TRAIN): [ + pattern.format(keyword=keyword, sep=NON_WORDS_CHARS) + for keyword in TRAIN_KEYWORDS + for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS + ], str(Split.TEST): [ - "test[-._ /]**", - "**[-._ /]test[-._ /]**", - "testing[-._ /]**", - "**[-._ /]testing[-._ /]**", - "eval[-._ /]**", - "**[-._ /]eval[-._ /]**", - "evaluation[-._ /]**", - "**[-._ /]evaluation[-._ /]**", + pattern.format(keyword=keyword, sep=NON_WORDS_CHARS) + for keyword in TEST_KEYWORDS + for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS ], str(Split.VALIDATION): [ - "dev[-._ /]**", - "**[-._ /]dev[-._ /]**", - "valid[-._ /]**", - "**[-._ /]valid[-._ /]**", - "validation[-._ /]**", - "**[-._ /]validation[-._ /]**", + pattern.format(keyword=keyword, sep=NON_WORDS_CHARS) + for keyword in VALIDATION_KEYWORDS + for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS ], } From ee15edecd4e210015492dff16c664966758664c8 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 18 Jul 2022 12:26:23 +0200 Subject: [PATCH 09/11] update tests --- tests/test_data_files.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/tests/test_data_files.py b/tests/test_data_files.py index 81b91f66080..7f1579e7a87 100644 --- a/tests/test_data_files.py +++ b/tests/test_data_files.py @@ -495,7 +495,19 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file): @contextmanager def mock_fs(file_paths: List[str]): - """context manager to set up a mock:// filesystem in sfspec containing the provided files""" + """ + Context manager to set up a mock:// filesystem in fsspec containing the provided files + + Example: + + ```py + >>> with mock_fs(["data/train.txt", "data.test.txt"]) as fs: + ... assert fsspec.get_filesystem_class("mock").__name__ == "DummyTestFS" + ... assert type(fs).__name__ == "DummyTestFS" + ... print(fs.glob("**")) + ["data", "data/train.txt", "data.test.txt"] + ``` + """ dir_paths = {file_path.rsplit("/")[0] for file_path in file_paths if "/" in file_path} fs_contents = [{"name": dir_path, "type": "directory"} for dir_path in dir_paths] + [ @@ -527,12 +539,6 @@ def ls(self, path, detail=True, refresh=True, **kwargs): return files return [file["name"] for file in files] - @classmethod - def get_test_paths(cls, start_with=""): - """Helper to return directory and file paths with no details""" - all = [file["name"] for file in cls._fs_contents if file["name"].startswith(start_with)] - return all - def _open( self, path, @@ -609,27 +615,28 @@ def _open( # With "dev" or "eval" without separators {"train": "developers_list.txt"}, {"train": "data/seqeval_results.txt"}, + {"train": "contest.txt"}, # With supported separators {"test": "my.test.file.txt"}, {"test": "my-test-file.txt"}, {"test": "my_test_file.txt"}, {"test": "my test file.txt"}, + {"test": "test00001.txt"}, ], ) def test_get_data_files_patterns(data_file_per_split): data_file_per_split = {k: v if isinstance(v, list) else [v] for k, v in data_file_per_split.items()} - with mock_fs( - [file_path for split_file_paths in data_file_per_split.values() for file_path in split_file_paths] - ) as fs: + file_paths = [file_path for split_file_paths in data_file_per_split.values() for file_path in split_file_paths] - def resolver(pattern): + def resolver(pattern): + with mock_fs(file_paths) as fs: return [PurePath(file_path) for file_path in fs.glob(pattern) if fs.isfile(file_path)] - patterns_per_split = _get_data_files_patterns(resolver) - assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys()) - for split, patterns in patterns_per_split.items(): - matched = [file_path.as_posix() for pattern in patterns for file_path in resolver(pattern)] - assert matched == data_file_per_split[split] + patterns_per_split = _get_data_files_patterns(resolver) + assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys()) + for split, patterns in patterns_per_split.items(): + matched = [file_path.as_posix() for pattern in patterns for file_path in resolver(pattern)] + assert matched == data_file_per_split[split] @pytest.mark.parametrize( From d372b28733cbc1df18b9518272563ec3de3842a4 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 18 Jul 2022 14:25:52 +0200 Subject: [PATCH 10/11] remove unnecessary patching and context manager --- tests/test_data_files.py | 45 +++++++--------------------------------- 1 file changed, 8 insertions(+), 37 deletions(-) diff --git a/tests/test_data_files.py b/tests/test_data_files.py index 7f1579e7a87..5c478d50d40 100644 --- a/tests/test_data_files.py +++ b/tests/test_data_files.py @@ -1,5 +1,4 @@ import os -from contextlib import contextmanager from pathlib import Path, PurePath from typing import List from unittest.mock import patch @@ -493,18 +492,17 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file): assert Hasher.hash(data_files1) != Hasher.hash(data_files2) -@contextmanager def mock_fs(file_paths: List[str]): """ - Context manager to set up a mock:// filesystem in fsspec containing the provided files + Set up a mock filesystem for fsspec containing the provided files Example: ```py - >>> with mock_fs(["data/train.txt", "data.test.txt"]) as fs: - ... assert fsspec.get_filesystem_class("mock").__name__ == "DummyTestFS" - ... assert type(fs).__name__ == "DummyTestFS" - ... print(fs.glob("**")) + >>> fs = mock_fs(["data/train.txt", "data.test.txt"]) + >>> assert fsspec.get_filesystem_class("mock").__name__ == "DummyTestFS" + >>> assert type(fs).__name__ == "DummyTestFS" + >>> print(fs.glob("**")) ["data", "data/train.txt", "data.test.txt"] ``` """ @@ -516,15 +514,8 @@ def mock_fs(file_paths: List[str]): class DummyTestFS(AbstractFileSystem): protocol = "mock" - _file_class = AbstractBufferedFile _fs_contents = fs_contents - def __getitem__(self, name): - for item in self._fs_contents: - if item["name"] == name: - return item - raise IndexError(f"{name} not found!") - def ls(self, path, detail=True, refresh=True, **kwargs): if kwargs.pop("strip_proto", True): path = self._strip_protocol(path) @@ -539,27 +530,7 @@ def ls(self, path, detail=True, refresh=True, **kwargs): return files return [file["name"] for file in files] - def _open( - self, - path, - mode="rb", - block_size=None, - autocommit=True, - cache_options=None, - **kwargs, - ): - return self._file_class( - self, - path, - mode, - block_size, - autocommit, - cache_options=cache_options, - **kwargs, - ) - - with patch.dict(fsspec.registry.target, {"mock": DummyTestFS}): - yield DummyTestFS() + return DummyTestFS() @pytest.mark.parametrize( @@ -627,10 +598,10 @@ def _open( def test_get_data_files_patterns(data_file_per_split): data_file_per_split = {k: v if isinstance(v, list) else [v] for k, v in data_file_per_split.items()} file_paths = [file_path for split_file_paths in data_file_per_split.values() for file_path in split_file_paths] + fs = mock_fs(file_paths) def resolver(pattern): - with mock_fs(file_paths) as fs: - return [PurePath(file_path) for file_path in fs.glob(pattern) if fs.isfile(file_path)] + return [PurePath(file_path) for file_path in fs.glob(pattern) if fs.isfile(file_path)] patterns_per_split = _get_data_files_patterns(resolver) assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys()) From 3e97fd5e7d1095610ffbd501698ea517f46a4eaa Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 18 Jul 2022 14:31:01 +0200 Subject: [PATCH 11/11] style --- tests/test_data_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_data_files.py b/tests/test_data_files.py index 5c478d50d40..0c310f588e9 100644 --- a/tests/test_data_files.py +++ b/tests/test_data_files.py @@ -5,7 +5,7 @@ import fsspec import pytest -from fsspec.spec import AbstractBufferedFile, AbstractFileSystem +from fsspec.spec import AbstractFileSystem from huggingface_hub.hf_api import DatasetInfo from datasets.data_files import (