Commit cd674a3

[data_files] Only match separated split names (#4633)

* only match separated split names
* docs
* add space separator
* fix win
* add testing
* add evaluation
* suggestions in doc
* use list comprehension + support numbers
* update tests
* remove unnecessary patching and context manager
* style

1 parent 4fb3ed0 commit cd674a3

File tree

3 files changed: +129 −20 lines changed

docs/source/repository_structure.mdx

Lines changed: 35 additions & 5 deletions
````diff
@@ -22,10 +22,13 @@ my_dataset_repository/
 
 ## Splits and file names
 
-🤗 Datasets automatically infer a dataset's train, validation, and test splits from the file names. Files that contain *train* in their names are considered part of the train split. The same idea applies to the test and validation split:
+🤗 Datasets automatically infer a dataset's train, validation, and test splits from the file names.
 
-- All the files that contain *test* in their names are considered part of the test split.
-- All the files that contain *valid* in their names are considered part of the validation split.
+All the files that contain a split name in their names (delimited by non-word characters, see below) are considered part of that split:
+
+- train split: `train.csv`, `my_train_file.csv`, `train1.csv`
+- validation split: `validation.csv`, `my_validation_file.csv`, `validation1.csv`
+- test split: `test.csv`, `my_test_file.csv`, `test1.csv`
 
 Here is an example where all the files are placed into a directory named `data`:
 
@@ -35,9 +38,13 @@ my_dataset_repository/
 └── data/
     ├── train.csv
     ├── test.csv
-    └── valid.csv
+    └── validation.csv
 ```
 
+Note that if a file contains *test* but is embedded in another word (e.g. `testfile.csv`), it's not counted as a test file.
+It must be delimited by non-word characters, e.g. `test_file.csv`.
+Supported delimiters are underscores, dashes, spaces, dots and numbers.
+
 ## Multiple files per split
 
 If one of your splits comprises several files, 🤗 Datasets can still infer whether it is the train, validation, and test split from the file name.
@@ -58,7 +65,8 @@ Make sure all the files of your `train` set have *train* in their names (same fo
 Even if you add a prefix or suffix to `train` in the file name (like `my_train_file_00001.csv` for example),
 🤗 Datasets can still infer the appropriate split.
 
-For convenience, you can also place your data files into different directories. In this case, the split name is inferred from the directory name.
+For convenience, you can also place your data files into different directories.
+In this case, the split name is inferred from the directory name.
 
 ```
 my_dataset_repository/
@@ -80,6 +88,28 @@ Eventually, you'll also be able to structure your repository to specify differen
 
 </Tip>
 
+## Split names keywords
+
+Validation splits are sometimes called "dev", and test splits are called "eval".
+These other names are also supported.
+In particular, these keywords are equivalent:
+
+- train, training
+- validation, valid, dev
+- test, testing, eval, evaluation
+
+Therefore this is also a valid repository:
+
+```
+my_dataset_repository/
+├── README.md
+└── data/
+    ├── training.csv
+    ├── eval.csv
+    └── valid.csv
+```
+
 ## Custom split names
 
 If you have other data files in addition to the traditional train, validation, and test sets, you must use a different structure.
````
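The delimiter rule documented above can be illustrated with Python's `fnmatch`, whose `[...]` character classes behave like the glob patterns this commit generates. This is a rough sketch, not the library's actual resolver (which globs over a filesystem), using the constants introduced in `src/datasets/data_files.py` below:

```python
from fnmatch import fnmatchcase

# Constants from the data_files.py diff in this commit
NON_WORDS_CHARS = "-._ 0-9"
KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]

def matches_split_keyword(filename: str, keyword: str) -> bool:
    """Return True if `filename` contains `keyword` delimited by non-word characters."""
    patterns = [p.format(keyword=keyword, sep=NON_WORDS_CHARS) for p in KEYWORDS_IN_FILENAME_BASE_PATTERNS]
    return any(fnmatchcase(filename, pattern) for pattern in patterns)

print(matches_split_keyword("my_test_file.csv", "test"))  # True: "_" is a supported delimiter
print(matches_split_keyword("test1.csv", "test"))         # True: digits count as delimiters
print(matches_split_keyword("testfile.csv", "test"))      # False: "test" runs into the next word
```

The helper name `matches_split_keyword` is hypothetical; the point is that the first base pattern requires a delimiter (or `/`) on both sides of the keyword, while the second only applies when the filename starts with the keyword.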

src/datasets/data_files.py

Lines changed: 37 additions & 6 deletions
````diff
@@ -26,16 +26,47 @@ class Url(str):
 
 SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"
 
+TRAIN_KEYWORDS = ["train", "training"]
+TEST_KEYWORDS = ["test", "testing", "eval", "evaluation"]
+VALIDATION_KEYWORDS = ["validation", "valid", "dev"]
+NON_WORDS_CHARS = "-._ 0-9"
+KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
+KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
+
 DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
-    str(Split.TRAIN): ["**train*"],
-    str(Split.TEST): ["**test*", "**eval*"],
-    str(Split.VALIDATION): ["**dev*", "**valid*"],
+    str(Split.TRAIN): [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in TRAIN_KEYWORDS
+        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
+    ],
+    str(Split.TEST): [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in TEST_KEYWORDS
+        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
+    ],
+    str(Split.VALIDATION): [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in VALIDATION_KEYWORDS
+        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
+    ],
 }
 
 DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
-    str(Split.TRAIN): ["**train*/**"],
-    str(Split.TEST): ["**test*/**", "**eval*/**"],
-    str(Split.VALIDATION): ["**dev*/**", "**valid*/**"],
+    str(Split.TRAIN): [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in TRAIN_KEYWORDS
+        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
+    ],
+    str(Split.TEST): [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in TEST_KEYWORDS
+        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
+    ],
+    str(Split.VALIDATION): [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in VALIDATION_KEYWORDS
+        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
+    ],
 }
 
 DEFAULT_PATTERNS_ALL = {
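To see what the new list comprehensions actually produce, the expansion for the train split can be reproduced standalone (constants copied from the diff above; the substituted `{sep}` is the literal character class `-._ 0-9`):

```python
# Expand the filename base patterns for the train keywords, as in the diff above
TRAIN_KEYWORDS = ["train", "training"]
NON_WORDS_CHARS = "-._ 0-9"
KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]

train_patterns = [
    pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
    for keyword in TRAIN_KEYWORDS
    for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
]
print(train_patterns)
# ['**[-._ 0-9/]train[-._ 0-9]*', 'train[-._ 0-9]*',
#  '**[-._ 0-9/]training[-._ 0-9]*', 'training[-._ 0-9]*']
```

Each keyword yields two glob patterns: one matching the keyword anywhere after a delimiter or `/`, and one matching a filename that starts with the keyword, both requiring a delimiter right after it.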

tests/test_data_files.py

Lines changed: 57 additions & 9 deletions
````diff
@@ -1,10 +1,11 @@
 import os
-from itertools import chain
 from pathlib import Path, PurePath
+from typing import List
 from unittest.mock import patch
 
 import fsspec
 import pytest
+from fsspec.spec import AbstractFileSystem
 from huggingface_hub.hf_api import DatasetInfo
 
 from datasets.data_files import (
@@ -491,6 +492,47 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
     assert Hasher.hash(data_files1) != Hasher.hash(data_files2)
 
 
+def mock_fs(file_paths: List[str]):
+    """
+    Set up a mock filesystem for fsspec containing the provided files
+
+    Example:
+
+    ```py
+    >>> fs = mock_fs(["data/train.txt", "data.test.txt"])
+    >>> assert fsspec.get_filesystem_class("mock").__name__ == "DummyTestFS"
+    >>> assert type(fs).__name__ == "DummyTestFS"
+    >>> print(fs.glob("**"))
+    ["data", "data/train.txt", "data.test.txt"]
+    ```
+    """
+
+    dir_paths = {file_path.rsplit("/")[0] for file_path in file_paths if "/" in file_path}
+    fs_contents = [{"name": dir_path, "type": "directory"} for dir_path in dir_paths] + [
+        {"name": file_path, "type": "file", "size": 10} for file_path in file_paths
+    ]
+
+    class DummyTestFS(AbstractFileSystem):
+        protocol = "mock"
+        _fs_contents = fs_contents
+
+        def ls(self, path, detail=True, refresh=True, **kwargs):
+            if kwargs.pop("strip_proto", True):
+                path = self._strip_protocol(path)
+
+            files = not refresh and self._ls_from_cache(path)
+            if not files:
+                files = [file for file in self._fs_contents if path == self._parent(file["name"])]
+                files.sort(key=lambda file: file["name"])
+                self.dircache[path.rstrip("/")] = files
+
+            if detail:
+                return files
+            return [file["name"] for file in files]
+
+    return DummyTestFS()
+
+
 @pytest.mark.parametrize(
     "data_file_per_split",
     [
@@ -541,24 +583,30 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
         {"validation": "dev/dataset.txt"},
         # With other extensions
         {"train": "train.parquet", "test": "test.parquet", "validation": "valid.parquet"},
+        # With "dev" or "eval" without separators
+        {"train": "developers_list.txt"},
+        {"train": "data/seqeval_results.txt"},
+        {"train": "contest.txt"},
+        # With supported separators
+        {"test": "my.test.file.txt"},
+        {"test": "my-test-file.txt"},
+        {"test": "my_test_file.txt"},
+        {"test": "my test file.txt"},
+        {"test": "test00001.txt"},
     ],
 )
 def test_get_data_files_patterns(data_file_per_split):
     data_file_per_split = {k: v if isinstance(v, list) else [v] for k, v in data_file_per_split.items()}
+    file_paths = [file_path for split_file_paths in data_file_per_split.values() for file_path in split_file_paths]
+    fs = mock_fs(file_paths)
 
     def resolver(pattern):
-        return [PurePath(path) for path in chain(*data_file_per_split.values()) if PurePath(path).match(pattern)]
+        return [PurePath(file_path) for file_path in fs.glob(pattern) if fs.isfile(file_path)]
 
     patterns_per_split = _get_data_files_patterns(resolver)
     assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys())
     for split, patterns in patterns_per_split.items():
-        matched = [
-            path
-            for path in chain(*data_file_per_split.values())
-            for pattern in patterns
-            if PurePath(path).match(pattern)
-        ]
-        assert len(matched) == len(data_file_per_split[split])
+        matched = [file_path.as_posix() for pattern in patterns for file_path in resolver(pattern)]
         assert matched == data_file_per_split[split]
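The rewritten test resolves patterns against a mock fsspec filesystem so that glob character classes are honored (plain `PurePath.match` does not expand `**` the same way). A stdlib-only variant of the same idea, hypothetical and not part of the test suite, creates real files in a temporary directory and globs against them with the two "test" filename patterns:

```python
import tempfile
from pathlib import Path

def resolve_test_files(file_names):
    """Create the given files in a temp dir; return names matching the 'test' split patterns."""
    # Filename patterns for the "test" keyword, mirroring the ones generated in data_files.py
    patterns = ["test[-._ 0-9]*", "*[-._ 0-9]test[-._ 0-9]*"]
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        for name in file_names:
            (root / name).touch()
        return sorted({p.name for pattern in patterns for p in root.glob(pattern)})

print(resolve_test_files(["my_test_file.txt", "contest.txt", "test00001.txt"]))
# ['my_test_file.txt', 'test00001.txt']
```

As in the new parametrized cases, `contest.txt` is excluded because "test" is not preceded by a delimiter, while digits after the keyword (`test00001.txt`) still count as separators.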