From d963a0e60643293a73bd73dbe5f25a25a2239bb2 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Tue, 5 Jul 2022 15:18:27 +0200
Subject: [PATCH 01/11] only match separated split names

---
 src/datasets/data_files.py | 26 +++++++---
 tests/test_data_files.py   | 97 +++++++++++++++++++++++++++++++-------
 2 files changed, 101 insertions(+), 22 deletions(-)

diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index 915c603d56c..e1c153f513f 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -27,15 +27,29 @@ class Url(str):
 SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"
 
 DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
-    str(Split.TRAIN): ["**train*"],
-    str(Split.TEST): ["**test*", "**eval*"],
-    str(Split.VALIDATION): ["**dev*", "**valid*"],
+    str(Split.TRAIN): ["**[-._/]train[-._]*", "train[-._]*", "**[-._/]training[-._]*", "training[-._]*"],
+    str(Split.TEST): ["**[-._/]test[-._]*", "test[-._]*", "**[-._/]eval[-._]*", "eval[-._]*"],
+    str(Split.VALIDATION): [
+        "**[-._/]dev[-._]*",
+        "dev[-._]*",
+        "**[-._/]valid[-._]*",
+        "valid[-._]*",
+        "**[-._/]validation[-._]*",
+        "validation[-._]*",
+    ],
 }
 
 DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
-    str(Split.TRAIN): ["**train*/**"],
-    str(Split.TEST): ["**test*/**", "**eval*/**"],
-    str(Split.VALIDATION): ["**dev*/**", "**valid*/**"],
+    str(Split.TRAIN): ["train[-._/]**", "**[-._/]train[-._/]**", "training[-._/]**", "**[-._/]training[-._/]**"],
+    str(Split.TEST): ["test[-._/]**", "**[-._/]test[-._/]**", "eval[-._/]**", "**[-._/]eval[-._/]**"],
+    str(Split.VALIDATION): [
+        "dev[-._/]**",
+        "**[-._/]dev[-._/]**",
+        "valid[-._/]**",
+        "**[-._/]valid[-._/]**",
+        "validation[-._/]**",
+        "**[-._/]validation[-._/]**",
+    ],
 }
 
 DEFAULT_PATTERNS_ALL = {
diff --git a/tests/test_data_files.py b/tests/test_data_files.py
index b66fe1960fb..89b0ea70eb0 100644
--- a/tests/test_data_files.py
+++ b/tests/test_data_files.py
@@ -1,10 +1,12 @@
 import os
-from itertools import chain
+from contextlib import contextmanager
 from pathlib import Path, PurePath
+from typing import List
 from unittest.mock import patch
 
 import fsspec
 import pytest
+from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
 from huggingface_hub.hf_api import DatasetInfo
 
 from datasets.data_files import (
@@ -491,6 +493,69 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
         assert Hasher.hash(data_files1) != Hasher.hash(data_files2)
 
 
+@contextmanager
+def mock_fs(file_paths: List[str]):
+    """context manager to set up a mock:// filesystem in sfspec containing the provided files"""
+
+    dir_paths = {file_path.rsplit("/")[0] for file_path in file_paths if "/" in file_path}
+    fs_contents = [{"name": dir_path, "type": "directory"} for dir_path in dir_paths] + [
+        {"name": file_path, "type": "file", "size": 10} for file_path in file_paths
+    ]
+
+    class DummyTestFS(AbstractFileSystem):
+        protocol = "mock"
+        _file_class = AbstractBufferedFile
+        _fs_contents = fs_contents
+
+        def __getitem__(self, name):
+            for item in self._fs_contents:
+                if item["name"] == name:
+                    return item
+            raise IndexError(f"{name} not found!")
+
+        def ls(self, path, detail=True, refresh=True, **kwargs):
+            if kwargs.pop("strip_proto", True):
+                path = self._strip_protocol(path)
+
+            files = not refresh and self._ls_from_cache(path)
+            if not files:
+                files = [file for file in self._fs_contents if path == self._parent(file["name"])]
+                files.sort(key=lambda file: file["name"])
+                self.dircache[path.rstrip("/")] = files
+
+            if detail:
+                return files
+            return [file["name"] for file in files]
+
+        @classmethod
+        def get_test_paths(cls, start_with=""):
+            """Helper to return directory and file paths with no details"""
+            all = [file["name"] for file in cls._fs_contents if file["name"].startswith(start_with)]
+            return all
+
+        def _open(
+            self,
+            path,
+            mode="rb",
+            block_size=None,
+            autocommit=True,
+            cache_options=None,
+            **kwargs,
+        ):
+            return self._file_class(
+                self,
+                path,
+                mode,
+                block_size,
+                autocommit,
+                cache_options=cache_options,
+                **kwargs,
+            )
+
+    with patch.dict(fsspec.registry.target, {"mock": DummyTestFS}):
+        yield DummyTestFS()
+
+
 @pytest.mark.parametrize(
     "data_file_per_split",
     [
@@ -541,25 +606,25 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
         {"validation": "dev/dataset.txt"},
         # With other extensions
         {"train": "train.parquet", "test": "test.parquet", "validation": "valid.parquet"},
+        # With "dev" or "eval" without separators
+        {"train": "developers_list.txt"},
+        {"train": "data/seqeval_results.txt"},
     ],
 )
 def test_get_data_files_patterns(data_file_per_split):
     data_file_per_split = {k: v if isinstance(v, list) else [v] for k, v in data_file_per_split.items()}
-
-    def resolver(pattern):
-        return [PurePath(path) for path in chain(*data_file_per_split.values()) if PurePath(path).match(pattern)]
-
-    patterns_per_split = _get_data_files_patterns(resolver)
-    assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys())
-    for split, patterns in patterns_per_split.items():
-        matched = [
-            path
-            for path in chain(*data_file_per_split.values())
-            for pattern in patterns
-            if PurePath(path).match(pattern)
-        ]
-        assert len(matched) == len(data_file_per_split[split])
-        assert matched == data_file_per_split[split]
+    with mock_fs(
+        [file_path for split_file_paths in data_file_per_split.values() for file_path in split_file_paths]
+    ) as fs:
+
+        def resolver(pattern):
+            return [PurePath(file_path) for file_path in fs.glob(pattern) if fs.isfile(file_path)]
+
+        patterns_per_split = _get_data_files_patterns(resolver)
+        assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys())
+        for split, patterns in patterns_per_split.items():
+            matched = [str(file_path) for pattern in patterns for file_path in resolver(pattern)]
+            assert matched == data_file_per_split[split]
 
 
 @pytest.mark.parametrize(

From 159649a9b22dbf34b546228f199a05bb3d6cf4fd Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Tue, 5 Jul 2022 16:08:47 +0200
Subject: [PATCH 02/11] docs

---
 docs/source/repository_structure.mdx | 38 ++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx
index 871879f0323..a53e9771dde 100644
--- a/docs/source/repository_structure.mdx
+++ b/docs/source/repository_structure.mdx
@@ -22,10 +22,13 @@ my_dataset_repository/
 
 ## Splits and file names
 
-🤗 Datasets automatically infer a dataset's train, validation, and test splits from the file names. Files that contain *train* in their names are considered part of the train split. The same idea applies to the test and validation split:
+🤗 Datasets automatically infer a dataset's train, validation, and test splits from the file names.
 
-- All the files that contain *test* in their names are considered part of the test split.
-- All the files that contain *valid* in their names are considered part of the validation split.
+Files that contain *train* in their names are considered part of the train split, e.g. `train.csv`, `my_train_file.csv`, etc.
+The same idea applies to the test and validation split:
+
+- All the files that contain *test* in their names are considered part of the test split, e.g. `test.csv`, `my_test_file.csv`
+- All the files that contain *validation* in their names are considered part of the validation split, e.g. `validation.csv`, `my_validation_file.csv`
 
 Here is an example where all the files are placed into a directory named `data`:
 
@@ -35,9 +38,11 @@ my_dataset_repository/
 └── data/
     ├── train.csv
     ├── test.csv
-    └── valid.csv
+    └── validation.csv
 ```
 
+Note that if a file contains *test* but is embedded in another word (e.g. `contest.csv`), it's not counted as a test file.
+
 ## Multiple files per split
 
 If one of your splits comprises several files, 🤗 Datasets can still infer whether it is the train, validation, and test split from the file name.
@@ -58,7 +63,8 @@ Make sure all the files of your `train` set have *train* in their names (same fo
 Even if you add a prefix or suffix to `train` in the file name (like `my_train_file_00001.csv` for example),
 🤗 Datasets can still infer the appropriate split.
 
-For convenience, you can also place your data files into different directories. In this case, the split name is inferred from the directory name.
+For convenience, you can also place your data files into different directories.
+In this case, the split name is inferred from the directory name.
 
 ```
 my_dataset_repository/
@@ -80,6 +86,28 @@ Eventually, you'll also be able to structure your repository to specify differen
 
 </Tip>
 
+## Split names keywords
+
+Train/validation/test splits are sometimes called train/dev/test, or sometimes train & eval sets.
+These other names are also supported.
+In particular, these keywords are equivalent:
+
+- train, training
+- validation, valid, dev
+- test, eval
+
+Therefore this is also a valid repository:
+
+```
+my_dataset_repository/
+├── README.md
+└── data/
+    ├── training.csv
+    ├── eval.csv
+    └── valid.csv
+```
+
+
 ## Custom split names
 
 If you have other data files in addition to the traditional train, validation, and test sets, you must use a different structure.

From 78938562de214cda2cc7bccc0c6446c5bf16db8a Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Tue, 5 Jul 2022 16:15:30 +0200
Subject: [PATCH 03/11] add space separator

---
 src/datasets/data_files.py | 32 ++++++++++++++++----------------
 tests/test_data_files.py   |  5 +++++
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index e1c153f513f..121f8685c42 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -27,28 +27,28 @@ class Url(str):
 SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"
 
 DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
-    str(Split.TRAIN): ["**[-._/]train[-._]*", "train[-._]*", "**[-._/]training[-._]*", "training[-._]*"],
-    str(Split.TEST): ["**[-._/]test[-._]*", "test[-._]*", "**[-._/]eval[-._]*", "eval[-._]*"],
+    str(Split.TRAIN): ["**[-._ /]train[-._ ]*", "train[-._ ]*", "**[-._ /]training[-._ ]*", "training[-._ ]*"],
+    str(Split.TEST): ["**[-._ /]test[-._ ]*", "test[-._ ]*", "**[-._ /]eval[-._ ]*", "eval[-._ ]*"],
     str(Split.VALIDATION): [
-        "**[-._/]dev[-._]*",
-        "dev[-._]*",
-        "**[-._/]valid[-._]*",
-        "valid[-._]*",
-        "**[-._/]validation[-._]*",
-        "validation[-._]*",
+        "**[-._ /]dev[-._ ]*",
+        "dev[-._ ]*",
+        "**[-._ /]valid[-._ ]*",
+        "valid[-._ ]*",
+        "**[-._ /]validation[-._ ]*",
+        "validation[-._ ]*",
     ],
 }
 
 DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
-    str(Split.TRAIN): ["train[-._/]**", "**[-._/]train[-._/]**", "training[-._/]**", "**[-._/]training[-._/]**"],
-    str(Split.TEST): ["test[-._/]**", "**[-._/]test[-._/]**", "eval[-._/]**", "**[-._/]eval[-._/]**"],
+    str(Split.TRAIN): ["train[-._ /]**", "**[-._ /]train[-._ /]**", "training[-._ /]**", "**[-._ /]training[-._ /]**"],
+    str(Split.TEST): ["test[-._ /]**", "**[-._ /]test[-._ /]**", "eval[-._ /]**", "**[-._ /]eval[-._ /]**"],
     str(Split.VALIDATION): [
-        "dev[-._/]**",
-        "**[-._/]dev[-._/]**",
-        "valid[-._/]**",
-        "**[-._/]valid[-._/]**",
-        "validation[-._/]**",
-        "**[-._/]validation[-._/]**",
+        "dev[-._ /]**",
+        "**[-._ /]dev[-._ /]**",
+        "valid[-._ /]**",
+        "**[-._ /]valid[-._ /]**",
+        "validation[-._ /]**",
+        "**[-._ /]validation[-._ /]**",
     ],
 }
 
diff --git a/tests/test_data_files.py b/tests/test_data_files.py
index 89b0ea70eb0..e22fabffd8c 100644
--- a/tests/test_data_files.py
+++ b/tests/test_data_files.py
@@ -609,6 +609,11 @@ def _open(
         # With "dev" or "eval" without separators
         {"train": "developers_list.txt"},
         {"train": "data/seqeval_results.txt"},
+        # With supported separators
+        {"test": "my.test.file.txt"},
+        {"test": "my-test-file.txt"},
+        {"test": "my_test_file.txt"},
+        {"test": "my test file.txt"},
     ],
 )
 def test_get_data_files_patterns(data_file_per_split):

From 31bf1bb58e20ce704db07af126ecd1ab69e2fc1a Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Tue, 5 Jul 2022 17:04:39 +0200
Subject: [PATCH 04/11] fix win

---
 tests/test_data_files.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_data_files.py b/tests/test_data_files.py
index e22fabffd8c..81b91f66080 100644
--- a/tests/test_data_files.py
+++ b/tests/test_data_files.py
@@ -628,7 +628,7 @@ def resolver(pattern):
         patterns_per_split = _get_data_files_patterns(resolver)
         assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys())
         for split, patterns in patterns_per_split.items():
-            matched = [str(file_path) for pattern in patterns for file_path in resolver(pattern)]
+            matched = [file_path.as_posix() for pattern in patterns for file_path in resolver(pattern)]
             assert matched == data_file_per_split[split]
 
 

From 93f45262d6b009d7dddbd3540d45094b49b66a78 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Fri, 8 Jul 2022 15:11:52 +0200
Subject: [PATCH 05/11] add testing

---
 docs/source/repository_structure.mdx |  2 +-
 src/datasets/data_files.py           | 18 ++++++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx
index a53e9771dde..cbe50be3403 100644
--- a/docs/source/repository_structure.mdx
+++ b/docs/source/repository_structure.mdx
@@ -94,7 +94,7 @@ In particular, these keywords are equivalent:
 
 - train, training
 - validation, valid, dev
-- test, eval
+- test, testing, eval
 
 Therefore this is also a valid repository:
 
diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index 6ed9174a53c..1e0da25e2b5 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -28,7 +28,14 @@ class Url(str):
 
 DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
     str(Split.TRAIN): ["**[-._ /]train[-._ ]*", "train[-._ ]*", "**[-._ /]training[-._ ]*", "training[-._ ]*"],
-    str(Split.TEST): ["**[-._ /]test[-._ ]*", "test[-._ ]*", "**[-._ /]eval[-._ ]*", "eval[-._ ]*"],
+    str(Split.TEST): [
+        "**[-._ /]test[-._ ]*",
+        "test[-._ ]*",
+        "**[-._ /]testing[-._ ]*",
+        "testing[-._ ]*",
+        "**[-._ /]eval[-._ ]*",
+        "eval[-._ ]*",
+    ],
     str(Split.VALIDATION): [
         "**[-._ /]dev[-._ ]*",
         "dev[-._ ]*",
@@ -41,7 +48,14 @@ class Url(str):
 
 DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
     str(Split.TRAIN): ["train[-._ /]**", "**[-._ /]train[-._ /]**", "training[-._ /]**", "**[-._ /]training[-._ /]**"],
-    str(Split.TEST): ["test[-._ /]**", "**[-._ /]test[-._ /]**", "eval[-._ /]**", "**[-._ /]eval[-._ /]**"],
+    str(Split.TEST): [
+        "test[-._ /]**",
+        "**[-._ /]test[-._ /]**",
+        "testing[-._ /]**",
+        "**[-._ /]testing[-._ /]**",
+        "eval[-._ /]**",
+        "**[-._ /]eval[-._ /]**",
+    ],
     str(Split.VALIDATION): [
         "dev[-._ /]**",
         "**[-._ /]dev[-._ /]**",

From e6adffbea00264bd0873e36b572cb7014f5feee8 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Fri, 8 Jul 2022 15:34:05 +0200
Subject: [PATCH 06/11] add evaluation

---
 docs/source/repository_structure.mdx | 2 +-
 src/datasets/data_files.py           | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx
index cbe50be3403..fdf64bd9910 100644
--- a/docs/source/repository_structure.mdx
+++ b/docs/source/repository_structure.mdx
@@ -94,7 +94,7 @@ In particular, these keywords are equivalent:
 
 - train, training
 - validation, valid, dev
-- test, testing, eval
+- test, testing, eval, evaluation
 
 Therefore this is also a valid repository:
 
diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index 1e0da25e2b5..d3102fb7c68 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -35,6 +35,8 @@ class Url(str):
         "testing[-._ ]*",
         "**[-._ /]eval[-._ ]*",
         "eval[-._ ]*",
+        "**[-._ /]evaluation[-._ ]*",
+        "evaluation[-._ ]*",
     ],
     str(Split.VALIDATION): [
         "**[-._ /]dev[-._ ]*",
@@ -55,6 +57,8 @@ class Url(str):
         "**[-._ /]testing[-._ /]**",
         "eval[-._ /]**",
         "**[-._ /]eval[-._ /]**",
+        "evaluation[-._ /]**",
+        "**[-._ /]evaluation[-._ /]**",
     ],
     str(Split.VALIDATION): [
         "dev[-._ /]**",

From 9393386464439e4340c88352ef4fdbc8ef2e6ae0 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Mon, 18 Jul 2022 11:49:45 +0200
Subject: [PATCH 07/11] suggestions in doc

---
 docs/source/repository_structure.mdx | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx
index fdf64bd9910..e9e46ff5217 100644
--- a/docs/source/repository_structure.mdx
+++ b/docs/source/repository_structure.mdx
@@ -24,11 +24,11 @@ my_dataset_repository/
 
 🤗 Datasets automatically infer a dataset's train, validation, and test splits from the file names.
 
-Files that contain *train* in their names are considered part of the train split, e.g. `train.csv`, `my_train_file.csv`, etc.
-The same idea applies to the test and validation split:
+All the files that contain a split name in their names (delimited by non-word characters, see below) are considered part of that split:
 
-- All the files that contain *test* in their names are considered part of the test split, e.g. `test.csv`, `my_test_file.csv`
-- All the files that contain *validation* in their names are considered part of the validation split, e.g. `validation.csv`, `my_validation_file.csv`
+- train split: `train.csv`, `my_train_file.csv`, `train1.csv`
+- validation split: `validation.csv`, `my_validation_file.csv`, `validation1.csv`
+- test split: `test.csv`, `my_test_file.csv`, `test1.csv`
 
 Here is an example where all the files are placed into a directory named `data`:
 
@@ -41,7 +41,9 @@ my_dataset_repository/
     └── validation.csv
 ```
 
-Note that if a file contains *test* but is embedded in another word (e.g. `contest.csv`), it's not counted as a test file.
+Note that if a file contains *test* but is embedded in another word (e.g. `testfile.csv`), it's not counted as a test file.
+It must be delimited by non-word characters, e.g. `test_file.csv`.
+Supported delimiters are underscores, dashes, spaces, dots and numbers.
 
 ## Multiple files per split
 
@@ -88,7 +90,7 @@ Eventually, you'll also be able to structure your repository to specify differen
 
 ## Split names keywords
 
-Train/validation/test splits are sometimes called train/dev/test, or sometimes train & eval sets.
+Validation splits are sometimes called "dev", and test splits are called "eval".
 These other names are also supported.
 In particular, these keywords are equivalent:
 

From 8efbb7866ba6fbbd722e72745ecc92e064b18721 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Mon, 18 Jul 2022 12:09:47 +0200
Subject: [PATCH 08/11] use list comprehension + support numbers

---
 src/datasets/data_files.py | 59 +++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 30 deletions(-)

diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index d3102fb7c68..50dddf1f0fa 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -26,47 +26,46 @@ class Url(str):
 
 SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"
 
+TRAIN_KEYWORDS = ["train", "training"]
+TEST_KEYWORDS = ["test", "testing", "eval", "evaluation"]
+VALIDATION_KEYWORDS = ["validation", "valid", "dev"]
+NON_WORDS_CHARS = "-._ 0-9"
+KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
+KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
+
 DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
-    str(Split.TRAIN): ["**[-._ /]train[-._ ]*", "train[-._ ]*", "**[-._ /]training[-._ ]*", "training[-._ ]*"],
+    str(Split.TRAIN): [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in TRAIN_KEYWORDS
+        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
+    ],
     str(Split.TEST): [
-        "**[-._ /]test[-._ ]*",
-        "test[-._ ]*",
-        "**[-._ /]testing[-._ ]*",
-        "testing[-._ ]*",
-        "**[-._ /]eval[-._ ]*",
-        "eval[-._ ]*",
-        "**[-._ /]evaluation[-._ ]*",
-        "evaluation[-._ ]*",
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in TEST_KEYWORDS
+        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
     ],
     str(Split.VALIDATION): [
-        "**[-._ /]dev[-._ ]*",
-        "dev[-._ ]*",
-        "**[-._ /]valid[-._ ]*",
-        "valid[-._ ]*",
-        "**[-._ /]validation[-._ ]*",
-        "validation[-._ ]*",
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in VALIDATION_KEYWORDS
+        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
     ],
 }
 
 DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
-    str(Split.TRAIN): ["train[-._ /]**", "**[-._ /]train[-._ /]**", "training[-._ /]**", "**[-._ /]training[-._ /]**"],
+    str(Split.TRAIN): [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in TRAIN_KEYWORDS
+        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
+    ],
     str(Split.TEST): [
-        "test[-._ /]**",
-        "**[-._ /]test[-._ /]**",
-        "testing[-._ /]**",
-        "**[-._ /]testing[-._ /]**",
-        "eval[-._ /]**",
-        "**[-._ /]eval[-._ /]**",
-        "evaluation[-._ /]**",
-        "**[-._ /]evaluation[-._ /]**",
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in TEST_KEYWORDS
+        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
     ],
     str(Split.VALIDATION): [
-        "dev[-._ /]**",
-        "**[-._ /]dev[-._ /]**",
-        "valid[-._ /]**",
-        "**[-._ /]valid[-._ /]**",
-        "validation[-._ /]**",
-        "**[-._ /]validation[-._ /]**",
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in VALIDATION_KEYWORDS
+        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
     ],
 }
 

From ee15edecd4e210015492dff16c664966758664c8 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Mon, 18 Jul 2022 12:26:23 +0200
Subject: [PATCH 09/11] update tests

---
 tests/test_data_files.py | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/tests/test_data_files.py b/tests/test_data_files.py
index 81b91f66080..7f1579e7a87 100644
--- a/tests/test_data_files.py
+++ b/tests/test_data_files.py
@@ -495,7 +495,19 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
 
 @contextmanager
 def mock_fs(file_paths: List[str]):
-    """context manager to set up a mock:// filesystem in sfspec containing the provided files"""
+    """
+    Context manager to set up a mock:// filesystem in fsspec containing the provided files
+
+    Example:
+
+    ```py
+    >>> with mock_fs(["data/train.txt", "data.test.txt"]) as fs:
+    ...     assert fsspec.get_filesystem_class("mock").__name__ == "DummyTestFS"
+    ...     assert type(fs).__name__ == "DummyTestFS"
+    ...     print(fs.glob("**"))
+    ["data", "data/train.txt", "data.test.txt"]
+    ```
+    """
 
     dir_paths = {file_path.rsplit("/")[0] for file_path in file_paths if "/" in file_path}
     fs_contents = [{"name": dir_path, "type": "directory"} for dir_path in dir_paths] + [
@@ -527,12 +539,6 @@ def ls(self, path, detail=True, refresh=True, **kwargs):
                 return files
             return [file["name"] for file in files]
 
-        @classmethod
-        def get_test_paths(cls, start_with=""):
-            """Helper to return directory and file paths with no details"""
-            all = [file["name"] for file in cls._fs_contents if file["name"].startswith(start_with)]
-            return all
-
         def _open(
             self,
             path,
@@ -609,27 +615,28 @@ def _open(
         # With "dev" or "eval" without separators
         {"train": "developers_list.txt"},
         {"train": "data/seqeval_results.txt"},
+        {"train": "contest.txt"},
         # With supported separators
         {"test": "my.test.file.txt"},
         {"test": "my-test-file.txt"},
         {"test": "my_test_file.txt"},
         {"test": "my test file.txt"},
+        {"test": "test00001.txt"},
     ],
 )
 def test_get_data_files_patterns(data_file_per_split):
     data_file_per_split = {k: v if isinstance(v, list) else [v] for k, v in data_file_per_split.items()}
-    with mock_fs(
-        [file_path for split_file_paths in data_file_per_split.values() for file_path in split_file_paths]
-    ) as fs:
+    file_paths = [file_path for split_file_paths in data_file_per_split.values() for file_path in split_file_paths]
 
-        def resolver(pattern):
+    def resolver(pattern):
+        with mock_fs(file_paths) as fs:
             return [PurePath(file_path) for file_path in fs.glob(pattern) if fs.isfile(file_path)]
 
-        patterns_per_split = _get_data_files_patterns(resolver)
-        assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys())
-        for split, patterns in patterns_per_split.items():
-            matched = [file_path.as_posix() for pattern in patterns for file_path in resolver(pattern)]
-            assert matched == data_file_per_split[split]
+    patterns_per_split = _get_data_files_patterns(resolver)
+    assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys())
+    for split, patterns in patterns_per_split.items():
+        matched = [file_path.as_posix() for pattern in patterns for file_path in resolver(pattern)]
+        assert matched == data_file_per_split[split]
 
 
 @pytest.mark.parametrize(

From d372b28733cbc1df18b9518272563ec3de3842a4 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Mon, 18 Jul 2022 14:25:52 +0200
Subject: [PATCH 10/11] remove unnecessary patching and context manager

---
 tests/test_data_files.py | 45 +++++++---------------------------------
 1 file changed, 8 insertions(+), 37 deletions(-)

diff --git a/tests/test_data_files.py b/tests/test_data_files.py
index 7f1579e7a87..5c478d50d40 100644
--- a/tests/test_data_files.py
+++ b/tests/test_data_files.py
@@ -1,5 +1,4 @@
 import os
-from contextlib import contextmanager
 from pathlib import Path, PurePath
 from typing import List
 from unittest.mock import patch
@@ -493,18 +492,17 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
         assert Hasher.hash(data_files1) != Hasher.hash(data_files2)
 
 
-@contextmanager
 def mock_fs(file_paths: List[str]):
     """
-    Context manager to set up a mock:// filesystem in fsspec containing the provided files
+    Set up a mock filesystem for fsspec containing the provided files
 
     Example:
 
     ```py
-    >>> with mock_fs(["data/train.txt", "data.test.txt"]) as fs:
-    ...     assert fsspec.get_filesystem_class("mock").__name__ == "DummyTestFS"
-    ...     assert type(fs).__name__ == "DummyTestFS"
-    ...     print(fs.glob("**"))
+    >>> fs = mock_fs(["data/train.txt", "data.test.txt"])
+    >>> assert fsspec.get_filesystem_class("mock").__name__ == "DummyTestFS"
+    >>> assert type(fs).__name__ == "DummyTestFS"
+    >>> print(fs.glob("**"))
     ["data", "data/train.txt", "data.test.txt"]
     ```
     """
@@ -516,15 +514,8 @@ def mock_fs(file_paths: List[str]):
 
     class DummyTestFS(AbstractFileSystem):
         protocol = "mock"
-        _file_class = AbstractBufferedFile
         _fs_contents = fs_contents
 
-        def __getitem__(self, name):
-            for item in self._fs_contents:
-                if item["name"] == name:
-                    return item
-            raise IndexError(f"{name} not found!")
-
         def ls(self, path, detail=True, refresh=True, **kwargs):
             if kwargs.pop("strip_proto", True):
                 path = self._strip_protocol(path)
@@ -539,27 +530,7 @@ def ls(self, path, detail=True, refresh=True, **kwargs):
                 return files
             return [file["name"] for file in files]
 
-        def _open(
-            self,
-            path,
-            mode="rb",
-            block_size=None,
-            autocommit=True,
-            cache_options=None,
-            **kwargs,
-        ):
-            return self._file_class(
-                self,
-                path,
-                mode,
-                block_size,
-                autocommit,
-                cache_options=cache_options,
-                **kwargs,
-            )
-
-    with patch.dict(fsspec.registry.target, {"mock": DummyTestFS}):
-        yield DummyTestFS()
+    return DummyTestFS()
 
 
 @pytest.mark.parametrize(
@@ -627,10 +598,10 @@ def _open(
 def test_get_data_files_patterns(data_file_per_split):
     data_file_per_split = {k: v if isinstance(v, list) else [v] for k, v in data_file_per_split.items()}
     file_paths = [file_path for split_file_paths in data_file_per_split.values() for file_path in split_file_paths]
+    fs = mock_fs(file_paths)
 
     def resolver(pattern):
-        with mock_fs(file_paths) as fs:
-            return [PurePath(file_path) for file_path in fs.glob(pattern) if fs.isfile(file_path)]
+        return [PurePath(file_path) for file_path in fs.glob(pattern) if fs.isfile(file_path)]
 
     patterns_per_split = _get_data_files_patterns(resolver)
     assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys())

From 3e97fd5e7d1095610ffbd501698ea517f46a4eaa Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Mon, 18 Jul 2022 14:31:01 +0200
Subject: [PATCH 11/11] style

---
 tests/test_data_files.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_data_files.py b/tests/test_data_files.py
index 5c478d50d40..0c310f588e9 100644
--- a/tests/test_data_files.py
+++ b/tests/test_data_files.py
@@ -5,7 +5,7 @@
 
 import fsspec
 import pytest
-from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
+from fsspec.spec import AbstractFileSystem
 from huggingface_hub.hf_api import DatasetInfo
 
 from datasets.data_files import (