Skip to content
40 changes: 35 additions & 5 deletions docs/source/repository_structure.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,13 @@ my_dataset_repository/

## Splits and file names

🤗 Datasets automatically infer a dataset's train, validation, and test splits from the file names. Files that contain *train* in their names are considered part of the train split. The same idea applies to the test and validation split:
🤗 Datasets automatically infers a dataset's train, validation, and test splits from the file names.

- All the files that contain *test* in their names are considered part of the test split.
- All the files that contain *valid* in their names are considered part of the validation split.
All the files that contain a split name in their names (delimited by non-word characters, see below) are considered part of that split:

- train split: `train.csv`, `my_train_file.csv`, `train1.csv`
- validation split: `validation.csv`, `my_validation_file.csv`, `validation1.csv`
- test split: `test.csv`, `my_test_file.csv`, `test1.csv`

Here is an example where all the files are placed into a directory named `data`:

Expand All @@ -35,9 +38,13 @@ my_dataset_repository/
└── data/
├── train.csv
├── test.csv
└── valid.csv
└── validation.csv
```

Note that if a file name contains *test* only as part of a longer word (e.g. `testfile.csv`), the file is not counted as a test file.
The keyword must be separated from the rest of the name by a delimiter, e.g. `test_file.csv`.
Supported delimiters are underscores, dashes, spaces, dots and numbers.

## Multiple files per split

If one of your splits comprises several files, 🤗 Datasets can still infer whether it is the train, validation, or test split from the file name.
Expand All @@ -58,7 +65,8 @@ Make sure all the files of your `train` set have *train* in their names (same fo
Even if you add a prefix or suffix to `train` in the file name (like `my_train_file_00001.csv` for example),
🤗 Datasets can still infer the appropriate split.

For convenience, you can also place your data files into different directories. In this case, the split name is inferred from the directory name.
For convenience, you can also place your data files into different directories.
In this case, the split name is inferred from the directory name.

```
my_dataset_repository/
Expand All @@ -80,6 +88,28 @@ Eventually, you'll also be able to structure your repository to specify differen

</Tip>

## Split names keywords

Validation splits are sometimes called "dev", and test splits are sometimes called "eval".
These other names are also supported.
In particular, these keywords are equivalent:

- train, training
- validation, valid, dev
- test, testing, eval, evaluation

Therefore this is also a valid repository:

```
my_dataset_repository/
├── README.md
└── data/
├── training.csv
├── eval.csv
└── valid.csv
```


## Custom split names

If you have other data files in addition to the traditional train, validation, and test sets, you must use a different structure.
Expand Down
43 changes: 37 additions & 6 deletions src/datasets/data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,47 @@ class Url(str):

SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"

# Keywords that select each default split when they appear in a file or directory name.
TRAIN_KEYWORDS = ["train", "training"]
TEST_KEYWORDS = ["test", "testing", "eval", "evaluation"]
VALIDATION_KEYWORDS = ["validation", "valid", "dev"]
# Characters accepted next to a keyword. The string is substituted for {sep} inside a glob
# character class ("[...]") in the templates below, so "0-9" acts as a digit range there.
NON_WORDS_CHARS = "-._ 0-9"
# Glob templates for a keyword in a file name: the keyword must be followed by a separator and
# either start the path or be preceded by a separator or "/".
KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
# Glob templates for a keyword in a directory name: the keyword must be followed by a separator
# or "/" and either start the path or be preceded by a separator or "/".
KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
Comment on lines +33 to +34
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great! Indeed, much clearer this way! Thanks.


# Glob patterns, per default split, that match files whose name contains one of the split's
# keywords delimited by separators. Each split gets every keyword expanded through every
# file-name template.
DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
    str(Split.TRAIN): [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in TRAIN_KEYWORDS
        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
    ],
    str(Split.TEST): [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in TEST_KEYWORDS
        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
    ],
    str(Split.VALIDATION): [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in VALIDATION_KEYWORDS
        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
    ],
}

# Glob patterns, per default split, that match files located under a directory whose name
# contains one of the split's keywords delimited by separators. Each split gets every keyword
# expanded through every directory-name template.
DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
    str(Split.TRAIN): [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in TRAIN_KEYWORDS
        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
    ],
    str(Split.TEST): [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in TEST_KEYWORDS
        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
    ],
    str(Split.VALIDATION): [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in VALIDATION_KEYWORDS
        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
    ],
}

DEFAULT_PATTERNS_ALL = {
Expand Down
66 changes: 57 additions & 9 deletions tests/test_data_files.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import os
from itertools import chain
from pathlib import Path, PurePath
from typing import List
from unittest.mock import patch

import fsspec
import pytest
from fsspec.spec import AbstractFileSystem
from huggingface_hub.hf_api import DatasetInfo

from datasets.data_files import (
Expand Down Expand Up @@ -491,6 +492,47 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
assert Hasher.hash(data_files1) != Hasher.hash(data_files2)


def mock_fs(file_paths: List[str]):
    """
    Set up a mock filesystem for fsspec containing the provided files

    Every ancestor directory of each file is registered as a directory entry, so
    files nested more than one level deep (e.g. ``a/b/c.txt``) stay reachable
    when the filesystem is walked one level at a time through ``ls``.

    Args:
        file_paths: "/"-separated paths of the files the filesystem should contain.

    Returns:
        An instance of a freshly defined ``DummyTestFS`` class (protocol ``"mock"``).

    Example:

    ```py
    >>> fs = mock_fs(["data/train.txt", "data.test.txt"])
    >>> assert fsspec.get_filesystem_class("mock").__name__ == "DummyTestFS"
    >>> assert type(fs).__name__ == "DummyTestFS"
    >>> print(fs.glob("**"))
    ["data", "data/train.txt", "data.test.txt"]
    ```
    """

    # Collect all ancestors ("a", "a/b", ...), not just the first path component.
    dir_paths = {
        "/".join(parts[:depth])
        for parts in (file_path.split("/") for file_path in file_paths)
        for depth in range(1, len(parts))
    }
    fs_contents = [{"name": dir_path, "type": "directory"} for dir_path in dir_paths] + [
        {"name": file_path, "type": "file", "size": 10} for file_path in file_paths
    ]

    class DummyTestFS(AbstractFileSystem):
        protocol = "mock"
        _fs_contents = fs_contents

        def ls(self, path, detail=True, refresh=True, **kwargs):
            if kwargs.pop("strip_proto", True):
                path = self._strip_protocol(path)

            # Use the cached listing only when the caller did not ask for a refresh.
            files = not refresh and self._ls_from_cache(path)
            if not files:
                # Direct children are the entries whose parent is exactly `path`.
                files = [file for file in self._fs_contents if path == self._parent(file["name"])]
                files.sort(key=lambda file: file["name"])
                self.dircache[path.rstrip("/")] = files

            if detail:
                return files
            return [file["name"] for file in files]

    return DummyTestFS()


@pytest.mark.parametrize(
"data_file_per_split",
[
Expand Down Expand Up @@ -541,24 +583,30 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
{"validation": "dev/dataset.txt"},
# With other extensions
{"train": "train.parquet", "test": "test.parquet", "validation": "valid.parquet"},
# With "dev" or "eval" without separators
{"train": "developers_list.txt"},
{"train": "data/seqeval_results.txt"},
Comment on lines +587 to +588
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe also adding a test for "test": "contest.txt"?

{"train": "contest.txt"},
# With supported separators
{"test": "my.test.file.txt"},
{"test": "my-test-file.txt"},
{"test": "my_test_file.txt"},
{"test": "my test file.txt"},
{"test": "test00001.txt"},
],
)
def test_get_data_files_patterns(data_file_per_split):
data_file_per_split = {k: v if isinstance(v, list) else [v] for k, v in data_file_per_split.items()}
file_paths = [file_path for split_file_paths in data_file_per_split.values() for file_path in split_file_paths]
fs = mock_fs(file_paths)

def resolver(pattern):
return [PurePath(path) for path in chain(*data_file_per_split.values()) if PurePath(path).match(pattern)]
return [PurePath(file_path) for file_path in fs.glob(pattern) if fs.isfile(file_path)]

patterns_per_split = _get_data_files_patterns(resolver)
assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys())
for split, patterns in patterns_per_split.items():
matched = [
path
for path in chain(*data_file_per_split.values())
for pattern in patterns
if PurePath(path).match(pattern)
]
assert len(matched) == len(data_file_per_split[split])
matched = [file_path.as_posix() for pattern in patterns for file_path in resolver(pattern)]
assert matched == data_file_per_split[split]


Expand Down