Skip to content
40 changes: 35 additions & 5 deletions docs/source/repository_structure.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,13 @@ my_dataset_repository/

## Splits and file names

🤗 Datasets automatically infer a dataset's train, validation, and test splits from the file names. Files that contain *train* in their names are considered part of the train split. The same idea applies to the test and validation split:
🤗 Datasets automatically infers a dataset's train, validation, and test splits from the file names.

- All the files that contain *test* in their names are considered part of the test split.
- All the files that contain *valid* in their names are considered part of the validation split.
All the files that contain a split name in their names (delimited by non-word characters, see below) are considered part of that split:

- train split: `train.csv`, `my_train_file.csv`, `train1.csv`
- validation split: `validation.csv`, `my_validation_file.csv`, `validation1.csv`
- test split: `test.csv`, `my_test_file.csv`, `test1.csv`

Here is an example where all the files are placed into a directory named `data`:

Expand All @@ -35,9 +38,13 @@ my_dataset_repository/
└── data/
├── train.csv
├── test.csv
└── valid.csv
└── validation.csv
```

Note that if a file name contains *test* only as part of a longer word (e.g. `testfile.csv`), it is not counted as a test file.
The keyword must be delimited by non-word characters, e.g. `test_file.csv`.
Supported delimiters are underscores, dashes, spaces, dots and numbers.

## Multiple files per split

If one of your splits comprises several files, 🤗 Datasets can still infer whether it is the train, validation, or test split from the file name.
Expand All @@ -58,7 +65,8 @@ Make sure all the files of your `train` set have *train* in their names (same fo
Even if you add a prefix or suffix to `train` in the file name (like `my_train_file_00001.csv` for example),
🤗 Datasets can still infer the appropriate split.

For convenience, you can also place your data files into different directories. In this case, the split name is inferred from the directory name.
For convenience, you can also place your data files into different directories.
In this case, the split name is inferred from the directory name.

```
my_dataset_repository/
Expand All @@ -80,6 +88,28 @@ Eventually, you'll also be able to structure your repository to specify differen

</Tip>

## Split names keywords

Validation splits are sometimes called "dev", and test splits are sometimes called "eval".
These other names are also supported.
In particular, these keywords are equivalent:

- train, training
- validation, valid, dev
- test, testing, eval, evaluation

Therefore this is also a valid repository:

```
my_dataset_repository/
├── README.md
└── data/
├── training.csv
├── eval.csv
└── valid.csv
```


## Custom split names

If you have other data files in addition to the traditional train, validation, and test sets, you must use a different structure.
Expand Down
43 changes: 37 additions & 6 deletions src/datasets/data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,47 @@ class Url(str):

SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"

# Characters allowed to delimit a split keyword. The string is spliced verbatim
# into a glob character class ("[...]") by the templates below, which is why it
# is written as the inside of such a class ("0-9" is a digit range there).
NON_WORDS_CHARS = "-._ 0-9"

# Spellings that are treated as naming the same split.
TRAIN_KEYWORDS = ["train", "training"]
VALIDATION_KEYWORDS = ["validation", "valid", "dev"]
TEST_KEYWORDS = ["test", "testing", "eval", "evaluation"]

# Glob templates, filled in with str.format(keyword=..., sep=...).
# File-name templates: the keyword is followed by a separator, and is either at
# the very start of the path or preceded by a separator or "/".
KEYWORDS_IN_FILENAME_BASE_PATTERNS = [
    "**[{sep}/]{keyword}[{sep}]*",
    "{keyword}[{sep}]*",
]
# Directory-name templates: same idea, but the keyword may also be followed
# directly by "/", and anything may come after it.
KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
    "{keyword}[{sep}/]**",
    "**[{sep}/]{keyword}[{sep}/]**",
]
Comment on lines +33 to +34
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great! Indeed, much clearer this way! Thanks.


# Glob patterns, per split, used to spot a split keyword inside a file name.
# Each comprehension expands every keyword of the split through every template
# in KEYWORDS_IN_FILENAME_BASE_PATTERNS, using NON_WORDS_CHARS as the separator
# class.
# NOTE(review): the three single-line entries ("**train*", ...) appear to be the
# pre-change values carried over from the diff view; as written, the later
# entries with the same keys take precedence over them. Confirm which set is
# meant to remain.
DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
str(Split.TRAIN): ["**train*"],
str(Split.TEST): ["**test*", "**eval*"],
str(Split.VALIDATION): ["**dev*", "**valid*"],
str(Split.TRAIN): [
pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
for keyword in TRAIN_KEYWORDS
for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
],
str(Split.TEST): [
pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
for keyword in TEST_KEYWORDS
for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
],
str(Split.VALIDATION): [
pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
for keyword in VALIDATION_KEYWORDS
for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
],
}

# Glob patterns, per split, used to spot a split keyword inside a directory name.
# Each comprehension expands every keyword of the split through every template
# in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS, using NON_WORDS_CHARS as the separator
# class.
# NOTE(review): the three single-line entries ("**train*/**", ...) appear to be
# the pre-change values carried over from the diff view; as written, the later
# entries with the same keys take precedence over them. Confirm which set is
# meant to remain.
DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
str(Split.TRAIN): ["**train*/**"],
str(Split.TEST): ["**test*/**", "**eval*/**"],
str(Split.VALIDATION): ["**dev*/**", "**valid*/**"],
str(Split.TRAIN): [
pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
for keyword in TRAIN_KEYWORDS
for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
],
str(Split.TEST): [
pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
for keyword in TEST_KEYWORDS
for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
],
str(Split.VALIDATION): [
pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
for keyword in VALIDATION_KEYWORDS
for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
],
}

DEFAULT_PATTERNS_ALL = {
Expand Down
95 changes: 86 additions & 9 deletions tests/test_data_files.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import os
from itertools import chain
from contextlib import contextmanager
from pathlib import Path, PurePath
from typing import List
from unittest.mock import patch

import fsspec
import pytest
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
from huggingface_hub.hf_api import DatasetInfo

from datasets.data_files import (
Expand Down Expand Up @@ -491,6 +493,75 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
assert Hasher.hash(data_files1) != Hasher.hash(data_files2)


@contextmanager
def mock_fs(file_paths: List[str]):
    """
    Context manager to set up a mock:// filesystem in fsspec containing the provided files

    Every ancestor directory of every file is registered too, so nested paths
    such as "data/train/file.txt" stay reachable when listing level by level.

    Args:
        file_paths: "/"-separated paths of the files the filesystem should contain.

    Yields:
        A `DummyTestFS` instance. While the context is active, the same class is
        also registered in fsspec's registry under the "mock" protocol.

    Example:

    ```py
    >>> with mock_fs(["data/train.txt", "data.test.txt"]) as fs:
    ...     assert fsspec.get_filesystem_class("mock").__name__ == "DummyTestFS"
    ...     assert type(fs).__name__ == "DummyTestFS"
    ...     print(fs.glob("**"))
    ["data", "data/train.txt", "data.test.txt"]
    ```
    """

    fs_contents = _mock_fs_contents(file_paths)

    class DummyTestFS(AbstractFileSystem):
        protocol = "mock"
        _file_class = AbstractBufferedFile
        _fs_contents = fs_contents

        def __getitem__(self, name):
            """Return the entry whose "name" equals `name`; raise IndexError if absent."""
            for item in self._fs_contents:
                if item["name"] == name:
                    return item
            raise IndexError(f"{name} not found!")

        def ls(self, path, detail=True, refresh=True, **kwargs):
            """List the direct children of `path`.

            Returns the entry dicts when `detail` is true, otherwise only their names.
            """
            if kwargs.pop("strip_proto", True):
                path = self._strip_protocol(path)

            # Only consult the directory cache when the caller did not ask for a refresh.
            files = not refresh and self._ls_from_cache(path)
            if not files:
                # Direct children are the entries whose parent is exactly `path`.
                files = [file for file in self._fs_contents if path == self._parent(file["name"])]
                files.sort(key=lambda file: file["name"])
                self.dircache[path.rstrip("/")] = files

            if detail:
                return files
            return [file["name"] for file in files]

        def _open(
            self,
            path,
            mode="rb",
            block_size=None,
            autocommit=True,
            cache_options=None,
            **kwargs,
        ):
            """Build a `_file_class` object for `path`, forwarding all arguments unchanged."""
            return self._file_class(
                self,
                path,
                mode,
                block_size,
                autocommit,
                cache_options=cache_options,
                **kwargs,
            )

    # Register the class under "mock" only for the lifetime of the context.
    with patch.dict(fsspec.registry.target, {"mock": DummyTestFS}):
        yield DummyTestFS()


def _mock_fs_contents(file_paths: List[str]) -> List[dict]:
    """Build the entry list (directories first, then files) backing `mock_fs`."""
    dir_paths = set()
    for file_path in file_paths:
        parts = file_path.split("/")
        # Every proper prefix of the path is a directory that must exist;
        # registering only the first component would orphan nested files.
        for depth in range(1, len(parts)):
            prefix = "/".join(parts[:depth])
            if prefix:
                dir_paths.add(prefix)
    # Sorted so the resulting list does not depend on set iteration order.
    directories = [{"name": dir_path, "type": "directory"} for dir_path in sorted(dir_paths)]
    files = [{"name": file_path, "type": "file", "size": 10} for file_path in file_paths]
    return directories + files
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a comment (feel free to ignore it): you use here unittest.mock.patch, but you could use pytest.monkeypatch instead.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is a context manager that doesn't have access to the monkeypatch fixture of pytest, so I used unittest.mock.patch instead.

Copy link
Member

@albertvillanova albertvillanova Jul 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lhoestq again not important: but indeed you are not using the patching. You are just using the returned instance DummyTestFS().

So I guess you could just remove the patching (unittest.mock.patch) and the test will pass anyway.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
with patch.dict(fsspec.registry.target, {"mock": DummyTestFS}):
yield DummyTestFS()
yield DummyTestFS()

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I ended up removing the patching and the context manager :) merging

It makes sense if it is not indeed necessary.



@pytest.mark.parametrize(
"data_file_per_split",
[
Expand Down Expand Up @@ -541,24 +612,30 @@ def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
{"validation": "dev/dataset.txt"},
# With other extensions
{"train": "train.parquet", "test": "test.parquet", "validation": "valid.parquet"},
# With "dev" or "eval" without separators
{"train": "developers_list.txt"},
{"train": "data/seqeval_results.txt"},
Comment on lines +587 to +588
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe also adding a test for "test": "contest.txt"?

{"train": "contest.txt"},
# With supported separators
{"test": "my.test.file.txt"},
{"test": "my-test-file.txt"},
{"test": "my_test_file.txt"},
{"test": "my test file.txt"},
{"test": "test00001.txt"},
],
)
def test_get_data_files_patterns(data_file_per_split):
data_file_per_split = {k: v if isinstance(v, list) else [v] for k, v in data_file_per_split.items()}
file_paths = [file_path for split_file_paths in data_file_per_split.values() for file_path in split_file_paths]

def resolver(pattern):
return [PurePath(path) for path in chain(*data_file_per_split.values()) if PurePath(path).match(pattern)]
with mock_fs(file_paths) as fs:
return [PurePath(file_path) for file_path in fs.glob(pattern) if fs.isfile(file_path)]

patterns_per_split = _get_data_files_patterns(resolver)
assert sorted(patterns_per_split.keys()) == sorted(data_file_per_split.keys())
for split, patterns in patterns_per_split.items():
matched = [
path
for path in chain(*data_file_per_split.values())
for pattern in patterns
if PurePath(path).match(pattern)
]
assert len(matched) == len(data_file_per_split[split])
matched = [file_path.as_posix() for pattern in patterns for file_path in resolver(pattern)]
assert matched == data_file_per_split[split]


Expand Down