Merged

Changes from 18 commits (32 commits total)

Commits:
fadc0a0  basic validation (Mar 22, 2021)
7a4b594  ci script and test change (Mar 23, 2021)
c3c97ea  color is better (Mar 23, 2021)
2fe5787  check all option (Mar 24, 2021)
0f68ce4  validate size cats & multiling, point to reference file urls on error (Mar 24, 2021)
2d264e8  add validation to ci and rename files (Mar 24, 2021)
fc46ec3  spurrious change to trigger CI (Mar 24, 2021)
58763d2  add qa reqs (Mar 24, 2021)
115d252  disallow empty lists (Mar 24, 2021)
9ae048e  better error msg: show all invalid values rather than first one (Mar 24, 2021)
299e907  some code shuffling & better error msg for langcodes (Mar 24, 2021)
b4a0665  add pyyaml to qa reqs (Mar 24, 2021)
7eeb647  fix package file loading (Mar 24, 2021)
3a94086  include json resources (Mar 24, 2021)
e4409a9  reflect changes to size cats from https://github.com/huggingface/data… (Mar 24, 2021)
9450b5f  trying another format for package_data (Mar 24, 2021)
58709bf  ci works! fixing the readme like a good citizen 🤗 (Mar 24, 2021)
702a8a1  escape validation everywhere it's allowed in the tagging app (Mar 24, 2021)
d3eec3c  code review: more json files, conditional import (Mar 25, 2021)
59d7dde  Merge remote-tracking branch 'origin/master' into theo/config-validator (Mar 26, 2021)
84de013  pointers to integrate readme metadata in class (wip) (Mar 29, 2021)
7fbd51d  no pydantic (Mar 31, 2021)
0aefcae  Merge remote-tracking branch 'origin/master' into theo/config-validator (Mar 31, 2021)
ab82a6c  fix docs? (Mar 31, 2021)
a4953db  Revert "fix docs?" (Mar 31, 2021)
4cfd2e8  Merge remote-tracking branch 'origin/master' into theo/config-validator (Apr 1, 2021)
e63d325  remove pointers to add readme to loader (Apr 1, 2021)
2f2e197  Merge branch 'master' into theo/config-validator (SBrandeis, Apr 23, 2021)
3102ccf  Get rid of langcodes, some refactor (SBrandeis, Apr 23, 2021)
a9846fd  Update languages.json (SBrandeis, Apr 23, 2021)
551ae96  Refactor, add tests (SBrandeis, Apr 23, 2021)
8afb25a  I said, tests!! (SBrandeis, Apr 23, 2021)
1 change: 1 addition & 0 deletions .circleci/config.yml
@@ -81,6 +81,7 @@ jobs:
- run: black --check --line-length 119 --target-version py36 tests src benchmarks datasets metrics
- run: isort --check-only tests src benchmarks datasets metrics
- run: flake8 tests src benchmarks datasets metrics
- run: ./scripts/datasets_metadata_validator.py

build_doc:
working_directory: ~/datasets
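Note: the new validation step runs the script with its default arguments, so on each build it only checks README.md files changed relative to origin/master; running the script with the --check_all flag instead validates every dataset card under datasets/.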
61 changes: 61 additions & 0 deletions scripts/datasets_metadata_validator.py
@@ -0,0 +1,61 @@
#!/usr/bin/env python

""" This script will run in CI and make sure all new changes to datasets readme files have valid metadata yaml headers.

"""

from pathlib import Path
from subprocess import check_output
from typing import List

from pydantic import ValidationError

from datasets.utils.metadata import DatasetMetadata


def get_changed_files(repo_path: Path) -> List[Path]:
diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path)
changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()]
return changed_files


if __name__ == "__main__":
import logging
from argparse import ArgumentParser

logging.basicConfig(level=logging.DEBUG)

ap = ArgumentParser()
ap.add_argument("--repo_path", type=Path, default=Path.cwd())
ap.add_argument("--check_all", action="store_true")
args = ap.parse_args()

repo_path: Path = args.repo_path
if args.check_all:
readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir()]
else:
changed_files = get_changed_files(repo_path)
readmes = [
f
for f in changed_files
if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets"
]

failed: List[Path] = []
for readme in sorted(readmes):
try:
DatasetMetadata.from_readme(readme)
logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'")
except ValidationError as e:
failed.append(readme)
logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}")
except Exception as e:
failed.append(readme)
logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}")

if len(failed) > 0:
logging.info(f"❌ Failed on {len(failed)} files.")
exit(1)
else:
logging.info("All is well, keep up the good work 🤗!")
exit(0)
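
For a quick check outside CI, the same validation can also be exercised directly from Python. A minimal sketch, where the dataset path is only an illustration and the qa dependencies (pydantic, pyyaml, langcodes) are assumed to be installed:

from pathlib import Path

from datasets.utils.metadata import DatasetMetadata

# Hypothetical dataset card path; substitute the README you actually changed.
readme = Path("datasets/squad/README.md")

# Raises ValueError if the file has no leading YAML block, and
# pydantic.ValidationError if a declared tag is not registered.
DatasetMetadata.from_readme(readme)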
39 changes: 18 additions & 21 deletions setup.py
@@ -56,8 +56,8 @@
import os
import sys

from setuptools import find_packages
from setuptools import setup
from setuptools import find_packages, setup


DOCLINES = __doc__.split("\n")

@@ -140,27 +140,28 @@
"texttable>=1.6.3",
"s3fs>=0.4.2",
"Werkzeug>=1.0.1",
# metadata validation
"langcodes[data]>=3.1.0",
"pydantic>=1.8.1",
]

if os.name == "nt": # windows
TESTS_REQUIRE.remove("faiss-cpu") # faiss doesn't exist on windows
else:
# dependencies of unbabel-comet
# only test if not on windows since there're issues installing fairseq on windows
TESTS_REQUIRE.extend([
"wget>=3.2",
"pytorch-nlp==0.5.0",
"pytorch_lightning",
"fastBPE==0.1.0",
"fairseq",
])


QUALITY_REQUIRE = [
"black",
"isort",
"flake8==3.7.9",
]
TESTS_REQUIRE.extend(
[
"wget>=3.2",
"pytorch-nlp==0.5.0",
"pytorch_lightning",
"fastBPE==0.1.0",
"fairseq",
]
)


QUALITY_REQUIRE = ["black", "flake8==3.7.9", "isort", "langcodes[data]>=3.1.0", "pydantic>=1.8.1", "pyyaml>=5.3.1"]


EXTRAS_REQUIRE = {
@@ -199,11 +200,7 @@
license="Apache 2.0",
package_dir={"": "src"},
packages=find_packages("src"),
package_data={
"datasets": [
"scripts/templates/*",
],
},
package_data={"datasets": ["scripts/templates/*"], "datasets.utils.resources": ["*.json"]},
entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]},
install_requires=REQUIRED_PKGS,
extras_require=EXTRAS_REQUIRE,
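Two things worth noting in the setup.py changes: the metadata-validation dependencies (langcodes, pydantic, pyyaml) end up in QUALITY_REQUIRE, presumably because validation runs from the CI quality job rather than the test suite, and the new "datasets.utils.resources": ["*.json"] entry in package_data is what lets the installed package read licenses.json, tasks.json and creators.json through importlib.resources in metadata.py.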
188 changes: 188 additions & 0 deletions src/datasets/utils/metadata.py
@@ -0,0 +1,188 @@
import json
import logging
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple


# loading package files: https://stackoverflow.com/a/20885799
try:
import importlib.resources as pkg_resources
except ImportError:
# Try backported to PY<37 `importlib_resources`.
import importlib_resources as pkg_resources

import langcodes as lc
import yaml
from pydantic import BaseModel, conlist, validator

from . import resources


BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils"
this_url = f"{BASE_REF_URL}/{__file__}"
logger = logging.getLogger(__name__)


def load_json_resource(resource: str) -> Tuple[Dict, str]:
content = pkg_resources.read_text(resources, resource)
return json.loads(content), f"{BASE_REF_URL}/resources/{resource}"


known_licenses, known_licenses_url = load_json_resource("licenses.json")
known_task_ids, known_task_ids_url = load_json_resource("tasks.json")
known_creators, known_creators_url = load_json_resource("creators.json")
known_size_categories = [
"unknown",
"n<1K",
"1K<n<10K",
"10K<n<100K",
"100K<n<1M",
"1M<n<10M",
"10M<n<100M",
"100M<n<1B",
"1B<n<10B",
"10B<n<100B",
"100B<n<1T",
"n>1T",
]
known_multilingualities = {
"monolingual": "contains a single language",
"multilingual": "contains multiple languages",
"translation": "contains translated or aligned text",
"other": "other type of language distribution",
}


def dict_from_readme(f: Path) -> Optional[Dict[str, List[str]]]:

Review comment (Member):

Suggested change:
- def dict_from_readme(f: Path) -> Optional[Dict[str, List[str]]]:
+ def dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]:

Use explicit argument names.
Can you also add a docstring?

with f.open() as fi:
content = [line.strip() for line in fi]

if content[0] == "---" and "---" in content[1:]:
yamlblock = "\n".join(content[1 : content[1:].index("---") + 1])
metadata_dict = yaml.safe_load(yamlblock) or dict()
return metadata_dict


def tagset_validator(values: List[str], reference_values: List[str], name: str, url: str) -> List[str]:
invalid_values = [v for v in values if v not in reference_values]
if len(invalid_values) > 0:
raise ValueError(f"{invalid_values} are not registered tags for '{name}', reference at {url}")
return values


def escape_validation_for_predicate(
values: List[Any], predicate_fn: Callable[[Any], bool]
) -> Tuple[List[Any], List[Any]]:
trues, falses = list(), list()
for v in values:
if predicate_fn(v):
trues.append(v)
else:
falses.append(v)
if len(trues) > 0:
logger.warning(f"The following values will escape validation: {trues}")
return trues, falses


class DatasetMetadata(BaseModel):
annotations_creators: conlist(str, min_items=1)
language_creators: conlist(str, min_items=1)
languages: conlist(str, min_items=1)
licenses: conlist(str, min_items=1)
multilinguality: conlist(str, min_items=1)
size_categories: conlist(str, min_items=1)
source_datasets: conlist(str, min_items=1)
task_categories: conlist(str, min_items=1)
task_ids: conlist(str, min_items=1)

@classmethod
def from_readme(cls, f: Path) -> "DatasetMetadata":
metadata_dict = dict_from_readme(f)
if metadata_dict is not None:
return cls(**metadata_dict)
else:
raise ValueError(f"did not find a yaml block in '{f}'")

@classmethod
def from_yaml_string(cls, string: str) -> "DatasetMetadata":
metadata_dict = yaml.safe_load(string) or dict()
return cls(**metadata_dict)

@validator("annotations_creators")
def annotations_creators_must_be_in_known_set(cls, annotations_creators: List[str]) -> List[str]:
return tagset_validator(annotations_creators, known_creators["annotations"], "annotations", known_creators_url)

@validator("language_creators")
def language_creators_must_be_in_known_set(cls, language_creators: List[str]) -> List[str]:
return tagset_validator(language_creators, known_creators["language"], "language", known_creators_url)

@validator("languages")
def language_code_must_be_recognized(cls, languages: List[str]):
invalid_values = []
for code in languages:
try:
lc.get(code)
except lc.tag_parser.LanguageTagError:
invalid_values.append(code)
if len(invalid_values) > 0:
raise ValueError(
f"{invalid_values} are not recognised as valid language codes (BCP47 norm), you can refer to https://github.com/LuminosoInsight/langcodes"
)
return languages

@validator("licenses")
def licenses_must_be_in_known_set(cls, licenses: List[str]):
others, to_validate = escape_validation_for_predicate(licenses, lambda e: "-other-" in e)
return [*tagset_validator(to_validate, list(known_licenses.keys()), "licenses", known_licenses_url), *others]

@validator("task_categories")
def task_category_must_be_in_known_set(cls, task_categories: List[str]):
# TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change
# in the near future and we don't want to waste energy in tagging against a moving taxonomy.
known_set = list(known_task_ids.keys())
others, to_validate = escape_validation_for_predicate(task_categories, lambda e: e.startswith("other"))
return [*tagset_validator(to_validate, known_set, "task_categories", known_task_ids_url), *others]

@validator("task_ids")
def task_id_must_be_in_known_set(cls, task_ids: List[str]):
# TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change
# in the near future and we don't want to waste energy in tagging against a moving taxonomy.
known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]]
others, to_validate = escape_validation_for_predicate(task_ids, lambda e: "-other-" in e)
return [*tagset_validator(to_validate, known_set, "task_ids", known_task_ids_url), *others]

@validator("multilinguality")
def multilinguality_must_be_in_known_set(cls, multilinguality: List[str]):
others, to_validate = escape_validation_for_predicate(multilinguality, lambda e: e.startswith("other"))
return [
*tagset_validator(to_validate, list(known_multilingualities.keys()), "multilinguality", this_url),
*others,
]

@validator("size_categories")
def size_categories_must_be_in_known_set(cls, size_cats: List[str]):
return tagset_validator(size_cats, known_size_categories, "size_categories", this_url)

@validator("source_datasets")
def source_datasets_must_be_in_known_set(cls, sources: List[str]):
invalid_values = []
for src in sources:
is_ok = src in ["original", "extended"] or src.startswith("extended|")
if not is_ok:
invalid_values.append(src)
if len(invalid_values) > 0:
raise ValueError(
f"'source_datasets' has invalid values: {invalid_values}, refer to source code to understand {this_url}"
)
return sources


if __name__ == "__main__":
from argparse import ArgumentParser

ap = ArgumentParser(usage="Validate the yaml metadata block of a README.md file.")
ap.add_argument("readme_filepath")
args = ap.parse_args()

readme_filepath = Path(args.readme_filepath)
DatasetMetadata.from_readme(readme_filepath)
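
To make the expected metadata format concrete, here is a minimal sketch of validating a YAML block directly. The tag values are only illustrative (in particular, the license tag is assumed to be registered in resources/licenses.json); anything unregistered that is not covered by an "other" escape raises a pydantic ValidationError:

from datasets.utils.metadata import DatasetMetadata

yaml_block = """\
annotations_creators:
- found
language_creators:
- found
languages:
- en
licenses:
- unknown
multilinguality:
- monolingual
size_categories:
- 10K<n<100K
source_datasets:
- original
task_categories:
- other
task_ids:
- text-classification-other-example
"""

try:
    metadata = DatasetMetadata.from_yaml_string(yaml_block)
    print("valid:", metadata.languages)
except ValueError as err:  # pydantic.ValidationError is a ValueError subclass
    print("invalid:", err)

The "other" escapes mirror the tagging app: task_categories values starting with "other" and task_ids containing "-other-" skip strict validation, with a warning logged instead.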
Empty file.
17 changes: 17 additions & 0 deletions src/datasets/utils/resources/creators.json
@@ -0,0 +1,17 @@
{
"language": [
"found",
"crowdsourced",
"expert-generated",
"machine-generated",
"other"
],
"annotations": [
"found",
"crowdsourced",
"expert-generated",
"machine-generated",
"no-annotation",
"other"
]
}
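These two lists back the annotations_creators and language_creators validators in metadata.py above; a creator tag that is not listed here fails validation, and the error message points back to this file's URL in the repository.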