Skip to content

Commit bb42d5c

Browse files
theo-m, theo, and SBrandeis
authored
Metadata validation (#2107)
* basic validation * ci script and test change * color is better * check all option * validate size cats & multiling, point to reference file urls on error * add validation to ci and rename files * spurrious change to trigger CI * add qa reqs * disallow empty lists * better error msg: show all invalid values rather than first one * some code shuffling & better error msg for langcodes * add pyyaml to qa reqs * fix package file loading * include json resources * reflect changes to size cats from huggingface/datasets-tagging#11 * trying another format for package_data * ci works! fixing the readme like a good citizen 🤗 * escape validation everywhere it's allowed in the tagging app * code review: more json files, conditional import * pointers to integrate readme metadata in class (wip) * no pydantic * fix docs? * Revert "fix docs?" This reverts commit ab82a6c. * remove pointers to add readme to loader * Get rid of langcodes, some refactor * Update languages.json * Refactor, add tests * I said, tests!! Co-authored-by: theo <theo@matussie.re> Co-authored-by: SBrandeis <s.brandeis@protonmail.com>
1 parent b529b49 commit bb42d5c

12 files changed

Lines changed: 1934 additions & 14 deletions

File tree

.circleci/config.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ jobs:
8181
- run: black --check --line-length 119 --target-version py36 tests src benchmarks datasets metrics
8282
- run: isort --check-only tests src benchmarks datasets metrics
8383
- run: flake8 tests src benchmarks datasets metrics
84+
- run: ./scripts/datasets_metadata_validator.py
8485

8586
build_doc:
8687
working_directory: ~/datasets
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/usr/bin/env python
2+
3+
""" This script will run in CI and make sure all new changes to datasets readme files have valid metadata yaml headers.
4+
5+
"""
6+
7+
from pathlib import Path
8+
from subprocess import check_output
9+
from typing import List
10+
11+
from datasets.utils.metadata import DatasetMetadata
12+
13+
14+
def get_changed_files(repo_path: Path) -> List[Path]:
    """Return the paths of files that differ between HEAD and origin/master.

    Args:
        repo_path (:obj:`Path`): Root of the git repository to diff.

    Returns:
        :obj:`List[Path]`: One path (rooted at ``repo_path``) per changed file.
    """
    raw_diff = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path)
    return [repo_path / name for name in raw_diff.decode().splitlines()]
18+
19+
20+
if __name__ == "__main__":
    import logging
    from argparse import ArgumentParser

    logging.basicConfig(level=logging.DEBUG)

    parser = ArgumentParser()
    parser.add_argument("--repo_path", type=Path, default=Path.cwd())
    parser.add_argument("--check_all", action="store_true")
    parsed = parser.parse_args()

    repo_path: Path = parsed.repo_path
    if parsed.check_all:
        # Validate every dataset card in the repository.
        readmes = [dataset_dir / "README.md" for dataset_dir in (repo_path / "datasets").iterdir()]
    else:
        # Validate only dataset cards touched by the current change set:
        # files named README.md living directly under datasets/<name>/.
        readmes = []
        for changed in get_changed_files(repo_path):
            if changed.exists() and changed.name.lower() == "readme.md" and changed.parent.parent.name == "datasets":
                readmes.append(changed)

    failed: List[Path] = []
    for readme in sorted(readmes):
        try:
            DatasetMetadata.from_readme(readme)
            logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'")
        except TypeError as e:
            # TypeError is the validation-failure signal raised by DatasetMetadata.
            failed.append(readme)
            logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}")
        except Exception as e:
            failed.append(readme)
            logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}")

    # Non-zero exit code makes the CI job fail when any card is invalid.
    if failed:
        logging.info(f"❌ Failed on {len(failed)} files.")
        exit(1)
    else:
        logging.info("All is well, keep up the good work 🤗!")
        exit(0)

setup.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -52,16 +52,20 @@
5252
import os
5353
import sys
5454

55-
from setuptools import find_packages
56-
from setuptools import setup
55+
from setuptools import find_packages, setup
56+
5757

5858
DOCLINES = __doc__.split("\n")
5959

6060

6161
# Pin some dependencies for old python versions
6262
_deps = {
63-
"fsspec": "fsspec" if sys.version_info >= (3, 7) else "fsspec<0.8.1", # fsspec>=0.8.1 requires py>=3.7 for async stuff
64-
"s3fs": "s3fs" if sys.version_info >= (3, 7) else "s3fs==0.4.2", # later versions of s3fs have issues downloading directories recursively for py36
63+
"fsspec": "fsspec"
64+
if sys.version_info >= (3, 7)
65+
else "fsspec<0.8.1", # fsspec>=0.8.1 requires py>=3.7 for async stuff
66+
"s3fs": "s3fs"
67+
if sys.version_info >= (3, 7)
68+
else "s3fs==0.4.2", # later versions of s3fs have issues downloading directories recursively for py36
6569
}
6670

6771

@@ -149,6 +153,8 @@
149153
"tldextract>=3.1.0",
150154
"texttable>=1.6.3",
151155
"Werkzeug>=1.0.1",
156+
# metadata validation
157+
"importlib_resources;python_version<'3.7'",
152158
]
153159

154160
if os.name == "nt": # windows
@@ -167,11 +173,7 @@
167173
)
168174

169175

170-
QUALITY_REQUIRE = [
171-
"black",
172-
"isort",
173-
"flake8==3.7.9",
174-
]
176+
QUALITY_REQUIRE = ["black", "flake8==3.7.9", "isort", "pyyaml>=5.3.1"]
175177

176178

177179
EXTRAS_REQUIRE = {
@@ -214,11 +216,7 @@
214216
license="Apache 2.0",
215217
package_dir={"": "src"},
216218
packages=find_packages("src"),
217-
package_data={
218-
"datasets": [
219-
"scripts/templates/*",
220-
],
221-
},
219+
package_data={"datasets": ["scripts/templates/*"], "datasets.utils.resources": ["*.json"]},
222220
entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]},
223221
install_requires=REQUIRED_PKGS,
224222
extras_require=EXTRAS_REQUIRE,

src/datasets/utils/metadata.py

Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
import json
2+
import logging
3+
from dataclasses import dataclass
4+
from pathlib import Path
5+
from typing import Any, Callable, Dict, List, Optional, Tuple
6+
7+
8+
# loading package files: https://stackoverflow.com/a/20885799
9+
try:
10+
import importlib.resources as pkg_resources
11+
except ImportError:
12+
# Try backported to PY<37 `importlib_resources`.
13+
import importlib_resources as pkg_resources
14+
15+
import yaml
16+
17+
from . import resources
18+
19+
20+
# Base URL of the directory on GitHub that holds this module and its JSON reference files.
BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils"
# Use only the file name: ``__file__`` is an absolute filesystem path at runtime,
# so embedding it directly would produce a malformed URL.
this_url = f"{BASE_REF_URL}/{Path(__file__).name}"
logger = logging.getLogger(__name__)
23+
24+
25+
def load_json_resource(resource: str) -> Tuple[Any, str]:
    """Parse a JSON file packaged under ``datasets.utils.resources``.

    Args:
        resource (:obj:`str`): File name of the packaged JSON resource.

    Returns:
        :obj:`Tuple[Any, str]`: The parsed content and the canonical reference
        URL of the resource file on GitHub.
    """
    raw_text = pkg_resources.read_text(resources, resource)
    parsed = json.loads(raw_text)
    return parsed, f"{BASE_REF_URL}/resources/{resource}"
28+
29+
30+
# Reference tag sets used for validation; each pair is (parsed JSON content,
# canonical URL of the resource file to point users at in error messages).
#
# Source of languages.json:
# https://datahub.io/core/language-codes/r/ietf-language-tags.csv
# Language names were obtained with langcodes: https://github.com/LuminosoInsight/langcodes
known_language_codes, known_language_codes_url = load_json_resource("languages.json")
known_licenses, known_licenses_url = load_json_resource("licenses.json")
known_task_ids, known_task_ids_url = load_json_resource("tasks.json")
known_creators, known_creators_url = load_json_resource("creators.json")
known_size_categories, known_size_categories_url = load_json_resource("size_categories.json")
known_multilingualities, known_multilingualities_url = load_json_resource("multilingualities.json")
39+
40+
41+
def yaml_block_from_readme(path: Path) -> Optional[str]:
    """Extract the YAML metadata block from the top of a README file.

    The block must open with ``---`` on the very first line and is closed by a
    later ``---`` line; the delimiters themselves are not included in the result.

    Args:
        path (:obj:`Path`): Path to the README file.

    Returns:
        :obj:`Optional[str]`: The YAML block, or ``None`` if the file is empty
        or has no leading YAML block.
    """
    with path.open() as readme_file:
        # NOTE(review): stripping every line discards indentation, so nested YAML
        # would be flattened — presumably fine for the flat metadata schema; confirm.
        content = [line.strip() for line in readme_file]

    # Guard against an empty file: indexing content[0] unconditionally would raise IndexError.
    if content and content[0] == "---" and "---" in content[1:]:
        return "\n".join(content[1 : content[1:].index("---") + 1])

    return None
50+
51+
52+
def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]:
    """Load a dataset's metadata from its dataset card (README.md) as a Python dict.

    Args:
        path (:obj:`Path`): Path to the dataset card.

    Returns:
        :obj:`Optional[Dict[str, List[str]]]`: The parsed YAML header, or
        ``None`` when the card has no YAML block.
    """
    block = yaml_block_from_readme(path=path)
    if block is None:
        return None
    # An all-comments/empty block parses to None; normalize it to an empty dict.
    return yaml.safe_load(block) or dict()
59+
60+
61+
# A validator returns (accepted values, error message or None).
ValidatorOutput = Tuple[List[str], Optional[str]]


def tagset_validator(values: List[str], reference_values: List[str], name: str, url: str) -> ValidatorOutput:
    """Check every value of field ``name`` against a reference tag set.

    Returns ``(values, None)`` when everything is known, otherwise
    ``([], message)`` where the message lists all unregistered values and
    points at the reference file URL.
    """
    unknown = [value for value in values if value not in reference_values]
    if unknown:
        return [], f"{unknown} are not registered tags for '{name}', reference at {url}"
    return values, None
69+
70+
71+
def escape_validation_for_predicate(
    values: List[Any], predicate_fn: Callable[[Any], bool]
) -> Tuple[List[Any], List[Any]]:
    """Split ``values`` into (escaped, to_validate) using ``predicate_fn``.

    Values matching the predicate escape validation entirely; a warning listing
    them is logged so the escape hatch stays visible in CI output.
    """
    escaped, to_validate = list(), list()
    for value in values:
        # Single pass so the predicate is evaluated exactly once per value.
        (escaped if predicate_fn(value) else to_validate).append(value)
    if escaped:
        logging.getLogger(__name__).warning(f"The following values will escape validation: {escaped}")
    return escaped, to_validate
83+
84+
85+
def validate_metadata_type(metadata_dict: dict):
    """Check that every metadata field is a non-empty list of strings.

    Args:
        metadata_dict (:obj:`dict`): Mapping of field name to field value.

    Raises:
        :obj:`TypeError`: If any field is not a non-empty list of strings.
    """
    basic_typing_errors = {
        name: value
        for name, value in metadata_dict.items()
        # Check every element, not just the first, so mixed-type lists
        # such as ["en", 3] are rejected too.
        if not isinstance(value, list) or len(value) == 0 or not all(isinstance(v, str) for v in value)
    }
    if len(basic_typing_errors) > 0:
        raise TypeError(f"Found fields that are not non-empty list of strings: {basic_typing_errors}")
93+
94+
95+
@dataclass
class DatasetMetadata:
    """Metadata of a dataset, as found in the YAML header of its dataset card (README.md).

    Every field is a non-empty list of string tags. On construction, each field
    is validated against the reference tag sets loaded from the packaged JSON
    resources; a :obj:`TypeError` aggregating all errors is raised on failure.
    """

    annotations_creators: List[str]
    language_creators: List[str]
    languages: List[str]
    licenses: List[str]
    multilinguality: List[str]
    size_categories: List[str]
    source_datasets: List[str]
    task_categories: List[str]
    task_ids: List[str]

    def __post_init__(self):
        # First enforce the basic shape (non-empty lists of strings), then
        # validate each field's values against its reference tag set.
        validate_metadata_type(metadata_dict=vars(self))

        self.annotations_creators, annotations_creators_errors = self.validate_annotations_creators(
            self.annotations_creators
        )
        self.language_creators, language_creators_errors = self.validate_language_creators(self.language_creators)
        self.languages, languages_errors = self.validate_language_codes(self.languages)
        self.licenses, licenses_errors = self.validate_licences(self.licenses)
        self.multilinguality, multilinguality_errors = self.validate_mulitlinguality(self.multilinguality)
        self.size_categories, size_categories_errors = self.validate_size_catgeories(self.size_categories)
        self.source_datasets, source_datasets_errors = self.validate_source_datasets(self.source_datasets)
        self.task_categories, task_categories_errors = self.validate_task_categories(self.task_categories)
        self.task_ids, task_ids_errors = self.validate_task_ids(self.task_ids)

        errors = {
            "annotations_creators": annotations_creators_errors,
            "language_creators": language_creators_errors,
            "licenses": licenses_errors,
            "multilinguality": multilinguality_errors,
            "size_categories": size_categories_errors,
            "source_datasets": source_datasets_errors,
            "task_categories": task_categories_errors,
            "task_ids": task_ids_errors,
            "languages": languages_errors,
        }

        # Aggregate all field errors into a single exception so users see
        # every problem at once instead of fixing them one by one.
        exception_msg_dict = {field: errs for field, errs in errors.items() if errs is not None}
        if len(exception_msg_dict) > 0:
            raise TypeError(
                "Could not validate the metadata, found the following errors:\n"
                + "\n".join(f"* field '{fieldname}':\n\t{err}" for fieldname, err in exception_msg_dict.items())
            )

    @classmethod
    def from_readme(cls, path: Path) -> "DatasetMetadata":
        """Loads and validates the dataset metadata from its dataset card (README.md)

        Args:
            path (:obj:`Path`): Path to the dataset card (its README.md file)

        Returns:
            :class:`DatasetMetadata`: The dataset's metadata

        Raises:
            :obj:`TypeError`: If the dataset card has no metadata (no YAML header)
            :obj:`TypeError`: If the dataset's metadata is invalid
        """
        yaml_string = yaml_block_from_readme(path)
        if yaml_string is not None:
            return cls.from_yaml_string(yaml_string)
        else:
            raise TypeError(f"did not find a yaml block in '{path}'")

    @classmethod
    def from_yaml_string(cls, string: str) -> "DatasetMetadata":
        """Loads and validates the dataset metadata from a YAML string

        Args:
            string (:obj:`str`): The YAML string

        Returns:
            :class:`DatasetMetadata`: The dataset's metadata

        Raises:
            :obj:`TypeError`: If the dataset's metadata is invalid
        """
        metadata_dict = yaml.safe_load(string) or dict()
        return cls(**metadata_dict)

    @staticmethod
    def validate_annotations_creators(annotations_creators: List[str]) -> ValidatorOutput:
        """Validate against the 'annotations' section of the creators reference set."""
        return tagset_validator(
            annotations_creators, known_creators["annotations"], "annotations_creators", known_creators_url
        )

    @staticmethod
    def validate_language_creators(language_creators: List[str]) -> ValidatorOutput:
        """Validate against the 'language' section of the creators reference set."""
        return tagset_validator(language_creators, known_creators["language"], "language_creators", known_creators_url)

    @staticmethod
    def validate_language_codes(languages: List[str]) -> ValidatorOutput:
        """Validate against the known IETF language codes."""
        return tagset_validator(
            values=languages,
            reference_values=known_language_codes.keys(),
            name="languages",
            url=known_language_codes_url,
        )

    @staticmethod
    def validate_licences(licenses: List[str]) -> ValidatorOutput:
        """Validate licenses; values containing '-other-' escape validation."""
        others, to_validate = escape_validation_for_predicate(licenses, lambda e: "-other-" in e)
        validated, error = tagset_validator(to_validate, list(known_licenses.keys()), "licenses", known_licenses_url)
        return [*validated, *others], error

    @staticmethod
    def validate_task_categories(task_categories: List[str]) -> ValidatorOutput:
        # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change
        # in the near future and we don't want to waste energy in tagging against a moving taxonomy.
        known_set = list(known_task_ids.keys())
        others, to_validate = escape_validation_for_predicate(task_categories, lambda e: e.startswith("other"))
        validated, error = tagset_validator(to_validate, known_set, "task_categories", known_task_ids_url)
        return [*validated, *others], error

    @staticmethod
    def validate_task_ids(task_ids: List[str]) -> ValidatorOutput:
        # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change
        # in the near future and we don't want to waste energy in tagging against a moving taxonomy.
        known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]]
        others, to_validate = escape_validation_for_predicate(task_ids, lambda e: "-other-" in e)
        validated, error = tagset_validator(to_validate, known_set, "task_ids", known_task_ids_url)
        return [*validated, *others], error

    @staticmethod
    def validate_mulitlinguality(multilinguality: List[str]) -> ValidatorOutput:
        """Validate multilinguality tags; values starting with 'other' escape validation.

        NOTE: the method name keeps its historical misspelling for backward compatibility.
        """
        others, to_validate = escape_validation_for_predicate(multilinguality, lambda e: e.startswith("other"))
        # Fixed: error messages previously pointed at the size-categories reference URL.
        validated, error = tagset_validator(
            to_validate, list(known_multilingualities.keys()), "multilinguality", known_multilingualities_url
        )
        return [*validated, *others], error

    @staticmethod
    def validate_size_catgeories(size_cats: List[str]) -> ValidatorOutput:
        """Validate size-category tags.

        NOTE: the method name keeps its historical misspelling for backward compatibility.
        """
        return tagset_validator(size_cats, known_size_categories, "size_categories", known_size_categories_url)

    @staticmethod
    def validate_source_datasets(sources: List[str]) -> ValidatorOutput:
        """Validate source-dataset tags: 'original', 'extended', or 'extended|<name>'."""
        invalid_values = []
        for src in sources:
            is_ok = src in ["original", "extended"] or src.startswith("extended|")
            if not is_ok:
                invalid_values.append(src)
        if len(invalid_values) > 0:
            return (
                [],
                f"'source_datasets' has invalid values: {invalid_values}, refer to source code to understand {this_url}",
            )

        return sources, None
249+
250+
251+
if __name__ == "__main__":
    from argparse import ArgumentParser

    # Standalone usage: validate a single dataset card's YAML metadata block.
    parser = ArgumentParser(usage="Validate the yaml metadata block of a README.md file.")
    parser.add_argument("readme_filepath")
    cli_args = parser.parse_args()

    DatasetMetadata.from_readme(Path(cli_args.readme_filepath))

src/datasets/utils/resources/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)