Add Validation For README #2121
@@ -0,0 +1,58 @@
```python
#!/usr/bin/env python

"""This script runs in CI and makes sure all new changes to dataset README files have valid content present."""

from pathlib import Path
from subprocess import check_output
from typing import List

from datasets.utils.readme import validate_readme


def get_changed_files(repo_path: Path) -> List[Path]:
    # List the files that differ between HEAD and origin/master.
    diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path)
    changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()]
    return changed_files


if __name__ == "__main__":
    import logging
    from argparse import ArgumentParser

    logging.basicConfig(level=logging.DEBUG)

    ap = ArgumentParser()
    ap.add_argument("--repo_path", type=Path, default=Path.cwd())
    ap.add_argument("--check_all", action="store_true")
    args = ap.parse_args()

    repo_path: Path = args.repo_path
    if args.check_all:
        readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir()]
    else:
        changed_files = get_changed_files(repo_path)
        readmes = [
            f
            for f in changed_files
            if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets"
        ]

    failed: List[Path] = []
    for readme in sorted(readmes):
        try:
            validate_readme(readme)
            logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'")
        except (ValueError, Warning) as e:
            # validate_readme raises ValueError on structure errors and Warning on missing YAML tags.
            failed.append(readme)
            logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}")
        except Exception as e:
            failed.append(readme)
            logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}")

    if len(failed) > 0:
        logging.info(f"❌ Failed on {len(failed)} files.")
        exit(1)
    else:
        logging.info("All is well, keep up the good work 🤗!")
        exit(0)
```
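To make the selection logic concrete, here is a minimal sketch of the README filter with made-up paths (the `f.exists()` check is dropped because these paths are fictional; `squad` is just an example dataset name):

```python
from pathlib import Path

# Hypothetical output of get_changed_files, for illustration only.
changed_files = [
    Path("datasets/squad/README.md"),  # dataset card -> kept
    Path("src/datasets/load.py"),      # library code -> skipped (not a README)
    Path("README.md"),                 # repo root -> skipped (grandparent is not "datasets")
]
readmes = [
    f
    for f in changed_files
    if f.name.lower() == "readme.md" and f.parent.parent.name == "datasets"
]
print(readmes)  # [PosixPath('datasets/squad/README.md')]
```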
@@ -0,0 +1,173 @@
```python
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple

import yaml


BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils"
this_url = f"{BASE_REF_URL}/{__file__}"
logger = logging.getLogger(__name__)


def load_yaml_resource(resource: str) -> Tuple[Any, str]:
    with open(resource) as f:
        content = yaml.safe_load(f)
    return content, f"{BASE_REF_URL}/resources/{resource}"


readme_structure, known_readme_structure_url = load_yaml_resource("readme_structure.yaml")
filler_text = [
    "[Needs More Information]",
    "[More Information Needed]",
    "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)",
]


class Section:
```
**Member:** In the future we may have subclasses of this to have more fine-grained validation per section.

**Contributor (Author):** I think this class can be extended and we can keep a section-to-class mapping in the future. For now, this should be fine, right?

**Member:** Yes, it's fine for now.
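A minimal sketch of what that section-to-class mapping could look like (the names here are hypothetical, not part of this PR):

```python
# Hypothetical extension, for illustration only: dispatch to a Section
# subclass per heading name for finer-grained validation.
class CitationSection(Section):
    def validate_content(self) -> bool:
        # e.g. a "Citation Information" section should contain a BibTeX entry.
        return "@" in self.text


SECTION_CLASS_MAP = {"Citation Information": CitationSection}


def make_section(name, level, lines=None):
    cls = SECTION_CLASS_MAP.get(name, Section)
    return cls(name, level, lines)
```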
````python
    def __init__(self, name, level, lines=None):
        self.name = name
        self.level = level
        self.text = ""
        self.is_empty = True
        self.content = {}
        if lines is not None:
            self.parse(lines)

    def parse(self, lines):
        current_sub_level = ""
        current_lines = []
        code_start = False
        for line in lines:
            if line.strip(" \n") == "":
                continue
            elif line.strip(" \n")[:3] == "```":
                # Toggle code-block state so headings inside fenced code are ignored.
                code_start = not code_start
            elif line.split()[0] == self.level + "#" and not code_start:
                # A heading one level below this section starts a new subsection.
                if current_sub_level != "":
                    self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
                    current_lines = []
                else:
                    if current_lines != []:
                        self.text += "".join(current_lines).strip()
                        if self.text != "" and self.text not in filler_text:
                            self.is_empty = False
                    current_lines = []

                current_sub_level = " ".join(line.split()[1:]).strip(" \n")
            else:
                current_lines.append(line)
        else:
            # Flush whatever remains once the lines are exhausted.
            if current_sub_level != "":
                self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
            else:
                if current_lines != []:
                    self.text += "".join(current_lines).strip()
                    if self.text != "" and self.text not in filler_text:
                        self.is_empty = False

    def to_dict(self):
        return {
            "name": self.name,
            "text": self.text,
            "is_empty": self.is_empty,
            "subsections": [value.to_dict() for value in self.content.values()],
        }


class ReadMe(Section):  # Level 0
    def __init__(self, file_path):
        super().__init__(name=file_path, level="")
        self.yaml_tags_line_count = -2
        self.parse(file_path)

    def parse(self, file_path):
        with open(self.name) as f:
            # Skip the YAML tags block delimited by two "---" lines.
            tag_count = 0
            for line in f:
                self.yaml_tags_line_count += 1
                if line.strip(" \n") == "---":
                    tag_count += 1

                if tag_count == 2:
                    break
            else:
                raise ValueError(
                    "The README doesn't contain proper tags. Please ensure you add the correct YAML tags."
                )
            super().parse(f)

    def _validate_section(self, section, structure):
        # Text validation
        error_list = []
        if not structure["allow_empty"]:
            if section.is_empty:
                error_list.append(f"Expected some text for section '{section.name}'")

        if structure["subsections"] is not None:
            # If no subsections are present
            if section.content == {}:
                values = [subsection["name"] for subsection in structure["subsections"]]
                error_list.append(f"'{section.name}' expected the following subsections: {values}, found `None`.")
            else:
                # Validate each expected key
                structure_names = [subsection["name"] for subsection in structure["subsections"]]
                for idx, name in enumerate(structure_names):
                    if name not in section.content:
                        error_list.append(f"'{section.name}' is missing subsection: '{name}'.")
                    else:
                        error_list += self._validate_section(section.content[name], structure["subsections"][idx])

                for name in section.content:
                    if name not in structure_names:
                        error_list.append(
                            f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection."
                        )

        return error_list

    def __str__(self):
        return str(self.to_dict())

    def validate(self, readme_structure):
        error_list = []
        num_first_level_keys = len(self.content.keys())
        if num_first_level_keys > 1:
            error_list.append(
                f"The README has several first-level headings: {list(self.content.keys())}. Only one heading is expected."
            )
        elif num_first_level_keys < 1:
            error_list.append("The README has no first-level headings.")
        else:
            start_key = list(self.content.keys())[0]
            if start_key.startswith("Dataset Card for"):
                error_list += self._validate_section(self.content[start_key], readme_structure["subsections"][0])
            else:
                error_list.append("No first-level heading starting with `Dataset Card for` found.")
        return error_list


def validate_readme(file_path):
    readme = ReadMe(file_path)
    if readme.yaml_tags_line_count == 0:
        raise Warning("YAML tags are not present in this README.")
    elif readme.yaml_tags_line_count == -1:
        raise Warning("Only the start of the YAML tags is present in this README.")
    error_list = readme.validate(readme_structure)
    if error_list != []:
        errors = "\n".join(list(map(lambda x: "-\t" + x, error_list)))
        error_string = "The following issues were found with the README:\n" + errors
        raise ValueError(error_string)


if __name__ == "__main__":
    from argparse import ArgumentParser

    ap = ArgumentParser(usage="Validate the content (excluding YAML tags) of a README.md file.")
    ap.add_argument("readme_filepath")
    args = ap.parse_args()
    readme_filepath = Path(args.readme_filepath)
    validate_readme(readme_filepath)
````
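As a quick check of the parser and validator above — a minimal sketch, assuming the module is importable as `datasets.utils.readme` (per the import in the CI script) and that its YAML resource resolves (see the `pkg_resources` discussion below):

```python
import tempfile
from pathlib import Path

from datasets.utils.readme import ReadMe, readme_structure

# A tiny README with YAML tag markers and one heading tree.
content = """---
languages:
- en
---
# Dataset Card for Demo
## Table of Contents
A table of contents would go here.
"""
with tempfile.NamedTemporaryFile("w", suffix=".md", delete=False) as f:
    f.write(content)
    path = Path(f.name)

readme = ReadMe(path)
print(readme.to_dict()["subsections"][0]["name"])  # Dataset Card for Demo
# Validation flags every required section that is missing, e.g.
# "'Dataset Card for Demo' is missing subsection: 'Dataset Description'."
for error in readme.validate(readme_structure):
    print(error)
```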
@@ -0,0 +1,87 @@
```yaml
name: ""  # Filename comes here
allow_empty: false
subsections:
  - name: "Dataset Card for X"  # First-level markdown heading
    allow_empty: true
    subsections:
      - name: "Table of Contents"
        allow_empty: false
        subsections: null  # meaning it should not be checked.
      - name: "Dataset Description"
        allow_empty: false
        subsections:
          - name: "Dataset Summary"
            allow_empty: false
            subsections: null
          - name: "Supported Tasks and Leaderboards"
            allow_empty: true
            subsections: null
          - name: "Languages"
            allow_empty: true
            subsections: null
      - name: "Dataset Structure"
        allow_empty: true
        subsections:
          - name: "Data Instances"
            allow_empty: false
            subsections: null
          - name: "Data Fields"
            allow_empty: false
            subsections: null
          - name: "Data Splits"
            allow_empty: false
            subsections: null
      - name: "Dataset Creation"
        allow_empty: true
        subsections:
          - name: "Curation Rationale"
            allow_empty: true
            subsections: null
          - name: "Source Data"
            allow_empty: true
            subsections:
              - name: "Initial Data Collection and Normalization"
                allow_empty: true
                subsections: null
              - name: "Who are the source language producers?"
                allow_empty: true
                subsections: null
          - name: "Annotations"
            allow_empty: true
            subsections:
              - name: "Annotation process"
                allow_empty: true
                subsections: null
              - name: "Who are the annotators?"
                allow_empty: true
                subsections: null
          - name: "Personal and Sensitive Information"
            allow_empty: true
            subsections: null
      - name: "Considerations for Using the Data"
        allow_empty: true
        subsections:
          - name: "Social Impact of Dataset"
            allow_empty: true
            subsections: null
          - name: "Discussion of Biases"
            allow_empty: true
            subsections: null
          - name: "Other Known Limitations"
            allow_empty: true
            subsections: null
      - name: "Additional Information"
        allow_empty: true
        subsections:
          - name: "Dataset Curators"
            allow_empty: true
            subsections: null
          - name: "Licensing Information"
            allow_empty: true
            subsections: null
          - name: "Citation Information"
            allow_empty: false
            subsections: null
          - name: "Contributions"
            allow_empty: false
            subsections: null
```
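Loaded with PyYAML (as `load_yaml_resource` above does), the structure nests as plain dicts and lists — a quick sanity check, assuming it is run from the resources directory:

```python
import yaml

with open("readme_structure.yaml") as f:
    structure = yaml.safe_load(f)

# The top-level node wraps the single "Dataset Card for X" heading.
card = structure["subsections"][0]
print(card["name"])                           # Dataset Card for X
print(card["subsections"][1]["name"])         # Dataset Description
print(card["subsections"][1]["allow_empty"])  # False
```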
**Member:** You may need to use `pkg_resources` here to load the YAML data. See an example in datasets/src/datasets/utils/metadata.py, lines 25 to 27 in 8e903b5.

**Contributor (Author):** Okay, I'll use `pkg_resources`, but can you please explain why it is needed?
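For context on why: `load_yaml_resource` above opens the YAML with a bare relative path, which resolves against the current working directory. Once the `datasets` package is pip-installed, `readme_structure.yaml` lives inside the installed package, so a plain `open()` fails unless the process happens to run from that directory; `pkg_resources` locates the file relative to the package itself. A minimal sketch of the suggested change, assuming the YAML ships as package data under `datasets/utils/resources/` (the layout implied by `BASE_REF_URL`):

```python
import pkg_resources
import yaml

def load_yaml_resource(resource: str):
    # Resolve the file inside the installed datasets.utils package,
    # regardless of the current working directory.
    content = pkg_resources.resource_string("datasets.utils", f"resources/{resource}")
    return yaml.safe_load(content), f"{BASE_REF_URL}/resources/{resource}"
```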