Merged
Commits (38)
- 993acd6: Add Initial README parser (gchhablani, Mar 26, 2021)
- c2f0699: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, Mar 29, 2021)
- 014c49d: Add basic validation checks (gchhablani, Mar 29, 2021)
- 2040602: Minor fix (gchhablani, Mar 29, 2021)
- 79c2ad0: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, Apr 1, 2021)
- 7a1654b: Changes from review (gchhablani, Apr 1, 2021)
- c6d2345: Merge branch 'master' of https://github.com/huggingface/datasets into… (gchhablani, Apr 8, 2021)
- 99d2222: Make main into a function in readme_parser (gchhablani, Apr 8, 2021)
- 51c08ae: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, Apr 23, 2021)
- 4e54669: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, Apr 24, 2021)
- 1d788a9: Move README validator to scripts (gchhablani, Apr 25, 2021)
- 2d13f70: Arrange README validation files (gchhablani, Apr 26, 2021)
- a1c1f67: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, Apr 27, 2021)
- ee31e15: Update readme validator class (gchhablani, Apr 27, 2021)
- ae60ce5: Add from_string tests (gchhablani, Apr 27, 2021)
- 362e464: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, Apr 28, 2021)
- 057d0d9: Add PyTest tests (gchhablani, Apr 28, 2021)
- cd4b69e: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, May 2, 2021)
- 35e08d8: Add tests for from_readme (gchhablani, May 2, 2021)
- a3de91a: Add ReadMe validator script (gchhablani, May 2, 2021)
- 8dd3feb: Fix style (gchhablani, May 2, 2021)
- 87b0668: Remove print statement (gchhablani, May 2, 2021)
- 1d49a4d: Add validator to CircleCI (gchhablani, May 2, 2021)
- d9f0ac3: Fix style (gchhablani, May 2, 2021)
- 414fc2e: Add YAML files to setup resources (gchhablani, May 2, 2021)
- 0c3425a: Make validator executable (gchhablani, May 2, 2021)
- 933fdf7: Add no subsections test (gchhablani, May 2, 2021)
- cd895a1: Add incorrect YAML test (gchhablani, May 2, 2021)
- a3bdb1f: Fix style (gchhablani, May 2, 2021)
- 6e85d4a: Fix tests (gchhablani, May 2, 2021)
- 10386e7: Fix tests (gchhablani, May 2, 2021)
- b4ca9ca: Fix style (gchhablani, May 2, 2021)
- a69c019: Fix escape character issue (gchhablani, May 4, 2021)
- f12a105: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, May 4, 2021)
- d45ec9b: Add three-level heading validation limit (gchhablani, May 7, 2021)
- 309a69e: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, May 7, 2021)
- cdcffe0: Add either text or subsection option (gchhablani, May 7, 2021)
- ffdfcb6: Fix style (gchhablani, May 7, 2021)
58 changes: 58 additions & 0 deletions scripts/datasets_readme_validator.py
@@ -0,0 +1,58 @@
#!/usr/bin/env python

""" This script will run in CI and make sure all new changes to datasets readme files have valid content present.
"""

from pathlib import Path
from subprocess import check_output
from typing import List

from datasets.utils.readme import validate_readme


def get_changed_files(repo_path: Path) -> List[Path]:
    # List the files that differ between HEAD and origin/master.
    diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path)
changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()]
return changed_files


if __name__ == "__main__":
import logging
from argparse import ArgumentParser

logging.basicConfig(level=logging.DEBUG)

ap = ArgumentParser()
ap.add_argument("--repo_path", type=Path, default=Path.cwd())
ap.add_argument("--check_all", action="store_true")
args = ap.parse_args()

repo_path: Path = args.repo_path
if args.check_all:
readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir()]
else:
changed_files = get_changed_files(repo_path)
readmes = [
f
for f in changed_files
if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets"
]

failed: List[Path] = []
for readme in sorted(readmes):
try:
            validate_readme(readme)
            logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'")
        except (ValueError, Warning) as e:
failed.append(readme)
logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}")
except Exception as e:
failed.append(readme)
logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}")

    if failed:
logging.info(f"❌ Failed on {len(failed)} files.")
exit(1)
else:
logging.info("All is well, keep up the good work 🤗!")
exit(0)
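Assuming the script is placed at scripts/datasets_readme_validator.py as in this PR, it would be run from the repository root with `python scripts/datasets_readme_validator.py` to validate only the dataset cards changed relative to origin/master, or with the `--check_all` flag to validate every README under datasets/.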
173 changes: 173 additions & 0 deletions src/datasets/utils/readme.py
@@ -0,0 +1,173 @@
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple

import yaml


BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils"
this_url = f"{BASE_REF_URL}/{__file__}"
logger = logging.getLogger(__name__)


def load_yaml_resource(resource: str) -> Tuple[Any, str]:
with open(resource) as f:
content = yaml.safe_load(f)
Member:

You may need to use pkg_resources here to load the yaml data.

See an example here:

def load_json_resource(resource: str) -> Tuple[Any, str]:
    content = pkg_resources.read_text(resources, resource)
    return json.loads(content), f"{BASE_REF_URL}/resources/{resource}"

Contributor Author:

Okay, I'll use pkg_resources, but can you please explain why it is needed?

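For context on why pkg_resources is needed: it resolves the resource relative to the installed package rather than the process working directory, so the YAML file is still found when `datasets` is installed as a wheel instead of being run from a source checkout. A rough sketch of what `load_yaml_resource` might look like under that suggestion (assuming `importlib.resources` is imported as `pkg_resources`, and a `resources` subpackage sits next to this module; these names are illustrative, not part of the diff):

import importlib.resources as pkg_resources  # assumed import alias, mirroring the snippet above

from . import resources  # assumed: the package directory holding readme_structure.yaml


def load_yaml_resource(resource: str) -> Tuple[Any, str]:
    # Read the packaged YAML file and parse it; the URL points at the
    # reference copy of the resource on GitHub.
    content = pkg_resources.read_text(resources, resource)
    return yaml.safe_load(content), f"{BASE_REF_URL}/resources/{resource}"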
return content, f"{BASE_REF_URL}/resources/{resource}"


readme_structure, known_readme_structure_url = load_yaml_resource("readme_structure.yaml")
filler_text = [
"[Needs More Information]",
"[More Information Needed]",
"(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)",
]


class Section:
Member:

In the future we may have subclasses of this to allow more fine-grained validation per section.

Contributor Author:

I think this class can be extended and we can keep a section-to-class mapping in the future. For now, this should be fine, right?

Member:

Yes, it's fine for now.
def __init__(self, name, level, lines=None):
self.name = name
self.level = level
self.text = ""
self.is_empty = True
self.content = {}
if lines is not None:
self.parse(lines)

    def parse(self, lines):
        current_sub_level = ""
        current_lines = []
        code_start = False  # tracks whether we are currently inside a fenced code block
        for line in lines:
            if line.strip(" \n") == "":
                continue
            elif line.strip(" \n")[:3] == "```":
                code_start = not code_start
            # A heading exactly one level deeper than this section starts a new
            # subsection (unless it appears inside a code block).
            elif line.split()[0] == self.level + "#" and not code_start:
if current_sub_level != "":
self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
current_lines = []
else:
if current_lines != []:
self.text += "".join(current_lines).strip()
if self.text != "" and self.text not in filler_text:
self.is_empty = False
current_lines = []

current_sub_level = " ".join(line.split()[1:]).strip(" \n")
else:
current_lines.append(line)
else:
if current_sub_level != "":
self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
else:
if current_lines != []:
self.text += "".join(current_lines).strip()
if self.text != "" and self.text not in filler_text:
self.is_empty = False

def to_dict(self):
return {
"name": self.name,
"text": self.text,
"is_empty": self.is_empty,
"subsections": [value.to_dict() for value in self.content.values()],
}

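The review thread above suggests a section-to-class mapping for finer-grained per-section validation. A hypothetical sketch of that idea (the registry, decorator, and subclass below are invented for illustration and are not part of this PR):

SECTION_CLASSES: Dict[str, type] = {}


def register_section(name: str):
    # Register a Section subclass to handle a specific heading name.
    def decorator(cls):
        SECTION_CLASSES[name] = cls
        return cls

    return decorator


@register_section("Citation Information")
class CitationSection(Section):
    def parse(self, lines):
        super().parse(lines)
        # Illustrative extra check: warn when the text does not look like BibTeX.
        if not self.is_empty and "@" not in self.text:
            logger.warning("'Citation Information' does not appear to contain a BibTeX entry.")


def make_section(name: str, level: str, lines=None) -> Section:
    # Fall back to the generic Section when no specialized class is registered.
    return SECTION_CLASSES.get(name, Section)(name, level, lines)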

class ReadMe(Section):  # Level 0
    def __init__(self, file_path):
        super().__init__(name=file_path, level="")
        # Counts the lines between the two '---' YAML delimiters; the initial
        # value of -2 offsets the two delimiter lines themselves.
        self.yaml_tags_line_count = -2
        self.parse(file_path)

def parse(self, file_path):
with open(self.name) as f:
# Skip Tags
tag_count = 0
for line in f:
self.yaml_tags_line_count += 1
if line.strip(" \n") == "---":
tag_count += 1

if tag_count == 2:
break
            else:
                # for/else: the loop ended without finding the closing '---',
                # so the YAML tags are malformed or missing.
                raise ValueError(
                    "The README doesn't contain proper tags. Please ensure you add the correct YAML tags."
                )
super().parse(f)

def _validate_section(self, section, structure):
# Text validation
error_list = []
if structure["allow_empty"] == False:
if section.is_empty:
error_list.append(f"Expected some text for section '{section.name}'")

if structure["subsections"] is not None:
# If no subsections present
if section.content == {}:
values = [subsection["name"] for subsection in structure["subsections"]]
error_list.append(f"'{section.name}' expected the following subsections: {values}, found `None`.")
else:
# Each key validation
structure_names = [subsection["name"] for subsection in structure["subsections"]]
for idx, name in enumerate(structure_names):
if name not in section.content:
error_list.append(f"'{section.name}' is missing subsection: '{name}'.")
else:
error_list += self._validate_section(section.content[name], structure["subsections"][idx])

for name in section.content:
if name not in structure_names:
error_list.append(
f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection."
)

return error_list

def __str__(self):
return str(self.to_dict())

def validate(self, readme_structure):
error_list = []
num_first_level_keys = len(self.content.keys())
        if num_first_level_keys > 1:
            error_list.append(
                f"The README has several first-level headings: {list(self.content.keys())}. Only one heading is expected."
            )
        elif num_first_level_keys < 1:
            error_list.append("The README has no first-level headings.")
        else:
start_key = list(self.content.keys())[0]
if start_key.startswith("Dataset Card for"):
error_list += self._validate_section(self.content[start_key], readme_structure["subsections"][0])
else:
error_list.append("No first-level heading starting with `Dataset Card for` found.")
return error_list


def validate_readme(file_path):
Member:

Could you write a few tests for this function? That would be appreciated.

Contributor Author:

Yes, I will add the tests.

readme = ReadMe(file_path)
if readme.yaml_tags_line_count == 0:
raise Warning("YAML Tags are not present in this README.")
    elif readme.yaml_tags_line_count == -1:
        raise Warning("Only the start of the YAML tags is present in this README.")
error_list = readme.validate(readme_structure)
    if error_list:
        errors = "\n".join(f"-\t{x}" for x in error_list)
        error_string = "The following issues were found with the README:\n" + errors
raise ValueError(error_string)


if __name__ == "__main__":
from argparse import ArgumentParser

    ap = ArgumentParser(description="Validate the content (excluding YAML tags) of a README.md file.")
ap.add_argument("readme_filepath")
args = ap.parse_args()
readme_filepath = Path(args.readme_filepath)
validate_readme(readme_filepath)
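The review thread above asks for tests of validate_readme. A minimal sketch of what such a test might look like (assuming pytest and its tmp_path fixture, and that the readme_structure.yaml resource loads correctly from an installed package, per the pkg_resources note above; all names here are illustrative):

import pytest

from datasets.utils.readme import validate_readme

# An intentionally incomplete card: valid YAML tags and a correct first-level
# heading, but required sections such as 'Dataset Description' are missing.
INCOMPLETE_README = """\
---
languages:
- en
---
# Dataset Card for Example

## Table of Contents
Some table of contents.
"""


def test_validate_readme_reports_missing_sections(tmp_path):
    readme_path = tmp_path / "README.md"
    readme_path.write_text(INCOMPLETE_README)
    # Validation should fail and name at least one missing subsection.
    with pytest.raises(ValueError, match="Dataset Description"):
        validate_readme(readme_path)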
Empty file.
87 changes: 87 additions & 0 deletions src/datasets/utils/resources/readme_structure.yaml
@@ -0,0 +1,87 @@
name: "" # Filename comes here
allow_empty: false
subsections:
- name: "Dataset Card for X" # First-level markdown heading
allow_empty: true
subsections:
- name: "Table of Contents"
allow_empty: false
        subsections: null # meaning subsections are not checked.
- name: "Dataset Description"
allow_empty: false
subsections:
- name: "Dataset Summary"
allow_empty: false
subsections: null
- name: "Supported Tasks and Leaderboards"
allow_empty: true
subsections: null
      - name: "Languages"
allow_empty: true
subsections: null
- name: "Dataset Structure"
allow_empty: true
subsections:
- name: "Data Instances"
allow_empty: false
subsections: null
- name: "Data Fields"
allow_empty: false
subsections: null
- name: "Data Splits"
allow_empty: false
subsections: null
- name: "Dataset Creation"
allow_empty: true
subsections:
- name: "Curation Rationale"
allow_empty: true
subsections: null
- name: "Source Data"
allow_empty: true
subsections:
- name: "Initial Data Collection and Normalization"
allow_empty: true
subsections: null
- name: "Who are the source language producers?"
allow_empty: true
subsections: null
- name: "Annotations"
allow_empty: true
subsections:
- name: "Annotation process"
allow_empty: true
subsections: null
- name: "Who are the annotators?"
allow_empty: true
subsections: null
- name: "Personal and Sensitive Information"
allow_empty: true
subsections: null
- name: "Considerations for Using the Data"
allow_empty: true
subsections:
- name: "Social Impact of Dataset"
allow_empty: true
subsections: null
- name: "Discussion of Biases"
allow_empty: true
subsections: null
- name: "Other Known Limitations"
allow_empty: true
subsections: null
- name: "Additional Information"
allow_empty: true
subsections:
- name: "Dataset Curators"
allow_empty: true
subsections: null
- name: "Licensing Information"
allow_empty: true
subsections: null
- name: "Citation Information"
allow_empty: false
subsections: null
- name: "Contributions"
allow_empty: false
subsections: null
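A note on the semantics, as implemented by _validate_section above: every named section must be present in the card regardless of allow_empty (a missing section is always reported as an error); allow_empty only controls whether the section's text may be blank, and subsections: null means validation does not recurse into that section's children.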