huggingface · lhoestq · May 10, 2021 · Mar 26, 2021 · Mar 29, 2021 · Mar 29, 2021
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -32,7 +32,6 @@ jobs:
             - run: pip install pyarrow==1.0.0
             - run: HF_SCRIPTS_VERSION=master python -m pytest -sv ./tests/
 
-
     run_dataset_script_tests_pyarrow_latest_WIN:
         working_directory: ~/datasets
         executor:
@@ -82,6 +81,7 @@ jobs:
             - run: isort --check-only tests src benchmarks datasets metrics
             - run: flake8 tests src benchmarks datasets metrics
             - run: ./scripts/datasets_metadata_validator.py
+            - run: ./scripts/datasets_readme_validator.py
 
     build_doc:
         working_directory: ~/datasets
@@ -100,8 +100,8 @@ jobs:
             - image: circleci/python:3.6
         steps:
             - add_ssh_keys:
-                fingerprints:
-                    - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
+                  fingerprints:
+                      - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
             - checkout
             - run: sudo pip install .[docs]
             - run: ./.circleci/deploy.sh

diff --git a/scripts/datasets_readme_validator.py b/scripts/datasets_readme_validator.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+"""This script runs in CI and makes sure that all new changes to dataset README files have valid README content."""
+
+from pathlib import Path
+from subprocess import check_output
+from typing import List
+
+from datasets.utils.readme import ReadMe
+
+
+def get_changed_files(repo_path: Path) -> List[Path]:
+    diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path)
+    changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()]
+    return changed_files
+
+
+if __name__ == "__main__":
+    import logging
+    from argparse import ArgumentParser
+
+    logging.basicConfig(level=logging.DEBUG)
+
+    ap = ArgumentParser()
+    ap.add_argument("--repo_path", type=Path, default=Path.cwd())
+    ap.add_argument("--check_all", action="store_true")
+    args = ap.parse_args()
+
+    repo_path: Path = args.repo_path
+    if args.check_all:
+        readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir()]
+    else:
+        changed_files = get_changed_files(repo_path)
+        readmes = [
+            f
+            for f in changed_files
+            if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets"
+        ]
+
+    failed: List[Path] = []
+    for readme in sorted(readmes):
+        try:
+            ReadMe.from_readme(readme)
+            logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'")
+        except ValueError as e:
+            failed.append(readme)
+            logging.warning(f"❌ Validation failed for '{readme.relative_to(repo_path)}':\n{e}")
+        except Exception as e:
+            failed.append(readme)
+            logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}")
+
+    if len(failed) > 0:
+        logging.info(f"❌ Failed on {len(failed)} files.")
+        exit(1)
+    else:
+        logging.info("All is well, keep up the good work 🤗!")
+        exit(0)
diff --git a/setup.py b/setup.py
@@ -216,7 +216,7 @@
     license="Apache 2.0",
     package_dir={"": "src"},
     packages=find_packages("src"),
-    package_data={"datasets": ["scripts/templates/*"], "datasets.utils.resources": ["*.json"]},
+    package_data={"datasets": ["scripts/templates/*"], "datasets.utils.resources": ["*.json", "*.yaml"]},
     entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]},
     install_requires=REQUIRED_PKGS,
     extras_require=EXTRAS_REQUIRE,

diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py
@@ -0,0 +1,279 @@
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, List, Tuple
+
+import yaml
+
+
+# loading package files: https://stackoverflow.com/a/20885799
+try:
+    import importlib.resources as pkg_resources
+except ImportError:
+    # Try backported to PY<37 `importlib_resources`.
+    import importlib_resources as pkg_resources
+
+from . import resources
+
+
+BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils"
+this_url = f"{BASE_REF_URL}/{__file__}"
+logger = logging.getLogger(__name__)
+
+
+def load_yaml_resource(resource: str) -> Tuple[Any, str]:
+    content = pkg_resources.read_text(resources, resource)
+    return yaml.safe_load(content), f"{BASE_REF_URL}/resources/{resource}"
+
+
+# Expected README structure and the URL built for the YAML file describing it; loaded once at import time.
+readme_structure, known_readme_structure_url = load_yaml_resource("readme_structure.yaml")
+
+# A section whose whole text is exactly equal to one of these strings is treated as empty by `Section.parse`.
+FILLER_TEXT = [
+    "[Needs More Information]",
+    "[More Information Needed]",
+    "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)",
+]
+
+# Dictionary representation of section/readme, error_list, warning_list
+ReadmeValidatorOutput = Tuple[dict, List[str], List[str]]
+
+
+@dataclass
+class Section:
+    name: str
+    level: str
+    lines: List[str] = None
+
+    def __post_init__(self):
+        self.text = ""
+        self.is_empty_text = True
+        self.content = {}
+        self.parsing_error_list = []
+        self.parsing_warning_list = []
+        if self.lines is not None:
+            self.parse()
+
+    def parse(self):
+        current_sub_level = ""
+        current_lines = []
+        code_start = False
+        for line in self.lines:
+            if line.strip(" \n") == "":
+                continue
+            elif line.strip(" \n")[:3] == "```":
+                code_start = not code_start
+            elif line.split()[0] == self.level + "#" and not code_start:
+                if current_sub_level != "":
+                    self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
+                    current_lines = []
+                else:
+                    if current_lines != []:
+                        self.text += "".join(current_lines).strip()
+                        if self.text != "" and self.text not in FILLER_TEXT:
+                            self.is_empty_text = False
+                        current_lines = []
+
+                current_sub_level = " ".join(line.split()[1:]).strip(" \n")
+            else:
+                current_lines.append(line)
+        else:
+            if current_sub_level != "":
+                if current_sub_level in self.content:
+                    self.parsing_error_list.append(
+                        f"Multiple sections with the same heading `{current_sub_level}` have been found. Please keep only one of these sections."
+                    )
+                self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
+            else:
+                if current_lines != []:
+                    self.text += "".join(current_lines).strip()
+                    if self.text != "" and self.text not in FILLER_TEXT:
+                        self.is_empty_text = False
+
+    def validate(self, structure: dict) -> ReadmeValidatorOutput:
+        """Validates a Section class object recursively using the structure provided as a dictionary.
+
+        Args:
+            structute (:obj: `dict`): The dictionary representing expected structure.
+
+        Returns:
+            :obj: `ReadmeValidatorOutput`: The dictionary representation of the section, and the errors.
+        """
+        # Header text validation
+        error_list = []
+        warning_list = []
+        if structure["allow_empty"] is False:
+            # If content is expected
+            if self.is_empty_text and self.content == {}:
+                # If no content is found, mention it in the error_list
+                error_list.append(f"Expected some content in section `{self.name}` but it is empty.")
+
+        if structure["allow_empty_text"] is False:
+            # If some text is expected
+            if self.is_empty_text:
+                # If no text is found, mention it in the error_list
+                error_list.append(
+                    f"Expected some text in section `{self.name}` but it is empty (text in subsections are ignored)."
+                )
+        # Subsections Validation
+        if structure["subsections"] is not None:
+            # If subsections are expected
+            if self.content == {}:
+                # If no subsections are present
+                values = [subsection["name"] for subsection in structure["subsections"]]
+                # Mention the expected values in the error_list
+                error_list.append(
+                    f"Section `{self.name}` expected the following subsections: {', '.join(['`'+x+'`' for x in values])}. Found 'None'."
+                )
+            else:
+                # If some subsections are present
+                structure_names = [subsection["name"] for subsection in structure["subsections"]]
+                for idx, name in enumerate(structure_names):
+                    if name not in self.content:
+                        # If the expected subsection is not present
+                        error_list.append(f"Section `{self.name}` is missing subsection: `{name}`.")
+                    else:
+                        # If the subsection is present, validate subsection, return the result
+                        # and concat the errors from subsection to section error_list
+
+                        # Skip sublevel validation if current level is `###`
+                        if self.level == "###":
+                            continue
+                        else:
+                            _, subsec_error_list, subsec_warning_list = self.content[name].validate(
+                                structure["subsections"][idx]
+                            )
+                        error_list += subsec_error_list
+                        warning_list += subsec_warning_list
+
+                for name in self.content:
+                    if name not in structure_names:
+                        # If an extra subsection is present
+                        warning_list.append(
+                            f"`{self.name}` has an extra subsection: `{name}`. Skipping further validation checks for this subsection as expected structure is unknown."
+                        )
+        error_list = self.parsing_error_list + error_list
+        warning_list = self.parsing_warning_list + warning_list
+        if error_list:
+            # If there are errors, do not return the dictionary as it is invalid
+            return {}, error_list, warning_list
+        else:
+            return self.to_dict(), error_list, warning_list
+
+    def to_dict(self) -> dict:
+        """Returns the dictionary representation of a section."""
+        return {
+            "name": self.name,
+            "text": self.text,
+            "is_empty_text": self.is_empty_text,
+            "subsections": [value.to_dict() for value in self.content.values()],
+        }
+
+
+class ReadMe(Section):  # Level 0
+    def __init__(self, name: str, lines: List[str], structure: dict = None):
+        super().__init__(name=name, level="")  # Not using lines here as we need to use a child class parse
+        self.structure = structure
+        self.yaml_tags_line_count = -2
+        self.tag_count = 0
+        self.lines = lines
+        if self.lines is not None:
+            self.parse()
+
+        # Validation
+        if self.structure is None:
+            content, error_list, warning_list = self.validate(readme_structure)
+        else:
+            content, error_list, warning_list = self.validate(self.structure)
+
+        error_list = self.parsing_error_list + error_list
+        warning_list = self.parsing_warning_list + warning_list
+        if error_list != [] or warning_list != []:
+            errors = "\n".join(list(map(lambda x: "-\t" + x, error_list + warning_list)))
+            error_string = f"The following issues were found for the README at `{self.name}`:\n" + errors
+            raise ValueError(error_string)
+
+    @classmethod
+    def from_readme(cls, path: Path, structure: dict = None):
+        with open(path) as f:
+            lines = f.readlines()
+        return cls(path, lines, structure)
+
+    @classmethod
+    def from_string(cls, string: str, structure: dict = None, root_name: str = "root"):
+        lines = string.split("\n")
+        return cls(root_name, lines, structure)
+
+    def parse(self):
+        # Skip Tags
+        line_count = 0
+
+        for line in self.lines:
+            self.yaml_tags_line_count += 1
+            if line.strip(" \n") == "---":
+                self.tag_count += 1
+                if self.tag_count == 2:
+                    break
+            line_count += 1
+        if self.tag_count == 2:
+            self.lines = self.lines[line_count + 1 :]  # Get the last + 1 th item.
+        else:
+            self.lines = self.lines[self.tag_count :]
+        super().parse()
+
+    def __str__(self):
+        """Returns the string of dictionary representation of the ReadMe."""
+        return str(self.to_dict())
+
+    def validate(self, readme_structure):
+        error_list = []
+        warning_list = []
+        if self.yaml_tags_line_count == 0:
+            warning_list.append("Empty YAML markers are present in the README.")
+        elif self.tag_count == 0:
+            warning_list.append("No YAML markers are present in the README.")
+        elif self.tag_count == 1:
+            warning_list.append("Only the start of YAML tags present in the README.")
+        # Check how many first level sections are present.
+        num_first_level_keys = len(self.content.keys())
+        if num_first_level_keys > 1:
+            # If more than one, add to the error list, continue
+            error_list.append(
+                f"The README has several first-level headings: {', '.join(['`'+x+'`' for x in list(self.content.keys())])}. Only one heading is expected. Skipping further validation for this README."
+            )
+        elif num_first_level_keys < 1:
+            # If less than one, append error.
+            error_list.append(
+                f"The README has no first-level headings. One heading is expected. Skipping further validation for this README."
+            )
+
+        else:
+            # If one exactly
+            start_key = list(self.content.keys())[0]  # Get the key
+            if start_key.startswith("Dataset Card for"):  # Check correct start
+
+                # If the starting is correct, validate all the sections
+                _, sec_error_list, sec_warning_list = self.content[start_key].validate(
+                    readme_structure["subsections"][0]
+                )
+                error_list += sec_error_list
+                warning_list += sec_warning_list
+            else:
+                # If not found, append error
+                error_list.append(
+                    f"No first-level heading starting with `Dataset Card for` found in README. Skipping further validation for this README."
+                )
+        if error_list:
+            # If there are errors, do not return the dictionary as it is invalid
+            return {}, error_list, warning_list
+        else:
+            return self.to_dict(), error_list, warning_list
+
+
+if __name__ == "__main__":
+    from argparse import ArgumentParser
+
+    ap = ArgumentParser(usage="Validate the content (excluding YAML tags) of a README.md file.")
+    ap.add_argument("readme_filepath")
+    args = ap.parse_args()
+    readme_filepath = Path(args.readme_filepath)
+    readme = ReadMe.from_readme(readme_filepath)