diff --git a/.circleci/config.yml b/.circleci/config.yml index 2361409c4c9..c1e870b8fd7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -32,7 +32,6 @@ jobs: - run: pip install pyarrow==1.0.0 - run: HF_SCRIPTS_VERSION=master python -m pytest -sv ./tests/ - run_dataset_script_tests_pyarrow_latest_WIN: working_directory: ~/datasets executor: @@ -82,6 +81,7 @@ jobs: - run: isort --check-only tests src benchmarks datasets metrics - run: flake8 tests src benchmarks datasets metrics - run: ./scripts/datasets_metadata_validator.py + - run: ./scripts/datasets_readme_validator.py build_doc: working_directory: ~/datasets @@ -100,8 +100,8 @@ jobs: - image: circleci/python:3.6 steps: - add_ssh_keys: - fingerprints: - - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71" + fingerprints: + - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71" - checkout - run: sudo pip install .[docs] - run: ./.circleci/deploy.sh diff --git a/scripts/datasets_readme_validator.py b/scripts/datasets_readme_validator.py new file mode 100755 index 00000000000..af0cc05445d --- /dev/null +++ b/scripts/datasets_readme_validator.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +""" This script will run in CI and make sure all new changes to datasets readme files have valid readme content.""" + +from pathlib import Path +from subprocess import check_output +from typing import List + +from datasets.utils.readme import ReadMe + + +def get_changed_files(repo_path: Path) -> List[Path]: + diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path) + changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()] + return changed_files + + +if __name__ == "__main__": + import logging + from argparse import ArgumentParser + + logging.basicConfig(level=logging.DEBUG) + + ap = ArgumentParser() + ap.add_argument("--repo_path", type=Path, default=Path.cwd()) + ap.add_argument("--check_all", action="store_true") + args = ap.parse_args() + + repo_path: Path 
= args.repo_path + if args.check_all: + readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir()] + else: + changed_files = get_changed_files(repo_path) + readmes = [ + f + for f in changed_files + if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets" + ] + + failed: List[Path] = [] + for readme in sorted(readmes): + try: + ReadMe.from_readme(readme) + logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'") + except ValueError as e: + failed.append(readme) + logging.warning(f"❌ Validation failed for '{readme.relative_to(repo_path)}':\n{e}") + except Exception as e: + failed.append(readme) + logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}") + + if len(failed) > 0: + logging.info(f"❌ Failed on {len(failed)} files.") + exit(1) + else: + logging.info("All is well, keep up the good work 🤗!") + exit(0) diff --git a/setup.py b/setup.py index c1d7542119d..0a87810e8d9 100644 --- a/setup.py +++ b/setup.py @@ -216,7 +216,7 @@ license="Apache 2.0", package_dir={"": "src"}, packages=find_packages("src"), - package_data={"datasets": ["scripts/templates/*"], "datasets.utils.resources": ["*.json"]}, + package_data={"datasets": ["scripts/templates/*"], "datasets.utils.resources": ["*.json", "*.yaml"]}, entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]}, install_requires=REQUIRED_PKGS, extras_require=EXTRAS_REQUIRE, diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py new file mode 100644 index 00000000000..45b8713f085 --- /dev/null +++ b/src/datasets/utils/readme.py @@ -0,0 +1,279 @@ +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import Any, List, Tuple + +import yaml + + +# loading package files: https://stackoverflow.com/a/20885799 +try: + import importlib.resources as pkg_resources +except ImportError: + # Try backported to PY<37 `importlib_resources`. 
+ import importlib_resources as pkg_resources + +from . import resources + + +BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils" +this_url = f"{BASE_REF_URL}/{__file__}" +logger = logging.getLogger(__name__) + + +def load_yaml_resource(resource: str) -> Tuple[Any, str]: + content = pkg_resources.read_text(resources, resource) + return yaml.safe_load(content), f"{BASE_REF_URL}/resources/{resource}" + + +readme_structure, known_readme_structure_url = load_yaml_resource("readme_structure.yaml") + +FILLER_TEXT = [ + "[Needs More Information]", + "[More Information Needed]", + "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)", +] + +# Dictionary representation of section/readme, error_list, warning_list +ReadmeValidatorOutput = Tuple[dict, List[str], List[str]] + + +@dataclass +class Section: + name: str + level: str + lines: List[str] = None + + def __post_init__(self): + self.text = "" + self.is_empty_text = True + self.content = {} + self.parsing_error_list = [] + self.parsing_warning_list = [] + if self.lines is not None: + self.parse() + + def parse(self): + current_sub_level = "" + current_lines = [] + code_start = False + for line in self.lines: + if line.strip(" \n") == "": + continue + elif line.strip(" \n")[:3] == "```": + code_start = not code_start + elif line.split()[0] == self.level + "#" and not code_start: + if current_sub_level != "": + self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) + current_lines = [] + else: + if current_lines != []: + self.text += "".join(current_lines).strip() + if self.text != "" and self.text not in FILLER_TEXT: + self.is_empty_text = False + current_lines = [] + + current_sub_level = " ".join(line.split()[1:]).strip(" \n") + else: + current_lines.append(line) + else: + if current_sub_level != "": + if current_sub_level in self.content: + 
self.parsing_error_list.append( + f"Multiple sections with the same heading `{current_sub_level}` have been found. Please keep only one of these sections." + ) + self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) + else: + if current_lines != []: + self.text += "".join(current_lines).strip() + if self.text != "" and self.text not in FILLER_TEXT: + self.is_empty_text = False + + def validate(self, structure: dict) -> ReadmeValidatorOutput: + """Validates a Section class object recursively using the structure provided as a dictionary. + + Args: + structure (:obj: `dict`): The dictionary representing expected structure. + + Returns: + :obj: `ReadmeValidatorOutput`: The dictionary representation of the section, and the errors. + """ + # Header text validation + error_list = [] + warning_list = [] + if structure["allow_empty"] is False: + # If content is expected + if self.is_empty_text and self.content == {}: + # If no content is found, mention it in the error_list + error_list.append(f"Expected some content in section `{self.name}` but it is empty.") + + if structure["allow_empty_text"] is False: + # If some text is expected + if self.is_empty_text: + # If no text is found, mention it in the error_list + error_list.append( + f"Expected some text in section `{self.name}` but it is empty (text in subsections are ignored)." + ) + # Subsections Validation + if structure["subsections"] is not None: + # If subsections are expected + if self.content == {}: + # If no subsections are present + values = [subsection["name"] for subsection in structure["subsections"]] + # Mention the expected values in the error_list + error_list.append( + f"Section `{self.name}` expected the following subsections: {', '.join(['`'+x+'`' for x in values])}. Found 'None'."
+ ) + else: + # If some subsections are present + structure_names = [subsection["name"] for subsection in structure["subsections"]] + for idx, name in enumerate(structure_names): + if name not in self.content: + # If the expected subsection is not present + error_list.append(f"Section `{self.name}` is missing subsection: `{name}`.") + else: + # If the subsection is present, validate subsection, return the result + # and concat the errors from subsection to section error_list + + # Skip sublevel validation if current level is `###` + if self.level == "###": + continue + else: + _, subsec_error_list, subsec_warning_list = self.content[name].validate( + structure["subsections"][idx] + ) + error_list += subsec_error_list + warning_list += subsec_warning_list + + for name in self.content: + if name not in structure_names: + # If an extra subsection is present + warning_list.append( + f"`{self.name}` has an extra subsection: `{name}`. Skipping further validation checks for this subsection as expected structure is unknown." 
+ ) + error_list = self.parsing_error_list + error_list + warning_list = self.parsing_warning_list + warning_list + if error_list: + # If there are errors, do not return the dictionary as it is invalid + return {}, error_list, warning_list + else: + return self.to_dict(), error_list, warning_list + + def to_dict(self) -> dict: + """Returns the dictionary representation of a section.""" + return { + "name": self.name, + "text": self.text, + "is_empty_text": self.is_empty_text, + "subsections": [value.to_dict() for value in self.content.values()], + } + + +class ReadMe(Section): # Level 0 + def __init__(self, name: str, lines: List[str], structure: dict = None): + super().__init__(name=name, level="") # Not using lines here as we need to use a child class parse + self.structure = structure + self.yaml_tags_line_count = -2 + self.tag_count = 0 + self.lines = lines + if self.lines is not None: + self.parse() + + # Validation + if self.structure is None: + content, error_list, warning_list = self.validate(readme_structure) + else: + content, error_list, warning_list = self.validate(self.structure) + + error_list = self.parsing_error_list + error_list + warning_list = self.parsing_warning_list + warning_list + if error_list != [] or warning_list != []: + errors = "\n".join(list(map(lambda x: "-\t" + x, error_list + warning_list))) + error_string = f"The following issues were found for the README at `{self.name}`:\n" + errors + raise ValueError(error_string) + + @classmethod + def from_readme(cls, path: Path, structure: dict = None): + with open(path) as f: + lines = f.readlines() + return cls(path, lines, structure) + + @classmethod + def from_string(cls, string: str, structure: dict = None, root_name: str = "root"): + lines = string.split("\n") + return cls(root_name, lines, structure) + + def parse(self): + # Skip Tags + line_count = 0 + + for line in self.lines: + self.yaml_tags_line_count += 1 + if line.strip(" \n") == "---": + self.tag_count += 1 + if self.tag_count 
== 2: + break + line_count += 1 + if self.tag_count == 2: + self.lines = self.lines[line_count + 1 :] # Get the last + 1 th item. + else: + self.lines = self.lines[self.tag_count :] + super().parse() + + def __str__(self): + """Returns the string of dictionary representation of the ReadMe.""" + return str(self.to_dict()) + + def validate(self, readme_structure): + error_list = [] + warning_list = [] + if self.yaml_tags_line_count == 0: + warning_list.append("Empty YAML markers are present in the README.") + elif self.tag_count == 0: + warning_list.append("No YAML markers are present in the README.") + elif self.tag_count == 1: + warning_list.append("Only the start of YAML tags present in the README.") + # Check how many first level sections are present. + num_first_level_keys = len(self.content.keys()) + if num_first_level_keys > 1: + # If more than one, add to the error list, continue + error_list.append( + f"The README has several first-level headings: {', '.join(['`'+x+'`' for x in list(self.content.keys())])}. Only one heading is expected. Skipping further validation for this README." + ) + elif num_first_level_keys < 1: + # If less than one, append error. + error_list.append( + f"The README has no first-level headings. One heading is expected. Skipping further validation for this README." + ) + + else: + # If one exactly + start_key = list(self.content.keys())[0] # Get the key + if start_key.startswith("Dataset Card for"): # Check correct start + + # If the starting is correct, validate all the sections + _, sec_error_list, sec_warning_list = self.content[start_key].validate( + readme_structure["subsections"][0] + ) + error_list += sec_error_list + warning_list += sec_warning_list + else: + # If not found, append error + error_list.append( + f"No first-level heading starting with `Dataset Card for` found in README. Skipping further validation for this README." 
+ ) + if error_list: + # If there are errors, do not return the dictionary as it is invalid + return {}, error_list, warning_list + else: + return self.to_dict(), error_list, warning_list + + +if __name__ == "__main__": + from argparse import ArgumentParser + + ap = ArgumentParser(usage="Validate the content (excluding YAML tags) of a README.md file.") + ap.add_argument("readme_filepath") + args = ap.parse_args() + readme_filepath = Path(args.readme_filepath) + readme = ReadMe.from_readme(readme_filepath) diff --git a/src/datasets/utils/resources/readme_structure.yaml b/src/datasets/utils/resources/readme_structure.yaml new file mode 100644 index 00000000000..755483d1d4f --- /dev/null +++ b/src/datasets/utils/resources/readme_structure.yaml @@ -0,0 +1,116 @@ +name: "" # Filename comes here +allow_empty: false +allow_empty_text: true +subsections: + - name: "Dataset Card for X" # First-level markdown heading + allow_empty: false + allow_empty_text: true + subsections: + - name: "Table of Contents" + allow_empty: false + allow_empty_text: false + subsections: null # meaning it should not be checked. 
+ - name: "Dataset Description" + allow_empty: false + allow_empty_text: false + subsections: + - name: "Dataset Summary" + allow_empty: false + allow_empty_text: false + subsections: null + - name: "Supported Tasks and Leaderboards" + allow_empty: true + allow_empty_text: true + subsections: null + - name: Languages + allow_empty: true + allow_empty_text: true + subsections: null + - name: "Dataset Structure" + allow_empty: false + allow_empty_text: true + subsections: + - name: "Data Instances" + allow_empty: false + allow_empty_text: false + subsections: null + - name: "Data Fields" + allow_empty: false + allow_empty_text: false + subsections: null + - name: "Data Splits" + allow_empty: false + allow_empty_text: false + subsections: null + - name: "Dataset Creation" + allow_empty: false + allow_empty_text: true + subsections: + - name: "Curation Rationale" + allow_empty: true + allow_empty_text: true + subsections: null + - name: "Source Data" + allow_empty: false + allow_empty_text: true + subsections: + - name: "Initial Data Collection and Normalization" + allow_empty: true + allow_empty_text: true + subsections: null + - name: "Who are the source language producers?" + allow_empty: true + allow_empty_text: true + subsections: null + - name: "Annotations" + allow_empty: false + allow_empty_text: true + subsections: + - name: "Annotation process" + allow_empty: true + allow_empty_text: true + subsections: null + - name: "Who are the annotators?" 
+ allow_empty: true + allow_empty_text: true + subsections: null + - name: "Personal and Sensitive Information" + allow_empty: true + allow_empty_text: true + subsections: null + - name: "Considerations for Using the Data" + allow_empty: true + allow_empty_text: true + subsections: + - name: "Social Impact of Dataset" + allow_empty: true + allow_empty_text: true + subsections: null + - name: "Discussion of Biases" + allow_empty: true + allow_empty_text: true + subsections: null + - name: "Other Known Limitations" + allow_empty: true + allow_empty_text: true + subsections: null + - name: "Additional Information" + allow_empty: true + allow_empty_text: true + subsections: + - name: "Dataset Curators" + allow_empty: true + allow_empty_text: true + subsections: null + - name: "Licensing Information" + allow_empty: true + allow_empty_text: true + subsections: null + - name: "Citation Information" + allow_empty: false + allow_empty_text: true + subsections: null + - name: "Contributions" + allow_empty: false + allow_empty_text: false + subsections: null diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py new file mode 100644 index 00000000000..711c6ea55c5 --- /dev/null +++ b/tests/test_readme_util.py @@ -0,0 +1,457 @@ +import re +import tempfile +from pathlib import Path + +import pytest +import yaml + +from datasets.utils.readme import ReadMe + + +# @pytest.fixture +# def example_yaml_structure(): + +example_yaml_structure = yaml.safe_load( + """\ +name: "" +allow_empty: false +allow_empty_text: true +subsections: + - name: "Dataset Card for X" # First-level markdown heading + allow_empty: false + allow_empty_text: true + subsections: + - name: "Table of Contents" + allow_empty: false + allow_empty_text: false + subsections: null + - name: "Dataset Description" + allow_empty: false + allow_empty_text: false + subsections: + - name: "Dataset Summary" + allow_empty: false + allow_empty_text: false + subsections: null + - name: "Supported Tasks and 
Leaderboards" + allow_empty: true + allow_empty_text: true + subsections: null + - name: Languages + allow_empty: false + allow_empty_text: true + subsections: null +""" +) + + +CORRECT_DICT = { + "name": "root", + "text": "", + "is_empty_text": True, + "subsections": [ + { + "name": "Dataset Card for My Dataset", + "text": "", + "is_empty_text": True, + "subsections": [ + {"name": "Table of Contents", "text": "Some text here.", "is_empty_text": False, "subsections": []}, + { + "name": "Dataset Description", + "text": "Some text here.", + "is_empty_text": False, + "subsections": [ + { + "name": "Dataset Summary", + "text": "Some text here.", + "is_empty_text": False, + "subsections": [], + }, + { + "name": "Supported Tasks and Leaderboards", + "text": "", + "is_empty_text": True, + "subsections": [], + }, + {"name": "Languages", "text": "Language Text", "is_empty_text": False, "subsections": []}, + ], + }, + ], + } + ], +} + + +README_CORRECT = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +Language Text +""" + + +README_CORRECT_FOUR_LEVEL = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. 
+#### Extra Ignored Subsection +### Supported Tasks and Leaderboards +### Languages +Language Text +""" + +CORRECT_DICT_FOUR_LEVEL = { + "name": "root", + "text": "", + "is_empty_text": True, + "subsections": [ + { + "name": "Dataset Card for My Dataset", + "text": "", + "is_empty_text": True, + "subsections": [ + {"name": "Table of Contents", "text": "Some text here.", "is_empty_text": False, "subsections": []}, + { + "name": "Dataset Description", + "text": "Some text here.", + "is_empty_text": False, + "subsections": [ + { + "name": "Dataset Summary", + "text": "Some text here.", + "is_empty_text": False, + "subsections": [ + { + "name": "Extra Ignored Subsection", + "text": "", + "is_empty_text": True, + "subsections": [], + } + ], + }, + { + "name": "Supported Tasks and Leaderboards", + "text": "", + "is_empty_text": True, + "subsections": [], + }, + {"name": "Languages", "text": "Language Text", "is_empty_text": False, "subsections": []}, + ], + }, + ], + } + ], +} + +README_EMPTY_YAML = """\ +--- +--- +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +Language Text +""" + +EXPECTED_ERROR_README_EMPTY_YAML = ( + "The following issues were found for the README at `{path}`:\n-\tEmpty YAML markers are present in the README." +) + +README_NO_YAML = """\ +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +Language Text +""" + +EXPECTED_ERROR_README_NO_YAML = ( + "The following issues were found for the README at `{path}`:\n-\tNo YAML markers are present in the README." +) + +README_INCORRECT_YAML = """\ +--- +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. 
+### Supported Tasks and Leaderboards +### Languages +Language Text +""" + +EXPECTED_ERROR_README_INCORRECT_YAML = "The following issues were found for the README at `{path}`:\n-\tOnly the start of YAML tags present in the README." + +README_MISSING_TEXT = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +### Supported Tasks and Leaderboards +### Languages +Language Text +""" +EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some content in section `Dataset Summary` but it is empty.\n-\tExpected some text in section `Dataset Summary` but it is empty (text in subsections are ignored)." + + +README_NONE_SUBSECTION = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +""" +EXPECTED_ERROR_README_NONE_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tExpected some content in section `Dataset Card for My Dataset` but it is empty.\n-\tSection `Dataset Card for My Dataset` expected the following subsections: `Table of Contents`, `Dataset Description`. Found 'None'." + +README_MISSING_SUBSECTION = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Languages +Language Text +""" + +EXPECTED_ERROR_README_MISSING_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tSection `Dataset Description` is missing subsection: `Supported Tasks and Leaderboards`." + + +README_MISSING_CONTENT = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. 
+### Supported Tasks and Leaderboards +### Languages +""" + +EXPECTED_ERROR_README_MISSING_CONTENT = "The following issues were found for the README at `{path}`:\n-\tExpected some content in section `Languages` but it is empty." + +README_MISSING_FIRST_LEVEL = """\ +--- +languages: +- zh +- en +--- + +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +Language Text +""" +EXPECTED_ERROR_README_MISSING_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README." + +README_MULTIPLE_WRONG_FIRST_LEVEL = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +Language Text +# Dataset Card My Dataset +""" + +EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tThe README has several first-level headings: `Dataset Card for My Dataset`, `Dataset Card My Dataset`. Only one heading is expected. Skipping further validation for this README." + +README_WRONG_FIRST_LEVEL = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +Language Text +""" + +EXPECTED_ERROR_README_WRONG_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tNo first-level heading starting with `Dataset Card for` found in README. Skipping further validation for this README." 
+ +README_EMPTY = "" + +EXPECTED_ERROR_README_EMPTY = "The following issues were found for the README at `{path}`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README.\n-\tNo YAML markers are present in the README." + +README_MULTIPLE_SAME_HEADING_1 = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +Language Text +""" + +EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `{path}`:\n-\tMultiple sections with the same heading `Dataset Card for My Dataset` have been found. Please keep only one of these sections." + + +@pytest.mark.parametrize( + "readme_md, expected_dict", + [ + (README_CORRECT, CORRECT_DICT), + (README_CORRECT_FOUR_LEVEL, CORRECT_DICT_FOUR_LEVEL), + ], +) +def test_readme_from_string_correct(readme_md, expected_dict): + assert ReadMe.from_string(readme_md, example_yaml_structure).to_dict() == expected_dict + + +@pytest.mark.parametrize( + "readme_md, expected_error", + [ + (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML), + (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML), + (README_INCORRECT_YAML, EXPECTED_ERROR_README_INCORRECT_YAML), + (README_EMPTY, EXPECTED_ERROR_README_EMPTY), + (README_NONE_SUBSECTION, EXPECTED_ERROR_README_NONE_SUBSECTION), + (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL), + (README_MISSING_SUBSECTION, EXPECTED_ERROR_README_MISSING_SUBSECTION), + (README_MISSING_TEXT, EXPECTED_ERROR_README_MISSING_TEXT), + (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), + (README_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_WRONG_FIRST_LEVEL), + (README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL), + (README_MISSING_CONTENT, 
EXPECTED_ERROR_README_MISSING_CONTENT), + ], +) +def test_readme_from_string_errors(readme_md, expected_error): + with pytest.raises(ValueError, match=re.escape(expected_error.format(path="root"))): + ReadMe.from_string(readme_md, example_yaml_structure) + + +@pytest.mark.parametrize( + "readme_md, expected_dict", + [ + (README_CORRECT, CORRECT_DICT), + (README_CORRECT_FOUR_LEVEL, CORRECT_DICT_FOUR_LEVEL), + ], +) +def test_readme_from_readme_correct(readme_md, expected_dict): + with tempfile.TemporaryDirectory() as tmp_dir: + path = Path(tmp_dir) / "README.md" + with open(path, "w+") as readme_file: + readme_file.write(readme_md) + out = ReadMe.from_readme(path, example_yaml_structure).to_dict() + assert out["name"] == path + assert out["text"] == "" + assert out["is_empty_text"] + assert out["subsections"] == expected_dict["subsections"] + + +@pytest.mark.parametrize( + "readme_md, expected_error", + [ + (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML), + (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML), + (README_INCORRECT_YAML, EXPECTED_ERROR_README_INCORRECT_YAML), + (README_EMPTY, EXPECTED_ERROR_README_EMPTY), + (README_NONE_SUBSECTION, EXPECTED_ERROR_README_NONE_SUBSECTION), + (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL), + (README_MISSING_SUBSECTION, EXPECTED_ERROR_README_MISSING_SUBSECTION), + (README_MISSING_TEXT, EXPECTED_ERROR_README_MISSING_TEXT), + (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), + (README_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_WRONG_FIRST_LEVEL), + (README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL), + (README_MISSING_CONTENT, EXPECTED_ERROR_README_MISSING_CONTENT), + ], +) +def test_readme_from_readme_error(readme_md, expected_error): + with tempfile.TemporaryDirectory() as tmp_dir: + path = Path(tmp_dir) / "README.md" + with open(path, "w+") as readme_file: + readme_file.write(readme_md) + expected_error = 
expected_error.format(path=path) + with pytest.raises(ValueError, match=re.escape(expected_error)): + ReadMe.from_readme(path, example_yaml_structure)