Merged
Commits (38)
- 993acd6: Add Initial README parser (gchhablani, Mar 26, 2021)
- c2f0699: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, Mar 29, 2021)
- 014c49d: Add basic validation checks (gchhablani, Mar 29, 2021)
- 2040602: Minor fix (gchhablani, Mar 29, 2021)
- 79c2ad0: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, Apr 1, 2021)
- 7a1654b: Changes from review (gchhablani, Apr 1, 2021)
- c6d2345: Merge branch 'master' of https://github.com/huggingface/datasets into… (gchhablani, Apr 8, 2021)
- 99d2222: Make main into a function in readme_parser (gchhablani, Apr 8, 2021)
- 51c08ae: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, Apr 23, 2021)
- 4e54669: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, Apr 24, 2021)
- 1d788a9: Move README validator to scripts (gchhablani, Apr 25, 2021)
- 2d13f70: Arrange README validation files (gchhablani, Apr 26, 2021)
- a1c1f67: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, Apr 27, 2021)
- ee31e15: Update readme validator class (gchhablani, Apr 27, 2021)
- ae60ce5: Add from_string tests (gchhablani, Apr 27, 2021)
- 362e464: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, Apr 28, 2021)
- 057d0d9: Add PyTest tests (gchhablani, Apr 28, 2021)
- cd4b69e: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, May 2, 2021)
- 35e08d8: Add tests for from_readme (gchhablani, May 2, 2021)
- a3de91a: Add ReadMe validator script (gchhablani, May 2, 2021)
- 8dd3feb: Fix style (gchhablani, May 2, 2021)
- 87b0668: Remove print statement (gchhablani, May 2, 2021)
- 1d49a4d: Add validator to CircleCI (gchhablani, May 2, 2021)
- d9f0ac3: Fix style (gchhablani, May 2, 2021)
- 414fc2e: Add YAML files to setup resources (gchhablani, May 2, 2021)
- 0c3425a: Make validator executable (gchhablani, May 2, 2021)
- 933fdf7: Add no subsections test (gchhablani, May 2, 2021)
- cd895a1: Add incorrect YAML test (gchhablani, May 2, 2021)
- a3bdb1f: Fix style (gchhablani, May 2, 2021)
- 6e85d4a: Fix tests (gchhablani, May 2, 2021)
- 10386e7: Fix tests (gchhablani, May 2, 2021)
- b4ca9ca: Fix style (gchhablani, May 2, 2021)
- a69c019: Fix escape character issue (gchhablani, May 4, 2021)
- f12a105: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, May 4, 2021)
- d45ec9b: Add three-level heading validation limit (gchhablani, May 7, 2021)
- 309a69e: Merge remote-tracking branch 'upstream/master' into add-readme-parser (gchhablani, May 7, 2021)
- cdcffe0: Add either text or subsection option (gchhablani, May 7, 2021)
- ffdfcb6: Fix style (gchhablani, May 7, 2021)
58 changes: 58 additions & 0 deletions scripts/datasets_readme_validator.py
@@ -0,0 +1,58 @@
#!/usr/bin/env python

""" This script will run in CI and make sure all new changes to datasets readme files have valid content present.
"""

from pathlib import Path
from subprocess import check_output
from typing import List

from datasets.utils.readme import validate_readme


def get_changed_files(repo_path: Path) -> List[Path]:
    # List the files that differ between HEAD and origin/master.
    diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path)
changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()]
return changed_files


if __name__ == "__main__":
import logging
from argparse import ArgumentParser

logging.basicConfig(level=logging.DEBUG)

ap = ArgumentParser()
ap.add_argument("--repo_path", type=Path, default=Path.cwd())
ap.add_argument("--check_all", action="store_true")
args = ap.parse_args()

repo_path: Path = args.repo_path
if args.check_all:
readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir()]
else:
changed_files = get_changed_files(repo_path)
readmes = [
f
for f in changed_files
if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets"
]

failed: List[Path] = []
for readme in sorted(readmes):
try:
            validate_readme(readme)
            logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'")
        except (ValueError, Warning) as e:
failed.append(readme)
logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}")
except Exception as e:
failed.append(readme)
logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}")

    if failed:
logging.info(f"❌ Failed on {len(failed)} files.")
exit(1)
else:
logging.info("All is well, keep up the good work 🤗!")
exit(0)
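Assuming the script is placed at scripts/datasets_readme_validator.py as in this PR, it would be run from the repository root with `python scripts/datasets_readme_validator.py` to validate only the dataset cards changed relative to origin/master, or with the `--check_all` flag to validate every README under datasets/.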
173 changes: 173 additions & 0 deletions src/datasets/utils/readme.py
@@ -0,0 +1,173 @@
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple

import yaml


BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils"
this_url = f"{BASE_REF_URL}/{__file__}"
logger = logging.getLogger(__name__)


def load_yaml_resource(resource: str) -> Tuple[Any, str]:
with open(resource) as f:
content = yaml.safe_load(f)
Member:

You may need to use pkg_resources here to load the yaml data.

See an example here:

def load_json_resource(resource: str) -> Tuple[Any, str]:
    content = pkg_resources.read_text(resources, resource)
    return json.loads(content), f"{BASE_REF_URL}/resources/{resource}"

Contributor Author:

Okay, I'll use pkg_resources, but can you please explain why it is needed?

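For context on why pkg_resources is needed: it resolves the resource relative to the installed package rather than the process working directory, so the YAML file is still found when `datasets` is installed as a wheel instead of being run from a source checkout. A rough sketch of what `load_yaml_resource` might look like under that suggestion (assuming `importlib.resources` is imported as `pkg_resources`, and a `resources` subpackage sits next to this module; these names are illustrative, not part of the diff):

import importlib.resources as pkg_resources  # assumed import alias, mirroring the snippet above

from . import resources  # assumed: the package directory holding readme_structure.yaml


def load_yaml_resource(resource: str) -> Tuple[Any, str]:
    # Read the packaged YAML file and parse it; the URL points at the
    # reference copy of the resource on GitHub.
    content = pkg_resources.read_text(resources, resource)
    return yaml.safe_load(content), f"{BASE_REF_URL}/resources/{resource}"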
return content, f"{BASE_REF_URL}/resources/{resource}"


readme_structure, known_readme_structure_url = load_yaml_resource("readme_structure.yaml")
filler_text = [
"[Needs More Information]",
"[More Information Needed]",
"(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)",
]


class Section:
Member:

In the future we may have subclasses of this to allow more fine-grained validation per section.

Contributor Author:

I think this class can be extended and we can keep a section-to-class mapping in the future. For now, this should be fine, right?

Member:

Yes, it's fine for now.
def __init__(self, name, level, lines=None):
self.name = name
self.level = level
self.text = ""
self.is_empty = True
self.content = {}
if lines is not None:
self.parse(lines)

    def parse(self, lines):
        current_sub_level = ""
        current_lines = []
        code_start = False  # tracks whether we are currently inside a fenced code block
        for line in lines:
            if line.strip(" \n") == "":
                continue
            elif line.strip(" \n")[:3] == "```":
                code_start = not code_start
            # A heading exactly one level deeper than this section starts a new
            # subsection (unless it appears inside a code block).
            elif line.split()[0] == self.level + "#" and not code_start:
if current_sub_level != "":
self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
current_lines = []
else:
if current_lines != []:
self.text += "".join(current_lines).strip()
if self.text != "" and self.text not in filler_text:
self.is_empty = False
current_lines = []

current_sub_level = " ".join(line.split()[1:]).strip(" \n")
else:
current_lines.append(line)
else:
if current_sub_level != "":
self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
else:
if current_lines != []:
self.text += "".join(current_lines).strip()
if self.text != "" and self.text not in filler_text:
self.is_empty = False

def to_dict(self):
return {
"name": self.name,
"text": self.text,
"is_empty": self.is_empty,
"subsections": [value.to_dict() for value in self.content.values()],
}

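The review thread above suggests a section-to-class mapping for finer-grained per-section validation. A hypothetical sketch of that idea (the registry, decorator, and subclass below are invented for illustration and are not part of this PR):

SECTION_CLASSES: Dict[str, type] = {}


def register_section(name: str):
    # Register a Section subclass to handle a specific heading name.
    def decorator(cls):
        SECTION_CLASSES[name] = cls
        return cls

    return decorator


@register_section("Citation Information")
class CitationSection(Section):
    def parse(self, lines):
        super().parse(lines)
        # Illustrative extra check: warn when the text does not look like BibTeX.
        if not self.is_empty and "@" not in self.text:
            logger.warning("'Citation Information' does not appear to contain a BibTeX entry.")


def make_section(name: str, level: str, lines=None) -> Section:
    # Fall back to the generic Section when no specialized class is registered.
    return SECTION_CLASSES.get(name, Section)(name, level, lines)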

class ReadMe(Section):  # Level 0
    def __init__(self, file_path):
        super().__init__(name=file_path, level="")
        # Counts the lines between the two '---' YAML delimiters; the initial
        # value of -2 offsets the two delimiter lines themselves.
        self.yaml_tags_line_count = -2
        self.parse(file_path)

def parse(self, file_path):
with open(self.name) as f:
# Skip Tags
tag_count = 0
for line in f:
self.yaml_tags_line_count += 1
if line.strip(" \n") == "---":
tag_count += 1

if tag_count == 2:
break
            else:
                # for/else: the loop ended without finding the closing '---',
                # so the YAML tags are malformed or missing.
                raise ValueError(
                    "The README doesn't contain proper tags. Please ensure you add the correct YAML tags."
                )
super().parse(f)

def _validate_section(self, section, structure):
# Text validation
error_list = []
if structure["allow_empty"] == False:
if section.is_empty:
error_list.append(f"Expected some text for section '{section.name}'")

if structure["subsections"] is not None:
# If no subsections present
if section.content == {}:
values = [subsection["name"] for subsection in structure["subsections"]]
error_list.append(f"'{section.name}' expected the following subsections: {values}, found `None`.")
else:
# Each key validation
structure_names = [subsection["name"] for subsection in structure["subsections"]]
for idx, name in enumerate(structure_names):
if name not in section.content:
error_list.append(f"'{section.name}' is missing subsection: '{name}'.")
else:
error_list += self._validate_section(section.content[name], structure["subsections"][idx])

for name in section.content:
if name not in structure_names:
error_list.append(
f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection."
)

return error_list

def __str__(self):
return str(self.to_dict())

def validate(self, readme_structure):
error_list = []
num_first_level_keys = len(self.content.keys())
        if num_first_level_keys > 1:
            error_list.append(
                f"The README has several first-level headings: {list(self.content.keys())}. Only one heading is expected."
            )
        elif num_first_level_keys < 1:
            error_list.append("The README has no first-level headings.")
        else:
start_key = list(self.content.keys())[0]
if start_key.startswith("Dataset Card for"):
error_list += self._validate_section(self.content[start_key], readme_structure["subsections"][0])
else:
error_list.append("No first-level heading starting with `Dataset Card for` found.")
return error_list


def validate_readme(file_path):
Member:

Could you write a few tests for this function? That would be appreciated.

Contributor Author:

Yes, I will add the tests.

readme = ReadMe(file_path)
if readme.yaml_tags_line_count == 0:
raise Warning("YAML Tags are not present in this README.")
    elif readme.yaml_tags_line_count == -1:
        raise Warning("Only the start of the YAML tags is present in this README.")
error_list = readme.validate(readme_structure)
    if error_list:
        errors = "\n".join(f"-\t{x}" for x in error_list)
        error_string = "The following issues were found with the README:\n" + errors
raise ValueError(error_string)


if __name__ == "__main__":
from argparse import ArgumentParser

    ap = ArgumentParser(description="Validate the content (excluding YAML tags) of a README.md file.")
ap.add_argument("readme_filepath")
args = ap.parse_args()
readme_filepath = Path(args.readme_filepath)
validate_readme(readme_filepath)
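The review thread above asks for tests of validate_readme. A minimal sketch of what such a test might look like (assuming pytest and its tmp_path fixture, and that the readme_structure.yaml resource loads correctly from an installed package, per the pkg_resources note above; all names here are illustrative):

import pytest

from datasets.utils.readme import validate_readme

# An intentionally incomplete card: valid YAML tags and a correct first-level
# heading, but required sections such as 'Dataset Description' are missing.
INCOMPLETE_README = """\
---
languages:
- en
---
# Dataset Card for Example

## Table of Contents
Some table of contents.
"""


def test_validate_readme_reports_missing_sections(tmp_path):
    readme_path = tmp_path / "README.md"
    readme_path.write_text(INCOMPLETE_README)
    # Validation should fail and name at least one missing subsection.
    with pytest.raises(ValueError, match="Dataset Description"):
        validate_readme(readme_path)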
Empty file.
87 changes: 87 additions & 0 deletions src/datasets/utils/resources/readme_structure.yaml
@@ -0,0 +1,87 @@
name: "" # Filename comes here
allow_empty: false
subsections:
- name: "Dataset Card for X" # First-level markdown heading
allow_empty: true
subsections:
- name: "Table of Contents"
allow_empty: false
        subsections: null # meaning subsections are not checked.
- name: "Dataset Description"
allow_empty: false
subsections:
- name: "Dataset Summary"
allow_empty: false
subsections: null
- name: "Supported Tasks and Leaderboards"
allow_empty: true
subsections: null
      - name: "Languages"
allow_empty: true
subsections: null
- name: "Dataset Structure"
allow_empty: true
subsections:
- name: "Data Instances"
allow_empty: false
subsections: null
- name: "Data Fields"
allow_empty: false
subsections: null
- name: "Data Splits"
allow_empty: false
subsections: null
- name: "Dataset Creation"
allow_empty: true
subsections:
- name: "Curation Rationale"
allow_empty: true
subsections: null
- name: "Source Data"
allow_empty: true
subsections:
- name: "Initial Data Collection and Normalization"
allow_empty: true
subsections: null
- name: "Who are the source language producers?"
allow_empty: true
subsections: null
- name: "Annotations"
allow_empty: true
subsections:
- name: "Annotation process"
allow_empty: true
subsections: null
- name: "Who are the annotators?"
allow_empty: true
subsections: null
- name: "Personal and Sensitive Information"
allow_empty: true
subsections: null
- name: "Considerations for Using the Data"
allow_empty: true
subsections:
- name: "Social Impact of Dataset"
allow_empty: true
subsections: null
- name: "Discussion of Biases"
allow_empty: true
subsections: null
- name: "Other Known Limitations"
allow_empty: true
subsections: null
- name: "Additional Information"
allow_empty: true
subsections:
- name: "Dataset Curators"
allow_empty: true
subsections: null
- name: "Licensing Information"
allow_empty: true
subsections: null
- name: "Citation Information"
allow_empty: false
subsections: null
- name: "Contributions"
allow_empty: false
subsections: null
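A note on the semantics, as implemented by _validate_section above: every named section must be present in the card regardless of allow_empty (a missing section is always reported as an error); allow_empty only controls whether the section's text may be blank, and subsections: null means validation does not recurse into that section's children.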