-
Notifications
You must be signed in to change notification settings - Fork 3.1k
Add Validation For README #2121
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 6 commits
Commits
Show all changes
38 commits
Select commit
Hold shift + click to select a range
993acd6
Add Initial README parser
gchhablani c2f0699
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani 014c49d
Add basic validation checks
gchhablani 2040602
Minor fix
gchhablani 79c2ad0
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani 7a1654b
Changes from review
gchhablani c6d2345
Merge branch 'master' of https://github.com/huggingface/datasets into…
gchhablani 99d2222
Make main into a function in readme_parser
gchhablani 51c08ae
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani 4e54669
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani 1d788a9
Move README validator to scripts
gchhablani 2d13f70
Arrange README validation files
gchhablani a1c1f67
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani ee31e15
Update readme validator class
gchhablani ae60ce5
Add from_string tests
gchhablani 362e464
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani 057d0d9
Add PyTest tests
gchhablani cd4b69e
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani 35e08d8
Add tests for from_readme
gchhablani a3de91a
Add ReadMe validator script
gchhablani 8dd3feb
Fix style
gchhablani 87b0668
Remove print statement
gchhablani 1d49a4d
Add validator to CircleCI
gchhablani d9f0ac3
Fix style
gchhablani 414fc2e
Add YAML files to setup resources
gchhablani 0c3425a
Make validator executable
gchhablani 933fdf7
Add no subsections test
gchhablani cd895a1
Add incorrect YAML test
gchhablani a3bdb1f
Fix style
gchhablani 6e85d4a
Fix tests
gchhablani 10386e7
Fix tests
gchhablani b4ca9ca
Fix style
gchhablani a69c019
Fix escape character issue
gchhablani f12a105
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani d45ec9b
Add three-level heading validation limit
gchhablani 309a69e
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani cdcffe0
Add either text or subsection option
gchhablani ffdfcb6
Fix style
gchhablani File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,240 @@ | ||
| # class_mapping = { | ||
| # "Dataset Description": DatasetDescription, | ||
| # } | ||
|
|
||
| # key_mapping = { | ||
| # "Dataset Desription": 'dataset_desc' | ||
| # } | ||
|
|
||
| import pprint | ||
|
|
||
| # import json | ||
| import yaml | ||
|
|
||
|
|
||
# Expected README structure, expressed as a YAML document.
# Every node carries three keys:
#   name        - exact heading text ("" at the top level: the file itself)
#   allow_empty - whether the section body may be empty or filler text
#   subsections - list of child nodes, or null when children are not checked
yaml_struc = """
name: "" # Filename
allow_empty: false
subsections:
  - name: "Dataset Card for X"
    allow_empty: true
    subsections:
      - name: "Table of Contents"
        allow_empty: false
        subsections: null # meaning it should not be checked.
      - name: "Dataset Description"
        allow_empty: false
        subsections:
          - name: "Dataset Summary"
            allow_empty: false
            subsections: null
          - name: "Supported Tasks and Leaderboards"
            allow_empty: true
            subsections: null
          - name: "Languages"
            allow_empty: true
            subsections: null
      - name: "Dataset Structure"
        allow_empty: true
        subsections:
          - name: "Data Instances"
            allow_empty: false
            subsections: null
          - name: "Data Fields"
            allow_empty: false
            subsections: null
          - name: "Data Splits"
            allow_empty: false
            subsections: null
      - name: "Dataset Creation"
        allow_empty: true
        subsections:
          - name: "Curation Rationale"
            allow_empty: true
            subsections: null
          - name: "Source Data"
            allow_empty: true
            subsections:
              - name: "Initial Data Collection and Normalization"
                allow_empty: true
                subsections: null
              - name: "Who are the source X producers?"
                allow_empty: true
                subsections: null
          - name: "Annotations"
            allow_empty: true
            subsections:
              - name: "Annotation process"
                allow_empty: true
                subsections: null
              - name: "Who are the annotators?"
                allow_empty: true
                subsections: null
          - name: "Personal and Sensitive Information"
            allow_empty: true
            subsections: null
      - name: "Considerations for Using the Data"
        allow_empty: true
        subsections:
          - name: "Social Impact of Dataset"
            allow_empty: true
            subsections: null
          - name: "Discussion of Biases"
            allow_empty: true
            subsections: null
          - name: "Other Known Limitations"
            allow_empty: true
            subsections: null
      - name: "Additional Information"
        allow_empty: true
        subsections:
          - name: "Dataset Curators"
            allow_empty: true
            subsections: null
          - name: "Licensing Information"
            allow_empty: true
            subsections: null
          - name: "Citation Information"
            allow_empty: false
            subsections: null
          - name: "Contributions"
            allow_empty: false
            subsections: null
"""
|
|
||
# Placeholder bodies that do NOT count as real content when deciding
# whether a section is empty.
filler_text = [
    "[Needs More Information]",
    "[More Information Needed]",
    "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)",
]


class Section:
    """One markdown section: a heading, its free text, and nested subsections."""

    def __init__(self, name, level, lines=None):
        """
        Args:
            name: Heading text of this section (without the leading '#'s).
            level: Heading marker of this section, e.g. "##" ("" for the root).
            lines: Optional iterable of raw markdown lines to parse immediately.
        """
        self.name = name
        self.level = level
        self.text = ""
        self.is_empty = True  # flipped to False once non-filler text is found
        self.content = {}  # subsection name -> Section
        if lines is not None:
            self.parse(lines)

    def parse(self, lines):
        """Split ``lines`` into this section's own text and direct subsections.

        A line whose first token is ``self.level + "#"`` (outside a code
        fence) opens a new direct subsection; every other non-blank line is
        accumulated as text for whichever (sub)section is currently open.
        """
        current_sub_level = ""
        current_lines = []
        code_start = False
        for line in lines:
            # Skip whitespace-only lines. Using .strip() (not .strip(" \n"))
            # also covers lines of tabs, which previously reached
            # line.split()[0] below and raised IndexError.
            if not line.strip():
                continue
            elif line.strip(" \n")[:3] == "```":
                # Toggle code-fence state so '#' lines inside fences are not
                # mistaken for headings.
                # NOTE(review): the fence line itself is not kept in the text.
                code_start = not code_start
            elif line.split()[0] == self.level + "#" and not code_start:
                # Direct-subsection heading: close whatever was being gathered.
                if current_sub_level != "":
                    self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
                    current_lines = []
                else:
                    if current_lines != []:
                        self.text += "".join(current_lines).strip()
                        if self.text != "" and self.text not in filler_text:
                            self.is_empty = False
                        current_lines = []

                current_sub_level = " ".join(line.split()[1:]).strip(" \n")
            else:
                current_lines.append(line)
        else:
            # End of input: flush the still-open subsection or pending text.
            if current_sub_level != "":
                self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
            else:
                if current_lines != []:
                    self.text += "".join(current_lines).strip()
                    if self.text != "" and self.text not in filler_text:
                        self.is_empty = False

    def to_dict(self):
        """Return a plain-dict view of this section tree (handy for debugging)."""
        return {
            "name": self.name,
            "text": self.text,
            "is_empty": self.is_empty,
            "subsections": [value.to_dict() for value in self.content.values()],
        }
|
|
||
|
|
||
class ReadMe(Section):  # Level 0
    """A parsed dataset README file plus validation against a YAML structure."""

    def __init__(self, file_path):
        """Load and parse the README at ``file_path`` (stored as ``self.name``)."""
        super().__init__(name=file_path, level="")
        self.parse(file_path)

    def parse(self, file_path):
        """Read the file, skip the leading YAML tag block, and parse the rest.

        Raises:
            ValueError: if the file does not start with a tag block delimited
                by two ``---`` lines.
        """
        with open(self.name) as f:
            # Skip Tags: consume everything up to and including the second '---'.
            tag_count = 0
            for line in f:
                if line.strip(" \n") == "---":
                    tag_count += 1
                    if tag_count == 2:
                        break
            else:
                # for-else: the loop ran out of lines before two '---' markers.
                raise ValueError("The README doesn't contain proper tags. Please ensure you add the correct YAML tags.")
            super().parse(f)

    def _validate_section(self, section, structure):
        """Recursively compare ``section`` against the expected ``structure``.

        Returns:
            list[str]: human-readable error messages (empty when valid).
        """
        error_list = []
        # Text validation.
        if not structure["allow_empty"]:
            if section.is_empty:
                # (debug print of section.text removed)
                error_list.append(f"Expected some text for section '{section.name}'")

        if structure["subsections"] is not None:
            # If no subsections present
            if section.content == {}:
                values = [subsection["name"] for subsection in structure["subsections"]]
                # (stray doubled quote after the section name fixed)
                error_list.append(f"'{section.name}' expected the following subsections: {values}, found `None`.")
            else:
                # Each key validation
                structure_names = [subsection["name"] for subsection in structure["subsections"]]
                for idx, name in enumerate(structure_names):
                    if name not in section.content:
                        error_list.append(f"'{section.name}' is missing subsection: '{name}'.")
                    else:
                        error_list += self._validate_section(section.content[name], structure["subsections"][idx])

                for name in section.content:
                    if name not in structure_names:
                        error_list.append(
                            f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection."
                        )

        return error_list

    def __str__(self):
        return str(self.to_dict())

    def validate(self, yaml_struc):
        """Validate this README against ``yaml_struc`` (a YAML string).

        Returns:
            list[str]: all validation errors found (empty when valid).
        """
        error_list = []
        structure = yaml.safe_load(yaml_struc)
        num_first_level_keys = len(self.content.keys())
        if num_first_level_keys > 1:
            error_list.append(
                f"The README has found several first-level headings: {list(self.content.keys())}. Only one heading is expected."
            )
        elif num_first_level_keys < 1:
            error_list.append("The README has no first-level headings.")
        else:
            start_key = list(self.content.keys())[0]
            if start_key.startswith("Dataset Card for"):
                error_list += self._validate_section(self.content[start_key], structure["subsections"][0])
            else:
                error_list.append("No first-level heading starting with `Dataset Card for` found.")
        return error_list
|
|
||
|
|
||
if __name__ == "__main__":
    # Ad-hoc smoke test: validate a local dummy README against the expected
    # structure and surface every problem in a single exception.
    readme = ReadMe("./dummy_readme.md")
    error_list = readme.validate(yaml_struc)
    if error_list:
        bullet_lines = "\n".join("-\t" + err for err in error_list)
        raise ValueError("The following issues were found with the README\n" + bullet_lines)
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.