From aa637feb7deef389e1a47c84bd54d1a5f99e2da4 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 1 Jun 2021 15:01:08 +0530 Subject: [PATCH 01/15] Update tagset validator to allow dict --- src/datasets/utils/metadata.py | 45 ++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 8c4c94a5e48..9bd69042cf8 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -3,7 +3,7 @@ from collections import Counter from dataclasses import dataclass, fields from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union # loading package files: https://stackoverflow.com/a/20885799 @@ -66,7 +66,7 @@ def yaml_block_from_readme(path: Path) -> Optional[str]: def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]: - """ "Loads a dataset's metadata from the dataset card (REAMDE.md), as a Python dict""" + """ Loads a dataset's metadata from the dataset card (REAMDE.md), as a Python dict""" yaml_block = yaml_block_from_readme(path=path) if yaml_block is None: return None @@ -77,11 +77,16 @@ def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]: ValidatorOutput = Tuple[List[str], Optional[str]] -def tagset_validator(values: List[str], reference_values: List[str], name: str, url: str) -> ValidatorOutput: - invalid_values = [v for v in values if v not in reference_values] +def tagset_validator(items: Union[List[str],Dict[str,List[str]]], reference_values: List[str], name: str, url: str) -> ValidatorOutput: + if isinstance(items, list): + invalid_values = [v for v in items if v not in reference_values] + else: + invalid_values = [] + for config_name, values in items.items(): + invalid_values += [v for v in values if v not in reference_values] if len(invalid_values) > 0: return [], f"{invalid_values} are not registered tags for '{name}', reference at {url}" - return values, None + return items, None def escape_validation_for_predicate( @@ -120,18 +125,19 @@ def validate_metadata_type(metadata_dict: dict): @dataclass class DatasetMetadata: - annotations_creators: List[str] - language_creators: List[str] - languages: List[str] - licenses: List[str] - multilinguality: List[str] - size_categories: List[str] - source_datasets: List[str] - task_categories: List[str] - task_ids: List[str] + annotations_creators: Union[List[str], Dict[str, List[str]]] + language_creators: Union[List[str], Dict[str, List[str]]] + languages: Union[List[str], Dict[str, List[str]]] + licenses: Union[List[str], Dict[str, List[str]]] + multilinguality: Union[List[str], Dict[str, List[str]]] + pretty_names: Union[str, Dict[str,str]] + size_categories: Union[List[str], Dict[str, List[str]]] + source_datasets: Union[List[str], Dict[str, List[str]]] + task_categories: Union[List[str], Dict[str, List[str]]] + task_ids: Union[List[str], Dict[str, List[str]]] paperswithcode_id: Optional[str] = None - def __post_init__(self): + def validate(self): validate_metadata_type(metadata_dict=vars(self)) self.annotations_creators, annotations_creators_errors = self.validate_annotations_creators( @@ -168,7 +174,7 @@ def __post_init__(self): exception_msg_dict[field] = errs if len(exception_msg_dict) > 0: raise TypeError( - "Could not validate the metada, found the following errors:\n" + "Could not validate the metadata, found the following errors:\n" + "\n".join(f"* field 
'{fieldname}':\n\t{err}" for fieldname, err in exception_msg_dict.items()) ) @@ -206,10 +212,6 @@ def from_yaml_string(cls, string: str) -> "DatasetMetadata": :obj:`TypeError`: If the dataset's metadata is invalid """ metada_dict = yaml.load(string, Loader=NoDuplicateSafeLoader) or dict() - # flatten the metadata of each config - for key in metada_dict: - if isinstance(metada_dict[key], dict): - metada_dict[key] = list(set(sum(metada_dict[key].values(), []))) return cls(**metada_dict) @staticmethod @@ -308,4 +310,5 @@ def validate_paperswithcode_id_errors(paperswithcode_id: Optional[str]) -> Valid args = ap.parse_args() readme_filepath = Path(args.readme_filepath) - DatasetMetadata.from_readme(readme_filepath) + dataset_metadata = DatasetMetadata.from_readme(readme_filepath) + dataset_metadata.validate() From 72077f615c554688ec3dd8e0261f66dbd13bb4c7 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 1 Jun 2021 15:18:39 +0530 Subject: [PATCH 02/15] Add escape validation mask for predicate_fn --- src/datasets/utils/metadata.py | 54 ++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 9bd69042cf8..b3d58a379dc 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -77,30 +77,52 @@ def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]: ValidatorOutput = Tuple[List[str], Optional[str]] -def tagset_validator(items: Union[List[str],Dict[str,List[str]]], reference_values: List[str], name: str, url: str) -> ValidatorOutput: +def tagset_validator(items: Union[List[str],Dict[str,List[str]]], reference_values: List[str], name: str, url: str, escape_validation_mask:Union[List[str],Dict[str,List[str]]] =None) -> ValidatorOutput: if isinstance(items, list): - invalid_values = [v for v in items if v not in reference_values] + if escape_validation_mask is not None: + invalid_values = [v for idx,v in enumerate(items) if v not in reference_values and escape_validation_mask[idx] is False] + else: + invalid_values = [v for v in items if v not in reference_values] + else: invalid_values = [] - for config_name, values in items.items(): - invalid_values += [v for v in values if v not in reference_values] + if escape_validation_mask is not None: + for config_name, values in items.items(): + invalid_values += [v for idx, v in enumerate(values) if v not in reference_values and escape_validation_mask[config_name][idx] is False] + else: + for config_name, values in items.items(): + invalid_values += [v for v in values if v not in reference_values] + if len(invalid_values) > 0: return [], f"{invalid_values} are not registered tags for '{name}', reference at {url}" return items, None -def escape_validation_for_predicate( - values: List[Any], predicate_fn: Callable[[Any], bool] -) -> Tuple[List[Any], List[Any]]: - trues, falses = list(), list() - for v in values: - if predicate_fn(v): - trues.append(v) - else: - falses.append(v) - if len(trues) > 0: - logger.warning(f"The following values will escape validation: {trues}") - return trues, falses +def escape_validation_mask_for_predicate( + items: Union[List[Any],Dict[str, List[Any]]], predicate_fn: Callable[[Any], bool] +) -> Union[List[Any],Dict[str, List[Any]]]: + values_to_ignore = [] + if isinstance(items, list): + mask = [] + for v in items: + if predicate_fn(v): + mask.append(True) + values_to_ignore.append(v) + else: + mask.append(False) + if isinstance(items, dict): + mask = {} + for k,values 
in items.items(): + mask[k] = [] + for v in values: + if predicate_fn(v): + mask[k].append(True) + values_to_ignore.append(v) + else: + mask[k].append(False) + if len(values_to_ignore) > 0: + logger.warning(f"The following values will escape validation: {values_to_ignore}") + return mask def validate_metadata_type(metadata_dict: dict): From 9b68d934c5d5ae14d15a99005fae8908cee281b1 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 1 Jun 2021 17:42:56 +0530 Subject: [PATCH 03/15] Add recursive metadata type checking --- src/datasets/utils/metadata.py | 76 ++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 17 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index b3d58a379dc..99b6be053d7 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -3,7 +3,8 @@ from collections import Counter from dataclasses import dataclass, fields from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Type, Any, Callable, Dict, List, Optional, Tuple, Union +import typing # loading package files: https://stackoverflow.com/a/20885799 @@ -125,24 +126,65 @@ def escape_validation_mask_for_predicate( return mask +def validate_type(value: Any, expected_type: Type): + error_string = '' + if expected_type == str: + if not isinstance(value, str) or len(value)==0: + return f'Expected `{str}` with length > 0. Found value: `{value}` of type: `{type(value)}`, with length: {len(value)}.\n' + else: + return error_string + # Add more `elif` statements if primitive type checking is needed + else: + expected_type_origin = expected_type.__origin__ + expected_type_args = expected_type.__args__ + + if expected_type_origin == Union: + for type_arg in expected_type_args: + temp_error_string = validate_type(value, type_arg) + if temp_error_string == '': # at least one type is successfully validated + return temp_error_string + else: + if error_string == '': + error_string = '('+temp_error_string+')' + else: + error_string+= '\nOR\n' + '('+temp_error_string+')' + + else: + # Assuming `List`/`Dict`/`Tuple` + if not isinstance(value, expected_type_origin) or len(value)==0: + return f'Expected `{expected_type_origin}` with length > 0. 
Found value of type: `{type(value)}`, with length: {len(value)}.\n' + + if expected_type_origin == Dict: + key_type, value_type = expected_type_args + key_error_string = '' + value_error_string = '' + for k, v in value.items(): + key_error_string+=validate_type(k, key_type) + value_error_string+=validate_type(v, value_type) + if key_error_string!='' or value_error_string!='': + return f'Type errors with keys:\n {key_error_string} and values:\n {value_error_string}' + + else: # `List`/`Tuple` + value_type = expected_type_args[0] + value_error_string = '' + for v in value: + value_error_string+=validate_type(v, value_type) + if value_error_string!='': + return f'Type errors with values:\n {value_error_string}' + + return error_string + + def validate_metadata_type(metadata_dict: dict): fields_types = {field.name: field.type for field in fields(DatasetMetadata)} - list_typing_errors = { - name: value - for name, value in metadata_dict.items() - if fields_types.get(name, List[str]) == List[str] - and (not isinstance(value, list) or len(value) == 0 or not isinstance(value[0], str)) - } - if len(list_typing_errors) > 0: - raise TypeError(f"Found fields that are not non-empty list of strings: {list_typing_errors}") - - other_typing_errors = { - name: value - for name, value in metadata_dict.items() - if fields_types.get(name, List[str]) != List[str] and isinstance(value, list) - } - if len(other_typing_errors) > 0: - raise TypeError(f"Found fields that are lists instead of single strings: {other_typing_errors}") + + typing_errors = {} + for field_name, field_type in fields_types.items(): + field_type_error = validate_type(metadata_dict[field_name], field_type) + if field_type_error!='': + typing_errors[field_name] = field_type_error + if len(typing_errors) > 0: + raise TypeError(f"Found fields that are lists instead of single strings: {typing_errors}") @dataclass From edaa1bd0cc2144d43bd494ad06479d62e15e1d13 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 1 Jun 2021 17:44:16 +0530 Subject: [PATCH 04/15] Update typing errors validation --- src/datasets/utils/metadata.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 99b6be053d7..2e400abe05c 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -162,7 +162,7 @@ def validate_type(value: Any, expected_type: Type): key_error_string+=validate_type(k, key_type) value_error_string+=validate_type(v, value_type) if key_error_string!='' or value_error_string!='': - return f'Type errors with keys:\n {key_error_string} and values:\n {value_error_string}' + return f'Typing errors with keys:\n {key_error_string} and values:\n {value_error_string}' else: # `List`/`Tuple` value_type = expected_type_args[0] @@ -170,7 +170,7 @@ def validate_type(value: Any, expected_type: Type): for v in value: value_error_string+=validate_type(v, value_type) if value_error_string!='': - return f'Type errors with values:\n {value_error_string}' + return f'Typing errors with values:\n {value_error_string}' return error_string @@ -184,7 +184,7 @@ def validate_metadata_type(metadata_dict: dict): if field_type_error!='': typing_errors[field_name] = field_type_error if len(typing_errors) > 0: - raise TypeError(f"Found fields that are lists instead of single strings: {typing_errors}") + raise TypeError(f"The following typing errors are found: {typing_errors}") @dataclass From 3f74e591d577510b1997497694b6abe3c5302b03 Mon Sep 17 00:00:00 2001 From: Gunjan 
Chhablani Date: Tue, 1 Jun 2021 17:48:51 +0530 Subject: [PATCH 05/15] Update input types in validation methods --- src/datasets/utils/metadata.py | 44 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 2e400abe05c..ce61959205c 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -260,7 +260,7 @@ def from_readme(cls, path: Path) -> "DatasetMetadata": if yaml_string is not None: return cls.from_yaml_string(yaml_string) else: - raise TypeError(f"did not find a yaml block in '{path}'") + raise TypeError(f"Unable to find a yaml block in '{path}'") @classmethod def from_yaml_string(cls, string: str) -> "DatasetMetadata": @@ -279,17 +279,17 @@ def from_yaml_string(cls, string: str) -> "DatasetMetadata": return cls(**metada_dict) @staticmethod - def validate_annotations_creators(annotations_creators: List[str]) -> ValidatorOutput: + def validate_annotations_creators(annotations_creators: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: return tagset_validator( annotations_creators, known_creators["annotations"], "annotations_creators", known_creators_url ) @staticmethod - def validate_language_creators(language_creators: List[str]) -> ValidatorOutput: + def validate_language_creators(language_creators: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: return tagset_validator(language_creators, known_creators["language"], "language_creators", known_creators_url) @staticmethod - def validate_language_codes(languages: List[str]) -> ValidatorOutput: + def validate_language_codes(languages: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: return tagset_validator( values=languages, reference_values=known_language_codes.keys(), @@ -298,47 +298,47 @@ def validate_language_codes(languages: List[str]) -> ValidatorOutput: ) @staticmethod - def validate_licences(licenses: List[str]) -> ValidatorOutput: - others, to_validate = escape_validation_for_predicate( + def validate_licences(licenses: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: + escape_validation_mask = escape_validation_mask_for_predicate( licenses, lambda e: "-other-" in e or e.startswith("other-") ) - validated, error = tagset_validator(to_validate, list(known_licenses.keys()), "licenses", known_licenses_url) - return [*validated, *others], error + validated, error = tagset_validator(licenses, list(known_licenses.keys()), "licenses", known_licenses_url, escape_validation_mask) + return validated, error @staticmethod - def validate_task_categories(task_categories: List[str]) -> ValidatorOutput: + def validate_task_categories(task_categories: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. 
known_set = list(known_task_ids.keys()) - others, to_validate = escape_validation_for_predicate(task_categories, lambda e: e.startswith("other-")) - validated, error = tagset_validator(to_validate, known_set, "task_categories", known_task_ids_url) - return [*validated, *others], error + escape_validation_mask = escape_validation_mask_for_predicate(task_categories, lambda e: e.startswith("other-")) + validated, error = tagset_validator(task_categories, known_set, "task_categories", known_task_ids_url, escape_validation_mask) + return validated, error @staticmethod - def validate_task_ids(task_ids: List[str]) -> ValidatorOutput: + def validate_task_ids(task_ids: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]] - others, to_validate = escape_validation_for_predicate( + escape_validation_mask = escape_validation_mask_for_predicate( task_ids, lambda e: "-other-" in e or e.startswith("other-") ) - validated, error = tagset_validator(to_validate, known_set, "task_ids", known_task_ids_url) - return [*validated, *others], error + validated, error = tagset_validator(task_ids, known_set, "task_ids", known_task_ids_url, escape_validation_mask) + return validated, error @staticmethod - def validate_mulitlinguality(multilinguality: List[str]) -> ValidatorOutput: - others, to_validate = escape_validation_for_predicate(multilinguality, lambda e: e.startswith("other-")) + def validate_mulitlinguality(multilinguality: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: + escape_validation_mask = escape_validation_mask_for_predicate(multilinguality, lambda e: e.startswith("other-")) validated, error = tagset_validator( - to_validate, list(known_multilingualities.keys()), "multilinguality", known_size_categories_url + multilinguality, list(known_multilingualities.keys()), "multilinguality", known_size_categories_url, escape_validation_mask ) - return [*validated, *others], error + return validated, error @staticmethod - def validate_size_catgeories(size_cats: List[str]) -> ValidatorOutput: + def validate_size_catgeories(size_cats: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: return tagset_validator(size_cats, known_size_categories, "size_categories", known_size_categories_url) @staticmethod - def validate_source_datasets(sources: List[str]) -> ValidatorOutput: + def validate_source_datasets(sources: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: invalid_values = [] for src in sources: is_ok = src in ["original", "extended"] or src.startswith("extended|") From a42366a8a7c8d560b32d1f56926ce50b352d292d Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 1 Jun 2021 18:17:53 +0530 Subject: [PATCH 06/15] Add method to get metadata by config --- src/datasets/utils/metadata.py | 37 +++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index ce61959205c..f62387174ae 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -3,6 +3,7 @@ from collections import Counter from dataclasses import dataclass, fields from pathlib import Path +from types import prepare_class from typing import Type, Any, Callable, Dict, List, Optional, Tuple, Union import typing @@ -76,7 
+77,7 @@ def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]: ValidatorOutput = Tuple[List[str], Optional[str]] - +MetadataOutput = Dict[str, Union[List[str], Dict[str, List[str]]]] def tagset_validator(items: Union[List[str],Dict[str,List[str]]], reference_values: List[str], name: str, url: str, escape_validation_mask:Union[List[str],Dict[str,List[str]]] =None) -> ValidatorOutput: if isinstance(items, list): @@ -365,6 +366,40 @@ def validate_paperswithcode_id_errors(paperswithcode_id: Optional[str]) -> Valid else: return paperswithcode_id, None + @staticmethod + def validate_pretty_names(pretty_names: Union[str, Dict[str,str]]): + if isinstance(pretty_names, str): + if len(pretty_names) == 0: + return None, f"The pretty name must have a length greater than 0 but got an empty string." + else: + error_string = "" + for key, value in pretty_names.items(): + if len(value) == 0: + error_string+=f"The pretty name must have a length greater than 0 but got an empty string for config: {key}.\n" + + if error_string == "": + return None, error_string + else: + return pretty_names, None + + def get_metadata_by_config_name(self, name: str) -> MetadataOutput: + metadata_dict = self.asdict() + config_name_hits = 0 + result_dict = {} + for tag_key, tag_value in metadata_dict.items(): + if isinstance(tag_value, str) or isinstance(tag_value, list): + result_dict[tag_key] = tag_value + elif isinstance(tag_value, dict): + for config_name, value in tag_value.items(): + if config_name == name: + result_dict[tag_key] = value + config_name_hits+=1 + + if config_name_hits==0: + logger.warning("No matching config names found in the metadata, using the common values to create metadata.") + + return DatasetMetadata(**result_dict) + if __name__ == "__main__": from argparse import ArgumentParser From c06ebd4dbaf6ca74690ea2d2d5e4fc65608c79d8 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 1 Jun 2021 18:32:11 +0530 Subject: [PATCH 07/15] Fix styling/typing issues --- src/datasets/utils/metadata.py | 128 ++++++++++++++++++++------------- 1 file changed, 77 insertions(+), 51 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index f62387174ae..a0ec5f0f67b 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -3,9 +3,7 @@ from collections import Counter from dataclasses import dataclass, fields from pathlib import Path -from types import prepare_class -from typing import Type, Any, Callable, Dict, List, Optional, Tuple, Union -import typing +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union # loading package files: https://stackoverflow.com/a/20885799 @@ -68,7 +66,7 @@ def yaml_block_from_readme(path: Path) -> Optional[str]: def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]: - """ Loads a dataset's metadata from the dataset card (REAMDE.md), as a Python dict""" + """Loads a dataset's metadata from the dataset card (REAMDE.md), as a Python dict""" yaml_block = yaml_block_from_readme(path=path) if yaml_block is None: return None @@ -77,12 +75,20 @@ def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]: ValidatorOutput = Tuple[List[str], Optional[str]] -MetadataOutput = Dict[str, Union[List[str], Dict[str, List[str]]]] -def tagset_validator(items: Union[List[str],Dict[str,List[str]]], reference_values: List[str], name: str, url: str, escape_validation_mask:Union[List[str],Dict[str,List[str]]] =None) -> ValidatorOutput: + +def tagset_validator( + items: 
Union[List[str], Dict[str, List[str]]], + reference_values: List[str], + name: str, + url: str, + escape_validation_mask: Union[List[str], Dict[str, List[str]]] = None, +) -> ValidatorOutput: if isinstance(items, list): if escape_validation_mask is not None: - invalid_values = [v for idx,v in enumerate(items) if v not in reference_values and escape_validation_mask[idx] is False] + invalid_values = [ + v for idx, v in enumerate(items) if v not in reference_values and escape_validation_mask[idx] is False + ] else: invalid_values = [v for v in items if v not in reference_values] @@ -90,7 +96,11 @@ def tagset_validator(items: Union[List[str],Dict[str,List[str]]], reference_valu invalid_values = [] if escape_validation_mask is not None: for config_name, values in items.items(): - invalid_values += [v for idx, v in enumerate(values) if v not in reference_values and escape_validation_mask[config_name][idx] is False] + invalid_values += [ + v + for idx, v in enumerate(values) + if v not in reference_values and escape_validation_mask[config_name][idx] is False + ] else: for config_name, values in items.items(): invalid_values += [v for v in values if v not in reference_values] @@ -101,8 +111,8 @@ def tagset_validator(items: Union[List[str],Dict[str,List[str]]], reference_valu def escape_validation_mask_for_predicate( - items: Union[List[Any],Dict[str, List[Any]]], predicate_fn: Callable[[Any], bool] -) -> Union[List[Any],Dict[str, List[Any]]]: + items: Union[List[Any], Dict[str, List[Any]]], predicate_fn: Callable[[Any], bool] +) -> Union[List[Any], Dict[str, List[Any]]]: values_to_ignore = [] if isinstance(items, list): mask = [] @@ -114,9 +124,9 @@ def escape_validation_mask_for_predicate( mask.append(False) if isinstance(items, dict): mask = {} - for k,values in items.items(): + for k, values in items.items(): mask[k] = [] - for v in values: + for v in values: if predicate_fn(v): mask[k].append(True) values_to_ignore.append(v) @@ -128,10 +138,10 @@ def escape_validation_mask_for_predicate( def validate_type(value: Any, expected_type: Type): - error_string = '' + error_string = "" if expected_type == str: - if not isinstance(value, str) or len(value)==0: - return f'Expected `{str}` with length > 0. Found value: `{value}` of type: `{type(value)}`, with length: {len(value)}.\n' + if not isinstance(value, str) or len(value) == 0: + return f"Expected `{str}` with length > 0. Found value: `{value}` of type: `{type(value)}`, with length: {len(value)}.\n" else: return error_string # Add more `elif` statements if primitive type checking is needed @@ -142,37 +152,37 @@ def validate_type(value: Any, expected_type: Type): if expected_type_origin == Union: for type_arg in expected_type_args: temp_error_string = validate_type(value, type_arg) - if temp_error_string == '': # at least one type is successfully validated + if temp_error_string == "": # at least one type is successfully validated return temp_error_string else: - if error_string == '': - error_string = '('+temp_error_string+')' + if error_string == "": + error_string = "(" + temp_error_string + ")" else: - error_string+= '\nOR\n' + '('+temp_error_string+')' - + error_string += "\nOR\n" + "(" + temp_error_string + ")" + else: # Assuming `List`/`Dict`/`Tuple` - if not isinstance(value, expected_type_origin) or len(value)==0: - return f'Expected `{expected_type_origin}` with length > 0. 
Found value of type: `{type(value)}`, with length: {len(value)}.\n' + if not isinstance(value, expected_type_origin) or len(value) == 0: + return f"Expected `{expected_type_origin}` with length > 0. Found value of type: `{type(value)}`, with length: {len(value)}.\n" if expected_type_origin == Dict: key_type, value_type = expected_type_args - key_error_string = '' - value_error_string = '' + key_error_string = "" + value_error_string = "" for k, v in value.items(): - key_error_string+=validate_type(k, key_type) - value_error_string+=validate_type(v, value_type) - if key_error_string!='' or value_error_string!='': - return f'Typing errors with keys:\n {key_error_string} and values:\n {value_error_string}' - - else: # `List`/`Tuple` + key_error_string += validate_type(k, key_type) + value_error_string += validate_type(v, value_type) + if key_error_string != "" or value_error_string != "": + return f"Typing errors with keys:\n {key_error_string} and values:\n {value_error_string}" + + else: # `List`/`Tuple` value_type = expected_type_args[0] - value_error_string = '' + value_error_string = "" for v in value: - value_error_string+=validate_type(v, value_type) - if value_error_string!='': - return f'Typing errors with values:\n {value_error_string}' - + value_error_string += validate_type(v, value_type) + if value_error_string != "": + return f"Typing errors with values:\n {value_error_string}" + return error_string @@ -182,7 +192,7 @@ def validate_metadata_type(metadata_dict: dict): typing_errors = {} for field_name, field_type in fields_types.items(): field_type_error = validate_type(metadata_dict[field_name], field_type) - if field_type_error!='': + if field_type_error != "": typing_errors[field_name] = field_type_error if len(typing_errors) > 0: raise TypeError(f"The following typing errors are found: {typing_errors}") @@ -195,7 +205,7 @@ class DatasetMetadata: languages: Union[List[str], Dict[str, List[str]]] licenses: Union[List[str], Dict[str, List[str]]] multilinguality: Union[List[str], Dict[str, List[str]]] - pretty_names: Union[str, Dict[str,str]] + pretty_names: Union[str, Dict[str, str]] size_categories: Union[List[str], Dict[str, List[str]]] source_datasets: Union[List[str], Dict[str, List[str]]] task_categories: Union[List[str], Dict[str, List[str]]] @@ -303,7 +313,9 @@ def validate_licences(licenses: Union[List[str], Dict[str, List[str]]]) -> Valid escape_validation_mask = escape_validation_mask_for_predicate( licenses, lambda e: "-other-" in e or e.startswith("other-") ) - validated, error = tagset_validator(licenses, list(known_licenses.keys()), "licenses", known_licenses_url, escape_validation_mask) + validated, error = tagset_validator( + licenses, list(known_licenses.keys()), "licenses", known_licenses_url, escape_validation_mask + ) return validated, error @staticmethod @@ -311,8 +323,12 @@ def validate_task_categories(task_categories: Union[List[str], Dict[str, List[st # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. 
known_set = list(known_task_ids.keys()) - escape_validation_mask = escape_validation_mask_for_predicate(task_categories, lambda e: e.startswith("other-")) - validated, error = tagset_validator(task_categories, known_set, "task_categories", known_task_ids_url, escape_validation_mask) + escape_validation_mask = escape_validation_mask_for_predicate( + task_categories, lambda e: e.startswith("other-") + ) + validated, error = tagset_validator( + task_categories, known_set, "task_categories", known_task_ids_url, escape_validation_mask + ) return validated, error @staticmethod @@ -323,14 +339,22 @@ def validate_task_ids(task_ids: Union[List[str], Dict[str, List[str]]]) -> Valid escape_validation_mask = escape_validation_mask_for_predicate( task_ids, lambda e: "-other-" in e or e.startswith("other-") ) - validated, error = tagset_validator(task_ids, known_set, "task_ids", known_task_ids_url, escape_validation_mask) + validated, error = tagset_validator( + task_ids, known_set, "task_ids", known_task_ids_url, escape_validation_mask + ) return validated, error @staticmethod def validate_mulitlinguality(multilinguality: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: - escape_validation_mask = escape_validation_mask_for_predicate(multilinguality, lambda e: e.startswith("other-")) + escape_validation_mask = escape_validation_mask_for_predicate( + multilinguality, lambda e: e.startswith("other-") + ) validated, error = tagset_validator( - multilinguality, list(known_multilingualities.keys()), "multilinguality", known_size_categories_url, escape_validation_mask + multilinguality, + list(known_multilingualities.keys()), + "multilinguality", + known_size_categories_url, + escape_validation_mask, ) return validated, error @@ -367,7 +391,7 @@ def validate_paperswithcode_id_errors(paperswithcode_id: Optional[str]) -> Valid return paperswithcode_id, None @staticmethod - def validate_pretty_names(pretty_names: Union[str, Dict[str,str]]): + def validate_pretty_names(pretty_names: Union[str, Dict[str, str]]): if isinstance(pretty_names, str): if len(pretty_names) == 0: return None, f"The pretty name must have a length greater than 0 but got an empty string." @@ -375,14 +399,14 @@ def validate_pretty_names(pretty_names: Union[str, Dict[str,str]]): error_string = "" for key, value in pretty_names.items(): if len(value) == 0: - error_string+=f"The pretty name must have a length greater than 0 but got an empty string for config: {key}.\n" - + error_string += f"The pretty name must have a length greater than 0 but got an empty string for config: {key}.\n" + if error_string == "": return None, error_string else: return pretty_names, None - def get_metadata_by_config_name(self, name: str) -> MetadataOutput: + def get_metadata_by_config_name(self, name: str) -> DatasetMetadata: metadata_dict = self.asdict() config_name_hits = 0 result_dict = {} @@ -393,11 +417,13 @@ def get_metadata_by_config_name(self, name: str) -> MetadataOutput: for config_name, value in tag_value.items(): if config_name == name: result_dict[tag_key] = value - config_name_hits+=1 + config_name_hits += 1 + + if config_name_hits == 0: + logger.warning( + "No matching config names found in the metadata, using the common values to create metadata." 
+ ) - if config_name_hits==0: - logger.warning("No matching config names found in the metadata, using the common values to create metadata.") - return DatasetMetadata(**result_dict) From d40e56a9d86f15d27984af8c7cd6b6915198588b Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 1 Jun 2021 19:04:37 +0530 Subject: [PATCH 08/15] Simplify predicate_fn in validation --- src/datasets/utils/metadata.py | 63 +++++++--------------------------- 1 file changed, 13 insertions(+), 50 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index a0ec5f0f67b..73a9a50c928 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -82,24 +82,22 @@ def tagset_validator( reference_values: List[str], name: str, url: str, - escape_validation_mask: Union[List[str], Dict[str, List[str]]] = None, + escape_validation_predicate_fn: Optional[Callable[[Any], bool]] = None, ) -> ValidatorOutput: if isinstance(items, list): - if escape_validation_mask is not None: + if escape_validation_predicate_fn is not None: invalid_values = [ - v for idx, v in enumerate(items) if v not in reference_values and escape_validation_mask[idx] is False + v for v in items if v not in reference_values and escape_validation_predicate_fn(v) is False ] else: invalid_values = [v for v in items if v not in reference_values] else: invalid_values = [] - if escape_validation_mask is not None: + if escape_validation_predicate_fn is not None: for config_name, values in items.items(): invalid_values += [ - v - for idx, v in enumerate(values) - if v not in reference_values and escape_validation_mask[config_name][idx] is False + v for v in values if v not in reference_values and escape_validation_predicate_fn(v) is False ] else: for config_name, values in items.items(): @@ -110,33 +108,6 @@ def tagset_validator( return items, None -def escape_validation_mask_for_predicate( - items: Union[List[Any], Dict[str, List[Any]]], predicate_fn: Callable[[Any], bool] -) -> Union[List[Any], Dict[str, List[Any]]]: - values_to_ignore = [] - if isinstance(items, list): - mask = [] - for v in items: - if predicate_fn(v): - mask.append(True) - values_to_ignore.append(v) - else: - mask.append(False) - if isinstance(items, dict): - mask = {} - for k, values in items.items(): - mask[k] = [] - for v in values: - if predicate_fn(v): - mask[k].append(True) - values_to_ignore.append(v) - else: - mask[k].append(False) - if len(values_to_ignore) > 0: - logger.warning(f"The following values will escape validation: {values_to_ignore}") - return mask - - def validate_type(value: Any, expected_type: Type): error_string = "" if expected_type == str: @@ -310,11 +281,9 @@ def validate_language_codes(languages: Union[List[str], Dict[str, List[str]]]) - @staticmethod def validate_licences(licenses: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: - escape_validation_mask = escape_validation_mask_for_predicate( - licenses, lambda e: "-other-" in e or e.startswith("other-") - ) + escape_validation_predicate_fn = lambda e: "-other-" in e or e.startswith("other-") validated, error = tagset_validator( - licenses, list(known_licenses.keys()), "licenses", known_licenses_url, escape_validation_mask + licenses, list(known_licenses.keys()), "licenses", known_licenses_url, escape_validation_predicate_fn ) return validated, error @@ -323,11 +292,9 @@ def validate_task_categories(task_categories: Union[List[str], Dict[str, List[st # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is 
bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. known_set = list(known_task_ids.keys()) - escape_validation_mask = escape_validation_mask_for_predicate( - task_categories, lambda e: e.startswith("other-") - ) + escape_validation_predicate_fn = lambda e: e.startswith("other-") validated, error = tagset_validator( - task_categories, known_set, "task_categories", known_task_ids_url, escape_validation_mask + task_categories, known_set, "task_categories", known_task_ids_url, escape_validation_predicate_fn ) return validated, error @@ -336,25 +303,21 @@ def validate_task_ids(task_ids: Union[List[str], Dict[str, List[str]]]) -> Valid # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]] - escape_validation_mask = escape_validation_mask_for_predicate( - task_ids, lambda e: "-other-" in e or e.startswith("other-") - ) + escape_validation_predicate_fn = lambda e: "-other-" in e or e.startswith("other-") validated, error = tagset_validator( - task_ids, known_set, "task_ids", known_task_ids_url, escape_validation_mask + task_ids, known_set, "task_ids", known_task_ids_url, escape_validation_predicate_fn ) return validated, error @staticmethod def validate_mulitlinguality(multilinguality: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: - escape_validation_mask = escape_validation_mask_for_predicate( - multilinguality, lambda e: e.startswith("other-") - ) + escape_validation_predicate_fn = lambda e: e.startswith("other-") validated, error = tagset_validator( multilinguality, list(known_multilingualities.keys()), "multilinguality", known_size_categories_url, - escape_validation_mask, + escape_validation_predicate_fn, ) return validated, error From 9c55f1a0c52159b4bb474c94d027be6d0d9d79a6 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 1 Jun 2021 20:54:46 +0530 Subject: [PATCH 09/15] Fix existing tests --- src/datasets/utils/metadata.py | 25 +++++++-- tests/test_metadata_util.py | 100 ++++++++++++++++++++++----------- 2 files changed, 86 insertions(+), 39 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 73a9a50c928..f639333d05d 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -110,9 +110,20 @@ def tagset_validator( def validate_type(value: Any, expected_type: Type): error_string = "" + NoneType = type(None) + if expected_type == NoneType: + if not isinstance(value, NoneType): + return f"Expected `{NoneType}`. Found value: `{value}` of type `{type(value)}`.\n" + else: + return error_string if expected_type == str: - if not isinstance(value, str) or len(value) == 0: - return f"Expected `{str}` with length > 0. Found value: `{value}` of type: `{type(value)}`, with length: {len(value)}.\n" + if not isinstance(value, str): + return f"Expected `{str}`. Found value: `{value}` of type: `{type(value)}`.\n" + + elif isinstance(value, str) and len(value) == 0: + return ( + f"Expected `{str}` with length > 0. 
Found value: `{value}` of type: `{type(value)}` with length: 0.\n" + ) else: return error_string # Add more `elif` statements if primitive type checking is needed @@ -158,11 +169,13 @@ def validate_type(value: Any, expected_type: Type): def validate_metadata_type(metadata_dict: dict): - fields_types = {field.name: field.type for field in fields(DatasetMetadata)} + field_types = {field.name: field.type for field in fields(DatasetMetadata)} typing_errors = {} - for field_name, field_type in fields_types.items(): - field_type_error = validate_type(metadata_dict[field_name], field_type) + for field_name, field_value in metadata_dict.items(): + field_type_error = validate_type( + metadata_dict[field_name], field_types.get(field_name, Union[List[str], Dict[str, List[str]]]) + ) if field_type_error != "": typing_errors[field_name] = field_type_error if len(typing_errors) > 0: @@ -369,7 +382,7 @@ def validate_pretty_names(pretty_names: Union[str, Dict[str, str]]): else: return pretty_names, None - def get_metadata_by_config_name(self, name: str) -> DatasetMetadata: + def get_metadata_by_config_name(self, name: str) -> "DatasetMetadata": metadata_dict = self.asdict() config_name_hits = 0 result_dict = {} diff --git a/tests/test_metadata_util.py b/tests/test_metadata_util.py index 1f1bab833f1..3484b65919c 100644 --- a/tests/test_metadata_util.py +++ b/tests/test_metadata_util.py @@ -5,7 +5,6 @@ from datasets.utils.metadata import ( DatasetMetadata, - escape_validation_for_predicate, metadata_dict_from_readme, tagset_validator, validate_metadata_type, @@ -53,7 +52,8 @@ def test_validate_metadata_type(self): "tag": ["list", "of", "values"], "another tag": ["Another", {"list"}, ["of"], 0x646D46736457567A], } - validate_metadata_type(metadata_dict) + with self.assertRaises(TypeError): + validate_metadata_type(metadata_dict) metadata_dict = {"tag1": []} with self.assertRaises(TypeError): @@ -67,48 +67,68 @@ def test_tagset_validator(self): name = "test_tag" url = "https://dummy.hf.co" - values = ["tag1", "tag2", "tag2", "tag3"] + items = ["tag1", "tag2", "tag2", "tag3"] reference_values = ["tag1", "tag2", "tag3"] - returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url) - self.assertListEqual(returned_values, values) + returned_values, error = tagset_validator(items=items, reference_values=reference_values, name=name, url=url) + self.assertListEqual(returned_values, items) self.assertIsNone(error) - values = [] + items = [] reference_values = ["tag1", "tag2", "tag3"] - returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url) - self.assertListEqual(returned_values, values) + items, error = tagset_validator(items=items, reference_values=reference_values, name=name, url=url) + self.assertListEqual(items, []) self.assertIsNone(error) - values = [] + items = [] reference_values = [] - returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url) - self.assertListEqual(returned_values, values) + returned_values, error = tagset_validator(items=items, reference_values=reference_values, name=name, url=url) + self.assertListEqual(returned_values, []) self.assertIsNone(error) - values = ["tag1", "tag2", "tag2", "tag3", "unknown tag"] + items = ["tag1", "tag2", "tag2", "tag3", "unknown tag"] reference_values = ["tag1", "tag2", "tag3"] - returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url) + 
returned_values, error = tagset_validator(items=items, reference_values=reference_values, name=name, url=url) self.assertListEqual(returned_values, []) self.assertEqual(error, f"{['unknown tag']} are not registered tags for '{name}', reference at {url}") - def test_escape_validation_for_predicate(self): - def predicate_fn(string: str) -> bool: + def predicate_fn(string): return "ignore" in string - values = ["process me", "process me too", "ignore me"] - to_ignore, to_validate = escape_validation_for_predicate(values=values, predicate_fn=predicate_fn) - self.assertListEqual(to_ignore, ["ignore me"]) - self.assertListEqual(to_validate, ["process me", "process me too"]) - - values = ["process me", "process me too"] - to_ignore, to_validate = escape_validation_for_predicate(values=values, predicate_fn=predicate_fn) - self.assertListEqual(to_ignore, []) - self.assertListEqual(to_validate, values) + items = ["process me", "process me too", "ignore me"] + reference_values = ["process me too"] + returned_values, error = tagset_validator( + items=items, + reference_values=reference_values, + name=name, + url=url, + escape_validation_predicate_fn=predicate_fn, + ) + self.assertListEqual(returned_values, []) + self.assertEqual(error, f"{['process me']} are not registered tags for '{name}', reference at {url}") + + items = ["process me", "process me too", "ignore me"] + reference_values = ["process me too", "process me"] + returned_values, error = tagset_validator( + items=items, + reference_values=reference_values, + name=name, + url=url, + escape_validation_predicate_fn=predicate_fn, + ) + self.assertListEqual(returned_values, items) + self.assertIsNone(error) - values = ["this value will be ignored", "ignore this one two"] - to_ignore, to_validate = escape_validation_for_predicate(values=values, predicate_fn=predicate_fn) - self.assertListEqual(to_ignore, values) - self.assertListEqual(to_validate, []) + items = ["ignore me too", "ignore me"] + reference_values = ["process me too"] + returned_values, error = tagset_validator( + items=items, + reference_values=reference_values, + name=name, + url=url, + escape_validation_predicate_fn=predicate_fn, + ) + self.assertListEqual(returned_values, items) + self.assertIsNone(error) def test_yaml_block_from_readme(self): with tempfile.TemporaryDirectory() as tmp_dir: @@ -177,6 +197,7 @@ def test_from_yaml_string(self): - unknown multilinguality: - monolingual + pretty_names: Test Dataset size_categories: - 10K Date: Tue, 1 Jun 2021 22:06:04 +0530 Subject: [PATCH 10/15] Add test for get metadata by config name --- src/datasets/utils/metadata.py | 16 ++- tests/test_metadata_util.py | 198 +++++++++++++++++++++++++++++++++ 2 files changed, 209 insertions(+), 5 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index f639333d05d..953474d7713 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -1,7 +1,7 @@ import json import logging from collections import Counter -from dataclasses import dataclass, fields +from dataclasses import asdict, dataclass, fields from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union @@ -383,19 +383,25 @@ def validate_pretty_names(pretty_names: Union[str, Dict[str, str]]): return pretty_names, None def get_metadata_by_config_name(self, name: str) -> "DatasetMetadata": - metadata_dict = self.asdict() - config_name_hits = 0 + metadata_dict = asdict(self) + config_name_hit = [] + has_multi_configs = [] result_dict = {} for 
tag_key, tag_value in metadata_dict.items(): if isinstance(tag_value, str) or isinstance(tag_value, list): result_dict[tag_key] = tag_value elif isinstance(tag_value, dict): + has_multi_configs.append(tag_key) for config_name, value in tag_value.items(): if config_name == name: result_dict[tag_key] = value - config_name_hits += 1 + config_name_hit.append(tag_key) - if config_name_hits == 0: + if len(has_multi_configs) > 0 and has_multi_configs != config_name_hit: + raise TypeError( + f"The following tags have multiple configs: {has_multi_configs} but the config `{name}` was found only in: {config_name_hit}." + ) + if config_name_hit == 0: logger.warning( "No matching config names found in the metadata, using the common values to create metadata." ) diff --git a/tests/test_metadata_util.py b/tests/test_metadata_util.py index 3484b65919c..dfd23779448 100644 --- a/tests/test_metadata_util.py +++ b/tests/test_metadata_util.py @@ -1,6 +1,7 @@ import re import tempfile import unittest +from dataclasses import asdict from pathlib import Path from datasets.utils.metadata import ( @@ -427,3 +428,200 @@ def test_from_yaml_string(self): with self.assertRaises(TypeError): metadata = DatasetMetadata.from_yaml_string(valid_yaml_string_with_list_paperswithcode_id) metadata.validate() + + def test_get_metadata_by_config_name(self): + valid_yaml_with_multiple_configs = _dedent( + """\ + annotations_creators: + - found + language_creators: + - found + languages: + en: + - en + fr: + - fr + licenses: + - unknown + multilinguality: + - monolingual + pretty_names: + en: English Test Dataset + fr: French Test Dataset + size_categories: + - 10K Date: Tue, 1 Jun 2021 22:09:25 +0530 Subject: [PATCH 11/15] Update test dataset cards --- tests/test_dataset_cards.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_dataset_cards.py b/tests/test_dataset_cards.py index 81b0680b6c6..cb8327edf15 100644 --- a/tests/test_dataset_cards.py +++ b/tests/test_dataset_cards.py @@ -60,7 +60,8 @@ def test_changed_dataset_card(dataset_name): except Exception as readme_error: error_messages.append(f"The following issues have been found in the dataset cards:\nREADME:\n{readme_error}") try: - DatasetMetadata.from_readme(card_path) + metadata = DatasetMetadata.from_readme(card_path) + metadata.validate() except Exception as metadata_error: error_messages.append( f"The following issues have been found in the dataset cards:\nYAML tags:\n{metadata_error}" @@ -81,7 +82,8 @@ def test_dataset_card(dataset_name): except Exception as readme_error: error_messages.append(f"The following issues have been found in the dataset cards:\nREADME:\n{readme_error}") try: - DatasetMetadata.from_readme(card_path) + metadata = DatasetMetadata.from_readme(card_path) + metadata.validate() except Exception as metadata_error: error_messages.append( f"The following issues have been found in the dataset cards:\nYAML tags:\n{metadata_error}" From 7eb967ca6a708bc86e2fbc6a77a281c7bdae3304 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 1 Jun 2021 22:10:26 +0530 Subject: [PATCH 12/15] Fix style --- tests/test_metadata_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_metadata_util.py b/tests/test_metadata_util.py index dfd23779448..38d7b5cd458 100644 --- a/tests/test_metadata_util.py +++ b/tests/test_metadata_util.py @@ -445,7 +445,7 @@ def test_get_metadata_by_config_name(self): - unknown multilinguality: - monolingual - pretty_names: + pretty_names: en: English Test Dataset fr: French 
Test Dataset size_categories: From 5da0771ac6238c4c126d72f60b83e402c7d280a3 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 1 Jun 2021 22:15:45 +0530 Subject: [PATCH 13/15] Fix style --- src/datasets/utils/metadata.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 953474d7713..bcdd0150250 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -294,9 +294,12 @@ def validate_language_codes(languages: Union[List[str], Dict[str, List[str]]]) - @staticmethod def validate_licences(licenses: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: - escape_validation_predicate_fn = lambda e: "-other-" in e or e.startswith("other-") validated, error = tagset_validator( - licenses, list(known_licenses.keys()), "licenses", known_licenses_url, escape_validation_predicate_fn + licenses, + list(known_licenses.keys()), + "licenses", + known_licenses_url, + lambda e: "-other-" in e or e.startswith("other-"), ) return validated, error @@ -305,9 +308,8 @@ def validate_task_categories(task_categories: Union[List[str], Dict[str, List[st # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. known_set = list(known_task_ids.keys()) - escape_validation_predicate_fn = lambda e: e.startswith("other-") validated, error = tagset_validator( - task_categories, known_set, "task_categories", known_task_ids_url, escape_validation_predicate_fn + task_categories, known_set, "task_categories", known_task_ids_url, lambda e: e.startswith("other-") ) return validated, error @@ -316,21 +318,19 @@ def validate_task_ids(task_ids: Union[List[str], Dict[str, List[str]]]) -> Valid # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. 
known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]] - escape_validation_predicate_fn = lambda e: "-other-" in e or e.startswith("other-") validated, error = tagset_validator( - task_ids, known_set, "task_ids", known_task_ids_url, escape_validation_predicate_fn + task_ids, known_set, "task_ids", known_task_ids_url, lambda e: "-other-" in e or e.startswith("other-") ) return validated, error @staticmethod def validate_mulitlinguality(multilinguality: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: - escape_validation_predicate_fn = lambda e: e.startswith("other-") validated, error = tagset_validator( multilinguality, list(known_multilingualities.keys()), "multilinguality", known_size_categories_url, - escape_validation_predicate_fn, + lambda e: e.startswith("other-"), ) return validated, error From ecc003205bbaeb6f1bea1e7b587b3128cf0a7abd Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Wed, 2 Jun 2021 00:37:48 +0530 Subject: [PATCH 14/15] Support `validate` method in ReadMe --- src/datasets/utils/readme.py | 52 +++++++++++++++---------------- tests/test_dataset_cards.py | 30 ++++++++++++++---- tests/test_readme_util.py | 59 +++++++++++++++++++++++++++++++++--- 3 files changed, 105 insertions(+), 36 deletions(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index 45b8713f085..9498ddd5fd6 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -1,5 +1,4 @@ import logging -from dataclasses import dataclass from pathlib import Path from typing import Any, List, Tuple @@ -38,22 +37,20 @@ def load_yaml_resource(resource: str) -> Tuple[Any, str]: ReadmeValidatorOutput = Tuple[dict, List[str], List[str]] -@dataclass class Section: - name: str - level: str - lines: List[str] = None - - def __post_init__(self): + def __init__(self, name: str, level: str, lines: List[str] = None, suppress_parsing_errors: bool = False): + self.name = name + self.level = level + self.lines = lines self.text = "" self.is_empty_text = True self.content = {} self.parsing_error_list = [] self.parsing_warning_list = [] if self.lines is not None: - self.parse() + self.parse(suppress_parsing_errors=suppress_parsing_errors) - def parse(self): + def parse(self, suppress_parsing_errors: bool = False): current_sub_level = "" current_lines = [] code_start = False @@ -89,6 +86,12 @@ def parse(self): if self.text != "" and self.text not in FILLER_TEXT: self.is_empty_text = False + if self.level == "" and not suppress_parsing_errors: + if self.parsing_error_list != [] or self.parsing_warning_list != []: + errors = "\n".join(list(map(lambda x: "-\t" + x, self.parsing_error_list + self.parsing_warning_list))) + error_string = f"The following issues were found while parsing the README at `{self.name}`:\n" + errors + raise ValueError(error_string) + def validate(self, structure: dict) -> ReadmeValidatorOutput: """Validates a Section class object recursively using the structure provided as a dictionary. @@ -151,8 +154,6 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: warning_list.append( f"`{self.name}` has an extra subsection: `{name}`. Skipping further validation checks for this subsection as expected structure is unknown." 
) - error_list = self.parsing_error_list + error_list - warning_list = self.parsing_warning_list + warning_list if error_list: # If there are errors, do not return the dictionary as it is invalid return {}, error_list, warning_list @@ -170,40 +171,39 @@ def to_dict(self) -> dict: class ReadMe(Section): # Level 0 - def __init__(self, name: str, lines: List[str], structure: dict = None): + def __init__(self, name: str, lines: List[str], structure: dict = None, suppress_parsing_errors: bool = False): super().__init__(name=name, level="") # Not using lines here as we need to use a child class parse self.structure = structure self.yaml_tags_line_count = -2 self.tag_count = 0 self.lines = lines if self.lines is not None: - self.parse() + self.parse(suppress_parsing_errors=suppress_parsing_errors) - # Validation + def validate(self): if self.structure is None: - content, error_list, warning_list = self.validate(readme_structure) + content, error_list, warning_list = self._validate(readme_structure) else: - content, error_list, warning_list = self.validate(self.structure) - - error_list = self.parsing_error_list + error_list - warning_list = self.parsing_warning_list + warning_list + content, error_list, warning_list = self._validate(self.structure) if error_list != [] or warning_list != []: errors = "\n".join(list(map(lambda x: "-\t" + x, error_list + warning_list))) error_string = f"The following issues were found for the README at `{self.name}`:\n" + errors raise ValueError(error_string) @classmethod - def from_readme(cls, path: Path, structure: dict = None): + def from_readme(cls, path: Path, structure: dict = None, suppress_parsing_errors: bool = False): with open(path) as f: lines = f.readlines() - return cls(path, lines, structure) + return cls(path, lines, structure, suppress_parsing_errors=suppress_parsing_errors) @classmethod - def from_string(cls, string: str, structure: dict = None, root_name: str = "root"): + def from_string( + cls, string: str, structure: dict = None, root_name: str = "root", suppress_parsing_errors: bool = False + ): lines = string.split("\n") - return cls(root_name, lines, structure) + return cls(root_name, lines, structure, suppress_parsing_errors=suppress_parsing_errors) - def parse(self): + def parse(self, suppress_parsing_errors: bool = False): # Skip Tags line_count = 0 @@ -218,13 +218,13 @@ def parse(self): self.lines = self.lines[line_count + 1 :] # Get the last + 1 th item. 
else: self.lines = self.lines[self.tag_count :] - super().parse() + super().parse(suppress_parsing_errors=suppress_parsing_errors) def __str__(self): """Returns the string of dictionary representation of the ReadMe.""" return str(self.to_dict()) - def validate(self, readme_structure): + def _validate(self, readme_structure): error_list = [] warning_list = [] if self.yaml_tags_line_count == 0: diff --git a/tests/test_dataset_cards.py b/tests/test_dataset_cards.py index cb8327edf15..0ed1009022b 100644 --- a/tests/test_dataset_cards.py +++ b/tests/test_dataset_cards.py @@ -56,9 +56,18 @@ def test_changed_dataset_card(dataset_name): assert card_path.exists() error_messages = [] try: - ReadMe.from_readme(card_path) - except Exception as readme_error: - error_messages.append(f"The following issues have been found in the dataset cards:\nREADME:\n{readme_error}") + readme = ReadMe.from_readme(card_path) + except Exception as readme_parsing_error: + error_messages.append( + f"The following issues have been found in the dataset cards:\nREADME Parsing:\n{readme_parsing_error}" + ) + try: + readme = ReadMe.from_readme(card_path, suppress_parsing_errors=True) + readme.validate() + except Exception as readme_validation_error: + error_messages.append( + f"The following issues have been found in the dataset cards:\nREADME Validation:\n{readme_validation_error}" + ) try: metadata = DatasetMetadata.from_readme(card_path) metadata.validate() @@ -78,9 +87,18 @@ def test_dataset_card(dataset_name): assert card_path.exists() error_messages = [] try: - ReadMe.from_readme(card_path) - except Exception as readme_error: - error_messages.append(f"The following issues have been found in the dataset cards:\nREADME:\n{readme_error}") + readme = ReadMe.from_readme(card_path) + except Exception as readme_parsing_error: + error_messages.append( + f"The following issues have been found in the dataset cards:\nREADME Parsing:\n{readme_parsing_error}" + ) + try: + readme = ReadMe.from_readme(card_path, suppress_parsing_errors=True) + readme.validate() + except Exception as readme_validation_error: + error_messages.append( + f"The following issues have been found in the dataset cards:\nREADME Validation:\n{readme_validation_error}" + ) try: metadata = DatasetMetadata.from_readme(card_path) metadata.validate() diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 711c6ea55c5..21e9163e628 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -375,7 +375,7 @@ Language Text """ -EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `{path}`:\n-\tMultiple sections with the same heading `Dataset Card for My Dataset` have been found. Please keep only one of these sections." +EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found while parsing the README at `{path}`:\n-\tMultiple sections with the same heading `Dataset Card for My Dataset` have been found. Please keep only one of these sections." 
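+
+# A minimal usage sketch (not itself a test) of the parse/validate split
+# introduced in this patch: parsing errors can now be suppressed when the
+# README is loaded, and structural validation is a separate, explicit call.
+#
+#   readme = ReadMe.from_string(readme_md, example_yaml_structure,
+#                               suppress_parsing_errors=True)  # no raise on parse issues
+#   readme.validate()  # raises ValueError if structural issues are found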
@pytest.mark.parametrize( @@ -400,17 +400,38 @@ def test_readme_from_string_correct(readme_md, expected_dict): (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL), (README_MISSING_SUBSECTION, EXPECTED_ERROR_README_MISSING_SUBSECTION), (README_MISSING_TEXT, EXPECTED_ERROR_README_MISSING_TEXT), - (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), (README_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_WRONG_FIRST_LEVEL), (README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL), (README_MISSING_CONTENT, EXPECTED_ERROR_README_MISSING_CONTENT), ], ) -def test_readme_from_string_errors(readme_md, expected_error): +def test_readme_from_string_validation_errors(readme_md, expected_error): + with pytest.raises(ValueError, match=re.escape(expected_error.format(path="root"))): + readme = ReadMe.from_string(readme_md, example_yaml_structure) + readme.validate() + + +@pytest.mark.parametrize( + "readme_md, expected_error", + [ + (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), + ], +) +def test_readme_from_string_parsing_errors(readme_md, expected_error): with pytest.raises(ValueError, match=re.escape(expected_error.format(path="root"))): ReadMe.from_string(readme_md, example_yaml_structure) +@pytest.mark.parametrize( + "readme_md,", + [ + (README_MULTIPLE_SAME_HEADING_1), + ], +) +def test_readme_from_string_suppress_parsing_errors(readme_md): + ReadMe.from_string(readme_md, example_yaml_structure, suppress_parsing_errors=True) + + @pytest.mark.parametrize( "readme_md, expected_dict", [ @@ -441,13 +462,29 @@ def test_readme_from_readme_correct(readme_md, expected_dict): (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL), (README_MISSING_SUBSECTION, EXPECTED_ERROR_README_MISSING_SUBSECTION), (README_MISSING_TEXT, EXPECTED_ERROR_README_MISSING_TEXT), - (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), (README_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_WRONG_FIRST_LEVEL), (README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL), (README_MISSING_CONTENT, EXPECTED_ERROR_README_MISSING_CONTENT), ], ) def test_readme_from_readme_error(readme_md, expected_error): + with tempfile.TemporaryDirectory() as tmp_dir: + path = Path(tmp_dir) / "README.md" + with open(path, "w+") as readme_file: + readme_file.write(readme_md) + expected_error = expected_error.format(path=path) + with pytest.raises(ValueError, match=re.escape(expected_error)): + readme = ReadMe.from_readme(path, example_yaml_structure) + readme.validate() + + +@pytest.mark.parametrize( + "readme_md, expected_error", + [ + (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), + ], +) +def test_readme_from_readme_parsing_errors(readme_md, expected_error): with tempfile.TemporaryDirectory() as tmp_dir: path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: @@ -455,3 +492,17 @@ def test_readme_from_readme_error(readme_md, expected_error): expected_error = expected_error.format(path=path) with pytest.raises(ValueError, match=re.escape(expected_error)): ReadMe.from_readme(path, example_yaml_structure) + + +@pytest.mark.parametrize( + "readme_md,", + [ + (README_MULTIPLE_SAME_HEADING_1), + ], +) +def test_readme_from_readme_suppress_parsing_errors(readme_md): + with tempfile.TemporaryDirectory() as tmp_dir: + path = Path(tmp_dir) / "README.md" + with open(path, "w+") as readme_file: + readme_file.write(readme_md) + 
ReadMe.from_readme(path, example_yaml_structure, suppress_parsing_errors=True)

From 1e6106763055627afb08c501b24dbcbdef226cdc Mon Sep 17 00:00:00 2001
From: Gunjan Chhablani
Date: Thu, 10 Jun 2021 21:04:32 +0530
Subject: [PATCH 15/15] Add changes from review

---
 src/datasets/utils/metadata.py | 12 ++++++------
 src/datasets/utils/readme.py   |  2 +-
 tests/test_metadata_util.py    | 36 +++++++++++++++++-----------------
 3 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py
index edc6c6af8de..9e929dc71b4 100644
--- a/src/datasets/utils/metadata.py
+++ b/src/datasets/utils/metadata.py
@@ -189,7 +189,7 @@ class DatasetMetadata:
     languages: Union[List[str], Dict[str, List[str]]]
     licenses: Union[List[str], Dict[str, List[str]]]
     multilinguality: Union[List[str], Dict[str, List[str]]]
-    pretty_names: Union[str, Dict[str, str]]
+    pretty_name: Union[str, Dict[str, str]]
     size_categories: Union[List[str], Dict[str, List[str]]]
     source_datasets: Union[List[str], Dict[str, List[str]]]
    task_categories: Union[List[str], Dict[str, List[str]]]
@@ -367,20 +367,20 @@ def validate_paperswithcode_id_errors(paperswithcode_id: Optional[str]) -> Valid
         return paperswithcode_id, None
 
     @staticmethod
-    def validate_pretty_names(pretty_names: Union[str, Dict[str, str]]):
-        if isinstance(pretty_names, str):
-            if len(pretty_names) == 0:
+    def validate_pretty_name(pretty_name: Union[str, Dict[str, str]]):
+        if isinstance(pretty_name, str):
+            if len(pretty_name) == 0:
                 return None, f"The pretty name must have a length greater than 0 but got an empty string."
+            return pretty_name, None
         else:
             error_string = ""
-            for key, value in pretty_names.items():
+            for key, value in pretty_name.items():
                 if len(value) == 0:
                     error_string += f"The pretty name must have a length greater than 0 but got an empty string for config: {key}.\n"
 
-            if error_string == "":
-                return None, error_string
-            else:
-                return pretty_names, None
+            if error_string != "":
+                return None, error_string
+            return pretty_name, None
 
     def get_metadata_by_config_name(self, name: str) -> "DatasetMetadata":
         metadata_dict = asdict(self)
diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py
index 847c9890e82..d4019661595 100644
--- a/src/datasets/utils/readme.py
+++ b/src/datasets/utils/readme.py
@@ -88,7 +88,7 @@ def parse(self, suppress_parsing_errors: bool = False):
 
         if self.level == "" and not suppress_parsing_errors:
             if self.parsing_error_list != [] or self.parsing_warning_list != []:
-                errors = "\n".join(list(map(lambda x: "-\t" + x, self.parsing_error_list + self.parsing_warning_list)))
+                errors = "\n".join("-\t" + x for x in self.parsing_error_list + self.parsing_warning_list)
                 error_string = f"The following issues were found while parsing the README at `{self.name}`:\n" + errors
                 raise ValueError(error_string)
 
diff --git a/tests/test_metadata_util.py b/tests/test_metadata_util.py
index 38d7b5cd458..dc0dc156830 100644
--- a/tests/test_metadata_util.py
+++ b/tests/test_metadata_util.py
@@ -198,7 +198,7 @@ def test_from_yaml_string(self):
             - unknown
             multilinguality:
             - monolingual
-            pretty_names: Test Dataset
+            pretty_name: Test Dataset
             size_categories:
             - 10K