diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 809b8dc07dc..b2d4f93618a 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -175,6 +175,10 @@ def from_yaml_string(cls, string: str) -> "DatasetMetadata": :obj:`TypeError`: If the dataset's metadata is invalid """ metada_dict = yaml.safe_load(string) or dict() + # flatten the metadata of each config + for key in metada_dict: + if isinstance(metada_dict[key], dict): + metada_dict[key] = list(set(sum(metada_dict[key].values(), []))) return cls(**metada_dict) @staticmethod diff --git a/tests/test_metadata_util.py b/tests/test_metadata_util.py index a24e0741862..a5b4c4257b1 100644 --- a/tests/test_metadata_util.py +++ b/tests/test_metadata_util.py @@ -1,3 +1,4 @@ +import re import tempfile import unittest from pathlib import Path @@ -13,7 +14,8 @@ def _dedent(string: str) -> str: - return "\n".join([line.lstrip() for line in string.splitlines()]) + indent_level = min(re.search("^ +", t).end() if t.startswith(" ") else 0 for t in string.splitlines()) + return "\n".join([line[indent_level:] for line in string.splitlines()]) README_YAML = """\ @@ -187,6 +189,33 @@ def test_from_yaml_string(self): ) DatasetMetadata.from_yaml_string(valid_yaml_string) + valid_yaml_string_with_configs = _dedent( + """\ + annotations_creators: + - found + language_creators: + - found + languages: + en: + - en + fr: + - fr + licenses: + - unknown + multilinguality: + - monolingual + size_categories: + - 10K