diff --git a/datasets/russian_super_glue/README.md b/datasets/russian_super_glue/README.md new file mode 100644 index 00000000000..b574ee1ffee --- /dev/null +++ b/datasets/russian_super_glue/README.md @@ -0,0 +1,683 @@ +--- +pretty_name: Russian SuperGLUE +annotations_creators: +- crowdsourced +- expert-generated +language_creators: +- crowdsourced +- expert-generated +languages: +- ru-RU +licenses: +- mit +multilinguality: +- monolingual +size_categories: +- 100K str: + return data_url.split("/")[-1] + + +def _cast_label(label: Union[str, bool, int]) -> str: + """Converts the label into the appropriate string version.""" + if isinstance(label, str): + return label + elif isinstance(label, bool): + return "True" if label else "False" + elif isinstance(label, int): + assert label in (0, 1) + return str(label) + else: + raise ValueError("Invalid label format.") + + +def _get_rucos_entities(passage: dict) -> List[str]: + """Returns the unique set of entities.""" + text = passage["text"] + entities = set() + for entity in passage["entities"]: + entities.add(text[entity["start"] : entity["end"] + 1]) + return sorted(entities) + + +def _get_rucos_answers(qa: dict) -> List[str]: + """Returns the unique set of answers.""" + if "answers" not in qa: + return [] + answers = set() + for answer in qa["answers"]: + answers.add(answer["text"]) + return sorted(answers)