Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datasets/super_glue/dataset_infos.json

Large diffs are not rendered by default.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

3 changes: 0 additions & 3 deletions datasets/super_glue/dummy/cb/1.0.2/dummy_data/CB/train.jsonl

This file was deleted.

2 changes: 0 additions & 2 deletions datasets/super_glue/dummy/cb/1.0.2/dummy_data/CB/val.jsonl

This file was deleted.

26 changes: 20 additions & 6 deletions datasets/super_glue/super_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,12 +296,13 @@ def __init__(self, features, data_url, citation, url, label_classes=("False", "T
**kwargs: keyword arguments forwarded to super.
"""
# Version history:
# 1.0.3: Fix not including entity position in ReCoRD.
# 1.0.2: Fixed non-nondeterminism in ReCoRD.
# 1.0.1: Change from the pre-release trial version of SuperGLUE (v1.9) to
# the full release (v2.0).
# 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
# 0.0.2: Initial version.
super(SuperGlueConfig, self).__init__(version=datasets.Version("1.0.2"), **kwargs)
super(SuperGlueConfig, self).__init__(version=datasets.Version("1.0.3"), **kwargs)
self.features = features
self.label_classes = label_classes
self.data_url = data_url
Expand Down Expand Up @@ -355,7 +356,7 @@ class SuperGlue(datasets.GeneratorBasedBuilder):
# Note that entities and answers will be a sequences of strings. Query
# will contain @placeholder as a substring, which represents the word
# to be substituted in.
features=["passage", "query", "entities", "answers"],
features=["passage", "query", "entities", "entity_ranges", "answers"],
data_url="https://dl.fbaipublicfiles.com/glue/superglue/data/v2/ReCoRD.zip",
citation=_RECORD_CITATION,
url="https://sheng-z.github.io/ReCoRD-explorer/",
Expand Down Expand Up @@ -453,6 +454,10 @@ def _info(self):
if self.config.name == "record":
# Entities are the set of possible choices for the placeholder.
features["entities"] = datasets.features.Sequence(datasets.Value("string"))
# The start and end indices of paragraph text for each entity.
features["entity_ranges"] = datasets.features.Sequence(
datasets.features.Sequence(datasets.Value("int32"), length=2)
)
# Answers are the subset of entities that are correct.
features["answers"] = datasets.features.Sequence(datasets.Value("string"))
else:
Expand Down Expand Up @@ -523,11 +528,13 @@ def _generate_examples(self, data_file, split):
}
elif self.config.name == "record":
passage = row["passage"]
entity_texts, entity_ranges = _get_record_entities(passage)
for qa in row["qas"]:
yield qa["idx"], {
"passage": passage["text"],
"query": qa["query"],
"entities": _get_record_entities(passage),
"entities": entity_texts,
"entity_ranges": entity_ranges,
"answers": _get_record_answers(qa),
"idx": {"passage": row["idx"], "query": qa["idx"]},
}
Expand Down Expand Up @@ -603,10 +610,17 @@ def _cast_label(label):
def _get_record_entities(passage):
"""Returns the unique set of entities."""
text = passage["text"]
entities = set()
entities = list()
for entity in passage["entities"]:
entities.add(text[entity["start"] : entity["end"] + 1])
return sorted(entities)
entity_text = text[entity["start"] : entity["end"] + 1]
entity_span = (entity["start"], entity["end"] + 1)
entities.append({"text": entity_text, "span": entity_span})
entities = sorted(entities, key=lambda e: e["span"][0]) # sort by start index
entity_texts, entity_ranges = [], []
for entity in entities:
entity_texts.append(entity["text"])
entity_ranges.append(entity["span"])
return entity_texts, entity_ranges


def _get_record_answers(qa):
Expand Down