Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions datasets/docred/README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,27 @@
---
annotations_creators:
- expert-generated
language_creators:
- crowdsourced
languages:
- en
licenses:
- mit
multilinguality:
- monolingual
paperswithcode_id: docred
pretty_name: DocRED
size_categories:
- 100K<n<1M
source_datasets:
- original
task_categories:
- text-retrieval
task_ids:
- entity-linking-retrieval
---

# Dataset Card for "docred"
# Dataset Card for DocRED

## Table of Contents
- [Dataset Description](#dataset-description)
@@ -32,9 +49,8 @@ paperswithcode_id: docred

## Dataset Description

- **Homepage:** [https://github.com/thunlp/DocRED](https://github.com/thunlp/DocRED)
- **Repository:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
- **Paper:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
- **Repository:** [https://github.com/thunlp/DocRED](https://github.com/thunlp/DocRED)
- **Paper:** [DocRED: A Large-Scale Document-Level Relation Extraction Dataset](https://arxiv.org/abs/1906.06127)
- **Point of Contact:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
- **Size of downloaded dataset files:** 20.03 MB
- **Size of the generated dataset:** 19.19 MB
@@ -121,7 +137,7 @@ The data fields are the same among all splits.

| name |train_annotated|train_distant|validation|test|
|-------|--------------:|------------:|---------:|---:|
|default| 3053| 1000| 1000|1000|
|default| 3053| 101873| 998|1000|

## Dataset Creation

Expand Down
2 changes: 1 addition & 1 deletion datasets/docred/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"default": {"description": "Multiple entities in a document generally exhibit complex inter-sentence relations, and cannot be well handled by existing relation extraction (RE) methods that typically focus on extracting intra-sentence relations for single entity pairs. In order to accelerate the research on document-level RE, we introduce DocRED, a new dataset constructed from Wikipedia and Wikidata with three features:\n - DocRED annotates both named entities and relations, and is the largest human-annotated dataset for document-level RE from plain text.\n - DocRED requires reading multiple sentences in a document to extract entities and infer their relations by synthesizing all information of the document.\n - Along with the human-annotated data, we also offer large-scale distantly supervised data, which enables DocRED to be adopted for both supervised and weakly supervised scenarios.\n", "citation": "@inproceedings{yao2019DocRED,\n title={{DocRED}: A Large-Scale Document-Level Relation Extraction Dataset},\n author={Yao, Yuan and Ye, Deming and Li, Peng and Han, Xu and Lin, Yankai and Liu, Zhenghao and Liu, Zhiyuan and Huang, Lixin and Zhou, Jie and Sun, Maosong},\n booktitle={Proceedings of ACL 2019},\n year={2019}\n}\n", "homepage": "https://github.com/thunlp/DocRED", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sents": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}, "vertexSet": [[{"name": {"dtype": "string", "id": null, "_type": "Value"}, "sent_id": {"dtype": "int32", "id": null, "_type": "Value"}, "pos": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "type": {"dtype": "string", "id": null, "_type": "Value"}}]], "labels": {"feature": {"head": {"dtype": "int32", "id": null, "_type": "Value"}, "tail": {"dtype": "int32", "id": null, 
"_type": "Value"}, "relation_id": {"dtype": "string", "id": null, "_type": "Value"}, "relation_text": {"dtype": "string", "id": null, "_type": "Value"}, "evidence": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "doc_red", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "datasets_version_to_prepare": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 3435087, "num_examples": 1000, "dataset_name": "doc_red"}, "test": {"name": "test", "num_bytes": 2843877, "num_examples": 1000, "dataset_name": "doc_red"}, "train_annotated": {"name": "train_annotated", "num_bytes": 10413156, "num_examples": 3053, "dataset_name": "doc_red"}, "train_distant": {"name": "train_distant", "num_bytes": 3435087, "num_examples": 1000, "dataset_name": "doc_red"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7": {"num_bytes": 4299810, "checksum": "85691c5ca1df0048bffab1c1cf53d7d35b5de40f3de0a2c563c03da28746d5cb"}, "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9": {"num_bytes": 13029595, "checksum": "7e706348a02cf91f38bd8c379f934ab61aedadc901fca10d962c1d82ab78e95b"}, "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum": {"num_bytes": 3674242, "checksum": "09386b5cb58249d8e087863c379ebd64557169c52ee502193d2f4f215e704ae8"}, "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download": {"num_bytes": 2452, "checksum": "5ecf4e5e55c179fc83a3a3d19baa01efffecb26ba5edc0b4ac5a54ddf61fe3de"}}, "download_size": 21006099, "dataset_size": 20127207, "size_in_bytes": 41133306}}
{"default": {"description": "Multiple entities in a document generally exhibit complex inter-sentence relations, and cannot be well handled by existing relation extraction (RE) methods that typically focus on extracting intra-sentence relations for single entity pairs. In order to accelerate the research on document-level RE, we introduce DocRED, a new dataset constructed from Wikipedia and Wikidata with three features:\n - DocRED annotates both named entities and relations, and is the largest human-annotated dataset for document-level RE from plain text.\n - DocRED requires reading multiple sentences in a document to extract entities and infer their relations by synthesizing all information of the document.\n - Along with the human-annotated data, we also offer large-scale distantly supervised data, which enables DocRED to be adopted for both supervised and weakly supervised scenarios.\n", "citation": "@inproceedings{yao2019DocRED,\n title={{DocRED}: A Large-Scale Document-Level Relation Extraction Dataset},\n author={Yao, Yuan and Ye, Deming and Li, Peng and Han, Xu and Lin, Yankai and Liu, Zhenghao and Liu, Zhiyuan and Huang, Lixin and Zhou, Jie and Sun, Maosong},\n booktitle={Proceedings of ACL 2019},\n year={2019}\n}\n", "homepage": "https://github.com/thunlp/DocRED", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sents": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}, "vertexSet": [[{"name": {"dtype": "string", "id": null, "_type": "Value"}, "sent_id": {"dtype": "int32", "id": null, "_type": "Value"}, "pos": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "type": {"dtype": "string", "id": null, "_type": "Value"}}]], "labels": {"feature": {"head": {"dtype": "int32", "id": null, "_type": "Value"}, "tail": {"dtype": "int32", "id": null, 
"_type": "Value"}, "relation_id": {"dtype": "string", "id": null, "_type": "Value"}, "relation_text": {"dtype": "string", "id": null, "_type": "Value"}, "evidence": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "doc_red", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 3425030, "num_examples": 998, "dataset_name": "doc_red"}, "test": {"name": "test", "num_bytes": 2843877, "num_examples": 1000, "dataset_name": "doc_red"}, "train_annotated": {"name": "train_annotated", "num_bytes": 10413156, "num_examples": 3053, "dataset_name": "doc_red"}, "train_distant": {"name": "train_distant", "num_bytes": 346001876, "num_examples": 101873, "dataset_name": "doc_red"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1AHUm1-_V9GCtGuDcc8XrMUCJE8B-HHoL": {"num_bytes": 4287303, "checksum": "4554f7487a6fda3bab4d4e59432e065b7485dfb885bd7f05fd60fc7e93ee7e3e"}, "https://drive.google.com/uc?export=download&id=1Qr4Jct2IJ9BVI86_mCk_Pz0J32ww9dYw": {"num_bytes": 437046821, "checksum": "db6d3cdaab8d36926318bb9339f6fd82d19dbacd186c74d7c20c734355a58b36"}, "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9": {"num_bytes": 13029595, "checksum": "7e706348a02cf91f38bd8c379f934ab61aedadc901fca10d962c1d82ab78e95b"}, "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum": {"num_bytes": 3674242, "checksum": "09386b5cb58249d8e087863c379ebd64557169c52ee502193d2f4f215e704ae8"}, "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download": {"num_bytes": 2452, "checksum": "5ecf4e5e55c179fc83a3a3d19baa01efffecb26ba5edc0b4ac5a54ddf61fe3de"}}, "download_size": 458040413, 
"post_processing_size": null, "dataset_size": 362683939, "size_in_bytes": 820724352}}
11 changes: 7 additions & 4 deletions datasets/docred/docred.py
Original file line number Diff line number Diff line change
@@ -28,8 +28,8 @@
"""

_URLS = {
"dev": "https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7",
"train_distant": "https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7",
"dev": "https://drive.google.com/uc?export=download&id=1AHUm1-_V9GCtGuDcc8XrMUCJE8B-HHoL",
"train_distant": "https://drive.google.com/uc?export=download&id=1Qr4Jct2IJ9BVI86_mCk_Pz0J32ww9dYw",
"train_annotated": "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9",
"test": "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum",
"rel_info": "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download",
@@ -100,8 +100,11 @@ def _split_generators(self, dl_manager):

def _generate_examples(self, filepath, rel_info):
"""Generate DocRED examples."""
relation_name_map = json.load(open(rel_info))
data = json.load(open(filepath))

with open(rel_info, encoding="utf-8") as f:
relation_name_map = json.load(f)
with open(filepath, encoding="utf-8") as f:
data = json.load(f)

for idx, example in enumerate(data):

Expand Down