huggingface · albertvillanova · Sep 13, 2021 · Sep 9, 2021 · Sep 9, 2021 · Sep 9, 2021
diff --git a/datasets/docred/README.md b/datasets/docred/README.md
@@ -1,10 +1,27 @@
 ---
+annotations_creators:
+- expert-generated
+language_creators:
+- crowdsourced
 languages:
 - en
+licenses:
+- mit
+multilinguality:
+- monolingual
 paperswithcode_id: docred
+pretty_name: DocRED
+size_categories:
+- 100K<n<1M
+source_datasets:
+- original
+task_categories:
+- text-retrieval
+task_ids:
+- entity-linking-retrieval
 ---
 
-# Dataset Card for "docred"
+# Dataset Card for DocRED
 
 ## Table of Contents
 - [Dataset Description](#dataset-description)
@@ -32,9 +49,8 @@ paperswithcode_id: docred
 
 ## Dataset Description
 
-- **Homepage:** [https://github.com/thunlp/DocRED](https://github.com/thunlp/DocRED)
-- **Repository:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
-- **Paper:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+- **Repository:** [https://github.com/thunlp/DocRED](https://github.com/thunlp/DocRED)
+- **Paper:** [DocRED: A Large-Scale Document-Level Relation Extraction Dataset](https://arxiv.org/abs/1906.06127)
 - **Point of Contact:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
 - **Size of downloaded dataset files:** 20.03 MB
 - **Size of the generated dataset:** 19.19 MB
@@ -121,7 +137,7 @@ The data fields are the same among all splits.
 
 | name  |train_annotated|train_distant|validation|test|
 |-------|--------------:|------------:|---------:|---:|
-|default|           3053|         1000|      1000|1000|
+|default|           3053|       101873|       998|1000|
 
 ## Dataset Creation
 

diff --git a/datasets/docred/dataset_infos.json b/datasets/docred/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "Multiple entities in a document generally exhibit complex inter-sentence relations, and cannot be well handled by existing relation extraction (RE) methods that typically focus on extracting intra-sentence relations for single entity pairs. In order to accelerate the research on document-level RE, we introduce DocRED, a new dataset constructed from Wikipedia and Wikidata with three features:\n    - DocRED annotates both named entities and relations, and is the largest human-annotated dataset for document-level RE from plain text.\n    - DocRED requires reading multiple sentences in a document to extract entities and infer their relations by synthesizing all information of the document.\n    - Along with the human-annotated data, we also offer large-scale distantly supervised data, which enables DocRED to be adopted for both supervised and weakly supervised scenarios.\n", "citation": "@inproceedings{yao2019DocRED,\n  title={{DocRED}: A Large-Scale Document-Level Relation Extraction Dataset},\n  author={Yao, Yuan and Ye, Deming and Li, Peng and Han, Xu and Lin, Yankai and Liu, Zhenghao and Liu,   Zhiyuan and Huang, Lixin and Zhou, Jie and Sun, Maosong},\n  booktitle={Proceedings of ACL 2019},\n  year={2019}\n}\n", "homepage": "https://github.com/thunlp/DocRED", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sents": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}, "vertexSet": [[{"name": {"dtype": "string", "id": null, "_type": "Value"}, "sent_id": {"dtype": "int32", "id": null, "_type": "Value"}, "pos": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "type": {"dtype": "string", "id": null, "_type": "Value"}}]], "labels": {"feature": {"head": {"dtype": "int32", "id": null, "_type": "Value"}, "tail": {"dtype": "int32", "id": null, "_type": "Value"}, "relation_id": {"dtype": "string", "id": null, "_type": "Value"}, "relation_text": {"dtype": "string", "id": null, "_type": "Value"}, "evidence": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "doc_red", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "datasets_version_to_prepare": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 3435087, "num_examples": 1000, "dataset_name": "doc_red"}, "test": {"name": "test", "num_bytes": 2843877, "num_examples": 1000, "dataset_name": "doc_red"}, "train_annotated": {"name": "train_annotated", "num_bytes": 10413156, "num_examples": 3053, "dataset_name": "doc_red"}, "train_distant": {"name": "train_distant", "num_bytes": 3435087, "num_examples": 1000, "dataset_name": "doc_red"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7": {"num_bytes": 4299810, "checksum": "85691c5ca1df0048bffab1c1cf53d7d35b5de40f3de0a2c563c03da28746d5cb"}, "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9": {"num_bytes": 13029595, "checksum": "7e706348a02cf91f38bd8c379f934ab61aedadc901fca10d962c1d82ab78e95b"}, "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum": {"num_bytes": 3674242, "checksum": "09386b5cb58249d8e087863c379ebd64557169c52ee502193d2f4f215e704ae8"}, "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download": {"num_bytes": 2452, "checksum": "5ecf4e5e55c179fc83a3a3d19baa01efffecb26ba5edc0b4ac5a54ddf61fe3de"}}, "download_size": 21006099, "dataset_size": 20127207, "size_in_bytes": 41133306}}
+{"default": {"description": "Multiple entities in a document generally exhibit complex inter-sentence relations, and cannot be well handled by existing relation extraction (RE) methods that typically focus on extracting intra-sentence relations for single entity pairs. In order to accelerate the research on document-level RE, we introduce DocRED, a new dataset constructed from Wikipedia and Wikidata with three features:\n    - DocRED annotates both named entities and relations, and is the largest human-annotated dataset for document-level RE from plain text.\n    - DocRED requires reading multiple sentences in a document to extract entities and infer their relations by synthesizing all information of the document.\n    - Along with the human-annotated data, we also offer large-scale distantly supervised data, which enables DocRED to be adopted for both supervised and weakly supervised scenarios.\n", "citation": "@inproceedings{yao2019DocRED,\n  title={{DocRED}: A Large-Scale Document-Level Relation Extraction Dataset},\n  author={Yao, Yuan and Ye, Deming and Li, Peng and Han, Xu and Lin, Yankai and Liu, Zhenghao and Liu,   Zhiyuan and Huang, Lixin and Zhou, Jie and Sun, Maosong},\n  booktitle={Proceedings of ACL 2019},\n  year={2019}\n}\n", "homepage": "https://github.com/thunlp/DocRED", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sents": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}, "vertexSet": [[{"name": {"dtype": "string", "id": null, "_type": "Value"}, "sent_id": {"dtype": "int32", "id": null, "_type": "Value"}, "pos": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "type": {"dtype": "string", "id": null, "_type": "Value"}}]], "labels": {"feature": {"head": {"dtype": "int32", "id": null, "_type": "Value"}, "tail": {"dtype": "int32", "id": null, "_type": "Value"}, "relation_id": {"dtype": "string", "id": null, "_type": "Value"}, "relation_text": {"dtype": "string", "id": null, "_type": "Value"}, "evidence": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "doc_red", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 3425030, "num_examples": 998, "dataset_name": "doc_red"}, "test": {"name": "test", "num_bytes": 2843877, "num_examples": 1000, "dataset_name": "doc_red"}, "train_annotated": {"name": "train_annotated", "num_bytes": 10413156, "num_examples": 3053, "dataset_name": "doc_red"}, "train_distant": {"name": "train_distant", "num_bytes": 346001876, "num_examples": 101873, "dataset_name": "doc_red"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1AHUm1-_V9GCtGuDcc8XrMUCJE8B-HHoL": {"num_bytes": 4287303, "checksum": "4554f7487a6fda3bab4d4e59432e065b7485dfb885bd7f05fd60fc7e93ee7e3e"}, "https://drive.google.com/uc?export=download&id=1Qr4Jct2IJ9BVI86_mCk_Pz0J32ww9dYw": {"num_bytes": 437046821, "checksum": "db6d3cdaab8d36926318bb9339f6fd82d19dbacd186c74d7c20c734355a58b36"}, "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9": {"num_bytes": 13029595, "checksum": "7e706348a02cf91f38bd8c379f934ab61aedadc901fca10d962c1d82ab78e95b"}, "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum": {"num_bytes": 3674242, "checksum": "09386b5cb58249d8e087863c379ebd64557169c52ee502193d2f4f215e704ae8"}, "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download": {"num_bytes": 2452, "checksum": "5ecf4e5e55c179fc83a3a3d19baa01efffecb26ba5edc0b4ac5a54ddf61fe3de"}}, "download_size": 458040413, "post_processing_size": null, "dataset_size": 362683939, "size_in_bytes": 820724352}}
diff --git a/datasets/docred/docred.py b/datasets/docred/docred.py
@@ -28,8 +28,8 @@
 """
 
 _URLS = {
-    "dev": "https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7",
-    "train_distant": "https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7",
+    "dev": "https://drive.google.com/uc?export=download&id=1AHUm1-_V9GCtGuDcc8XrMUCJE8B-HHoL",
+    "train_distant": "https://drive.google.com/uc?export=download&id=1Qr4Jct2IJ9BVI86_mCk_Pz0J32ww9dYw",
     "train_annotated": "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9",
     "test": "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum",
     "rel_info": "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download",
@@ -100,8 +100,11 @@ def _split_generators(self, dl_manager):
 
     def _generate_examples(self, filepath, rel_info):
         """Generate DocRED examples."""
-        relation_name_map = json.load(open(rel_info))
-        data = json.load(open(filepath))
+
+        with open(rel_info, encoding="utf-8") as f:
+            relation_name_map = json.load(f)
+        with open(filepath, encoding="utf-8") as f:
+            data = json.load(f)
 
         for idx, example in enumerate(data):
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"default": {"description": "Multiple entities in a document generally exhibit complex inter-sentence relations, and cannot be well handled by existing relation extraction (RE) methods that typically focus on extracting intra-sentence relations for single entity pairs. In order to accelerate the research on document-level RE, we introduce DocRED, a new dataset constructed from Wikipedia and Wikidata with three features:\n - DocRED annotates both named entities and relations, and is the largest human-annotated dataset for document-level RE from plain text.\n - DocRED requires reading multiple sentences in a document to extract entities and infer their relations by synthesizing all information of the document.\n - Along with the human-annotated data, we also offer large-scale distantly supervised data, which enables DocRED to be adopted for both supervised and weakly supervised scenarios.\n", "citation": "@inproceedings{yao2019DocRED,\n title={{DocRED}: A Large-Scale Document-Level Relation Extraction Dataset},\n author={Yao, Yuan and Ye, Deming and Li, Peng and Han, Xu and Lin, Yankai and Liu, Zhenghao and Liu, Zhiyuan and Huang, Lixin and Zhou, Jie and Sun, Maosong},\n booktitle={Proceedings of ACL 2019},\n year={2019}\n}\n", "homepage": "https://github.com/thunlp/DocRED", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sents": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}, "vertexSet": [[{"name": {"dtype": "string", "id": null, "_type": "Value"}, "sent_id": {"dtype": "int32", "id": null, "_type": "Value"}, "pos": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "type": {"dtype": "string", "id": null, "_type": "Value"}}]], "labels": {"feature": {"head": {"dtype": "int32", "id": null, "_type": "Value"}, "tail": {"dtype": "int32", "id": null, "_type": "Value"}, "relation_id": {"dtype": "string", "id": null, "_type": "Value"}, "relation_text": {"dtype": "string", "id": null, "_type": "Value"}, "evidence": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "doc_red", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "datasets_version_to_prepare": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 3435087, "num_examples": 1000, "dataset_name": "doc_red"}, "test": {"name": "test", "num_bytes": 2843877, "num_examples": 1000, "dataset_name": "doc_red"}, "train_annotated": {"name": "train_annotated", "num_bytes": 10413156, "num_examples": 3053, "dataset_name": "doc_red"}, "train_distant": {"name": "train_distant", "num_bytes": 3435087, "num_examples": 1000, "dataset_name": "doc_red"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7": {"num_bytes": 4299810, "checksum": "85691c5ca1df0048bffab1c1cf53d7d35b5de40f3de0a2c563c03da28746d5cb"}, "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9": {"num_bytes": 13029595, "checksum": "7e706348a02cf91f38bd8c379f934ab61aedadc901fca10d962c1d82ab78e95b"}, "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum": {"num_bytes": 3674242, "checksum": "09386b5cb58249d8e087863c379ebd64557169c52ee502193d2f4f215e704ae8"}, "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download": {"num_bytes": 2452, "checksum": "5ecf4e5e55c179fc83a3a3d19baa01efffecb26ba5edc0b4ac5a54ddf61fe3de"}}, "download_size": 21006099, "dataset_size": 20127207, "size_in_bytes": 41133306}}
		{"default": {"description": "Multiple entities in a document generally exhibit complex inter-sentence relations, and cannot be well handled by existing relation extraction (RE) methods that typically focus on extracting intra-sentence relations for single entity pairs. In order to accelerate the research on document-level RE, we introduce DocRED, a new dataset constructed from Wikipedia and Wikidata with three features:\n - DocRED annotates both named entities and relations, and is the largest human-annotated dataset for document-level RE from plain text.\n - DocRED requires reading multiple sentences in a document to extract entities and infer their relations by synthesizing all information of the document.\n - Along with the human-annotated data, we also offer large-scale distantly supervised data, which enables DocRED to be adopted for both supervised and weakly supervised scenarios.\n", "citation": "@inproceedings{yao2019DocRED,\n title={{DocRED}: A Large-Scale Document-Level Relation Extraction Dataset},\n author={Yao, Yuan and Ye, Deming and Li, Peng and Han, Xu and Lin, Yankai and Liu, Zhenghao and Liu, Zhiyuan and Huang, Lixin and Zhou, Jie and Sun, Maosong},\n booktitle={Proceedings of ACL 2019},\n year={2019}\n}\n", "homepage": "https://github.com/thunlp/DocRED", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sents": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}, "vertexSet": [[{"name": {"dtype": "string", "id": null, "_type": "Value"}, "sent_id": {"dtype": "int32", "id": null, "_type": "Value"}, "pos": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "type": {"dtype": "string", "id": null, "_type": "Value"}}]], "labels": {"feature": {"head": {"dtype": "int32", "id": null, "_type": "Value"}, "tail": {"dtype": "int32", "id": null, "_type": "Value"}, "relation_id": {"dtype": "string", "id": null, "_type": "Value"}, "relation_text": {"dtype": "string", "id": null, "_type": "Value"}, "evidence": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "doc_red", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 3425030, "num_examples": 998, "dataset_name": "doc_red"}, "test": {"name": "test", "num_bytes": 2843877, "num_examples": 1000, "dataset_name": "doc_red"}, "train_annotated": {"name": "train_annotated", "num_bytes": 10413156, "num_examples": 3053, "dataset_name": "doc_red"}, "train_distant": {"name": "train_distant", "num_bytes": 346001876, "num_examples": 101873, "dataset_name": "doc_red"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1AHUm1-_V9GCtGuDcc8XrMUCJE8B-HHoL": {"num_bytes": 4287303, "checksum": "4554f7487a6fda3bab4d4e59432e065b7485dfb885bd7f05fd60fc7e93ee7e3e"}, "https://drive.google.com/uc?export=download&id=1Qr4Jct2IJ9BVI86_mCk_Pz0J32ww9dYw": {"num_bytes": 437046821, "checksum": "db6d3cdaab8d36926318bb9339f6fd82d19dbacd186c74d7c20c734355a58b36"}, "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9": {"num_bytes": 13029595, "checksum": "7e706348a02cf91f38bd8c379f934ab61aedadc901fca10d962c1d82ab78e95b"}, "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum": {"num_bytes": 3674242, "checksum": "09386b5cb58249d8e087863c379ebd64557169c52ee502193d2f4f215e704ae8"}, "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download": {"num_bytes": 2452, "checksum": "5ecf4e5e55c179fc83a3a3d19baa01efffecb26ba5edc0b4ac5a54ddf61fe3de"}}, "download_size": 458040413, "post_processing_size": null, "dataset_size": 362683939, "size_in_bytes": 820724352}}