From 21abf966760a237bd347aeb53c3dd5c16ffe0963 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 9 Sep 2021 10:10:48 +0200 Subject: [PATCH 1/6] Fix URL of train_distant file --- datasets/docred/docred.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/docred/docred.py b/datasets/docred/docred.py index 8bb97fcf2f2..4588b3ac4da 100644 --- a/datasets/docred/docred.py +++ b/datasets/docred/docred.py @@ -29,7 +29,7 @@ _URLS = { "dev": "https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7", - "train_distant": "https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7", + "train_distant": "https://drive.google.com/uc?export=download&id=1Qr4Jct2IJ9BVI86_mCk_Pz0J32ww9dYw", "train_annotated": "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9", "test": "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum", "rel_info": "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download", From 70e40f0351490082568d7926f0f4a85837ead78a Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 9 Sep 2021 10:11:31 +0200 Subject: [PATCH 2/6] Update URL of dev --- datasets/docred/docred.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/docred/docred.py b/datasets/docred/docred.py index 4588b3ac4da..9de82adb7b2 100644 --- a/datasets/docred/docred.py +++ b/datasets/docred/docred.py @@ -28,7 +28,7 @@ """ _URLS = { - "dev": "https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7", + "dev": "https://drive.google.com/uc?export=download&id=1AHUm1-_V9GCtGuDcc8XrMUCJE8B-HHoL", "train_distant": "https://drive.google.com/uc?export=download&id=1Qr4Jct2IJ9BVI86_mCk_Pz0J32ww9dYw", "train_annotated": "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9", "test": "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum", From fe23be8c81a1e931386753537942a47056079739 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 9 Sep 2021 10:13:26 +0200 Subject: [PATCH 3/6] Fix file opening in Windows by passing encoding --- datasets/docred/docred.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/docred/docred.py b/datasets/docred/docred.py index 9de82adb7b2..2a974c765de 100644 --- a/datasets/docred/docred.py +++ b/datasets/docred/docred.py @@ -100,8 +100,8 @@ def _split_generators(self, dl_manager): def _generate_examples(self, filepath, rel_info): """Generate DocRED examples.""" - relation_name_map = json.load(open(rel_info)) - data = json.load(open(filepath)) + relation_name_map = json.load(open(rel_info, encoding="utf-8")) + data = json.load(open(filepath, encoding="utf-8")) for idx, example in enumerate(data): From ced158c53fcc142eb83ddc6d56b0c8a58d4bef61 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 9 Sep 2021 10:15:12 +0200 Subject: [PATCH 4/6] Use context manager for file opening --- datasets/docred/docred.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/datasets/docred/docred.py b/datasets/docred/docred.py index 2a974c765de..2b3b34e24a1 100644 --- a/datasets/docred/docred.py +++ b/datasets/docred/docred.py @@ -100,8 +100,11 @@ def _split_generators(self, dl_manager): def _generate_examples(self, filepath, rel_info): """Generate DocRED examples.""" - relation_name_map = json.load(open(rel_info, encoding="utf-8")) - data = json.load(open(filepath, encoding="utf-8")) + + with open(rel_info, encoding="utf-8") as f: + relation_name_map = json.load(f) + with open(filepath, encoding="utf-8") as f: + data = json.load(f) for idx, example in enumerate(data): From 28c81b3baaa60fb09cdd5a88bb7238a8f75f6f9c Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 9 Sep 2021 10:45:43 +0200 Subject: [PATCH 5/6] Update dataset metadata --- datasets/docred/dataset_infos.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/docred/dataset_infos.json b/datasets/docred/dataset_infos.json index 7795c756ff5..d3981cb09f3 100644 --- a/datasets/docred/dataset_infos.json +++ b/datasets/docred/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "Multiple entities in a document generally exhibit complex inter-sentence relations, and cannot be well handled by existing relation extraction (RE) methods that typically focus on extracting intra-sentence relations for single entity pairs. In order to accelerate the research on document-level RE, we introduce DocRED, a new dataset constructed from Wikipedia and Wikidata with three features:\n - DocRED annotates both named entities and relations, and is the largest human-annotated dataset for document-level RE from plain text.\n - DocRED requires reading multiple sentences in a document to extract entities and infer their relations by synthesizing all information of the document.\n - Along with the human-annotated data, we also offer large-scale distantly supervised data, which enables DocRED to be adopted for both supervised and weakly supervised scenarios.\n", "citation": "@inproceedings{yao2019DocRED,\n title={{DocRED}: A Large-Scale Document-Level Relation Extraction Dataset},\n author={Yao, Yuan and Ye, Deming and Li, Peng and Han, Xu and Lin, Yankai and Liu, Zhenghao and Liu, Zhiyuan and Huang, Lixin and Zhou, Jie and Sun, Maosong},\n booktitle={Proceedings of ACL 2019},\n year={2019}\n}\n", "homepage": "https://github.com/thunlp/DocRED", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sents": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}, "vertexSet": [[{"name": {"dtype": "string", "id": null, "_type": "Value"}, "sent_id": {"dtype": "int32", "id": null, "_type": "Value"}, "pos": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "type": {"dtype": "string", "id": null, "_type": "Value"}}]], "labels": {"feature": {"head": {"dtype": "int32", "id": null, "_type": "Value"}, "tail": {"dtype": "int32", "id": null, "_type": "Value"}, "relation_id": {"dtype": "string", "id": null, "_type": "Value"}, "relation_text": {"dtype": "string", "id": null, "_type": "Value"}, "evidence": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "doc_red", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "datasets_version_to_prepare": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 3435087, "num_examples": 1000, "dataset_name": "doc_red"}, "test": {"name": "test", "num_bytes": 2843877, "num_examples": 1000, "dataset_name": "doc_red"}, "train_annotated": {"name": "train_annotated", "num_bytes": 10413156, "num_examples": 3053, "dataset_name": "doc_red"}, "train_distant": {"name": "train_distant", "num_bytes": 3435087, "num_examples": 1000, "dataset_name": "doc_red"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7": {"num_bytes": 4299810, "checksum": "85691c5ca1df0048bffab1c1cf53d7d35b5de40f3de0a2c563c03da28746d5cb"}, "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9": {"num_bytes": 13029595, "checksum": "7e706348a02cf91f38bd8c379f934ab61aedadc901fca10d962c1d82ab78e95b"}, "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum": {"num_bytes": 3674242, "checksum": "09386b5cb58249d8e087863c379ebd64557169c52ee502193d2f4f215e704ae8"}, "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download": {"num_bytes": 2452, "checksum": "5ecf4e5e55c179fc83a3a3d19baa01efffecb26ba5edc0b4ac5a54ddf61fe3de"}}, "download_size": 21006099, "dataset_size": 20127207, "size_in_bytes": 41133306}} \ No newline at end of file +{"default": {"description": "Multiple entities in a document generally exhibit complex inter-sentence relations, and cannot be well handled by existing relation extraction (RE) methods that typically focus on extracting intra-sentence relations for single entity pairs. In order to accelerate the research on document-level RE, we introduce DocRED, a new dataset constructed from Wikipedia and Wikidata with three features:\n - DocRED annotates both named entities and relations, and is the largest human-annotated dataset for document-level RE from plain text.\n - DocRED requires reading multiple sentences in a document to extract entities and infer their relations by synthesizing all information of the document.\n - Along with the human-annotated data, we also offer large-scale distantly supervised data, which enables DocRED to be adopted for both supervised and weakly supervised scenarios.\n", "citation": "@inproceedings{yao2019DocRED,\n title={{DocRED}: A Large-Scale Document-Level Relation Extraction Dataset},\n author={Yao, Yuan and Ye, Deming and Li, Peng and Han, Xu and Lin, Yankai and Liu, Zhenghao and Liu, Zhiyuan and Huang, Lixin and Zhou, Jie and Sun, Maosong},\n booktitle={Proceedings of ACL 2019},\n year={2019}\n}\n", "homepage": "https://github.com/thunlp/DocRED", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sents": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}, "vertexSet": [[{"name": {"dtype": "string", "id": null, "_type": "Value"}, "sent_id": {"dtype": "int32", "id": null, "_type": "Value"}, "pos": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "type": {"dtype": "string", "id": null, "_type": "Value"}}]], "labels": {"feature": {"head": {"dtype": "int32", "id": null, "_type": "Value"}, "tail": {"dtype": "int32", "id": null, "_type": "Value"}, "relation_id": {"dtype": "string", "id": null, "_type": "Value"}, "relation_text": {"dtype": "string", "id": null, "_type": "Value"}, "evidence": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "doc_red", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 3425030, "num_examples": 998, "dataset_name": "doc_red"}, "test": {"name": "test", "num_bytes": 2843877, "num_examples": 1000, "dataset_name": "doc_red"}, "train_annotated": {"name": "train_annotated", "num_bytes": 10413156, "num_examples": 3053, "dataset_name": "doc_red"}, "train_distant": {"name": "train_distant", "num_bytes": 346001876, "num_examples": 101873, "dataset_name": "doc_red"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1AHUm1-_V9GCtGuDcc8XrMUCJE8B-HHoL": {"num_bytes": 4287303, "checksum": "4554f7487a6fda3bab4d4e59432e065b7485dfb885bd7f05fd60fc7e93ee7e3e"}, "https://drive.google.com/uc?export=download&id=1Qr4Jct2IJ9BVI86_mCk_Pz0J32ww9dYw": {"num_bytes": 437046821, "checksum": "db6d3cdaab8d36926318bb9339f6fd82d19dbacd186c74d7c20c734355a58b36"}, "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9": {"num_bytes": 13029595, "checksum": "7e706348a02cf91f38bd8c379f934ab61aedadc901fca10d962c1d82ab78e95b"}, "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum": {"num_bytes": 3674242, "checksum": "09386b5cb58249d8e087863c379ebd64557169c52ee502193d2f4f215e704ae8"}, "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download": {"num_bytes": 2452, "checksum": "5ecf4e5e55c179fc83a3a3d19baa01efffecb26ba5edc0b4ac5a54ddf61fe3de"}}, "download_size": 458040413, "post_processing_size": null, "dataset_size": 362683939, "size_in_bytes": 820724352}} \ No newline at end of file From 22aca2a58d1592a91d24bedaf728eb996e252c88 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 9 Sep 2021 14:47:35 +0200 Subject: [PATCH 6/6] Update dataset card --- datasets/docred/README.md | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/datasets/docred/README.md b/datasets/docred/README.md index f02679f84f6..fcda8e54448 100644 --- a/datasets/docred/README.md +++ b/datasets/docred/README.md @@ -1,10 +1,27 @@ --- +annotations_creators: +- expert-generated +language_creators: +- crowdsourced languages: - en +licenses: +- mit +multilinguality: +- monolingual paperswithcode_id: docred +pretty_name: DocRED +size_categories: +- 100K