From 21abf966760a237bd347aeb53c3dd5c16ffe0963 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 9 Sep 2021 10:10:48 +0200
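Subject: [PATCH 1/6] Fix URL of train_distant file

The "train_distant" entry in _URLS carried the same Google Drive ID as
"dev", so both splits downloaded the same file. Point it at its own file.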

---
 datasets/docred/docred.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/docred/docred.py b/datasets/docred/docred.py
index 8bb97fcf2f2..4588b3ac4da 100644
--- a/datasets/docred/docred.py
+++ b/datasets/docred/docred.py
@@ -29,7 +29,7 @@
 
 _URLS = {
     "dev": "https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7",
-    "train_distant": "https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7",
+    "train_distant": "https://drive.google.com/uc?export=download&id=1Qr4Jct2IJ9BVI86_mCk_Pz0J32ww9dYw",
     "train_annotated": "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9",
     "test": "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum",
     "rel_info": "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download",

From 70e40f0351490082568d7926f0f4a85837ead78a Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 9 Sep 2021 10:11:31 +0200
Subject: [PATCH 2/6] Update URL of dev

---
 datasets/docred/docred.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/docred/docred.py b/datasets/docred/docred.py
index 4588b3ac4da..9de82adb7b2 100644
--- a/datasets/docred/docred.py
+++ b/datasets/docred/docred.py
@@ -28,7 +28,7 @@
 """
 
 _URLS = {
-    "dev": "https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7",
+    "dev": "https://drive.google.com/uc?export=download&id=1AHUm1-_V9GCtGuDcc8XrMUCJE8B-HHoL",
     "train_distant": "https://drive.google.com/uc?export=download&id=1Qr4Jct2IJ9BVI86_mCk_Pz0J32ww9dYw",
     "train_annotated": "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9",
     "test": "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum",

From fe23be8c81a1e931386753537942a47056079739 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 9 Sep 2021 10:13:26 +0200
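Subject: [PATCH 3/6] Fix file opening on Windows by passing encoding

open() without an explicit encoding falls back to the locale's preferred
encoding, which is usually not UTF-8 on Windows. Pass encoding="utf-8"
when reading the JSON files.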

---
 datasets/docred/docred.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datasets/docred/docred.py b/datasets/docred/docred.py
index 9de82adb7b2..2a974c765de 100644
--- a/datasets/docred/docred.py
+++ b/datasets/docred/docred.py
@@ -100,8 +100,8 @@ def _split_generators(self, dl_manager):
 
     def _generate_examples(self, filepath, rel_info):
         """Generate DocRED examples."""
-        relation_name_map = json.load(open(rel_info))
-        data = json.load(open(filepath))
+        relation_name_map = json.load(open(rel_info, encoding="utf-8"))
+        data = json.load(open(filepath, encoding="utf-8"))
 
         for idx, example in enumerate(data):
 

From ced158c53fcc142eb83ddc6d56b0c8a58d4bef61 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 9 Sep 2021 10:15:12 +0200
Subject: [PATCH 4/6] Use context manager for file opening

---
 datasets/docred/docred.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/datasets/docred/docred.py b/datasets/docred/docred.py
index 2a974c765de..2b3b34e24a1 100644
--- a/datasets/docred/docred.py
+++ b/datasets/docred/docred.py
@@ -100,8 +100,11 @@ def _split_generators(self, dl_manager):
 
     def _generate_examples(self, filepath, rel_info):
         """Generate DocRED examples."""
-        relation_name_map = json.load(open(rel_info, encoding="utf-8"))
-        data = json.load(open(filepath, encoding="utf-8"))
+
+        with open(rel_info, encoding="utf-8") as f:
+            relation_name_map = json.load(f)
+        with open(filepath, encoding="utf-8") as f:
+            data = json.load(f)
 
         for idx, example in enumerate(data):
 

From 28c81b3baaa60fb09cdd5a88bb7238a8f75f6f9c Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 9 Sep 2021 10:45:43 +0200
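Subject: [PATCH 5/6] Update dataset metadata

Refresh dataset_infos.json for the corrected download URLs: validation
now has 998 examples and train_distant has 101873 (both were 1000).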

---
 datasets/docred/dataset_infos.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/docred/dataset_infos.json b/datasets/docred/dataset_infos.json
index 7795c756ff5..d3981cb09f3 100644
--- a/datasets/docred/dataset_infos.json
+++ b/datasets/docred/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "Multiple entities in a document generally exhibit complex inter-sentence relations, and cannot be well handled by existing relation extraction (RE) methods that typically focus on extracting intra-sentence relations for single entity pairs. In order to accelerate the research on document-level RE, we introduce DocRED, a new dataset constructed from Wikipedia and Wikidata with three features:\n    - DocRED annotates both named entities and relations, and is the largest human-annotated dataset for document-level RE from plain text.\n    - DocRED requires reading multiple sentences in a document to extract entities and infer their relations by synthesizing all information of the document.\n    - Along with the human-annotated data, we also offer large-scale distantly supervised data, which enables DocRED to be adopted for both supervised and weakly supervised scenarios.\n", "citation": "@inproceedings{yao2019DocRED,\n  title={{DocRED}: A Large-Scale Document-Level Relation Extraction Dataset},\n  author={Yao, Yuan and Ye, Deming and Li, Peng and Han, Xu and Lin, Yankai and Liu, Zhenghao and Liu,   Zhiyuan and Huang, Lixin and Zhou, Jie and Sun, Maosong},\n  booktitle={Proceedings of ACL 2019},\n  year={2019}\n}\n", "homepage": "https://github.com/thunlp/DocRED", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sents": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}, "vertexSet": [[{"name": {"dtype": "string", "id": null, "_type": "Value"}, "sent_id": {"dtype": "int32", "id": null, "_type": "Value"}, "pos": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "type": {"dtype": "string", "id": null, "_type": "Value"}}]], "labels": {"feature": {"head": {"dtype": "int32", "id": null, "_type": "Value"}, "tail": {"dtype": 
"int32", "id": null, "_type": "Value"}, "relation_id": {"dtype": "string", "id": null, "_type": "Value"}, "relation_text": {"dtype": "string", "id": null, "_type": "Value"}, "evidence": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "doc_red", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "datasets_version_to_prepare": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 3435087, "num_examples": 1000, "dataset_name": "doc_red"}, "test": {"name": "test", "num_bytes": 2843877, "num_examples": 1000, "dataset_name": "doc_red"}, "train_annotated": {"name": "train_annotated", "num_bytes": 10413156, "num_examples": 3053, "dataset_name": "doc_red"}, "train_distant": {"name": "train_distant", "num_bytes": 3435087, "num_examples": 1000, "dataset_name": "doc_red"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1fDmfUUo5G7gfaoqWWvK81u08m71TK2g7": {"num_bytes": 4299810, "checksum": "85691c5ca1df0048bffab1c1cf53d7d35b5de40f3de0a2c563c03da28746d5cb"}, "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9": {"num_bytes": 13029595, "checksum": "7e706348a02cf91f38bd8c379f934ab61aedadc901fca10d962c1d82ab78e95b"}, "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum": {"num_bytes": 3674242, "checksum": "09386b5cb58249d8e087863c379ebd64557169c52ee502193d2f4f215e704ae8"}, "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download": {"num_bytes": 2452, "checksum": "5ecf4e5e55c179fc83a3a3d19baa01efffecb26ba5edc0b4ac5a54ddf61fe3de"}}, "download_size": 21006099, "dataset_size": 20127207, "size_in_bytes": 41133306}}
\ No newline at end of file
+{"default": {"description": "Multiple entities in a document generally exhibit complex inter-sentence relations, and cannot be well handled by existing relation extraction (RE) methods that typically focus on extracting intra-sentence relations for single entity pairs. In order to accelerate the research on document-level RE, we introduce DocRED, a new dataset constructed from Wikipedia and Wikidata with three features:\n    - DocRED annotates both named entities and relations, and is the largest human-annotated dataset for document-level RE from plain text.\n    - DocRED requires reading multiple sentences in a document to extract entities and infer their relations by synthesizing all information of the document.\n    - Along with the human-annotated data, we also offer large-scale distantly supervised data, which enables DocRED to be adopted for both supervised and weakly supervised scenarios.\n", "citation": "@inproceedings{yao2019DocRED,\n  title={{DocRED}: A Large-Scale Document-Level Relation Extraction Dataset},\n  author={Yao, Yuan and Ye, Deming and Li, Peng and Han, Xu and Lin, Yankai and Liu, Zhenghao and Liu,   Zhiyuan and Huang, Lixin and Zhou, Jie and Sun, Maosong},\n  booktitle={Proceedings of ACL 2019},\n  year={2019}\n}\n", "homepage": "https://github.com/thunlp/DocRED", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sents": {"feature": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "length": -1, "id": null, "_type": "Sequence"}, "vertexSet": [[{"name": {"dtype": "string", "id": null, "_type": "Value"}, "sent_id": {"dtype": "int32", "id": null, "_type": "Value"}, "pos": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "type": {"dtype": "string", "id": null, "_type": "Value"}}]], "labels": {"feature": {"head": {"dtype": "int32", "id": null, "_type": "Value"}, "tail": {"dtype": 
"int32", "id": null, "_type": "Value"}, "relation_id": {"dtype": "string", "id": null, "_type": "Value"}, "relation_text": {"dtype": "string", "id": null, "_type": "Value"}, "evidence": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "doc_red", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 3425030, "num_examples": 998, "dataset_name": "doc_red"}, "test": {"name": "test", "num_bytes": 2843877, "num_examples": 1000, "dataset_name": "doc_red"}, "train_annotated": {"name": "train_annotated", "num_bytes": 10413156, "num_examples": 3053, "dataset_name": "doc_red"}, "train_distant": {"name": "train_distant", "num_bytes": 346001876, "num_examples": 101873, "dataset_name": "doc_red"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1AHUm1-_V9GCtGuDcc8XrMUCJE8B-HHoL": {"num_bytes": 4287303, "checksum": "4554f7487a6fda3bab4d4e59432e065b7485dfb885bd7f05fd60fc7e93ee7e3e"}, "https://drive.google.com/uc?export=download&id=1Qr4Jct2IJ9BVI86_mCk_Pz0J32ww9dYw": {"num_bytes": 437046821, "checksum": "db6d3cdaab8d36926318bb9339f6fd82d19dbacd186c74d7c20c734355a58b36"}, "https://drive.google.com/uc?export=download&id=1NN33RzyETbanw4Dg2sRrhckhWpzuBQS9": {"num_bytes": 13029595, "checksum": "7e706348a02cf91f38bd8c379f934ab61aedadc901fca10d962c1d82ab78e95b"}, "https://drive.google.com/uc?export=download&id=1lAVDcD94Sigx7gR3jTfStI66o86cflum": {"num_bytes": 3674242, "checksum": "09386b5cb58249d8e087863c379ebd64557169c52ee502193d2f4f215e704ae8"}, "https://drive.google.com/uc?id=1y9A0zKrvETc1ddUFuFhBg3Xfr7FEL4dW&export=download": {"num_bytes": 2452, "checksum": "5ecf4e5e55c179fc83a3a3d19baa01efffecb26ba5edc0b4ac5a54ddf61fe3de"}}, 
"download_size": 458040413, "post_processing_size": null, "dataset_size": 362683939, "size_in_bytes": 820724352}}
\ No newline at end of file

From 22aca2a58d1592a91d24bedaf728eb996e252c88 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 9 Sep 2021 14:47:35 +0200
Subject: [PATCH 6/6] Update dataset card

---
 datasets/docred/README.md | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/datasets/docred/README.md b/datasets/docred/README.md
index f02679f84f6..fcda8e54448 100644
--- a/datasets/docred/README.md
+++ b/datasets/docred/README.md
@@ -1,10 +1,27 @@
 ---
+annotations_creators:
+- expert-generated
+language_creators:
+- crowdsourced
 languages:
 - en
+licenses:
+- mit
+multilinguality:
+- monolingual
 paperswithcode_id: docred
+pretty_name: DocRED
+size_categories:
+- 100K<n<1M
+source_datasets:
+- original
+task_categories:
+- text-retrieval
+task_ids:
+- entity-linking-retrieval
 ---
 
-# Dataset Card for "docred"
+# Dataset Card for DocRED
 
 ## Table of Contents
 - [Dataset Description](#dataset-description)
@@ -32,9 +49,8 @@ paperswithcode_id: docred
 
 ## Dataset Description
 
-- **Homepage:** [https://github.com/thunlp/DocRED](https://github.com/thunlp/DocRED)
-- **Repository:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
-- **Paper:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+- **Repository:** [https://github.com/thunlp/DocRED](https://github.com/thunlp/DocRED)
+- **Paper:** [DocRED: A Large-Scale Document-Level Relation Extraction Dataset](https://arxiv.org/abs/1906.06127)
 - **Point of Contact:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
 - **Size of downloaded dataset files:** 20.03 MB
 - **Size of the generated dataset:** 19.19 MB
@@ -121,7 +137,7 @@ The data fields are the same among all splits.
 
 | name  |train_annotated|train_distant|validation|test|
 |-------|--------------:|------------:|---------:|---:|
-|default|           3053|         1000|      1000|1000|
+|default|           3053|       101873|       998|1000|
 
 ## Dataset Creation