huggingface · lhoestq · Jun 28, 2021 · Jun 23, 2021 · Jun 28, 2021
diff --git a/datasets/discofuse/dataset_infos.json b/datasets/discofuse/dataset_infos.json
@@ -1 +1 @@
-{"discofuse-sport": {"description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.\n", "citation": "@InProceedings{GevaEtAl2019,\n  title = {DiscoFuse: A Large-Scale Dataset for Discourse-Based Sentence Fusion},\n  author = {Geva, Mor and Malmi, Eric and Szpektor, Idan and Berant, Jonathan},\n  booktitle = {Proceedings of the 2019 Annual Conference of the North American Chapter of the Association for Computational Linguistics},\n  note = {arXiv preprint arXiv:1902.10526},\n  year = {2019}\n}\n\n", "homepage": "https://github.com/google-research-datasets/discofuse", "license": "", "features": {"connective_string": {"dtype": "string", "id": null, "_type": "Value"}, "discourse_type": {"dtype": "string", "id": null, "_type": "Value"}, "coherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_pronoun": {"dtype": "float32", "id": null, "_type": "Value"}, "incoherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "incoherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_nominal": {"dtype": "float32", "id": null, "_type": "Value"}, "coherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "discofuse", "config_name": "discofuse-sport", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 14736279993, "num_examples": 43291020, "dataset_name": "discofuse"}, "test": {"name": "test", "num_bytes": 151656323, "num_examples": 445521, "dataset_name": "discofuse"}, "validation": {"name": "validation", "num_bytes": 150207737, "num_examples": 440902, "dataset_name": "discofuse"}}, "download_checksums": {"https://storage.googleapis.com/discofuse.appspot.com/discofuse_v1_sports.zip": {"num_bytes": 4326637746, "checksum": "a390083c7923e11efeeea04a9a79074149e5ef9be614466f50aec28f1a5eec41"}}, "download_size": 4326637746, "post_processing_size": null, "dataset_size": 15038144053, "size_in_bytes": 19364781799}, "discofuse-wikipedia": {"description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.\n", "citation": "@InProceedings{GevaEtAl2019,\n  title = {DiscoFuse: A Large-Scale Dataset for Discourse-Based Sentence Fusion},\n  author = {Geva, Mor and Malmi, Eric and Szpektor, Idan and Berant, Jonathan},\n  booktitle = {Proceedings of the 2019 Annual Conference of the North American Chapter of the Association for Computational Linguistics},\n  note = {arXiv preprint arXiv:1902.10526},\n  year = {2019}\n}\n\n", "homepage": "https://github.com/google-research-datasets/discofuse", "license": "", "features": {"connective_string": {"dtype": "string", "id": null, "_type": "Value"}, "discourse_type": {"dtype": "string", "id": null, "_type": "Value"}, "coherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_pronoun": {"dtype": "float32", "id": null, "_type": "Value"}, "incoherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "incoherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_nominal": {"dtype": "float32", "id": null, "_type": "Value"}, "coherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "discofuse", "config_name": "discofuse-wikipedia", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6377924196, "num_examples": 16310585, "dataset_name": "discofuse"}, "test": {"name": "test", "num_bytes": 64008158, "num_examples": 163657, "dataset_name": "discofuse"}, "validation": {"name": "validation", "num_bytes": 65682035, "num_examples": 168081, "dataset_name": "discofuse"}}, "download_checksums": {"https://storage.googleapis.com/discofuse.appspot.com/discofuse_v1_wikipedia.zip": {"num_bytes": 1717422334, "checksum": "e8a5ec52cdd9820ce9b410b47c3a57a49a300470c976202cd7caab613658ebfe"}}, "download_size": 1717422334, "post_processing_size": null, "dataset_size": 6507614389, "size_in_bytes": 8225036723}}
+{"discofuse-sport": {"description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.\n", "citation": "@InProceedings{GevaEtAl2019,\n  title = {DiscoFuse: A Large-Scale Dataset for Discourse-Based Sentence Fusion},\n  author = {Geva, Mor and Malmi, Eric and Szpektor, Idan and Berant, Jonathan},\n  booktitle = {Proceedings of the 2019 Annual Conference of the North American Chapter of the Association for Computational Linguistics},\n  note = {arXiv preprint arXiv:1902.10526},\n  year = {2019}\n}\n\n", "homepage": "https://github.com/google-research-datasets/discofuse", "license": "", "features": {"connective_string": {"dtype": "string", "id": null, "_type": "Value"}, "discourse_type": {"dtype": "string", "id": null, "_type": "Value"}, "coherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_pronoun": {"dtype": "float32", "id": null, "_type": "Value"}, "incoherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "incoherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_nominal": {"dtype": "float32", "id": null, "_type": "Value"}, "coherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "discofuse", "config_name": "discofuse-sport", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 14736279993, "num_examples": 43291020, "dataset_name": "discofuse"}, "test": {"name": "test", "num_bytes": 151656323, "num_examples": 445521, "dataset_name": "discofuse"}, "validation": {"name": "validation", "num_bytes": 150207737, "num_examples": 440902, "dataset_name": "discofuse"}}, "download_checksums": {"https://storage.googleapis.com/gresearch/discofuse/discofuse_v1_sports.zip": {"num_bytes": 4326637746, "checksum": "a390083c7923e11efeeea04a9a79074149e5ef9be614466f50aec28f1a5eec41"}}, "download_size": 4326637746, "post_processing_size": null, "dataset_size": 15038144053, "size_in_bytes": 19364781799}, "discofuse-wikipedia": {"description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.\n", "citation": "@InProceedings{GevaEtAl2019,\n  title = {DiscoFuse: A Large-Scale Dataset for Discourse-Based Sentence Fusion},\n  author = {Geva, Mor and Malmi, Eric and Szpektor, Idan and Berant, Jonathan},\n  booktitle = {Proceedings of the 2019 Annual Conference of the North American Chapter of the Association for Computational Linguistics},\n  note = {arXiv preprint arXiv:1902.10526},\n  year = {2019}\n}\n\n", "homepage": "https://github.com/google-research-datasets/discofuse", "license": "", "features": {"connective_string": {"dtype": "string", "id": null, "_type": "Value"}, "discourse_type": {"dtype": "string", "id": null, "_type": "Value"}, "coherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_pronoun": {"dtype": "float32", "id": null, "_type": "Value"}, "incoherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "incoherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_nominal": {"dtype": "float32", "id": null, "_type": "Value"}, "coherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "discofuse", "config_name": "discofuse-wikipedia", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6377924196, "num_examples": 16310585, "dataset_name": "discofuse"}, "test": {"name": "test", "num_bytes": 64008158, "num_examples": 163657, "dataset_name": "discofuse"}, "validation": {"name": "validation", "num_bytes": 65682035, "num_examples": 168081, "dataset_name": "discofuse"}}, "download_checksums": {"https://storage.googleapis.com/gresearch/discofuse/discofuse_v1_wikipedia.zip": {"num_bytes": 1717422334, "checksum": "e8a5ec52cdd9820ce9b410b47c3a57a49a300470c976202cd7caab613658ebfe"}}, "download_size": 1717422334, "post_processing_size": null, "dataset_size": 6507614389, "size_in_bytes": 8225036723}}
diff --git a/datasets/discofuse/discofuse.py b/datasets/discofuse/discofuse.py
@@ -7,7 +7,7 @@
 import datasets
 
 
-_URL_ = "https://storage.googleapis.com/discofuse.appspot.com/"
+_URL_ = "https://storage.googleapis.com/gresearch/discofuse/"
 _CITATION = """\
 @InProceedings{GevaEtAl2019,
   title = {DiscoFuse: A Large-Scale Dataset for Discourse-Based Sentence Fusion},
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"discofuse-sport": {"description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.\n", "citation": "@InProceedings{GevaEtAl2019,\n title = {DiscoFuse: A Large-Scale Dataset for Discourse-Based Sentence Fusion},\n author = {Geva, Mor and Malmi, Eric and Szpektor, Idan and Berant, Jonathan},\n booktitle = {Proceedings of the 2019 Annual Conference of the North American Chapter of the Association for Computational Linguistics},\n note = {arXiv preprint arXiv:1902.10526},\n year = {2019}\n}\n\n", "homepage": "https://github.com/google-research-datasets/discofuse", "license": "", "features": {"connective_string": {"dtype": "string", "id": null, "_type": "Value"}, "discourse_type": {"dtype": "string", "id": null, "_type": "Value"}, "coherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_pronoun": {"dtype": "float32", "id": null, "_type": "Value"}, "incoherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "incoherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_nominal": {"dtype": "float32", "id": null, "_type": "Value"}, "coherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "discofuse", "config_name": "discofuse-sport", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 14736279993, "num_examples": 43291020, "dataset_name": "discofuse"}, "test": {"name": "test", "num_bytes": 151656323, "num_examples": 445521, "dataset_name": "discofuse"}, "validation": {"name": "validation", "num_bytes": 150207737, "num_examples": 440902, "dataset_name": "discofuse"}}, "download_checksums": {"https://storage.googleapis.com/discofuse.appspot.com/discofuse_v1_sports.zip": {"num_bytes": 4326637746, "checksum": "a390083c7923e11efeeea04a9a79074149e5ef9be614466f50aec28f1a5eec41"}}, "download_size": 4326637746, "post_processing_size": null, "dataset_size": 15038144053, "size_in_bytes": 19364781799}, "discofuse-wikipedia": {"description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.\n", "citation": "@InProceedings{GevaEtAl2019,\n title = {DiscoFuse: A Large-Scale Dataset for Discourse-Based Sentence Fusion},\n author = {Geva, Mor and Malmi, Eric and Szpektor, Idan and Berant, Jonathan},\n booktitle = {Proceedings of the 2019 Annual Conference of the North American Chapter of the Association for Computational Linguistics},\n note = {arXiv preprint arXiv:1902.10526},\n year = {2019}\n}\n\n", "homepage": "https://github.com/google-research-datasets/discofuse", "license": "", "features": {"connective_string": {"dtype": "string", "id": null, "_type": "Value"}, "discourse_type": {"dtype": "string", "id": null, "_type": "Value"}, "coherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_pronoun": {"dtype": "float32", "id": null, "_type": "Value"}, "incoherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "incoherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_nominal": {"dtype": "float32", "id": null, "_type": "Value"}, "coherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "discofuse", "config_name": "discofuse-wikipedia", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6377924196, "num_examples": 16310585, "dataset_name": "discofuse"}, "test": {"name": "test", "num_bytes": 64008158, "num_examples": 163657, "dataset_name": "discofuse"}, "validation": {"name": "validation", "num_bytes": 65682035, "num_examples": 168081, "dataset_name": "discofuse"}}, "download_checksums": {"https://storage.googleapis.com/discofuse.appspot.com/discofuse_v1_wikipedia.zip": {"num_bytes": 1717422334, "checksum": "e8a5ec52cdd9820ce9b410b47c3a57a49a300470c976202cd7caab613658ebfe"}}, "download_size": 1717422334, "post_processing_size": null, "dataset_size": 6507614389, "size_in_bytes": 8225036723}}
		{"discofuse-sport": {"description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.\n", "citation": "@InProceedings{GevaEtAl2019,\n title = {DiscoFuse: A Large-Scale Dataset for Discourse-Based Sentence Fusion},\n author = {Geva, Mor and Malmi, Eric and Szpektor, Idan and Berant, Jonathan},\n booktitle = {Proceedings of the 2019 Annual Conference of the North American Chapter of the Association for Computational Linguistics},\n note = {arXiv preprint arXiv:1902.10526},\n year = {2019}\n}\n\n", "homepage": "https://github.com/google-research-datasets/discofuse", "license": "", "features": {"connective_string": {"dtype": "string", "id": null, "_type": "Value"}, "discourse_type": {"dtype": "string", "id": null, "_type": "Value"}, "coherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_pronoun": {"dtype": "float32", "id": null, "_type": "Value"}, "incoherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "incoherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_nominal": {"dtype": "float32", "id": null, "_type": "Value"}, "coherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "discofuse", "config_name": "discofuse-sport", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 14736279993, "num_examples": 43291020, "dataset_name": "discofuse"}, "test": {"name": "test", "num_bytes": 151656323, "num_examples": 445521, "dataset_name": "discofuse"}, "validation": {"name": "validation", "num_bytes": 150207737, "num_examples": 440902, "dataset_name": "discofuse"}}, "download_checksums": {"https://storage.googleapis.com/gresearch/discofuse/discofuse_v1_sports.zip": {"num_bytes": 4326637746, "checksum": "a390083c7923e11efeeea04a9a79074149e5ef9be614466f50aec28f1a5eec41"}}, "download_size": 4326637746, "post_processing_size": null, "dataset_size": 15038144053, "size_in_bytes": 19364781799}, "discofuse-wikipedia": {"description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.\n", "citation": "@InProceedings{GevaEtAl2019,\n title = {DiscoFuse: A Large-Scale Dataset for Discourse-Based Sentence Fusion},\n author = {Geva, Mor and Malmi, Eric and Szpektor, Idan and Berant, Jonathan},\n booktitle = {Proceedings of the 2019 Annual Conference of the North American Chapter of the Association for Computational Linguistics},\n note = {arXiv preprint arXiv:1902.10526},\n year = {2019}\n}\n\n", "homepage": "https://github.com/google-research-datasets/discofuse", "license": "", "features": {"connective_string": {"dtype": "string", "id": null, "_type": "Value"}, "discourse_type": {"dtype": "string", "id": null, "_type": "Value"}, "coherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_pronoun": {"dtype": "float32", "id": null, "_type": "Value"}, "incoherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "incoherent_second_sentence": {"dtype": "string", "id": null, "_type": "Value"}, "has_coref_type_nominal": {"dtype": "float32", "id": null, "_type": "Value"}, "coherent_first_sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "discofuse", "config_name": "discofuse-wikipedia", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6377924196, "num_examples": 16310585, "dataset_name": "discofuse"}, "test": {"name": "test", "num_bytes": 64008158, "num_examples": 163657, "dataset_name": "discofuse"}, "validation": {"name": "validation", "num_bytes": 65682035, "num_examples": 168081, "dataset_name": "discofuse"}}, "download_checksums": {"https://storage.googleapis.com/gresearch/discofuse/discofuse_v1_wikipedia.zip": {"num_bytes": 1717422334, "checksum": "e8a5ec52cdd9820ce9b410b47c3a57a49a300470c976202cd7caab613658ebfe"}}, "download_size": 1717422334, "post_processing_size": null, "dataset_size": 6507614389, "size_in_bytes": 8225036723}}