Update data URLs in fever dataset (#4455)

albertvillanova · web-flow · commit 5de20eebe16d · 2022-06-08T09:16:16.000+02:00
* Update data URLs in fever dataset

* Update metadata JSON
diff --git a/datasets/fever/dataset_infos.json b/datasets/fever/dataset_infos.json
@@ -1 +1 @@
-{"v1.0": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nFEVER  V1.0", "citation": "\n@inproceedings{Thorne18Fever,\n    author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n    title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n    booktitle = {NAACL-HLT},\n    year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}, "claim": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_annotation_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_wiki_url": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_sentence_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "fever", "config_name": "v1.0", "version": {"version_str": "1.0.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 29747512, "num_examples": 311431, "dataset_name": "fever"}, "unlabelled_test": {"name": "unlabelled_test", "num_bytes": 1627026, "num_examples": 19998, "dataset_name": "fever"}, "unlabelled_dev": {"name": "unlabelled_dev", "num_bytes": 1558989, "num_examples": 19998, "dataset_name": "fever"}, "labelled_dev": {"name": "labelled_dev", "num_bytes": 3661989, "num_examples": 37566, "dataset_name": "fever"}, "paper_dev": {"name": "paper_dev", "num_bytes": 1831013, "num_examples": 18999, "dataset_name": "fever"}, "paper_test": {"name": "paper_test", "num_bytes": 1830976, "num_examples": 18567, "dataset_name": "fever"}}, "download_checksums": {"https://s3-eu-west-1.amazonaws.com/fever.public/train.jsonl": {"num_bytes": 33024303, "checksum": "eba7e8f87076753f8494718b9a857827af7bf73e76c9e4b75420207d26e588b6"}, "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl": {"num_bytes": 4349935, "checksum": "e89865bfe1b4dd054e03dd57d7241a6fde24862905f31117cf0cd719f7c78df7"}, "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev_public.jsonl": {"num_bytes": 1530640, "checksum": "acda01ae5ee7e75c73909a665f465cec20704ea26e9d676cd7423ff2c8ab0e8b"}, "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_test.jsonl": {"num_bytes": 1599159, "checksum": "76dd0872d8fa1f49efe1194fe8a88b7dd4c715c77d87a142b615d4be583e1e51"}, "https://s3-eu-west-1.amazonaws.com/fever.public/paper_dev.jsonl": {"num_bytes": 2168767, "checksum": "41158707810008747946bf23471e82df53e77a513524b9e3ec1c2e674ef5ef8c"}, "https://s3-eu-west-1.amazonaws.com/fever.public/paper_test.jsonl": {"num_bytes": 2181168, "checksum": "fb7b0280a0adc2302bbb29bfb7af37274fa585de3171bcf908f180642d11d88e"}}, "download_size": 44853972, "dataset_size": 40257505, "size_in_bytes": 85111477}, "v2.0": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nFEVER  V2.0", "citation": "\n@inproceedings{Thorne18Fever,\n    author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n    title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n    booktitle = {NAACL-HLT},\n    year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}, "claim": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_annotation_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_wiki_url": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_sentence_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "fever", "config_name": "v2.0", "version": {"version_str": "2.0.0", "description": "", "datasets_version_to_prepare": null, "major": 2, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 307447, "num_examples": 2384, "dataset_name": "fever"}}, "download_checksums": {"https://s3-eu-west-1.amazonaws.com/fever.public/fever2-fixers-dev.jsonl": {"num_bytes": 392466, "checksum": "43c3df77cf9bf6022b9356ed1d66df6d8a9a0126c4e4b8d155742e3a9988c814"}}, "download_size": 392466, "dataset_size": 307447, "size_in_bytes": 699913}, "wiki_pages": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nWikipedia pages", "citation": "\n@inproceedings{Thorne18Fever,\n    author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n    title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n    booktitle = {NAACL-HLT},\n    year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "lines": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "fever", "config_name": "wiki_pages", "version": "0.0.0", "splits": {"wikipedia_pages": {"name": "wikipedia_pages", "num_bytes": 7256829814, "num_examples": 5416537, "dataset_name": "fever"}}, "download_checksums": {"https://s3-eu-west-1.amazonaws.com/fever.public/wiki-pages.zip": {"num_bytes": 1713485474, "checksum": "4b06d95da6adf7fe02d2796176c670dacccb21348da89cba4c50676ab99665f2"}}, "download_size": 1713485474, "dataset_size": 7256829814, "size_in_bytes": 8970315288}}
+{"v1.0": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nFEVER  V1.0", "citation": "\n@inproceedings{Thorne18Fever,\n    author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n    title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n    booktitle = {NAACL-HLT},\n    year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}, "claim": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_annotation_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_wiki_url": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_sentence_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "fever", "config_name": "v1.0", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 29591412, "num_examples": 311431, "dataset_name": "fever"}, "unlabelled_test": {"name": "unlabelled_test", "num_bytes": 1617002, "num_examples": 19998, "dataset_name": "fever"}, "unlabelled_dev": {"name": "unlabelled_dev", "num_bytes": 1548965, "num_examples": 19998, "dataset_name": "fever"}, "labelled_dev": {"name": "labelled_dev", "num_bytes": 3643157, "num_examples": 37566, "dataset_name": "fever"}, "paper_dev": {"name": "paper_dev", "num_bytes": 1821489, "num_examples": 18999, "dataset_name": "fever"}, "paper_test": {"name": "paper_test", "num_bytes": 1821668, "num_examples": 18567, "dataset_name": "fever"}}, "download_checksums": {"https://fever.ai/download/fever/train.jsonl": {"num_bytes": 33024303, "checksum": "eba7e8f87076753f8494718b9a857827af7bf73e76c9e4b75420207d26e588b6"}, "https://fever.ai/download/fever/shared_task_dev.jsonl": {"num_bytes": 4349935, "checksum": "e89865bfe1b4dd054e03dd57d7241a6fde24862905f31117cf0cd719f7c78df7"}, "https://fever.ai/download/fever/shared_task_dev_public.jsonl": {"num_bytes": 1530640, "checksum": "acda01ae5ee7e75c73909a665f465cec20704ea26e9d676cd7423ff2c8ab0e8b"}, "https://fever.ai/download/fever/shared_task_test.jsonl": {"num_bytes": 1599159, "checksum": "76dd0872d8fa1f49efe1194fe8a88b7dd4c715c77d87a142b615d4be583e1e51"}, "https://fever.ai/download/fever/paper_dev.jsonl": {"num_bytes": 2168767, "checksum": "41158707810008747946bf23471e82df53e77a513524b9e3ec1c2e674ef5ef8c"}, "https://fever.ai/download/fever/paper_test.jsonl": {"num_bytes": 2181168, "checksum": "fb7b0280a0adc2302bbb29bfb7af37274fa585de3171bcf908f180642d11d88e"}}, "download_size": 44853972, "post_processing_size": null, "dataset_size": 40043693, "size_in_bytes": 84897665}, "v2.0": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nFEVER  V2.0", "citation": "\n@inproceedings{Thorne18Fever,\n    author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n    title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n    booktitle = {NAACL-HLT},\n    year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}, "claim": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_annotation_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_wiki_url": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_sentence_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "fever", "config_name": "v2.0", "version": {"version_str": "2.0.0", "description": "", "major": 2, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 306243, "num_examples": 2384, "dataset_name": "fever"}}, "download_checksums": {"https://fever.ai/download/fever2.0/fever2-fixers-dev.jsonl": {"num_bytes": 392466, "checksum": "43c3df77cf9bf6022b9356ed1d66df6d8a9a0126c4e4b8d155742e3a9988c814"}}, "download_size": 392466, "post_processing_size": null, "dataset_size": 306243, "size_in_bytes": 698709}, "wiki_pages": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nWikipedia pages", "citation": "\n@inproceedings{Thorne18Fever,\n    author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n    title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n    booktitle = {NAACL-HLT},\n    year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "lines": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "fever", "config_name": "wiki_pages", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"wikipedia_pages": {"name": "wikipedia_pages", "num_bytes": 7254115038, "num_examples": 5416537, "dataset_name": "fever"}}, "download_checksums": {"https://fever.ai/download/fever/wiki-pages.zip": {"num_bytes": 1713485474, "checksum": "4b06d95da6adf7fe02d2796176c670dacccb21348da89cba4c50676ab99665f2"}}, "download_size": 1713485474, "post_processing_size": null, "dataset_size": 7254115038, "size_in_bytes": 8967600512}}
diff --git a/datasets/fever/fever.py b/datasets/fever/fever.py
@@ -101,7 +101,8 @@ def _info(self):
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
         if self.config.name == "v2.0":
-            urls = "https://s3-eu-west-1.amazonaws.com/fever.public/fever2-fixers-dev.jsonl"
+            base_url = "https://fever.ai/download/fever2.0"
+            urls = f"{base_url}/fever2-fixers-dev.jsonl"
             dl_path = dl_manager.download_and_extract(urls)
             return [
                 datasets.SplitGenerator(
@@ -112,13 +113,14 @@ def _split_generators(self, dl_manager):
                 )
             ]
         elif self.config.name == "v1.0":
+            base_url = "https://fever.ai/download/fever"
             urls = {
-                "train": "https://s3-eu-west-1.amazonaws.com/fever.public/train.jsonl",
-                "labelled_dev": "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl",
-                "unlabelled_dev": "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev_public.jsonl",
-                "unlabelled_test": "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_test.jsonl",
-                "paper_dev": "https://s3-eu-west-1.amazonaws.com/fever.public/paper_dev.jsonl",
-                "paper_test": "https://s3-eu-west-1.amazonaws.com/fever.public/paper_test.jsonl",
+                "train": f"{base_url}/train.jsonl",
+                "labelled_dev": f"{base_url}/shared_task_dev.jsonl",
+                "unlabelled_dev": f"{base_url}/shared_task_dev_public.jsonl",
+                "unlabelled_test": f"{base_url}/shared_task_test.jsonl",
+                "paper_dev": f"{base_url}/paper_dev.jsonl",
+                "paper_test": f"{base_url}/paper_test.jsonl",
             }
             dl_path = dl_manager.download_and_extract(urls)
             return [
@@ -160,7 +162,8 @@ def _split_generators(self, dl_manager):
                 ),
             ]
         elif self.config.name == "wiki_pages":
-            urls = "https://s3-eu-west-1.amazonaws.com/fever.public/wiki-pages.zip"
+            base_url = "https://fever.ai/download/fever"
+            urls = f"{base_url}/wiki-pages.zip"
             dl_path = dl_manager.download_and_extract(urls)
             files = sorted(os.listdir(os.path.join(dl_path, "wiki-pages")))
             file_paths = [os.path.join(dl_path, "wiki-pages", file) for file in files]

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		-{"v1.0": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nFEVER V1.0", "citation": "\n@inproceedings{Thorne18Fever,\n author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n booktitle = {NAACL-HLT},\n year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}, "claim": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_annotation_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_wiki_url": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_sentence_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "fever", "config_name": "v1.0", "version": {"version_str": "1.0.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 29747512, "num_examples": 311431, "dataset_name": "fever"}, "unlabelled_test": {"name": "unlabelled_test", "num_bytes": 1627026, "num_examples": 19998, "dataset_name": "fever"}, "unlabelled_dev": {"name": "unlabelled_dev", "num_bytes": 1558989, "num_examples": 19998, "dataset_name": "fever"}, "labelled_dev": {"name": "labelled_dev", "num_bytes": 3661989, "num_examples": 37566, "dataset_name": "fever"}, "paper_dev": {"name": "paper_dev", "num_bytes": 1831013, "num_examples": 18999, "dataset_name": "fever"}, "paper_test": {"name": "paper_test", "num_bytes": 1830976, "num_examples": 18567, "dataset_name": "fever"}}, "download_checksums": {"https://s3-eu-west-1.amazonaws.com/fever.public/train.jsonl": {"num_bytes": 33024303, "checksum": "eba7e8f87076753f8494718b9a857827af7bf73e76c9e4b75420207d26e588b6"}, "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl": {"num_bytes": 4349935, "checksum": "e89865bfe1b4dd054e03dd57d7241a6fde24862905f31117cf0cd719f7c78df7"}, "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev_public.jsonl": {"num_bytes": 1530640, "checksum": "acda01ae5ee7e75c73909a665f465cec20704ea26e9d676cd7423ff2c8ab0e8b"}, "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_test.jsonl": {"num_bytes": 1599159, "checksum": "76dd0872d8fa1f49efe1194fe8a88b7dd4c715c77d87a142b615d4be583e1e51"}, "https://s3-eu-west-1.amazonaws.com/fever.public/paper_dev.jsonl": {"num_bytes": 2168767, "checksum": "41158707810008747946bf23471e82df53e77a513524b9e3ec1c2e674ef5ef8c"}, "https://s3-eu-west-1.amazonaws.com/fever.public/paper_test.jsonl": {"num_bytes": 2181168, "checksum": "fb7b0280a0adc2302bbb29bfb7af37274fa585de3171bcf908f180642d11d88e"}}, "download_size": 44853972, "dataset_size": 40257505, "size_in_bytes": 85111477}, "v2.0": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nFEVER V2.0", "citation": "\n@inproceedings{Thorne18Fever,\n author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n booktitle = {NAACL-HLT},\n year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}, "claim": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_annotation_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_wiki_url": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_sentence_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "fever", "config_name": "v2.0", "version": {"version_str": "2.0.0", "description": "", "datasets_version_to_prepare": null, "major": 2, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 307447, "num_examples": 2384, "dataset_name": "fever"}}, "download_checksums": {"https://s3-eu-west-1.amazonaws.com/fever.public/fever2-fixers-dev.jsonl": {"num_bytes": 392466, "checksum": "43c3df77cf9bf6022b9356ed1d66df6d8a9a0126c4e4b8d155742e3a9988c814"}}, "download_size": 392466, "dataset_size": 307447, "size_in_bytes": 699913}, "wiki_pages": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nWikipedia pages", "citation": "\n@inproceedings{Thorne18Fever,\n author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n booktitle = {NAACL-HLT},\n year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "lines": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "fever", "config_name": "wiki_pages", "version": "0.0.0", "splits": {"wikipedia_pages": {"name": "wikipedia_pages", "num_bytes": 7256829814, "num_examples": 5416537, "dataset_name": "fever"}}, "download_checksums": {"https://s3-eu-west-1.amazonaws.com/fever.public/wiki-pages.zip": {"num_bytes": 1713485474, "checksum": "4b06d95da6adf7fe02d2796176c670dacccb21348da89cba4c50676ab99665f2"}}, "download_size": 1713485474, "dataset_size": 7256829814, "size_in_bytes": 8970315288}}
	`1`	+{"v1.0": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nFEVER V1.0", "citation": "\n@inproceedings{Thorne18Fever,\n author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n booktitle = {NAACL-HLT},\n year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}, "claim": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_annotation_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_wiki_url": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_sentence_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "fever", "config_name": "v1.0", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 29591412, "num_examples": 311431, "dataset_name": "fever"}, "unlabelled_test": {"name": "unlabelled_test", "num_bytes": 1617002, "num_examples": 19998, "dataset_name": "fever"}, "unlabelled_dev": {"name": "unlabelled_dev", "num_bytes": 1548965, "num_examples": 19998, "dataset_name": "fever"}, "labelled_dev": {"name": "labelled_dev", "num_bytes": 3643157, "num_examples": 37566, "dataset_name": "fever"}, "paper_dev": {"name": "paper_dev", "num_bytes": 1821489, "num_examples": 18999, "dataset_name": "fever"}, "paper_test": {"name": "paper_test", "num_bytes": 1821668, "num_examples": 18567, "dataset_name": "fever"}}, "download_checksums": {"https://fever.ai/download/fever/train.jsonl": {"num_bytes": 33024303, "checksum": "eba7e8f87076753f8494718b9a857827af7bf73e76c9e4b75420207d26e588b6"}, "https://fever.ai/download/fever/shared_task_dev.jsonl": {"num_bytes": 4349935, "checksum": "e89865bfe1b4dd054e03dd57d7241a6fde24862905f31117cf0cd719f7c78df7"}, "https://fever.ai/download/fever/shared_task_dev_public.jsonl": {"num_bytes": 1530640, "checksum": "acda01ae5ee7e75c73909a665f465cec20704ea26e9d676cd7423ff2c8ab0e8b"}, "https://fever.ai/download/fever/shared_task_test.jsonl": {"num_bytes": 1599159, "checksum": "76dd0872d8fa1f49efe1194fe8a88b7dd4c715c77d87a142b615d4be583e1e51"}, "https://fever.ai/download/fever/paper_dev.jsonl": {"num_bytes": 2168767, "checksum": "41158707810008747946bf23471e82df53e77a513524b9e3ec1c2e674ef5ef8c"}, "https://fever.ai/download/fever/paper_test.jsonl": {"num_bytes": 2181168, "checksum": "fb7b0280a0adc2302bbb29bfb7af37274fa585de3171bcf908f180642d11d88e"}}, "download_size": 44853972, "post_processing_size": null, "dataset_size": 40043693, "size_in_bytes": 84897665}, "v2.0": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nFEVER V2.0", "citation": "\n@inproceedings{Thorne18Fever,\n author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n booktitle = {NAACL-HLT},\n year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}, "claim": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_annotation_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_wiki_url": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_sentence_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "fever", "config_name": "v2.0", "version": {"version_str": "2.0.0", "description": "", "major": 2, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 306243, "num_examples": 2384, "dataset_name": "fever"}}, "download_checksums": {"https://fever.ai/download/fever2.0/fever2-fixers-dev.jsonl": {"num_bytes": 392466, "checksum": "43c3df77cf9bf6022b9356ed1d66df6d8a9a0126c4e4b8d155742e3a9988c814"}}, "download_size": 392466, "post_processing_size": null, "dataset_size": 306243, "size_in_bytes": 698709}, "wiki_pages": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nWikipedia pages", "citation": "\n@inproceedings{Thorne18Fever,\n author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n booktitle = {NAACL-HLT},\n year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "lines": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "fever", "config_name": "wiki_pages", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"wikipedia_pages": {"name": "wikipedia_pages", "num_bytes": 7254115038, "num_examples": 5416537, "dataset_name": "fever"}}, "download_checksums": {"https://fever.ai/download/fever/wiki-pages.zip": {"num_bytes": 1713485474, "checksum": "4b06d95da6adf7fe02d2796176c670dacccb21348da89cba4c50676ab99665f2"}}, "download_size": 1713485474, "post_processing_size": null, "dataset_size": 7254115038, "size_in_bytes": 8967600512}}