Skip to content

Commit 5de20ee

Browse files
Update data URLs in fever dataset (#4455)
* Update data URLs in fever dataset * Update metadata JSON
1 parent 650fb02 commit 5de20ee

File tree

2 files changed

+12
-9
lines changed

2 files changed

+12
-9
lines changed

datasets/fever/dataset_infos.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"v1.0": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nFEVER V1.0", "citation": "\n@inproceedings{Thorne18Fever,\n author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n booktitle = {NAACL-HLT},\n year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}, "claim": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_annotation_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_wiki_url": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_sentence_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "fever", "config_name": "v1.0", "version": {"version_str": "1.0.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 29747512, "num_examples": 311431, "dataset_name": "fever"}, "unlabelled_test": {"name": "unlabelled_test", "num_bytes": 1627026, "num_examples": 19998, "dataset_name": "fever"}, "unlabelled_dev": {"name": "unlabelled_dev", "num_bytes": 1558989, "num_examples": 19998, "dataset_name": "fever"}, "labelled_dev": {"name": "labelled_dev", "num_bytes": 3661989, "num_examples": 37566, "dataset_name": "fever"}, "paper_dev": {"name": "paper_dev", "num_bytes": 1831013, "num_examples": 18999, "dataset_name": "fever"}, "paper_test": {"name": "paper_test", "num_bytes": 1830976, "num_examples": 18567, "dataset_name": "fever"}}, "download_checksums": {"https://s3-eu-west-1.amazonaws.com/fever.public/train.jsonl": {"num_bytes": 33024303, "checksum": "eba7e8f87076753f8494718b9a857827af7bf73e76c9e4b75420207d26e588b6"}, "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl": {"num_bytes": 4349935, "checksum": "e89865bfe1b4dd054e03dd57d7241a6fde24862905f31117cf0cd719f7c78df7"}, "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev_public.jsonl": {"num_bytes": 1530640, "checksum": "acda01ae5ee7e75c73909a665f465cec20704ea26e9d676cd7423ff2c8ab0e8b"}, "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_test.jsonl": {"num_bytes": 1599159, "checksum": "76dd0872d8fa1f49efe1194fe8a88b7dd4c715c77d87a142b615d4be583e1e51"}, "https://s3-eu-west-1.amazonaws.com/fever.public/paper_dev.jsonl": {"num_bytes": 2168767, "checksum": "41158707810008747946bf23471e82df53e77a513524b9e3ec1c2e674ef5ef8c"}, "https://s3-eu-west-1.amazonaws.com/fever.public/paper_test.jsonl": {"num_bytes": 2181168, "checksum": "fb7b0280a0adc2302bbb29bfb7af37274fa585de3171bcf908f180642d11d88e"}}, "download_size": 44853972, "dataset_size": 40257505, "size_in_bytes": 85111477}, "v2.0": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nFEVER V2.0", "citation": "\n@inproceedings{Thorne18Fever,\n author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n booktitle = {NAACL-HLT},\n year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}, "claim": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_annotation_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_wiki_url": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_sentence_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "fever", "config_name": "v2.0", "version": {"version_str": "2.0.0", "description": "", "datasets_version_to_prepare": null, "major": 2, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 307447, "num_examples": 2384, "dataset_name": "fever"}}, "download_checksums": {"https://s3-eu-west-1.amazonaws.com/fever.public/fever2-fixers-dev.jsonl": {"num_bytes": 392466, "checksum": "43c3df77cf9bf6022b9356ed1d66df6d8a9a0126c4e4b8d155742e3a9988c814"}}, "download_size": 392466, "dataset_size": 307447, "size_in_bytes": 699913}, "wiki_pages": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nWikipedia pages", "citation": "\n@inproceedings{Thorne18Fever,\n author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n booktitle = {NAACL-HLT},\n year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "lines": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "fever", "config_name": "wiki_pages", "version": "0.0.0", "splits": {"wikipedia_pages": {"name": "wikipedia_pages", "num_bytes": 7256829814, "num_examples": 5416537, "dataset_name": "fever"}}, "download_checksums": {"https://s3-eu-west-1.amazonaws.com/fever.public/wiki-pages.zip": {"num_bytes": 1713485474, "checksum": "4b06d95da6adf7fe02d2796176c670dacccb21348da89cba4c50676ab99665f2"}}, "download_size": 1713485474, "dataset_size": 7256829814, "size_in_bytes": 8970315288}}
1+
{"v1.0": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nFEVER V1.0", "citation": "\n@inproceedings{Thorne18Fever,\n author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n booktitle = {NAACL-HLT},\n year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}, "claim": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_annotation_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_wiki_url": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_sentence_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "fever", "config_name": "v1.0", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 29591412, "num_examples": 311431, "dataset_name": "fever"}, "unlabelled_test": {"name": "unlabelled_test", "num_bytes": 1617002, "num_examples": 19998, "dataset_name": "fever"}, "unlabelled_dev": {"name": "unlabelled_dev", "num_bytes": 1548965, "num_examples": 19998, "dataset_name": "fever"}, "labelled_dev": {"name": "labelled_dev", "num_bytes": 3643157, "num_examples": 37566, "dataset_name": "fever"}, "paper_dev": {"name": "paper_dev", "num_bytes": 1821489, "num_examples": 18999, "dataset_name": "fever"}, "paper_test": {"name": "paper_test", "num_bytes": 1821668, "num_examples": 18567, "dataset_name": "fever"}}, "download_checksums": {"https://fever.ai/download/fever/train.jsonl": {"num_bytes": 33024303, "checksum": "eba7e8f87076753f8494718b9a857827af7bf73e76c9e4b75420207d26e588b6"}, "https://fever.ai/download/fever/shared_task_dev.jsonl": {"num_bytes": 4349935, "checksum": "e89865bfe1b4dd054e03dd57d7241a6fde24862905f31117cf0cd719f7c78df7"}, "https://fever.ai/download/fever/shared_task_dev_public.jsonl": {"num_bytes": 1530640, "checksum": "acda01ae5ee7e75c73909a665f465cec20704ea26e9d676cd7423ff2c8ab0e8b"}, "https://fever.ai/download/fever/shared_task_test.jsonl": {"num_bytes": 1599159, "checksum": "76dd0872d8fa1f49efe1194fe8a88b7dd4c715c77d87a142b615d4be583e1e51"}, "https://fever.ai/download/fever/paper_dev.jsonl": {"num_bytes": 2168767, "checksum": "41158707810008747946bf23471e82df53e77a513524b9e3ec1c2e674ef5ef8c"}, "https://fever.ai/download/fever/paper_test.jsonl": {"num_bytes": 2181168, "checksum": "fb7b0280a0adc2302bbb29bfb7af37274fa585de3171bcf908f180642d11d88e"}}, "download_size": 44853972, "post_processing_size": null, "dataset_size": 40043693, "size_in_bytes": 84897665}, "v2.0": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nFEVER V2.0", "citation": "\n@inproceedings{Thorne18Fever,\n author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n booktitle = {NAACL-HLT},\n year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}, "claim": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_annotation_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_id": {"dtype": "int32", "id": null, "_type": "Value"}, "evidence_wiki_url": {"dtype": "string", "id": null, "_type": "Value"}, "evidence_sentence_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "fever", "config_name": "v2.0", "version": {"version_str": "2.0.0", "description": "", "major": 2, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 306243, "num_examples": 2384, "dataset_name": "fever"}}, "download_checksums": {"https://fever.ai/download/fever2.0/fever2-fixers-dev.jsonl": {"num_bytes": 392466, "checksum": "43c3df77cf9bf6022b9356ed1d66df6d8a9a0126c4e4b8d155742e3a9988c814"}}, "download_size": 392466, "post_processing_size": null, "dataset_size": 306243, "size_in_bytes": 698709}, "wiki_pages": {"description": "\nWith billions of individual pages on the web providing information on almost every conceivable topic, we should have the ability to collect facts that answer almost every conceivable question. However, only a small fraction of this information is contained in structured sources (Wikidata, Freebase, etc.) \u2013 we are therefore limited by our ability to transform free-form text to structured knowledge. There is, however, another problem that has become the focus of a lot of recent research and media coverage: false information coming from unreliable sources. [1] [2]\n\nThe FEVER workshops are a venue for work in verifiable knowledge extraction and to stimulate progress in this direction.\n\nWikipedia pages", "citation": "\n@inproceedings{Thorne18Fever,\n author = {Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},\n title = {{FEVER}: a Large-scale Dataset for Fact Extraction and VERification},\n booktitle = {NAACL-HLT},\n year = {2018}\n}\n}\n", "homepage": "https://fever.ai/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "lines": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "fever", "config_name": "wiki_pages", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"wikipedia_pages": {"name": "wikipedia_pages", "num_bytes": 7254115038, "num_examples": 5416537, "dataset_name": "fever"}}, "download_checksums": {"https://fever.ai/download/fever/wiki-pages.zip": {"num_bytes": 1713485474, "checksum": "4b06d95da6adf7fe02d2796176c670dacccb21348da89cba4c50676ab99665f2"}}, "download_size": 1713485474, "post_processing_size": null, "dataset_size": 7254115038, "size_in_bytes": 8967600512}}

datasets/fever/fever.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,8 @@ def _info(self):
101101
def _split_generators(self, dl_manager):
102102
"""Returns SplitGenerators."""
103103
if self.config.name == "v2.0":
104-
urls = "https://s3-eu-west-1.amazonaws.com/fever.public/fever2-fixers-dev.jsonl"
104+
base_url = "https://fever.ai/download/fever2.0"
105+
urls = f"{base_url}/fever2-fixers-dev.jsonl"
105106
dl_path = dl_manager.download_and_extract(urls)
106107
return [
107108
datasets.SplitGenerator(
@@ -112,13 +113,14 @@ def _split_generators(self, dl_manager):
112113
)
113114
]
114115
elif self.config.name == "v1.0":
116+
base_url = "https://fever.ai/download/fever"
115117
urls = {
116-
"train": "https://s3-eu-west-1.amazonaws.com/fever.public/train.jsonl",
117-
"labelled_dev": "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl",
118-
"unlabelled_dev": "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev_public.jsonl",
119-
"unlabelled_test": "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_test.jsonl",
120-
"paper_dev": "https://s3-eu-west-1.amazonaws.com/fever.public/paper_dev.jsonl",
121-
"paper_test": "https://s3-eu-west-1.amazonaws.com/fever.public/paper_test.jsonl",
118+
"train": f"{base_url}/train.jsonl",
119+
"labelled_dev": f"{base_url}/shared_task_dev.jsonl",
120+
"unlabelled_dev": f"{base_url}/shared_task_dev_public.jsonl",
121+
"unlabelled_test": f"{base_url}/shared_task_test.jsonl",
122+
"paper_dev": f"{base_url}/paper_dev.jsonl",
123+
"paper_test": f"{base_url}/paper_test.jsonl",
122124
}
123125
dl_path = dl_manager.download_and_extract(urls)
124126
return [
@@ -160,7 +162,8 @@ def _split_generators(self, dl_manager):
160162
),
161163
]
162164
elif self.config.name == "wiki_pages":
163-
urls = "https://s3-eu-west-1.amazonaws.com/fever.public/wiki-pages.zip"
165+
base_url = "https://fever.ai/download/fever"
166+
urls = f"{base_url}/wiki-pages.zip"
164167
dl_path = dl_manager.download_and_extract(urls)
165168
files = sorted(os.listdir(os.path.join(dl_path, "wiki-pages")))
166169
file_paths = [os.path.join(dl_path, "wiki-pages", file) for file in files]

0 commit comments

Comments
 (0)