huggingface · lhoestq · Jun 10, 2021 · Jun 8, 2021
diff --git a/datasets/proto_qa/dataset_infos.json b/datasets/proto_qa/dataset_infos.json
@@ -1 +1 @@
-{"proto_qa": {"description": "This dataset is for studying computational models trained to reason about prototypical situations. Using deterministic filtering a sampling from a larger set of all transcriptions was built. It contains 9789 instances where each instance represents a survey question from Family Feud game. Each instance exactly is a question, a set of answers, and a count associated with each answer.\nEach line is a json dictionary, in which:\n1. question - contains the question (in original and a normalized form)\n2. answerstrings - contains the original answers provided by survey respondents (when available), along with the counts for each string. Because the FamilyFeud data has only cluster names rather than strings, those cluster names are included with 0 weight.\n3. answer-clusters - lists clusters, with the count of each cluster and the strings included in that cluster. Each cluster is given a unique ID that can be linked to in the assessment files.\n\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning},\nauthors={Michael Boratko, Xiang Lorraine Li, Tim O\u2019Gorman, Rajarshi Das, Dan Le, Andrew McCallum},\nyear={2020},\npublisher = {GitHub},\njournal = {GitHub repository},\nhowpublished={\\url{https://github.com/iesl/protoqa-data}},\n}\n", "homepage": "https://github.com/iesl/protoqa-data", "license": "cc-by-4.0", "features": {"normalized-question": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer-clusters": {"feature": {"count": {"dtype": "int32", "id": null, "_type": "Value"}, "clusterid": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}, "answerstrings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "totalcount": {"dtype": "int32", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "proto_qa", "config_name": "proto_qa", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3943484, "num_examples": 8782, "dataset_name": "proto_qa"}, "validation": {"name": "validation", "num_bytes": 472121, "num_examples": 980, "dataset_name": "proto_qa"}}, "download_checksums": {"https://raw.githubusercontent.com/iesl/protoqa-data/master/data/train/protoqa_train.jsonl": {"num_bytes": 6587901, "checksum": "3387c658053ceca6eec3261d2d0b03da4109eb05fa6480b6d02a577714f867e2"}, "https://raw.githubusercontent.com/iesl/protoqa-data/master/data/dev/protoqa_scraped_dev.jsonl": {"num_bytes": 765031, "checksum": "906385430e473ce7b63e82caa9db34e1f55571a6afcbccfb518308f009bc8af7"}}, "download_size": 7352932, "post_processing_size": null, "dataset_size": 4415605, "size_in_bytes": 11768537}, "proto_qa_cs": {"description": "This dataset is for studying computational models trained to reason about prototypical situations. Using deterministic filtering a sampling from a larger set of all transcriptions was built. It contains 9789 instances where each instance represents a survey question from Family Feud game. Each instance exactly is a question, a set of answers, and a count associated with each answer.\nEach line is a json dictionary, in which:\n1. question - contains the question (in original and a normalized form)\n2. answerstrings - contains the original answers provided by survey respondents (when available), along with the counts for each string. Because the FamilyFeud data has only cluster names rather than strings, those cluster names are included with 0 weight.\n3. answer-clusters - lists clusters, with the count of each cluster and the strings included in that cluster. Each cluster is given a unique ID that can be linked to in the assessment files.\n\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning},\nauthors={Michael Boratko, Xiang Lorraine Li, Tim O\u2019Gorman, Rajarshi Das, Dan Le, Andrew McCallum},\nyear={2020},\npublisher = {GitHub},\njournal = {GitHub repository},\nhowpublished={\\url{https://github.com/iesl/protoqa-data}},\n}\n", "homepage": "https://github.com/iesl/protoqa-data", "license": "cc-by-4.0", "features": {"normalized-question": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers-cleaned": {"feature": {"count": {"dtype": "int32", "id": null, "_type": "Value"}, "clusterid": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}, "answerstrings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "totalcount": {"dtype": "int32", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "proto_qa", "config_name": "proto_qa_cs", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 84466, "num_examples": 52, "dataset_name": "proto_qa"}}, "download_checksums": {"https://raw.githubusercontent.com/iesl/protoqa-data/master/data/dev/crowdsource_dev.jsonl": {"num_bytes": 115704, "checksum": "bbf9113ad57d68937de9367a48bc4994f39d14f4e7a5cd1114b1de0509de4434"}}, "download_size": 115704, "post_processing_size": null, "dataset_size": 84466, "size_in_bytes": 200170}, "proto_qa_cs_assessments": {"description": "This dataset is for studying computational models trained to reason about prototypical situations. Using deterministic filtering a sampling from a larger set of all transcriptions was built. It contains 9789 instances where each instance represents a survey question from Family Feud game. Each instance exactly is a question, a set of answers, and a count associated with each answer.\nEach line is a json dictionary, in which:\n1. question - contains the question (in original and a normalized form)\n2. answerstrings - contains the original answers provided by survey respondents (when available), along with the counts for each string. Because the FamilyFeud data has only cluster names rather than strings, those cluster names are included with 0 weight.\n3. answer-clusters - lists clusters, with the count of each cluster and the strings included in that cluster. Each cluster is given a unique ID that can be linked to in the assessment files.\n\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning},\nauthors={Michael Boratko, Xiang Lorraine Li, Tim O\u2019Gorman, Rajarshi Das, Dan Le, Andrew McCallum},\nyear={2020},\npublisher = {GitHub},\njournal = {GitHub repository},\nhowpublished={\\url{https://github.com/iesl/protoqa-data}},\n}\n", "homepage": "https://github.com/iesl/protoqa-data", "license": "cc-by-4.0", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "assessments": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "proto_qa", "config_name": "proto_qa_cs_assessments", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 12473, "num_examples": 52, "dataset_name": "proto_qa"}}, "download_checksums": {"https://raw.githubusercontent.com/iesl/protoqa-data/master/data/dev/crowdsource_dev.assessments.jsonl": {"num_bytes": 24755, "checksum": "2abcf5f7d7ae55847898ac0a76becaaa9a0e72aeecb78c44eeadcec01263e71a"}}, "download_size": 24755, "post_processing_size": null, "dataset_size": 12473, "size_in_bytes": 37228}}
+{"proto_qa": {"description": "This dataset is for studying computational models trained to reason about prototypical situations. Using deterministic filtering a sampling from a larger set of all transcriptions was built. It contains 9789 instances where each instance represents a survey question from Family Feud game. Each instance exactly is a question, a set of answers, and a count associated with each answer.\nEach line is a json dictionary, in which:\n1. question - contains the question (in original and a normalized form)\n2. answerstrings - contains the original answers provided by survey respondents (when available), along with the counts for each string. Because the FamilyFeud data has only cluster names rather than strings, those cluster names are included with 0 weight.\n3. answer-clusters - lists clusters, with the count of each cluster and the strings included in that cluster. Each cluster is given a unique ID that can be linked to in the assessment files.\n\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning},\nauthors={Michael Boratko, Xiang Lorraine Li, Tim O\u2019Gorman, Rajarshi Das, Dan Le, Andrew McCallum},\nyear={2020},\npublisher = {GitHub},\njournal = {GitHub repository},\nhowpublished={\\url{https://github.com/iesl/protoqa-data}},\n}\n", "homepage": "https://github.com/iesl/protoqa-data", "license": "cc-by-4.0", "features": {"normalized-question": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer-clusters": {"feature": {"count": {"dtype": "int32", "id": null, "_type": "Value"}, "clusterid": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}, "answerstrings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "totalcount": {"dtype": "int32", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "proto_qa", "config_name": "proto_qa", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3943484, "num_examples": 8782, "dataset_name": "proto_qa"}, "validation": {"name": "validation", "num_bytes": 472121, "num_examples": 980, "dataset_name": "proto_qa"}}, "download_checksums": {"https://raw.githubusercontent.com/iesl/protoqa-data/9fb72b4e7d41a7d3a9766c33ef66c78d7a100b41/data/train/protoqa_train.jsonl": {"num_bytes": 6587901, "checksum": "3387c658053ceca6eec3261d2d0b03da4109eb05fa6480b6d02a577714f867e2"}, "https://raw.githubusercontent.com/iesl/protoqa-data/9fb72b4e7d41a7d3a9766c33ef66c78d7a100b41/data/dev/protoqa_scraped_dev.jsonl": {"num_bytes": 765031, "checksum": "906385430e473ce7b63e82caa9db34e1f55571a6afcbccfb518308f009bc8af7"}}, "download_size": 7352932, "post_processing_size": null, "dataset_size": 4415605, "size_in_bytes": 11768537}, "proto_qa_cs": {"description": "This dataset is for studying computational models trained to reason about prototypical situations. Using deterministic filtering a sampling from a larger set of all transcriptions was built. It contains 9789 instances where each instance represents a survey question from Family Feud game. Each instance exactly is a question, a set of answers, and a count associated with each answer.\nEach line is a json dictionary, in which:\n1. question - contains the question (in original and a normalized form)\n2. answerstrings - contains the original answers provided by survey respondents (when available), along with the counts for each string. Because the FamilyFeud data has only cluster names rather than strings, those cluster names are included with 0 weight.\n3. answer-clusters - lists clusters, with the count of each cluster and the strings included in that cluster. Each cluster is given a unique ID that can be linked to in the assessment files.\n\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning},\nauthors={Michael Boratko, Xiang Lorraine Li, Tim O\u2019Gorman, Rajarshi Das, Dan Le, Andrew McCallum},\nyear={2020},\npublisher = {GitHub},\njournal = {GitHub repository},\nhowpublished={\\url{https://github.com/iesl/protoqa-data}},\n}\n", "homepage": "https://github.com/iesl/protoqa-data", "license": "cc-by-4.0", "features": {"normalized-question": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers-cleaned": {"feature": {"count": {"dtype": "int32", "id": null, "_type": "Value"}, "clusterid": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}, "answerstrings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "totalcount": {"dtype": "int32", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "proto_qa", "config_name": "proto_qa_cs", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 84466, "num_examples": 52, "dataset_name": "proto_qa"}}, "download_checksums": {"https://raw.githubusercontent.com/iesl/protoqa-data/9fb72b4e7d41a7d3a9766c33ef66c78d7a100b41/data/dev/crowdsource_dev.jsonl": {"num_bytes": 115704, "checksum": "bbf9113ad57d68937de9367a48bc4994f39d14f4e7a5cd1114b1de0509de4434"}}, "download_size": 115704, "post_processing_size": null, "dataset_size": 84466, "size_in_bytes": 200170}, "proto_qa_cs_assessments": {"description": "This dataset is for studying computational models trained to reason about prototypical situations. Using deterministic filtering a sampling from a larger set of all transcriptions was built. It contains 9789 instances where each instance represents a survey question from Family Feud game. Each instance exactly is a question, a set of answers, and a count associated with each answer.\nEach line is a json dictionary, in which:\n1. question - contains the question (in original and a normalized form)\n2. answerstrings - contains the original answers provided by survey respondents (when available), along with the counts for each string. Because the FamilyFeud data has only cluster names rather than strings, those cluster names are included with 0 weight.\n3. answer-clusters - lists clusters, with the count of each cluster and the strings included in that cluster. Each cluster is given a unique ID that can be linked to in the assessment files.\n\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning},\nauthors={Michael Boratko, Xiang Lorraine Li, Tim O\u2019Gorman, Rajarshi Das, Dan Le, Andrew McCallum},\nyear={2020},\npublisher = {GitHub},\njournal = {GitHub repository},\nhowpublished={\\url{https://github.com/iesl/protoqa-data}},\n}\n", "homepage": "https://github.com/iesl/protoqa-data", "license": "cc-by-4.0", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "assessments": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "proto_qa", "config_name": "proto_qa_cs_assessments", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 12473, "num_examples": 52, "dataset_name": "proto_qa"}}, "download_checksums": {"https://raw.githubusercontent.com/iesl/protoqa-data/9fb72b4e7d41a7d3a9766c33ef66c78d7a100b41/data/dev/crowdsource_dev.assessments.jsonl": {"num_bytes": 24755, "checksum": "2abcf5f7d7ae55847898ac0a76becaaa9a0e72aeecb78c44eeadcec01263e71a"}}, "download_size": 24755, "post_processing_size": null, "dataset_size": 12473, "size_in_bytes": 37228}}
diff --git a/datasets/proto_qa/proto_qa.py b/datasets/proto_qa/proto_qa.py
@@ -46,11 +46,11 @@
 
 _URLs = {
     "proto_qa": {
-        "dev": "https://raw.githubusercontent.com/iesl/protoqa-data/master/data/dev/protoqa_scraped_dev.jsonl",
-        "train": "https://raw.githubusercontent.com/iesl/protoqa-data/master/data/train/protoqa_train.jsonl",
+        "dev": "https://raw.githubusercontent.com/iesl/protoqa-data/9fb72b4e7d41a7d3a9766c33ef66c78d7a100b41/data/dev/protoqa_scraped_dev.jsonl",
+        "train": "https://raw.githubusercontent.com/iesl/protoqa-data/9fb72b4e7d41a7d3a9766c33ef66c78d7a100b41/data/train/protoqa_train.jsonl",
     },
-    "proto_qa_cs": "https://raw.githubusercontent.com/iesl/protoqa-data/master/data/dev/crowdsource_dev.jsonl",
-    "proto_qa_cs_assessments": "https://raw.githubusercontent.com/iesl/protoqa-data/master/data/dev/crowdsource_dev.assessments.jsonl",
+    "proto_qa_cs": "https://raw.githubusercontent.com/iesl/protoqa-data/9fb72b4e7d41a7d3a9766c33ef66c78d7a100b41/data/dev/crowdsource_dev.jsonl",
+    "proto_qa_cs_assessments": "https://raw.githubusercontent.com/iesl/protoqa-data/9fb72b4e7d41a7d3a9766c33ef66c78d7a100b41/data/dev/crowdsource_dev.assessments.jsonl",
 }
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"proto_qa": {"description": "This dataset is for studying computational models trained to reason about prototypical situations. Using deterministic filtering a sampling from a larger set of all transcriptions was built. It contains 9789 instances where each instance represents a survey question from Family Feud game. Each instance exactly is a question, a set of answers, and a count associated with each answer.\nEach line is a json dictionary, in which:\n1. question - contains the question (in original and a normalized form)\n2. answerstrings - contains the original answers provided by survey respondents (when available), along with the counts for each string. Because the FamilyFeud data has only cluster names rather than strings, those cluster names are included with 0 weight.\n3. answer-clusters - lists clusters, with the count of each cluster and the strings included in that cluster. Each cluster is given a unique ID that can be linked to in the assessment files.\n\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning},\nauthors={Michael Boratko, Xiang Lorraine Li, Tim O\u2019Gorman, Rajarshi Das, Dan Le, Andrew McCallum},\nyear={2020},\npublisher = {GitHub},\njournal = {GitHub repository},\nhowpublished={\\url{https://github.com/iesl/protoqa-data}},\n}\n", "homepage": "https://github.com/iesl/protoqa-data", "license": "cc-by-4.0", "features": {"normalized-question": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer-clusters": {"feature": {"count": {"dtype": "int32", "id": null, "_type": "Value"}, "clusterid": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}, "answerstrings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "totalcount": {"dtype": "int32", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "proto_qa", "config_name": "proto_qa", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3943484, "num_examples": 8782, "dataset_name": "proto_qa"}, "validation": {"name": "validation", "num_bytes": 472121, "num_examples": 980, "dataset_name": "proto_qa"}}, "download_checksums": {"https://raw.githubusercontent.com/iesl/protoqa-data/master/data/train/protoqa_train.jsonl": {"num_bytes": 6587901, "checksum": "3387c658053ceca6eec3261d2d0b03da4109eb05fa6480b6d02a577714f867e2"}, "https://raw.githubusercontent.com/iesl/protoqa-data/master/data/dev/protoqa_scraped_dev.jsonl": {"num_bytes": 765031, "checksum": "906385430e473ce7b63e82caa9db34e1f55571a6afcbccfb518308f009bc8af7"}}, "download_size": 7352932, "post_processing_size": null, "dataset_size": 4415605, "size_in_bytes": 11768537}, "proto_qa_cs": {"description": "This dataset is for studying computational models trained to reason about prototypical situations. Using deterministic filtering a sampling from a larger set of all transcriptions was built. It contains 9789 instances where each instance represents a survey question from Family Feud game. Each instance exactly is a question, a set of answers, and a count associated with each answer.\nEach line is a json dictionary, in which:\n1. question - contains the question (in original and a normalized form)\n2. answerstrings - contains the original answers provided by survey respondents (when available), along with the counts for each string. Because the FamilyFeud data has only cluster names rather than strings, those cluster names are included with 0 weight.\n3. answer-clusters - lists clusters, with the count of each cluster and the strings included in that cluster. Each cluster is given a unique ID that can be linked to in the assessment files.\n\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning},\nauthors={Michael Boratko, Xiang Lorraine Li, Tim O\u2019Gorman, Rajarshi Das, Dan Le, Andrew McCallum},\nyear={2020},\npublisher = {GitHub},\njournal = {GitHub repository},\nhowpublished={\\url{https://github.com/iesl/protoqa-data}},\n}\n", "homepage": "https://github.com/iesl/protoqa-data", "license": "cc-by-4.0", "features": {"normalized-question": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers-cleaned": {"feature": {"count": {"dtype": "int32", "id": null, "_type": "Value"}, "clusterid": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}, "answerstrings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "totalcount": {"dtype": "int32", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "proto_qa", "config_name": "proto_qa_cs", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 84466, "num_examples": 52, "dataset_name": "proto_qa"}}, "download_checksums": {"https://raw.githubusercontent.com/iesl/protoqa-data/master/data/dev/crowdsource_dev.jsonl": {"num_bytes": 115704, "checksum": "bbf9113ad57d68937de9367a48bc4994f39d14f4e7a5cd1114b1de0509de4434"}}, "download_size": 115704, "post_processing_size": null, "dataset_size": 84466, "size_in_bytes": 200170}, "proto_qa_cs_assessments": {"description": "This dataset is for studying computational models trained to reason about prototypical situations. Using deterministic filtering a sampling from a larger set of all transcriptions was built. It contains 9789 instances where each instance represents a survey question from Family Feud game. Each instance exactly is a question, a set of answers, and a count associated with each answer.\nEach line is a json dictionary, in which:\n1. question - contains the question (in original and a normalized form)\n2. answerstrings - contains the original answers provided by survey respondents (when available), along with the counts for each string. Because the FamilyFeud data has only cluster names rather than strings, those cluster names are included with 0 weight.\n3. answer-clusters - lists clusters, with the count of each cluster and the strings included in that cluster. Each cluster is given a unique ID that can be linked to in the assessment files.\n\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning},\nauthors={Michael Boratko, Xiang Lorraine Li, Tim O\u2019Gorman, Rajarshi Das, Dan Le, Andrew McCallum},\nyear={2020},\npublisher = {GitHub},\njournal = {GitHub repository},\nhowpublished={\\url{https://github.com/iesl/protoqa-data}},\n}\n", "homepage": "https://github.com/iesl/protoqa-data", "license": "cc-by-4.0", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "assessments": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "proto_qa", "config_name": "proto_qa_cs_assessments", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 12473, "num_examples": 52, "dataset_name": "proto_qa"}}, "download_checksums": {"https://raw.githubusercontent.com/iesl/protoqa-data/master/data/dev/crowdsource_dev.assessments.jsonl": {"num_bytes": 24755, "checksum": "2abcf5f7d7ae55847898ac0a76becaaa9a0e72aeecb78c44eeadcec01263e71a"}}, "download_size": 24755, "post_processing_size": null, "dataset_size": 12473, "size_in_bytes": 37228}}
		{"proto_qa": {"description": "This dataset is for studying computational models trained to reason about prototypical situations. Using deterministic filtering a sampling from a larger set of all transcriptions was built. It contains 9789 instances where each instance represents a survey question from Family Feud game. Each instance exactly is a question, a set of answers, and a count associated with each answer.\nEach line is a json dictionary, in which:\n1. question - contains the question (in original and a normalized form)\n2. answerstrings - contains the original answers provided by survey respondents (when available), along with the counts for each string. Because the FamilyFeud data has only cluster names rather than strings, those cluster names are included with 0 weight.\n3. answer-clusters - lists clusters, with the count of each cluster and the strings included in that cluster. Each cluster is given a unique ID that can be linked to in the assessment files.\n\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning},\nauthors={Michael Boratko, Xiang Lorraine Li, Tim O\u2019Gorman, Rajarshi Das, Dan Le, Andrew McCallum},\nyear={2020},\npublisher = {GitHub},\njournal = {GitHub repository},\nhowpublished={\\url{https://github.com/iesl/protoqa-data}},\n}\n", "homepage": "https://github.com/iesl/protoqa-data", "license": "cc-by-4.0", "features": {"normalized-question": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer-clusters": {"feature": {"count": {"dtype": "int32", "id": null, "_type": "Value"}, "clusterid": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}, "answerstrings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "totalcount": {"dtype": "int32", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "proto_qa", "config_name": "proto_qa", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3943484, "num_examples": 8782, "dataset_name": "proto_qa"}, "validation": {"name": "validation", "num_bytes": 472121, "num_examples": 980, "dataset_name": "proto_qa"}}, "download_checksums": {"https://raw.githubusercontent.com/iesl/protoqa-data/9fb72b4e7d41a7d3a9766c33ef66c78d7a100b41/data/train/protoqa_train.jsonl": {"num_bytes": 6587901, "checksum": "3387c658053ceca6eec3261d2d0b03da4109eb05fa6480b6d02a577714f867e2"}, "https://raw.githubusercontent.com/iesl/protoqa-data/9fb72b4e7d41a7d3a9766c33ef66c78d7a100b41/data/dev/protoqa_scraped_dev.jsonl": {"num_bytes": 765031, "checksum": "906385430e473ce7b63e82caa9db34e1f55571a6afcbccfb518308f009bc8af7"}}, "download_size": 7352932, "post_processing_size": null, "dataset_size": 4415605, "size_in_bytes": 11768537}, "proto_qa_cs": {"description": "This dataset is for studying computational models trained to reason about prototypical situations. Using deterministic filtering a sampling from a larger set of all transcriptions was built. It contains 9789 instances where each instance represents a survey question from Family Feud game. Each instance exactly is a question, a set of answers, and a count associated with each answer.\nEach line is a json dictionary, in which:\n1. question - contains the question (in original and a normalized form)\n2. answerstrings - contains the original answers provided by survey respondents (when available), along with the counts for each string. Because the FamilyFeud data has only cluster names rather than strings, those cluster names are included with 0 weight.\n3. answer-clusters - lists clusters, with the count of each cluster and the strings included in that cluster. Each cluster is given a unique ID that can be linked to in the assessment files.\n\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning},\nauthors={Michael Boratko, Xiang Lorraine Li, Tim O\u2019Gorman, Rajarshi Das, Dan Le, Andrew McCallum},\nyear={2020},\npublisher = {GitHub},\njournal = {GitHub repository},\nhowpublished={\\url{https://github.com/iesl/protoqa-data}},\n}\n", "homepage": "https://github.com/iesl/protoqa-data", "license": "cc-by-4.0", "features": {"normalized-question": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answers-cleaned": {"feature": {"count": {"dtype": "int32", "id": null, "_type": "Value"}, "clusterid": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}, "answerstrings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "totalcount": {"dtype": "int32", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "proto_qa", "config_name": "proto_qa_cs", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 84466, "num_examples": 52, "dataset_name": "proto_qa"}}, "download_checksums": {"https://raw.githubusercontent.com/iesl/protoqa-data/9fb72b4e7d41a7d3a9766c33ef66c78d7a100b41/data/dev/crowdsource_dev.jsonl": {"num_bytes": 115704, "checksum": "bbf9113ad57d68937de9367a48bc4994f39d14f4e7a5cd1114b1de0509de4434"}}, "download_size": 115704, "post_processing_size": null, "dataset_size": 84466, "size_in_bytes": 200170}, "proto_qa_cs_assessments": {"description": "This dataset is for studying computational models trained to reason about prototypical situations. Using deterministic filtering a sampling from a larger set of all transcriptions was built. It contains 9789 instances where each instance represents a survey question from Family Feud game. Each instance exactly is a question, a set of answers, and a count associated with each answer.\nEach line is a json dictionary, in which:\n1. question - contains the question (in original and a normalized form)\n2. answerstrings - contains the original answers provided by survey respondents (when available), along with the counts for each string. Because the FamilyFeud data has only cluster names rather than strings, those cluster names are included with 0 weight.\n3. answer-clusters - lists clusters, with the count of each cluster and the strings included in that cluster. Each cluster is given a unique ID that can be linked to in the assessment files.\n\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning},\nauthors={Michael Boratko, Xiang Lorraine Li, Tim O\u2019Gorman, Rajarshi Das, Dan Le, Andrew McCallum},\nyear={2020},\npublisher = {GitHub},\njournal = {GitHub repository},\nhowpublished={\\url{https://github.com/iesl/protoqa-data}},\n}\n", "homepage": "https://github.com/iesl/protoqa-data", "license": "cc-by-4.0", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "assessments": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "proto_qa", "config_name": "proto_qa_cs_assessments", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 12473, "num_examples": 52, "dataset_name": "proto_qa"}}, "download_checksums": {"https://raw.githubusercontent.com/iesl/protoqa-data/9fb72b4e7d41a7d3a9766c33ef66c78d7a100b41/data/dev/crowdsource_dev.assessments.jsonl": {"num_bytes": 24755, "checksum": "2abcf5f7d7ae55847898ac0a76becaaa9a0e72aeecb78c44eeadcec01263e71a"}}, "download_size": 24755, "post_processing_size": null, "dataset_size": 12473, "size_in_bytes": 37228}}