From 5a2a60682d3f17bbe1bbafeb64d80a26d61fefc7 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Sun, 7 Aug 2022 11:42:26 +0200 Subject: [PATCH 1/5] Fix fine label from 47 to 50 classes --- datasets/trec/trec.py | 151 ++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 79 deletions(-) diff --git a/datasets/trec/trec.py b/datasets/trec/trec.py index da020925c29..23a0852bb13 100644 --- a/datasets/trec/trec.py +++ b/datasets/trec/trec.py @@ -12,12 +12,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" The Text REtrieval Conference (TREC) Question Classification dataset.""" +"""The Text REtrieval Conference (TREC) Question Classification dataset.""" import datasets +_DESCRIPTION = """\ +The Text REtrieval Conference (TREC) Question Classification dataset contains 5500 labeled questions in training set and another 500 for test set. + +The dataset has 6 coarse class labels and 50 fine class labels. Average length of each sentence is 10, vocabulary size of 8700. + +Data are collected from four sources: 4,500 English questions published by USC (Hovy et al., 2001), about 500 manually constructed questions for a few rare classes, 894 TREC 8 and TREC 9 questions, and also 500 questions from TREC 10 which serves as the test set. These questions were manually labeled. +""" + +_HOMEPAGE = "https://cogcomp.seas.upenn.edu/Data/QA/QC/" + _CITATION = """\ @inproceedings{li-roth-2002-learning, title = "Learning Question Classifiers", @@ -40,114 +50,98 @@ } """ -_DESCRIPTION = """\ -The Text REtrieval Conference (TREC) Question Classification dataset contains 5500 labeled questions in training set and another 500 for test set. The dataset has 6 labels, 47 level-2 labels. Average length of each sentence is 10, vocabulary size of 8700. - -Data are collected from four sources: 4,500 English questions published by USC (Hovy et al., 2001), about 500 manually constructed questions for a few rare classes, 894 TREC 8 and TREC 9 questions, and also 500 questions from TREC 10 which serves as the test set. -""" - _URLs = { "train": "https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label", "test": "https://cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label", } -_COARSE_LABELS = ["DESC", "ENTY", "ABBR", "HUM", "NUM", "LOC"] +_COARSE_LABELS = ["ABBR", "ENTY", "DESC", "HUM", "LOC", "NUM"] _FINE_LABELS = [ - "manner", - "cremat", - "animal", - "exp", - "ind", - "gr", - "title", - "def", - "date", - "reason", - "event", - "state", - "desc", - "count", - "other", - "letter", - "religion", - "food", - "country", - "color", - "termeq", - "city", - "body", - "dismed", - "mount", - "money", - "product", - "period", - "substance", - "sport", - "plant", - "techmeth", - "volsize", - "instru", - "abb", - "speed", - "word", - "lang", - "perc", - "code", - "dist", - "temp", - "symbol", - "ord", - "veh", - "weight", - "currency", + "ABBR:abb", + "ABBR:exp", + "ENTY:animal", + "ENTY:body", + "ENTY:color", + "ENTY:cremat", + "ENTY:currency", + "ENTY:dismed", + "ENTY:event", + "ENTY:food", + "ENTY:instru", + "ENTY:lang", + "ENTY:letter", + "ENTY:other", + "ENTY:plant", + "ENTY:product", + "ENTY:religion", + "ENTY:sport", + "ENTY:substance", + "ENTY:symbol", + "ENTY:techmeth", + "ENTY:termeq", + "ENTY:veh", + "ENTY:word", + "DESC:def", + "DESC:desc", + "DESC:manner", + "DESC:reason", + "HUM:gr", + "HUM:ind", + "HUM:title", + "HUM:desc", + "LOC:city", + "LOC:country", + "LOC:mount", + "LOC:other", + "LOC:state", + "NUM:code", + "NUM:count", + "NUM:date", + "NUM:dist", + "NUM:money", + "NUM:ord", + "NUM:other", + "NUM:period", + "NUM:perc", + "NUM:speed", + "NUM:temp", + "NUM:volsize", + "NUM:weight", ] class Trec(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" + """The Text REtrieval Conference (TREC) Question Classification dataset.""" - VERSION = datasets.Version("1.1.0") + VERSION = datasets.Version("2.0.0", description="Fine label contains 50 classes instead of 47.") def _info(self): - # TODO: Specifies the datasets.DatasetInfo object return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. description=_DESCRIPTION, - # datasets.features.FeatureConnectors features=datasets.Features( { - "label-coarse": datasets.ClassLabel(names=_COARSE_LABELS), - "label-fine": datasets.ClassLabel(names=_FINE_LABELS), "text": datasets.Value("string"), + "coarse_label": datasets.ClassLabel(names=_COARSE_LABELS), + "fine_label": datasets.ClassLabel(names=_FINE_LABELS), } ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. - supervised_keys=None, - # Homepage of the dataset for documentation - homepage="https://cogcomp.seas.upenn.edu/Data/QA/QC/", + homepage=_HOMEPAGE, citation=_CITATION, ) def _split_generators(self, dl_manager): """Returns SplitGenerators.""" - # TODO: Downloads the data and defines the splits - # dl_manager is a datasets.download.DownloadManager that can be used to - # download and extract URLs - dl_files = dl_manager.download_and_extract(_URLs) + dl_files = dl_manager.download(_URLs) return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, - # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": dl_files["train"], }, ), datasets.SplitGenerator( name=datasets.Split.TEST, - # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": dl_files["test"], }, @@ -156,14 +150,13 @@ def _split_generators(self, dl_manager): def _generate_examples(self, filepath): """Yields examples.""" - # TODO: Yields (key, example) tuples from the dataset with open(filepath, "rb") as f: for id_, row in enumerate(f): # One non-ASCII byte: sisterBADBYTEcity. We replace it with a space - label, _, text = row.replace(b"\xf0", b" ").strip().decode().partition(" ") - coarse_label, _, fine_label = label.partition(":") + fine_label, _, text = row.replace(b"\xf0", b" ").strip().decode().partition(" ") + coarse_label = fine_label.split(":")[0] yield id_, { - "label-coarse": coarse_label, - "label-fine": fine_label, "text": text, + "coarse_label": coarse_label, + "fine_label": fine_label, } From 0872a378976d3cede3aa4918b4964c4d3fd12bb2 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Sun, 7 Aug 2022 11:43:11 +0200 Subject: [PATCH 2/5] Update dataset card --- datasets/trec/README.md | 112 ++++++++++++++++++++++++++++++++++------ 1 file changed, 95 insertions(+), 17 deletions(-) diff --git a/datasets/trec/README.md b/datasets/trec/README.md index d8f7e5d6db0..8b383e439b0 100644 --- a/datasets/trec/README.md +++ b/datasets/trec/README.md @@ -1,8 +1,25 @@ --- +annotations_creators: +- expert-generated language: - en -paperswithcode_id: trecqa +language_creators: +- expert-generated +license: +- unknown +multilinguality: +- monolingual pretty_name: Text Retrieval Conference Question Answering +size_categories: +- 1K Date: Sun, 7 Aug 2022 11:46:37 +0200 Subject: [PATCH 3/5] Update metadata JSON --- datasets/trec/dataset_infos.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/trec/dataset_infos.json b/datasets/trec/dataset_infos.json index 072d3ea37c7..3f00658776a 100644 --- a/datasets/trec/dataset_infos.json +++ b/datasets/trec/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "The Text REtrieval Conference (TREC) Question Classification dataset contains 5500 labeled questions in training set and another 500 for test set. The dataset has 6 labels, 47 level-2 labels. Average length of each sentence is 10, vocabulary size of 8700.\n\nData are collected from four sources: 4,500 English questions published by USC (Hovy et al., 2001), about 500 manually constructed questions for a few rare classes, 894 TREC 8 and TREC 9 questions, and also 500 questions from TREC 10 which serves as the test set.\n", "citation": "@inproceedings{li-roth-2002-learning,\n title = \"Learning Question Classifiers\",\n author = \"Li, Xin and\n Roth, Dan\",\n booktitle = \"{COLING} 2002: The 19th International Conference on Computational Linguistics\",\n year = \"2002\",\n url = \"https://www.aclweb.org/anthology/C02-1150\",\n}\n@inproceedings{hovy-etal-2001-toward,\n title = \"Toward Semantics-Based Answer Pinpointing\",\n author = \"Hovy, Eduard and\n Gerber, Laurie and\n Hermjakob, Ulf and\n Lin, Chin-Yew and\n Ravichandran, Deepak\",\n booktitle = \"Proceedings of the First International Conference on Human Language Technology Research\",\n year = \"2001\",\n url = \"https://www.aclweb.org/anthology/H01-1069\",\n}\n", "homepage": "https://cogcomp.seas.upenn.edu/Data/QA/QC/", "license": "", "features": {"label-coarse": {"num_classes": 6, "names": ["DESC", "ENTY", "ABBR", "HUM", "NUM", "LOC"], "names_file": null, "id": null, "_type": "ClassLabel"}, "label-fine": {"num_classes": 47, "names": ["manner", "cremat", "animal", "exp", "ind", "gr", "title", "def", "date", "reason", "event", "state", "desc", "count", "other", "letter", "religion", "food", "country", "color", "termeq", "city", "body", "dismed", "mount", "money", "product", "period", "substance", "sport", "plant", "techmeth", "volsize", "instru", "abb", "speed", "word", "lang", "perc", "code", "dist", "temp", "symbol", "ord", "veh", "weight", "currency"], "names_file": null, "id": null, "_type": "ClassLabel"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "trec", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 385090, "num_examples": 5452, "dataset_name": "trec"}, "test": {"name": "test", "num_bytes": 27983, "num_examples": 500, "dataset_name": "trec"}}, "download_checksums": {"https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label": {"num_bytes": 335858, "checksum": "9e4c8bdcaffb96ed61041bd64b564183d52793a8e91d84fc3a8646885f466ec3"}, "https://cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label": {"num_bytes": 23354, "checksum": "033f22c028c2bbba9ca682f68ffe204dc1aa6e1cf35dd6207f2d4ca67f0d0e8e"}}, "download_size": 359212, "post_processing_size": null, "dataset_size": 413073, "size_in_bytes": 772285}} \ No newline at end of file +{"default": {"description": "The Text REtrieval Conference (TREC) Question Classification dataset contains 5500 labeled questions in training set and another 500 for test set.\n\nThe dataset has 6 coarse class labels and 50 fine class labels. Average length of each sentence is 10, vocabulary size of 8700.\n\nData are collected from four sources: 4,500 English questions published by USC (Hovy et al., 2001), about 500 manually constructed questions for a few rare classes, 894 TREC 8 and TREC 9 questions, and also 500 questions from TREC 10 which serves as the test set. These questions were manually labeled.\n", "citation": "@inproceedings{li-roth-2002-learning,\n title = \"Learning Question Classifiers\",\n author = \"Li, Xin and\n Roth, Dan\",\n booktitle = \"{COLING} 2002: The 19th International Conference on Computational Linguistics\",\n year = \"2002\",\n url = \"https://www.aclweb.org/anthology/C02-1150\",\n}\n@inproceedings{hovy-etal-2001-toward,\n title = \"Toward Semantics-Based Answer Pinpointing\",\n author = \"Hovy, Eduard and\n Gerber, Laurie and\n Hermjakob, Ulf and\n Lin, Chin-Yew and\n Ravichandran, Deepak\",\n booktitle = \"Proceedings of the First International Conference on Human Language Technology Research\",\n year = \"2001\",\n url = \"https://www.aclweb.org/anthology/H01-1069\",\n}\n", "homepage": "https://cogcomp.seas.upenn.edu/Data/QA/QC/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "coarse_label": {"num_classes": 6, "names": ["ABBR", "ENTY", "DESC", "HUM", "LOC", "NUM"], "id": null, "_type": "ClassLabel"}, "fine_label": {"num_classes": 50, "names": ["ABBR:abb", "ABBR:exp", "ENTY:animal", "ENTY:body", "ENTY:color", "ENTY:cremat", "ENTY:currency", "ENTY:dismed", "ENTY:event", "ENTY:food", "ENTY:instru", "ENTY:lang", "ENTY:letter", "ENTY:other", "ENTY:plant", "ENTY:product", "ENTY:religion", "ENTY:sport", "ENTY:substance", "ENTY:symbol", "ENTY:techmeth", "ENTY:termeq", "ENTY:veh", "ENTY:word", "DESC:def", "DESC:desc", "DESC:manner", "DESC:reason", "HUM:gr", "HUM:ind", "HUM:title", "HUM:desc", "LOC:city", "LOC:country", "LOC:mount", "LOC:other", "LOC:state", "NUM:code", "NUM:count", "NUM:date", "NUM:dist", "NUM:money", "NUM:ord", "NUM:other", "NUM:period", "NUM:perc", "NUM:speed", "NUM:temp", "NUM:volsize", "NUM:weight"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "trec", "config_name": "default", "version": {"version_str": "2.0.0", "description": "Fine label contains 50 classes instead of 47.", "major": 2, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 385090, "num_examples": 5452, "dataset_name": "trec"}, "test": {"name": "test", "num_bytes": 27983, "num_examples": 500, "dataset_name": "trec"}}, "download_checksums": {"https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label": {"num_bytes": 335858, "checksum": "9e4c8bdcaffb96ed61041bd64b564183d52793a8e91d84fc3a8646885f466ec3"}, "https://cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label": {"num_bytes": 23354, "checksum": "033f22c028c2bbba9ca682f68ffe204dc1aa6e1cf35dd6207f2d4ca67f0d0e8e"}}, "download_size": 359212, "post_processing_size": null, "dataset_size": 413073, "size_in_bytes": 772285}} \ No newline at end of file From 9aeba54eb7b4f404ff9cc308bea08c353f5be319 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Sun, 7 Aug 2022 11:47:36 +0200 Subject: [PATCH 4/5] Update dummy data path --- datasets/trec/dummy/{1.1.0 => 2.0.0}/dummy_data.zip | Bin 1 file changed, 0 insertions(+), 0 deletions(-) rename datasets/trec/dummy/{1.1.0 => 2.0.0}/dummy_data.zip (100%) diff --git a/datasets/trec/dummy/1.1.0/dummy_data.zip b/datasets/trec/dummy/2.0.0/dummy_data.zip similarity index 100% rename from datasets/trec/dummy/1.1.0/dummy_data.zip rename to datasets/trec/dummy/2.0.0/dummy_data.zip From 395dc7fdcdd3075b2b9b5ec431b56d2476bed7e8 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 8 Aug 2022 21:11:40 +0200 Subject: [PATCH 5/5] Remove tags tag from dataset card --- datasets/trec/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/datasets/trec/README.md b/datasets/trec/README.md index 8b383e439b0..9ffebf5afcf 100644 --- a/datasets/trec/README.md +++ b/datasets/trec/README.md @@ -14,7 +14,6 @@ size_categories: - 1K