From 3bdf2d5b648850c0053667e37e269db58be9c184 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 30 Mar 2022 17:56:09 +0200 Subject: [PATCH 1/7] update tasks list --- src/datasets/utils/metadata.py | 4 +- src/datasets/utils/resources/tasks.json | 260 +++++++++++++++--------- 2 files changed, 162 insertions(+), 102 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 090b59c25a6..a0ed9469ea4 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -321,9 +321,9 @@ def validate_task_categories(task_categories: Union[List[str], Dict[str, List[st def validate_task_ids(task_ids: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. - known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]] + known_set = [tid for _cat, d in known_task_ids.items() for tid in d.get("subtasks", [])] validated, error = tagset_validator( - task_ids, known_set, "task_ids", known_task_ids_url, lambda e: "-other-" in e or e.startswith("other-") + task_ids, known_set, "task_ids", known_task_ids_url, lambda e: not e or "-other-" in e or e.startswith("other-") ) return validated, error diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json index 0585f97b917..4d5658d7645 100644 --- a/src/datasets/utils/resources/tasks.json +++ b/src/datasets/utils/resources/tasks.json @@ -1,54 +1,110 @@ { - "conditional-text-generation": { - "description": "data-to-text and text transduction tasks such as translation or summarization", - "options": [ - "machine-translation", - "sentence-splitting-fusion", - "summarization", - "table-to-text", - "text-simplification", - "explanation-generation", - "other-structured-to-text", - "other" + "audio-classification": { + "type": "audio", + "subtasks": [ + "keyword-spotting", + "speaker-identification", + "speaker-intent-classification", + "emotion-recognition", + "speaker-language-identification" ] }, - "question-answering": { - "description": "question answering tasks", - "options": [ - "open-domain-qa", - "closed-domain-qa", + "audio-to-audio": { + "type": "audio" + }, + "automatic-speech-recognition": { + "type": "multimodal" + }, + "conversational": { + "type": "text", + "subtasks": [ + "dialogue-generation" + ] + }, + "feature-extraction": { + "type": "multimodal" + }, + "fill-mask": { + "type": "text", + "subtasks": [ + "slot-filling", + "masked-language-modeling" + ] + }, + "image-classification": { + "type": "image", + "subtasks": [ + "multi-label-image-classification", + "multi-class-image-classification" + ] + }, + "image-segmentation": { + "type": "image", + "subtasks": [ + "instance-segmentation", + "semantic-segmentation", + "panoptic-segmentation" + ] + }, + "image-to-text": { + "type": "multimodal", + "subtasks": [ + "image-captioning" + ] + }, + "multiple-choice": { + "type": "text", + "subtasks": [ "multiple-choice-qa", + "multiple-choice-coreference-resolution" + ] + }, + "object-detection": { + "type": "image", + "subtasks": [ + "face-detection", + "vehicle-detection" + ] + }, + "question-answering": { + "type": "text", + "aliases": [ + "extractive-question-answering" + ], + "subtasks": [ "extractive-qa", - "abstractive-qa", - "other" + "open-domain-qa", + "closed-domain-qa" ] }, - "sequence-modeling": { - "description": "such as language modeling or dialogue", - "options": [ - "dialogue-modeling", - "language-modeling", - "other-multi-turn", - "slot-filling", - "other" + "sentence-similarity": { + "type": "text" + }, + "tabular-classification": { + "type": "text" + }, + "tabular-to-text": { + "type": "text", + "subtasks": [ + "rdf-to-text" ] }, - "structure-prediction": { - "description": "predicting structural properties of the text, such as syntax", - "options": [ - "coreference-resolution", - "named-entity-recognition", - "part-of-speech-tagging", - "parsing", - "semantic-role-labeling", - "lemmatization", - "word-sense-disambiguation", - "other" + "summarization": { + "type": "text", + "subtasks": [ + "news-articles-summarization", + "news-articles-headline-generation" ] }, + "table-to-text": { + "type": "text" + }, + "table-question-answering": { + "type": "text" + }, "text-classification": { - "description": "predicting a class index or boolean value", - "options": [ + "type": "text", + "subtasks": [ "acceptability-classification", "entity-linking-classification", "fact-checking", @@ -59,90 +115,94 @@ "semantic-similarity-classification", "sentiment-classification", "topic-classification", - "other" + "semantic-similarity-scoring", + "sentiment-scoring", + "sentiment-analysis", + "hate-speech-detection", + "text-scoring" + ] + }, + "text-generation": { + "type": "text", + "subtasks": [ + "dialogue-modeling", + "language-modeling" ] }, "text-retrieval": { - "description": "information or text retrieval tasks", - "options": [ + "type": "text", + "subtasks": [ "document-retrieval", "utterance-retrieval", "entity-linking-retrieval", - "fact-checking-retrieval", - "other" + "fact-checking-retrieval" ] }, - "text-scoring": { - "description": "text scoring tasks, predicting a real valued score for some text", - "options": [ - "semantic-similarity-scoring", - "sentiment-scoring", - "other" + "text-to-image": { + "type": "multimodal" + }, + "text-to-tabular": { + "type": "text", + "subtasks": [ + "relation-extraction", + "semantic-role-labeling" ] }, - "speech-processing": { - "description": "tasks related to the analysis and representations of speech signals", - "options": [ - "automatic-speech-recognition", - "phoneme-recognition", - "keyword-spotting", - "query-by-example-spoken-term-detection", - "speaker-identification", - "automatic-speaker-verification", - "speaker-diarization", - "intent-classification", - "slot-filling", - "emotion-recognition" + "text-to-speech": { + "type": "multimodal" + }, + "text2text-generation": { + "type": "text", + "subtasks": [ + "text-simplification", + "explanation-generation", + "abstractive-qa", + "open-domain-abstractive-qa", + "closed-domain-qa", + "open-book-qa", + "closed-book-qa" ] }, "time-series-forecasting": { - "description": "tasks related to predicting future values of a time series", - "options": [ + "type": "time series", + "subtasks": [ "univariate-time-series-forecasting", "multivariate-time-series-forecasting" ] }, - "object-detection": { - "description": "tasks related to detecting instances of objects from a particular class in an image", - "options": [ - "face-detection", - "other" + "token-classification": { + "type": "text", + "aliases": [ + "structure-prediction" + ], + "subtasks": [ + "named-entity-recognition", + "part-of-speech-tagging", + "parsing", + "lemmatization", + "word-sense-disambiguation", + "coreference-resolution" ] }, - "image-to-text": { - "description": "tasks related to generating text from images", - "options": [ - "image-captioning", - "other" - ] + "translation": { + "type": "text" }, - "text-to-image": { - "description": "tasks related to generating images from text", - "options": [] + "visual-question-answering": { + "type": "multimodal" }, - "image-segmentation": { - "description": "tasks related to detecting and delineating distinct objects in images", - "options": [ - "instance-segmentation", - "semantic-segmentation", - "panoptic-segmentation", - "other" - ] + "voice-activity-detection": { + "type": "audio" }, - "image-classification": { - "description": "tasks related to identifying what images represent", - "options": [ - "multi-label-image-classification", - "single-label-image-classification", - "other" - ] - + "zero-shot-classification": { + "type": "text" + }, + "zero-shot-image-classification": { + "type": "multimodal" + }, + "reinforcement-learning": { + "type": "other" }, - "other": { - "description": "other task family not mentioned here", - "options": [ - "other" - ] + "type": "other" } } \ No newline at end of file From ba102851764b219e99143dc23838891fa3d53168 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 30 Mar 2022 17:57:24 +0200 Subject: [PATCH 2/7] revert bad task verification --- src/datasets/utils/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index a0ed9469ea4..5e64ae8712a 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -323,7 +323,7 @@ def validate_task_ids(task_ids: Union[List[str], Dict[str, List[str]]]) -> Valid # in the near future and we don't want to waste energy in tagging against a moving taxonomy. known_set = [tid for _cat, d in known_task_ids.items() for tid in d.get("subtasks", [])] validated, error = tagset_validator( - task_ids, known_set, "task_ids", known_task_ids_url, lambda e: not e or "-other-" in e or e.startswith("other-") + task_ids, known_set, "task_ids", known_task_ids_url, lambda e: "-other-" in e or e.startswith("other-") ) return validated, error From df3d8b23c18067c7a8b79d7f8aee499c6f2da558 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 1 Apr 2022 14:57:29 +0200 Subject: [PATCH 3/7] speaker-language-identification -> language-identification --- src/datasets/utils/resources/tasks.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json index 4d5658d7645..ef1eca590d9 100644 --- a/src/datasets/utils/resources/tasks.json +++ b/src/datasets/utils/resources/tasks.json @@ -6,7 +6,7 @@ "speaker-identification", "speaker-intent-classification", "emotion-recognition", - "speaker-language-identification" + "language-identification" ] }, "audio-to-audio": { From 8f3add08233dfd80c9f03307d5182711e6222bcc Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 1 Apr 2022 14:57:42 +0200 Subject: [PATCH 4/7] speaker-intent-classification -> intent-classification --- src/datasets/utils/resources/tasks.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json index ef1eca590d9..45e2634cd54 100644 --- a/src/datasets/utils/resources/tasks.json +++ b/src/datasets/utils/resources/tasks.json @@ -4,7 +4,7 @@ "subtasks": [ "keyword-spotting", "speaker-identification", - "speaker-intent-classification", + "intent-classification", "emotion-recognition", "language-identification" ] From 0df92da22b89b503da44c9f7d33f0da947e4e4c2 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 4 Apr 2022 11:48:31 +0200 Subject: [PATCH 5/7] add tabular-classification subtasks from autotrain --- src/datasets/utils/resources/tasks.json | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json index 45e2634cd54..7098c992a59 100644 --- a/src/datasets/utils/resources/tasks.json +++ b/src/datasets/utils/resources/tasks.json @@ -81,7 +81,12 @@ "type": "text" }, "tabular-classification": { - "type": "text" + "type": "text", + "subtasks": [ + "tabular_multi_class_classification", + "tabular_multi_label_classification", + "tabular_single_column_regression" + ] }, "tabular-to-text": { "type": "text", From b11337191e0b4d484431b59a73236d4c7af46480 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 4 Apr 2022 11:52:18 +0200 Subject: [PATCH 6/7] add "audio-" prefix to some audio-classification subtasks --- src/datasets/utils/resources/tasks.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json index 7098c992a59..cee57823bd2 100644 --- a/src/datasets/utils/resources/tasks.json +++ b/src/datasets/utils/resources/tasks.json @@ -4,9 +4,9 @@ "subtasks": [ "keyword-spotting", "speaker-identification", - "intent-classification", - "emotion-recognition", - "language-identification" + "audio-intent-classification", + "audio-emotion-recognition", + "audio-language-identification" ] }, "audio-to-audio": { From 644e008be8847da47b20257d893d52137219c727 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 4 Apr 2022 11:54:39 +0200 Subject: [PATCH 7/7] underscores to dashes --- src/datasets/utils/resources/tasks.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json index cee57823bd2..c85ccf004ad 100644 --- a/src/datasets/utils/resources/tasks.json +++ b/src/datasets/utils/resources/tasks.json @@ -83,9 +83,9 @@ "tabular-classification": { "type": "text", "subtasks": [ - "tabular_multi_class_classification", - "tabular_multi_label_classification", - "tabular_single_column_regression" + "tabular-multi-class-classification", + "tabular-multi-label-classification", + "tabular-single-column-regression" ] }, "tabular-to-text": {