From 3bdf2d5b648850c0053667e37e269db58be9c184 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Wed, 30 Mar 2022 17:56:09 +0200
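Subject: [PATCH 1/7] update tasks list

Replace the task taxonomy in tasks.json: each task entry now carries a
"type" and an optional "subtasks" list instead of "description" and
"options". validate_task_ids reads the new "subtasks" key and tolerates
tasks that define none. The predicate passed to tagset_validator is also
extended with a `not e` clause so that it matches empty task ids.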
---
 src/datasets/utils/metadata.py          |   4 +-
 src/datasets/utils/resources/tasks.json | 260 +++++++++++++++---------
 2 files changed, 162 insertions(+), 102 deletions(-)

diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py
index 090b59c25a6..a0ed9469ea4 100644
--- a/src/datasets/utils/metadata.py
+++ b/src/datasets/utils/metadata.py
@@ -321,9 +321,9 @@ def validate_task_categories(task_categories: Union[List[str], Dict[str, List[st
     def validate_task_ids(task_ids: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput:
         # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change
         #   in the near future and we don't want to waste energy in tagging against a moving taxonomy.
-        known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]]
+        known_set = [tid for _cat, d in known_task_ids.items() for tid in d.get("subtasks", [])]
         validated, error = tagset_validator(
-            task_ids, known_set, "task_ids", known_task_ids_url, lambda e: "-other-" in e or e.startswith("other-")
+            task_ids, known_set, "task_ids", known_task_ids_url, lambda e: not e or "-other-" in e or e.startswith("other-")
         )
         return validated, error
 
diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json
index 0585f97b917..4d5658d7645 100644
--- a/src/datasets/utils/resources/tasks.json
+++ b/src/datasets/utils/resources/tasks.json
@@ -1,54 +1,110 @@
 {
-    "conditional-text-generation": {
-        "description": "data-to-text and text transduction tasks such as translation or summarization",
-        "options": [
-            "machine-translation",
-            "sentence-splitting-fusion",
-            "summarization",
-            "table-to-text",
-            "text-simplification",
-            "explanation-generation",
-            "other-structured-to-text",
-            "other"
+    "audio-classification": {
+        "type": "audio",
+        "subtasks": [
+            "keyword-spotting",
+            "speaker-identification",
+            "speaker-intent-classification",
+            "emotion-recognition",
+            "speaker-language-identification"
         ]
     },
-    "question-answering": {
-        "description": "question answering tasks",
-        "options": [
-            "open-domain-qa",
-            "closed-domain-qa",
+    "audio-to-audio": {
+        "type": "audio"
+    },
+    "automatic-speech-recognition": {
+        "type": "multimodal"
+    },
+    "conversational": {
+        "type": "text",
+        "subtasks": [
+            "dialogue-generation"
+        ]
+    },
+    "feature-extraction": {
+        "type": "multimodal"
+    },
+    "fill-mask": {
+        "type": "text",
+        "subtasks": [
+            "slot-filling",
+            "masked-language-modeling"
+        ]
+    },
+    "image-classification": {
+        "type": "image",
+        "subtasks": [
+            "multi-label-image-classification",
+            "multi-class-image-classification"
+        ]
+    },
+    "image-segmentation": {
+        "type": "image",
+        "subtasks": [
+            "instance-segmentation",
+            "semantic-segmentation",
+            "panoptic-segmentation"
+        ]
+    },
+    "image-to-text": {
+        "type": "multimodal",
+        "subtasks": [
+            "image-captioning"
+        ]
+    },
+    "multiple-choice": {
+        "type": "text",
+        "subtasks": [
             "multiple-choice-qa",
+            "multiple-choice-coreference-resolution"
+        ]
+    },
+    "object-detection": {
+        "type": "image",
+        "subtasks": [
+            "face-detection",
+            "vehicle-detection"
+        ]
+    },
+    "question-answering": {
+        "type": "text",
+        "aliases": [
+            "extractive-question-answering"
+        ],
+        "subtasks": [
             "extractive-qa",
-            "abstractive-qa",
-            "other"
+            "open-domain-qa",
+            "closed-domain-qa"
         ]
     },
-    "sequence-modeling": {
-        "description": "such as language modeling or dialogue",
-        "options": [
-            "dialogue-modeling",
-            "language-modeling",
-            "other-multi-turn",
-            "slot-filling",
-            "other"
+    "sentence-similarity": {
+        "type": "text"
+    },
+    "tabular-classification": {
+        "type": "text"
+    },
+    "tabular-to-text": {
+        "type": "text",
+        "subtasks": [
+            "rdf-to-text"
         ]
     },
-    "structure-prediction": {
-        "description": "predicting structural properties of the text, such as syntax",
-        "options": [
-            "coreference-resolution",
-            "named-entity-recognition",
-            "part-of-speech-tagging",
-            "parsing",
-            "semantic-role-labeling",
-            "lemmatization",
-            "word-sense-disambiguation",
-            "other"
+    "summarization": {
+        "type": "text",
+        "subtasks": [
+            "news-articles-summarization",
+            "news-articles-headline-generation"
         ]
     },
+    "table-to-text": {
+        "type": "text"
+    },
+    "table-question-answering": {
+        "type": "text"
+    },
     "text-classification": {
-        "description": "predicting a class index or boolean value",
-        "options": [
+        "type": "text",
+        "subtasks": [
             "acceptability-classification",
             "entity-linking-classification",
             "fact-checking",
@@ -59,90 +115,94 @@
             "semantic-similarity-classification",
             "sentiment-classification",
             "topic-classification",
-            "other"
+            "semantic-similarity-scoring",
+            "sentiment-scoring",
+            "sentiment-analysis",
+            "hate-speech-detection",
+            "text-scoring"
+        ]
+    },
+    "text-generation": {
+        "type": "text",
+        "subtasks": [
+            "dialogue-modeling",
+            "language-modeling"
         ]
     },
     "text-retrieval": {
-        "description": "information or text retrieval tasks",
-        "options": [
+        "type": "text",
+        "subtasks": [
             "document-retrieval",
             "utterance-retrieval",
             "entity-linking-retrieval",
-            "fact-checking-retrieval",
-            "other"
+            "fact-checking-retrieval"
         ]
     },
-    "text-scoring": {
-        "description": "text scoring tasks, predicting a real valued score for some text",
-        "options": [
-            "semantic-similarity-scoring",
-            "sentiment-scoring",
-            "other"
+    "text-to-image": {
+        "type": "multimodal"
+    },
+    "text-to-tabular": {
+        "type": "text",
+        "subtasks": [
+            "relation-extraction",
+            "semantic-role-labeling"
         ]
     },
-    "speech-processing": {
-        "description": "tasks related to the analysis and representations of speech signals",
-        "options": [
-            "automatic-speech-recognition",
-            "phoneme-recognition",
-            "keyword-spotting",
-            "query-by-example-spoken-term-detection",
-            "speaker-identification",
-            "automatic-speaker-verification",
-            "speaker-diarization",
-            "intent-classification",
-            "slot-filling",
-            "emotion-recognition"
+    "text-to-speech": {
+        "type": "multimodal"
+    },
+    "text2text-generation": {
+        "type": "text",
+        "subtasks": [
+            "text-simplification",
+            "explanation-generation",
+            "abstractive-qa",
+            "open-domain-abstractive-qa",
+            "closed-domain-qa",
+            "open-book-qa",
+            "closed-book-qa"
         ]
     },
     "time-series-forecasting": {
-        "description": "tasks related to predicting future values of a time series",
-        "options": [
+        "type": "time series",
+        "subtasks": [
             "univariate-time-series-forecasting",
             "multivariate-time-series-forecasting"
         ]
     },
-    "object-detection": {
-        "description": "tasks related to detecting instances of objects from a particular class in an image",
-        "options": [
-            "face-detection",
-            "other"
+    "token-classification": {
+        "type": "text",
+        "aliases": [
+            "structure-prediction"
+        ],
+        "subtasks": [
+            "named-entity-recognition",
+            "part-of-speech-tagging",
+            "parsing",
+            "lemmatization",
+            "word-sense-disambiguation",
+            "coreference-resolution"
         ]
     },
-    "image-to-text": {
-        "description": "tasks related to generating text from images",
-        "options": [
-            "image-captioning",
-            "other"
-        ]
+    "translation": {
+        "type": "text"
     },
-    "text-to-image": {
-        "description": "tasks related to generating images from text",
-        "options": []
+    "visual-question-answering": {
+        "type": "multimodal"
     },
-    "image-segmentation": {
-        "description": "tasks related to detecting and delineating distinct objects in images",
-        "options": [
-            "instance-segmentation",
-            "semantic-segmentation",
-            "panoptic-segmentation",
-            "other"
-        ]
+    "voice-activity-detection": {
+        "type": "audio"
     },
-    "image-classification": {
-        "description": "tasks related to identifying what images represent",
-        "options": [
-            "multi-label-image-classification",
-            "single-label-image-classification",
-            "other"
-        ]
-
+    "zero-shot-classification": {
+        "type": "text"
+    },
+    "zero-shot-image-classification": {
+        "type": "multimodal"
+    },
+    "reinforcement-learning": {
+        "type": "other"
     },
-
     "other": {
-        "description": "other task family not mentioned here",
-        "options": [
-            "other"
-        ]
+        "type": "other"
     }
 }
\ No newline at end of file

From ba102851764b219e99143dc23838891fa3d53168 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Wed, 30 Mar 2022 17:57:24 +0200
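Subject: [PATCH 2/7] revert bad task verification

Drop the `not e or` clause that the previous patch added to the predicate
passed to tagset_validator in validate_task_ids, restoring the original
check that only matches "other-" style task ids.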
---
 src/datasets/utils/metadata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py
index a0ed9469ea4..5e64ae8712a 100644
--- a/src/datasets/utils/metadata.py
+++ b/src/datasets/utils/metadata.py
@@ -323,7 +323,7 @@ def validate_task_ids(task_ids: Union[List[str], Dict[str, List[str]]]) -> Valid
         #   in the near future and we don't want to waste energy in tagging against a moving taxonomy.
         known_set = [tid for _cat, d in known_task_ids.items() for tid in d.get("subtasks", [])]
         validated, error = tagset_validator(
-            task_ids, known_set, "task_ids", known_task_ids_url, lambda e: not e or "-other-" in e or e.startswith("other-")
+            task_ids, known_set, "task_ids", known_task_ids_url, lambda e: "-other-" in e or e.startswith("other-")
         )
         return validated, error
 

From df3d8b23c18067c7a8b79d7f8aee499c6f2da558 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Fri, 1 Apr 2022 14:57:29 +0200
Subject: [PATCH 3/7] speaker-language-identification ->
 language-identification

---
 src/datasets/utils/resources/tasks.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json
index 4d5658d7645..ef1eca590d9 100644
--- a/src/datasets/utils/resources/tasks.json
+++ b/src/datasets/utils/resources/tasks.json
@@ -6,7 +6,7 @@
             "speaker-identification",
             "speaker-intent-classification",
             "emotion-recognition",
-            "speaker-language-identification"
+            "language-identification"
         ]
     },
     "audio-to-audio": {

From 8f3add08233dfd80c9f03307d5182711e6222bcc Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Fri, 1 Apr 2022 14:57:42 +0200
Subject: [PATCH 4/7] speaker-intent-classification -> intent-classification

---
 src/datasets/utils/resources/tasks.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json
index ef1eca590d9..45e2634cd54 100644
--- a/src/datasets/utils/resources/tasks.json
+++ b/src/datasets/utils/resources/tasks.json
@@ -4,7 +4,7 @@
         "subtasks": [
             "keyword-spotting",
             "speaker-identification",
-            "speaker-intent-classification",
+            "intent-classification",
             "emotion-recognition",
             "language-identification"
         ]

From 0df92da22b89b503da44c9f7d33f0da947e4e4c2 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Mon, 4 Apr 2022 11:48:31 +0200
Subject: [PATCH 5/7] add tabular-classification subtasks from autotrain

---
 src/datasets/utils/resources/tasks.json | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json
index 45e2634cd54..7098c992a59 100644
--- a/src/datasets/utils/resources/tasks.json
+++ b/src/datasets/utils/resources/tasks.json
@@ -81,7 +81,12 @@
         "type": "text"
     },
     "tabular-classification": {
-        "type": "text"
+        "type": "text",
+        "subtasks": [
+            "tabular_multi_class_classification",
+            "tabular_multi_label_classification",
+            "tabular_single_column_regression"
+        ]
     },
     "tabular-to-text": {
         "type": "text",

From b11337191e0b4d484431b59a73236d4c7af46480 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Mon, 4 Apr 2022 11:52:18 +0200
Subject: [PATCH 6/7] add "audio-" prefix to some audio-classification subtasks

---
 src/datasets/utils/resources/tasks.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json
index 7098c992a59..cee57823bd2 100644
--- a/src/datasets/utils/resources/tasks.json
+++ b/src/datasets/utils/resources/tasks.json
@@ -4,9 +4,9 @@
         "subtasks": [
             "keyword-spotting",
             "speaker-identification",
-            "intent-classification",
-            "emotion-recognition",
-            "language-identification"
+            "audio-intent-classification",
+            "audio-emotion-recognition",
+            "audio-language-identification"
         ]
     },
     "audio-to-audio": {

From 644e008be8847da47b20257d893d52137219c727 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Mon, 4 Apr 2022 11:54:39 +0200
Subject: [PATCH 7/7] underscores to dashes

---
 src/datasets/utils/resources/tasks.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json
index cee57823bd2..c85ccf004ad 100644
--- a/src/datasets/utils/resources/tasks.json
+++ b/src/datasets/utils/resources/tasks.json
@@ -83,9 +83,9 @@
     "tabular-classification": {
         "type": "text",
         "subtasks": [
-            "tabular_multi_class_classification",
-            "tabular_multi_label_classification",
-            "tabular_single_column_regression"
+            "tabular-multi-class-classification",
+            "tabular-multi-label-classification",
+            "tabular-single-column-regression"
         ]
     },
     "tabular-to-text": {