huggingface · lewtun · May 18, 2021 · Apr 23, 2021 · Apr 23, 2021 · Apr 23, 2021
diff --git a/docs/source/package_reference/main_classes.rst b/docs/source/package_reference/main_classes.rst
@@ -32,7 +32,7 @@ The base class :class:`datasets.Dataset` implements a Dataset backed by an Apach
         info, split, builder_name, citation, config_name, dataset_size,
         description, download_checksums, download_size, features, homepage,
         license, size_in_bytes, supervised_keys, version,
-        from_csv, from_json, from_text,
+        from_csv, from_json, from_text, prepare_for_task,
 
 .. autofunction:: datasets.concatenate_datasets
 
@@ -54,7 +54,7 @@ It also has dataset transform methods like map or filter, to process all the spl
         flatten_, cast_, remove_columns_, rename_column_,
         flatten, cast, remove_columns, rename_column, class_encode_column,
         save_to_disk, load_from_disk,
-        from_csv, from_json, from_text,
+        from_csv, from_json, from_text, prepare_for_task
 
 
 ``Features``

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -57,6 +57,7 @@
 from .search import IndexableMixin
 from .splits import NamedSplit
 from .table import ConcatenationTable, InMemoryTable, MemoryMappedTable, Table, concat_tables, list_table_cache_files
+from .tasks import TaskTemplate
 from .utils import map_nested
 from .utils.deprecation_utils import deprecated
 from .utils.file_utils import estimate_dataset_size
@@ -1384,6 +1385,44 @@ def with_transform(
         dataset.set_transform(transform=transform, columns=columns, output_all_columns=output_all_columns)
         return dataset
 
+    def prepare_for_task(self, task: Union[str, TaskTemplate]) -> "Dataset":
+        """Prepare a dataset for the given task.
+
+        Drops the columns the task template does not map, renames the mapped columns through the template's
+        ``column_mapping`` and casts the result to the template's ``features``.
+
+        Args:
+            task (:obj:`Union[str, TaskTemplate]`): The task to prepare the dataset for. A :obj:`str` such as
+                :obj:`"text-classification"` or :obj:`"question-answering"` is matched against the ``task``
+                attribute of the templates in :attr:`datasets.DatasetInfo.task_templates`; a
+                :obj:`TaskTemplate` is used directly.
+
+        Returns:
+            :class:`Dataset`: The dataset obtained after removing, renaming and casting columns.
+
+        Raises:
+            ValueError: If ``task`` is a :obj:`str` matching no template or more than one template, or if it
+                is neither a :obj:`str` nor a :obj:`TaskTemplate`.
+        """
+        # TODO(lewtun): Add support for casting nested features like answers.text and answers.answer_start in SQuAD
+        if isinstance(task, TaskTemplate):
+            template = task
+        elif isinstance(task, str):
+            # ``task_templates`` may be None when the dataset declares no templates.
+            known_templates = self.info.task_templates or []
+            matches = [candidate for candidate in known_templates if candidate.task == task]
+            if not matches:
+                tasks = [candidate.task for candidate in known_templates]
+                raise ValueError(f"Task {task} is not compatible with this dataset! Available tasks: {tasks}")
+            if len(matches) > 1:
+                raise ValueError(
+                    f"Expected 1 task template but found {len(matches)}! Please ensure that `datasets.DatasetInfo.task_templates` contains a unique set of task types."
+                )
+            template = matches[0]
+        else:
+            raise ValueError(
+                f"Expected a `str` or `datasets.tasks.TaskTemplate` object but got task {task} with type {type(task)}."
+            )
+        column_mapping = template.column_mapping
+        # Anything that is not a key of the mapping is not used by the task.
+        unused_columns = [name for name in self.column_names if name not in column_mapping]
+        return self.remove_columns(unused_columns).rename_columns(column_mapping).cast(features=template.features)
+
     def _getitem(
         self,
         key: Union[int, slice, str],

diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
@@ -13,6 +13,7 @@
 from .features import Features
 from .filesystems import extract_path_from_uri, is_remote_filesystem
 from .table import Table
+from .tasks import TaskTemplate
 from .utils.deprecation_utils import deprecated
 from .utils.typing import PathLike
 
@@ -790,3 +791,19 @@ def from_text(
         return TextDatasetReader(
             path_or_paths, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs
         ).read()
+
+    def prepare_for_task(self, task: Union[str, TaskTemplate]):
+        """Prepare every dataset of this dictionary for the given task.
+
+        Calls ``prepare_for_task`` with the same ``task`` on each value and collects the results under the
+        original keys.
+
+        Args:
+            task (:obj:`Union[str, TaskTemplate]`): The task to prepare the datasets for during training and
+                evaluation. If :obj:`str`, supported tasks include :obj:`"text-classification"` and
+                :obj:`"question-answering"`. If :obj:`TaskTemplate`, must be one of the task templates in
+                :obj:`datasets.tasks`.
+
+        Returns:
+            :class:`DatasetDict`: A new dictionary holding the prepared datasets.
+        """
+        self._check_values_type()
+        prepared_splits = {name: split.prepare_for_task(task=task) for name, split in self.items()}
+        return DatasetDict(prepared_splits)
diff --git a/src/datasets/info.py b/src/datasets/info.py
@@ -39,6 +39,7 @@
 from . import config
 from .features import Features, Value
 from .splits import SplitDict
+from .tasks import TaskTemplate, task_template_from_dict
 from .utils import Version
 from .utils.logging import get_logger
 
@@ -108,6 +109,7 @@ class DatasetInfo:
         post_processing_size (int, optional):
         dataset_size (int, optional):
         size_in_bytes (int, optional):
+        task_templates (List[TaskTemplate], optional):
     """
 
     # Set in the dataset scripts
@@ -118,6 +120,7 @@ class DatasetInfo:
     features: Optional[Features] = None
     post_processed: Optional[PostProcessedInfo] = None
     supervised_keys: Optional[SupervisedKeysData] = None
+    task_templates: Optional[List[TaskTemplate]] = None
 
     # Set later by the builder
     builder_name: Optional[str] = None
@@ -150,6 +153,19 @@ def __post_init__(self):
             else:
                 self.supervised_keys = SupervisedKeysData(**self.supervised_keys)
 
+        if self.task_templates is not None:
+            if isinstance(self.task_templates, (list, tuple)):
+                templates = [
+                    template if isinstance(template, TaskTemplate) else task_template_from_dict(template)
+                    for template in self.task_templates
+                ]
+                self.task_templates = [template for template in templates if template is not None]
+            elif isinstance(self.task_templates, TaskTemplate):
+                self.task_templates = [self.task_templates]
+            else:
+                template = task_template_from_dict(self.task_templates)
+                self.task_templates = [template] if template is not None else []
+
     def _license_path(self, dataset_info_dir):
         return os.path.join(dataset_info_dir, config.LICENSE_FILENAME)
 
@@ -188,6 +204,16 @@ def unique(values):
         license = "\n\n".join(unique(info.license for info in dataset_infos))
         features = None
         supervised_keys = None
+        task_templates = None
+
+        # Find common task templates across all dataset infos
+        all_task_templates = [info.task_templates for info in dataset_infos if info.task_templates is not None]
+        if len(all_task_templates) > 1:
+            task_templates = list(set(all_task_templates[0]).intersection(*all_task_templates[1:]))
+        elif len(all_task_templates):
+            task_templates = list(set(all_task_templates[0]))
+        # If no common task templates found, replace empty list with None
+        task_templates = task_templates if task_templates else None
 
         return cls(
             description=description,
@@ -196,6 +222,7 @@ def unique(values):
             license=license,
             features=features,
             supervised_keys=supervised_keys,
+            task_templates=task_templates,
         )
 
     @classmethod

diff --git a/src/datasets/load.py b/src/datasets/load.py
@@ -38,6 +38,7 @@
 from .metric import Metric
 from .packaged_modules import _PACKAGED_DATASETS_MODULES, hash_python_lines
 from .splits import Split
+from .tasks import TaskTemplate
 from .utils.download_manager import GenerateMode
 from .utils.file_utils import (
     DownloadConfig,
@@ -635,6 +636,7 @@ def load_dataset(
     save_infos: bool = False,
     script_version: Optional[Union[str, Version]] = None,
     use_auth_token: Optional[Union[bool, str]] = None,
+    task: Optional[Union[str, TaskTemplate]] = None,
     **config_kwargs,
 ) -> Union[DatasetDict, Dataset]:
     """Load a dataset.
@@ -694,6 +696,7 @@ def load_dataset(
               You can specify a different version that the default "main" by using a commit sha or a git tag of the dataset repository.
         use_auth_token (``str`` or ``bool``, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
             If True, will get token from `"~/.huggingface"`.
+        task (``str``): The task to prepare the dataset for during training and evaluation. Casts the dataset's :class:`Features` according to standardized column names and types as detailed in `~tasks`.
         **config_kwargs: Keyword arguments to be passed to the :class:`BuilderConfig` and used in the :class:`DatasetBuilder`.
 
     Returns:
@@ -752,6 +755,9 @@ def load_dataset(
         keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
     )
     ds = builder_instance.as_dataset(split=split, ignore_verifications=ignore_verifications, in_memory=keep_in_memory)
+    # Rename and cast features to match task schema
+    if task is not None:
+        ds = ds.prepare_for_task(task)
     if save_infos:
         builder_instance._save_infos()
 

diff --git a/src/datasets/tasks/__init__.py b/src/datasets/tasks/__init__.py
@@ -0,0 +1,21 @@
+from typing import Optional
+
+from .base import TaskTemplate
+from .question_answering import QuestionAnswering
+from .text_classification import TextClassification
+
+
+# Names exported by ``from datasets.tasks import *``.
+__all__ = ["TaskTemplate", "QuestionAnswering", "TextClassification"]
+
+
+# Maps each template's ``task`` identifier to the template class; read by ``task_template_from_dict``.
+NAME2TEMPLATE = {QuestionAnswering.task: QuestionAnswering, TextClassification.task: TextClassification}
+
+
+def task_template_from_dict(task_template_dict: dict) -> Optional[TaskTemplate]:
+    """Build the task template described by ``task_template_dict``.
+
+    The template class is looked up in ``NAME2TEMPLATE`` under ``task_template_dict["task"]`` and
+    instantiated through its ``from_dict`` classmethod.
+
+    Args:
+        task_template_dict (:obj:`dict`): Serialized template; its ``"task"`` entry selects the class.
+
+    Returns:
+        :obj:`Optional[TaskTemplate]`: The template instance, or ``None`` when the ``"task"`` entry is
+        missing (or ``None``) or names a task that is not registered in ``NAME2TEMPLATE``.
+    """
+    task_name = task_template_dict.get("task")
+    template_cls = NAME2TEMPLATE.get(task_name) if task_name is not None else None
+    return None if template_cls is None else template_cls.from_dict(task_template_dict)
diff --git a/src/datasets/tasks/base.py b/src/datasets/tasks/base.py
@@ -0,0 +1,26 @@
+import abc
+from dataclasses import dataclass
+from typing import ClassVar, Dict
+
+from ..features import Features
+
+
+@dataclass(frozen=True)
+class TaskTemplate(abc.ABC):
+    """Abstract base class for task templates.
+
+    A concrete template declares the class-level ``task`` identifier together with the ``input_schema``
+    and ``label_schema`` features, and implements ``column_mapping`` and ``from_dict``.
+    """
+
+    # Identifier of the task, e.g. the key under which a template class is registered.
+    task: ClassVar[str]
+    # Features of the task's input columns.
+    input_schema: ClassVar[Features]
+    # Features of the task's label columns.
+    label_schema: ClassVar[Features]
+
+    @property
+    def features(self) -> Features:
+        """:class:`Features` made of the entries of ``input_schema`` followed by those of ``label_schema``."""
+        return Features(**self.input_schema, **self.label_schema)
+
+    @property
+    @abc.abstractmethod
+    def column_mapping(self) -> Dict[str, str]:
+        """Mapping of column names that subclasses must provide.
+
+        Raises:
+            NotImplementedError: Always, when the base implementation is reached.
+        """
+        # ``NotImplemented`` is the sentinel for binary operators; an unimplemented abstract member
+        # must raise so that a stray ``super()`` call cannot leak the sentinel as a mapping.
+        raise NotImplementedError
+
+    @classmethod
+    @abc.abstractmethod
+    def from_dict(cls, template_dict: dict) -> "TaskTemplate":
+        """Alternate constructor that subclasses must provide to build a template from a ``dict``.
+
+        Raises:
+            NotImplementedError: Always, when the base implementation is reached.
+        """
+        raise NotImplementedError
diff --git a/src/datasets/tasks/question_answering.py b/src/datasets/tasks/question_answering.py
@@ -0,0 +1,41 @@
+from dataclasses import dataclass
+from typing import Dict
+
+from ..features import Features, Sequence, Value
+from .base import TaskTemplate
+
+
+@dataclass(frozen=True)
+class QuestionAnswering(TaskTemplate):
+    """Task template for question answering.
+
+    Maps the question, context and answers columns of a dataset onto the standard ``question``,
+    ``context`` and ``answers`` columns declared by ``input_schema`` and ``label_schema``.
+
+    Args:
+        question_column (:obj:`str`): Name of the dataset column holding the questions. Defaults to ``"question"``.
+        context_column (:obj:`str`): Name of the dataset column holding the contexts. Defaults to ``"context"``.
+        answers_column (:obj:`str`): Name of the dataset column holding the answers. Defaults to ``"answers"``.
+    """
+
+    task = "question-answering"
+    input_schema = Features({"question": Value("string"), "context": Value("string")})
+    label_schema = Features(
+        {
+            "answers": Sequence(
+                {
+                    "text": Value("string"),
+                    "answer_start": Value("int32"),
+                }
+            )
+        }
+    )
+    question_column: str = "question"
+    context_column: str = "context"
+    answers_column: str = "answers"
+
+    # No ``__post_init__``: the generated ``__init__`` already stores the three fields, so
+    # re-assigning each one to its own value added nothing.
+
+    @property
+    def column_mapping(self) -> Dict[str, str]:
+        """Mapping from this template's dataset column names to the standard column names."""
+        return {self.question_column: "question", self.context_column: "context", self.answers_column: "answers"}
+
+    @classmethod
+    def from_dict(cls, template_dict: dict) -> "QuestionAnswering":
+        """Build a template from a ``dict``.
+
+        Args:
+            template_dict (:obj:`dict`): Must contain the keys ``"question_column"``,
+                ``"context_column"`` and ``"answers_column"``.
+
+        Raises:
+            KeyError: If one of the three keys is missing.
+        """
+        return cls(
+            question_column=template_dict["question_column"],
+            context_column=template_dict["context_column"],
+            answers_column=template_dict["answers_column"],
+        )
diff --git a/src/datasets/tasks/text_classification.py b/src/datasets/tasks/text_classification.py
@@ -0,0 +1,42 @@
+from dataclasses import dataclass
+from typing import Dict, List
+
+from ..features import ClassLabel, Features, Value
+from .base import TaskTemplate
+
+
+@dataclass(frozen=True)
+class TextClassification(TaskTemplate):
+    """Task template for text classification.
+
+    Maps a text column and a label column of a dataset onto the standard ``text`` and ``labels`` columns,
+    with ``labels`` typed as a :class:`ClassLabel` built from this template's label names.
+
+    Args:
+        labels (:obj:`List[str]`): Unique label names. Stored as a sorted tuple.
+        text_column (:obj:`str`): Name of the dataset column holding the text. Defaults to ``"text"``.
+        label_column (:obj:`str`): Name of the dataset column holding the labels. Defaults to ``"labels"``.
+
+    Raises:
+        ValueError: If ``labels`` contains duplicates.
+    """
+
+    task = "text-classification"
+    input_schema = Features({"text": Value("string")})
+    # TODO(lewtun): Since we update this in __post_init__ do we need to set a default? We'll need it for __init__ so
+    # investigate if there's a more elegant approach.
+    # Class-level placeholder only: every instance shadows it with its own schema in ``__post_init__``.
+    label_schema = Features({"labels": ClassLabel})
+    labels: List[str]
+    text_column: str = "text"
+    label_column: str = "labels"
+
+    def __post_init__(self):
+        """Validate ``labels``, then derive the per-instance label schema and the label/id lookups."""
+        # Raise explicitly rather than ``assert`` so the check is not stripped under ``python -O``.
+        if len(set(self.labels)) != len(self.labels):
+            raise ValueError("Labels must be unique")
+        # The dataclass is frozen, so attributes have to be set through ``object.__setattr__``.
+        # Cast labels to tuple to allow hashing
+        object.__setattr__(self, "labels", tuple(sorted(self.labels)))
+        # Build a fresh schema per instance. Assigning into the class-level ``label_schema`` would mutate
+        # the object shared by all templates, leaving each of them with the labels of the latest one.
+        object.__setattr__(self, "label_schema", Features({"labels": ClassLabel(names=self.labels)}))
+        object.__setattr__(self, "label2id", {label: idx for idx, label in enumerate(self.labels)})
+        object.__setattr__(self, "id2label", {idx: label for label, idx in self.label2id.items()})
+
+    @property
+    def column_mapping(self) -> Dict[str, str]:
+        """Mapping from this template's dataset column names to the standard column names."""
+        return {
+            self.text_column: "text",
+            self.label_column: "labels",
+        }
+
+    @classmethod
+    def from_dict(cls, template_dict: dict) -> "TextClassification":
+        """Build a template from a ``dict``.
+
+        Args:
+            template_dict (:obj:`dict`): Must contain the keys ``"text_column"``, ``"label_column"``
+                and ``"labels"``.
+
+        Raises:
+            KeyError: If one of the three keys is missing.
+        """
+        return cls(
+            text_column=template_dict["text_column"],
+            label_column=template_dict["label_column"],
+            labels=template_dict["labels"],
+        )