Skip to content

Commit 366ea28

Browse files
Fix type hint for data_files (#2793)
1 parent 0ab2cae commit 366ea28

3 files changed

Lines changed: 24 additions & 22 deletions

File tree

docs/source/loading_datasets.rst

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -175,11 +175,11 @@ Generic loading scripts are provided for:
175175

176176
If you want to control better how your files are loaded, or if you have a file format exactly reproducing the file format for one of the datasets provided on the `HuggingFace Hub <https://huggingface.co/datasets>`__, it can be more flexible and simpler to create **your own loading script**, from scratch or by adapting one of the provided loading scripts. In this case, please go check the :doc:`add_dataset` chapter.
177177

178-
The :obj:`data_files` argument in :func:`datasets.load_dataset` is used to provide paths to one or several files. This argument currently accepts three types of inputs:
178+
The :obj:`data_files` argument in :func:`datasets.load_dataset` is used to provide paths to one or several data source files. This argument currently accepts three types of inputs:
179179

180-
- :obj:`str`: a single string as the path to a single file (considered to constitute the `train` split by default)
181-
- :obj:`List[str]`: a list of strings as paths to a list of files (also considered to constitute the `train` split by default)
182-
- :obj:`Dict[Union[str, List[str]]]`: a dictionary mapping splits names to a single file or a list of files.
180+
- :obj:`str`: A single string as the path to a single file (considered to constitute the `train` split by default).
181+
- :obj:`Sequence[str]`: A list of strings as paths to a list of files (also considered to constitute the `train` split by default).
182+
- :obj:`Mapping[str, Union[str, Sequence[str]]]`: A dictionary mapping split names to a single file path or a list of file paths.
183183

184184
Let's see an example of all the various ways you can provide files to :func:`datasets.load_dataset`:
185185

@@ -490,9 +490,9 @@ For example, run the following to get the path to the cache directory of the IMD
490490
>>> dataset_builder = load_dataset_builder('imdb')
491491
>>> print(dataset_builder.cache_dir)
492492
/Users/thomwolf/.cache/huggingface/datasets/imdb/plain_text/1.0.0/fdc76b18d5506f14b0646729b8d371880ef1bc48a26d00835a7f3da44004b676
493-
>>> print(dataset_builder.info.features)
493+
>>> print(dataset_builder.info.features)
494494
{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)}
495-
>>> print(dataset_builder.info.splits)
495+
>>> print(dataset_builder.info.splits)
496496
{'train': SplitInfo(name='train', num_bytes=33432835, num_examples=25000, dataset_name='imdb'), 'test': SplitInfo(name='test', num_bytes=32650697, num_examples=25000, dataset_name='imdb'), 'unsupervised': SplitInfo(name='unsupervised', num_bytes=67106814, num_examples=50000, dataset_name='imdb')}
497497
498498
You can see all the attributes of ``dataset_builder.info`` in the documentation of :class:`datasets.DatasetInfo`

src/datasets/builder.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
import urllib
2727
from dataclasses import dataclass
2828
from functools import partial
29-
from typing import Dict, List, Optional, Tuple, Union
29+
from typing import Dict, Mapping, Optional, Sequence, Tuple, Union
3030

3131
from datasets.features import Features
3232
from datasets.utils.mock_download_manager import MockDownloadManager
@@ -74,14 +74,14 @@ class BuilderConfig:
7474
name (:obj:`str`, default ``"default"``):
7575
version (:class:`Version` or :obj:`str`, optional):
7676
data_dir (:obj:`str`, optional):
77-
data_files (:obj:`str` or :obj:`dict` or :obj:`list` or :obj:`tuple`, optional):
77+
data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
7878
description (:obj:`str`, optional):
7979
"""
8080

8181
name: str = "default"
8282
version: Optional[Union[str, utils.Version]] = "0.0.0"
8383
data_dir: Optional[str] = None
84-
data_files: Optional[Union[str, Dict, List, Tuple]] = None
84+
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None
8585
description: Optional[str] = None
8686

8787
def __post_init__(self):
@@ -963,7 +963,7 @@ def _as_streaming_dataset_single(
963963
ex_iterable = self._get_examples_iterable_for_split(splits_generator)
964964
return IterableDataset(ex_iterable, info=self.info, split=splits_generator.name)
965965

966-
def _post_process(self, dataset: Dataset, resources_paths: Dict[str, str]) -> Optional[Dataset]:
966+
def _post_process(self, dataset: Dataset, resources_paths: Mapping[str, str]) -> Optional[Dataset]:
967967
"""Run dataset transforms or add indexes"""
968968
return None
969969

src/datasets/load.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import shutil
2525
import time
2626
from pathlib import Path
27-
from typing import Dict, List, Optional, Tuple, Type, Union
27+
from typing import List, Mapping, Optional, Sequence, Tuple, Type, Union
2828
from urllib.parse import urlparse
2929

3030
import fsspec
@@ -239,7 +239,6 @@ def prepare_module(
239239
and using cloudpickle (among other things).
240240
241241
Args:
242-
243242
path (str):
244243
path to the dataset or metric script, can be either:
245244
- a path to a local directory containing the dataset processing python script
@@ -262,7 +261,8 @@ def prepare_module(
262261
If True, the url or path to the resolved dataset or metric script is returned with the other outputs
263262
download_kwargs: optional attributes for DownloadConfig() which will override the attributes in download_config if supplied.
264263
265-
Return: Tuple[``str``, ``str``] with
264+
Returns:
265+
Tuple[``str``, ``str``]:
266266
1. The module path being
267267
- the import path of the dataset/metric package if force_local_path is False: e.g. 'datasets.datasets.squad'
268268
- the local path to the dataset/metric file if force_local_path is True: e.g. '/User/huggingface/datasets/datasets/squad/squad.py'
@@ -635,7 +635,7 @@ def load_dataset_builder(
635635
path: str,
636636
name: Optional[str] = None,
637637
data_dir: Optional[str] = None,
638-
data_files: Union[Dict, List] = None,
638+
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
639639
cache_dir: Optional[str] = None,
640640
features: Optional[Features] = None,
641641
download_config: Optional[DownloadConfig] = None,
@@ -659,7 +659,7 @@ def load_dataset_builder(
659659
e.g. ``'squad'``, ``'glue'`` or ``'openai/webtext'``.
660660
name (:obj:`str`, optional): Defining the name of the dataset configuration.
661661
data_dir (:obj:`str`, optional): Defining the data_dir of the dataset configuration.
662-
data_files (:obj:`str`, optional): Defining the data_files of the dataset configuration.
662+
data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
663663
cache_dir (:obj:`str`, optional): Directory to read/write data. Defaults to "~/datasets".
664664
features (:class:`Features`, optional): Set the features type to use for this dataset.
665665
download_config (:class:`~utils.DownloadConfig`, optional): Specific download configuration parameters.
@@ -717,7 +717,7 @@ def load_dataset(
717717
path: str,
718718
name: Optional[str] = None,
719719
data_dir: Optional[str] = None,
720-
data_files: Union[Dict, List] = None,
720+
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
721721
split: Optional[Union[str, Split]] = None,
722722
cache_dir: Optional[str] = None,
723723
features: Optional[Features] = None,
@@ -765,7 +765,7 @@ def load_dataset(
765765
e.g. ``'squad'``, ``'glue'`` or ``'openai/webtext'``.
766766
name (:obj:`str`, optional): Defining the name of the dataset configuration.
767767
data_dir (:obj:`str`, optional): Defining the data_dir of the dataset configuration.
768-
data_files (:obj:`str`, optional): Defining the data_files of the dataset configuration.
768+
data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
769769
split (:class:`Split` or :obj:`str`): Which split of the data to load.
770770
If None, will return a `dict` with all splits (typically `datasets.Split.TRAIN` and `datasets.Split.TEST`).
771771
If given, will return a single Dataset.
@@ -798,11 +798,13 @@ def load_dataset(
798798
799799
Returns:
800800
:class:`Dataset` or :class:`DatasetDict`:
801-
if `split` is not None: the dataset requested,
802-
if `split` is None, a ``datasets.DatasetDict`` with each split.
803-
or :class:`IterableDataset` or :class:`IterableDatasetDict` if streaming=True:
804-
if `split` is not None: the dataset requested,
805-
if `split` is None, a ``datasets.streaming.IterableDatasetDict`` with each split.
801+
- if `split` is not None: the dataset requested,
802+
- if `split` is None, a ``datasets.DatasetDict`` with each split.
803+
804+
or :class:`IterableDataset` or :class:`IterableDatasetDict`: if streaming=True
805+
806+
- if `split` is not None: the dataset requested,
807+
- if `split` is None, a ``datasets.streaming.IterableDatasetDict`` with each split.
806808
807809
"""
808810
ignore_verifications = ignore_verifications or save_infos

0 commit comments

Comments
 (0)