From 22cff41a61824fd31fd7fd3a81a5a3d042588c7c Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 12 Aug 2021 16:39:50 +0200
Subject: [PATCH] Fix type hint for data_files

---
 docs/source/loading_datasets.rst | 12 ++++++------
 src/datasets/builder.py          |  8 ++++----
 src/datasets/load.py             | 26 ++++++++++++++------------
 3 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/docs/source/loading_datasets.rst b/docs/source/loading_datasets.rst
index 02bb5a8c7bc..83086170e97 100644
--- a/docs/source/loading_datasets.rst
+++ b/docs/source/loading_datasets.rst
@@ -175,11 +175,11 @@ Generic loading scripts are provided for:
 
 If you want to control better how your files are loaded, or if you have a file format exactly reproducing the file format for one of the datasets provided on the `HuggingFace Hub <https://huggingface.co/datasets>`__, it can be more flexible and simpler to create **your own loading script**, from scratch or by adapting one of the provided loading scripts. In this case, please go check the :doc:`add_dataset` chapter.
 
-The :obj:`data_files` argument in :func:`datasets.load_dataset` is used to provide paths to one or several files. This argument currently accepts three types of inputs:
+The :obj:`data_files` argument in :func:`datasets.load_dataset` is used to provide paths to one or several data source files. This argument currently accepts three types of inputs:
 
-- :obj:`str`: a single string as the path to a single file (considered to constitute the `train` split by default)
-- :obj:`List[str]`: a list of strings as paths to a list of files (also considered to constitute the `train` split by default)
-- :obj:`Dict[Union[str, List[str]]]`: a dictionary mapping splits names to a single file or a list of files.
+- :obj:`str`: A single string as the path to a single file (considered to constitute the `train` split by default).
+- :obj:`Sequence[str]`: A list of strings as paths to a list of files (also considered to constitute the `train` split by default).
+- :obj:`Mapping[str, Union[str, Sequence[str]]]`: A dictionary mapping split names to a single file path or a list of file paths.
 
 Let's see an example of all the various ways you can provide files to :func:`datasets.load_dataset`:
 
@@ -490,9 +490,9 @@ For example, run the following to get the path to the cache directory of the IMD
 
     >>> dataset_builder = load_dataset_builder('imdb')
     >>> print(dataset_builder.cache_dir)
     /Users/thomwolf/.cache/huggingface/datasets/imdb/plain_text/1.0.0/fdc76b18d5506f14b0646729b8d371880ef1bc48a26d00835a7f3da44004b676
-    >>> print(dataset_builder.info.features) 
+    >>> print(dataset_builder.info.features)
     {'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)}
-    >>> print(dataset_builder.info.splits) 
+    >>> print(dataset_builder.info.splits)
     {'train': SplitInfo(name='train', num_bytes=33432835, num_examples=25000, dataset_name='imdb'), 'test': SplitInfo(name='test', num_bytes=32650697, num_examples=25000, dataset_name='imdb'), 'unsupervised': SplitInfo(name='unsupervised', num_bytes=67106814, num_examples=50000, dataset_name='imdb')}
 
 You can see all the attributes of ``dataset_builder.info`` in the documentation of :class:`datasets.DatasetInfo`

diff --git a/src/datasets/builder.py b/src/datasets/builder.py
index c6f9f346a12..ec4b29671b7 100644
--- a/src/datasets/builder.py
+++ b/src/datasets/builder.py
@@ -26,7 +26,7 @@
 import urllib
 from dataclasses import dataclass
 from functools import partial
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, Mapping, Optional, Sequence, Tuple, Union
 
 from datasets.features import Features
 from datasets.utils.mock_download_manager import MockDownloadManager
@@ -74,14 +74,14 @@ class BuilderConfig:
         name (:obj:`str`, default ``"default"``):
         version (:class:`Version` or :obj:`str`, optional):
         data_dir (:obj:`str`, optional):
-        data_files (:obj:`str` or :obj:`dict` or :obj:`list` or :obj:`tuple`, optional): Path(s) to source data file(s).
+        data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
         description (:obj:`str`, optional):
     """
 
     name: str = "default"
     version: Optional[Union[str, utils.Version]] = "0.0.0"
     data_dir: Optional[str] = None
-    data_files: Optional[Union[str, Dict, List, Tuple]] = None
+    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None
     description: Optional[str] = None
 
     def __post_init__(self):
@@ -963,7 +963,7 @@ def _as_streaming_dataset_single(
         ex_iterable = self._get_examples_iterable_for_split(splits_generator)
         return IterableDataset(ex_iterable, info=self.info, split=splits_generator.name)
 
-    def _post_process(self, dataset: Dataset, resources_paths: Dict[str, str]) -> Optional[Dataset]:
+    def _post_process(self, dataset: Dataset, resources_paths: Mapping[str, str]) -> Optional[Dataset]:
         """Run dataset transforms or add indexes"""
         return None
 

diff --git a/src/datasets/load.py b/src/datasets/load.py
index a0b43f54332..06fcbfa7292 100644
--- a/src/datasets/load.py
+++ b/src/datasets/load.py
@@ -24,7 +24,7 @@
 import shutil
 import time
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import List, Mapping, Optional, Sequence, Tuple, Type, Union
 from urllib.parse import urlparse
 
 import fsspec
@@ -239,7 +239,6 @@ def prepare_module(
     and using cloudpickle (among other things).
 
     Args:
-
         path (str): path to the dataset or metric script, can be either:
 
             - a path to a local directory containing the dataset processing python script
@@ -262,7 +261,8 @@ def prepare_module(
         If True, the url or path to the resolved dataset or metric script is returned with the other ouputs
         download_kwargs: optional attributes for DownloadConfig() which will override the attributes in download_config if supplied.
 
-    Return: Tuple[``str``, ``str``] with
+    Returns:
+        Tuple[``str``, ``str``]:
         1. The module path being
             - the import path of the dataset/metric package if force_local_path is False: e.g. 'datasets.datasets.squad'
             - the local path to the dataset/metric file if force_local_path is True: e.g. '/User/huggingface/datasets/datasets/squad/squad.py'
@@ -635,7 +635,7 @@ def load_dataset_builder(
     path: str,
     name: Optional[str] = None,
     data_dir: Optional[str] = None,
-    data_files: Union[Dict, List] = None,
+    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
     cache_dir: Optional[str] = None,
     features: Optional[Features] = None,
     download_config: Optional[DownloadConfig] = None,
@@ -659,7 +659,7 @@ def load_dataset_builder(
             e.g. ``'squad'``, ``'glue'`` or ``'openai/webtext'``.
         name (:obj:`str`, optional): Defining the name of the dataset configuration.
         data_dir (:obj:`str`, optional): Defining the data_dir of the dataset configuration.
-        data_files (:obj:`str`, optional): Defining the data_files of the dataset configuration.
+        data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
         cache_dir (:obj:`str`, optional): Directory to read/write data. Defaults to "~/datasets".
         features (:class:`Features`, optional): Set the features type to use for this dataset.
         download_config (:class:`~utils.DownloadConfig`, optional): Specific download configuration parameters.
@@ -717,7 +717,7 @@ def load_dataset(
     path: str,
     name: Optional[str] = None,
     data_dir: Optional[str] = None,
-    data_files: Union[Dict, List] = None,
+    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
     split: Optional[Union[str, Split]] = None,
     cache_dir: Optional[str] = None,
     features: Optional[Features] = None,
@@ -765,7 +765,7 @@ def load_dataset(
            e.g. ``'squad'``, ``'glue'`` or ``'openai/webtext'``.
         name (:obj:`str`, optional): Defining the name of the dataset configuration.
         data_dir (:obj:`str`, optional): Defining the data_dir of the dataset configuration.
-        data_files (:obj:`str`, optional): Defining the data_files of the dataset configuration.
+        data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
         split (:class:`Split` or :obj:`str`): Which split of the data to load.
             If None, will return a `dict` with all splits (typically `datasets.Split.TRAIN` and `datasets.Split.TEST`).
             If given, will return a single Dataset.
@@ -798,11 +798,13 @@ def load_dataset(
 
     Returns:
         :class:`Dataset` or :class:`DatasetDict`:
-        - if `split` is not None: the dataset requested,
-        - if `split` is None, a ``datasets.DatasetDict`` with each split.
-        or :class:`IterableDataset` or :class:`IterableDatasetDict` if streaming=True:
-        - if `split` is not None: the dataset requested,
-        - if `split` is None, a ``datasets.streaming.IterableDatasetDict`` with each split.
+        - if `split` is not None: the dataset requested,
+        - if `split` is None, a ``datasets.DatasetDict`` with each split.
+ + or :class:`IterableDataset` or :class:`IterableDatasetDict`: if streaming=True + + - if `split` is not None: the dataset requested, + - if `split` is None, a ``datasets.streaming.IterableDatasetDict`` with each split. """ ignore_verifications = ignore_verifications or save_infos