Skip to content

Commit 366ea28

Browse files
Fix type hint for data_files (#2793)
1 parent 0ab2cae commit 366ea28

3 files changed

Lines changed: 24 additions & 22 deletions

File tree

docs/source/loading_datasets.rst

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -175,11 +175,11 @@ Generic loading scripts are provided for:
175175

176176
If you want to control better how your files are loaded, or if you have a file format exactly reproducing the file format for one of the datasets provided on the `HuggingFace Hub <https://huggingface.co/datasets>`__, it can be more flexible and simpler to create **your own loading script**, from scratch or by adapting one of the provided loading scripts. In this case, please go check the :doc:`add_dataset` chapter.
177177

178-
The :obj:`data_files` argument in :func:`datasets.load_dataset` is used to provide paths to one or several files. This argument currently accepts three types of inputs:
178+
The :obj:`data_files` argument in :func:`datasets.load_dataset` is used to provide paths to one or several data source files. This argument currently accepts three types of inputs:
179179

180-
- :obj:`str`: a single string as the path to a single file (considered to constitute the `train` split by default)
181-
- :obj:`List[str]`: a list of strings as paths to a list of files (also considered to constitute the `train` split by default)
182-
- :obj:`Dict[Union[str, List[str]]]`: a dictionary mapping splits names to a single file or a list of files.
180+
- :obj:`str`: A single string as the path to a single file (considered to constitute the `train` split by default).
181+
- :obj:`Sequence[str]`: A list of strings as paths to a list of files (also considered to constitute the `train` split by default).
182+
- :obj:`Mapping[str, Union[str, Sequence[str]]]`: A dictionary mapping split names to a single file path or a list of file paths.
183183

184184
Let's see an example of all the various ways you can provide files to :func:`datasets.load_dataset`:
185185

@@ -490,9 +490,9 @@ For example, run the following to get the path to the cache directory of the IMD
490490
>>> dataset_builder = load_dataset_builder('imdb')
491491
>>> print(dataset_builder.cache_dir)
492492
/Users/thomwolf/.cache/huggingface/datasets/imdb/plain_text/1.0.0/fdc76b18d5506f14b0646729b8d371880ef1bc48a26d00835a7f3da44004b676
493-
>>> print(dataset_builder.info.features)
493+
>>> print(dataset_builder.info.features)
494494
{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)}
495-
>>> print(dataset_builder.info.splits)
495+
>>> print(dataset_builder.info.splits)
496496
{'train': SplitInfo(name='train', num_bytes=33432835, num_examples=25000, dataset_name='imdb'), 'test': SplitInfo(name='test', num_bytes=32650697, num_examples=25000, dataset_name='imdb'), 'unsupervised': SplitInfo(name='unsupervised', num_bytes=67106814, num_examples=50000, dataset_name='imdb')}
497497
498498
You can see all the attributes of ``dataset_builder.info`` in the documentation of :class:`datasets.DatasetInfo`

src/datasets/builder.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
import urllib
2727
from dataclasses import dataclass
2828
from functools import partial
29-
from typing import Dict, List, Optional, Tuple, Union
29+
from typing import Dict, Mapping, Optional, Sequence, Tuple, Union
3030

3131
from datasets.features import Features
3232
from datasets.utils.mock_download_manager import MockDownloadManager
@@ -74,14 +74,14 @@ class BuilderConfig:
7474
name (:obj:`str`, default ``"default"``):
7575
version (:class:`Version` or :obj:`str`, optional):
7676
data_dir (:obj:`str`, optional):
77-
data_files (:obj:`str` or :obj:`dict` or :obj:`list` or :obj:`tuple`, optional):
77+
data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
7878
description (:obj:`str`, optional):
7979
"""
8080

8181
name: str = "default"
8282
version: Optional[Union[str, utils.Version]] = "0.0.0"
8383
data_dir: Optional[str] = None
84-
data_files: Optional[Union[str, Dict, List, Tuple]] = None
84+
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None
8585
description: Optional[str] = None
8686

8787
def __post_init__(self):
@@ -963,7 +963,7 @@ def _as_streaming_dataset_single(
963963
ex_iterable = self._get_examples_iterable_for_split(splits_generator)
964964
return IterableDataset(ex_iterable, info=self.info, split=splits_generator.name)
965965

966-
def _post_process(self, dataset: Dataset, resources_paths: Dict[str, str]) -> Optional[Dataset]:
966+
def _post_process(self, dataset: Dataset, resources_paths: Mapping[str, str]) -> Optional[Dataset]:
967967
"""Run dataset transforms or add indexes"""
968968
return None
969969

src/datasets/load.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import shutil
2525
import time
2626
from pathlib import Path
27-
from typing import Dict, List, Optional, Tuple, Type, Union
27+
from typing import List, Mapping, Optional, Sequence, Tuple, Type, Union
2828
from urllib.parse import urlparse
2929

3030
import fsspec
@@ -239,7 +239,6 @@ def prepare_module(
239239
and using cloudpickle (among other things).
240240
241241
Args:
242-
243242
path (str):
244243
path to the dataset or metric script, can be either:
245244
- a path to a local directory containing the dataset processing python script
@@ -262,7 +261,8 @@ def prepare_module(
262261
If True, the url or path to the resolved dataset or metric script is returned with the other outputs
263262
download_kwargs: optional attributes for DownloadConfig() which will override the attributes in download_config if supplied.
264263
265-
Return: Tuple[``str``, ``str``] with
264+
Returns:
265+
Tuple[``str``, ``str``]:
266266
1. The module path being
267267
- the import path of the dataset/metric package if force_local_path is False: e.g. 'datasets.datasets.squad'
268268
- the local path to the dataset/metric file if force_local_path is True: e.g. '/User/huggingface/datasets/datasets/squad/squad.py'
@@ -635,7 +635,7 @@ def load_dataset_builder(
635635
path: str,
636636
name: Optional[str] = None,
637637
data_dir: Optional[str] = None,
638-
data_files: Union[Dict, List] = None,
638+
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
639639
cache_dir: Optional[str] = None,
640640
features: Optional[Features] = None,
641641
download_config: Optional[DownloadConfig] = None,
@@ -659,7 +659,7 @@ def load_dataset_builder(
659659
e.g. ``'squad'``, ``'glue'`` or ``'openai/webtext'``.
660660
name (:obj:`str`, optional): Defining the name of the dataset configuration.
661661
data_dir (:obj:`str`, optional): Defining the data_dir of the dataset configuration.
662-
data_files (:obj:`str`, optional): Defining the data_files of the dataset configuration.
662+
data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
663663
cache_dir (:obj:`str`, optional): Directory to read/write data. Defaults to "~/datasets".
664664
features (:class:`Features`, optional): Set the features type to use for this dataset.
665665
download_config (:class:`~utils.DownloadConfig`, optional): Specific download configuration parameters.
@@ -717,7 +717,7 @@ def load_dataset(
717717
path: str,
718718
name: Optional[str] = None,
719719
data_dir: Optional[str] = None,
720-
data_files: Union[Dict, List] = None,
720+
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
721721
split: Optional[Union[str, Split]] = None,
722722
cache_dir: Optional[str] = None,
723723
features: Optional[Features] = None,
@@ -765,7 +765,7 @@ def load_dataset(
765765
e.g. ``'squad'``, ``'glue'`` or ``'openai/webtext'``.
766766
name (:obj:`str`, optional): Defining the name of the dataset configuration.
767767
data_dir (:obj:`str`, optional): Defining the data_dir of the dataset configuration.
768-
data_files (:obj:`str`, optional): Defining the data_files of the dataset configuration.
768+
data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
769769
split (:class:`Split` or :obj:`str`): Which split of the data to load.
770770
If None, will return a `dict` with all splits (typically `datasets.Split.TRAIN` and `datasets.Split.TEST`).
771771
If given, will return a single Dataset.
@@ -798,11 +798,13 @@ def load_dataset(
798798
799799
Returns:
800800
:class:`Dataset` or :class:`DatasetDict`:
801-
if `split` is not None: the dataset requested,
802-
if `split` is None, a ``datasets.DatasetDict`` with each split.
803-
or :class:`IterableDataset` or :class:`IterableDatasetDict` if streaming=True:
804-
if `split` is not None: the dataset requested,
805-
if `split` is None, a ``datasets.streaming.IterableDatasetDict`` with each split.
801+
- if `split` is not None: the dataset requested,
802+
- if `split` is None, a ``datasets.DatasetDict`` with each split.
803+
804+
or :class:`IterableDataset` or :class:`IterableDatasetDict`: if streaming=True
805+
806+
- if `split` is not None: the dataset requested,
807+
- if `split` is None, a ``datasets.streaming.IterableDatasetDict`` with each split.
806808
807809
"""
808810
ignore_verifications = ignore_verifications or save_infos

0 commit comments

Comments
 (0)