2 changes: 1 addition & 1 deletion docs/make.bat
@@ -7,7 +7,7 @@ REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set SOURCEDIR=source
set BUILDDIR=_build

if "%1" == "" goto help
19 changes: 18 additions & 1 deletion docs/source/loading_datasets.rst
@@ -431,7 +431,7 @@ For example, run the following to skip integrity verifications when loading the
Loading datasets offline
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Each dataset builder (e.g. "squad") is a python script that is downloaded and cached from either from the huggingface/datasets GitHub repository or from the `HuggingFace Hub <https://huggingface.co/datasets>`__.
Each dataset builder (e.g. "squad") is a python script that is downloaded and cached from either from the 🤗Datasets GitHub repository or from the `HuggingFace Hub <https://huggingface.co/datasets>`__.
Contributor review comment: I'd suggest a whitespace here: 🤗Datasets => 🤗 Datasets

Only the ``text``, ``csv``, ``json`` and ``pandas`` builders are included in ``datasets`` without requiring external downloads.

Therefore if you don't have an internet connection you can't load a dataset that is not packaged with ``datasets``, unless the dataset is already cached.
@@ -441,6 +441,23 @@ You can even set the environment variable `HF_DATASETS_OFFLINE` to ``1`` to tell
This mode disables all the network calls of the library.
This way, instead of waiting for a dataset builder download to time out, the library looks directly at the cache.
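
For example, assuming the "squad" dataset was already downloaded and cached during an earlier online session, a fully offline session could look like this (an illustrative sketch; the environment variable must be set before ``datasets`` is imported):

.. code-block::

    >>> # started with: HF_DATASETS_OFFLINE=1 python
    >>> from datasets import load_dataset
    >>> dataset = load_dataset('squad')  # served from the local cache, no network calls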

.. _load_dataset_load_builder:

Loading a dataset builder
-----------------------------------------------------------

You can use :func:`datasets.load_dataset_builder` to inspect metadata (cache directory, configs, dataset info, etc.) that is required to build a dataset without downloading the dataset itself.

For example, run the following to get the path to the cache directory of the IMDB dataset:

.. code-block::

>>> from datasets import load_dataset_builder
>>> dataset_builder = load_dataset_builder('imdb')
>>> print(dataset_builder.cache_dir)
/Users/thomwolf/.cache/huggingface/datasets/imdb/plain_text/1.0.0/fdc76b18d5506f14b0646729b8d371880ef1bc48a26d00835a7f3da44004b676
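
Beyond the cache directory, the builder also exposes the dataset metadata through its ``info`` attribute. A short sketch (the exact output depends on the dataset script and is only indicative):

.. code-block::

    >>> print(dataset_builder.info.features)     # column names and feature types
    >>> print(dataset_builder.info.splits)       # declared splits and their sizes, if available
    >>> print(dataset_builder.info.description)  # free-text description of the dataset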


.. _load_dataset_enhancing_performance:

Enhancing performance
2 changes: 2 additions & 0 deletions docs/source/package_reference/loading_methods.rst
@@ -12,6 +12,8 @@ Datasets

.. autofunction:: datasets.load_from_disk

.. autofunction:: datasets.load_dataset_builder

Metrics
~~~~~~~~~~~~~~~~~~~~~

2 changes: 1 addition & 1 deletion src/datasets/__init__.py
@@ -60,7 +60,7 @@
)
from .iterable_dataset import IterableDataset
from .keyhash import KeyHasher
from .load import import_main_class, load_dataset, load_from_disk, load_metric, prepare_module
from .load import import_main_class, load_dataset, load_dataset_builder, load_from_disk, load_metric, prepare_module
from .metric import Metric
from .splits import (
NamedSplit,
11 changes: 8 additions & 3 deletions src/datasets/builder.py
@@ -204,6 +204,7 @@ def __init__(
cache_dir: Optional[str] = None,
name: Optional[str] = None,
hash: Optional[str] = None,
base_path: Optional[str] = None,
features: Optional[Features] = None,
**config_kwargs,
):
@@ -217,8 +218,9 @@
`builder_config`s will have their own subdirectories and versions.
If not provided, uses the first configuration in self.BUILDER_CONFIGS
hash: a hash specific to the dataset code. Used to update the caching directory when the dataset loading
script code is udpated (to avoid reusing old data).
script code is updated (to avoid reusing old data).
The typical caching directory (defined in ``self._relative_data_dir``) is: ``name/version/hash/``
base_path: `str`, base path for relative paths that are used to download files. This can be a remote url.
features: `Features`, optional features that will be used to read/write the dataset
It can be used to changed the :obj:`datasets.Features` description of a dataset for example.
config_kwargs: will override the defaults kwargs in config
@@ -227,6 +229,7 @@
# DatasetBuilder name
self.name: str = camelcase_to_snakecase(self.__class__.__name__)
self.hash: Optional[str] = hash
self.base_path = base_path

# Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
config_kwargs = {key: value for key, value in config_kwargs.items() if value is not None}
@@ -475,13 +478,15 @@ def download_and_prepare(
save_infos (bool): Save the dataset information (checksums/size/splits/...)
try_from_hf_gcs (bool): If True, it will try to download the already prepared dataset from the Hf google cloud storage
dl_manager (Optional ``datasets.DownloadManager``): specific Download Manger to use
base_path: ( Optional ``str``): base path for relative paths that are used to download files. This can be a remote url.
base_path ( Optional ``str``): base path for relative paths that are used to download files. This can be a remote url.
If not specified, the value of the ``base_path`` attribute (``self.base_path``) will be used instead.
use_auth_token (Optional ``Union[str, bool]``): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
If True, will get token from ~/.huggingface.

"""
download_mode = GenerateMode(download_mode or GenerateMode.REUSE_DATASET_IF_EXISTS)
verify_infos = not ignore_verifications
base_path = base_path if base_path is not None else self.base_path

if dl_manager is None:
if download_config is None:
@@ -930,7 +935,7 @@ def _split_generators(self, dl_manager: DownloadManager):

Example:

return[
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={'file': 'train_data.zip'},
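
To illustrate the new ``base_path`` plumbing, a hedged sketch (the script path and URL below are hypothetical): when the argument is omitted, ``download_and_prepare`` now falls back to the ``base_path`` captured at construction time, which ``load_dataset_builder`` sets to the parent of the resolved loading script.

    from datasets import load_dataset_builder

    # Hypothetical loading script that references relative files such as "data/train.csv"
    builder = load_dataset_builder("./my_dataset")

    # Relative paths are resolved against an explicit base_path (local directory or remote url)...
    builder.download_and_prepare(base_path="https://example.com/my_dataset")

    # ...or, when base_path is omitted, against builder.base_path (the parent of the resolved script)
    builder.download_and_prepare()
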
109 changes: 95 additions & 14 deletions src/datasets/load.py
@@ -602,7 +602,7 @@ def load_metric(
Returns:
`datasets.Metric`
"""
module_path, hash = prepare_module(
module_path, _ = prepare_module(
path,
script_version=script_version,
download_config=download_config,
@@ -626,6 +626,87 @@ def load_metric(
return metric


def load_dataset_builder(
path: str,
name: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Union[Dict, List] = None,
cache_dir: Optional[str] = None,
features: Optional[Features] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[GenerateMode] = None,
script_version: Optional[Union[str, Version]] = None,
use_auth_token: Optional[Union[bool, str]] = None,
**config_kwargs,
) -> DatasetBuilder:
"""Load a builder for the dataset. A dataset builder can be used to inspect general information that is required to build a dataset (cache directory, config, dataset info, etc.)
without downloading the dataset itself.

This method will download and import the dataset loading script from ``path`` if it's not already cached inside the library.

Args:

path (:obj:`str`): Path to the dataset processing script with the dataset builder. Can be either:

- a local path to processing script or the directory containing the script (if the script has the same name as the directory),
e.g. ``'./dataset/squad'`` or ``'./dataset/squad/squad.py'``.
- a dataset identifier in the HuggingFace Datasets Hub (list all available datasets and ids with ``datasets.list_datasets()``)
e.g. ``'squad'``, ``'glue'`` or ``'openai/webtext'``.
name (:obj:`str`, optional): Defining the name of the dataset configuration.
data_dir (:obj:`str`, optional): Defining the data_dir of the dataset configuration.
data_files (:obj:`str`, optional): Defining the data_files of the dataset configuration.
cache_dir (:obj:`str`, optional): Directory to read/write data. Defaults to "~/datasets".
features (:class:`Features`, optional): Set the features type to use for this dataset.
download_config (:class:`~utils.DownloadConfig`, optional): Specific download configuration parameters.
download_mode (:class:`GenerateMode`, optional): Select the download/generate mode - Default to REUSE_DATASET_IF_EXISTS
script_version (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset script to load:

- For canonical datasets in the `huggingface/datasets` library like "squad", the default version of the module is the local version of the lib.
You can specify a different version from your local version of the lib (e.g. "master" or "1.2.0") but it might cause compatibility issues.
- For community provided datasets like "lhoestq/squad" that have their own git repository on the Datasets Hub, the default version "main" corresponds to the "main" branch.
You can specify a different version than the default "main" by using a commit sha or a git tag of the dataset repository.
use_auth_token (``str`` or ``bool``, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
If True, will get token from `"~/.huggingface"`.

Returns:
:class:`DatasetBuilder`

"""
# Download/copy dataset processing script
module_path, hash, resolved_file_path = prepare_module(
path,
script_version=script_version,
download_config=download_config,
download_mode=download_mode,
dataset=True,
return_resolved_file_path=True,
use_auth_token=use_auth_token,
)

# Get dataset builder class from the processing script
builder_cls = import_main_class(module_path, dataset=True)

# Set the base path for downloads as the parent of the script location
if resolved_file_path is not None:
base_path = url_or_path_parent(resolved_file_path)
else:
base_path = None

# Instantiate the dataset builder
builder_instance: DatasetBuilder = builder_cls(
cache_dir=cache_dir,
name=name,
data_dir=data_dir,
data_files=data_files,
hash=hash,
base_path=base_path,
features=features,
**config_kwargs,
)

return builder_instance


def load_dataset(
path: str,
name: Optional[str] = None,
@@ -677,8 +758,8 @@ def load_dataset(
- a dataset identifier in the HuggingFace Datasets Hub (list all available datasets and ids with ``datasets.list_datasets()``)
e.g. ``'squad'``, ``'glue'`` or ``'openai/webtext'``.
name (:obj:`str`, optional): Defining the name of the dataset configuration.
data_files (:obj:`str`, optional): Defining the data_files of the dataset configuration.
data_dir (:obj:`str`, optional): Defining the data_dir of the dataset configuration.
data_files (:obj:`str`, optional): Defining the data_files of the dataset configuration.
split (:class:`Split` or :obj:`str`): Which split of the data to load.
If None, will return a `dict` with all splits (typically `datasets.Split.TRAIN` and `datasets.Split.TEST`).
If given, will return a single Dataset.
@@ -742,17 +823,18 @@ def load_dataset(
else:
base_path = None

# Get dataset builder class from the processing script
builder_cls = import_main_class(module_path, dataset=True)

# Instantiate the dataset builder
builder_instance: DatasetBuilder = builder_cls(
cache_dir=cache_dir,
name=name,
data_dir=data_dir,
data_files=data_files,
hash=hash,
features=features,
# Create a dataset builder
builder_instance = load_dataset_builder(
path,
name,
data_dir,
data_files,
cache_dir,
features,
download_config,
download_mode,
script_version,
use_auth_token,
**config_kwargs,
)

@@ -776,7 +858,6 @@ def load_dataset(
download_mode=download_mode,
ignore_verifications=ignore_verifications,
try_from_hf_gcs=try_from_hf_gcs,
base_path=base_path,
use_auth_token=use_auth_token,
)

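For illustration, a minimal sketch of the resulting API: ``load_dataset`` now creates its builder through ``load_dataset_builder``, so both entry points share the same script resolution and caching logic (the dataset name is just an example):

    from datasets import load_dataset, load_dataset_builder

    # Inspect metadata without downloading the data itself
    builder = load_dataset_builder("squad")
    print(builder.cache_dir)      # where the prepared dataset would be stored
    print(builder.info.features)  # feature schema declared by the loading script

    # Download, prepare and load the dataset (an equivalent builder is created internally)
    dataset = load_dataset("squad")
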
19 changes: 13 additions & 6 deletions src/datasets/naming.py
@@ -16,25 +16,32 @@
# Lint as: python3
"""Utilities for file names."""

import itertools
import os
import re


_first_cap_re = re.compile("(.)([A-Z][a-z0-9]+)")
_all_cap_re = re.compile("([a-z0-9])([A-Z])")
_uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])")
_lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])")

_single_underscore_re = re.compile(r"(?<!_)_(?!_)")
_multiple_underscores_re = re.compile(r"(_{2,})")

_split_re = r"^\w+(\.\w+)*$"


def camelcase_to_snakecase(name):
"""Convert camel-case string to snake-case."""
s1 = _first_cap_re.sub(r"\1_\2", name)
return _all_cap_re.sub(r"\1_\2", s1).lower()
name = _uppercase_uppercase_re.sub(r"\1_\2", name)
name = _lowercase_uppercase_re.sub(r"\1_\2", name)
return name.lower()


def snake_to_camelcase(name):
def snakecase_to_camelcase(name):
"""Convert snake-case string to camel-case string."""
return "".join(n.capitalize() for n in name.split("_"))
name = _single_underscore_re.split(name)
name = [_multiple_underscores_re.split(n) for n in name]
return "".join(n.capitalize() for n in itertools.chain.from_iterable(name) if n != "")


def filename_prefix_for_name(name):
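
A short sketch of what the rewritten naming helpers are expected to return; the values below are traced from the regular expressions above rather than taken from the test suite:

    from datasets.naming import camelcase_to_snakecase, snakecase_to_camelcase

    print(camelcase_to_snakecase("SquadV2"))        # squad_v2
    print(camelcase_to_snakecase("HTMLParser"))     # html_parser (runs of capitals stay together)
    print(snakecase_to_camelcase("squad_v2"))       # SquadV2
    print(snakecase_to_camelcase("dummy__module"))  # Dummy__Module (double underscores are preserved)
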
6 changes: 3 additions & 3 deletions src/datasets/packaged_modules/__init__.py
@@ -12,9 +12,9 @@
def hash_python_lines(lines: List[str]) -> str:
filtered_lines = []
for line in lines:
line.replace("\n", "") # remove line breaks, white space and comments
line.replace(" ", "")
line.replace("\t", "")
line = line.replace("\n", "") # remove line breaks, white space and comments
line = line.replace(" ", "")
line = line.replace("\t", "")
line = re.sub(r"#.*", "", line)
if line:
filtered_lines.append(line)
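
To make the fix concrete: ``str.replace`` returns a new string, so the old code discarded the stripped result and hashed the raw lines. A standalone sketch of the normalization follows (the function name is made up, and the final hashing line is indicative only, since the end of ``hash_python_lines`` is truncated in the diff above):

    import re
    from hashlib import sha256
    from typing import List

    def normalized_python_hash(lines: List[str]) -> str:
        filtered_lines = []
        for line in lines:
            line = line.replace("\n", "").replace(" ", "").replace("\t", "")  # drop line breaks and whitespace
            line = re.sub(r"#.*", "", line)  # drop comments
            if line:
                filtered_lines.append(line)
        return sha256("".join(filtered_lines).encode("utf-8")).hexdigest()

    # Whitespace-only and comment-only differences no longer change the hash
    assert normalized_python_hash(["x = 1\n"]) == normalized_python_hash(["x=1  # comment\n"])
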
17 changes: 13 additions & 4 deletions tests/test_load.py
@@ -14,7 +14,9 @@
import datasets
from datasets import SCRIPTS_VERSION, load_dataset, load_from_disk
from datasets.arrow_dataset import Dataset
from datasets.builder import DatasetBuilder
from datasets.dataset_dict import DatasetDict, IterableDatasetDict
from datasets.features import Features, Value
from datasets.iterable_dataset import IterableDataset
from datasets.load import prepare_module

@@ -112,7 +114,7 @@ def test_prepare_module(self):
)
dummy_module = importlib.import_module(importable_module_path)
self.assertEqual(dummy_module.MY_DUMMY_VARIABLE, "hello there")
self.assertEqual(module_hash, sha256(dummy_code.encode("utf-8")).hexdigest())
self.assertEqual(module_hash, sha256(dummy_code.replace(" ", "").encode("utf-8")).hexdigest())
# prepare module from file path + check resolved_file_path
dummy_code = "MY_DUMMY_VARIABLE = 'general kenobi'"
module_dir = self._dummy_module_dir(tmp_dir, "__dummy_module_name1__", dummy_code)
@@ -123,7 +125,7 @@ def test_prepare_module(self):
self.assertEqual(resolved_file_path, module_path)
dummy_module = importlib.import_module(importable_module_path)
self.assertEqual(dummy_module.MY_DUMMY_VARIABLE, "general kenobi")
self.assertEqual(module_hash, sha256(dummy_code.encode("utf-8")).hexdigest())
self.assertEqual(module_hash, sha256(dummy_code.replace(" ", "").encode("utf-8")).hexdigest())
# missing module
for offline_simulation_mode in list(OfflineSimulationMode):
with offline(offline_simulation_mode):
@@ -197,6 +199,13 @@ def test_load_dataset_users(self):
)


def test_load_dataset_builder(dataset_loading_script_dir, data_dir):
builder = datasets.load_dataset_builder(dataset_loading_script_dir, data_dir=data_dir)
assert isinstance(builder, DatasetBuilder)
assert builder.name == DATASET_LOADING_SCRIPT_NAME
assert builder.info.features == Features({"text": Value("string")})


@pytest.mark.parametrize("keep_in_memory", [False, True])
def test_load_dataset_local(dataset_loading_script_dir, data_dir, keep_in_memory, caplog):
with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
@@ -229,8 +238,8 @@ def test_load_dataset_streaming(dataset_loading_script_dir, data_dir):
def test_loading_from_the_datasets_hub():
with tempfile.TemporaryDirectory() as tmp_dir:
dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER, cache_dir=tmp_dir)
assert len(dataset["train"]), 2
assert len(dataset["validation"]), 3
assert len(dataset["train"]) == 2
assert len(dataset["validation"]) == 3
del dataset

