2 changes: 1 addition & 1 deletion docs/make.bat
@@ -7,7 +7,7 @@ REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set SOURCEDIR=source
set BUILDDIR=_build

if "%1" == "" goto help
19 changes: 18 additions & 1 deletion docs/source/loading_datasets.rst
@@ -431,7 +431,7 @@ For example, run the following to skip integrity verifications when loading the
Loading datasets offline
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Each dataset builder (e.g. "squad") is a python script that is downloaded and cached from either from the huggingface/datasets GitHub repository or from the `HuggingFace Hub <https://huggingface.co/datasets>`__.
Each dataset builder (e.g. "squad") is a python script that is downloaded and cached from either from the 🤗Datasets GitHub repository or from the `HuggingFace Hub <https://huggingface.co/datasets>`__.
Contributor review comment: I'd suggest a whitespace here: 🤗Datasets => 🤗 Datasets

Only the ``text``, ``csv``, ``json`` and ``pandas`` builders are included in ``datasets`` without requiring external downloads.

Therefore if you don't have an internet connection you can't load a dataset that is not packaged with ``datasets``, unless the dataset is already cached.
@@ -441,6 +441,23 @@ You can even set the environment variable `HF_DATASETS_OFFLINE` to ``1`` to tell
This mode disables all the network calls of the library.
This way, instead of waiting for a dataset builder download to time out, the library looks directly at the cache.
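
For example, assuming the "squad" dataset was already downloaded and cached during an earlier online session, a fully offline session could look like this (an illustrative sketch; the environment variable must be set before ``datasets`` is imported):

.. code-block::

    >>> # started with: HF_DATASETS_OFFLINE=1 python
    >>> from datasets import load_dataset
    >>> dataset = load_dataset('squad')  # served from the local cache, no network calls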

.. _load_dataset_load_builder:

Loading a dataset builder
-----------------------------------------------------------

You can use :func:`datasets.load_dataset_builder` to inspect metadata (cache directory, configs, dataset info, etc.) that is required to build a dataset without downloading the dataset itself.

For example, run the following to get the path to the cache directory of the IMDB dataset:

.. code-block::

>>> from datasets import load_dataset_builder
>>> dataset_builder = load_dataset_builder('imdb')
>>> print(dataset_builder.cache_dir)
/Users/thomwolf/.cache/huggingface/datasets/imdb/plain_text/1.0.0/fdc76b18d5506f14b0646729b8d371880ef1bc48a26d00835a7f3da44004b676
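
Beyond the cache directory, the builder also exposes the dataset metadata through its ``info`` attribute. A short sketch (the exact output depends on the dataset script and is only indicative):

.. code-block::

    >>> print(dataset_builder.info.features)     # column names and feature types
    >>> print(dataset_builder.info.splits)       # declared splits and their sizes, if available
    >>> print(dataset_builder.info.description)  # free-text description of the dataset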


.. _load_dataset_enhancing_performance:

Enhancing performance
2 changes: 2 additions & 0 deletions docs/source/package_reference/loading_methods.rst
@@ -12,6 +12,8 @@ Datasets

.. autofunction:: datasets.load_from_disk

.. autofunction:: datasets.load_dataset_builder

Metrics
~~~~~~~~~~~~~~~~~~~~~

2 changes: 1 addition & 1 deletion src/datasets/__init__.py
@@ -60,7 +60,7 @@
)
from .iterable_dataset import IterableDataset
from .keyhash import KeyHasher
from .load import import_main_class, load_dataset, load_from_disk, load_metric, prepare_module
from .load import import_main_class, load_dataset, load_dataset_builder, load_from_disk, load_metric, prepare_module
from .metric import Metric
from .splits import (
NamedSplit,
11 changes: 8 additions & 3 deletions src/datasets/builder.py
@@ -204,6 +204,7 @@ def __init__(
cache_dir: Optional[str] = None,
name: Optional[str] = None,
hash: Optional[str] = None,
base_path: Optional[str] = None,
features: Optional[Features] = None,
**config_kwargs,
):
@@ -217,8 +218,9 @@
`builder_config`s will have their own subdirectories and versions.
If not provided, uses the first configuration in self.BUILDER_CONFIGS
hash: a hash specific to the dataset code. Used to update the caching directory when the dataset loading
script code is udpated (to avoid reusing old data).
script code is updated (to avoid reusing old data).
The typical caching directory (defined in ``self._relative_data_dir``) is: ``name/version/hash/``
base_path: `str`, base path for relative paths that are used to download files. This can be a remote url.
features: `Features`, optional features that will be used to read/write the dataset
It can be used to changed the :obj:`datasets.Features` description of a dataset for example.
config_kwargs: will override the defaults kwargs in config
@@ -227,6 +229,7 @@
# DatasetBuilder name
self.name: str = camelcase_to_snakecase(self.__class__.__name__)
self.hash: Optional[str] = hash
self.base_path = base_path

# Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
config_kwargs = {key: value for key, value in config_kwargs.items() if value is not None}
@@ -475,13 +478,15 @@ def download_and_prepare(
save_infos (bool): Save the dataset information (checksums/size/splits/...)
try_from_hf_gcs (bool): If True, it will try to download the already prepared dataset from the Hf google cloud storage
dl_manager (Optional ``datasets.DownloadManager``): specific Download Manger to use
base_path: ( Optional ``str``): base path for relative paths that are used to download files. This can be a remote url.
base_path ( Optional ``str``): base path for relative paths that are used to download files. This can be a remote url.
If not specified, the value of the ``base_path`` attribute (``self.base_path``) will be used instead.
use_auth_token (Optional ``Union[str, bool]``): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
If True, will get token from ~/.huggingface.

"""
download_mode = GenerateMode(download_mode or GenerateMode.REUSE_DATASET_IF_EXISTS)
verify_infos = not ignore_verifications
base_path = base_path if base_path is not None else self.base_path

if dl_manager is None:
if download_config is None:
@@ -930,7 +935,7 @@ def _split_generators(self, dl_manager: DownloadManager):

Example:

return[
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={'file': 'train_data.zip'},
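
To illustrate the new ``base_path`` plumbing, a hedged sketch (the script path and URL below are hypothetical): when the argument is omitted, ``download_and_prepare`` now falls back to the ``base_path`` captured at construction time, which ``load_dataset_builder`` sets to the parent of the resolved loading script.

    from datasets import load_dataset_builder

    # Hypothetical loading script that references relative files such as "data/train.csv"
    builder = load_dataset_builder("./my_dataset")

    # Relative paths are resolved against an explicit base_path (local directory or remote url)...
    builder.download_and_prepare(base_path="https://example.com/my_dataset")

    # ...or, when base_path is omitted, against builder.base_path (the parent of the resolved script)
    builder.download_and_prepare()
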
109 changes: 95 additions & 14 deletions src/datasets/load.py
@@ -602,7 +602,7 @@ def load_metric(
Returns:
`datasets.Metric`
"""
module_path, hash = prepare_module(
module_path, _ = prepare_module(
path,
script_version=script_version,
download_config=download_config,
@@ -626,6 +626,87 @@ def load_metric(
return metric


def load_dataset_builder(
path: str,
name: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Union[Dict, List] = None,
cache_dir: Optional[str] = None,
features: Optional[Features] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[GenerateMode] = None,
script_version: Optional[Union[str, Version]] = None,
use_auth_token: Optional[Union[bool, str]] = None,
**config_kwargs,
) -> DatasetBuilder:
"""Load a builder for the dataset. A dataset builder can be used to inspect general information that is required to build a dataset (cache directory, config, dataset info, etc.)
without downloading the dataset itself.

This method will download and import the dataset loading script from ``path`` if it's not already cached inside the library.

Args:

path (:obj:`str`): Path to the dataset processing script with the dataset builder. Can be either:

- a local path to processing script or the directory containing the script (if the script has the same name as the directory),
e.g. ``'./dataset/squad'`` or ``'./dataset/squad/squad.py'``.
- a dataset identifier in the HuggingFace Datasets Hub (list all available datasets and ids with ``datasets.list_datasets()``)
e.g. ``'squad'``, ``'glue'`` or ``'openai/webtext'``.
name (:obj:`str`, optional): Defining the name of the dataset configuration.
data_dir (:obj:`str`, optional): Defining the data_dir of the dataset configuration.
data_files (:obj:`str`, optional): Defining the data_files of the dataset configuration.
cache_dir (:obj:`str`, optional): Directory to read/write data. Defaults to "~/datasets".
features (:class:`Features`, optional): Set the features type to use for this dataset.
download_config (:class:`~utils.DownloadConfig`, optional): Specific download configuration parameters.
download_mode (:class:`GenerateMode`, optional): Select the download/generate mode - Default to REUSE_DATASET_IF_EXISTS
script_version (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset script to load:

- For canonical datasets in the `huggingface/datasets` library like "squad", the default version of the module is the local version of the lib.
You can specify a different version from your local version of the lib (e.g. "master" or "1.2.0") but it might cause compatibility issues.
- For community provided datasets like "lhoestq/squad" that have their own git repository on the Datasets Hub, the default version "main" corresponds to the "main" branch.
You can specify a different version than the default "main" by using a commit sha or a git tag of the dataset repository.
use_auth_token (``str`` or ``bool``, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
If True, will get token from `"~/.huggingface"`.

Returns:
:class:`DatasetBuilder`

"""
# Download/copy dataset processing script
module_path, hash, resolved_file_path = prepare_module(
path,
script_version=script_version,
download_config=download_config,
download_mode=download_mode,
dataset=True,
return_resolved_file_path=True,
use_auth_token=use_auth_token,
)

# Get dataset builder class from the processing script
builder_cls = import_main_class(module_path, dataset=True)

# Set the base path for downloads as the parent of the script location
if resolved_file_path is not None:
base_path = url_or_path_parent(resolved_file_path)
else:
base_path = None

# Instantiate the dataset builder
builder_instance: DatasetBuilder = builder_cls(
cache_dir=cache_dir,
name=name,
data_dir=data_dir,
data_files=data_files,
hash=hash,
base_path=base_path,
features=features,
**config_kwargs,
)

return builder_instance


def load_dataset(
path: str,
name: Optional[str] = None,
@@ -677,8 +758,8 @@ def load_dataset(
- a dataset identifier in the HuggingFace Datasets Hub (list all available datasets and ids with ``datasets.list_datasets()``)
e.g. ``'squad'``, ``'glue'`` or ``'openai/webtext'``.
name (:obj:`str`, optional): Defining the name of the dataset configuration.
data_files (:obj:`str`, optional): Defining the data_files of the dataset configuration.
data_dir (:obj:`str`, optional): Defining the data_dir of the dataset configuration.
data_files (:obj:`str`, optional): Defining the data_files of the dataset configuration.
split (:class:`Split` or :obj:`str`): Which split of the data to load.
If None, will return a `dict` with all splits (typically `datasets.Split.TRAIN` and `datasets.Split.TEST`).
If given, will return a single Dataset.
@@ -742,17 +823,18 @@ def load_dataset(
else:
base_path = None

# Get dataset builder class from the processing script
builder_cls = import_main_class(module_path, dataset=True)

# Instantiate the dataset builder
builder_instance: DatasetBuilder = builder_cls(
cache_dir=cache_dir,
name=name,
data_dir=data_dir,
data_files=data_files,
hash=hash,
features=features,
# Create a dataset builder
builder_instance = load_dataset_builder(
path,
name,
data_dir,
data_files,
cache_dir,
features,
download_config,
download_mode,
script_version,
use_auth_token,
**config_kwargs,
)

@@ -776,7 +858,6 @@ def load_dataset(
download_mode=download_mode,
ignore_verifications=ignore_verifications,
try_from_hf_gcs=try_from_hf_gcs,
base_path=base_path,
use_auth_token=use_auth_token,
)

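For illustration, a minimal sketch of the resulting API: ``load_dataset`` now creates its builder through ``load_dataset_builder``, so both entry points share the same script resolution and caching logic (the dataset name is just an example):

    from datasets import load_dataset, load_dataset_builder

    # Inspect metadata without downloading the data itself
    builder = load_dataset_builder("squad")
    print(builder.cache_dir)      # where the prepared dataset would be stored
    print(builder.info.features)  # feature schema declared by the loading script

    # Download, prepare and load the dataset (an equivalent builder is created internally)
    dataset = load_dataset("squad")
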
19 changes: 13 additions & 6 deletions src/datasets/naming.py
@@ -16,25 +16,32 @@
# Lint as: python3
"""Utilities for file names."""

import itertools
import os
import re


_first_cap_re = re.compile("(.)([A-Z][a-z0-9]+)")
_all_cap_re = re.compile("([a-z0-9])([A-Z])")
_uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])")
_lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])")

_single_underscore_re = re.compile(r"(?<!_)_(?!_)")
_multiple_underscores_re = re.compile(r"(_{2,})")

_split_re = r"^\w+(\.\w+)*$"


def camelcase_to_snakecase(name):
"""Convert camel-case string to snake-case."""
s1 = _first_cap_re.sub(r"\1_\2", name)
return _all_cap_re.sub(r"\1_\2", s1).lower()
name = _uppercase_uppercase_re.sub(r"\1_\2", name)
name = _lowercase_uppercase_re.sub(r"\1_\2", name)
return name.lower()


def snake_to_camelcase(name):
def snakecase_to_camelcase(name):
"""Convert snake-case string to camel-case string."""
return "".join(n.capitalize() for n in name.split("_"))
name = _single_underscore_re.split(name)
name = [_multiple_underscores_re.split(n) for n in name]
return "".join(n.capitalize() for n in itertools.chain.from_iterable(name) if n != "")


def filename_prefix_for_name(name):
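
A short sketch of what the rewritten naming helpers are expected to return; the values below are traced from the regular expressions above rather than taken from the test suite:

    from datasets.naming import camelcase_to_snakecase, snakecase_to_camelcase

    print(camelcase_to_snakecase("SquadV2"))        # squad_v2
    print(camelcase_to_snakecase("HTMLParser"))     # html_parser (runs of capitals stay together)
    print(snakecase_to_camelcase("squad_v2"))       # SquadV2
    print(snakecase_to_camelcase("dummy__module"))  # Dummy__Module (double underscores are preserved)
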
6 changes: 3 additions & 3 deletions src/datasets/packaged_modules/__init__.py
@@ -12,9 +12,9 @@
def hash_python_lines(lines: List[str]) -> str:
filtered_lines = []
for line in lines:
line.replace("\n", "") # remove line breaks, white space and comments
line.replace(" ", "")
line.replace("\t", "")
line = line.replace("\n", "") # remove line breaks, white space and comments
line = line.replace(" ", "")
line = line.replace("\t", "")
line = re.sub(r"#.*", "", line)
if line:
filtered_lines.append(line)
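
To make the fix concrete: ``str.replace`` returns a new string, so the old code discarded the stripped result and hashed the raw lines. A standalone sketch of the normalization follows (the function name is made up, and the final hashing line is indicative only, since the end of ``hash_python_lines`` is truncated in the diff above):

    import re
    from hashlib import sha256
    from typing import List

    def normalized_python_hash(lines: List[str]) -> str:
        filtered_lines = []
        for line in lines:
            line = line.replace("\n", "").replace(" ", "").replace("\t", "")  # drop line breaks and whitespace
            line = re.sub(r"#.*", "", line)  # drop comments
            if line:
                filtered_lines.append(line)
        return sha256("".join(filtered_lines).encode("utf-8")).hexdigest()

    # Whitespace-only and comment-only differences no longer change the hash
    assert normalized_python_hash(["x = 1\n"]) == normalized_python_hash(["x=1  # comment\n"])
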
17 changes: 13 additions & 4 deletions tests/test_load.py
@@ -14,7 +14,9 @@
import datasets
from datasets import SCRIPTS_VERSION, load_dataset, load_from_disk
from datasets.arrow_dataset import Dataset
from datasets.builder import DatasetBuilder
from datasets.dataset_dict import DatasetDict, IterableDatasetDict
from datasets.features import Features, Value
from datasets.iterable_dataset import IterableDataset
from datasets.load import prepare_module

@@ -112,7 +114,7 @@ def test_prepare_module(self):
)
dummy_module = importlib.import_module(importable_module_path)
self.assertEqual(dummy_module.MY_DUMMY_VARIABLE, "hello there")
self.assertEqual(module_hash, sha256(dummy_code.encode("utf-8")).hexdigest())
self.assertEqual(module_hash, sha256(dummy_code.replace(" ", "").encode("utf-8")).hexdigest())
# prepare module from file path + check resolved_file_path
dummy_code = "MY_DUMMY_VARIABLE = 'general kenobi'"
module_dir = self._dummy_module_dir(tmp_dir, "__dummy_module_name1__", dummy_code)
@@ -123,7 +125,7 @@ def test_prepare_module(self):
self.assertEqual(resolved_file_path, module_path)
dummy_module = importlib.import_module(importable_module_path)
self.assertEqual(dummy_module.MY_DUMMY_VARIABLE, "general kenobi")
self.assertEqual(module_hash, sha256(dummy_code.encode("utf-8")).hexdigest())
self.assertEqual(module_hash, sha256(dummy_code.replace(" ", "").encode("utf-8")).hexdigest())
# missing module
for offline_simulation_mode in list(OfflineSimulationMode):
with offline(offline_simulation_mode):
@@ -197,6 +199,13 @@ def test_load_dataset_users(self):
)


def test_load_dataset_builder(dataset_loading_script_dir, data_dir):
builder = datasets.load_dataset_builder(dataset_loading_script_dir, data_dir=data_dir)
assert isinstance(builder, DatasetBuilder)
assert builder.name == DATASET_LOADING_SCRIPT_NAME
assert builder.info.features == Features({"text": Value("string")})


@pytest.mark.parametrize("keep_in_memory", [False, True])
def test_load_dataset_local(dataset_loading_script_dir, data_dir, keep_in_memory, caplog):
with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
@@ -229,8 +238,8 @@ def test_load_dataset_streaming(dataset_loading_script_dir, data_dir):
def test_loading_from_the_datasets_hub():
with tempfile.TemporaryDirectory() as tmp_dir:
dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER, cache_dir=tmp_dir)
assert len(dataset["train"]), 2
assert len(dataset["validation"]), 3
assert len(dataset["train"]) == 2
assert len(dataset["validation"]) == 3
del dataset

