From 5593ec3df95098980bd29f8398a44bc00176db7c Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 7 Jun 2021 21:10:37 +0200
Subject: [PATCH 1/3] Rename config and env variable IN_MEMORY_MAX_SIZE

---
 src/datasets/arrow_dataset.py    | 12 ++++++------
 src/datasets/config.py           |  6 +++---
 src/datasets/dataset_dict.py     |  9 ++++-----
 src/datasets/load.py             | 22 ++++++++++------------
 src/datasets/utils/info_utils.py |  8 ++++----
 tests/test_info_utils.py         |  6 +++---
 tests/test_load.py               |  8 ++++----
 7 files changed, 34 insertions(+), 37 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index a78e97b5467..dfad4b883bc 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -651,7 +651,8 @@ def save_to_disk(self, dataset_path: str, fs=None):
     def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] = None) -> "Dataset":
         """
         Loads a dataset that was previously saved using :meth:`save_to_disk` from a dataset directory, or from a
-        filesystem using either :class:`~filesystems.S3FileSystem` or any implementation of ``fsspec.spec.AbstractFileSystem``.
+        filesystem using either :class:`~filesystems.S3FileSystem` or any implementation of
+        ``fsspec.spec.AbstractFileSystem``.

         Args:
             dataset_path (:obj:`str`): Path (e.g. `dataset/train`) or remote URI (e.g. `s3//my-bucket/dataset/train`) of
@@ -659,11 +660,10 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =
             fs (:class:`~filesystems.S3FileSystem`, ``fsspec.spec.AbstractFileSystem``, optional, default ``None``):
                 Instance of the remote filesystem used to download the files from.
             keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
-                dataset will be copied in-memory if its size is smaller than
-                `datasets.config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be
-                disabled (i.e., the dataset will not be loaded in memory) by setting to ``0`` either the configuration
-                option ``datasets.config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the
-                environment variable ``HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
+                dataset will be copied in-memory if its size is smaller than `datasets.config.IN_MEMORY_MAX_SIZE`
+                (default ``250 * 2 ** 20`` B). This behavior can be disabled (i.e., the dataset will not be loaded in
+                memory) by setting to ``0`` either the configuration option ``datasets.config.IN_MEMORY_MAX_SIZE``
+                (higher precedence) or the environment variable ``HF_DATASETS_IN_MEMORY_MAX_SIZE`` (lower precedence).

         Returns:
             :class:`Dataset` or :class:`DatasetDict`.
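[Usage note, not part of the patch] A minimal sketch of how the renamed setting interacts with `keep_in_memory` as documented above. Only the public `datasets` API touched by this patch is assumed; the CSV file name is hypothetical:

    # Config option takes precedence over the HF_DATASETS_IN_MEMORY_MAX_SIZE env var;
    # setting it to 0 disables in-memory copying altogether.
    import datasets

    datasets.config.IN_MEMORY_MAX_SIZE = 0

    # With keep_in_memory=None (the default), the size check against
    # IN_MEMORY_MAX_SIZE decides; with 0 above, the dataset stays memory-mapped.
    ds = datasets.load_dataset("csv", data_files="my_data.csv", keep_in_memory=None)
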
diff --git a/src/datasets/config.py b/src/datasets/config.py
index de0fd37cf1c..72e368dd4cb 100644
--- a/src/datasets/config.py
+++ b/src/datasets/config.py
@@ -144,9 +144,9 @@
 HF_DATASETS_OFFLINE = False

 # In-memory
-DEFAULT_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = 250 * 2 ** 20  # 250 MiB
-HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = float(
-    os.environ.get("HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", DEFAULT_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES)
+DEFAULT_IN_MEMORY_MAX_SIZE = 250 * 2 ** 20  # 250 MiB
+IN_MEMORY_MAX_SIZE = float(
+    os.environ.get("HF_DATASETS_IN_MEMORY_MAX_SIZE", DEFAULT_IN_MEMORY_MAX_SIZE)
 )

 # File names
diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index 0eb36016909..146a4d283fb 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -706,11 +706,10 @@ def load_from_disk(dataset_dict_path: str, fs=None, keep_in_memory: Optional[boo
             fs (:class:`~filesystems.S3FileSystem` or ``fsspec.spec.AbstractFileSystem``, optional, default ``None``):
                 Instance of the remote filesystem used to download the files from.
             keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
-                dataset will be copied in-memory if its size is smaller than
-                `datasets.config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be
-                disabled (i.e., the dataset will not be loaded in memory) by setting to ``0`` either the configuration
-                option ``datasets.config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the environment
-                variable ``HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
+                dataset will be copied in-memory if its size is smaller than `datasets.config.IN_MEMORY_MAX_SIZE`
+                (default ``250 * 2 ** 20`` B). This behavior can be disabled (i.e., the dataset will not be loaded in
+                memory) by setting to ``0`` either the configuration option ``datasets.config.IN_MEMORY_MAX_SIZE``
+                (higher precedence) or the environment variable ``HF_DATASETS_IN_MEMORY_MAX_SIZE`` (lower precedence).

         Returns:
             :class:`DatasetDict`
diff --git a/src/datasets/load.py b/src/datasets/load.py
index d59e9b51525..e7281d38ad8 100644
--- a/src/datasets/load.py
+++ b/src/datasets/load.py
@@ -682,12 +682,11 @@ def load_dataset(
         download_config (:class:`~utils.DownloadConfig`, optional): Specific download configuration parameters.
         download_mode (:class:`GenerateMode`, optional): Select the download/generate mode - Default to REUSE_DATASET_IF_EXISTS
         ignore_verifications (:obj:`bool`, default ``False``): Ignore the verifications of the downloaded/processed dataset information (checksums/size/splits/...).
-        keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
-            dataset will be copied in-memory if its size is smaller than
-            `datasets.config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be disabled
-            (i.e., the dataset will not be loaded in memory) by setting to ``0`` either the configuration option
-            ``datasets.config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the environment variable
-            ``HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
+        keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the dataset
+            will be copied in-memory if its size is smaller than `datasets.config.IN_MEMORY_MAX_SIZE` (default
+            ``250 * 2 ** 20`` B). This behavior can be disabled (i.e., the dataset will not be loaded in memory) by
+            setting to ``0`` either the configuration option ``datasets.config.IN_MEMORY_MAX_SIZE`` (higher precedence)
+            or the environment variable ``HF_DATASETS_IN_MEMORY_MAX_SIZE`` (lower precedence).
         save_infos (:obj:`bool`, default ``False``): Save the dataset information (checksums/size/splits/...).
         script_version (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset script to load:

@@ -775,12 +774,11 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =
            loaded from.
        fs (:class:`~filesystems.S3FileSystem` or ``fsspec.spec.AbstractFileSystem``, optional, default ``None``):
            Instance of of the remote filesystem used to download the files from.
-        keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
-            dataset will be copied in-memory if its size is smaller than
-            `datasets.config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be disabled
-            (i.e., the dataset will not be loaded in memory) by setting to ``0`` either the configuration option
-            ``datasets.config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the environment variable
-            ``HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
+        keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the dataset
+            will be copied in-memory if its size is smaller than `datasets.config.IN_MEMORY_MAX_SIZE` (default
+            ``250 * 2 ** 20`` B). This behavior can be disabled (i.e., the dataset will not be loaded in memory) by
+            setting to ``0`` either the configuration option ``datasets.config.IN_MEMORY_MAX_SIZE`` (higher precedence)
+            or the environment variable ``HF_DATASETS_IN_MEMORY_MAX_SIZE`` (lower precedence).

    Returns:
        ``datasets.Dataset`` or ``datasets.DatasetDict``
diff --git a/src/datasets/utils/info_utils.py b/src/datasets/utils/info_utils.py
index 9ede702d393..d99f9c4413e 100644
--- a/src/datasets/utils/info_utils.py
+++ b/src/datasets/utils/info_utils.py
@@ -85,15 +85,15 @@ def get_size_checksum_dict(path: str) -> dict:


 def is_small_dataset(dataset_size):
-    """Check if `dataset_size` is smaller than `config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`.
+    """Check if `dataset_size` is smaller than `config.IN_MEMORY_MAX_SIZE`.

     Args:
         dataset_size (int): Dataset size in bytes.

     Returns:
-        bool: Whether `dataset_size` is smaller than `config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`.
+        bool: Whether `dataset_size` is smaller than `config.IN_MEMORY_MAX_SIZE`.
""" - if dataset_size and config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES: - return dataset_size < config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES + if dataset_size and config.IN_MEMORY_MAX_SIZE: + return dataset_size < config.IN_MEMORY_MAX_SIZE else: return False diff --git a/tests/test_info_utils.py b/tests/test_info_utils.py index f08d9fe3d14..da278def282 100644 --- a/tests/test_info_utils.py +++ b/tests/test_info_utils.py @@ -7,7 +7,7 @@ @pytest.fixture(params=[None, 0, 100 * 2 ** 20, 900 * 2 ** 20]) def env_max_in_memory_dataset_size(request, monkeypatch): if request.param: - monkeypatch.setenv("HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", request.param) + monkeypatch.setenv("IN_MEMORY_MAX_SIZE", request.param) @pytest.mark.parametrize("dataset_size", [None, 400 * 2 ** 20, 600 * 2 ** 20]) @@ -17,10 +17,10 @@ def test_is_small_dataset( ): if config_max_in_memory_dataset_size != "default": monkeypatch.setattr( - datasets.config, "HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", config_max_in_memory_dataset_size + datasets.config, "IN_MEMORY_MAX_SIZE", config_max_in_memory_dataset_size ) - max_in_memory_dataset_size = datasets.config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES + max_in_memory_dataset_size = datasets.config.IN_MEMORY_MAX_SIZE if config_max_in_memory_dataset_size == "default": if env_max_in_memory_dataset_size: assert max_in_memory_dataset_size == env_max_in_memory_dataset_size diff --git a/tests/test_load.py b/tests/test_load.py index ea686cfe1a1..efda38e4eb6 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -234,9 +234,9 @@ def test_load_dataset_local_with_default_in_memory( current_dataset_size = 148 if max_in_memory_dataset_size == "default": # default = 250 * 2 ** 20 - max_in_memory_dataset_size = datasets.config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES + max_in_memory_dataset_size = datasets.config.IN_MEMORY_MAX_SIZE else: - monkeypatch.setattr(datasets.config, "HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", max_in_memory_dataset_size) + monkeypatch.setattr(datasets.config, "IN_MEMORY_MAX_SIZE", max_in_memory_dataset_size) if max_in_memory_dataset_size: expected_in_memory = current_dataset_size < max_in_memory_dataset_size else: @@ -254,9 +254,9 @@ def test_load_from_disk_with_default_in_memory( current_dataset_size = 512 # arrow file size = 512, in-memory dataset size = 148 if max_in_memory_dataset_size == "default": # default = 250 * 2 ** 20 - max_in_memory_dataset_size = datasets.config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES + max_in_memory_dataset_size = datasets.config.IN_MEMORY_MAX_SIZE else: - monkeypatch.setattr(datasets.config, "HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", max_in_memory_dataset_size) + monkeypatch.setattr(datasets.config, "IN_MEMORY_MAX_SIZE", max_in_memory_dataset_size) if max_in_memory_dataset_size: expected_in_memory = current_dataset_size < max_in_memory_dataset_size else: From 8e80c34dd5ada1676c38b09367a93507b1a81100 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 7 Jun 2021 21:16:54 +0200 Subject: [PATCH 2/3] Rename also in documentation --- docs/source/loading_datasets.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/loading_datasets.rst b/docs/source/loading_datasets.rst index 869af379664..b1fe8200b30 100644 --- a/docs/source/loading_datasets.rst +++ b/docs/source/loading_datasets.rst @@ -67,10 +67,10 @@ This call to :func:`datasets.load_dataset` does the following steps under the ho memory-mapping and pay effectively zero cost with O(1) random 
    access. Alternatively, you can copy it in CPU memory (RAM) by setting the ``keep_in_memory`` argument of
    :func:`datasets.load_datasets` to ``True``. The default in 🤗Datasets is to memory-map the dataset on drive if its size is larger than
-   ``datasets.config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (default ``250`` MiB); otherwise, the dataset is copied
-   in-memory. This behavior can be disabled (i.e., the dataset will not be loaded in memory) by setting to ``0`` either
-   the configuration option ``datasets.config.HF_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the
-   environment variable ``MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
+   ``datasets.config.IN_MEMORY_MAX_SIZE`` (default ``250 * 2 ** 20`` B); otherwise, the dataset is copied in-memory.
+   This behavior can be disabled (i.e., the dataset will not be loaded in memory) by setting to ``0`` either the
+   configuration option ``datasets.config.IN_MEMORY_MAX_SIZE`` (higher precedence) or the environment variable
+   ``HF_DATASETS_IN_MEMORY_MAX_SIZE`` (lower precedence).

 3. Return a **dataset built from the splits** asked by the user (default: all); in the above example we create a
    dataset with the train split.

From fb71299aae96b9e32da81b9f7e64892d2e36465f Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 7 Jun 2021 21:22:59 +0200
Subject: [PATCH 3/3] Fix style

---
 src/datasets/config.py   | 4 +---
 tests/test_info_utils.py | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/datasets/config.py b/src/datasets/config.py
index 72e368dd4cb..6a5bec6b048 100644
--- a/src/datasets/config.py
+++ b/src/datasets/config.py
@@ -145,9 +145,7 @@

 # In-memory
 DEFAULT_IN_MEMORY_MAX_SIZE = 250 * 2 ** 20  # 250 MiB
-IN_MEMORY_MAX_SIZE = float(
-    os.environ.get("HF_DATASETS_IN_MEMORY_MAX_SIZE", DEFAULT_IN_MEMORY_MAX_SIZE)
-)
+IN_MEMORY_MAX_SIZE = float(os.environ.get("HF_DATASETS_IN_MEMORY_MAX_SIZE", DEFAULT_IN_MEMORY_MAX_SIZE))

 # File names
 DATASET_ARROW_FILENAME = "dataset.arrow"
diff --git a/tests/test_info_utils.py b/tests/test_info_utils.py
index da278def282..c4628d86458 100644
--- a/tests/test_info_utils.py
+++ b/tests/test_info_utils.py
@@ -16,9 +16,7 @@ def test_is_small_dataset(
     dataset_size, config_max_in_memory_dataset_size, env_max_in_memory_dataset_size, monkeypatch
 ):
     if config_max_in_memory_dataset_size != "default":
-        monkeypatch.setattr(
-            datasets.config, "IN_MEMORY_MAX_SIZE", config_max_in_memory_dataset_size
-        )
+        monkeypatch.setattr(datasets.config, "IN_MEMORY_MAX_SIZE", config_max_in_memory_dataset_size)

     max_in_memory_dataset_size = datasets.config.IN_MEMORY_MAX_SIZE
     if config_max_in_memory_dataset_size == "default":
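[Usage note, not part of the patch] Because config.py evaluates ``float(os.environ.get("HF_DATASETS_IN_MEMORY_MAX_SIZE", ...))`` at module import, the environment variable only takes effect when set before ``import datasets``. A minimal sketch of that ordering:

    import os

    # Must happen before `import datasets`; config.py reads the variable at import time.
    os.environ["HF_DATASETS_IN_MEMORY_MAX_SIZE"] = str(100 * 2 ** 20)  # 100 MiB

    import datasets

    assert datasets.config.IN_MEMORY_MAX_SIZE == 100 * 2 ** 20  # stored as a float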