From 1b6aec1046fbe7dd4e9f2ecade14dd9ba56801e0 Mon Sep 17 00:00:00 2001
From: Arjun Jagdale <142811259+ArjunJagdale@users.noreply.github.com>
Date: Thu, 13 Nov 2025 01:08:39 +0530
Subject: [PATCH 1/3] Add flatten_indices option to save_to_disk method

Added flatten_indices parameter to control index flattening during dataset
saving.
---
 src/datasets/arrow_dataset.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 36b744a024a..6875016544c 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -1514,6 +1514,7 @@ def save_to_disk(
         num_shards: Optional[int] = None,
         num_proc: Optional[int] = None,
         storage_options: Optional[dict] = None,
+        flatten_indices: bool = True,
     ):
         """
         Saves a dataset to a dataset directory, or in a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.
@@ -1611,10 +1612,16 @@ def save_to_disk(
             total=len(self),
             desc=f"Saving the dataset ({shards_done}/{num_shards} shards)",
         )
+        # Optionally flatten indices before sharding. Flattening rewrites the dataset as a contiguous
+        # Arrow table which can be expensive for datasets that already use indices mappings (e.g. after
+        # filter/shuffle). By default `flatten_indices=True` to preserve existing behavior. Setting it to
+        # False avoids rebuilding the dataset and can significantly speed up save_to_disk for those cases.
+        dataset_for_sharding = self.flatten_indices() if (self._indices is not None and flatten_indices) else self
+
         kwargs_per_job = (
             {
                 "job_id": shard_idx,
-                "shard": self.shard(num_shards=num_shards, index=shard_idx, contiguous=True),
+                "shard": dataset_for_sharding.shard(num_shards=num_shards, index=shard_idx, contiguous=True),
                 "fpath": posixpath.join(dataset_path, f"data-{shard_idx:05d}-of-{num_shards:05d}.arrow"),
                 "storage_options": storage_options,
             }

From b01cf77f1830f51ec9fa4a4e0b2b322c381e4eea Mon Sep 17 00:00:00 2001
From: Arjun Jagdale <142811259+ArjunJagdale@users.noreply.github.com>
Date: Thu, 13 Nov 2025 01:11:20 +0530
Subject: [PATCH 2/3] Add flatten_indices parameter to save function

---
 src/datasets/dataset_dict.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index 995103d26e0..5b93abfda42 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -1298,6 +1298,7 @@ def save_to_disk(
         num_shards: Optional[dict[str, int]] = None,
         num_proc: Optional[int] = None,
         storage_options: Optional[dict] = None,
+        flatten_indices: bool = True,
     ):
         """
         Saves a dataset dict to a filesystem using `fsspec.spec.AbstractFileSystem`.
@@ -1363,6 +1364,7 @@ def save_to_disk(
                 max_shard_size=max_shard_size,
                 num_proc=num_proc,
                 storage_options=storage_options,
+                flatten_indices=flatten_indices,
             )

     @staticmethod

From 1e6a284e4db0e19d552fd70a3eb2944e07726330 Mon Sep 17 00:00:00 2001
From: Arjun Jagdale <142811259+ArjunJagdale@users.noreply.github.com>
Date: Thu, 13 Nov 2025 01:20:18 +0530
Subject: [PATCH 3/3] Update arrow_dataset.py

---
 src/datasets/arrow_dataset.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 6875016544c..bc57174c90e 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -1612,10 +1612,6 @@ def save_to_disk(
             total=len(self),
             desc=f"Saving the dataset ({shards_done}/{num_shards} shards)",
         )
-        # Optionally flatten indices before sharding. Flattening rewrites the dataset as a contiguous
-        # Arrow table which can be expensive for datasets that already use indices mappings (e.g. after
-        # filter/shuffle). By default `flatten_indices=True` to preserve existing behavior. Setting it to
-        # False avoids rebuilding the dataset and can significantly speed up save_to_disk for those cases.
         dataset_for_sharding = self.flatten_indices() if (self._indices is not None and flatten_indices) else self

         kwargs_per_job = (