From 1b6aec1046fbe7dd4e9f2ecade14dd9ba56801e0 Mon Sep 17 00:00:00 2001
From: Arjun Jagdale <142811259+ArjunJagdale@users.noreply.github.com>
Date: Thu, 13 Nov 2025 01:08:39 +0530
Subject: [PATCH 1/3] Add flatten_indices option to save_to_disk method

Added flatten_indices parameter to control index flattening during dataset
saving.
---
 src/datasets/arrow_dataset.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 36b744a024a..6875016544c 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -1514,6 +1514,7 @@ def save_to_disk(
         num_shards: Optional[int] = None,
         num_proc: Optional[int] = None,
         storage_options: Optional[dict] = None,
+        flatten_indices: bool = True,
     ):
         """
         Saves a dataset to a dataset directory, or in a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.
@@ -1611,10 +1612,16 @@ def save_to_disk(
             total=len(self),
             desc=f"Saving the dataset ({shards_done}/{num_shards} shards)",
         )
+        # Optionally flatten indices before sharding. Flattening rewrites the dataset as a contiguous
+        # Arrow table which can be expensive for datasets that already use indices mappings (e.g. after
+        # filter/shuffle). By default `flatten_indices=True` to preserve existing behavior. Setting it to
+        # False avoids rebuilding the dataset and can significantly speed up save_to_disk for those cases.
+        dataset_for_sharding = self.flatten_indices() if (self._indices is not None and flatten_indices) else self
+
         kwargs_per_job = (
             {
                 "job_id": shard_idx,
-                "shard": self.shard(num_shards=num_shards, index=shard_idx, contiguous=True),
+                "shard": dataset_for_sharding.shard(num_shards=num_shards, index=shard_idx, contiguous=True),
                 "fpath": posixpath.join(dataset_path, f"data-{shard_idx:05d}-of-{num_shards:05d}.arrow"),
                 "storage_options": storage_options,
             }

From b01cf77f1830f51ec9fa4a4e0b2b322c381e4eea Mon Sep 17 00:00:00 2001
From: Arjun Jagdale <142811259+ArjunJagdale@users.noreply.github.com>
Date: Thu, 13 Nov 2025 01:11:20 +0530
Subject: [PATCH 2/3] Add flatten_indices parameter to save function

---
 src/datasets/dataset_dict.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index 995103d26e0..5b93abfda42 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -1298,6 +1298,7 @@ def save_to_disk(
         num_shards: Optional[dict[str, int]] = None,
         num_proc: Optional[int] = None,
         storage_options: Optional[dict] = None,
+        flatten_indices: bool = True,
     ):
         """
         Saves a dataset dict to a filesystem using `fsspec.spec.AbstractFileSystem`.
@@ -1363,6 +1364,7 @@ def save_to_disk(
                 max_shard_size=max_shard_size,
                 num_proc=num_proc,
                 storage_options=storage_options,
+                flatten_indices=flatten_indices,
             )

     @staticmethod

From 1e6a284e4db0e19d552fd70a3eb2944e07726330 Mon Sep 17 00:00:00 2001
From: Arjun Jagdale <142811259+ArjunJagdale@users.noreply.github.com>
Date: Thu, 13 Nov 2025 01:20:18 +0530
Subject: [PATCH 3/3] Update arrow_dataset.py

---
 src/datasets/arrow_dataset.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 6875016544c..bc57174c90e 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -1612,10 +1612,6 @@ def save_to_disk(
             total=len(self),
             desc=f"Saving the dataset ({shards_done}/{num_shards} shards)",
         )
-        # Optionally flatten indices before sharding. Flattening rewrites the dataset as a contiguous
-        # Arrow table which can be expensive for datasets that already use indices mappings (e.g. after
-        # filter/shuffle). By default `flatten_indices=True` to preserve existing behavior. Setting it to
-        # False avoids rebuilding the dataset and can significantly speed up save_to_disk for those cases.
         dataset_for_sharding = self.flatten_indices() if (self._indices is not None and flatten_indices) else self

         kwargs_per_job = (