Merged
Changes from 4 commits
51 changes: 47 additions & 4 deletions python/cuml/cuml/dask/cluster/kmeans.py
@@ -3,6 +3,8 @@
#

import cupy as cp
import dask
import dask.array as da
from dask.distributed import get_worker
from raft_dask.common.comms import Comms, get_raft_comm_state

@@ -159,10 +161,51 @@ def fit(self, X, sample_weight=None):

        comms.destroy()

        models = [res.result() for res in kmeans_fit]
        first = models[0]
        first.labels_ = cp.concatenate([model.labels_ for model in models])
        first.inertia_ = sum(model.inertia_ for model in models)
        # Collect the full model from only the first worker (for
        # cluster_centers_ etc). Since cluster centers are synchronized
        # via NCCL, all workers have identical copies — pulling more
        # than one would waste memory (N * n_clusters * n_features * 4B).
        #
        # Labels stay distributed as a dask.array to avoid transferring
        # per-sample data to the client. Only the scalar inertia values
        # are gathered.
        first = kmeans_fit[0].result()
        workers = list(data.worker_to_parts.keys())

        remote_labels = [

Contributor:
Thank you for putting up a PR fix so quickly!

May I know how large this remote_labels variable is if the dataset has 1 billion rows? Will that blow up the scheduler's memory?

Member:

labels_ is a 1D array of length n_samples (in the dask case, split across N workers). The dtype is typically int32, which brings you to roughly 4 GB in total for the array.

In most deployments of dask the data doesn't go through the scheduler; it goes directly from worker to client (and in cases where the scheduler runs on the same node as the client, the distinction is meaningless anyway). So you care more about memory capacity on the client side than on the scheduler itself.

Contributor:

Thanks for clarifying! In my deployment, the scheduler does run on the same node as the client, so they share the same GPU memory. That said, since it's a 1D int32 array, 4 GB seems manageable compared to the previous issue of collecting all workers' copies of the centroid matrix.
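
As a rough check of the sizes discussed above, the arithmetic spelled out (an illustrative aside, not part of the diff; the worker, cluster, and feature counts below are made-up placeholders):

# Illustrative memory arithmetic for the discussion above (not from the PR).
n_samples = 1_000_000_000                  # the 1-billion-row case asked about
labels_total = n_samples * 4               # int32 labels_: 4 bytes per sample
print(f"{labels_total / 2**30:.2f} GiB")   # ~3.73 GiB, split across the workers

# The duplication the new code avoids: pulling the full model from every worker
# would also copy N * n_clusters * n_features * 4 bytes of centroids.
n_workers, n_clusters, n_features = 16, 10_000, 512   # made-up placeholder sizes
centers_total = n_workers * n_clusters * n_features * 4
print(f"{centers_total / 2**30:.2f} GiB")  # ~0.31 GiB for these sizes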

            self.client.submit(getattr, f, "labels_", workers=[w])
            for f, w in zip(kmeans_fit[1:], workers[1:])
        ]
        remote_inertias = [
            self.client.submit(getattr, f, "inertia_", workers=[w])
            for f, w in zip(kmeans_fit[1:], workers[1:])
        ]

        self.inertia_ = first.inertia_ + sum(
            self.client.gather(remote_inertias)
        )

        labels_dtype = first.labels_.dtype
        label_chunks = [
            da.from_delayed(
                dask.delayed(first.labels_, pure=True, traverse=False),
                shape=(first.labels_.shape[0],),
                dtype=labels_dtype,
                meta=cp.zeros(0, dtype=labels_dtype),
            )
        ] + [
            da.from_delayed(
                f,
                shape=(float("nan"),),
                dtype=labels_dtype,
                meta=cp.zeros(0, dtype=labels_dtype),
            )
            for f in remote_labels
        ]
        self.labels_ = da.concatenate(
            label_chunks, allow_unknown_chunksizes=True
        )

        self._set_internal_model(first)

        return self
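
For context on how the change surfaces to users, a minimal usage sketch (the LocalCUDACluster setup and array sizes are illustrative; with this PR, labels_ comes back as a lazily evaluated dask array instead of a single concatenated cupy array):

import cupy as cp
import dask.array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

from cuml.dask.cluster import KMeans

cluster = LocalCUDACluster()           # any dask-CUDA cluster works
client = Client(cluster)

# A GPU-backed dask array: random chunks converted to cupy per block.
X = da.random.random((1_000_000, 32), chunks=(250_000, 32)).map_blocks(cp.asarray)

km = KMeans(n_clusters=8)
km.fit(X)

print(km.cluster_centers_.shape)       # (8, 32) -- pulled from a single worker
print(type(km.labels_))                # dask array; no per-sample data moved yet
labels = km.labels_.compute()          # materialize only if actually needed
print(float(km.inertia_))              # scalar, summed across workers

Only the compute() call moves per-sample labels off the workers; everything before it stays distributed.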