
Commit 0011cb2

Add opaque_reservation utility (#20885)
Using the rapidsmpf memory reservation system for opaque calls to `do_evaluate`.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Tom Augspurger (https://github.com/TomAugspurger)

URL: #20885
1 parent 1dc82a9 commit 0011cb2

8 files changed

Lines changed: 279 additions & 123 deletions
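
The change applies one pattern throughout: before an opaque call such as `do_evaluate` runs, an estimated number of bytes is reserved from the rapidsmpf buffer resource (spilling other buffers if needed to make room), and the reservation is released once the result has been handed off. The real `opaque_reservation` helper is added in `cudf_polars/experimental/rapidsmpf/utils.py` (one of the changed files not shown in this excerpt) and takes the streaming context plus a byte estimate. The sketch below is illustrative only: it shows the reserve/evaluate/release shape of the pattern with a toy stand-in for the buffer resource, not the actual rapidsmpf API.

# Illustrative sketch only -- a toy stand-in, not the real rapidsmpf API or the
# actual opaque_reservation added in utils.py.
from collections.abc import Iterator
from contextlib import contextmanager


class ToyBufferResource:
    """Hypothetical stand-in that only tracks how many bytes are reserved."""

    def __init__(self) -> None:
        self.reserved_bytes = 0

    def reserve(self, nbytes: int) -> int:
        # The real buffer resource would spill other buffers here to make room.
        self.reserved_bytes += nbytes
        return nbytes

    def release(self, nbytes: int) -> None:
        self.reserved_bytes -= nbytes


@contextmanager
def opaque_reservation(br: ToyBufferResource, estimated_bytes: int) -> Iterator[None]:
    """Reserve headroom for an opaque call, then release it afterwards."""
    nbytes = br.reserve(estimated_bytes)
    try:
        yield
    finally:
        br.release(nbytes)


# Usage mirrors the pattern added in io.py and join.py: reserve an estimate of
# the chunk size, run the opaque evaluation, then let the reservation go.
br = ToyBufferResource()
with opaque_reservation(br, estimated_bytes=256 * 1024**2):
    pass  # e.g. df = scan.do_evaluate(...) would run here
assert br.reserved_bytes == 0

In the diffs below, the estimate is the executor's `target_partition_size` for the IO nodes and the measured device size of the inputs (`large_chunk_size + small_size`) for the broadcast join.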


python/cudf_polars/cudf_polars/experimental/rapidsmpf/collectives/allgather.py

Lines changed: 3 additions & 5 deletions
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """AllGather logic for the RapidsMPF streaming runtime."""
 
@@ -59,6 +59,7 @@ def insert(self, sequence_number: int, chunk: TableChunk) -> None:
                 self.context.br(),
             ),
         )
+        del chunk
 
     def insert_finished(self) -> None:
         """Insert finished into the AllGatherManager."""
@@ -81,12 +82,9 @@ async def extract_concatenated(
         -------
         The concatenated AllGather result.
         """
-        partition_chunks = await self.allgather.extract_all(
-            self.context, ordered=ordered
-        )
         return await asyncio.to_thread(
             unpack_and_concat,
-            partitions=partition_chunks,
+            partitions=await self.allgather.extract_all(self.context, ordered=ordered),
             stream=stream,
             br=self.context.br(),
         )

python/cudf_polars/cudf_polars/experimental/rapidsmpf/collectives/shuffle.py

Lines changed: 18 additions & 15 deletions
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """Shuffle logic for the RapidsMPF streaming runtime."""
 
@@ -176,13 +176,13 @@ async def shuffle_node(
 
         # Process input chunks
         while (msg := await ch_in.data.recv(context)) is not None:
-            # Extract TableChunk from message
-            chunk = TableChunk.from_message(msg).make_available_and_spill(
-                context.br(), allow_overbooking=True
+            # Extract TableChunk from message and insert into shuffler
+            shuffle.insert_chunk(
+                TableChunk.from_message(msg).make_available_and_spill(
+                    context.br(), allow_overbooking=True
+                )
             )
-
-            # Get the table view and insert into shuffler
-            shuffle.insert_chunk(chunk)
+            del msg
 
         # Insert finished
         await shuffle.insert_finished()
@@ -195,16 +195,19 @@ async def shuffle_node(
             num_partitions,
             context.comm().nranks,
         ):
-            # Create a new TableChunk with the result
-            output_chunk = TableChunk.from_pylibcudf_table(
-                table=await shuffle.extract_chunk(partition_id, stream),
-                stream=stream,
-                exclusive_view=True,
+            # Extract and send the output chunk
+            await ch_out.data.send(
+                context,
+                Message(
+                    partition_id,
+                    TableChunk.from_pylibcudf_table(
+                        table=await shuffle.extract_chunk(partition_id, stream),
+                        stream=stream,
+                        exclusive_view=True,
+                    ),
+                ),
            )
 
-            # Send the output chunk
-            await ch_out.data.send(context, Message(partition_id, output_chunk))
-
         await ch_out.data.drain(context)
 
 
python/cudf_polars/cudf_polars/experimental/rapidsmpf/io.py

Lines changed: 41 additions & 19 deletions
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """IO logic for the RapidsMPF streaming runtime."""
 
@@ -40,6 +40,7 @@
 from cudf_polars.experimental.rapidsmpf.utils import (
     ChannelManager,
     Metadata,
+    opaque_reservation,
 )
 
 if TYPE_CHECKING:
@@ -107,7 +108,7 @@ async def drain(self) -> None:
 
         # Forward any remaining buffered messages
         for seq in sorted(buffer.keys()):
-            await self.ch_out.send(self.context, buffer[seq])
+            await self.ch_out.send(self.context, buffer.pop(seq))
 
         await self.ch_out.drain(self.context)
 
@@ -142,6 +143,7 @@ async def dataframescan_node(
     *,
     num_producers: int,
     rows_per_partition: int,
+    estimated_chunk_bytes: int,
 ) -> None:
     """
     DataFrameScan node for rapidsmpf.
@@ -160,6 +162,9 @@ async def dataframescan_node(
         The number of producers to use for the DataFrameScan node.
     rows_per_partition
         The number of rows per partition.
+    estimated_chunk_bytes
+        Estimated size of each chunk in bytes. Used for memory reservation
+        with block spilling to avoid thrashing.
     """
     async with shutdown_on_error(context, ch_out.metadata, ch_out.data):
         # Find local partition count.
@@ -206,6 +211,7 @@ async def dataframescan_node(
                     seq_num,
                     ch_out.data,
                     ir_context,
+                    estimated_chunk_bytes,
                 )
             await ch_out.data.drain(context)
             return
@@ -230,6 +236,7 @@ async def _producer(producer_id: int, ch_out: Channel) -> None:
                     task_idx,
                     ch_out,
                     ir_context,
+                    estimated_chunk_bytes,
                 )
             await ch_out.drain(context)
 
@@ -250,6 +257,8 @@ def _(
     )
     rows_per_partition = config_options.executor.max_rows_per_partition
     num_producers = rec.state["max_io_threads"]
+    # Use target_partition_size as the estimated chunk size
+    estimated_chunk_bytes = config_options.executor.target_partition_size
 
     context = rec.state["context"]
     ir_context = rec.state["ir_context"]
@@ -263,6 +272,7 @@ def _(
                 channels[ir].reserve_input_slot(),
                 num_producers=num_producers,
                 rows_per_partition=rows_per_partition,
+                estimated_chunk_bytes=estimated_chunk_bytes,
             )
         ]
     }
@@ -307,6 +317,7 @@ async def read_chunk(
     seq_num: int,
     ch_out: Channel[TableChunk],
     ir_context: IRExecutionContext,
+    estimated_chunk_bytes: int,
 ) -> None:
     """
     Read a chunk from disk and send it to the output channel.
@@ -323,24 +334,27 @@ async def read_chunk(
         The output channel.
     ir_context
         The execution context for the IR node.
+    estimated_chunk_bytes
+        Estimated size of the chunk in bytes. Used for memory reservation
+        with block spilling to avoid thrashing.
     """
-    # Evaluate and send the Scan-node result
-    df = await asyncio.to_thread(
-        scan.do_evaluate,
-        *scan._non_child_args,
-        context=ir_context,
-    )
-    await ch_out.send(
-        context,
-        Message(
-            seq_num,
-            TableChunk.from_pylibcudf_table(
-                df.table,
-                df.stream,
-                exclusive_view=True,
+    with opaque_reservation(context, estimated_chunk_bytes):
+        df = await asyncio.to_thread(
+            scan.do_evaluate,
+            *scan._non_child_args,
+            context=ir_context,
+        )
+        await ch_out.send(
+            context,
+            Message(
+                seq_num,
+                TableChunk.from_pylibcudf_table(
+                    df.table,
+                    df.stream,
+                    exclusive_view=True,
+                ),
            ),
-        ),
-    )
+        )
 
 
 @define_py_node()
@@ -353,6 +367,7 @@ async def scan_node(
     num_producers: int,
     plan: IOPartitionPlan,
     parquet_options: ParquetOptions,
+    estimated_chunk_bytes: int,
 ) -> None:
     """
     Scan node for rapidsmpf.
@@ -373,6 +388,9 @@ async def scan_node(
         The partitioning plan.
     parquet_options
         The Parquet options.
+    estimated_chunk_bytes
+        Estimated size of each chunk in bytes. Used for memory reservation
+        with block spilling to avoid thrashing.
     """
     async with shutdown_on_error(context, ch_out.metadata, ch_out.data):
         # Build a list of local Scan operations
@@ -460,6 +478,7 @@ async def scan_node(
                     seq_num,
                     ch_out.data,
                     ir_context,
+                    estimated_chunk_bytes,
                 )
             await ch_out.data.drain(context)
             return
@@ -484,6 +503,7 @@ async def _producer(producer_id: int, ch_out: Channel) -> None:
                     task_idx,
                     ch_out,
                     ir_context,
+                    estimated_chunk_bytes,
                 )
             await ch_out.drain(context)
 
@@ -607,7 +627,8 @@ def _(
     ir: Scan, rec: SubNetGenerator
 ) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
     config_options = rec.state["config_options"]
-    assert config_options.executor.name == "streaming", (
+    executor = rec.state["config_options"].executor
+    assert executor.name == "streaming", (
         "'in-memory' executor not supported in 'generate_ir_sub_network'"
    )
     parquet_options = config_options.parquet_options
@@ -669,6 +690,7 @@ def _(
                 num_producers=num_producers,
                 plan=plan,
                 parquet_options=parquet_options,
+                estimated_chunk_bytes=executor.target_partition_size,
             )
         ]
     return nodes, channels

python/cudf_polars/cudf_polars/experimental/rapidsmpf/join.py

Lines changed: 24 additions & 19 deletions
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """Join logic for the RapidsMPF streaming runtime."""
 
@@ -27,6 +27,7 @@
     Metadata,
     chunk_to_frame,
     empty_table_chunk,
+    opaque_reservation,
     process_children,
 )
 from cudf_polars.experimental.utils import _concat
@@ -128,6 +129,7 @@ async def broadcast_join_node(
                     context.br(), allow_overbooking=True
                 )
             )
+            del msg
            small_size += small_chunks[-1].data_alloc_size(MemoryType.DEVICE)
 
        # Allgather is a collective - all ranks must participate even with no local data
@@ -193,6 +195,7 @@ async def broadcast_join_node(
                context.br(), allow_overbooking=True
            )
            seq_num = msg.sequence_number
+            del msg
 
            large_df = DataFrame.from_table(
                large_chunk.table_view(),
@@ -207,10 +210,11 @@ async def broadcast_join_node(
                empty_small_chunk = empty_table_chunk(small_child, context, stream)
                small_dfs = [chunk_to_frame(empty_small_chunk, small_child)]
 
-            # Perform the join
-            df = _concat(
-                *[
-                    (
+            large_chunk_size = large_chunk.data_alloc_size(MemoryType.DEVICE)
+            input_bytes = large_chunk_size + small_size
+            with opaque_reservation(context, input_bytes):
+                df = _concat(
+                    *[
                        await asyncio.to_thread(
                            ir.do_evaluate,
                            *ir._non_child_args,
@@ -221,23 +225,24 @@ async def broadcast_join_node(
                            ),
                            context=ir_context,
                        )
-                    )
-                    for small_df in small_dfs
-                ],
-                context=ir_context,
-            )
+                        for small_df in small_dfs
+                    ],
+                    context=ir_context,
+                )
 
-            # Send output chunk
-            await ch_out.data.send(
-                context,
-                Message(
-                    seq_num,
-                    TableChunk.from_pylibcudf_table(
-                        df.table, df.stream, exclusive_view=True
+                # Send output chunk
+                await ch_out.data.send(
+                    context,
+                    Message(
+                        seq_num,
+                        TableChunk.from_pylibcudf_table(
+                            df.table, df.stream, exclusive_view=True
+                        ),
                    ),
-                ),
-            )
+                )
+                del df, large_df, large_chunk
 
+        del small_dfs, small_chunks
        await ch_out.data.drain(context)
 
 