-
Notifications
You must be signed in to change notification settings - Fork 5.1k
Speed up when having padding tokens two-batch overlap #6668
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 42 commits
Commits
Show all changes
63 commits
Select commit
Hold shift + click to select a range
4e8f93d
more
fzyzcjy 86e1736
more
fzyzcjy 0dc0b6f
more
fzyzcjy 180909e
more
fzyzcjy 0e5e6ef
more
fzyzcjy 7dc2c38
more
fzyzcjy 171e76d
more
fzyzcjy d0ae6ea
fmt
fzyzcjy ec9c1b1
more
fzyzcjy b1c34f5
more
fzyzcjy 2fad15c
more
fzyzcjy 0021673
more
fzyzcjy 91e64b2
more
fzyzcjy 086fced
more
fzyzcjy 20f5fe8
more
fzyzcjy 81f8658
more
fzyzcjy 3f21146
more
fzyzcjy c336dbe
more
fzyzcjy a884462
more
fzyzcjy 3433cf4
more
fzyzcjy 7b9fdfd
more
fzyzcjy b96e538
more
fzyzcjy d3ccef8
more
fzyzcjy 3bbfce8
more
fzyzcjy f09e892
more
fzyzcjy a922bf9
more
fzyzcjy 31767ba
more
fzyzcjy 8225f4b
more
fzyzcjy 0bee404
more
fzyzcjy 492062b
more
fzyzcjy ced1a1c
more
fzyzcjy 98a9603
more
fzyzcjy 7adde6e
more
fzyzcjy 6271e7e
more
fzyzcjy 302fc35
fmt
fzyzcjy a1b2bd6
more
fzyzcjy 193ab7d
more
fzyzcjy 12d63fb
more
fzyzcjy 9a94b9b
more
fzyzcjy a6a78a0
more
fzyzcjy 1a7a201
more
fzyzcjy 8845aa7
more
fzyzcjy 80b2cca
fix
fzyzcjy fc7ca7a
more
fzyzcjy 6268da7
more
fzyzcjy acbed64
more
fzyzcjy 7e85afe
more
fzyzcjy 9964c08
more
fzyzcjy c138e14
Revert "more"
fzyzcjy 933aee6
more
fzyzcjy cfbd755
more
fzyzcjy 3a0f1c6
more
fzyzcjy 8fa970b
more
fzyzcjy 0f90f35
fmt
fzyzcjy 9a17949
more
fzyzcjy 919e863
ci
fzyzcjy f6664d3
Merge branch 'main' into feat/tbo_padding
fzyzcjy f4df233
Merge branch 'main' into feat/tbo_padding
fzyzcjy 64ceb3a
Merge branch 'main' into feat/tbo_padding
fzyzcjy 789a621
more
fzyzcjy 2a9c8ad
Merge branch 'feat/tbo_padding' of https://github.com/fzyzcjy/sglang …
fzyzcjy 607a6e7
Merge branch 'main' into feat/tbo_padding
ch-wan 846e532
Merge branch 'main' into feat/tbo_padding
ch-wan File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -88,22 +88,36 @@ def compute_split_token_index( | |
| # -------------------------------- Preparation --------------------------------------- | ||
|
|
||
|
|
||
| class TboCudaGraphRunnerUtils: | ||
| @staticmethod | ||
| def compute_tbo_split_seq_index(that: "CudaGraphRunner", num_tokens: int): | ||
| if that.model_runner.server_args.enable_two_batch_overlap: | ||
| tbo_split_seq_index = compute_split_seq_index( | ||
| forward_mode=that.capture_forward_mode, | ||
| num_tokens=num_tokens, | ||
| extend_lens=None, | ||
| ) | ||
| # For simplicity, when two_batch_overlap is enabled, we only capture CUDA Graph for tbo=true | ||
| assert ( | ||
| tbo_split_seq_index is not None | ||
| ), f"{that.capture_forward_mode=} {num_tokens=}" | ||
| else: | ||
| tbo_split_seq_index = None | ||
| return tbo_split_seq_index | ||
| class TboCudaGraphRunnerPlugin: | ||
| def __init__(self): | ||
| self._tbo_children_num_token_non_padded = torch.zeros((2,), dtype=torch.int32) | ||
|
|
||
| def capture_one_batch_size(self, batch: ForwardBatch, num_tokens: int): | ||
| if not global_server_args_dict["enable_two_batch_overlap"]: | ||
| return | ||
|
|
||
| batch.tbo_split_seq_index = compute_split_seq_index( | ||
| forward_mode=batch.forward_mode, | ||
| num_tokens=num_tokens, | ||
| extend_lens=None, | ||
| ) | ||
| # For simplicity, when two_batch_overlap is enabled, we only capture CUDA Graph for tbo=true | ||
| assert batch.tbo_split_seq_index is not None, f"{num_tokens=}" | ||
|
|
||
| self._fill_tensor_content(batch) | ||
|
|
||
| TboForwardBatchPreparer.prepare_raw( | ||
| batch, | ||
| tbo_children_num_token_non_padded=self._tbo_children_num_token_non_padded, | ||
| ) | ||
|
|
||
| def replay_prepare(self, batch: ForwardBatch): | ||
| self._fill_tensor_content(batch) | ||
|
|
||
| def _fill_tensor_content(self, batch: ForwardBatch): | ||
| self._tbo_children_num_token_non_padded[...] = ( | ||
| TboForwardBatchPreparer.compute_tbo_children_num_token_non_padded(batch) | ||
| ) | ||
|
|
||
|
|
||
| class TboDPAttentionPreparer: | ||
|
|
@@ -178,17 +192,24 @@ def _is_all_same(x): | |
| class TboForwardBatchPreparer: | ||
| @classmethod | ||
| def prepare(cls, batch: ForwardBatch): | ||
| from sglang.srt.layers.attention.tbo_backend import TboAttnBackend | ||
|
|
||
| if batch.tbo_split_seq_index is None: | ||
| return | ||
|
|
||
| tbo_split_token_index = compute_split_token_index( | ||
| split_seq_index=batch.tbo_split_seq_index, | ||
| forward_mode=batch.forward_mode, | ||
| extend_seq_lens=batch.extend_seq_lens_cpu, | ||
| cls.prepare_raw( | ||
| batch, | ||
| tbo_children_num_token_non_padded=cls.compute_tbo_children_num_token_non_padded( | ||
| batch | ||
| ), | ||
| ) | ||
|
|
||
| @classmethod | ||
| def prepare_raw( | ||
| cls, batch: ForwardBatch, tbo_children_num_token_non_padded: torch.Tensor | ||
| ): | ||
| from sglang.srt.layers.attention.tbo_backend import TboAttnBackend | ||
|
|
||
| tbo_split_token_index = cls._compute_split_token_index(batch) | ||
|
|
||
| if _tbo_debug: | ||
| logger.info( | ||
| f"TboForwardBatchPreparer.prepare " | ||
|
|
@@ -200,13 +221,18 @@ def prepare(cls, batch: ForwardBatch): | |
| assert isinstance(batch.attn_backend, TboAttnBackend) | ||
| attn_backend_child_a, attn_backend_child_b = batch.attn_backend.children | ||
|
|
||
| [out_num_token_non_padded_a, out_num_token_non_padded_b] = ( | ||
| tbo_children_num_token_non_padded | ||
| ) | ||
|
|
||
| child_a = cls.filter_batch( | ||
| batch, | ||
| start_token_index=0, | ||
| end_token_index=tbo_split_token_index, | ||
| start_seq_index=0, | ||
| end_seq_index=batch.tbo_split_seq_index, | ||
| output_attn_backend=attn_backend_child_a, | ||
| out_num_token_non_padded=out_num_token_non_padded_a, | ||
| ) | ||
| child_b = cls.filter_batch( | ||
| batch, | ||
|
|
@@ -215,6 +241,7 @@ def prepare(cls, batch: ForwardBatch): | |
| start_seq_index=batch.tbo_split_seq_index, | ||
| end_seq_index=batch.batch_size, | ||
| output_attn_backend=attn_backend_child_b, | ||
| out_num_token_non_padded=out_num_token_non_padded_b, | ||
| ) | ||
|
|
||
| assert batch.tbo_children is None | ||
|
|
@@ -230,9 +257,8 @@ def filter_batch( | |
| start_seq_index: int, | ||
| end_seq_index: int, | ||
| output_attn_backend: AttentionBackend, | ||
| out_num_token_non_padded: torch.Tensor, | ||
| ): | ||
| from sglang.srt.managers.schedule_batch import global_server_args_dict | ||
|
|
||
| num_tokens = batch.input_ids.shape[0] | ||
| num_seqs = batch.batch_size | ||
|
|
||
|
|
@@ -313,6 +339,7 @@ def filter_batch( | |
| ), | ||
| extend_num_tokens=extend_num_tokens, | ||
| attn_backend=output_attn_backend, | ||
| num_token_non_padded=out_num_token_non_padded, | ||
| tbo_split_seq_index=None, | ||
| tbo_parent_token_range=(start_token_index, end_token_index), | ||
| tbo_children=None, | ||
|
|
@@ -328,7 +355,6 @@ def filter_batch( | |
| top_p_normalized_logprobs=False, | ||
| top_p=None, | ||
| mm_inputs=None, | ||
| num_token_non_padded=None, | ||
| ) | ||
| ) | ||
|
|
||
|
|
@@ -343,6 +369,28 @@ def filter_batch( | |
|
|
||
| return ForwardBatch(**output_dict) | ||
|
|
||
| @classmethod | ||
| def compute_tbo_children_num_token_non_padded(cls, batch: ForwardBatch): | ||
| tbo_split_token_index = cls._compute_split_token_index(batch) | ||
| num_token_non_padded = len(batch.input_ids) | ||
|
|
||
| # TODO we may make padding on both sub-batches to make it slightly more balanced | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| value_a = min(tbo_split_token_index, num_token_non_padded) | ||
| value_b = max(0, num_token_non_padded - tbo_split_token_index) | ||
| return torch.tensor( | ||
| [value_a, value_b], | ||
| device=global_server_args_dict["device"], | ||
| dtype=torch.int32, | ||
| ) | ||
|
|
||
| @classmethod | ||
| def _compute_split_token_index(cls, batch: ForwardBatch): | ||
| return compute_split_token_index( | ||
| split_seq_index=batch.tbo_split_seq_index, | ||
| forward_mode=batch.forward_mode, | ||
| extend_seq_lens=batch.extend_seq_lens_cpu, | ||
| ) | ||
|
|
||
|
|
||
| def _compute_extend_num_tokens(input_ids, forward_mode: ForwardMode): | ||
| if forward_mode.is_extend(): | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The assertion message here,
`f"{num_tokens=}"`, is a bit less informative than the original one in `TboCudaGraphRunnerUtils` (which was `f"{that.capture_forward_mode=} {num_tokens=}"`). To aid in debugging if this assertion fails, could we consider adding
`batch.forward_mode` to this message? This would provide more context, similar to the previous version.