
Commit 00d1328

Zero buffer inside shrink op

Signed-off-by: Andy Lo <[email protected]>

1 parent 5ce2b79

2 files changed: +4 -1 lines changed

vllm/lora/ops/triton_ops/lora_shrink_op.py
Lines changed: 2 additions & 0 deletions

@@ -152,6 +152,8 @@ def _lora_shrink(
     assert lora_ids.size(0) == num_tokens_per_lora.size(0)
     assert lora_token_start_loc.size(0) == lora_ids.size(0) + 1

+    output_tensor.zero_()
+
     (lora_ptr_tensor, lora_strides_d0, lora_strides_d1,
      lora_strides_d2) = _get_lora_a_ptr(lora_a_weights, inputs.device)
     N, K = lora_a_weights[0].shape[-2:]  # K=hidden_size,N=rank
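For context, here is a minimal sketch of the pattern this hunk establishes; it is not the vLLM Triton kernel itself, and all names and shapes are illustrative. A shrink-style op that accumulates partial results into output_tensor (e.g. across split-K blocks) only produces correct results if the buffer starts at zero, so the op performs that zeroing itself rather than relying on the caller.

import torch

def shrink_like_op(inputs: torch.Tensor, weight: torch.Tensor,
                   output_tensor: torch.Tensor, num_splits: int = 4) -> None:
    # The op owns its own initialization, mirroring the hunk above.
    output_tensor.zero_()
    # Accumulate partial products over chunks of K, mimicking a split-K
    # kernel that adds into the output instead of overwriting it.
    for k_chunk in torch.chunk(torch.arange(inputs.size(-1)), num_splits):
        output_tensor += inputs[:, k_chunk] @ weight[k_chunk, :]

x = torch.randn(8, 64)
w = torch.randn(64, 16)
out = torch.empty(8, 16)  # callers may now pass uninitialized memory
shrink_like_op(x, w, out)
assert torch.allclose(out, x @ w, atol=1e-4)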

vllm/lora/punica_wrapper/punica_gpu.py
Lines changed: 2 additions & 1 deletion

@@ -207,7 +207,8 @@ def add_lora_linear(self,
         r = lora_b_stacked[0].size(-1)
         # We set the buffer to be float32 by default, refer to:
         # https://github.com/triton-lang/triton/issues/1387
-        buffer = torch.zeros(  # type: ignore
+        # Note: buffer is zeroed inside the shrink op
+        buffer = torch.empty(  # type: ignore
             (len(output_slices), x.size(0), r),
             dtype=torch.float32,
             device=x.device,
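The caller-side change is the payoff: because the shrink op now guarantees a zeroed output, the float32 staging buffer (kept in float32 per the Triton issue linked above) can be allocated with torch.empty, which returns uninitialized memory, instead of torch.zeros, which also fills it. A hedged sketch of the allocation, with illustrative shape values:

import torch

num_slices, num_tokens, rank = 2, 8, 16  # illustrative values only
# torch.empty skips the zero-fill that torch.zeros would perform; this is
# safe here only because the shrink op zeroes the buffer before use.
buffer = torch.empty((num_slices, num_tokens, rank), dtype=torch.float32)

Skipping the fill avoids redundant work on every call, and the invariant that the buffer is zeroed moves into the one op that actually depends on it.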
