Fix test_no_weak_ref_output_decorator

sarckk · sarckk · commit c5da93eb94ee · 2025-11-19T12:53:46.000-08:00
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py
@@ -286,18 +286,40 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch
         run_model(vllm_config, mod_A, cudagraph_runtime_mode)
 
 
-def test_no_weak_ref_output_decorator():
+@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
+def test_no_weak_ref_output_decorator(use_inductor_graph_partition, monkeypatch):
+    # disable compile cache so that we can count the number of compilations
+    # appropriately
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
+
     # piecewise
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
-            splitting_ops=["silly.attention"],
+            splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=use_inductor_graph_partition,
         )
     )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
 
+    expected_num_graphs_seen = 1
+    expected_num_cudagraph_captured = (
+        4  # num_cudagraph_sizes * num cudagraphs to capture
+    )
+    if use_inductor_graph_partition:
+        expected_num_piecewise_graphs_seen = 1
+        expected_num_piecewise_capturable_graphs_seen = 1
+        expected_num_backend_compilations = 1
+    else:
+        expected_num_piecewise_graphs_seen = 3
+        expected_num_piecewise_capturable_graphs_seen = 2
+        expected_num_backend_compilations = 2
+
     @support_torch_compile(no_weak_ref_output=False)
     class A(nn.Module):
         def __init__(
@@ -330,12 +352,11 @@ class C(B): ...
 
     # A has support_torch_compile
     with compilation_counter.expect(
-        num_graphs_seen=1,
-        num_piecewise_graphs_seen=3,
-        num_piecewise_capturable_graphs_seen=2,
-        num_backend_compilations=2,
-        num_cudagraph_captured=4,
-        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_graphs_seen=expected_num_graphs_seen,
+        num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
+        num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
+        num_backend_compilations=expected_num_backend_compilations,
+        num_cudagraph_captured=expected_num_cudagraph_captured,
     ):
         run_model(vllm_config, mod_A, cudagraph_runtime_mode)
 
@@ -346,11 +367,11 @@ class C(B): ...
 
     # B also has support_torch_compile
     with compilation_counter.expect(
-        num_graphs_seen=1,
-        num_piecewise_graphs_seen=3,
-        num_piecewise_capturable_graphs_seen=2,
-        num_backend_compilations=2,
-        num_cudagraph_captured=4,
+        num_graphs_seen=expected_num_graphs_seen,
+        num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
+        num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
+        num_backend_compilations=expected_num_backend_compilations,
+        num_cudagraph_captured=expected_num_cudagraph_captured,
     ):
         run_model(vllm_config, mod_B, cudagraph_runtime_mode)
 
@@ -361,10 +382,10 @@ class C(B): ...
 
     # C has support_torch_compile
     with compilation_counter.expect(
-        num_graphs_seen=1,
-        num_piecewise_graphs_seen=3,
-        num_piecewise_capturable_graphs_seen=2,
-        num_backend_compilations=2,
-        num_cudagraph_captured=4,
+        num_graphs_seen=expected_num_graphs_seen,
+        num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
+        num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
+        num_backend_compilations=expected_num_backend_compilations,
+        num_cudagraph_captured=expected_num_cudagraph_captured,
     ):
         run_model(vllm_config, mod_C, cudagraph_runtime_mode)
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
@@ -495,7 +495,9 @@ def patched_inline_call(self_):
                 InliningInstructionTranslator, "inline_call_", patched_inline_call
             ),
             torch._dynamo.config.patch(**dynamo_config_patches),
-            maybe_use_cudagraph_partition_wrapper(self.vllm_config),
+            maybe_use_cudagraph_partition_wrapper(
+                self.vllm_config, self.no_weak_ref_output
+            ),
             _torch27_patch_tensor_subclasses(),
         ):
             if envs.VLLM_USE_AOT_COMPILE:
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
@@ -94,8 +94,7 @@ def __init__(self, no_weak_ref_output: bool = False):
             raise RuntimeError("Compilation mode cannot be NO_COMPILATION")
 
         backend = vllm_config.compilation_config.init_backend(
-            vllm_config,
-            no_weak_ref_output
+            vllm_config, no_weak_ref_output
         )
         options = {}
 

Original file line number	Diff line number	Diff line change
`@@ -94,8 +94,7 @@ def __init__(self, no_weak_ref_output: bool = False):`
`94`	`94`	`raise RuntimeError("Compilation mode cannot be NO_COMPILATION")`
`95`	`95`
`96`	`96`	`backend = vllm_config.compilation_config.init_backend(`
`97`		`- vllm_config,`
`98`		`- no_weak_ref_output`
	`97`	`+ vllm_config, no_weak_ref_output`
`99`	`98`	`)`
`100`	`99`	`options = {}`
`101`	`100`