Commit 8db884d

hmellor authored and geodavic committed
Remove deprecated fields from CompilationConfig (vllm-project#27593)
Signed-off-by: Harry Mellor <[email protected]>
Signed-off-by: George D. Torres <[email protected]>
1 parent b18b0df commit 8db884d

File tree

13 files changed: +122 -164 lines changed


.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 0 deletions
@@ -443,6 +443,7 @@ steps:
   - vllm/
   - tests/compile
   commands:
+  - pytest -v -s compile/test_config.py
   - pytest -v -s compile/test_pass_manager.py
   - pytest -v -s compile/test_fusion.py
   - pytest -v -s compile/test_fusion_attn.py

docs/design/cuda_graphs.md

Lines changed: 0 additions & 10 deletions
@@ -218,16 +218,6 @@ outputs = model.generate(
 )
 ```
 
-### Migration from legacy flags
-
-Legacy `use_cudagraph` and `full_cuda_graph` are unified by `cudagraph_mode`:
-
-* `use_cudagraph=False` → `NONE`.
-* `use_cudagraph=True` and `full_cuda_graph=False` → `PIECEWISE`.
-* `full_cuda_graph=True` → directly set `FULL` and rely on the graceful fallback policy.
-
-As they are deprecated and will be removed in the next major or minor release, i.e., v0.11.0 or v1.0.0, we recommend using cudagraph_mode instead.
-
 ### Piecewise compilation and full graph custom passes (attention fusion, sequence parallelism)
 
 Unfortunately, some custom compile passes have to see the whole graph to be effective and hence aren't compatible with piecewise compilation. This includes `AttnFusionPass` and `SequenceParallelismPass`. As a short-term solution, we automatically disable piecewise compilation (by setting `splitting_ops=[]`) when attention fusion is enabled. We use CUDA Graph modes `FULL` or `FULL_DECODE_ONLY` (depending on backend support). However, this leads to another optimization incompatibility and confusing performance tradeoffs.
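
The migration notes removed above still give the mapping from the old boolean flags to the new field. As a minimal sketch of the post-removal configuration (using only the imports that already appear in this commit's updated tests), the equivalent `cudagraph_mode` settings look like this:

```python
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
from vllm.config.compilation import CompilationMode

# Previously: CompilationConfig(..., use_cudagraph=True)
# Now the CUDA graph behaviour is selected explicitly.
piecewise = VllmConfig(
    compilation_config=CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.PIECEWISE,  # was use_cudagraph=True
    )
)

# use_cudagraph=False  -> CUDAGraphMode.NONE
# full_cuda_graph=True -> CUDAGraphMode.FULL (with graceful fallback)
disabled = CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE)
full = CompilationConfig(cudagraph_mode=CUDAGraphMode.FULL)
```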

tests/compile/piecewise/test_multiple_graphs.py

Lines changed: 2 additions & 2 deletions
@@ -203,7 +203,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
-            use_cudagraph=True,
+            cudagraph_mode=CUDAGraphMode.PIECEWISE,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
             use_inductor_graph_partition=use_inductor_graph_partition,
@@ -281,7 +281,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
-            use_cudagraph=False,
+            cudagraph_mode=CUDAGraphMode.NONE,
             splitting_ops=["silly::attention"],
             use_inductor_graph_partition=use_inductor_graph_partition,
         )

tests/compile/piecewise/test_simple.py

Lines changed: 0 additions & 1 deletion
@@ -62,7 +62,6 @@ def _run_simple_model(
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
-            use_cudagraph=True,
             use_inductor=use_inductor,
             splitting_ops=splitting_ops,
             use_inductor_graph_partition=use_inductor_graph_partition,

tests/compile/piecewise/test_toy_llama.py

Lines changed: 0 additions & 1 deletion
@@ -449,7 +449,6 @@ def benchmark():
     if piecewise:
         compilation_config = CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
-            use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=cudagraph_sizes,
         )

tests/compile/test_config.py

Lines changed: 91 additions & 60 deletions
@@ -2,16 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 from contextlib import nullcontext
+from unittest.mock import patch
 
 import pytest
+from pydantic import ValidationError
 
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.fix_functionalization import FixFunctionalizationPass
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
 from vllm.config.compilation import CompilationMode
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
-from vllm.utils.torch_utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
+from vllm.utils.torch_utils import _is_torch_equal_or_newer
 
 
 def test_version():
@@ -23,14 +25,6 @@ def test_version():
     assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")
 
 
-def test_use_cudagraphs_dynamic():
-    vllm_config = VllmConfig()
-    # The default V1 configuration enables cudagraphs; the engine decides
-    # at runtime which batch sizes to capture, but the use_cudagraph flag
-    # itself defaults to True.
-    assert vllm_config.compilation_config.use_cudagraph
-
-
 def test_copy_pass():
     vllm_config = VllmConfig()
     inductor_pass = FixFunctionalizationPass(vllm_config)
@@ -65,7 +59,7 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
     monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)
 
     compilation_config = {
-        "use_cudagraph": False,  # speed things up a bit
+        "cudagraph_mode": CUDAGraphMode.NONE,  # speed things up a bit
     }
     with (
         compilation_counter.expect(
@@ -83,20 +77,31 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
 
 # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
 @pytest.mark.forked
-@pytest.mark.parametrize("enabled", [True, False])
-def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
+@pytest.mark.parametrize(
+    "cudagraph_mode,num_cudagraph_captured",
+    [
+        (CUDAGraphMode.NONE, 0),
+        (CUDAGraphMode.FULL_DECODE_ONLY, 1),
+        (CUDAGraphMode.PIECEWISE, 13),
+        (CUDAGraphMode.FULL_AND_PIECEWISE, 14),
+    ],
+)
+def test_use_cudagraphs(
+    vllm_runner, monkeypatch, cudagraph_mode, num_cudagraph_captured
+):
     # Disable multiprocessing so that the counter is in the same process
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
     compilation_config = {
         "cudagraph_capture_sizes": [100],
-        "use_cudagraph": enabled,
+        "cudagraph_mode": cudagraph_mode,
     }
+    num_gpu_runner_capture_triggers = 1 if cudagraph_mode != CUDAGraphMode.NONE else 0
     with (
         compilation_counter.expect(
             num_graphs_seen=1,
-            num_gpu_runner_capture_triggers=1 if enabled else 0,
-            num_cudagraph_captured=13 if enabled else 0,
+            num_gpu_runner_capture_triggers=num_gpu_runner_capture_triggers,
+            num_cudagraph_captured=num_cudagraph_captured,
         ),
         # loading the model causes compilation (if enabled) to happen
         vllm_runner(
@@ -168,19 +173,18 @@ def test_splitting_ops_dynamic():
     assert not config.compilation_config.splitting_ops_contain_attention()
 
     # When use_inductor_graph_partition=True
-    if is_torch_equal_or_newer("2.9.0.dev"):
-        config = VllmConfig(
-            compilation_config=CompilationConfig(
-                mode=CompilationMode.VLLM_COMPILE,
-                use_inductor_graph_partition=True,
-                splitting_ops=["vllm::unified_attention"],
-            )
+    config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            use_inductor_graph_partition=True,
+            splitting_ops=["vllm::unified_attention"],
         )
-        # with inductor partition we use splitting_ops directly for
-        # partition rules
-        assert config.compilation_config.splitting_ops == ["vllm::unified_attention"]
+    )
+    # with inductor partition we use splitting_ops directly for
+    # partition rules
+    assert config.compilation_config.splitting_ops == ["vllm::unified_attention"]
 
-    # When attn_fusion pass enabled, splitting_ops now default to attention ops.
+    # When attn_fusion pass enabled.
     config = VllmConfig(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
@@ -189,29 +193,41 @@
             cudagraph_mode=CUDAGraphMode.PIECEWISE,
         )
     )
-    # With the new simplified logic, attention fusion works with splitting_ops
-    assert config.compilation_config.splitting_ops_contain_attention()
-    # cudagraph mode remains PIECEWISE
-    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
+    assert config.compilation_config.splitting_ops == []
+    # cudagraph mode also fall back to FULL
+    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL
 
-    # When both use_inductor_graph_partition and attn_fusion pass enabled.
-    if is_torch_equal_or_newer("2.9.0.dev"):
+    # splitting_ops can not contain attention ops when attn_fusion
+    # pass enabled.
+    with pytest.raises(ValidationError):
         config = VllmConfig(
             compilation_config=CompilationConfig(
                 mode=CompilationMode.VLLM_COMPILE,
-                use_inductor_graph_partition=True,
                 pass_config={"enable_attn_fusion": True, "enable_noop": True},
                 custom_ops=["+quant_fp8"],
                 cudagraph_mode=CUDAGraphMode.PIECEWISE,
+                # work around for accessing all attntion ops
+                splitting_ops=CompilationConfig()._attention_ops,
             )
         )
-        # With inductor graph partition, attn_fusion and splitting_ops
-        # work together. Default splitting_ops include attention ops.
-        assert config.compilation_config.splitting_ops_contain_attention()
-        # enable_attn_fusion is directly supported under
-        # use_inductor_graph_partition=True, and cudagraph_mode
-        # is unchanged.
-        assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
+
+    # When both use_inductor_graph_partition and attn_fusion pass enabled.
+    config = VllmConfig(
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE,
+            use_inductor_graph_partition=True,
+            pass_config={"enable_attn_fusion": True, "enable_noop": True},
+            custom_ops=["+quant_fp8"],
+            cudagraph_mode=CUDAGraphMode.PIECEWISE,
+        )
+    )
+    # With inductor graph partition, attn_fusion and splitting_ops
+    # work together. Default splitting_ops include attention ops.
+    assert config.compilation_config.splitting_ops_contain_attention()
+    # enable_attn_fusion is directly supported under
+    # use_inductor_graph_partition=True, and cudagraph_mode
+    # is unchanged.
+    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
 
 
 def test_should_split():
@@ -293,25 +309,36 @@ def attention(
         "tp_size",
         "enable_sequence_parallelism",
         "max_num_batched_tokens",
-        "use_cudagraph",
+        "cudagraph_mode",
         "expected_max_size",
     ),
     [
-        (None, None, 1, False, 2048, True, 512),
-        ([1, 2, 4], 4, 1, False, 2048, True, 4),
-        ([1, 2, 4], 8, 1, False, 2048, True, RuntimeError),
-        ([1, 256], None, 1, False, 2048, True, 256),
-        ([], None, 1, False, 2048, False, 0),
-        (None, 0, 1, False, 2048, False, 0),
+        (None, None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
+        ([1, 2, 4], 4, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 4),
+        (
+            [1, 2, 4],
+            8,
+            1,
+            False,
+            2048,
+            CUDAGraphMode.FULL_AND_PIECEWISE,
+            ValidationError,
+        ),
+        ([1, 256], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
+        ([], None, 1, False, 2048, CUDAGraphMode.NONE, 0),
+        (None, 0, 1, False, 2048, CUDAGraphMode.NONE, 0),
         # truncated to nearest multiple of 8 or 16
-        (None, 257, 1, False, 2048, True, 256),
-        ([1, 2, 4, 15], None, 1, False, 2048, True, 15),  # max from list
-        ([1, 2, 4, 15], None, 2, True, 2048, True, 4),  # filtered out 15 due to SP
-        ([1, 2, 4, 15], None, 1, False, 8, True, 4),  # limited by the max_tokens
+        (None, 257, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
+        # max from list
+        ([1, 2, 4, 15], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 15),
+        # filtered out 15 due to SP
+        ([1, 2, 4, 15], None, 2, True, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 4),
+        # limited by the max_tokens
+        ([1, 2, 4, 15], None, 1, False, 8, CUDAGraphMode.FULL_AND_PIECEWISE, 4),
         # the list should contain at least 1 element when use cudagraph
-        ([], None, 1, False, 2048, True, RuntimeError),
+        ([], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, ValidationError),
         # the max capturing size should be >= 1 when use cudagraph
-        (None, 0, 1, False, 2048, True, RuntimeError),
+        (None, 0, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, ValidationError),
     ],
 )
 def test_cudagraph_sizes_post_init(
@@ -320,15 +347,17 @@ def test_cudagraph_sizes_post_init(
     tp_size,
     enable_sequence_parallelism,
     max_num_batched_tokens,
-    use_cudagraph,
+    cudagraph_mode,
     expected_max_size,
 ):
     ctx = nullcontext()
-    if isinstance(expected_max_size, Exception):
+    if expected_max_size == ValidationError:
         ctx = pytest.raises(expected_max_size)
 
-    cudagraph_mode = CUDAGraphMode.PIECEWISE if use_cudagraph else CUDAGraphMode.NONE
-    with ctx:
+    with (
+        ctx,
+        patch("vllm.config.parallel.cuda_device_count_stateless", return_value=tp_size),
+    ):
         compilation_config = CompilationConfig(
             cudagraph_capture_sizes=cudagraph_capture_sizes,
             max_cudagraph_capture_size=max_cudagraph_capture_size,
@@ -342,11 +371,13 @@
         engine_args = EngineArgs(
             model="facebook/opt-125m",
             tensor_parallel_size=tp_size,
+            max_num_seqs=min(max_num_batched_tokens, 128),
             max_num_batched_tokens=max_num_batched_tokens,
            compilation_config=compilation_config,
         )
         vllm_config = engine_args.create_engine_config()
 
-        assert (
-            vllm_config.compilation_config.max_cudagraph_capture_size == expected_max_size
-        )
+        assert (
+            vllm_config.compilation_config.max_cudagraph_capture_size
+            == expected_max_size
+        )
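
The updated test exercises the same flow user code goes through: build a `CompilationConfig`, hand it to `EngineArgs`, and read the resolved `max_cudagraph_capture_size` off the finalized config, with invalid combinations now surfacing as pydantic `ValidationError` rather than `RuntimeError`. A rough sketch of that flow outside pytest (model name and sizes mirror the test values; it assumes a CUDA-capable environment, since `create_engine_config()` probes the current platform):

```python
from pydantic import ValidationError

from vllm.config import CompilationConfig, CUDAGraphMode
from vllm.engine.arg_utils import EngineArgs

try:
    compilation_config = CompilationConfig(
        cudagraph_capture_sizes=[1, 2, 4],
        cudagraph_mode=CUDAGraphMode.FULL_AND_PIECEWISE,
    )
    engine_args = EngineArgs(
        model="facebook/opt-125m",
        compilation_config=compilation_config,
    )
    # Finalizing the config derives/truncates the maximum capture size.
    vllm_config = engine_args.create_engine_config()
    print(vllm_config.compilation_config.max_cudagraph_capture_size)  # 4, the list max
except ValidationError:
    # e.g. an empty capture list or a max size of 0 while cudagraphs are enabled
    print("invalid cudagraph configuration")
```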

tests/compile/test_decorator.py

Lines changed: 0 additions & 3 deletions
@@ -80,7 +80,6 @@ def test_ignore_torch_compile_decorator(use_inductor_graph_partition, monkeypatc
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
-            use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
             use_inductor_graph_partition=use_inductor_graph_partition,
@@ -215,7 +214,6 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch
         ),
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
-            use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
             use_inductor_graph_partition=use_inductor_graph_partition,
@@ -257,7 +255,6 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch
         ),
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
-            use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
             use_inductor_graph_partition=use_inductor_graph_partition,

tests/models/multimodal/generation/test_qwen2_5_vl.py

Lines changed: 0 additions & 2 deletions
@@ -61,10 +61,8 @@ def test_qwen2_5_vl_evs_functionality(
         model,
         runner="generate",
         max_model_len=4000,
-        max_num_seqs=1,
         dtype=dtype,
         limit_mm_per_prompt={"video": 1},
-        tensor_parallel_size=1,
         video_pruning_rate=video_pruning_rate,
     ) as vllm_model:
         # Generate output - this should not crash
