Merged
Changes from 5 commits
22 changes: 19 additions & 3 deletions requirements/test.txt
@@ -23,6 +23,10 @@ anyio==4.6.2.post1
# via httpx
argcomplete==3.5.1
# via datamodel-code-generator
async-timeout==5.0.1
# via
# aiohttp
# redis
attrs==24.2.0
# via
# aiohttp
@@ -117,6 +121,10 @@ encodec==0.1.1
# via vocos
evaluate==0.4.3
# via lm-eval
exceptiongroup==1.2.2
# via
# anyio
# pytest
fastparquet==2024.11.0
# via genai-perf
fastrlock==0.8.2
@@ -559,9 +567,7 @@ sentence-transformers==3.2.1
sentencepiece==0.2.0
# via mistral-common
setuptools==75.8.0
# via
# pytablewriter
# torch
# via pytablewriter
shellingham==1.5.4
# via typer
six==1.16.0
@@ -608,6 +614,12 @@ timm==1.0.11
# via -r requirements/test.in
tokenizers==0.21.0
# via transformers
toml==0.10.2
# via datamodel-code-generator
tomli==2.2.1
# via
# black
# pytest
torch==2.6.0
# via
# -r requirements/test.in
@@ -673,12 +685,16 @@ typer==0.15.2
# via fastsafetensors
typing-extensions==4.12.2
# via
# anyio
# black
# huggingface-hub
# librosa
# mistral-common
# multidict
# pqdm
# pydantic
# pydantic-core
# rich
# torch
# typer
tzdata==2024.2
33 changes: 33 additions & 0 deletions tests/compile/piecewise/test_full_cudagraph.py
@@ -0,0 +1,33 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CompilationLevel


def run_model(compilation_config: CompilationConfig):
    prompts = ["Hello, my name is"]
    sampling_params = SamplingParams(temperature=0.0, max_tokens=20)

    llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
              compilation_config=compilation_config)

    return llm.generate(prompts, sampling_params)


def test_full_cudagraph(monkeypatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_FLASH_ATTN_VERSION", "3")

        full_cudagraph_responses = run_model(
            compilation_config=CompilationConfig(
                level=CompilationLevel.FULL_GRAPH,
                use_cudagraph=True,
            ))

        piecewise_responses = run_model(compilation_config=CompilationConfig(
            level=CompilationLevel.PIECEWISE,
            use_cudagraph=True,
        ))

        assert full_cudagraph_responses[0].outputs[
            0].text == piecewise_responses[0].outputs[0].text
4 changes: 2 additions & 2 deletions vllm/compilation/backends.py
@@ -279,8 +279,8 @@ def call_module(self, target: torch.fx.node.Target,

class VllmBackend:
"""The compilation backend for `torch.compile` with vLLM.
It is used for compilation level of `CompilationLevel.PIECEWISE`,
where we customize the compilation.
It is used for compilation level of `CompilationLevel.PIECEWISE` or
`CompilationLevel.FULL_GRAPH`, where we customize the compilation.

The major work of this backend is to split the graph into
piecewise graphs, and pass them to the piecewise backend.
4 changes: 2 additions & 2 deletions vllm/compilation/monitor.py
@@ -17,7 +17,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig):
torch_compile_start_time = time.time()

compilation_config: CompilationConfig = vllm_config.compilation_config
if compilation_config.level == CompilationLevel.PIECEWISE and \
if compilation_config.level >= CompilationLevel.PIECEWISE and \
compilation_config.debug_dump_path:
import depyf
path = os.path.join(compilation_config.debug_dump_path,
@@ -29,7 +29,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig):

def end_monitoring_torch_compile(vllm_config: VllmConfig):
compilation_config: CompilationConfig = vllm_config.compilation_config
if compilation_config.level == CompilationLevel.PIECEWISE:
if compilation_config.level >= CompilationLevel.PIECEWISE:
logger.info("torch.compile takes %.2f s in total",
compilation_config.compilation_time)
global context_manager
24 changes: 15 additions & 9 deletions vllm/config.py
@@ -3065,6 +3065,7 @@ class CompilationLevel:
DYNAMO_AS_IS = 1
DYNAMO_ONCE = 2
PIECEWISE = 3
FULL_GRAPH = 4
Collaborator:

Why have a new compilation level for full CUDA graphs? This seems like a special case of PIECEWISE where there is only a single graph.

Contributor Author:

Yes that's true.

I think it's nicer from a usability perspective to have an explicit level that people can just enable with the -O4 flag, rather than writing a compilation config that sets level 4 and an empty splitting_ops array.

But I would love to get your thoughts, @WoosukKwon. I am open to either way.

Collaborator (@zou3519, Apr 25, 2025):

My hot take is that O4 implies the code is actually faster than O3 and should therefore be enabled by default, but I don't think we have made that conclusion in general yet, so I don't think we should have this as O4 right now. Something else we could do is add a --full-cuda-graphs option that sets splitting_ops to an empty array, and document it.

Member:

I'm still worried that this full cudagraph support can silently break when some attention kernels have different designs and implementations for prefill and decode, for example DeepSeek MLA attention. In any case, we should know the list of situations where the optimization in this PR applies.

Collaborator:

@youkaichao How was this issue solved in V1?

Collaborator (@WoosukKwon, Apr 28, 2025):

@chanh I'd slightly prefer --full-cuda-graphs over the other options, but I'd like to follow @youkaichao's opinion since he has much more knowledge about this API than I do.

Member:

IMO --full-cuda-graphs will be the best UX for most people.

Contributor Author:

Okay, rather than -O4 I will use a separate flag for this feature.

I am thinking of keeping it under --compilation-config, since in V1 we force the PIECEWISE compilation backend by default and all the related configs (including the other cudagraph configs) are already under --compilation-config.

So it will look like this: --compilation-config '{"full_cuda_graph": True}'

Let me know if you are good with that, or whether you think it should be a top-level config, @tlrmchlsmth @youkaichao? Making it top-level might be strange, since all the cudagraph-related configs are under --compilation-config.

Collaborator:

Keeping it in --compilation-config sounds reasonable to me.

Contributor Author:

sounds good, PR has been updated
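
For reference, a minimal sketch of what this snapshot's set_splitting_ops_for_v1 does at each level (illustration only; the identifiers come from the diff below, and the interface may differ after the full_cuda_graph update mentioned above):

from vllm.config import CompilationConfig, CompilationLevel

piecewise = CompilationConfig(level=CompilationLevel.PIECEWISE)  # -O3
piecewise.set_splitting_ops_for_v1()
# splitting_ops is now ["vllm.unified_attention",
#                       "vllm.unified_attention_with_output"]: the graph is
# split at every attention call and each piece gets its own CUDA graph.

full = CompilationConfig(level=CompilationLevel.FULL_GRAPH)  # -O4
full.set_splitting_ops_for_v1()
# splitting_ops is now []: there are no split points, so the whole model is
# captured as a single CUDA graph, the "PIECEWISE with one graph" case
# described above.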



class CompilationConfig(BaseModel):
@@ -3077,6 +3078,7 @@ class CompilationConfig(BaseModel):
- 1: dynamo as is.
- 2: dynamo once.
- 3: piecewise compilation.
- 4: full compilation.
- debug_dump_path: the path to dump the debug information.
- cache_dir: the directory to store the compiled graph, to
accelerate Inductor compilation. By default, it will use
@@ -3088,6 +3090,7 @@
We use string to avoid serialization issues when using compilation in a distributed setting.
When the compilation level is 1 or 2, the backend is used for the compilation directly (it sees the whole graph).
When the compilation level is 3, the backend is used for the piecewise compilation (it sees a part of the graph).
When the compilation level is 4, the backend is used for the full graph.
- custom_ops: fine-grained control over which custom ops to enable/disable.
Use 'all' to enable all, 'none' to disable all.
Also specify a list of custom op names to enable (prefixed with a '+'),
@@ -3260,7 +3263,7 @@ def __repr__(self) -> str:
@classmethod
def from_cli(cls, cli_value: str) -> "CompilationConfig":
"""Parse the CLI value for the compilation config."""
if cli_value in ["0", "1", "2", "3"]:
if cli_value in ["0", "1", "2", "3", "4"]:
return cls(level=int(cli_value))
# do not use `eval`, it is dangerous and can execute arbitrary code
dict_value = ast.literal_eval(cli_value)
@@ -3327,7 +3330,7 @@ def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:

# TODO: pass user-specified backend to piecewise compilation
# merge with the config use_inductor
assert self.level == CompilationLevel.PIECEWISE
assert self.level >= CompilationLevel.PIECEWISE

from vllm.compilation.backends import VllmBackend
return VllmBackend(vllm_config)
@@ -3382,13 +3385,15 @@ def init_with_cudagraph_sizes(self,
self.max_capture_size] = self.max_capture_size

def set_splitting_ops_for_v1(self):
# If default, override splitting ops for piecewise cudagraph on V1.
# NOTE: this function needs to be called
if not self.splitting_ops:
self.splitting_ops = [
"vllm.unified_attention",
"vllm.unified_attention_with_output",
]
if self.level == CompilationLevel.PIECEWISE:
self.splitting_ops = [
"vllm.unified_attention",
"vllm.unified_attention_with_output",
]
elif self.level == CompilationLevel.FULL_GRAPH:
self.splitting_ops = []
Collaborator:

Suggested change:
-elif self.level == CompilationLevel.FULL_GRAPH:
-    self.splitting_ops = []
+else:
+    assert not self.splitting_ops

Contributor Author:

We should remove the if not self.splitting_ops: check too, right? Otherwise the assert would be redundant:

if not self.splitting_ops:
    if self.level == CompilationLevel.PIECEWISE:
        self.splitting_ops = [
            "vllm.unified_attention",
            "vllm.unified_attention_with_output",
        ]
    else:
        assert not self.splitting_ops



@dataclass
@@ -3614,7 +3619,8 @@ def __post_init__(self):
self.compilation_config.cudagraph_num_of_warmups = 1
self.compilation_config.pass_config.enable_fusion = False
self.compilation_config.pass_config.enable_noop = False
self.compilation_config.level = CompilationLevel.PIECEWISE
if self.compilation_config.level < CompilationLevel.PIECEWISE:
self.compilation_config.level = CompilationLevel.PIECEWISE
self.compilation_config.set_splitting_ops_for_v1()

self._set_cudagraph_sizes()
@@ -3787,7 +3793,7 @@ def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
logger.debug("disabled custom ops: %s",
vllm_config.compilation_config.disabled_custom_ops)
if check_compile and \
vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \
vllm_config.compilation_config.level >= CompilationLevel.PIECEWISE \
and compilation_counter.num_models_seen == num_models_seen:
# If the model supports compilation,
# compilation_counter.num_models_seen should be increased
11 changes: 4 additions & 7 deletions vllm/v1/attention/backends/flash_attn.py
@@ -286,15 +286,12 @@ def reorder_batch(self, input_batch: "InputBatch",
def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
common_prefix_len: int):
max_seq_len = self.runner.seq_lens_np[:num_reqs].max()
query_start_loc_cpu = self.runner.query_start_loc_cpu[:num_reqs + 1]
query_start_loc = query_start_loc_cpu.to(self.runner.device,
non_blocking=True)
seq_lens_cpu = self.runner.seq_lens_cpu[:num_reqs]
seq_lens = seq_lens_cpu.to(self.runner.device, non_blocking=True)
query_start_loc = self.runner.query_start_loc[:num_reqs + 1]
seq_lens = self.runner.seq_lens[:num_reqs]

block_table = (
self.runner.input_batch.block_table.get_device_tensor()[:num_reqs])
slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
self.runner.device, non_blocking=True).long()
Comment on lines -320 to -321
Member:

Just wondering: Why do we have this difference?

@WoosukKwon it's because we call .long() here. We might want to still call it here, to keep the dtypes consistent in the model runner.

Contributor Author (@chanh, May 7, 2025):

@tlrmchlsmth what do you think about just making the CPU tensor int64 too? (That's the route I went with in the latest update on this PR.)

Member:

Had to check: that takes the slot_mapping CPU->GPU transfer from 32 KB to 64 KB (with the defaults when serving on an H100). That seems fine to me, since now we don't do that copy in every layer.
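
Working backwards from those numbers (a sketch; the token count is inferred from the 32 KB figure rather than stated explicitly in this thread):

# 32 KB of int32 slot_mapping implies 8192 tokens; the same buffer in int64 is
# twice as large, but is now copied once per step instead of once per layer.
num_tokens = (32 * 1024) // 4       # 8192 tokens inferred from the 32 KB figure
int64_bytes = num_tokens * 8        # 65536 bytes = 64 KB after the dtype change
print(num_tokens, int64_bytes // 1024)  # 8192 64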

slot_mapping = self.runner.slot_mapping[:num_actual_tokens]

# for local attention
local_attn_metadata = None
49 changes: 44 additions & 5 deletions vllm/v1/worker/gpu_model_runner.py
@@ -185,7 +185,7 @@ def __init__(
)

self.use_cuda_graph = (self.vllm_config.compilation_config.level
== CompilationLevel.PIECEWISE
>= CompilationLevel.PIECEWISE
and not self.model_config.enforce_eager)
# TODO(woosuk): Provide an option to tune the max cudagraph batch size.
# The convention is different.
@@ -206,6 +206,19 @@ def __init__(
self.positions = torch.zeros(self.max_num_tokens,
dtype=torch.int64,
device=self.device)
self.query_start_loc = torch.zeros(self.max_num_reqs + 1,
dtype=torch.int32,
device=self.device)
self.seq_lens = torch.zeros(self.max_num_reqs,
dtype=torch.int32,
device=self.device)
self.slot_mapping = torch.zeros(
self.max_num_tokens,
# CPU slot_mapping is int32, but
# this one must be int64
dtype=torch.int64,
device=self.device)

# None in the first PP rank. The rest are set after load_model.
self.intermediate_tensors: Optional[IntermediateTensors] = None

@@ -584,6 +597,19 @@ def _prepare_inputs(
scheduler_output.num_common_prefix_blocks,
)

self.query_start_loc[:num_reqs + 1].copy_(
self.query_start_loc_cpu[:num_reqs + 1], non_blocking=True)
self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs],
non_blocking=True)
self.slot_mapping[:total_num_scheduled_tokens].copy_(
self.slot_mapping_cpu[:total_num_scheduled_tokens],
non_blocking=True)

# Fill unused with -1. Needed for reshape_and_cache
self.slot_mapping[total_num_scheduled_tokens:].fill_(-1)
self.seq_lens[num_reqs:].fill_(0)
self.query_start_loc[num_reqs + 1:].fill_(-1)

attn_metadata = self.attn_metadata_builder.build(
num_reqs=num_reqs,
num_actual_tokens=total_num_scheduled_tokens,
@@ -1392,6 +1418,7 @@ def _get_prompt_logprobs_dict(
def _dummy_run(
self,
num_tokens: int,
initialize_attention_metadata: bool = False,
) -> torch.Tensor:

# Set num_scheduled_tokens based on num_tokens and max_num_seqs
@@ -1408,6 +1435,16 @@ def _dummy_run(
num_scheduled_tokens = np.array(num_scheduled_tokens_list,
dtype=np.int32)

if initialize_attention_metadata:
attn_metadata = self.attn_metadata_builder.build(
num_reqs=num_tokens,
num_actual_tokens=num_tokens,
max_query_len=num_tokens,
common_prefix_len=0,
)
else:
attn_metadata = None

with self.maybe_dummy_run_with_lora(self.lora_config,
num_scheduled_tokens):
model = self.model
@@ -1436,7 +1473,7 @@ def _dummy_run(
for k, v in self.intermediate_tensors.items()
})

with set_forward_context(None,
with set_forward_context(attn_metadata,
Contributor (@hidva, Jun 11, 2025):

Considering that self.maybe_setup_kv_connector(scheduler_output) is not executed here, in the full CUDA graph scenario the sequence unified_attention_with_output -> maybe_save_kv_layer_to_connector -> connector.save_kv_layer() will cause the connector to read uninitialized metadata.

https://github.com/LMCache/LMCache/blob/680fbdf84e2ee1040bf4e084d43c9155a91b8d5c/lmcache/integration/vllm/vllm_v1_adapter.py#L609-L610

Therefore, full CUDA graph should be incompatible with the KV connector?

@simon-mo

self.vllm_config,
num_tokens=num_tokens):
hidden_states = model(
@@ -1603,7 +1640,8 @@ def capture_model(self) -> None:
if not self.use_cuda_graph:
logger.warning(
"Skipping CUDA graph capture. Please add "
"-O %s to use CUDA graphs.", CompilationLevel.PIECEWISE)
"-O %s or -O %s to use CUDA graphs.",
CompilationLevel.PIECEWISE, CompilationLevel.FULL_GRAPH)
return

start_time = time.perf_counter()
Expand All @@ -1616,8 +1654,9 @@ def capture_model(self) -> None:
for num_tokens in reversed(self.cudagraph_batch_sizes):
for _ in range(self.vllm_config.compilation_config.
cudagraph_num_of_warmups):
self._dummy_run(num_tokens)
self._dummy_run(num_tokens)
self._dummy_run(num_tokens,
initialize_attention_metadata=True)
self._dummy_run(num_tokens, initialize_attention_metadata=True)
Collaborator:

Shouldn't this depend on whether it's O3 or O4?

Contributor Author:

In this part of the code we are doing a dummy run for graph-capture purposes, so from that perspective it makes sense to always initialize attention metadata, even if a mode like O3 doesn't end up using it. Toggling the behavior based on O3 vs O4 here doesn't add much benefit, IMO.

The only time you don't want to initialize attention metadata is during the profiling run; in v1/attention/backends/flash_attn.py::forward() there is this code to skip attention:

if attn_metadata is None:
    # Profiling run.
    return output

But let me know if you feel strongly about adding logic here.
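
For context, a rough sketch of the two _dummy_run call paths being discussed (illustration only; `runner` stands in for the GPUModelRunner instance, and the signatures are taken from this diff):

# Memory profiling: attention metadata stays None, so FlashAttention's
# forward() takes the `if attn_metadata is None: return output` early exit.
runner._dummy_run(num_tokens)

# CUDA graph capture (capture_model): build real attention metadata so the
# captured graph records the actual attention kernel launches.
runner._dummy_run(num_tokens, initialize_attention_metadata=True)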


end_time = time.perf_counter()
end_free_gpu_memory = torch.cuda.mem_get_info()[0]
Expand Down