-
-
Notifications
You must be signed in to change notification settings - Fork 11.7k
[V1] address post issues related to #20059 (part 1); cascade attention reenable by default #23046
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
a012257
c37583c
5e998c2
648fbb3
3c88284
868c85f
3bef9e4
05cf012
20d8afb
9648a84
da79494
01083c3
ca3c2c9
59db6b1
11421de
2bf5569
bd1762a
2a50ecc
4aef453
ff3c671
27eecc2
8d3ecc8
48a8c7f
f3e08f3
3faff97
f09e47f
b8894f2
92cbd4f
df90576
6176761
4679802
5475e9e
413079b
254bfd3
f584663
d8a1ad7
891723a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -62,9 +62,17 @@ def max_cudagraph_mode(self) -> 'CUDAGraphMode': | |
| def has_full_cudagraphs(self) -> bool: | ||
| return self.max_cudagraph_mode() == CUDAGraphMode.FULL | ||
|
|
||
| def has_piecewise_cudagraphs(self) -> bool: | ||
| return self.requires_piecewise_compilation() | ||
|
Comment on lines
+64
to
+65
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These two seem semantically different
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, but they are equivalent in actuality since we don't allow piecewise mode with empty splitting ops (translated to FULL in this case). So, having piecewise_cudagraph means requiring piecewise compilation, and requiring piecewise compilation implies having piecewise_cudagraph. |
||
|
|
||
| def separate_routine(self) -> bool: | ||
| return isinstance(self.value, tuple) | ||
|
|
||
| def vaild_runtime_modes(self) -> bool: | ||
fhl2000 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| return self in [ | ||
| CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL | ||
| ] | ||
|
|
||
|
|
||
| @config | ||
| @dataclass | ||
|
|
@@ -544,20 +552,37 @@ def set_splitting_ops_for_v1(self): | |
| # full cudagraph outside the fx graph. This reduces some cpu | ||
| # overhead when the runtime batch_size is not cudagraph captured. | ||
| # see https://github.com/vllm-project/vllm/pull/20059 for details. | ||
| self.splitting_ops = self._attention_ops | ||
| if self.pass_config.enable_attn_fusion: | ||
| self.splitting_ops = [] | ||
| if self.cudagraph_mode.has_piecewise_cudagraphs(): | ||
| logger.warning_once( | ||
| "When enable_attn_fusion, splitting_ops will be set " | ||
| "to empty list, and cudagraph_mode containing " | ||
| "PIECEWISE will be treated as FULL cudagraph_mode. " | ||
| "Please ensure you are using attention backends that " | ||
| "support cudagraph or set cudagraph_mode to NONE " | ||
|
||
| "explicitly if encountering any problems.") | ||
| self.cudagraph_mode = CUDAGraphMode.FULL | ||
| else: | ||
| self.splitting_ops = self._attention_ops | ||
| elif len(self.splitting_ops) == 0: | ||
| logger.warning_once("Using piecewise compilation with empty " | ||
| "splitting_ops.") | ||
| if self.cudagraph_mode == CUDAGraphMode.PIECEWISE: | ||
| if self.cudagraph_mode.has_piecewise_cudagraphs(): | ||
| logger.warning_once( | ||
| "When compilation level is piecewise with empty " | ||
| "splitting_ops, PIECEWISE cudagraph_mode will be " | ||
| "treated as FULL cudagraph_mode. Please ensure you are " | ||
| "splitting_ops, cudagraph_mode containing PIECEWISE will " | ||
| "be treated as FULL cudagraph_mode. Please ensure you are " | ||
| "using attention backends that support cudagraph or set " | ||
| "cudagraph_mode to NONE explicitly if encountering " | ||
| "any problems.") | ||
| self.cudagraph_mode = CUDAGraphMode.FULL | ||
| self.splitting_ops = [] | ||
| else: # len(self.splitting_ops) > 0: | ||
| assert not self.pass_config.enable_attn_fusion or \ | ||
| not self.splitting_ops_contain_attention(), ( | ||
| "attention ops should not be in splitting_ops " | ||
| "when enable_attn_fusion is True") | ||
|
||
|
|
||
| def splitting_ops_contain_attention(self) -> bool: | ||
| return self.splitting_ops is not None and all( | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.