
Commit 58170d6

[Hardware][CPU] Add embedding models support for CPU backend (#10193)
Signed-off-by: Isotr0py <[email protected]>
1 parent 9804ac7 commit 58170d6
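With this change the CPU backend can load and serve embedding models end to end (a pooling model runner, encoder-only support in the SDPA backend, and the re-enabled CPU embedding tests below). A minimal usage sketch, assuming a CPU build of vLLM and the public LLM.encode API; the model name and dtype are illustrative choices, not part of this commit:

# Sketch only: run an embedding model on the CPU backend.
# Assumes vLLM is installed with the CPU backend; model and dtype are examples.
from vllm import LLM

llm = LLM(model="intfloat/e5-small-v2", dtype="bfloat16")
outputs = llm.encode(["vLLM now supports embedding models on CPU."])

for output in outputs:
    vector = output.outputs.embedding  # list of floats, one vector per prompt
    print(len(vector))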

File tree

9 files changed: +185 additions, -52 deletions


.buildkite/run-cpu-test-ppc64le.sh

Lines changed: 1 addition & 2 deletions
@@ -25,8 +25,7 @@ function cpu_tests() {
       decord einops librosa peft Pillow sentence-transformers soundfile \
       transformers_stream_generator matplotlib datamodel_code_generator
     pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-    # Embedding models are not supported for CPU yet
-    # pytest -v -s tests/models/embedding/language
+    pytest -v -s tests/models/embedding/language
     pytest -v -s tests/models/encoder_decoder/language
     pytest -v -s tests/models/decoder_only/language/test_models.py
     pytest -v -s tests/models/decoder_only/audio_language -m cpu_model

.buildkite/run-cpu-test.sh

Lines changed: 1 addition & 2 deletions
@@ -32,8 +32,7 @@ function cpu_tests() {
       decord einops librosa peft Pillow sentence-transformers soundfile \
       transformers_stream_generator matplotlib datamodel_code_generator
     pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-    # Embedding models are not supported for CPU yet
-    # pytest -v -s tests/models/embedding/language
+    pytest -v -s tests/models/embedding/language
     pytest -v -s tests/models/encoder_decoder/language
     pytest -v -s tests/models/decoder_only/language/test_models.py
     pytest -v -s tests/models/decoder_only/audio_language -m cpu_model

tests/models/embedding/language/test_embedding.py

Lines changed: 4 additions & 3 deletions
@@ -4,6 +4,8 @@
 """
 import pytest
 
+from vllm.utils import current_platform
+
 from ..utils import check_embeddings_close
 
 # Model, Guard
@@ -21,15 +23,14 @@
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 def test_models(
-    monkeypatch,
     hf_runner,
     vllm_runner,
     example_prompts,
     model,
     dtype: str,
 ) -> None:
-    if model in ENCODER_ONLY:
-        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
+    if model not in ENCODER_ONLY and current_platform.is_cpu():
+        pytest.skip("Skip large embedding models test on CPU.")
 
     # The example_prompts has ending "\n", for example:
     # "Write a short story about a robot that dreams for the first time.\n"

vllm/attention/backends/torch_sdpa.py

Lines changed: 10 additions & 4 deletions
@@ -158,7 +158,8 @@ def get_seq_lens(
         * Appropriate sequence lengths tensor for key & value
         '''
 
-        if attn_type == AttentionType.DECODER:
+        if (attn_type == AttentionType.DECODER
+                or attn_type == AttentionType.ENCODER_ONLY):
             seq_lens_q = self.seq_lens
             seq_lens_kv = self.seq_lens
         elif attn_type == AttentionType.ENCODER:
@@ -189,7 +190,8 @@ def get_attn_bias(
         * Appropriate attention bias value given the attention type
         '''
 
-        if attn_type == AttentionType.DECODER:
+        if (attn_type == AttentionType.DECODER
+                or attn_type == AttentionType.ENCODER_ONLY):
             return self.attn_bias
         elif attn_type == AttentionType.ENCODER:
             return self.encoder_attn_bias
@@ -215,7 +217,8 @@ def set_attn_bias(
         encoder/decoder cross-attention
         '''
 
-        if attn_type == AttentionType.DECODER:
+        if (attn_type == AttentionType.DECODER
+                or attn_type == AttentionType.ENCODER_ONLY):
             self.attn_bias = attn_bias
         elif attn_type == AttentionType.ENCODER:
             self.encoder_attn_bias = attn_bias
@@ -252,7 +255,8 @@ def get_seq_len_block_table_args(
         * Appropriate block tables (or None)
         '''
 
-        if attn_type == AttentionType.DECODER:
+        if (attn_type == AttentionType.DECODER
+                or attn_type == AttentionType.ENCODER_ONLY):
             # Decoder self-attention
             # Choose max_seq_len based on whether we are in prompt_run
             return (self.seq_lens_tensor, self.max_decode_seq_len,
@@ -420,6 +424,8 @@ def forward(
                 "Torch SDPA backend doesn't support prefix decoding.")
 
         if decode_meta := attn_metadata.decode_metadata:
+            assert attn_type != AttentionType.ENCODER_ONLY, (
+                "Encoder-only models should not have decode metadata.")
             # Decoding run.
             (
                 seq_lens_arg,
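The metadata helpers above now route AttentionType.ENCODER_ONLY through the same sequence-length and bias slots as decoder self-attention, while the new assert rules out a decode phase for encoder-only models. The remaining difference is the mask: encoder-only attention is bidirectional, decoder self-attention is causal. A standalone sketch of that distinction with torch SDPA (shapes are arbitrary, not vLLM code):

# Bidirectional (encoder-only) vs. causal (decoder) attention via torch SDPA.
import torch
import torch.nn.functional as F

q = k = v = torch.randn(1, 8, 16, 64)  # (batch, heads, seq_len, head_dim)

encoder_only_out = F.scaled_dot_product_attention(q, k, v, is_causal=False)
decoder_out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

print(encoder_only_out.shape, decoder_out.shape)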

vllm/model_executor/models/bert.py

Lines changed: 0 additions & 6 deletions
@@ -5,7 +5,6 @@
 from transformers import BertConfig
 
 from vllm.attention import Attention, AttentionMetadata, AttentionType
-from vllm.attention.backends.xformers import XFormersImpl
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
@@ -218,11 +217,6 @@ def __init__(
             quant_config=quant_config,
             prefix=f"{prefix}.attn")
 
-        if not isinstance(self.attn.impl, XFormersImpl):
-            raise ValueError(
-                "Encoder-only models currently require XFORMERS attention "
-                "backend. Set VLLM_ATTENTION_BACKEND=XFORMERS to use BERT.")
-
     def forward(
         self,
         hidden_states: torch.Tensor,
vllm/worker/cpu_embedding_model_runner.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+import dataclasses
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
+
+import torch
+
+from vllm.model_executor.pooling_metadata import PoolingMetadata
+from vllm.multimodal import MultiModalKwargs
+from vllm.pooling_params import PoolingParams
+from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData,
+                           SequenceGroupMetadata)
+from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, ModelInputForCPU,
+                                          ModelInputForCPUBuilder)
+
+
+@dataclasses.dataclass(frozen=True)
+class ModelInputForCPUWithPoolingMetadata(ModelInputForCPU):
+    """
+    Used by the CPUEmbeddingModelRunner.
+    """
+    pooling_metadata: Optional["PoolingMetadata"] = None
+
+
+class CPUEmbeddingModelRunner(
+        CPUModelRunnerBase[ModelInputForCPUWithPoolingMetadata]):
+    _model_input_cls: Type[ModelInputForCPUWithPoolingMetadata] = (
+        ModelInputForCPUWithPoolingMetadata)
+    _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        model_input: ModelInputForCPUWithPoolingMetadata,
+        kv_caches: List[torch.Tensor],
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        num_steps: int = 1,
+    ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]:
+        if num_steps > 1:
+            raise ValueError(
+                "CPU worker does not support multi-step execution.")
+
+        num_layers = self.model_config.get_num_layers(self.parallel_config)
+        # use an empty tensor instead of `None`` to force Dynamo to pass
+        # it by reference, rather by specializing on the value ``None``.
+        # the `dtype` argument does not matter, and we use `float32` as
+        # a placeholder (it has wide hardware support).
+        kv_caches = [
+            torch.tensor([], dtype=torch.float32, device=self.device)
+            for _ in range(num_layers)
+        ]
+
+        model_executable = self.model
+        execute_model_kwargs = {
+            "input_ids":
+            model_input.input_tokens,
+            "positions":
+            model_input.input_positions,
+            "kv_caches":
+            kv_caches,
+            "attn_metadata":
+            model_input.attn_metadata,
+            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
+                                         device=self.device),
+            "intermediate_tensors":
+            intermediate_tensors,
+        }
+
+        hidden_states = model_executable(**execute_model_kwargs)
+
+        return [
+            self.model.pooler(hidden_states=hidden_states,
+                              pooling_metadata=model_input.pooling_metadata)
+        ]
+
+    def make_model_input_from_broadcasted_tensor_dict(
+            self,
+            tensor_dict: Dict[str,
+                              Any]) -> ModelInputForCPUWithPoolingMetadata:
+        return ModelInputForCPUWithPoolingMetadata.from_broadcasted_tensor_dict(
+            tensor_dict,
+            attn_backend=self.attn_backend,
+        )
+
+    def prepare_model_input(
+        self,
+        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+        virtual_engine: int = 0,
+        finished_requests_ids: Optional[List[str]] = None
+    ) -> ModelInputForCPUWithPoolingMetadata:
+        assert seq_group_metadata_list is not None
+        model_input = self._prepare_model_input_tensors(
+            seq_group_metadata_list, finished_requests_ids)
+        # Prepare PoolingMetadata.
+        assert model_input.seq_lens is not None
+        pooling_metadata = self._prepare_pooling(seq_group_metadata_list,
+                                                 model_input.seq_lens)
+
+        return dataclasses.replace(model_input,
+                                   pooling_metadata=pooling_metadata)
+
+    def _prepare_pooling(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        prompt_lens: List[int],
+    ) -> PoolingMetadata:
+        """Prepare PoolingMetadata for the sequence group metadata list."""
+        seq_groups: List[Tuple[List[int], PoolingParams]] = []
+        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            pooling_params = seq_group_metadata.pooling_params
+            seq_groups.append((seq_ids, pooling_params))
+
+        seq_data: Dict[int, SequenceData] = {}
+        for seq_group_metadata in seq_group_metadata_list:
+            seq_data.update(seq_group_metadata.seq_data)
+
+        pooling_metadata = PoolingMetadata(
+            seq_groups=seq_groups,
+            seq_data=seq_data,
+            prompt_lens=prompt_lens,
+        )
+
+        return pooling_metadata
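This runner builds PoolingMetadata from the per-prompt sequence lengths and hands the final hidden states to the model's pooler instead of a sampler. As a rough standalone sketch of what pooling over a packed prompt batch looks like (mean pooling here; illustrative only, not vLLM's actual Pooler):

# Hidden states for all prompts are packed along the token axis; prompt_lens
# says where each prompt ends, which mirrors PoolingMetadata.prompt_lens.
import torch

hidden_states = torch.randn(7, 384)   # 7 tokens total across all prompts
prompt_lens = [3, 4]                  # prompt 0 -> tokens 0..2, prompt 1 -> 3..6

pooled = []
offset = 0
for length in prompt_lens:
    chunk = hidden_states[offset:offset + length]
    pooled.append(chunk.mean(dim=0))  # mean pooling; CLS/last-token also common
    offset += length

embeddings = torch.stack(pooled)      # (num_prompts, hidden_size)
print(embeddings.shape)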

vllm/worker/cpu_enc_dec_model_runner.py

Lines changed: 5 additions & 6 deletions
@@ -8,7 +8,7 @@
 from vllm.multimodal import MultiModalKwargs
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.utils import make_tensor_with_pad
-from vllm.worker.cpu_model_runner import (CPUModelRunner,
+from vllm.worker.cpu_model_runner import (CPUModelRunnerBase,
                                           ModelInputForCPUBuilder,
                                           ModelInputForCPUWithSamplingMetadata)
 from vllm.worker.model_runner_base import (
@@ -50,7 +50,8 @@ def from_broadcasted_tensor_dict(
             super().from_broadcasted_tensor_dict(tensor_dict, attn_backend))
 
 
-class CPUEncoderDecoderModelRunner(CPUModelRunner):
+class CPUEncoderDecoderModelRunner(
+        CPUModelRunnerBase[EncoderDecoderModelInputForCPU]):
     _model_input_cls: Type[EncoderDecoderModelInputForCPU] = (
         EncoderDecoderModelInputForCPU)
     _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder
@@ -87,10 +88,8 @@ def prepare_model_input(
         virtual_engine: int = 0,
         finished_requests_ids: Optional[List[str]] = None
     ) -> EncoderDecoderModelInputForCPU:
-        model_input = super().prepare_model_input(seq_group_metadata_list,
-                                                  virtual_engine,
-                                                  finished_requests_ids)
-        model_input = cast(EncoderDecoderModelInputForCPU, model_input)
+        model_input = self._prepare_model_input_tensors(
+            seq_group_metadata_list, finished_requests_ids)
         (
             attn_metadata,
             encoder_input_tokens_tensor,

vllm/worker/cpu_model_runner.py

Lines changed: 35 additions & 22 deletions
@@ -2,7 +2,8 @@
 import weakref
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
+from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
+                    TypeVar, Union)
 
 import torch
 from torch import nn
@@ -31,6 +32,7 @@
 
 logger = init_logger(__name__)
 
+TModelInputForCPU = TypeVar('TModelInputForCPU', bound="ModelInputForCPU")
 _PAD_SLOT_ID = -1
 
 
@@ -60,10 +62,10 @@ def as_broadcastable_tensor_dict(
 
     @classmethod
     def from_broadcasted_tensor_dict(
-        cls: Type["ModelInputForCPU"],
+        cls: Type[TModelInputForCPU],
         tensor_dict: Dict[str, Any],
         attn_backend: Optional["AttentionBackend"] = None
-    ) -> "ModelInputForCPU":
+    ) -> TModelInputForCPU:
         if attn_backend is not None:
             tensor_dict = _init_attn_metadata_from_tensor_dict(
                 attn_backend, tensor_dict)
@@ -255,11 +257,14 @@ def _prepare_prompt(
                     slot_mapping.append(_PAD_SLOT_ID)
                     continue
 
-                block_number = block_table[i //
-                                           self.block_size]  # type: ignore
-                block_offset = i % self.block_size  # type: ignore
-                slot = block_number * self.block_size + block_offset
-                slot_mapping.append(slot)
+                # For encoder-only models, the block_table is None,
+                # and there is no need to initialize the slot_mapping.
+                if block_table is not None:
+                    block_number = block_table[i //
+                                               self.block_size]  # type: ignore
+                    block_offset = i % self.block_size  # type: ignore
+                    slot = block_number * self.block_size + block_offset
+                    slot_mapping.append(slot)
 
         if any(input_mrope_positions):
             input_positions = None  # type: ignore
@@ -402,10 +407,12 @@ def _prepare_decode(
         )
 
 
-class CPUModelRunner(ModelRunnerBase[ModelInputForCPU]):
-    _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = (
-        ModelInputForCPUWithSamplingMetadata)
-    _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder
+class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]):
+    """
+    Helper class for shared methods between CPU model runners.
+    """
+    _model_input_cls: Type[TModelInputForCPU]
+    _builder_cls: Type[ModelInputForCPUBuilder]
 
     def __init__(
         self,
@@ -448,20 +455,11 @@ def __init__(
     def load_model(self) -> None:
        self.model = get_model(vllm_config=self.vllm_config)
 
-    def make_model_input_from_broadcasted_tensor_dict(
-        self,
-        tensor_dict: Dict[str, Any],
-    ) -> ModelInputForCPUWithSamplingMetadata:
-        return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict(  # noqa: E501
-            tensor_dict,
-            attn_backend=self.attn_backend,
-        )
-
     def _prepare_model_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
         finished_requests_ids: Optional[List[str]] = None
-    ) -> ModelInputForCPUWithSamplingMetadata:
+    ) -> TModelInputForCPU:
        """Helper method to prepare the model input based on a given sequence
        group. Prepares metadata needed for the base model forward pass but not
        metadata for possible additional steps, e.g., sampling.
@@ -473,6 +471,21 @@ def _prepare_model_input_tensors(
 
         return builder.build()  # type: ignore
 
+
+class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]):
+    _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = (
+        ModelInputForCPUWithSamplingMetadata)
+    _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder
+
+    def make_model_input_from_broadcasted_tensor_dict(
+        self,
+        tensor_dict: Dict[str, Any],
+    ) -> ModelInputForCPUWithSamplingMetadata:
+        return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict(  # noqa: E501
+            tensor_dict,
+            attn_backend=self.attn_backend,
+        )
+
     def prepare_model_input(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
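This refactor splits the old CPUModelRunner into a generic CPUModelRunnerBase parameterized by TModelInputForCPU, with CPUModelRunner, CPUEncoderDecoderModelRunner, and the new CPUEmbeddingModelRunner each pinning their own model-input type. A self-contained sketch of the same pattern (illustrative names, not vLLM's actual classes):

# Generic runner base parameterized by its model-input type; concrete
# runners set _input_cls so shared helpers return the right dataclass.
from dataclasses import dataclass
from typing import Generic, Type, TypeVar

@dataclass(frozen=True)
class BaseInput:
    tokens: list

@dataclass(frozen=True)
class SamplingInput(BaseInput):
    temperature: float = 1.0

@dataclass(frozen=True)
class PoolingInput(BaseInput):
    pool: str = "mean"

TInput = TypeVar("TInput", bound=BaseInput)

class RunnerBase(Generic[TInput]):
    _input_cls: Type[TInput]

    def prepare(self, tokens: list) -> TInput:
        # Shared preparation logic; the concrete subclass decides the type.
        return self._input_cls(tokens=tokens)

class SamplingRunner(RunnerBase[SamplingInput]):
    _input_cls = SamplingInput

class PoolingRunner(RunnerBase[PoolingInput]):
    _input_cls = PoolingInput

print(type(PoolingRunner().prepare([1, 2, 3])).__name__)  # PoolingInput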
