1 | 1 | import enum |
2 | 2 | import json |
3 | 3 | from dataclasses import dataclass, field, fields |
4 | | -from typing import (TYPE_CHECKING, ClassVar, List, Mapping, Optional, Tuple, |
5 | | - Type, Union) |
| 4 | +from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Mapping, |
| 5 | + Optional, Tuple, Type, Union) |
6 | 6 |
7 | 7 | import torch |
8 | 8 | from transformers import PretrainedConfig |
@@ -115,35 +115,39 @@ class ModelConfig: |
115 | 115 | the model name will be the same as `model`. |
116 | 116 | limit_mm_per_prompt: Maximum number of data instances per modality |
117 | 117 | per prompt. Only applicable for multimodal models. |
| 118 | + override_neuron_config: Initialize a non-default neuron config or |
| 119 | + override the default neuron config specific to Neuron devices; |
| 120 | + this argument configures the neuron config options that cannot |
| 121 | + be gathered from the vLLM arguments. |
118 | 122 | """ |
119 | 123 |
120 | 124 | def __init__( |
121 | | - self, |
122 | | - model: str, |
123 | | - tokenizer: str, |
124 | | - tokenizer_mode: str, |
125 | | - trust_remote_code: bool, |
126 | | - dtype: Union[str, torch.dtype], |
127 | | - seed: int, |
128 | | - revision: Optional[str] = None, |
129 | | - code_revision: Optional[str] = None, |
130 | | - rope_scaling: Optional[dict] = None, |
131 | | - rope_theta: Optional[float] = None, |
132 | | - tokenizer_revision: Optional[str] = None, |
133 | | - max_model_len: Optional[int] = None, |
134 | | - spec_target_max_model_len: Optional[int] = None, |
135 | | - quantization: Optional[str] = None, |
136 | | - quantization_param_path: Optional[str] = None, |
137 | | - enforce_eager: Optional[bool] = None, |
138 | | - max_context_len_to_capture: Optional[int] = None, |
139 | | - max_seq_len_to_capture: Optional[int] = None, |
140 | | - max_logprobs: int = 20, |
141 | | - disable_sliding_window: bool = False, |
142 | | - skip_tokenizer_init: bool = False, |
143 | | - served_model_name: Optional[Union[str, List[str]]] = None, |
144 | | - limit_mm_per_prompt: Optional[Mapping[str, int]] = None, |
145 | | - use_async_output_proc: bool = True, |
146 | | - ) -> None: |
| 125 | + self, |
| 126 | + model: str, |
| 127 | + tokenizer: str, |
| 128 | + tokenizer_mode: str, |
| 129 | + trust_remote_code: bool, |
| 130 | + dtype: Union[str, torch.dtype], |
| 131 | + seed: int, |
| 132 | + revision: Optional[str] = None, |
| 133 | + code_revision: Optional[str] = None, |
| 134 | + rope_scaling: Optional[dict] = None, |
| 135 | + rope_theta: Optional[float] = None, |
| 136 | + tokenizer_revision: Optional[str] = None, |
| 137 | + max_model_len: Optional[int] = None, |
| 138 | + spec_target_max_model_len: Optional[int] = None, |
| 139 | + quantization: Optional[str] = None, |
| 140 | + quantization_param_path: Optional[str] = None, |
| 141 | + enforce_eager: Optional[bool] = None, |
| 142 | + max_context_len_to_capture: Optional[int] = None, |
| 143 | + max_seq_len_to_capture: Optional[int] = None, |
| 144 | + max_logprobs: int = 20, |
| 145 | + disable_sliding_window: bool = False, |
| 146 | + skip_tokenizer_init: bool = False, |
| 147 | + served_model_name: Optional[Union[str, List[str]]] = None, |
| 148 | + limit_mm_per_prompt: Optional[Mapping[str, int]] = None, |
| 149 | + use_async_output_proc: bool = True, |
| 150 | + override_neuron_config: Optional[Dict[str, Any]] = None) -> None: |
147 | 151 | self.model = model |
148 | 152 | self.tokenizer = tokenizer |
149 | 153 | self.tokenizer_mode = tokenizer_mode |
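A minimal usage sketch of the new argument described in the docstring above (not part of the diff). The model name and the neuron-config key are hypothetical placeholders, and `ModelConfig` is assumed importable from `vllm.config`; which override keys are valid depends on the Neuron backend and is not defined by this change.

```python
# Hypothetical sketch: constructing ModelConfig with the new argument.
# The override dict key below is illustrative only.
from vllm.config import ModelConfig

model_config = ModelConfig(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",      # placeholder model
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # placeholder tokenizer
    tokenizer_mode="auto",
    trust_remote_code=False,
    dtype="float16",
    seed=0,
    override_neuron_config={"quantized": True},      # illustrative key
)

# On non-Neuron hardware, __init__ drops the override (see the next hunk),
# so here model_config.override_neuron_config would be None.
```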
@@ -233,6 +237,9 @@ def __init__( |
233 | 237 | limit_mm_per_prompt) |
234 | 238 | if not self.skip_tokenizer_init: |
235 | 239 | self._verify_tokenizer_mode() |
| 240 | + |
| 241 | + self.override_neuron_config = override_neuron_config if is_neuron( |
| 242 | + ) else None |
236 | 243 | self._verify_embedding_mode() |
237 | 244 | self._verify_quantization() |
238 | 245 | self._verify_cuda_graph() |
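The assignment above keeps the override only when `is_neuron()` reports a Neuron backend. A stubbed sketch of that gating, not part of the diff, with the platform result passed in explicitly so the logic runs anywhere:

```python
from typing import Any, Dict, Optional

def gate_neuron_override(override: Optional[Dict[str, Any]],
                         on_neuron: bool) -> Optional[Dict[str, Any]]:
    # Mirrors: override_neuron_config if is_neuron() else None
    return override if on_neuron else None

assert gate_neuron_override({"quantized": True}, on_neuron=True) is not None
assert gate_neuron_override({"quantized": True}, on_neuron=False) is None
```

Dropping the value to `None` rather than raising keeps a config that sets the flag valid on non-Neuron hardware.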
@@ -289,6 +296,7 @@ def _verify_quantization(self) -> None: |
289 | 296 | "experts_int8", |
290 | 297 | ] |
291 | 298 | tpu_supported_quantization = ["tpu_int8"] |
| 299 | + neuron_supported_quantization = ["neuron_quant"] |
292 | 300 | if self.quantization is not None: |
293 | 301 | self.quantization = self.quantization.lower() |
294 | 302 |
@@ -343,6 +351,11 @@ def _verify_quantization(self) -> None: |
343 | 351 | "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" |
344 | 352 | " is not set, enabling VLLM_USE_TRITON_AWQ.") |
345 | 353 | envs.VLLM_USE_TRITON_AWQ = True |
| 354 | + if is_neuron( |
| 355 | + ) and self.quantization not in neuron_supported_quantization: |
| 356 | + raise ValueError( |
| 357 | + f"{self.quantization} quantization is currently not " |
| 358 | + f"supported in Neuron Backend.") |
346 | 359 |
347 | 360 | def _verify_cuda_graph(self) -> None: |
348 | 361 | if self.max_seq_len_to_capture is None: |
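The new guard in `_verify_quantization` isolated as a runnable sketch (not part of the diff; the platform check is again passed in explicitly so the error path is visible without Neuron hardware):

```python
# Isolated sketch of the Neuron quantization guard added above: on a
# Neuron backend, only "neuron_quant" is accepted.
neuron_supported_quantization = ["neuron_quant"]

def verify_neuron_quantization(quantization: str, on_neuron: bool) -> None:
    if on_neuron and quantization not in neuron_supported_quantization:
        raise ValueError(f"{quantization} quantization is currently not "
                         f"supported in Neuron Backend.")

verify_neuron_quantization("neuron_quant", on_neuron=True)  # passes
try:
    verify_neuron_quantization("awq", on_neuron=True)
except ValueError as e:
    print(e)  # awq quantization is currently not supported in Neuron Backend.
```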