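Every hunk in this PR fixes the same bug class: Python implicitly concatenates adjacent string literals, so when a long error or log message is split across several literals, a missing space at the seam glues two words together in the rendered text. A minimal standalone sketch of the before/after behaviour (the values here are invented for illustration; only the string-splitting pattern matches the diff):

    # Adjacent string literals are implicitly concatenated, so the seam
    # between them needs an explicit space. Values below are made up.
    supported_head_sizes = [32, 64, 96, 128, 256]
    head_dim = 48

    # Before: no space at the literal boundary.
    before = (f"Only {supported_head_sizes} are supported for head_dim,"
              f"received {head_dim}.")

    # After: a leading space on the second literal restores the word boundary.
    after = (f"Only {supported_head_sizes} are supported for head_dim,"
             f" received {head_dim}.")

    print(before)  # ... are supported for head_dim,received 48.
    print(after)   # ... are supported for head_dim, received 48.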
2 changes: 1 addition & 1 deletion vllm/attention/backends/flashinfer.py
@@ -438,7 +438,7 @@ def __post_init__(self):
not in supported_head_sizes:
raise ValueError(
f"Only {supported_head_sizes} are supported for head_dim,",
f"received {self.head_dim}.")
f" received {self.head_dim}.")

def begin_forward(self):
if self.num_prefill_tokens > 0:
2 changes: 1 addition & 1 deletion vllm/attention/backends/mla/common.py
@@ -533,7 +533,7 @@ def __post_init__(self):
not in supported_head_sizes:
raise ValueError(
f"Only {supported_head_sizes} are supported for head_dim,",
f"received {self.head_dim}.")
f" received {self.head_dim}.")

@property
def prefill_metadata(self) -> Optional["MLACommonMetadata"]:
4 changes: 2 additions & 2 deletions vllm/attention/backends/rocm_flash_attn.py
@@ -497,7 +497,7 @@ def __init__(
if logits_soft_cap is not None:
raise ValueError(
"ROCm Triton FlashAttention does not support attention"
"logits soft capping."
" logits soft capping."
" please try using the ROCm CK "
"FA backend instead by setting the env var "
"`VLLM_USE_TRITON_FLASH_ATTN=0`")
@@ -528,7 +528,7 @@ def __init__(
if self.use_naive_attn:
if logits_soft_cap is not None:
raise ValueError(
"ROCm Naive FlashAttention does not support"
"ROCm Naive FlashAttention does not support "
"attention logits soft capping.")

self.attn_func = _sdpa_attention
12 changes: 6 additions & 6 deletions vllm/config.py
@@ -906,8 +906,8 @@ def get_num_layers_by_block_type(
layers_block_type_value = getattr(self.hf_config,
"layers_block_type", None)
if layers_block_type_value is None:
raise ValueError("The model is an hybrid without a"
"layers_block_type in the hf_config,"
raise ValueError("The model is an hybrid without a "
"layers_block_type in the hf_config, "
"cannot determine the num of "
f"{block_type.value} layers")

@@ -2494,7 +2494,7 @@ def _get_and_verify_dtype(

if current_platform.is_hpu() and config_dtype == torch.float16:
logger.info(
"For HPU, we cast models to bfloat16 instead of"
"For HPU, we cast models to bfloat16 instead of "
"using float16 by default. Please specify `dtype` if you "
"want to use float16.")
torch_dtype = torch.bfloat16
@@ -2710,7 +2710,7 @@ def __post_init__(self):
backend=self.guided_decoding_backend).backend_name
if backend not in valid_guided_backends:
raise ValueError(f"Invalid guided_decoding_backend '{backend},"
f"must be one of {valid_guided_backends}")
f" must be one of {valid_guided_backends}")


@dataclass
@@ -2986,7 +2986,7 @@ def uuid(self):
def model_post_init(self, __context: Any) -> None:
if not self.enable_reshape and self.enable_fusion:
logger.warning_once(
"Fusion enabled but reshape elimination disabled."
"Fusion enabled but reshape elimination disabled. "
"RMSNorm + quant (fp8) fusion might not work")

pass_config: PassConfig = Field(default_factory=PassConfig)
@@ -3541,7 +3541,7 @@ def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
logger.warning(
"`torch.compile` is turned on, but the model %s"
" does not support it. Please open an issue on GitHub"
"if you want it to be supported.",
" if you want it to be supported.",
vllm_config.model_config.model)
_current_vllm_config = old_vllm_config

4 changes: 2 additions & 2 deletions vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -227,10 +227,10 @@ def __init__(self, so_file: Optional[str] = None):
self.lib = NCCLLibrary.path_to_library_cache[so_file]
except Exception as e:
logger.error(
"Failed to load NCCL library from %s ."
"Failed to load NCCL library from %s. "
"It is expected if you are not running on NVIDIA/AMD GPUs."
"Otherwise, the nccl library might not exist, be corrupted "
"or it does not support the current platform %s."
"or it does not support the current platform %s. "
"If you already have the library, please set the "
"environment variable VLLM_NCCL_SO_PATH"
" to point to the correct nccl library path.", so_file,
2 changes: 1 addition & 1 deletion vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
@@ -137,7 +137,7 @@ def initialize(self, local_hostname: str, metadata_server: str,
if metadata_backend not in supported_backend:
raise ValueError(
"Mooncake Configuration error. `metadata_backend`"
f"should be one of {supported_backend}.")
f" should be one of {supported_backend}.")

self.engine.initializeExt(local_hostname, metadata_server,
protocol, device_name, metadata_backend)
2 changes: 1 addition & 1 deletion vllm/entrypoints/chat_utils.py
@@ -823,7 +823,7 @@ def _parse_chat_message_content_part(
# content is empty, log a warning and skip
if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content:
logger.warning(
"Skipping multimodal part (type: '%s')"
"Skipping multimodal part (type: '%s') "
"with empty / unparsable content.", part_type)
return None

2 changes: 1 addition & 1 deletion vllm/entrypoints/llm.py
@@ -1342,7 +1342,7 @@ def _add_guided_params(
return params

if params.guided_decoding is not None:
raise ValueError("Cannot set both guided_options_request and"
raise ValueError("Cannot set both guided_options_request and "
"params.guided_decoding.")

params.guided_decoding = GuidedDecodingParams(
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/api_server.py
@@ -575,7 +575,7 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
async def do_rerank_v1(request: RerankRequest, raw_request: Request):
logger.warning_once(
"To indicate that the rerank API is not part of the standard OpenAI"
" API, we have located it at `/rerank`. Please update your client"
" API, we have located it at `/rerank`. Please update your client "
"accordingly. (Note: Conforms to JinaAI rerank API)")

return await do_rerank(request, raw_request)
2 changes: 1 addition & 1 deletion vllm/executor/ray_distributed_executor.py
@@ -513,7 +513,7 @@ def _check_ray_adag_installation(self):
if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL:
raise ValueError(
"cupy is not installed but required since "
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set."
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set. "
"Run `pip install ray[adag]` and check cupy installation.")

def _compiled_ray_dag(self, enable_asyncio: bool):
2 changes: 1 addition & 1 deletion vllm/executor/ray_utils.py
@@ -317,7 +317,7 @@ def initialize_ray_cluster(
if parallel_config.world_size > device_bundles:
raise ValueError(
f"The number of required {device_str}s exceeds the total "
f"number of available {device_str}s in the placement group."
f"number of available {device_str}s in the placement group. "
f"Required number of devices: {parallel_config.world_size}. "
f"Total number of devices: {device_bundles}.")
else:
2 changes: 1 addition & 1 deletion vllm/lora/models.py
@@ -437,7 +437,7 @@ def _add_adapter(self, lora: LoRAModel):
def pin_adapter(self, lora_id: int) -> bool:
"""Pin a LoRAModel in the manager cache."""
raise NotImplementedError(
"Pinning is not supported in LoRAModelManager."
"Pinning is not supported in LoRAModelManager. "
"Use LRUCacheLoRAModelManager for pinning") # type: ignore

def _set_adapter_mapping(self, mapping: LoRAMapping) -> None:
@@ -71,7 +71,7 @@ def __init__(
if not (self.weight_quant.strategy == QuantizationStrategy.TENSOR
and self.input_quant.strategy == QuantizationStrategy.TENSOR):
raise ValueError(
"For FP8 Fused MoE layers, only per-tensor scales"
"For FP8 Fused MoE layers, only per-tensor scales "
"for weights and activations are supported. Found "
f"{self.weight_quant}, {self.input_quant}")

2 changes: 1 addition & 1 deletion vllm/model_executor/layers/quantization/gptq.py
@@ -74,7 +74,7 @@ def __init__(
def __repr__(self) -> str:
return (f"GPTQConfig(weight_bits={self.weight_bits}, "
f"group_size={self.group_size}, "
f"desc_act={self.desc_act}),"
f"desc_act={self.desc_act}), "
f"lm_head_quantized={self.lm_head_quantized}), "
f"dynamic={self.dynamic}")

2 changes: 1 addition & 1 deletion vllm/model_executor/layers/quantization/modelopt.py
@@ -56,7 +56,7 @@ def from_config(cls, config: Dict[str, Any]) -> "ModelOptFp8Config":
quant_method = quant_config["quant_algo"]
is_checkpoint_fp8_serialized = ("FP8" in quant_method)
if not is_checkpoint_fp8_serialized:
raise ValueError("ModelOpt currently only supports static FP8"
raise ValueError("ModelOpt currently only supports static FP8 "
"quantization in vLLM. Please check the "
"`hf_quant_config.json` file for your model's "
"quant configuration.")
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/quantization/neuron_quant.py
@@ -25,8 +25,8 @@ def __init__(
if self.quant_dtype not in SUPPORTED_QUANT_DTYPE_LIST:
raise ValueError(
f"Neuron quantization datatype {self.quant_dtype} is not valid,"
f"the quantization datatype should match one of the below types"
f"{SUPPORTED_QUANT_DTYPE_LIST}")
f" the quantization datatype should match one of the below "
f"types {SUPPORTED_QUANT_DTYPE_LIST}")
self.dequant_dtype = dequant_dtype
self.quantize_method = quantize_method

@@ -55,7 +55,7 @@ def __init__(self, weight_config: Dict[str, Any], input_config: Dict[str,
if not (weight_qscheme == "per_tensor"
and input_qscheme == "per_tensor"):
raise ValueError(
"For FP8 Fused MoE layers, only per-tensor scales"
"For FP8 Fused MoE layers, only per-tensor scales "
"for weights and activations are supported. Found "
f"{weight_qscheme}, {input_qscheme}") # noqa E501

@@ -118,7 +118,7 @@ def verify_marlin_supports_shape(output_size_per_partition: int,
and input_size_per_partition % group_size != 0):
raise ValueError(
f"Weight input_size_per_partition = {input_size_per_partition}"
f" is not divisible by group_size = {group_size}."
f" is not divisible by group_size = {group_size}. "
"Consider reducing tensor_parallel_size or running "
"with --quantization gptq.")

2 changes: 1 addition & 1 deletion vllm/model_executor/model_loader/loader.py
@@ -1087,7 +1087,7 @@ def _load_weights(self, model_config: ModelConfig,
self.model_type = type(model).__name__

logger.info("Loading weights with BitsAndBytes quantization. "
" May take a while ...")
"May take a while ...")

quant_config = getattr(model_config.hf_config, "quantization_config",
None)
2 changes: 1 addition & 1 deletion vllm/model_executor/models/deepseek_vl2.py
@@ -563,7 +563,7 @@ def _process_image_input(
# 3D tensor
return list(torch.unbind(image_data, dim=0))
raise ValueError(
"We expect batched 2D tensors;"
"We expect batched 2D tensors; "
"this can be either a list of 2D tensors or a single 3D tensor."
)

2 changes: 1 addition & 1 deletion vllm/model_executor/models/fuyu.py
@@ -291,7 +291,7 @@ def _validate_shape(d: torch.Tensor):
expected_expr = str(expected_dims)
raise ValueError(
"The expected shape of pixel values per image per batch "
f" per patch is {expected_expr}. "
f"per patch is {expected_expr}. "
f"You supplied {tuple(d.shape)}.")

for d in data:
8 changes: 4 additions & 4 deletions vllm/model_executor/models/gritlm.py
@@ -90,8 +90,8 @@ def _get_instruction_len(self, prompt_token_ids: array) -> int:

# Return no instruction in case of missing BOS token.
if prompt_token_ids[0] != self.token_ids["<s>"]:
logger.warning("BOS token not found in prompt,"
"thus using empty string for instruction."
logger.warning("BOS token not found in prompt, "
"thus using empty string for instruction. "
"GritLM requires BOS token in prompt.")
return instruction_len

@@ -111,8 +111,8 @@ def _get_instruction_len(self, prompt_token_ids: array) -> int:
if found_embed_pattern_idx != -1:
instruction_len = found_embed_pattern_idx + len(embed_pattern_ids)
else:
logger.warning("Query instruction not found in prompt,"
"thus using BOS token as instruction instead."
logger.warning("Query instruction not found in prompt, "
"thus using BOS token as instruction instead. "
"GritLM requires query instruction in prompt.")
instruction_len = 1

2 changes: 1 addition & 1 deletion vllm/model_executor/models/minicpmv.py
@@ -673,7 +673,7 @@ def check_mm_inputs(self, inputs: Dict[str, object],
for modality, count in counts.items():
if modality not in inputs or not inputs[modality]:
raise ValueError(f"None input data of {modality}."
"But prompt requires.")
" But prompt requires.")
counter_key = self.get_modality_num_counter(modality)
if len(inputs[modality][counter_key]) != count:
raise ValueError(f"The prompt requires {count} "
2 changes: 1 addition & 1 deletion vllm/model_executor/models/phi3v.py
@@ -640,7 +640,7 @@ def _process_image_input(
# 3D tensor
return list(torch.unbind(image_data, dim=0))
raise ValueError(
"We expect batched 2D tensors;"
"We expect batched 2D tensors; "
"this can be either a list of 2D tensors or a single 3D tensor."
)

4 changes: 2 additions & 2 deletions vllm/model_executor/models/prithvi_geospatial_mae.py
@@ -154,8 +154,8 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
vllm_config.model_config.hf_config.to_dict()["pretrained_cfg"])
if self.model is None:
raise ValueError(
"Unsupported task."
"Only SemanticSegmentationTask is supported for now"
"Unsupported task. "
"Only SemanticSegmentationTask is supported for now "
"by PrithviGeospatialMAE.")

def _parse_and_validate_multimodal_data(
2 changes: 1 addition & 1 deletion vllm/multimodal/profiling.py
@@ -160,7 +160,7 @@ def get_dummy_data(

if mm_counts.keys() != mm_max_tokens_per_item.keys():
raise AssertionError(
"The keys returned by `get_supported_mm_limits`"
"The keys returned by `get_supported_mm_limits` "
f"({set(mm_counts.keys())}) should be the same as those "
"returned by `get_mm_max_tokens_per_item` "
f"({set(mm_max_tokens_per_item.keys())})")
2 changes: 1 addition & 1 deletion vllm/platforms/cuda.py
@@ -190,7 +190,7 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
"Cannot use FlashAttention-2 backend for FP8 KV cache.")
logger.warning(
"Please use FlashInfer backend with FP8 KV Cache for "
"better performance by setting environment variable "
"better performance by setting environment variable "
"VLLM_ATTENTION_BACKEND=FLASHINFER")
target_backend = _Backend.XFORMERS
elif block_size % 16 != 0:
2 changes: 1 addition & 1 deletion vllm/platforms/openvino.py
@@ -97,7 +97,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:

if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
if not OpenVinoPlatform.is_openvino_cpu():
logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is"
logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is "
"ignored for GPU, f16 data type will be used.")
cache_config.cache_dtype = ov.Type.f16
else:
2 changes: 1 addition & 1 deletion vllm/platforms/xpu.py
@@ -73,7 +73,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
logger.warning(
"bfloat16 is only supported on Intel Data Center GPU, "
"Intel Arc GPU is not supported yet. Your device is %s,"
"which is not supported. will fallback to float16",
" which is not supported. will fallback to float16",
cls.get_device_name())
model_config.dtype = torch.float16
if not model_config.enforce_eager:
2 changes: 1 addition & 1 deletion vllm/prompt_adapter/models.py
@@ -226,7 +226,7 @@ def register_module(self, module_name: str, module: nn.Module):
def pin_adapter(self, prompt_adapter_id: int) -> bool:
"""Pin a PromptAdapterModel in the manager cache."""
raise NotImplementedError(
"Pinning is not supported in PromptAdapterModelManager."
"Pinning is not supported in PromptAdapterModelManager. "
"Use LRUCachePromptAdapterModelManager for pinning"
) # type: ignore

2 changes: 1 addition & 1 deletion vllm/spec_decode/draft_model_runner.py
@@ -16,7 +16,7 @@
ROCmFlashAttentionMetadata as FlashAttentionMetadata)
except (ModuleNotFoundError, ImportError) as err:
raise RuntimeError(
"Draft model speculative decoding currently only supports"
"Draft model speculative decoding currently only supports "
"CUDA and ROCm flash attention backend.") from err

from vllm.logger import init_logger
8 changes: 4 additions & 4 deletions vllm/transformers_utils/configs/jais.py
@@ -212,26 +212,26 @@ def _alibi_scaling_validation(self):
if (not isinstance(self.alibi_scaling, dict)
or len(self.alibi_scaling) != 2):
raise ValueError(
"`alibi_scaling` must be a dictionary with two fields,"
"`alibi_scaling` must be a dictionary with two fields, "
"`type` and `factor` or `type` and `train_seq_len`, "
f"got {self.alibi_scaling}")
alibi_scaling_type = self.alibi_scaling.get("type", None)
alibi_scaling_factor = self.alibi_scaling.get("factor", None)
alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
if alibi_scaling_type is None or alibi_scaling_type != "linear":
raise ValueError(f"`alibi_scaling`'s type field must be 'linear',"
raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
f"got {alibi_scaling_type}")
if (alibi_scaling_factor is not None
and not isinstance(alibi_scaling_factor, float)
or (alibi_scaling_factor is not None
and alibi_scaling_factor <= 1.0)):
raise ValueError(
f"`alibi_scaling`'s factor field must be a float > 1.0,"
f"`alibi_scaling`'s factor field must be a float > 1.0, "
f"got {alibi_scaling_factor}")
if (alibi_dynamic_scaling is not None
and not isinstance(alibi_dynamic_scaling, int)
or (alibi_dynamic_scaling is not None
and alibi_dynamic_scaling <= 1)):
raise ValueError(
f"`alibi_scaling`'s `train_seq_len` field must be an"
f"`alibi_scaling`'s `train_seq_len` field must be an "
f"integer > 1, got {alibi_dynamic_scaling}")
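A closing note on prevention, beyond what this PR changes: linters such as flake8-implicit-str-concat or Ruff's ISC rules flag implicit string concatenation as such, but as far as I know they do not check whether the seam between two intentionally concatenated literals carries a space. The following rough heuristic is my own sketch, not part of vLLM or this PR: it tokenizes Python source and flags adjacent string literals whose boundary joins two word characters. On Python 3.12+ f-strings tokenize as FSTRING_* tokens rather than STRING, so it mainly catches plain literals.

    import io
    import tokenize

    def suspicious_concats(source: str):
        """Yield (line, left, right) for implicitly concatenated string
        literals whose seam joins two word characters. Heuristic only."""
        toks = list(tokenize.generate_tokens(io.StringIO(source).readline))
        for i, tok in enumerate(toks):
            if tok.type != tokenize.STRING:
                continue
            # Implicit concatenation may span lines inside parentheses,
            # which shows up as NL (non-logical newline) tokens in between.
            j = i + 1
            while j < len(toks) and toks[j].type == tokenize.NL:
                j += 1
            if j == len(toks) or toks[j].type != tokenize.STRING:
                continue
            left = tok.string.rstrip('"\'')  # drop closing quotes
            right = toks[j].string.lstrip('fFrRbBuU').lstrip('"\'')
            if left and right and left[-1].isalnum() and right[0].isalnum():
                yield tok.start[0], tok.string, toks[j].string

    demo = '''raise ValueError(
        "ROCm Naive FlashAttention does not support"
        "attention logits soft capping.")
    '''
    for line, first, second in suspicious_concats(demo):
        print(f"line {line}: missing space between {first} and {second}")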