diff --git a/vllm/envs.py b/vllm/envs.py
index 45547416314f..7b64b919db62 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -355,8 +355,9 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     # Enables weights compression during model export via HF Optimum
     # default is False
     "VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
-    lambda: bool(os.getenv("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),
-
+    lambda:
+    (os.environ.get("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", "0").lower() in
+     ("on", "true", "1")),
     # If the env var is set, then all workers will execute as separate
     # processes from the engine, and we use the same mechanism to trigger
     # execution on all workers.
diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py
index fde200d576e2..805f0cfc585e 100644
--- a/vllm/model_executor/model_loader/openvino.py
+++ b/vllm/model_executor/model_loader/openvino.py
@@ -125,7 +125,8 @@ def __init__(
             "as-is, all possible options that may affect model conversion "
             "are ignored.")

-        load_in_8bit = envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS
+        load_in_8bit = (envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS
+                        if export else False)
         pt_model = OVModelForCausalLM.from_pretrained(
             model_config.model,
             export=export,