vllm/envs.py (10 changes: 6 additions & 4 deletions)
@@ -776,7 +776,7 @@ def get_vllm_port() -> int | None:
     # If set, the OpenAI API server will stay alive even after the underlying
     # AsyncLLMEngine errors and stops serving requests
     "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool(
-        os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)
+        int(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", "0"))
     ),
     # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows
     # the user to specify a max sequence length greater than
@@ -1313,7 +1313,9 @@ def get_vllm_port() -> int | None:
     ),
     # If set, it means we pre-downloaded cubin files and flashinfer will
     # read the cubin files directly.
-    "VLLM_HAS_FLASHINFER_CUBIN": lambda: os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False),
+    "VLLM_HAS_FLASHINFER_CUBIN": lambda: bool(
+        int(os.getenv("VLLM_HAS_FLASHINFER_CUBIN", "0"))
+    ),
     # Supported options:
     # - "flashinfer-cudnn": use flashinfer cudnn GEMM backend
     # - "flashinfer-trtllm": use flashinfer trtllm GEMM backend
@@ -1449,8 +1451,8 @@ def get_vllm_port() -> int | None:
     # top 5 collected objects
     "VLLM_GC_DEBUG": lambda: os.getenv("VLLM_GC_DEBUG", ""),
     # Disables parallel execution of shared_experts via separate cuda stream
-    "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: os.getenv(
-        "VLLM_DISABLE_SHARED_EXPERTS_STREAM", False
+    "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: bool(
+        int(os.getenv("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "0"))
     ),
     # Format for saving torch.compile cache artifacts
     # - "binary": saves as binary file
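All three changes follow the same pattern: the old expressions treated the raw `os.getenv` result as a boolean, but environment variables are always strings, so an explicit `"0"` still evaluates as truthy (and `VLLM_HAS_FLASHINFER_CUBIN` previously returned the raw string rather than a bool at all). The new form parses the value as an integer before converting to `bool`. A minimal standalone sketch of the difference, using a hypothetical variable name rather than any of the real vLLM settings:

```python
import os

# Hypothetical variable name, used only to illustrate the parsing difference.
NAME = "VLLM_EXAMPLE_FLAG"
os.environ[NAME] = "0"  # user explicitly disables the flag

# Old pattern: os.getenv returns the string "0", and bool("0") is True,
# so the flag is reported as enabled even though the user set it to 0.
old_value = bool(os.getenv(NAME, 0))

# New pattern: parse the string as an integer first, then convert to bool,
# so "0" correctly maps to False and "1" to True.
new_value = bool(int(os.getenv(NAME, "0")))

print(old_value)  # True  (surprising)
print(new_value)  # False (expected)
```

One side effect of the stricter parsing: non-numeric values such as `"true"` now raise `ValueError` in `int()` instead of silently evaluating as truthy.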