Skip to content

Commit 97671ec

Browse files
lvliang-intel and aMahanna
authored and committed
Update vLLM parameter max-seq-len-to-capture (opea-project#1565)
Signed-off-by: lvliang-intel <[email protected]> Signed-off-by: Anthony Mahanna <[email protected]>
1 parent 69e5126 commit 97671ec

3 files changed

Lines changed: 3 additions & 3 deletions

File tree

comps/agent/deployment/kubernetes/gaudi-values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,6 @@ vllm:
1919
OMPI_MCA_btl_vader_single_copy_mechanism: none
2020
PT_HPU_ENABLE_LAZY_COLLECTIVES: true
2121
VLLM_SKIP_WARMUP: true
22-
extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
22+
extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq-len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
2323

2424
llm_endpoint_url: http://{{ .Release.Name }}-vllm

comps/third_parties/vllm/deployment/docker_compose/launch_vllm_service.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ volume=$PWD/data
3838

3939
# Build the Docker run command based on hardware mode
4040
if [ "$hw_mode" = "hpu" ]; then
41-
docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm-gaudi:latest --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
41+
docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm-gaudi:latest --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq-len-to-capture $max_seq_len_to_capture
4242
else
4343
docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm-cpu:latest --model $model_name --host 0.0.0.0 --port 80
4444
fi

comps/third_parties/vllm/deployment/kubernetes/gaudi-values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ image:
88

99
# VLLM_CPU_KVCACHE_SPACE: "40"
1010
OMPI_MCA_btl_vader_single_copy_mechanism: none
11-
extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
11+
extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq-len-to-capture","2048"]
1212
resources:
1313
limits:
1414
habana.ai/gaudi: 1

0 commit comments

Comments (0)