diff --git a/comps/agent/deployment/kubernetes/gaudi-values.yaml b/comps/agent/deployment/kubernetes/gaudi-values.yaml
index 89457d6bab..53c22a2e62 100644
--- a/comps/agent/deployment/kubernetes/gaudi-values.yaml
+++ b/comps/agent/deployment/kubernetes/gaudi-values.yaml
@@ -19,6 +19,6 @@ vllm:
     OMPI_MCA_btl_vader_single_copy_mechanism: none
     PT_HPU_ENABLE_LAZY_COLLECTIVES: true
     VLLM_SKIP_WARMUP: true
-  extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
+  extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq-len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
 
 llm_endpoint_url: http://{{ .Release.Name }}-vllm
diff --git a/comps/third_parties/vllm/deployment/docker_compose/launch_vllm_service.sh b/comps/third_parties/vllm/deployment/docker_compose/launch_vllm_service.sh
index 83ecd67530..5da9a8e110 100644
--- a/comps/third_parties/vllm/deployment/docker_compose/launch_vllm_service.sh
+++ b/comps/third_parties/vllm/deployment/docker_compose/launch_vllm_service.sh
@@ -38,7 +38,7 @@ volume=$PWD/data
 
 # Build the Docker run command based on hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm-gaudi:latest --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
+    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm-gaudi:latest --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq-len-to-capture $max_seq_len_to_capture
 else
     docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm-cpu:latest --model $model_name --host 0.0.0.0 --port 80
 fi
diff --git a/comps/third_parties/vllm/deployment/kubernetes/gaudi-values.yaml b/comps/third_parties/vllm/deployment/kubernetes/gaudi-values.yaml
index 0266c45b62..078f4b14dc 100644
--- a/comps/third_parties/vllm/deployment/kubernetes/gaudi-values.yaml
+++ b/comps/third_parties/vllm/deployment/kubernetes/gaudi-values.yaml
@@ -8,7 +8,7 @@ image:
 # VLLM_CPU_KVCACHE_SPACE: "40"
   OMPI_MCA_btl_vader_single_copy_mechanism: none
 
-extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
+extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq-len-to-capture","2048"]
 resources:
   limits:
     habana.ai/gaudi: 1