Commit 3d134d2 (1 parent: 1f827d4)

Fix the vLLM docker compose issues (#134)

* refine the vLLM docker compose
* update the vllm openai api call
* refine the default network configuration in the docker-compose
* refine the network config of docker compose and launch service
* [pre-commit.ci] auto fixes from pre-commit.com hooks (see https://pre-commit.ci)

Signed-off-by: tianyil1 <tianyi.liu@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

File tree

3 files changed: +7 additions, -4 deletions

comps/llms/text-generation/vllm/docker_compose_llm.yaml
(3 additions, 1 deletion)

@@ -14,9 +14,10 @@ services:
     environment:
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
+      no_proxy: ${no_proxy}
       LLM_MODEL_ID: ${LLM_MODEL_ID}
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    command: cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $LLM_MODEL_ID --port 80
+    command: /bin/sh -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --model $LLM_MODEL_ID --port 80"
   llm:
     image: opea/gen-ai-comps:llm-vllm-server
     container_name: llm-vllm-server
@@ -26,6 +27,7 @@ services:
     environment:
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
+      no_proxy: ${no_proxy}
       vLLM_LLM_ENDPOINT: ${vLLM_LLM_ENDPOINT}
       LLM_MODEL_ID: ${LLM_MODEL_ID}
       HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
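The corrected `command` above wraps the shell pipeline in `/bin/sh -c` (so `cd` and `export` actually run inside a shell) and starts vLLM's OpenAI-compatible API server on port 80. As a minimal sketch of what a client would send to that server, the snippet below builds a `/v1/completions` request body; the model name mirrors the default used elsewhere in this commit, and the actual HTTP call is left as a comment since the host depends on your deployment:

```python
import json

def build_completion_request(model: str, prompt: str, max_tokens: int = 64) -> dict:
    """Build the JSON body for vLLM's OpenAI-compatible /v1/completions endpoint."""
    return {"model": model, "prompt": prompt, "max_tokens": max_tokens}

payload = build_completion_request("meta-llama/Meta-Llama-3-8B-Instruct", "Hello")
# POST this to the service, e.g. with requests (host/port depend on your setup):
#   requests.post("http://localhost:80/v1/completions", json=payload).json()
print(json.dumps(payload))
```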

comps/llms/text-generation/vllm/launch_vllm_service.sh
(1 addition, 1 deletion)

@@ -22,4 +22,4 @@ fi
 volume=$PWD/data

 # Build the Docker run command based on the number of cards
-docker run -it --rm --name="ChatQnA_server" -p $port_number:$port_number --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --port $port_number"
+docker run -it --rm --name="ChatQnA_server" -p $port_number:$port_number --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port $port_number"

comps/llms/text-generation/vllm/llm.py
(3 additions, 2 deletions)

@@ -5,6 +5,7 @@
 from fastapi.responses import StreamingResponse
 from langchain_community.llms import VLLMOpenAI
+from langsmith import traceable

 from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, opea_telemetry, register_microservice

@@ -28,12 +29,12 @@ def post_process_text(text: str):
     host="0.0.0.0",
     port=9000,
 )
-@opea_telemetry
+@traceable(run_type="llm")
 def llm_generate(input: LLMParamsDoc):
     llm_endpoint = os.getenv("vLLM_LLM_ENDPOINT", "http://localhost:8080")
     llm = VLLMOpenAI(
         openai_api_key="EMPTY",
-        endpoint_url=llm_endpoint + "/v1",
+        openai_api_base=llm_endpoint + "/v1",
         max_tokens=input.max_new_tokens,
         model_name=os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct"),
         top_p=input.top_p,
```