Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions comps/lvms/deployment/docker_compose/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ services:
ports:
- ${VLLM_PORT:-9699}:80
volumes:
- "./data:/data"
- "./data:/root/.cache/huggingface/hub/"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
Expand All @@ -45,7 +45,7 @@ services:
ports:
- ${VLLM_PORT:-9699}:80
volumes:
- "./data:/data"
- "./data:/root/.cache/huggingface/hub/"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
Expand All @@ -60,6 +60,7 @@ services:
MAX_MODEL_LEN: ${MAX_TOTAL_TOKENS:-4096}
MAX_SEQ_LEN_TO_CAPTURE: ${MAX_TOTAL_TOKENS:-4096}
PT_HPUGRAPH_DISABLE_TENSOR_CACHE: false # https://github.com/HabanaAI/vllm-fork/issues/841#issuecomment-2700421704
PT_HPU_ENABLE_LAZY_COLLECTIVES: true # for tensor parallel inference with hpu graphs
runtime: habana
cap_add:
- SYS_NICE
Expand All @@ -69,7 +70,7 @@ services:
interval: 10s
timeout: 10s
retries: 150
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --chat-template examples/template_llava.jinja # https://docs.vllm.ai/en/v0.5.0/models/vlm.html
command: --model $LLM_MODEL_ID --tensor-parallel-size ${TP_SIZE:-1} --host 0.0.0.0 --port 80 --enable-auto-tool-choice --tool-call-parser hermes --chat-template ${CHAT_TEMPLATE:-examples/template_llava.jinja} # https://docs.vllm.ai/en/v0.5.0/models/vlm.html
llava-tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: llava-tgi-service
Expand Down
12 changes: 11 additions & 1 deletion comps/lvms/src/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,18 @@ export ip_address=$(hostname -I | awk '{print $1}')
export LVM_PORT=9399
export VLLM_PORT=11507
export LVM_ENDPOINT=http://$ip_address:$VLLM_PORT

# llava (option 1)
export LLM_MODEL_ID=llava-hf/llava-1.5-7b-hf
docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up vllm-service lvm-vllm -d
export CHAT_TEMPLATE=examples/template_llava.jinja
# UI-TARS (option 2)
export LLM_MODEL_ID=bytedance-research/UI-TARS-7B-DPO
export TP_SIZE=1 # change to 4 or 8 if using UI-TARS-72B-DPO
export CHAT_TEMPLATE=None

export VLLM_SKIP_WARMUP=true # Skipping the warmup phase makes the vLLM server start faster on Gaudi, but increases inference latency at runtime whenever a previously unseen HPU shape is encountered

docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up vllm-gaudi-service lvm-vllm-gaudi -d
```

## Test
Expand Down
64 changes: 64 additions & 0 deletions comps/lvms/src/integrations/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,65 @@ def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_im

return template.format(context=context, question=question)

@staticmethod
def generate_ui_tars_prompt_for_computer(prompt: str):
template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
```\nThought: ...
Action: ...\n```

## Action Space

click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='')
type(content='') #If you want to submit your input, use \"\
\" at the end of `content`.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished()
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.


## Note
- Use Chinese in `Thought` part.
- Summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
"""

return template + prompt

@staticmethod
def generate_ui_tars_prompt_for_mobile(prompt: str):
template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
```\nThought: ...
Action: ...\n```

## Action Space
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
long_press(start_box='<|box_start|>(x1,y1)<|box_end|>', time='')
type(content='')
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
press_home()
press_back()
finished(content='') # Submit the task regardless of whether it succeeds or fails.

## Note
- Use English in `Thought` part.

- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
"""

return template + prompt


@OpeaComponentRegistry.register("OPEA_VLLM_LVM")
class OpeaVllmLvm(OpeaComponent):
Expand Down Expand Up @@ -116,6 +175,11 @@ async def invoke(
# top_k = request.top_k
top_p = request.top_p

# update ui_tars prompt
if "UI-TARS" in LLM_MODEL_ID:
# TODO validate mobile flow https://github.com/bytedance/UI-TARS
prompt = ChatTemplate.generate_ui_tars_prompt_for_computer(prompt=prompt)

if not img_b64_str:
# If img_b64_str was an empty string, it means we only have a text prompt.
# Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image.
Expand Down
Loading