Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions comps/lvms/deployment/docker_compose/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ services:
ports:
- ${VLLM_PORT:-9699}:80
volumes:
- "./data:/data"
- "./data:/root/.cache/huggingface/hub/"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
Expand All @@ -45,7 +45,7 @@ services:
ports:
- ${VLLM_PORT:-9699}:80
volumes:
- "./data:/data"
- "./data:/root/.cache/huggingface/hub/"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
Expand All @@ -60,6 +60,7 @@ services:
MAX_MODEL_LEN: ${MAX_TOTAL_TOKENS:-4096}
MAX_SEQ_LEN_TO_CAPTURE: ${MAX_TOTAL_TOKENS:-4096}
PT_HPUGRAPH_DISABLE_TENSOR_CACHE: false # https://github.com/HabanaAI/vllm-fork/issues/841#issuecomment-2700421704
PT_HPU_ENABLE_LAZY_COLLECTIVES: true # for tensor parallel inference with hpu graphs
runtime: habana
cap_add:
- SYS_NICE
Expand All @@ -69,7 +70,7 @@ services:
interval: 10s
timeout: 10s
retries: 150
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --chat-template examples/template_llava.jinja # https://docs.vllm.ai/en/v0.5.0/models/vlm.html
command: --model $LLM_MODEL_ID --tensor-parallel-size ${TP_SIZE:-1} --host 0.0.0.0 --port 80 --enable-auto-tool-choice --tool-call-parser hermes --chat-template ${CHAT_TEMPLATE:-examples/template_llava.jinja} # https://docs.vllm.ai/en/v0.5.0/models/vlm.html
llava-tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: llava-tgi-service
Expand Down
12 changes: 11 additions & 1 deletion comps/lvms/src/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,18 @@ export ip_address=$(hostname -I | awk '{print $1}')
export LVM_PORT=9399
export VLLM_PORT=11507
export LVM_ENDPOINT=http://$ip_address:$VLLM_PORT

# llava (option 1)
export LLM_MODEL_ID=llava-hf/llava-1.5-7b-hf
docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up vllm-service lvm-vllm -d
export CHAT_TEMPLATE=examples/template_llava.jinja
# UI-TARS (option 2)
export LLM_MODEL_ID=bytedance-research/UI-TARS-7B-DPO
export TP_SIZE=1 # change to 4 or 8 if using UI-TARS-72B-DPO
export CHAT_TEMPLATE=None

export VLLM_SKIP_WARMUP=true # Skipping the warmup phase makes the vLLM server start faster on Gaudi, but increases inference latency at runtime whenever a previously unseen HPU shape is encountered

docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up vllm-gaudi-service lvm-vllm-gaudi -d
```

## Test
Expand Down
64 changes: 64 additions & 0 deletions comps/lvms/src/integrations/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,65 @@ def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_im

return template.format(context=context, question=question)

@staticmethod
def generate_ui_tars_prompt_for_computer(prompt: str):
template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
```\nThought: ...
Action: ...\n```

## Action Space

click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='')
type(content='') #If you want to submit your input, use \"\
\" at the end of `content`.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished()
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.


## Note
- Use Chinese in `Thought` part.
- Summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
"""

return template + prompt

@staticmethod
def generate_ui_tars_prompt_for_mobile(prompt: str):
template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
```\nThought: ...
Action: ...\n```

## Action Space
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
long_press(start_box='<|box_start|>(x1,y1)<|box_end|>', time='')
type(content='')
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
press_home()
press_back()
finished(content='') # Submit the task regardless of whether it succeeds or fails.

## Note
- Use English in `Thought` part.

- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
"""

return template + prompt


@OpeaComponentRegistry.register("OPEA_VLLM_LVM")
class OpeaVllmLvm(OpeaComponent):
Expand Down Expand Up @@ -116,6 +175,11 @@ async def invoke(
# top_k = request.top_k
top_p = request.top_p

# update ui_tars prompt
if "UI-TARS" in LLM_MODEL_ID:
# TODO validate mobile flow https://github.com/bytedance/UI-TARS
prompt = ChatTemplate.generate_ui_tars_prompt_for_computer(prompt=prompt)

if not img_b64_str:
# If img_b64_str was an empty string, it means we only have a text prompt.
# Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image.
Expand Down
Loading