diff --git a/comps/lvms/deployment/docker_compose/compose.yaml b/comps/lvms/deployment/docker_compose/compose.yaml index 01a2b90ce6..808f159ab9 100644 --- a/comps/lvms/deployment/docker_compose/compose.yaml +++ b/comps/lvms/deployment/docker_compose/compose.yaml @@ -24,7 +24,7 @@ services: ports: - ${VLLM_PORT:-9699}:80 volumes: - - "./data:/data" + - "./data:/root/.cache/huggingface/hub/" shm_size: 128g environment: no_proxy: ${no_proxy} @@ -45,7 +45,7 @@ services: ports: - ${VLLM_PORT:-9699}:80 volumes: - - "./data:/data" + - "./data:/root/.cache/huggingface/hub/" shm_size: 128g environment: no_proxy: ${no_proxy} @@ -60,6 +60,7 @@ services: MAX_MODEL_LEN: ${MAX_TOTAL_TOKENS:-4096} MAX_SEQ_LEN_TO_CAPTURE: ${MAX_TOTAL_TOKENS:-4096} PT_HPUGRAPH_DISABLE_TENSOR_CACHE: false # https://github.com/HabanaAI/vllm-fork/issues/841#issuecomment-2700421704 + PT_HPU_ENABLE_LAZY_COLLECTIVES: true # for tensor parallel inference with hpu graphs runtime: habana cap_add: - SYS_NICE @@ -69,7 +70,7 @@ services: interval: 10s timeout: 10s retries: 150 - command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --chat-template examples/template_llava.jinja # https://docs.vllm.ai/en/v0.5.0/models/vlm.html + command: --model $LLM_MODEL_ID --tensor-parallel-size ${TP_SIZE:-1} --host 0.0.0.0 --port 80 --enable-auto-tool-choice --tool-call-parser hermes --chat-template ${CHAT_TEMPLATE:-examples/template_llava.jinja} # https://docs.vllm.ai/en/v0.5.0/models/vlm.html llava-tgi-service: image: ghcr.io/huggingface/tgi-gaudi:2.3.1 container_name: llava-tgi-service diff --git a/comps/lvms/src/README.md b/comps/lvms/src/README.md index 4c8281065b..9ee0756aab 100644 --- a/comps/lvms/src/README.md +++ b/comps/lvms/src/README.md @@ -83,8 +83,18 @@ export ip_address=$(hostname -I | awk '{print $1}') export LVM_PORT=9399 export VLLM_PORT=11507 export LVM_ENDPOINT=http://$ip_address:$VLLM_PORT + +# llava (option 1) export LLM_MODEL_ID=llava-hf/llava-1.5-7b-hf 
-docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up vllm-service lvm-vllm -d +export CHAT_TEMPLATE=examples/template_llava.jinja +# UI-TARS (option 2) +export LLM_MODEL_ID=bytedance-research/UI-TARS-7B-DPO +export TP_SIZE=1 # change to 4 or 8 if using UI-TARS-72B-DPO +export CHAT_TEMPLATE=None + +export VLLM_SKIP_WARMUP=true # skipping the warmup phase starts the vLLM server quickly on Gaudi, but increases runtime inference time when encountering unseen HPU shapes + +docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up vllm-gaudi-service lvm-vllm-gaudi -d ``` ## Test diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index 5764299e76..1042734760 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -45,6 +45,71 @@ def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_im return template.format(context=context, question=question) + @staticmethod + def generate_ui_tars_prompt_for_computer(prompt: str): + template = ( + r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. """ + + r""" + +## Output Format +```\nThought: ... +Action: ...\n``` + +## Action Space + +click(start_box='<|box_start|>(x1,y1)<|box_end|>') +left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') +right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') +drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') +hotkey(key='') +type(content='') #If you want to submit your input, use \"\ +\" at the end of `content`. +scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') +wait() #Sleep for 5s and take a screenshot to check for any changes. +finished() +call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help. 
+ + +## Note +- Use Chinese in `Thought` part. +- Summarize your next action (with its target element) in one sentence in `Thought` part. + +## User Instruction +""" + ) + + return template + prompt + + @staticmethod + def generate_ui_tars_prompt_for_mobile(prompt: str): + template = ( + r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. """ + + """ + +## Output Format +```\nThought: ... +Action: ...\n``` + +## Action Space +click(start_box='<|box_start|>(x1,y1)<|box_end|>') +long_press(start_box='<|box_start|>(x1,y1)<|box_end|>', time='') +type(content='') +scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') +press_home() +press_back() +finished(content='') # Submit the task regardless of whether it succeeds or fails. + +## Note +- Use English in `Thought` part. + +- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part. + +## User Instruction +""" + ) + + return template + prompt + @OpeaComponentRegistry.register("OPEA_VLLM_LVM") class OpeaVllmLvm(OpeaComponent): @@ -116,6 +181,16 @@ async def invoke( # top_k = request.top_k top_p = request.top_p + # update ui_tars prompt + if "UI-TARS" in LLM_MODEL_ID: + # TODO validate mobile flow https://github.com/bytedance/UI-TARS + prompt = ChatTemplate.generate_ui_tars_prompt_for_computer(prompt=prompt) + frequency_penalty = 1.0 # force to use frequency_penalty 1.0 + temperature = 1.0 + top_p = 1.0 + else: + frequency_penalty = 0.0 # default + if not img_b64_str: # If img_b64_str was an empty string, which means we have just have a text prompt. # Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image. 
@@ -147,6 +222,7 @@ def stream_generator(time_start): } ], max_tokens=max_new_tokens, + frequency_penalty=frequency_penalty, temperature=temperature, top_p=top_p, stream=True, @@ -185,6 +261,7 @@ def stream_generator(time_start): } ], max_tokens=max_new_tokens, + frequency_penalty=frequency_penalty, temperature=temperature, top_p=top_p, )