From 0727384606ba212a838e479895506d5a2f15c704 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Tue, 25 Mar 2025 21:13:56 -0700 Subject: [PATCH 01/14] integrate UI-TARS vLLM in lvm component --- .../deployment/docker_compose/compose.yaml | 7 +- comps/lvms/src/README.md | 12 +++- comps/lvms/src/integrations/vllm.py | 64 +++++++++++++++++++ 3 files changed, 79 insertions(+), 4 deletions(-) diff --git a/comps/lvms/deployment/docker_compose/compose.yaml b/comps/lvms/deployment/docker_compose/compose.yaml index 01a2b90ce6..808f159ab9 100644 --- a/comps/lvms/deployment/docker_compose/compose.yaml +++ b/comps/lvms/deployment/docker_compose/compose.yaml @@ -24,7 +24,7 @@ services: ports: - ${VLLM_PORT:-9699}:80 volumes: - - "./data:/data" + - "./data:/root/.cache/huggingface/hub/" shm_size: 128g environment: no_proxy: ${no_proxy} @@ -45,7 +45,7 @@ services: ports: - ${VLLM_PORT:-9699}:80 volumes: - - "./data:/data" + - "./data:/root/.cache/huggingface/hub/" shm_size: 128g environment: no_proxy: ${no_proxy} @@ -60,6 +60,7 @@ services: MAX_MODEL_LEN: ${MAX_TOTAL_TOKENS:-4096} MAX_SEQ_LEN_TO_CAPTURE: ${MAX_TOTAL_TOKENS:-4096} PT_HPUGRAPH_DISABLE_TENSOR_CACHE: false # https://github.com/HabanaAI/vllm-fork/issues/841#issuecomment-2700421704 + PT_HPU_ENABLE_LAZY_COLLECTIVES: true # for tensor parallel inference with hpu graphs runtime: habana cap_add: - SYS_NICE @@ -69,7 +70,7 @@ services: interval: 10s timeout: 10s retries: 150 - command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --chat-template examples/template_llava.jinja # https://docs.vllm.ai/en/v0.5.0/models/vlm.html + command: --model $LLM_MODEL_ID --tensor-parallel-size ${TP_SIZE:-1} --host 0.0.0.0 --port 80 --enable-auto-tool-choice --tool-call-parser hermes --chat-template ${CHAT_TEMPLATE:-examples/template_llava.jinja} # https://docs.vllm.ai/en/v0.5.0/models/vlm.html llava-tgi-service: image: ghcr.io/huggingface/tgi-gaudi:2.3.1 container_name: llava-tgi-service diff --git a/comps/lvms/src/README.md b/comps/lvms/src/README.md index 4c8281065b..49d71d1a5b 100644 --- a/comps/lvms/src/README.md +++ b/comps/lvms/src/README.md @@ -83,8 +83,18 @@ export ip_address=$(hostname -I | awk '{print $1}') export LVM_PORT=9399 export VLLM_PORT=11507 export LVM_ENDPOINT=http://$ip_address:$VLLM_PORT + +# llava (option 1) export LLM_MODEL_ID=llava-hf/llava-1.5-7b-hf -docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up vllm-service lvm-vllm -d +export CHAT_TEMPLATE=examples/template_llava.jinja +# UI-TARS (option 2) +export LLM_MODEL_ID=bytedance-research/UI-TARS-7B-DPO +export TP_SIZE=1 # change to 4 or 8 if using UI-TARS-72B-DPO +export CHAT_TEMPLATE=None + +export VLLM_SKIP_WARMUP=true # skip the warmup-phase will start the vLLM server quickly on Gaudi, but increase runtime inference time when meeting unseen HPU shape + +docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up vllm-gaudi-service lvm-vllm -d ``` ## Test diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index 5764299e76..7e35d23311 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -45,6 +45,65 @@ def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_im return template.format(context=context, question=question) + @staticmethod + def generate_ui_tars_prompt_for_computer(prompt: str): + template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. + + ## Output Format + ```\nThought: ... + Action: ...\n``` + + ## Action Space + + click(start_box='<|box_start|>(x1,y1)<|box_end|>') + left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') + right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') + drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') + hotkey(key='') + type(content='') #If you want to submit your input, use \"\ + \" at the end of `content`. + scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') + wait() #Sleep for 5s and take a screenshot to check for any changes. + finished() + call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help. + + + ## Note + - Use Chinese in `Thought` part. + - Summarize your next action (with its target element) in one sentence in `Thought` part. + + ## User Instruction + """ + + return template + prompt + + @staticmethod + def generate_ui_tars_prompt_for_mobile(prompt: str): + template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. + + ## Output Format + ```\nThought: ... + Action: ...\n``` + + ## Action Space + click(start_box='<|box_start|>(x1,y1)<|box_end|>') + long_press(start_box='<|box_start|>(x1,y1)<|box_end|>', time='') + type(content='') + scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') + press_home() + press_back() + finished(content='') # Submit the task regardless of whether it succeeds or fails. + + ## Note + - Use English in `Thought` part. + + - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part. + + ## User Instruction + """ + + return template + prompt + @OpeaComponentRegistry.register("OPEA_VLLM_LVM") class OpeaVllmLvm(OpeaComponent): @@ -116,6 +175,11 @@ async def invoke( # top_k = request.top_k top_p = request.top_p + # update ui_tars prompt + if "UI-TARS" in LLM_MODEL_ID: + # TODO validate mobile flow https://github.com/bytedance/UI-TARS + prompt = ChatTemplate.generate_ui_tars_prompt_for_computer(prompt=prompt) + if not img_b64_str: # If img_b64_str was an empty string, which means we have just have a text prompt. # Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image. From dbc1ee8e8faa325eaf5fec9ba5309a16712ffb6d Mon Sep 17 00:00:00 2001 From: Spycsh Date: Tue, 25 Mar 2025 21:14:59 -0700 Subject: [PATCH 02/14] fix name --- comps/lvms/src/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/lvms/src/README.md b/comps/lvms/src/README.md index 49d71d1a5b..9ee0756aab 100644 --- a/comps/lvms/src/README.md +++ b/comps/lvms/src/README.md @@ -94,7 +94,7 @@ export CHAT_TEMPLATE=None export VLLM_SKIP_WARMUP=true # skip the warmup-phase will start the vLLM server quickly on Gaudi, but increase runtime inference time when meeting unseen HPU shape -docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up vllm-gaudi-service lvm-vllm -d +docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up vllm-gaudi-service lvm-vllm-gaudi -d ``` ## Test From 9c5d816acaea3ea083f7adfdf7bd8df6cdf3dc26 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Tue, 25 Mar 2025 22:16:41 -0700 Subject: [PATCH 03/14] fix acc issue caused by template indent --- comps/lvms/src/integrations/vllm.py | 82 ++++++++++++++--------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index 7e35d23311..c053ecc5dc 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -49,31 +49,31 @@ def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_im def generate_ui_tars_prompt_for_computer(prompt: str): template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. - ## Output Format - ```\nThought: ... - Action: ...\n``` - - ## Action Space - - click(start_box='<|box_start|>(x1,y1)<|box_end|>') - left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') - right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') - drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') - hotkey(key='') - type(content='') #If you want to submit your input, use \"\ - \" at the end of `content`. - scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') - wait() #Sleep for 5s and take a screenshot to check for any changes. - finished() - call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help. - - - ## Note - - Use Chinese in `Thought` part. - - Summarize your next action (with its target element) in one sentence in `Thought` part. - - ## User Instruction - """ +## Output Format +```\nThought: ... +Action: ...\n``` + +## Action Space + +click(start_box='<|box_start|>(x1,y1)<|box_end|>') +left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') +right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') +drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') +hotkey(key='') +type(content='') #If you want to submit your input, use \"\ +\" at the end of `content`. +scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') +wait() #Sleep for 5s and take a screenshot to check for any changes. +finished() +call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help. + + +## Note +- Use Chinese in `Thought` part. +- Summarize your next action (with its target element) in one sentence in `Thought` part. + +## User Instruction +""" return template + prompt @@ -81,26 +81,26 @@ def generate_ui_tars_prompt_for_computer(prompt: str): def generate_ui_tars_prompt_for_mobile(prompt: str): template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. - ## Output Format - ```\nThought: ... - Action: ...\n``` +## Output Format +```\nThought: ... +Action: ...\n``` - ## Action Space - click(start_box='<|box_start|>(x1,y1)<|box_end|>') - long_press(start_box='<|box_start|>(x1,y1)<|box_end|>', time='') - type(content='') - scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') - press_home() - press_back() - finished(content='') # Submit the task regardless of whether it succeeds or fails. +## Action Space +click(start_box='<|box_start|>(x1,y1)<|box_end|>') +long_press(start_box='<|box_start|>(x1,y1)<|box_end|>', time='') +type(content='') +scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') +press_home() +press_back() +finished(content='') # Submit the task regardless of whether it succeeds or fails. - ## Note - - Use English in `Thought` part. +## Note +- Use English in `Thought` part. - - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part. +- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part. - ## User Instruction - """ +## User Instruction +""" return template + prompt From 904e4d7afbe0aa61beb80ef964332943a5f69a41 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Tue, 25 Mar 2025 22:29:04 -0700 Subject: [PATCH 04/14] add space for accuracy issue --- comps/lvms/src/integrations/vllm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index c053ecc5dc..1393b289ca 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -47,7 +47,7 @@ def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_im @staticmethod def generate_ui_tars_prompt_for_computer(prompt: str): - template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. + template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. ## Output Format ```\nThought: ... @@ -79,7 +79,7 @@ def generate_ui_tars_prompt_for_computer(prompt: str): @staticmethod def generate_ui_tars_prompt_for_mobile(prompt: str): - template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. + template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. ## Output Format ```\nThought: ... From c18191bdfb4a1e71df3b3dcbac05bb1147003392 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Mar 2025 05:29:57 +0000 Subject: [PATCH 05/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/lvms/src/integrations/vllm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index 1393b289ca..c053ecc5dc 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -47,7 +47,7 @@ def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_im @staticmethod def generate_ui_tars_prompt_for_computer(prompt: str): - template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. + template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. ## Output Format ```\nThought: ... @@ -79,7 +79,7 @@ def generate_ui_tars_prompt_for_computer(prompt: str): @staticmethod def generate_ui_tars_prompt_for_mobile(prompt: str): - template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. + template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. ## Output Format ```\nThought: ... From 24b16cbd922eda5c9d49917c055f368123ef94f2 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Tue, 25 Mar 2025 23:02:59 -0700 Subject: [PATCH 06/14] force to use frequency_penalty 1.0 --- comps/lvms/src/integrations/vllm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index 1393b289ca..2cd0466fb7 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -179,6 +179,9 @@ async def invoke( if "UI-TARS" in LLM_MODEL_ID: # TODO validate mobile flow https://github.com/bytedance/UI-TARS prompt = ChatTemplate.generate_ui_tars_prompt_for_computer(prompt=prompt) + frequency_penalty = 1.0 # force to use frequency_penalty 1.0 + else: + frequency_penalty = 0.0 # default if not img_b64_str: # If img_b64_str was an empty string, which means we have just have a text prompt. @@ -211,6 +214,7 @@ def stream_generator(time_start): } ], max_tokens=max_new_tokens, + frequency_penalty=frequency_penalty, temperature=temperature, top_p=top_p, stream=True, From 44957c9fbe103f3e5ec40015bfd90fed1b70f8bf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Mar 2025 06:04:10 +0000 Subject: [PATCH 07/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/lvms/src/integrations/vllm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index bfc16ab842..85e31210e9 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -179,9 +179,9 @@ async def invoke( if "UI-TARS" in LLM_MODEL_ID: # TODO validate mobile flow https://github.com/bytedance/UI-TARS prompt = ChatTemplate.generate_ui_tars_prompt_for_computer(prompt=prompt) - frequency_penalty = 1.0 # force to use frequency_penalty 1.0 + frequency_penalty = 1.0 # force to use frequency_penalty 1.0 else: - frequency_penalty = 0.0 # default + frequency_penalty = 0.0 # default if not img_b64_str: # If img_b64_str was an empty string, which means we have just have a text prompt. From ff829b382db5f0577b5e40328d80e6970aa06904 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Tue, 25 Mar 2025 23:13:02 -0700 Subject: [PATCH 08/14] bypass autofix whitespace --- comps/lvms/src/integrations/vllm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index bfc16ab842..2d0cb7a05e 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -47,7 +47,8 @@ def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_im @staticmethod def generate_ui_tars_prompt_for_computer(prompt: str): - template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. + # fmt: off + template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. ## Output Format ```\nThought: ... @@ -79,7 +80,8 @@ def generate_ui_tars_prompt_for_computer(prompt: str): @staticmethod def generate_ui_tars_prompt_for_mobile(prompt: str): - template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. + # fmt: off + template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. ## Output Format ```\nThought: ... From 3c5a1071be59cc7b3dc9f0bcd932aecc412b22c8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Mar 2025 06:13:38 +0000 Subject: [PATCH 09/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/lvms/src/integrations/vllm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index bf016c6d87..4ceaa9ee5c 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -48,7 +48,7 @@ def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_im @staticmethod def generate_ui_tars_prompt_for_computer(prompt: str): # fmt: off - template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. + template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. ## Output Format ```\nThought: ... @@ -81,7 +81,7 @@ def generate_ui_tars_prompt_for_computer(prompt: str): @staticmethod def generate_ui_tars_prompt_for_mobile(prompt: str): # fmt: off - template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. + template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. ## Output Format ```\nThought: ... From 39833d82e88717a43ac8230ade39421537a86176 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Tue, 25 Mar 2025 23:38:35 -0700 Subject: [PATCH 10/14] concat two str --- comps/lvms/src/integrations/vllm.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index 4ceaa9ee5c..6c2aecac4d 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -47,8 +47,7 @@ def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_im @staticmethod def generate_ui_tars_prompt_for_computer(prompt: str): - # fmt: off - template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. + template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. """ + r""" ## Output Format ```\nThought: ... @@ -80,8 +79,7 @@ def generate_ui_tars_prompt_for_computer(prompt: str): @staticmethod def generate_ui_tars_prompt_for_mobile(prompt: str): - # fmt: off - template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. + template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. """ + """ ## Output Format ```\nThought: ... From 0778038c420ac66bc535b494cff697dccb51be6d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Mar 2025 06:39:03 +0000 Subject: [PATCH 11/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/lvms/src/integrations/vllm.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index 6c2aecac4d..d1a21d5210 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -47,7 +47,9 @@ def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_im @staticmethod def generate_ui_tars_prompt_for_computer(prompt: str): - template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. """ + r""" + template = ( + r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. """ + + r""" ## Output Format ```\nThought: ... @@ -74,12 +76,15 @@ def generate_ui_tars_prompt_for_computer(prompt: str): ## User Instruction """ + ) return template + prompt @staticmethod def generate_ui_tars_prompt_for_mobile(prompt: str): - template = r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. """ + """ + template = ( + r"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. """ + + """ ## Output Format ```\nThought: ... @@ -101,6 +106,7 @@ def generate_ui_tars_prompt_for_mobile(prompt: str): ## User Instruction """ + ) return template + prompt From 3828119bc17541f28122419cf754d512b2493211 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Wed, 26 Mar 2025 00:02:59 -0700 Subject: [PATCH 12/14] fix --- comps/lvms/src/integrations/vllm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index d1a21d5210..78178b2325 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -259,6 +259,7 @@ def stream_generator(time_start): } ], max_tokens=max_new_tokens, + frequency_penalty=frequency_penalty, temperature=temperature, top_p=top_p, ) From 09c834b53d13fc8e54ef37165b110d0e7f15ad15 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Wed, 26 Mar 2025 00:19:32 -0700 Subject: [PATCH 13/14] force param --- comps/lvms/src/integrations/vllm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index 78178b2325..2c7a0f6509 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -186,6 +186,8 @@ async def invoke( # TODO validate mobile flow https://github.com/bytedance/UI-TARS prompt = ChatTemplate.generate_ui_tars_prompt_for_computer(prompt=prompt) frequency_penalty = 1.0 # force to use frequency_penalty 1.0 + temperature=1.0 + top_p=1.0 else: frequency_penalty = 0.0 # default From 505b9844710d13cadff7ef8e4dd8ea8f9fd3e42a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Mar 2025 07:19:59 +0000 Subject: [PATCH 14/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/lvms/src/integrations/vllm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comps/lvms/src/integrations/vllm.py b/comps/lvms/src/integrations/vllm.py index 2c7a0f6509..1042734760 100644 --- a/comps/lvms/src/integrations/vllm.py +++ b/comps/lvms/src/integrations/vllm.py @@ -186,8 +186,8 @@ async def invoke( # TODO validate mobile flow https://github.com/bytedance/UI-TARS prompt = ChatTemplate.generate_ui_tars_prompt_for_computer(prompt=prompt) frequency_penalty = 1.0 # force to use frequency_penalty 1.0 - temperature=1.0 - top_p=1.0 + temperature = 1.0 + top_p = 1.0 else: frequency_penalty = 0.0 # default