Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion multimodal/omni-tars/gui-agent/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
"@gui-agent/operator-browser": "workspace:*",
"@gui-agent/operator-aio": "workspace:*",
"@gui-agent/action-parser": "workspace:*",
"@tarko/shared-media-utils": "workspace:*",
"lodash.isnumber": "3.0.3"
},
"devDependencies": {
Expand All @@ -48,4 +49,4 @@
"openai": "4.93.0",
"@types/lodash.isnumber": "3.0.3"
}
}
}
42 changes: 23 additions & 19 deletions multimodal/omni-tars/gui-agent/src/GuiAgentPlugin.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,10 @@
* SPDX-License-Identifier: Apache-2.0
*/
import { AgentPlugin, COMPUTER_USE_ENVIRONMENT } from '@omni-tars/core';
import {
Tool,
LLMRequestHookPayload,
LLMResponseHookPayload,
AgentEventStream,
ChatCompletionContentPart,
} from '@tarko/agent';
import {
GUIExecuteResult,
convertToGUIResponse,
createGUIErrorResponse,
} from '@tarko/shared-utils';
import { Tool, LLMRequestHookPayload, ChatCompletionContentPart } from '@tarko/agent';
import { createGUIErrorResponse } from '@tarko/shared-utils';
import { Base64ImageParser } from '@agent-infra/media-utils';
import { ImageCompressor, formatBytes } from '@tarko/shared-media-utils';
import { setScreenInfo } from './shared';
import { OperatorManager } from './OperatorManager';
import { BrowserOperator } from '@gui-agent/operator-browser';
Expand Down Expand Up @@ -96,22 +87,35 @@ export class GuiAgentPlugin extends AgentPlugin {

const operator = await this.operatorManager.getInstance();
const output = await operator?.doScreenshot();
if (!output) {
if (!output?.base64) {
this.agent.logger.error('Failed to get screenshot');
return;
}
const base64Tool = new Base64ImageParser(output.base64);
const base64Uri = base64Tool.getDataUri();
if (!base64Uri) {
this.agent.logger.error('Failed to get base64 image uri');
return;
}
const originalBuffer = Buffer.from(output.base64, 'base64');
const originalSize = originalBuffer.byteLength;

// Create image compressor with WebP format and 80% quality
const compressor = new ImageCompressor({
quality: 80,
format: 'webp',
});
const compressedBuffer = await compressor.compressToBuffer(originalBuffer);
const compressedBase64 = `data:image/webp;base64,${compressedBuffer.toString('base64')}`;
const compressedSize = compressedBuffer.byteLength;
const compressionRatio = (((originalSize - compressedSize) / originalSize) * 100).toFixed(2);

this.agent.logger.debug(`compression stat: `, {
originalSize: formatBytes(originalSize),
compressedSize: formatBytes(compressedSize),
compressionRatio: `${compressionRatio}% reduction`,
});

const content: ChatCompletionContentPart[] = [
{
type: 'image_url',
image_url: {
url: base64Uri,
url: compressedBase64,
},
},
];
Expand Down
12 changes: 12 additions & 0 deletions multimodal/omni-tars/omni-agent/tarko.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,16 @@ export default {
},
},
logLevel: LogLevel.DEBUG,
webui: {
logo: 'https://lf3-static.bytednsdoc.com/obj/eden-cn/zyha-aulnh/ljhwZthlaukjlkulzlp/appicon.png',
subtitle: 'Offering seamless integration with a wide range of real-world tools.',
welcomTitle: 'Omni Agent',
welcomePrompts: [
'Search for the latest GUI Agent papers',
'Find information about UI TARS',
'Tell me the top 5 most popular projects on ProductHunt today',
'Please book me the earliest flight from Hangzhou to Shenzhen on 10.1',
'What is Agent TARS',
],
},
} as AgentAppConfig;
7 changes: 3 additions & 4 deletions multimodal/pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading