Skip to content

Commit 71e6e1a

Browse files
authored
feat(gui-agent): add image detail calculator for enhanced screenshot processing (#1724)
1 parent 06258c3 commit 71e6e1a

4 files changed

Lines changed: 71 additions & 22 deletions

File tree

multimodal/gui-agent/agent-sdk/src/GUIAgent.ts

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,20 @@ import { GUIAgentToolCallEngine } from './ToolCallEngine';
1313
import { SYSTEM_PROMPT } from './prompts';
1414
import { Base64ImageParser } from '@agent-infra/media-utils';
1515
import { Operator, BaseGUIAgent } from '@gui-agent/shared/base';
16-
import { GUIAgentConfig, NormalizeCoordinates } from '@gui-agent/shared/types';
16+
import {
17+
GUIAgentConfig,
18+
NormalizeCoordinates,
19+
ImageDetailCalculator,
20+
} from '@gui-agent/shared/types';
1721
import {
1822
assembleSystemPrompt,
1923
isSystemPromptTemplate,
20-
defaultNormalizeCoords,
2124
normalizeActionCoords,
2225
sleep,
2326
} from '@gui-agent/shared/utils';
2427
import { GUI_ADAPTED_TOOL_NAME } from './constants';
2528
import { convertToAgentUIAction, createGUIErrorResponse } from './utils';
29+
import { defaultNormalizeCoords, defaultDetailCalculator } from './defaultImpls';
2630

2731
const defaultLogger = new ConsoleLogger('[GUIAgent]', LogLevel.DEBUG);
2832

@@ -31,6 +35,7 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent {
3135

3236
private operator: Operator | undefined;
3337
private normalizeCoordinates: NormalizeCoordinates;
38+
private detailCalculator: ImageDetailCalculator;
3439
private loopIntervalInMs: number;
3540

3641
constructor(config: GUIAgentConfig<T>) {
@@ -40,6 +45,7 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent {
4045
systemPrompt,
4146
customeActionParser,
4247
normalizeCoordinates,
48+
detailCalculator,
4349
maxLoopCount,
4450
loopIntervalInMs = 500,
4551
} = config;
@@ -69,6 +75,8 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent {
6975
});
7076
this.operator = operator;
7177
this.normalizeCoordinates = normalizeCoordinates ?? defaultNormalizeCoords;
78+
// Default detail calculator implementation
79+
this.detailCalculator = detailCalculator ?? defaultDetailCalculator;
7280
this.loopIntervalInMs = loopIntervalInMs;
7381
this.logger = this.logger.spawn('[GUIAgent]');
7482
}
@@ -163,11 +171,17 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent {
163171
return;
164172
}
165173

174+
const { width: imageWidth, height: imageHeight } = base64Tool.getDimensions() || {
175+
width: -1,
176+
height: -1,
177+
};
178+
166179
const content: ChatCompletionContentPart[] = [
167180
{
168181
type: 'image_url',
169182
image_url: {
170183
url: base64Uri,
184+
detail: this.detailCalculator(imageWidth, imageHeight),
171185
},
172186
},
173187
];
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
import { Coordinates, ImageDetailCalculator, NormalizeCoordinates } from '@gui-agent/shared/types';
6+
7+
/**
8+
* Default coordinate normalization function.
9+
* When `rawCoords.raw` is present, divides `raw.x` and `raw.y` by 1000 (simple scaling) and stores the result under `normalized`, keeping every other field of the input; when `raw` is missing, the input is passed through unchanged.
10+
* @param rawCoords - The raw coordinates to normalize
11+
* @returns Object whose `normalized` property holds the resulting coordinates (the untouched input if it has no `raw` field)
12+
*/
13+
export const defaultNormalizeCoords: NormalizeCoordinates = (rawCoords: Coordinates) => {
14+
if (!rawCoords.raw) {
15+
return { normalized: rawCoords };
16+
}
17+
const normalizedCoords = {
18+
...rawCoords,
19+
normalized: {
20+
x: rawCoords.raw.x / 1000,
21+
y: rawCoords.raw.y / 1000,
22+
},
23+
};
24+
return { normalized: normalizedCoords };
25+
};
26+
27+
/**
 * Default implementation for detail calculation based on total pixel count.
 *
 * - up to 1,048,576 px (1024×1024): `'low'`
 * - up to 4,014,080 px (2048×1960): `'high'`
 * - anything larger: `'auto'`
 * - unknown or invalid dimensions (non-finite, zero, or negative): `'auto'`
 *
 * @param width - Image width in pixels
 * @param height - Image height in pixels
 * @returns The detail level to use for the image
 */
export const defaultDetailCalculator: ImageDetailCalculator = (
  width: number,
  height: number,
): 'low' | 'high' | 'auto' => {
  const LOW_DETAIL_THRESHOLD = 1024 * 1024; // 1,048,576 px
  const HIGH_DETAIL_THRESHOLD = 2048 * 1960; // 4,014,080 px

  // Callers may pass sentinel values (e.g. -1 × -1) when the dimensions could not be read.
  // The product of two negatives is a small positive number, which would otherwise be
  // classified as 'low'; fall back to 'auto' instead of guessing.
  if (!Number.isFinite(width) || !Number.isFinite(height) || width <= 0 || height <= 0) {
    return 'auto';
  }

  const pixelCount = width * height;
  if (pixelCount <= LOW_DETAIL_THRESHOLD) {
    return 'low';
  }
  if (pixelCount <= HIGH_DETAIL_THRESHOLD) {
    return 'high';
  }
  // Larger than the high-detail threshold: do not force a level, return 'auto'.
  return 'auto';
};

multimodal/gui-agent/shared/src/types/agents.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,11 @@ export type ExecuteOutput = {
6464
url?: string; // url of the page
6565
} & Record<string, any>;
6666

67+
/**
68+
* Function type for calculating detail level based on image dimensions
69+
*/
70+
export type ImageDetailCalculator = (width: number, height: number) => 'low' | 'high' | 'auto';
71+
6772
export interface ScreenshotOutput extends ExecuteOutput {
6873
/** screenshot base64, `keep screenshot size as physical pixels` */
6974
base64: string;
@@ -113,6 +118,8 @@ export interface GUIAgentConfig<TOperator> extends AgentOptions {
113118
customeActionParser?: CustomActionParser;
114119
/** The function to normalize raw coordinates */
115120
normalizeCoordinates?: NormalizeCoordinates;
121+
/** The function to calculate detail level based on image dimensions */
122+
detailCalculator?: ImageDetailCalculator;
116123
/** Maximum number of turns for Agent to execute, @default 1000 */
117124
maxLoopCount?: number;
118125
/** Time interval between two loop iterations (in milliseconds), @default 0 */

multimodal/gui-agent/shared/src/utils/coordinateNormalizer.ts

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,26 +5,6 @@
55

66
import { BaseAction, Coordinates, NormalizeCoordinates } from '../types';
77

8-
/**
9-
* Default coordinate normalization function
10-
* Normalizes raw coordinates by dividing by 1000 (simple scaling)
11-
* @param rawCoords - The raw coordinates to normalize
12-
* @returns Object containing normalized coordinates
13-
*/
14-
export const defaultNormalizeCoords: NormalizeCoordinates = (rawCoords: Coordinates) => {
15-
if (!rawCoords.raw) {
16-
return { normalized: rawCoords };
17-
}
18-
const normalizedCoords = {
19-
...rawCoords,
20-
normalized: {
21-
x: rawCoords.raw.x / 1000,
22-
y: rawCoords.raw.y / 1000,
23-
},
24-
};
25-
return { normalized: normalizedCoords };
26-
};
27-
288
/**
299
* Normalizes coordinates in a BaseAction object
3010
* Processes point, start, and end coordinate fields if they exist

0 commit comments

Comments
 (0)