Skip to content

Commit 97ad8aa

Browse files
authored
feat(tarko): add metadata field to EnvironmentInputEvent (#1272)
1 parent 23d73a5 commit 97ad8aa

File tree

2 files changed

+108
-1
lines changed

2 files changed

+108
-1
lines changed

multimodal/agent-tars/core/src/browser/browser-gui-agent.ts

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,9 @@ wait() - Wait 5 seconds and take a scree
245245
const event = this.eventStream.createEvent('environment_input', {
246246
content: markdown,
247247
description: 'Page Content After Browser Action',
248+
metadata: {
249+
type: 'text',
250+
},
248251
});
249252

250253
// Send the event
@@ -339,6 +342,9 @@ wait() - Wait 5 seconds and take a scree
339342
},
340343
],
341344
description: 'Browser Screenshot',
345+
metadata: {
346+
type: 'screenshot',
347+
},
342348
});
343349

344350
return eventStream.sendEvent(event);
@@ -397,6 +403,10 @@ wait() - Wait 5 seconds and take a scree
397403
},
398404
],
399405
description: 'Browser Screenshot',
406+
metadata: {
407+
type: 'screenshot',
408+
devicePixelRatio: await this.getDevicePixelRatio(),
409+
},
400410
});
401411

402412
eventStream.sendEvent(event);
@@ -593,6 +603,20 @@ wait() - Wait 5 seconds and take a scree
593603
}
594604
}
595605

606+
/**
607+
* Get the device pixel ratio from the browser page.
*
* Obtains the page via `getPage()` and evaluates `window.devicePixelRatio`
* inside it. A falsy result is replaced by 1. If obtaining the page or
* evaluating the expression throws, a warning is logged and 1 is returned,
* so this method itself does not reject.
*
* @returns The page's device pixel ratio, or 1 as the fallback.
608+
*/
609+
private async getDevicePixelRatio(): Promise<number> {
610+
try {
611+
const page = await this.getPage();
612+
const devicePixelRatio = await page.evaluate(() => window.devicePixelRatio);
613+
return devicePixelRatio || 1;
614+
} catch (error) {
615+
this.logger.warn('Failed to get device pixel ratio, defaulting to 1:', error);
616+
return 1;
617+
}
618+
}
619+
596620
/**
597621
* Get access to the underlying Puppeteer page
598622
* This allows custom browser tools to be implemented

multimodal/tarko/agent-interface/src/agent-event-stream.ts

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,53 @@ export namespace AgentEventStream {
339339
status: AgentStatus;
340340
}
341341

342+
/**
343+
* Base metadata interface for environment input events.
*
* Declares the single `type` tag shared by every metadata shape. The
* screenshot, text and codebase interfaces extend this and narrow `type`
* to a string literal; the generic one keeps it as `string`.
344+
*/
345+
export interface BaseEnvironmentInputMetadata {
346+
/** Type of environment input; acts as the tag that identifies the metadata shape */
347+
type: string;
348+
}
349+
350+
/**
351+
* Screenshot-specific metadata.
*
* Identified by `type: 'screenshot'`. The only additional field,
* `devicePixelRatio`, is optional, so consumers must handle its absence.
352+
*/
353+
export interface ScreenshotMetadata extends BaseEnvironmentInputMetadata {
354+
type: 'screenshot';
355+
/** Device pixel ratio for the screenshot; may be omitted by the producer */
356+
devicePixelRatio?: number;
357+
}
358+
359+
/**
360+
* Text content metadata.
*
* Identified by `type: 'text'`; carries no fields beyond the tag.
361+
*/
362+
export interface TextMetadata extends BaseEnvironmentInputMetadata {
363+
type: 'text';
364+
}
365+
366+
/**
367+
* Codebase metadata.
*
* Identified by `type: 'codebase'`; carries no fields beyond the tag.
368+
*/
369+
export interface CodebaseMetadata extends BaseEnvironmentInputMetadata {
370+
type: 'codebase';
371+
}
372+
373+
/**
374+
* Generic metadata for other types.
*
* `type` is left as an arbitrary `string`, so this shape also accepts the
* literal tags used by the specific metadata interfaces ('screenshot',
* 'text', 'codebase'). It adds no fields beyond the tag.
375+
*/
376+
export interface GenericMetadata extends BaseEnvironmentInputMetadata {
377+
type: string;
378+
}
379+
380+
/**
381+
* Union type for all environment input metadata types.
*
* NOTE(review): because `GenericMetadata.type` is a plain `string`, an
* inline comparison such as `metadata.type === 'screenshot'` does not
* exclude `GenericMetadata` from the narrowed type. Use the
* `isScreenshotMetadata` type guard when `ScreenshotMetadata` fields
* need to be accessed.
382+
*/
383+
export type EnvironmentInputMetadata =
384+
| ScreenshotMetadata
385+
| TextMetadata
386+
| CodebaseMetadata
387+
| GenericMetadata;
388+
342389
/**
343390
* Environment input event - for injecting contextual information
344391
*
@@ -351,8 +398,44 @@ export namespace AgentEventStream {
351398
/** The environment content (can be multimodal) */
352399
content: string | ChatCompletionContentPart[];
353400

354-
/** Optional description of the environment input */
401+
/**
402+
* Optional description of the environment input.
403+
* Description is used in Message History for constructing context,
404+
* while metadata is not included in the context.
405+
*/
355406
description?: string;
407+
408+
/**
409+
* Optional metadata for the environment input.
410+
* Metadata provides structured information about the input type and properties
411+
* but is NOT included in Message History context construction.
412+
*/
413+
metadata?: EnvironmentInputMetadata;
414+
}
415+
416+
/**
417+
* Type guard function to check if metadata is screenshot metadata.
*
* The decision is made solely on the `type` tag being exactly
* 'screenshot'; no other field (e.g. `devicePixelRatio`) is inspected.
*
* @param metadata - Metadata attached to an environment input event.
* @returns `true` when `metadata.type === 'screenshot'`.
418+
*/
419+
export function isScreenshotMetadata(
420+
metadata: EnvironmentInputMetadata,
421+
): metadata is ScreenshotMetadata {
422+
return metadata.type === 'screenshot';
423+
}
424+
425+
/**
426+
* Type guard function to check if an event is an environment input event.
*
* The check compares only the event's `type` field against
* 'environment_input'; the payload is not validated.
*
* @param event - Any event from the stream.
* @returns `true` when `event.type === 'environment_input'`.
427+
*/
428+
export function isEnvironmentInputEvent(event: Event): event is EnvironmentInputEvent {
429+
return event.type === 'environment_input';
430+
}
431+
432+
/**
433+
* Type guard function to check if an environment input event has screenshot metadata.
*
* Returns `true` only when `event.metadata` is defined and its `type` tag
* is 'screenshot' (delegating to `isScreenshotMetadata`). On success the
* event's `metadata` is narrowed to a required `ScreenshotMetadata`.
*
* NOTE(review): only `undefined` is excluded. A `null` metadata (for
* example from deserialized JSON) would be passed on and throw when
* `.type` is read; confirm whether `null` can occur here.
*
* @param event - An environment input event.
* @returns `true` when the event carries screenshot metadata.
434+
*/
435+
export function hasScreenshotMetadata(
436+
event: EnvironmentInputEvent,
437+
): event is EnvironmentInputEvent & { metadata: ScreenshotMetadata } {
438+
return event.metadata !== undefined && isScreenshotMetadata(event.metadata);
356439
}
357440

358441
/**

0 commit comments

Comments
 (0)