Skip to content

Commit 2561fcd

Browse files
qnixsynapse and urmauur authored
feat: support multimodal tool results and improve tool message handling (#6816)
* feat: support multimodal tool results and improve tool message handling - Added a temporary `ToolResult` type that mirrors the structure returned by tools (text, image data, URLs, errors). - Implemented `convertToolPartToApiContentPart` to translate each tool output part into the format expected by the OpenAI chat completion API. - Updated `CompletionMessagesBuilder.addToolMessage` to accept a full `ToolResult` instead of a plain string and to: - Detect multimodal content (base64 images, image URLs) and build a structured `content` array. - Properly handle plain‑text results, tool execution errors, and unexpected formats with sensible fallbacks. - Cast the final content to `any` for the `tool` role as required by the API. - Modified `postMessageProcessing` to pass the raw tool result (`result as any`) to `addToolMessage`, avoiding premature extraction of only the first text part. - Refactored several formatting and type‑annotation sections: - Added multiline guard for empty user messages to insert a placeholder. - Split the image URL construction into a clearer multiline object. - Adjusted method signatures and added minor line‑breaks for readability. - Included extensive comments explaining the new logic and edge‑case handling. These changes enable the chat system to handle richer tool outputs (e.g., images, mixed content) and provide more robust error handling. * Satisfy ts linter * Make ts linter happy x2 * chore: update test message creation --------- Co-authored-by: Faisal Amir <[email protected]>
1 parent 28ed5e2 commit 2561fcd

File tree

2 files changed

+90
-8
lines changed

2 files changed

+90
-8
lines changed

web-app/src/lib/completion.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ type ExtendedConfigOptions = ConfigOptions & {
3232
}
3333
import { ulid } from 'ulidx'
3434
import { MCPTool } from '@/types/completion'
35-
import { CompletionMessagesBuilder } from './messages'
35+
import { CompletionMessagesBuilder, ToolResult } from './messages'
3636
import { ChatCompletionMessageToolCall } from 'openai/resources'
3737
import { ExtensionManager } from './extension'
3838
import { useAppState } from '@/hooks/useAppState'
@@ -543,7 +543,7 @@ export const postMessageProcessing = async (
543543
},
544544
],
545545
}
546-
builder.addToolMessage(result.content[0]?.text ?? '', toolCall.id)
546+
builder.addToolMessage(result as ToolResult, toolCall.id)
547547
// update message metadata
548548
}
549549
return message

web-app/src/lib/messages.ts

Lines changed: 88 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
/* eslint-disable @typescript-eslint/no-explicit-any */
12
import { ChatCompletionMessageParam } from 'token.js'
23
import { ChatCompletionMessageToolCall } from 'openai/resources'
34
import { ThreadMessage, ContentType } from '@janhq/core'
@@ -6,6 +7,48 @@ import { removeReasoningContent } from '@/utils/reasoning'
67

78
type ThreadContent = NonNullable<ThreadMessage['content']>[number]
89

10+
/**
 * Temporary description of the result shape returned by tools.
 *
 * Every field of a content part is optional because tools are not validated
 * before their output reaches this module; consumers must check which fields
 * are actually present.
 */
export type ToolResult = {
  content: Array<{
    /** Part kind as reported by the tool (e.g. 'text', 'image'). */
    type?: string
    /** Plain-text payload. */
    text?: string
    /** Base64-encoded binary payload (treated as an image below). */
    data?: string
    /** MIME type of `data`, when the tool reports one (e.g. 'image/jpeg'). */
    mimeType?: string
    /** Pre-formatted image reference. */
    image_url?: { url: string; detail?: string }
  }>
  error?: string
}

/** Content part shapes produced by `convertToolPartToApiContentPart`. */
type ToolApiContentPart =
  | { type: 'text'; text: string }
  | { type: 'image_url'; image_url: { url: string; detail?: string } }

/** MIME type assumed for base64 data when the tool does not report a usable one. */
const DEFAULT_TOOL_IMAGE_MIME_TYPE = 'image/png'

/**
 * Pick the MIME type to embed in the data URL for a base64 part.
 *
 * Preference order:
 *  1. an explicit `mimeType` reported by the tool;
 *  2. `type`, but only when it already looks like a MIME type ("x/y") —
 *     a bare tag such as 'image' or 'resource' is not a valid MIME type and
 *     would produce a malformed data URL;
 *  3. the PNG default.
 */
const resolveToolImageMimeType = (part: ToolResult['content'][0]): string => {
  if (part.mimeType) {
    return part.mimeType
  }
  if (part.type && part.type.includes('/')) {
    return part.type
  }
  return DEFAULT_TOOL_IMAGE_MIME_TYPE
}

/**
 * Convert one part of a tool's output into a chat-completion content part.
 *
 * Checks are made in order: non-empty `text`, then base64 `data` (wrapped in
 * a data URL), then a ready-made `image_url`. Anything else is serialized
 * with `JSON.stringify` so that no tool output is silently dropped.
 *
 * @param part - A single entry of `ToolResult['content']`.
 * @returns A text part or an image_url part.
 */
const convertToolPartToApiContentPart = (
  part: ToolResult['content'][0]
): ToolApiContentPart => {
  if (part.text) {
    return { type: 'text', text: part.text }
  }

  // Handle base64 image data
  if (part.data) {
    const dataUrl = `data:${resolveToolImageMimeType(part)};base64,${part.data}`

    return {
      type: 'image_url',
      image_url: {
        url: dataUrl,
        detail: 'auto',
      },
    }
  }

  // Handle pre-formatted image URL
  if (part.image_url) {
    return { type: 'image_url', image_url: part.image_url }
  }

  // Fallback to text stringification for structured but unhandled data
  return { type: 'text', text: JSON.stringify(part) }
}
51+
952
/**
1053
* @fileoverview Helper functions for creating chat completion request.
1154
* These functions are used to create chat completion request objects
@@ -26,7 +69,11 @@ export class CompletionMessagesBuilder {
2669
.map<ChatCompletionMessageParam>((msg) => {
2770
const param = this.toCompletionParamFromThread(msg)
2871
// In constructor context, normalize empty user text to a placeholder
29-
if (param.role === 'user' && typeof param.content === 'string' && param.content === '') {
72+
if (
73+
param.role === 'user' &&
74+
typeof param.content === 'string' &&
75+
param.content === ''
76+
) {
3077
return { ...param, content: '.' }
3178
}
3279
return param
@@ -35,7 +82,9 @@ export class CompletionMessagesBuilder {
3582
}
3683

3784
// Normalize a ThreadMessage into a ChatCompletionMessageParam for Token.js
38-
private toCompletionParamFromThread(msg: ThreadMessage): ChatCompletionMessageParam {
85+
private toCompletionParamFromThread(
86+
msg: ThreadMessage
87+
): ChatCompletionMessageParam {
3988
if (msg.role === 'assistant') {
4089
return {
4190
role: 'assistant',
@@ -60,7 +109,10 @@ export class CompletionMessagesBuilder {
60109
if (part.type === ContentType.Image) {
61110
return {
62111
type: 'image_url' as const,
63-
image_url: { url: part.image_url?.url || '', detail: part.image_url?.detail || 'auto' },
112+
image_url: {
113+
url: part.image_url?.url || '',
114+
detail: part.image_url?.detail || 'auto',
115+
},
64116
}
65117
}
66118
// Fallback for unknown content types
@@ -110,13 +162,43 @@ export class CompletionMessagesBuilder {
110162

111163
/**
 * Add a tool message to the messages array.
 *
 * The message content is derived from `result` as follows:
 *  - a string is used verbatim;
 *  - a result with at least one `data` or `image_url` part becomes an array
 *    of content parts, one per entry of `result.content`;
 *  - otherwise all non-empty `text` parts are joined with newlines;
 *  - otherwise a reported `error` becomes "Tool execution failed: …";
 *  - otherwise the whole result is serialized as JSON.
 *
 * @param result - The tool output (plain string or ToolResult object).
 * @param toolCallId - The ID of the tool call associated with the message.
 */
addToolMessage(result: string | ToolResult, toolCallId: string) {
  const unexpectedFormatMessage =
    'Tool call completed, unexpected output format.'
  let content: string | any[] = ''

  if (typeof result === 'string') {
    // Handle simple string case
    content = result
  } else {
    // Callers pass tool output through a type cast, so `result` and
    // `result.content` are not guaranteed to match ToolResult at runtime.
    const parts = Array.isArray(result?.content) ? result.content : []

    // Check for multimodal content (more than just a simple text string)
    const hasMultimodalContent = parts.some((p) => p.data || p.image_url)

    // Collect every text part so multi-part text results are not truncated
    // to their first entry.
    const textParts = parts
      .map((p) => p.text)
      .filter(
        (text): text is string => typeof text === 'string' && text !== ''
      )

    if (hasMultimodalContent) {
      // Build the structured content array
      content = parts.map((p) => convertToolPartToApiContentPart(p))
    } else if (textParts.length > 0) {
      // Standard text case
      content = textParts.join('\n')
    } else if (result?.error) {
      // Error case
      content = `Tool execution failed: ${result.error}`
    } else {
      // Fallback: serialize the whole result structure if content is unexpected
      try {
        // JSON.stringify returns undefined (without throwing) for values
        // such as `undefined`, so guard that as well.
        content = JSON.stringify(result) ?? unexpectedFormatMessage
      } catch {
        content = unexpectedFormatMessage
      }
    }
  }

  this.messages.push({
    role: 'tool',
    // The cast lets the structured (array) form through; presumably the
    // declared type for role 'tool' only allows string content — confirm
    // against the token.js typings.
    content: content as any,
    tool_call_id: toolCallId,
  })
}

0 commit comments

Comments
 (0)