From b490ad070c68baa2db9e4f79c6365621d66eac92 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Fri, 24 Oct 2025 07:42:55 +0530 Subject: [PATCH 1/4] feat: support multimodal tool results and improve tool message handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added a temporary `ToolResult` type that mirrors the structure returned by tools (text, image data, URLs, errors). - Implemented `convertToolPartToApiContentPart` to translate each tool output part into the format expected by the OpenAI chat completion API. - Updated `CompletionMessagesBuilder.addToolMessage` to accept a full `ToolResult` instead of a plain string and to: - Detect multimodal content (base64 images, image URLs) and build a structured `content` array. - Properly handle plain‑text results, tool execution errors, and unexpected formats with sensible fallbacks. - Cast the final content to `any` for the `tool` role as required by the API. - Modified `postMessageProcessing` to pass the raw tool result (`result as any`) to `addToolMessage`, avoiding premature extraction of only the first text part. - Refactored several formatting and type‑annotation sections: - Added multiline guard for empty user messages to insert a placeholder. - Split the image URL construction into a clearer multiline object. - Adjusted method signatures and added minor line‑breaks for readability. - Included extensive comments explaining the new logic and edge‑case handling. These changes enable the chat system to handle richer tool outputs (e.g., images, mixed content) and provide more robust error handling. 
--- web-app/src/lib/completion.ts | 2 +- web-app/src/lib/messages.ts | 86 +++++++++++++++++++++++++++++++++-- 2 files changed, 82 insertions(+), 6 deletions(-) diff --git a/web-app/src/lib/completion.ts b/web-app/src/lib/completion.ts index e602ff88ee..e72e9e6b47 100644 --- a/web-app/src/lib/completion.ts +++ b/web-app/src/lib/completion.ts @@ -543,7 +543,7 @@ export const postMessageProcessing = async ( }, ], } - builder.addToolMessage(result.content[0]?.text ?? '', toolCall.id) + builder.addToolMessage(result as any, toolCall.id) // update message metadata } return message diff --git a/web-app/src/lib/messages.ts b/web-app/src/lib/messages.ts index 3361e2703d..7f56e2ac60 100644 --- a/web-app/src/lib/messages.ts +++ b/web-app/src/lib/messages.ts @@ -6,6 +6,48 @@ import { removeReasoningContent } from '@/utils/reasoning' type ThreadContent = NonNullable[number] +// Define a temporary type for the expected tool result shape (ToolResult as before) +type ToolResult = { + content: Array<{ + type?: string + text?: string + data?: string + image_url?: { url: string; detail?: string } + }> + error?: string +} + +// Helper function to convert the tool's output part into an API content part +const convertToolPartToApiContentPart = (part: ToolResult['content'][0]) => { + if (part.text) { + return { type: 'text', text: part.text } + } + + // Handle base64 image data + if (part.data) { + // Assume default image type, though a proper tool should return the mime type + const mimeType = + part.type === 'image' ? 
'image/png' : part.type || 'image/png' + const dataUrl = `data:${mimeType};base64,${part.data}` + + return { + type: 'image_url', + image_url: { + url: dataUrl, + detail: 'auto', + }, + } + } + + // Handle pre-formatted image URL + if (part.image_url) { + return { type: 'image_url', image_url: part.image_url } + } + + // Fallback to text stringification for structured but unhandled data + return { type: 'text', text: JSON.stringify(part) } +} + /** * @fileoverview Helper functions for creating chat completion request. * These functions are used to create chat completion request objects @@ -26,7 +68,11 @@ export class CompletionMessagesBuilder { .map((msg) => { const param = this.toCompletionParamFromThread(msg) // In constructor context, normalize empty user text to a placeholder - if (param.role === 'user' && typeof param.content === 'string' && param.content === '') { + if ( + param.role === 'user' && + typeof param.content === 'string' && + param.content === '' + ) { return { ...param, content: '.' } } return param @@ -35,7 +81,9 @@ export class CompletionMessagesBuilder { } // Normalize a ThreadMessage into a ChatCompletionMessageParam for Token.js - private toCompletionParamFromThread(msg: ThreadMessage): ChatCompletionMessageParam { + private toCompletionParamFromThread( + msg: ThreadMessage + ): ChatCompletionMessageParam { if (msg.role === 'assistant') { return { role: 'assistant', @@ -60,7 +108,10 @@ export class CompletionMessagesBuilder { if (part.type === ContentType.Image) { return { type: 'image_url' as const, - image_url: { url: part.image_url?.url || '', detail: part.image_url?.detail || 'auto' }, + image_url: { + url: part.image_url?.url || '', + detail: part.image_url?.detail || 'auto', + }, } } // Fallback for unknown content types @@ -113,10 +164,35 @@ export class CompletionMessagesBuilder { * @param content - The content of the tool message. * @param toolCallId - The ID of the tool call associated with the message. 
*/ - addToolMessage(content: string, toolCallId: string) { + addToolMessage(result: ToolResult, toolCallId: string) { + let content: string | any[] = '' + + // Check for multimodal content (more than just a simple text string) + const hasMultimodalContent = result.content?.some( + (p) => p.data || p.image_url + ) + + if (hasMultimodalContent) { + // Build the structured content array + content = result.content.map(convertToolPartToApiContentPart) + } else if (result.content?.[0]?.text) { + // Standard text case + content = result.content[0].text + } else if (result.error) { + // Error case + content = `Tool execution failed: ${result.error}` + } else { + // Fallback: serialize the whole result structure if content is unexpected + try { + content = JSON.stringify(result) + } catch { + content = 'Tool call completed, unexpected output format.' + } + } this.messages.push({ role: 'tool', - content: content, + // for role 'tool', need to use 'as ChatCompletionMessageParam' + content: content as any, tool_call_id: toolCallId, }) } From 4dcca1c63c2c66330814cf069918d575f0abe2b3 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Fri, 24 Oct 2025 07:49:14 +0530 Subject: [PATCH 2/4] Satisfy ts linter --- web-app/src/lib/completion.ts | 4 ++-- web-app/src/lib/messages.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/web-app/src/lib/completion.ts b/web-app/src/lib/completion.ts index e72e9e6b47..d722340245 100644 --- a/web-app/src/lib/completion.ts +++ b/web-app/src/lib/completion.ts @@ -32,7 +32,7 @@ type ExtendedConfigOptions = ConfigOptions & { } import { ulid } from 'ulidx' import { MCPTool } from '@/types/completion' -import { CompletionMessagesBuilder } from './messages' +import { CompletionMessagesBuilder, ToolResult } from './messages' import { ChatCompletionMessageToolCall } from 'openai/resources' import { ExtensionManager } from './extension' import { useAppState } from '@/hooks/useAppState' @@ -543,7 +543,7 @@ export const postMessageProcessing = 
async ( }, ], } - builder.addToolMessage(result as any, toolCall.id) + builder.addToolMessage(result as ToolResult, toolCall.id) // update message metadata } return message diff --git a/web-app/src/lib/messages.ts b/web-app/src/lib/messages.ts index 7f56e2ac60..386dbf4c47 100644 --- a/web-app/src/lib/messages.ts +++ b/web-app/src/lib/messages.ts @@ -7,7 +7,7 @@ import { removeReasoningContent } from '@/utils/reasoning' type ThreadContent = NonNullable[number] // Define a temporary type for the expected tool result shape (ToolResult as before) -type ToolResult = { +export type ToolResult = { content: Array<{ type?: string text?: string From f79fe7e330056a32d685fcef51065085bfe6a210 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Fri, 24 Oct 2025 08:39:43 +0530 Subject: [PATCH 3/4] Make ts linter happy x2 --- web-app/src/lib/messages.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/web-app/src/lib/messages.ts b/web-app/src/lib/messages.ts index 386dbf4c47..8e9dc649e6 100644 --- a/web-app/src/lib/messages.ts +++ b/web-app/src/lib/messages.ts @@ -1,3 +1,4 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ import { ChatCompletionMessageParam } from 'token.js' import { ChatCompletionMessageToolCall } from 'openai/resources' import { ThreadMessage, ContentType } from '@janhq/core' From 16a6384100ad19b250838a1c06a9f079b69bd1df Mon Sep 17 00:00:00 2001 From: Faisal Amir Date: Fri, 24 Oct 2025 10:21:08 +0700 Subject: [PATCH 4/4] chore: update test message creation --- web-app/src/lib/messages.ts | 47 ++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/web-app/src/lib/messages.ts b/web-app/src/lib/messages.ts index 8e9dc649e6..c06a67bad0 100644 --- a/web-app/src/lib/messages.ts +++ b/web-app/src/lib/messages.ts @@ -162,32 +162,37 @@ export class CompletionMessagesBuilder { /** * Add a tool message to the messages array. - * @param content - The content of the tool message. 
+ * @param result - The tool output to record: a plain string or a ToolResult object. * @param toolCallId - The ID of the tool call associated with the message. */ - addToolMessage(result: ToolResult, toolCallId: string) { + addToolMessage(result: string | ToolResult, toolCallId: string) { let content: string | any[] = '' - // Check for multimodal content (more than just a simple text string) - const hasMultimodalContent = result.content?.some( - (p) => p.data || p.image_url - ) - - if (hasMultimodalContent) { - // Build the structured content array - content = result.content.map(convertToolPartToApiContentPart) - } else if (result.content?.[0]?.text) { - // Standard text case - content = result.content[0].text - } else if (result.error) { - // Error case - content = `Tool execution failed: ${result.error}` + // Handle simple string case + if (typeof result === 'string') { + content = result } else { - // Fallback: serialize the whole result structure if content is unexpected - try { - content = JSON.stringify(result) - } catch { - content = 'Tool call completed, unexpected output format.' + // Check for multimodal content (more than just a simple text string) + const hasMultimodalContent = result.content?.some( + (p) => p.data || p.image_url + ) + + if (hasMultimodalContent) { + // Build the structured content array + content = result.content.map(convertToolPartToApiContentPart) + } else if (result.content?.[0]?.text) { + // Standard text case + content = result.content[0].text + } else if (result.error) { + // Error case + content = `Tool execution failed: ${result.error}` + } else { + // Fallback: serialize the whole result structure if content is unexpected + try { + content = JSON.stringify(result) + } catch { + content = 'Tool call completed, unexpected output format.' + } } } this.messages.push({