diff --git a/pkg/agent/context.go b/pkg/agent/context.go
index 6fccbaf53f..782858a94f 100644
--- a/pkg/agent/context.go
+++ b/pkg/agent/context.go
@@ -466,10 +466,24 @@ func (cb *ContextBuilder) BuildMessages(
 
 	// Add current user message
 	if strings.TrimSpace(currentMessage) != "" {
-		messages = append(messages, providers.Message{
+		userMsg := providers.Message{
 			Role:    "user",
 			Content: currentMessage,
-		})
+		}
+
+		// Attach image content parts for multimodal messages
+		if len(media) > 0 {
+			imageParts := providers.LoadMediaAsContentParts(media)
+			if len(imageParts) > 0 {
+				// Build ContentParts: text block first, then image blocks
+				userMsg.ContentParts = append(
+					[]providers.ContentBlock{{Type: "text", Text: currentMessage}},
+					imageParts...,
+				)
+			}
+		}
+
+		messages = append(messages, userMsg)
 	}
 
 	return messages
diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go
index ac9b449a21..c5a0469d4d 100644
--- a/pkg/agent/loop.go
+++ b/pkg/agent/loop.go
@@ -47,14 +47,15 @@ type AgentLoop struct {
 
 // processOptions configures how a message is processed
 type processOptions struct {
-	SessionKey      string // Session identifier for history/context
-	Channel         string // Target channel for tool execution
-	ChatID          string // Target chat ID for tool execution
-	UserMessage     string // User message content (may include prefix)
-	DefaultResponse string // Response when LLM returns empty
-	EnableSummary   bool   // Whether to trigger summarization
-	SendResponse    bool   // Whether to send response via bus
-	NoHistory       bool   // If true, don't load session history (for heartbeat)
+	SessionKey      string   // Session identifier for history/context
+	Channel         string   // Target channel for tool execution
+	ChatID          string   // Target chat ID for tool execution
+	UserMessage     string   // User message content (may include prefix)
+	Media           []string // Local file paths for attached images
+	DefaultResponse string   // Response when LLM returns empty
+	EnableSummary   bool     // Whether to trigger summarization
+	SendResponse    bool     // Whether to send response via bus
+	NoHistory       bool     // If true, don't load session history (for heartbeat)
 }
 
 const defaultResponse = "I've completed processing but have no response to give. Increase `max_tool_iterations` in config.json."
@@ -492,11 +493,27 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
 		"matched_by": route.MatchedBy,
 	})
 
+	// Resolve media refs to local file paths for multimodal support
+	var mediaPaths []string
+	if len(msg.Media) > 0 && al.mediaStore != nil {
+		for _, ref := range msg.Media {
+			if localPath, err := al.mediaStore.Resolve(ref); err == nil {
+				mediaPaths = append(mediaPaths, localPath)
+			} else {
+				logger.WarnCF("agent", "Failed to resolve media ref", map[string]any{
+					"ref":   ref,
+					"error": err.Error(),
+				})
+			}
+		}
+	}
+
 	return al.runAgentLoop(ctx, agent, processOptions{
 		SessionKey:      sessionKey,
 		Channel:         msg.Channel,
 		ChatID:          msg.ChatID,
 		UserMessage:     msg.Content,
+		Media:           mediaPaths,
 		DefaultResponse: defaultResponse,
 		EnableSummary:   true,
 		SendResponse:    false,
@@ -603,7 +620,7 @@ func (al *AgentLoop) runAgentLoop(
 		history,
 		summary,
 		opts.UserMessage,
-		nil,
+		opts.Media,
 		opts.Channel,
 		opts.ChatID,
 	)
diff --git a/pkg/providers/anthropic/provider.go b/pkg/providers/anthropic/provider.go
index 1bb15f771f..31185a220f 100644
--- a/pkg/providers/anthropic/provider.go
+++ b/pkg/providers/anthropic/provider.go
@@ -132,6 +132,24 @@ func buildParams(
 			anthropicMessages = append(anthropicMessages,
 				anthropic.NewUserMessage(anthropic.NewToolResultBlock(msg.ToolCallID, msg.Content, false)),
 			)
+		} else if len(msg.ContentParts) > 0 {
+			// Multimodal message with text + images
+			var blocks []anthropic.ContentBlockParamUnion
+			for _, part := range msg.ContentParts {
+				switch part.Type {
+				case "text":
+					blocks = append(blocks, anthropic.NewTextBlock(part.Text))
+				case "image_url":
+					if part.ImageURL != nil {
+						if mediaType, b64Data, ok := parseDataURL(part.ImageURL.URL); ok {
+							blocks = append(blocks, anthropic.NewImageBlockBase64(mediaType, b64Data))
+						}
+					}
+				}
+			}
+			if len(blocks) > 0 {
+				anthropicMessages = append(anthropicMessages, anthropic.NewUserMessage(blocks...))
+			}
 		} else {
 			anthropicMessages = append(anthropicMessages,
 				anthropic.NewUserMessage(anthropic.NewTextBlock(msg.Content)),
@@ -273,3 +291,24 @@ func normalizeBaseURL(apiBase string) string {
 
 	return base
 }
+
+// parseDataURL extracts the media type and base64 data from a data URL.
+// Expected format: "data:<media-type>;base64,<data>"
+// Returns (mediaType, b64Data, ok).
+func parseDataURL(url string) (string, string, bool) {
+	if !strings.HasPrefix(url, "data:") {
+		return "", "", false
+	}
+	// Strip "data:" prefix
+	rest := url[5:]
+	semicolon := strings.Index(rest, ";base64,")
+	if semicolon < 0 {
+		return "", "", false
+	}
+	mediaType := rest[:semicolon]
+	b64Data := rest[semicolon+8:] // len(";base64,") == 8
+	if mediaType == "" || b64Data == "" {
+		return "", "", false
+	}
+	return mediaType, b64Data, true
+}
diff --git a/pkg/providers/media.go b/pkg/providers/media.go
new file mode 100644
index 0000000000..26ba7cdcb9
--- /dev/null
+++ b/pkg/providers/media.go
@@ -0,0 +1,87 @@
+package providers
+
+import (
+	"encoding/base64"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/sipeed/picoclaw/pkg/logger"
+	"github.com/sipeed/picoclaw/pkg/providers/protocoltypes"
+)
+
+// maxImageFileSize is the maximum raw file size for inline base64 images (5 MB).
+const maxImageFileSize = 5 * 1024 * 1024
+
+// supportedImageExts maps lowercase file extensions to MIME types.
+var supportedImageExts = map[string]string{
+	".jpg":  "image/jpeg",
+	".jpeg": "image/jpeg",
+	".png":  "image/png",
+	".gif":  "image/gif",
+	".webp": "image/webp",
+}
+
+// LoadMediaAsContentParts converts a list of local file paths (or URLs)
+// into ContentBlock slices suitable for multimodal LLM requests.
+//
+// - Local files are read, base64-encoded, and wrapped in a data URL.
+// - http(s) URLs are passed through as image_url blocks directly.
+// - Non-image files and files exceeding maxImageFileSize are skipped.
+func LoadMediaAsContentParts(paths []string) []protocoltypes.ContentBlock {
+	var parts []protocoltypes.ContentBlock
+
+	for _, p := range paths {
+		// HTTP(S) URL — pass through
+		if strings.HasPrefix(p, "http://") || strings.HasPrefix(p, "https://") {
+			parts = append(parts, protocoltypes.ContentBlock{
+				Type:     "image_url",
+				ImageURL: &protocoltypes.ImageURL{URL: p},
+			})
+			continue
+		}
+
+		// Local file
+		ext := strings.ToLower(filepath.Ext(p))
+		mimeType, ok := supportedImageExts[ext]
+		if !ok {
+			logger.DebugCF("media", "Skipping non-image file", map[string]any{"path": p, "ext": ext})
+			continue
+		}
+
+		info, err := os.Stat(p)
+		if err != nil {
+			logger.WarnCF("media", "Cannot stat media file", map[string]any{"path": p, "error": err.Error()})
+			continue
+		}
+		if info.Size() > maxImageFileSize {
+			logger.WarnCF("media", "Skipping oversized image", map[string]any{
+				"path":      p,
+				"size":      info.Size(),
+				"max_bytes": maxImageFileSize,
+			})
+			continue
+		}
+
+		data, err := os.ReadFile(p)
+		if err != nil {
+			logger.WarnCF("media", "Cannot read media file", map[string]any{"path": p, "error": err.Error()})
+			continue
+		}
+
+		dataURL := fmt.Sprintf("data:%s;base64,%s", mimeType, base64.StdEncoding.EncodeToString(data))
+		parts = append(parts, protocoltypes.ContentBlock{
+			Type:     "image_url",
+			ImageURL: &protocoltypes.ImageURL{URL: dataURL},
+		})
+
+		logger.DebugCF("media", "Loaded image", map[string]any{
+			"path":      p,
+			"mime_type": mimeType,
+			"size":      info.Size(),
+		})
+	}
+
+	return parts
+}
diff --git a/pkg/providers/openai_compat/provider.go b/pkg/providers/openai_compat/provider.go
index 3a18b8b16c..0263372856 100644
--- a/pkg/providers/openai_compat/provider.go
+++ b/pkg/providers/openai_compat/provider.go
@@ -285,12 +285,21 @@ func parseResponse(body []byte) (*LLMResponse, error) {
 	}, nil
 }
 
+// openaiContentPart represents a content part in the OpenAI Vision API format.
+type openaiContentPart struct {
+	Type     string                  `json:"type"` // "text" or "image_url"
+	Text     string                  `json:"text,omitempty"`
+	ImageURL *protocoltypes.ImageURL `json:"image_url,omitempty"`
+}
+
 // openaiMessage is the wire-format message for OpenAI-compatible APIs.
-// It mirrors protocoltypes.Message but omits SystemParts, which is an
-// internal field that would be unknown to third-party endpoints.
+// It mirrors protocoltypes.Message but omits SystemParts/ContentParts,
+// which are internal fields that would be unknown to third-party endpoints.
+// Content is `any` to support both string (text-only) and []openaiContentPart
+// (multimodal) as required by the OpenAI Vision API.
 type openaiMessage struct {
 	Role             string     `json:"role"`
-	Content          string     `json:"content"`
+	Content          any        `json:"content"`
 	ReasoningContent string     `json:"reasoning_content,omitempty"`
 	ToolCalls        []ToolCall `json:"tool_calls,omitempty"`
 	ToolCallID       string     `json:"tool_call_id,omitempty"`
@@ -299,12 +308,30 @@ type openaiMessage struct {
 // stripSystemParts converts []Message to []openaiMessage, dropping the
 // SystemParts field so it doesn't leak into the JSON payload sent to
 // OpenAI-compatible APIs (some strict endpoints reject unknown fields).
+// When ContentParts is present, Content is serialized as an array of
+// content parts (OpenAI Vision API format).
 func stripSystemParts(messages []Message) []openaiMessage {
 	out := make([]openaiMessage, len(messages))
 	for i, m := range messages {
+		var content any = m.Content
+
+		// Convert multimodal ContentParts to OpenAI Vision format
+		if len(m.ContentParts) > 0 {
+			parts := make([]openaiContentPart, 0, len(m.ContentParts))
+			for _, p := range m.ContentParts {
+				switch p.Type {
+				case "text":
+					parts = append(parts, openaiContentPart{Type: "text", Text: p.Text})
+				case "image_url":
+					parts = append(parts, openaiContentPart{Type: "image_url", ImageURL: p.ImageURL})
+				}
+			}
+			content = parts
+		}
+
 		out[i] = openaiMessage{
 			Role:             m.Role,
-			Content:          m.Content,
+			Content:          content,
 			ReasoningContent: m.ReasoningContent,
 			ToolCalls:        m.ToolCalls,
 			ToolCallID:       m.ToolCallID,
diff --git a/pkg/providers/protocoltypes/types.go b/pkg/providers/protocoltypes/types.go
index 99f13334e7..a41dcb7ade 100644
--- a/pkg/providers/protocoltypes/types.go
+++ b/pkg/providers/protocoltypes/types.go
@@ -53,18 +53,26 @@ type CacheControl struct {
 	Type string `json:"type"` // "ephemeral"
 }
 
-// ContentBlock represents a structured segment of a system message.
+// ImageURL holds a URL for an image content block.
+// The URL can be an http(s) link or a base64 data URL (e.g. "data:image/jpeg;base64,...").
+type ImageURL struct {
+	URL string `json:"url"`
+}
+
+// ContentBlock represents a structured segment of a message.
 // Adapters that understand SystemParts can use these blocks to set
 // per-block cache control (e.g. Anthropic's cache_control: ephemeral).
 type ContentBlock struct {
-	Type string `json:"type"` // "text"
-	Text string `json:"text"`
+	Type         string        `json:"type"` // "text" or "image_url"
+	Text         string        `json:"text,omitempty"`
+	ImageURL     *ImageURL     `json:"image_url,omitempty"`
 	CacheControl *CacheControl `json:"cache_control,omitempty"`
 }
 
 type Message struct {
 	Role             string         `json:"role"`
 	Content          string         `json:"content"`
+	ContentParts     []ContentBlock `json:"content_parts,omitempty"` // multimodal content (text + images)
 	ReasoningContent string         `json:"reasoning_content,omitempty"`
 	SystemParts      []ContentBlock `json:"system_parts,omitempty"` // structured system blocks for cache-aware adapters
 	ToolCalls        []ToolCall     `json:"tool_calls,omitempty"`