Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions pkg/agent/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -466,10 +466,24 @@ func (cb *ContextBuilder) BuildMessages(

// Add current user message
if strings.TrimSpace(currentMessage) != "" {
messages = append(messages, providers.Message{
userMsg := providers.Message{
Role: "user",
Content: currentMessage,
})
}

// Attach image content parts for multimodal messages
if len(media) > 0 {
imageParts := providers.LoadMediaAsContentParts(media)
if len(imageParts) > 0 {
// Build ContentParts: text block first, then image blocks
userMsg.ContentParts = append(
[]providers.ContentBlock{{Type: "text", Text: currentMessage}},
imageParts...,
)
}
}

messages = append(messages, userMsg)
}

return messages
Expand Down
35 changes: 26 additions & 9 deletions pkg/agent/loop.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,15 @@ type AgentLoop struct {

// processOptions configures how a message is processed
type processOptions struct {
SessionKey string // Session identifier for history/context
Channel string // Target channel for tool execution
ChatID string // Target chat ID for tool execution
UserMessage string // User message content (may include prefix)
DefaultResponse string // Response when LLM returns empty
EnableSummary bool // Whether to trigger summarization
SendResponse bool // Whether to send response via bus
NoHistory bool // If true, don't load session history (for heartbeat)
SessionKey string // Session identifier for history/context
Channel string // Target channel for tool execution
ChatID string // Target chat ID for tool execution
UserMessage string // User message content (may include prefix)
Media []string // Local file paths for attached images
DefaultResponse string // Response when LLM returns empty
EnableSummary bool // Whether to trigger summarization
SendResponse bool // Whether to send response via bus
NoHistory bool // If true, don't load session history (for heartbeat)
}

const defaultResponse = "I've completed processing but have no response to give. Increase `max_tool_iterations` in config.json."
Expand Down Expand Up @@ -492,11 +493,27 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
"matched_by": route.MatchedBy,
})

// Resolve media refs to local file paths for multimodal support
var mediaPaths []string
if len(msg.Media) > 0 && al.mediaStore != nil {
for _, ref := range msg.Media {
if localPath, err := al.mediaStore.Resolve(ref); err == nil {
mediaPaths = append(mediaPaths, localPath)
} else {
logger.WarnCF("agent", "Failed to resolve media ref", map[string]any{
"ref": ref,
"error": err.Error(),
})
}
}
}

return al.runAgentLoop(ctx, agent, processOptions{
SessionKey: sessionKey,
Channel: msg.Channel,
ChatID: msg.ChatID,
UserMessage: msg.Content,
Media: mediaPaths,
DefaultResponse: defaultResponse,
EnableSummary: true,
SendResponse: false,
Expand Down Expand Up @@ -603,7 +620,7 @@ func (al *AgentLoop) runAgentLoop(
history,
summary,
opts.UserMessage,
nil,
opts.Media,
opts.Channel,
opts.ChatID,
)
Expand Down
39 changes: 39 additions & 0 deletions pkg/providers/anthropic/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,24 @@ func buildParams(
anthropicMessages = append(anthropicMessages,
anthropic.NewUserMessage(anthropic.NewToolResultBlock(msg.ToolCallID, msg.Content, false)),
)
} else if len(msg.ContentParts) > 0 {
// Multimodal message with text + images
var blocks []anthropic.ContentBlockParamUnion
for _, part := range msg.ContentParts {
switch part.Type {
case "text":
blocks = append(blocks, anthropic.NewTextBlock(part.Text))
case "image_url":
if part.ImageURL != nil {
if mediaType, b64Data, ok := parseDataURL(part.ImageURL.URL); ok {
blocks = append(blocks, anthropic.NewImageBlockBase64(mediaType, b64Data))
}
}
}
}
if len(blocks) > 0 {
anthropicMessages = append(anthropicMessages, anthropic.NewUserMessage(blocks...))
}
} else {
anthropicMessages = append(anthropicMessages,
anthropic.NewUserMessage(anthropic.NewTextBlock(msg.Content)),
Expand Down Expand Up @@ -273,3 +291,24 @@ func normalizeBaseURL(apiBase string) string {

return base
}

// parseDataURL extracts the media type and base64 data from a data URL.
// Expected format: "data:<mediaType>;base64,<data>"
// Returns (mediaType, base64Data, ok); ok is false when the input is not a
// data URL, lacks the ";base64," marker, or has an empty media type or
// empty payload.
func parseDataURL(url string) (string, string, bool) {
	if !strings.HasPrefix(url, "data:") {
		return "", "", false
	}
	// Split "<mediaType>;base64,<data>" at the first ";base64," marker in
	// one pass; avoids manual index arithmetic and the magic offset 8.
	mediaType, b64Data, found := strings.Cut(strings.TrimPrefix(url, "data:"), ";base64,")
	if !found || mediaType == "" || b64Data == "" {
		return "", "", false
	}
	return mediaType, b64Data, true
}
87 changes: 87 additions & 0 deletions pkg/providers/media.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package providers

import (
"encoding/base64"
"fmt"
"os"
"path/filepath"
"strings"

"github.com/sipeed/picoclaw/pkg/logger"
"github.com/sipeed/picoclaw/pkg/providers/protocoltypes"
)

// maxImageFileSize is the maximum raw file size for inline base64 images (5 MB).
// LoadMediaAsContentParts skips (rather than truncates) files above this limit.
const maxImageFileSize = 5 * 1024 * 1024

// supportedImageExts maps lowercase file extensions to MIME types.
// Files whose extension is absent from this table are treated as
// non-images and skipped by LoadMediaAsContentParts.
var supportedImageExts = map[string]string{
	".jpg":  "image/jpeg",
	".jpeg": "image/jpeg",
	".png":  "image/png",
	".gif":  "image/gif",
	".webp": "image/webp",
}

// LoadMediaAsContentParts turns local file paths and http(s) URLs into
// ContentBlock values suitable for multimodal LLM requests.
//
// Remote http(s) URLs are forwarded untouched as image_url blocks; local
// files are read, base64-encoded, and wrapped in a data URL. Files with an
// unrecognized extension or a size above maxImageFileSize are skipped with
// a log entry instead of failing the whole batch.
func LoadMediaAsContentParts(paths []string) []protocoltypes.ContentBlock {
	var blocks []protocoltypes.ContentBlock

	for _, path := range paths {
		// Remote image: pass the URL through as-is.
		if strings.HasPrefix(path, "http://") || strings.HasPrefix(path, "https://") {
			blocks = append(blocks, protocoltypes.ContentBlock{
				Type:     "image_url",
				ImageURL: &protocoltypes.ImageURL{URL: path},
			})
			continue
		}

		// Local file: only extensions from supportedImageExts are inlined.
		extension := strings.ToLower(filepath.Ext(path))
		mime, supported := supportedImageExts[extension]
		if !supported {
			logger.DebugCF("media", "Skipping non-image file", map[string]any{"path": path, "ext": extension})
			continue
		}

		// Check the size before reading so oversized files never hit memory.
		fi, statErr := os.Stat(path)
		if statErr != nil {
			logger.WarnCF("media", "Cannot stat media file", map[string]any{"path": path, "error": statErr.Error()})
			continue
		}
		if fi.Size() > maxImageFileSize {
			logger.WarnCF("media", "Skipping oversized image", map[string]any{
				"path":      path,
				"size":      fi.Size(),
				"max_bytes": maxImageFileSize,
			})
			continue
		}

		raw, readErr := os.ReadFile(path)
		if readErr != nil {
			logger.WarnCF("media", "Cannot read media file", map[string]any{"path": path, "error": readErr.Error()})
			continue
		}

		inline := "data:" + mime + ";base64," + base64.StdEncoding.EncodeToString(raw)
		blocks = append(blocks, protocoltypes.ContentBlock{
			Type:     "image_url",
			ImageURL: &protocoltypes.ImageURL{URL: inline},
		})

		logger.DebugCF("media", "Loaded image", map[string]any{
			"path":      path,
			"mime_type": mime,
			"size":      fi.Size(),
		})
	}

	return blocks
}
35 changes: 31 additions & 4 deletions pkg/providers/openai_compat/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,12 +285,21 @@ func parseResponse(body []byte) (*LLMResponse, error) {
}, nil
}

// openaiContentPart represents a content part in the OpenAI Vision API format.
// Type selects which payload field is populated: "text" carries Text,
// "image_url" carries ImageURL.
type openaiContentPart struct {
	Type     string                  `json:"type"` // "text" or "image_url"
	Text     string                  `json:"text,omitempty"`
	ImageURL *protocoltypes.ImageURL `json:"image_url,omitempty"`
}

// openaiMessage is the wire-format message for OpenAI-compatible APIs.
// It mirrors protocoltypes.Message but omits SystemParts, which is an
// internal field that would be unknown to third-party endpoints.
// It mirrors protocoltypes.Message but omits SystemParts/ContentParts,
// which are internal fields that would be unknown to third-party endpoints.
// Content is `any` to support both string (text-only) and []openaiContentPart
// (multimodal) as required by the OpenAI Vision API.
type openaiMessage struct {
Role string `json:"role"`
Content string `json:"content"`
Content any `json:"content"`
ReasoningContent string `json:"reasoning_content,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
Expand All @@ -299,12 +308,30 @@ type openaiMessage struct {
// stripSystemParts converts []Message to []openaiMessage, dropping the
// SystemParts field so it doesn't leak into the JSON payload sent to
// OpenAI-compatible APIs (some strict endpoints reject unknown fields).
// When ContentParts is present, Content is serialized as an array of
// content parts (OpenAI Vision API format).
func stripSystemParts(messages []Message) []openaiMessage {
out := make([]openaiMessage, len(messages))
for i, m := range messages {
var content any = m.Content

// Convert multimodal ContentParts to OpenAI Vision format
if len(m.ContentParts) > 0 {
parts := make([]openaiContentPart, 0, len(m.ContentParts))
for _, p := range m.ContentParts {
switch p.Type {
case "text":
parts = append(parts, openaiContentPart{Type: "text", Text: p.Text})
case "image_url":
parts = append(parts, openaiContentPart{Type: "image_url", ImageURL: p.ImageURL})
}
}
content = parts
}

out[i] = openaiMessage{
Role: m.Role,
Content: m.Content,
Content: content,
ReasoningContent: m.ReasoningContent,
ToolCalls: m.ToolCalls,
ToolCallID: m.ToolCallID,
Expand Down
14 changes: 11 additions & 3 deletions pkg/providers/protocoltypes/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,18 +53,26 @@ type CacheControl struct {
Type string `json:"type"` // "ephemeral"
}

// ContentBlock represents a structured segment of a system message.
// ImageURL holds a URL for an image content block.
// The URL can be an http(s) link or a base64 data URL
// (e.g. "data:image/jpeg;base64,...").
type ImageURL struct {
	URL string `json:"url"`
}

// ContentBlock represents a structured segment of a message.
// Adapters that understand SystemParts can use these blocks to set
// per-block cache control (e.g. Anthropic's cache_control: ephemeral).
type ContentBlock struct {
	Type         string        `json:"type"` // "text" or "image_url"
	Text         string        `json:"text,omitempty"`      // payload when Type == "text"
	ImageURL     *ImageURL     `json:"image_url,omitempty"` // payload when Type == "image_url"
	CacheControl *CacheControl `json:"cache_control,omitempty"`
}

type Message struct {
Role string `json:"role"`
Content string `json:"content"`
ContentParts []ContentBlock `json:"content_parts,omitempty"` // multimodal content (text + images)
ReasoningContent string `json:"reasoning_content,omitempty"`
SystemParts []ContentBlock `json:"system_parts,omitempty"` // structured system blocks for cache-aware adapters
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
Expand Down