diff --git a/pkg/agent/context.go b/pkg/agent/context.go index 8868d6bf4..6fccbaf53 100644 --- a/pkg/agent/context.go +++ b/pkg/agent/context.go @@ -465,11 +465,10 @@ func (cb *ContextBuilder) BuildMessages( messages = append(messages, history...) // Add current user message - if strings.TrimSpace(currentMessage) != "" || len(media) > 0 { + if strings.TrimSpace(currentMessage) != "" { messages = append(messages, providers.Message{ Role: "user", Content: currentMessage, - Media: media, }) } diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index 29c39d396..88afa6119 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -8,11 +8,9 @@ package agent import ( "context" - "encoding/base64" "encoding/json" "errors" "fmt" - "os" "path/filepath" "strings" "sync" @@ -49,12 +47,11 @@ type AgentLoop struct { // processOptions configures how a message is processed type processOptions struct { - SessionKey string // Session identifier for history/context - Channel string // Target channel for tool execution - ChatID string // Target chat ID for tool execution - UserMessage string // User message content (may include prefix) - Media []string // Media URLs attached to the user message - DefaultResponse string // Response when LLM returns empty + SessionKey string // Session identifier for history/context + Channel string // Target channel for tool execution + ChatID string // Target chat ID for tool execution + UserMessage string // User message content (may include prefix) + DefaultResponse string // Response when LLM returns empty EnableSummary bool // Whether to trigger summarization SendResponse bool // Whether to send response via bus NoHistory bool // If true, don't load session history (for heartbeat) @@ -499,7 +496,6 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage) Channel: msg.Channel, ChatID: msg.ChatID, UserMessage: msg.Content, - Media: msg.Media, DefaultResponse: defaultResponse, EnableSummary: true, SendResponse: false, @@ -606,11 +602,10 @@ func (al *AgentLoop) runAgentLoop( history, summary, opts.UserMessage, - opts.Media, + nil, opts.Channel, opts.ChatID, ) - messages = resolveMediaRefs(messages, al.mediaStore) // 3. Save user message to session agent.Sessions.AddMessage(opts.SessionKey, "user", opts.UserMessage) @@ -1481,105 +1476,3 @@ func extractParentPeer(msg bus.InboundMessage) *routing.RoutePeer { } return &routing.RoutePeer{Kind: parentKind, ID: parentID} } - -// maxMediaFileSize is the maximum file size (20 MB) for media resolution. -// Files larger than this are skipped to prevent OOM under concurrent load. -const maxMediaFileSize = 20 * 1024 * 1024 - -// resolveMediaRefs replaces media:// refs in message Media fields with base64 data URLs. -// Returns a new slice with resolved URLs; original messages are not mutated. -func resolveMediaRefs(messages []providers.Message, store media.MediaStore) []providers.Message { - if store == nil { - return messages - } - - result := make([]providers.Message, len(messages)) - copy(result, messages) - - for i, m := range result { - if len(m.Media) == 0 { - continue - } - - resolved := make([]string, 0, len(m.Media)) - for _, ref := range m.Media { - if !strings.HasPrefix(ref, "media://") { - resolved = append(resolved, ref) - continue - } - - localPath, meta, err := store.ResolveWithMeta(ref) - if err != nil { - logger.WarnCF("agent", "Failed to resolve media ref", map[string]any{ - "ref": ref, - "error": err.Error(), - }) - continue - } - - info, err := os.Stat(localPath) - if err != nil { - logger.WarnCF("agent", "Failed to stat media file", map[string]any{ - "path": localPath, - "error": err.Error(), - }) - continue - } - if info.Size() > maxMediaFileSize { - logger.WarnCF("agent", "Media file too large, skipping", map[string]any{ - "path": localPath, - "size": info.Size(), - "max_size": maxMediaFileSize, - }) - continue - } - - data, err := os.ReadFile(localPath) - if err != nil { - logger.WarnCF("agent", "Failed to read media file", map[string]any{ - "path": localPath, - "error": err.Error(), - }) - continue - } - - mime := meta.ContentType - if mime == "" { - mime = mimeFromExtension(filepath.Ext(localPath)) - } - if mime == "" { - logger.WarnCF("agent", "Unknown media type, skipping", map[string]any{ - "path": localPath, - "ext": filepath.Ext(localPath), - }) - continue - } - - dataURL := "data:" + mime + ";base64," + base64.StdEncoding.EncodeToString(data) - resolved = append(resolved, dataURL) - } - - result[i].Media = resolved - } - - return result -} - -// mimeFromExtension returns a MIME type for common image extensions. -// Returns empty string for unrecognized extensions. -func mimeFromExtension(ext string) string { - switch strings.ToLower(ext) { - case ".jpg", ".jpeg": - return "image/jpeg" - case ".png": - return "image/png" - case ".gif": - return "image/gif" - case ".webp": - return "image/webp" - case ".bmp": - return "image/bmp" - default: - return "" - } -} diff --git a/pkg/agent/loop_test.go b/pkg/agent/loop_test.go index 4c803fea6..3565314fe 100644 --- a/pkg/agent/loop_test.go +++ b/pkg/agent/loop_test.go @@ -6,14 +6,12 @@ import ( "os" "path/filepath" "slices" - "strings" "testing" "time" "github.com/sipeed/picoclaw/pkg/bus" "github.com/sipeed/picoclaw/pkg/channels" "github.com/sipeed/picoclaw/pkg/config" - "github.com/sipeed/picoclaw/pkg/media" "github.com/sipeed/picoclaw/pkg/providers" "github.com/sipeed/picoclaw/pkg/tools" ) @@ -810,124 +808,3 @@ func TestHandleReasoning(t *testing.T) { } }) } - -func TestMimeFromExtension(t *testing.T) { - tests := []struct { - ext string - want string - }{ - {".jpg", "image/jpeg"}, - {".JPEG", "image/jpeg"}, - {".png", "image/png"}, - {".gif", "image/gif"}, - {".webp", "image/webp"}, - {".bmp", "image/bmp"}, - {".txt", ""}, - {".pdf", ""}, - {"", ""}, - } - for _, tt := range tests { - if got := mimeFromExtension(tt.ext); got != tt.want { - t.Errorf("mimeFromExtension(%q) = %q, want %q", tt.ext, got, tt.want) - } - } -} - -func TestResolveMediaRefs_NilStore(t *testing.T) { - msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{"media://abc"}}} - result := resolveMediaRefs(msgs, nil) - if result[0].Media[0] != "media://abc" { - t.Error("nil store should return messages unchanged") - } -} - -func TestResolveMediaRefs_NonMediaRef(t *testing.T) { - msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{"https://example.com/img.png"}}} - result := resolveMediaRefs(msgs, media.NewFileMediaStore()) - if result[0].Media[0] != "https://example.com/img.png" { - t.Error("non-media:// refs should be passed through unchanged") - } -} - -func TestResolveMediaRefs_ResolvesToBase64(t *testing.T) { - store := media.NewFileMediaStore() - - imgPath := filepath.Join(t.TempDir(), "test.png") - if err := os.WriteFile(imgPath, []byte("fake-png-data"), 0o644); err != nil { - t.Fatal(err) - } - - ref, err := store.Store(imgPath, media.MediaMeta{ContentType: "image/png"}, "test") - if err != nil { - t.Fatal(err) - } - - msgs := []providers.Message{{Role: "user", Content: "describe", Media: []string{ref}}} - result := resolveMediaRefs(msgs, store) - - if len(result[0].Media) != 1 { - t.Fatalf("expected 1 resolved media, got %d", len(result[0].Media)) - } - if !strings.HasPrefix(result[0].Media[0], "data:image/png;base64,") { - t.Errorf("expected data URL, got %s", result[0].Media[0][:40]) - } -} - -func TestResolveMediaRefs_SkipsOversizedFile(t *testing.T) { - store := media.NewFileMediaStore() - - bigPath := filepath.Join(t.TempDir(), "big.jpg") - if err := os.WriteFile(bigPath, make([]byte, maxMediaFileSize+1), 0o644); err != nil { - t.Fatal(err) - } - - ref, err := store.Store(bigPath, media.MediaMeta{ContentType: "image/jpeg"}, "test") - if err != nil { - t.Fatal(err) - } - - msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{ref}}} - result := resolveMediaRefs(msgs, store) - - if len(result[0].Media) != 0 { - t.Error("oversized file should be skipped") - } -} - -func TestResolveMediaRefs_SkipsUnknownExtension(t *testing.T) { - store := media.NewFileMediaStore() - - txtPath := filepath.Join(t.TempDir(), "readme.txt") - if err := os.WriteFile(txtPath, []byte("hello"), 0o644); err != nil { - t.Fatal(err) - } - - ref, err := store.Store(txtPath, media.MediaMeta{}, "test") - if err != nil { - t.Fatal(err) - } - - msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{ref}}} - result := resolveMediaRefs(msgs, store) - - if len(result[0].Media) != 0 { - t.Error("unknown extension with no ContentType should be skipped") - } -} - -func TestResolveMediaRefs_DoesNotMutateOriginal(t *testing.T) { - store := media.NewFileMediaStore() - - imgPath := filepath.Join(t.TempDir(), "test.jpg") - if err := os.WriteFile(imgPath, []byte("data"), 0o644); err != nil { - t.Fatal(err) - } - - ref, _ := store.Store(imgPath, media.MediaMeta{ContentType: "image/jpeg"}, "test") - original := []providers.Message{{Role: "user", Content: "hi", Media: []string{ref}}} - resolveMediaRefs(original, store) - - if !strings.HasPrefix(original[0].Media[0], "media://") { - t.Error("original message should not be mutated") - } -} diff --git a/pkg/providers/openai_compat/provider.go b/pkg/providers/openai_compat/provider.go index e3cd1560e..3a18b8b16 100644 --- a/pkg/providers/openai_compat/provider.go +++ b/pkg/providers/openai_compat/provider.go @@ -116,7 +116,7 @@ func (p *Provider) Chat( requestBody := map[string]any{ "model": model, - "messages": serializeMessages(messages), + "messages": stripSystemParts(messages), } if len(tools) > 0 { @@ -195,60 +195,6 @@ func (p *Provider) Chat( return parseResponse(body) } -func serializeMessages(messages []Message) []map[string]interface{} { - result := make([]map[string]interface{}, 0, len(messages)) - for _, m := range messages { - if len(m.Media) == 0 { - msg := map[string]interface{}{ - "role": m.Role, - "content": m.Content, - } - if m.ToolCallID != "" { - msg["tool_call_id"] = m.ToolCallID - } - if len(m.ToolCalls) > 0 { - msg["tool_calls"] = m.ToolCalls - } - if m.ReasoningContent != "" { - msg["reasoning_content"] = m.ReasoningContent - } - result = append(result, msg) - continue - } - - parts := make([]map[string]interface{}, 0, 1+len(m.Media)) - if m.Content != "" { - parts = append(parts, map[string]interface{}{ - "type": "text", - "text": m.Content, - }) - } - for _, mediaURL := range m.Media { - parts = append(parts, map[string]interface{}{ - "type": "image_url", - "image_url": map[string]interface{}{ - "url": mediaURL, - }, - }) - } - msg := map[string]interface{}{ - "role": m.Role, - "content": parts, - } - if m.ToolCallID != "" { - msg["tool_call_id"] = m.ToolCallID - } - if len(m.ToolCalls) > 0 { - msg["tool_calls"] = m.ToolCalls - } - if m.ReasoningContent != "" { - msg["reasoning_content"] = m.ReasoningContent - } - result = append(result, msg) - } - return result -} - func parseResponse(body []byte) (*LLMResponse, error) { var apiResponse struct { Choices []struct { diff --git a/pkg/providers/openai_compat/provider_test.go b/pkg/providers/openai_compat/provider_test.go index 0fd912541..53b9e75ee 100644 --- a/pkg/providers/openai_compat/provider_test.go +++ b/pkg/providers/openai_compat/provider_test.go @@ -416,73 +416,3 @@ func TestProvider_FunctionalOptionRequestTimeoutNonPositive(t *testing.T) { t.Fatalf("http timeout = %v, want %v", p.httpClient.Timeout, defaultRequestTimeout) } } - -func TestSerializeMessages_PlainText(t *testing.T) { - msgs := []Message{ - {Role: "user", Content: "hello"}, - {Role: "assistant", Content: "hi"}, - } - result := serializeMessages(msgs) - if len(result) != 2 { - t.Fatalf("expected 2 messages, got %d", len(result)) - } - if result[0]["content"] != "hello" { - t.Errorf("expected plain string content, got %v", result[0]["content"]) - } -} - -func TestSerializeMessages_WithMedia(t *testing.T) { - msgs := []Message{ - {Role: "user", Content: "describe this", Media: []string{"data:image/png;base64,abc123"}}, - } - result := serializeMessages(msgs) - if len(result) != 1 { - t.Fatalf("expected 1 message, got %d", len(result)) - } - parts, ok := result[0]["content"].([]map[string]interface{}) - if !ok { - t.Fatalf("expected content to be []map, got %T", result[0]["content"]) - } - if len(parts) != 2 { - t.Fatalf("expected 2 parts (text + image), got %d", len(parts)) - } - if parts[0]["type"] != "text" { - t.Errorf("expected first part type=text, got %v", parts[0]["type"]) - } - if parts[1]["type"] != "image_url" { - t.Errorf("expected second part type=image_url, got %v", parts[1]["type"]) - } -} - -func TestSerializeMessages_WithMediaPreservesToolFields(t *testing.T) { - msgs := []Message{ - { - Role: "assistant", - Content: "result", - Media: []string{"data:image/png;base64,abc"}, - ToolCallID: "call_123", - ToolCalls: []ToolCall{{ID: "tc_1", Type: "function", Function: &FunctionCall{Name: "test", Arguments: "{}"}}}, - ReasoningContent: "thinking...", - }, - } - result := serializeMessages(msgs) - if result[0]["tool_call_id"] != "call_123" { - t.Errorf("expected tool_call_id=call_123, got %v", result[0]["tool_call_id"]) - } - if result[0]["tool_calls"] == nil { - t.Error("expected tool_calls to be present") - } - if result[0]["reasoning_content"] != "thinking..." { - t.Errorf("expected reasoning_content, got %v", result[0]["reasoning_content"]) - } -} - -func TestSerializeMessages_EmptyMediaUsesPlainFormat(t *testing.T) { - msgs := []Message{ - {Role: "user", Content: "hello", Media: []string{}}, - } - result := serializeMessages(msgs) - if _, ok := result[0]["content"].(string); !ok { - t.Errorf("empty Media should use plain string format, got %T", result[0]["content"]) - } -} diff --git a/pkg/providers/protocoltypes/types.go b/pkg/providers/protocoltypes/types.go index efac1e10b..99f13334e 100644 --- a/pkg/providers/protocoltypes/types.go +++ b/pkg/providers/protocoltypes/types.go @@ -65,7 +65,6 @@ type ContentBlock struct { type Message struct { Role string `json:"role"` Content string `json:"content"` - Media []string `json:"media,omitempty"` // URLs of images or other media attachments ReasoningContent string `json:"reasoning_content,omitempty"` SystemParts []ContentBlock `json:"system_parts,omitempty"` // structured system blocks for cache-aware adapters ToolCalls []ToolCall `json:"tool_calls,omitempty"`