Skip to content

Commit 3bb4f4e

Browse files
authored
Merge pull request #1010 from sipeed/revert-990-feat/agent-vision-pipeline
revert: "feat(agent): add vision/image support to agent pipeline"
2 parents 12d4570 + 407707a commit 3bb4f4e

File tree

6 files changed

+8
-364
lines changed

6 files changed

+8
-364
lines changed

pkg/agent/context.go

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -465,11 +465,10 @@ func (cb *ContextBuilder) BuildMessages(
465465
messages = append(messages, history...)
466466

467467
// Add current user message
468-
if strings.TrimSpace(currentMessage) != "" || len(media) > 0 {
468+
if strings.TrimSpace(currentMessage) != "" {
469469
messages = append(messages, providers.Message{
470470
Role: "user",
471471
Content: currentMessage,
472-
Media: media,
473472
})
474473
}
475474

pkg/agent/loop.go

Lines changed: 6 additions & 113 deletions
Original file line number | Diff line number | Diff line change
@@ -8,11 +8,9 @@ package agent
88

99
import (
1010
"context"
11-
"encoding/base64"
1211
"encoding/json"
1312
"errors"
1413
"fmt"
15-
"os"
1614
"path/filepath"
1715
"strings"
1816
"sync"
@@ -49,12 +47,11 @@ type AgentLoop struct {
4947

5048
// processOptions configures how a message is processed
5149
type processOptions struct {
52-
SessionKey string // Session identifier for history/context
53-
Channel string // Target channel for tool execution
54-
ChatID string // Target chat ID for tool execution
55-
UserMessage string // User message content (may include prefix)
56-
Media []string // Media URLs attached to the user message
57-
DefaultResponse string // Response when LLM returns empty
50+
SessionKey string // Session identifier for history/context
51+
Channel string // Target channel for tool execution
52+
ChatID string // Target chat ID for tool execution
53+
UserMessage string // User message content (may include prefix)
54+
DefaultResponse string // Response when LLM returns empty
5855
EnableSummary bool // Whether to trigger summarization
5956
SendResponse bool // Whether to send response via bus
6057
NoHistory bool // If true, don't load session history (for heartbeat)
@@ -499,7 +496,6 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
499496
Channel: msg.Channel,
500497
ChatID: msg.ChatID,
501498
UserMessage: msg.Content,
502-
Media: msg.Media,
503499
DefaultResponse: defaultResponse,
504500
EnableSummary: true,
505501
SendResponse: false,
@@ -606,11 +602,10 @@ func (al *AgentLoop) runAgentLoop(
606602
history,
607603
summary,
608604
opts.UserMessage,
609-
opts.Media,
605+
nil,
610606
opts.Channel,
611607
opts.ChatID,
612608
)
613-
messages = resolveMediaRefs(messages, al.mediaStore)
614609

615610
// 3. Save user message to session
616611
agent.Sessions.AddMessage(opts.SessionKey, "user", opts.UserMessage)
@@ -1481,105 +1476,3 @@ func extractParentPeer(msg bus.InboundMessage) *routing.RoutePeer {
14811476
}
14821477
return &routing.RoutePeer{Kind: parentKind, ID: parentID}
14831478
}
1484-
1485-
// maxMediaFileSize is the maximum file size (20 MB) for media resolution.
1486-
// Files larger than this are skipped to prevent OOM under concurrent load.
1487-
const maxMediaFileSize = 20 * 1024 * 1024
1488-
1489-
// resolveMediaRefs replaces media:// refs in message Media fields with base64 data URLs.
1490-
// Returns a new slice with resolved URLs; original messages are not mutated.
1491-
func resolveMediaRefs(messages []providers.Message, store media.MediaStore) []providers.Message {
1492-
if store == nil {
1493-
return messages
1494-
}
1495-
1496-
result := make([]providers.Message, len(messages))
1497-
copy(result, messages)
1498-
1499-
for i, m := range result {
1500-
if len(m.Media) == 0 {
1501-
continue
1502-
}
1503-
1504-
resolved := make([]string, 0, len(m.Media))
1505-
for _, ref := range m.Media {
1506-
if !strings.HasPrefix(ref, "media://") {
1507-
resolved = append(resolved, ref)
1508-
continue
1509-
}
1510-
1511-
localPath, meta, err := store.ResolveWithMeta(ref)
1512-
if err != nil {
1513-
logger.WarnCF("agent", "Failed to resolve media ref", map[string]any{
1514-
"ref": ref,
1515-
"error": err.Error(),
1516-
})
1517-
continue
1518-
}
1519-
1520-
info, err := os.Stat(localPath)
1521-
if err != nil {
1522-
logger.WarnCF("agent", "Failed to stat media file", map[string]any{
1523-
"path": localPath,
1524-
"error": err.Error(),
1525-
})
1526-
continue
1527-
}
1528-
if info.Size() > maxMediaFileSize {
1529-
logger.WarnCF("agent", "Media file too large, skipping", map[string]any{
1530-
"path": localPath,
1531-
"size": info.Size(),
1532-
"max_size": maxMediaFileSize,
1533-
})
1534-
continue
1535-
}
1536-
1537-
data, err := os.ReadFile(localPath)
1538-
if err != nil {
1539-
logger.WarnCF("agent", "Failed to read media file", map[string]any{
1540-
"path": localPath,
1541-
"error": err.Error(),
1542-
})
1543-
continue
1544-
}
1545-
1546-
mime := meta.ContentType
1547-
if mime == "" {
1548-
mime = mimeFromExtension(filepath.Ext(localPath))
1549-
}
1550-
if mime == "" {
1551-
logger.WarnCF("agent", "Unknown media type, skipping", map[string]any{
1552-
"path": localPath,
1553-
"ext": filepath.Ext(localPath),
1554-
})
1555-
continue
1556-
}
1557-
1558-
dataURL := "data:" + mime + ";base64," + base64.StdEncoding.EncodeToString(data)
1559-
resolved = append(resolved, dataURL)
1560-
}
1561-
1562-
result[i].Media = resolved
1563-
}
1564-
1565-
return result
1566-
}
1567-
1568-
// mimeFromExtension returns a MIME type for common image extensions.
1569-
// Returns empty string for unrecognized extensions.
1570-
func mimeFromExtension(ext string) string {
1571-
switch strings.ToLower(ext) {
1572-
case ".jpg", ".jpeg":
1573-
return "image/jpeg"
1574-
case ".png":
1575-
return "image/png"
1576-
case ".gif":
1577-
return "image/gif"
1578-
case ".webp":
1579-
return "image/webp"
1580-
case ".bmp":
1581-
return "image/bmp"
1582-
default:
1583-
return ""
1584-
}
1585-
}

pkg/agent/loop_test.go

Lines changed: 0 additions & 123 deletions
Original file line number | Diff line number | Diff line change
@@ -6,14 +6,12 @@ import (
66
"os"
77
"path/filepath"
88
"slices"
9-
"strings"
109
"testing"
1110
"time"
1211

1312
"github.com/sipeed/picoclaw/pkg/bus"
1413
"github.com/sipeed/picoclaw/pkg/channels"
1514
"github.com/sipeed/picoclaw/pkg/config"
16-
"github.com/sipeed/picoclaw/pkg/media"
1715
"github.com/sipeed/picoclaw/pkg/providers"
1816
"github.com/sipeed/picoclaw/pkg/tools"
1917
)
@@ -810,124 +808,3 @@ func TestHandleReasoning(t *testing.T) {
810808
}
811809
})
812810
}
813-
814-
func TestMimeFromExtension(t *testing.T) {
815-
tests := []struct {
816-
ext string
817-
want string
818-
}{
819-
{".jpg", "image/jpeg"},
820-
{".JPEG", "image/jpeg"},
821-
{".png", "image/png"},
822-
{".gif", "image/gif"},
823-
{".webp", "image/webp"},
824-
{".bmp", "image/bmp"},
825-
{".txt", ""},
826-
{".pdf", ""},
827-
{"", ""},
828-
}
829-
for _, tt := range tests {
830-
if got := mimeFromExtension(tt.ext); got != tt.want {
831-
t.Errorf("mimeFromExtension(%q) = %q, want %q", tt.ext, got, tt.want)
832-
}
833-
}
834-
}
835-
836-
func TestResolveMediaRefs_NilStore(t *testing.T) {
837-
msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{"media://abc"}}}
838-
result := resolveMediaRefs(msgs, nil)
839-
if result[0].Media[0] != "media://abc" {
840-
t.Error("nil store should return messages unchanged")
841-
}
842-
}
843-
844-
func TestResolveMediaRefs_NonMediaRef(t *testing.T) {
845-
msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{"https://example.com/img.png"}}}
846-
result := resolveMediaRefs(msgs, media.NewFileMediaStore())
847-
if result[0].Media[0] != "https://example.com/img.png" {
848-
t.Error("non-media:// refs should be passed through unchanged")
849-
}
850-
}
851-
852-
func TestResolveMediaRefs_ResolvesToBase64(t *testing.T) {
853-
store := media.NewFileMediaStore()
854-
855-
imgPath := filepath.Join(t.TempDir(), "test.png")
856-
if err := os.WriteFile(imgPath, []byte("fake-png-data"), 0o644); err != nil {
857-
t.Fatal(err)
858-
}
859-
860-
ref, err := store.Store(imgPath, media.MediaMeta{ContentType: "image/png"}, "test")
861-
if err != nil {
862-
t.Fatal(err)
863-
}
864-
865-
msgs := []providers.Message{{Role: "user", Content: "describe", Media: []string{ref}}}
866-
result := resolveMediaRefs(msgs, store)
867-
868-
if len(result[0].Media) != 1 {
869-
t.Fatalf("expected 1 resolved media, got %d", len(result[0].Media))
870-
}
871-
if !strings.HasPrefix(result[0].Media[0], "data:image/png;base64,") {
872-
t.Errorf("expected data URL, got %s", result[0].Media[0][:40])
873-
}
874-
}
875-
876-
func TestResolveMediaRefs_SkipsOversizedFile(t *testing.T) {
877-
store := media.NewFileMediaStore()
878-
879-
bigPath := filepath.Join(t.TempDir(), "big.jpg")
880-
if err := os.WriteFile(bigPath, make([]byte, maxMediaFileSize+1), 0o644); err != nil {
881-
t.Fatal(err)
882-
}
883-
884-
ref, err := store.Store(bigPath, media.MediaMeta{ContentType: "image/jpeg"}, "test")
885-
if err != nil {
886-
t.Fatal(err)
887-
}
888-
889-
msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{ref}}}
890-
result := resolveMediaRefs(msgs, store)
891-
892-
if len(result[0].Media) != 0 {
893-
t.Error("oversized file should be skipped")
894-
}
895-
}
896-
897-
func TestResolveMediaRefs_SkipsUnknownExtension(t *testing.T) {
898-
store := media.NewFileMediaStore()
899-
900-
txtPath := filepath.Join(t.TempDir(), "readme.txt")
901-
if err := os.WriteFile(txtPath, []byte("hello"), 0o644); err != nil {
902-
t.Fatal(err)
903-
}
904-
905-
ref, err := store.Store(txtPath, media.MediaMeta{}, "test")
906-
if err != nil {
907-
t.Fatal(err)
908-
}
909-
910-
msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{ref}}}
911-
result := resolveMediaRefs(msgs, store)
912-
913-
if len(result[0].Media) != 0 {
914-
t.Error("unknown extension with no ContentType should be skipped")
915-
}
916-
}
917-
918-
func TestResolveMediaRefs_DoesNotMutateOriginal(t *testing.T) {
919-
store := media.NewFileMediaStore()
920-
921-
imgPath := filepath.Join(t.TempDir(), "test.jpg")
922-
if err := os.WriteFile(imgPath, []byte("data"), 0o644); err != nil {
923-
t.Fatal(err)
924-
}
925-
926-
ref, _ := store.Store(imgPath, media.MediaMeta{ContentType: "image/jpeg"}, "test")
927-
original := []providers.Message{{Role: "user", Content: "hi", Media: []string{ref}}}
928-
resolveMediaRefs(original, store)
929-
930-
if !strings.HasPrefix(original[0].Media[0], "media://") {
931-
t.Error("original message should not be mutated")
932-
}
933-
}

pkg/providers/openai_compat/provider.go

Lines changed: 1 addition & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -116,7 +116,7 @@ func (p *Provider) Chat(
116116

117117
requestBody := map[string]any{
118118
"model": model,
119-
"messages": serializeMessages(messages),
119+
"messages": stripSystemParts(messages),
120120
}
121121

122122
if len(tools) > 0 {
@@ -195,60 +195,6 @@ func (p *Provider) Chat(
195195
return parseResponse(body)
196196
}
197197

198-
func serializeMessages(messages []Message) []map[string]interface{} {
199-
result := make([]map[string]interface{}, 0, len(messages))
200-
for _, m := range messages {
201-
if len(m.Media) == 0 {
202-
msg := map[string]interface{}{
203-
"role": m.Role,
204-
"content": m.Content,
205-
}
206-
if m.ToolCallID != "" {
207-
msg["tool_call_id"] = m.ToolCallID
208-
}
209-
if len(m.ToolCalls) > 0 {
210-
msg["tool_calls"] = m.ToolCalls
211-
}
212-
if m.ReasoningContent != "" {
213-
msg["reasoning_content"] = m.ReasoningContent
214-
}
215-
result = append(result, msg)
216-
continue
217-
}
218-
219-
parts := make([]map[string]interface{}, 0, 1+len(m.Media))
220-
if m.Content != "" {
221-
parts = append(parts, map[string]interface{}{
222-
"type": "text",
223-
"text": m.Content,
224-
})
225-
}
226-
for _, mediaURL := range m.Media {
227-
parts = append(parts, map[string]interface{}{
228-
"type": "image_url",
229-
"image_url": map[string]interface{}{
230-
"url": mediaURL,
231-
},
232-
})
233-
}
234-
msg := map[string]interface{}{
235-
"role": m.Role,
236-
"content": parts,
237-
}
238-
if m.ToolCallID != "" {
239-
msg["tool_call_id"] = m.ToolCallID
240-
}
241-
if len(m.ToolCalls) > 0 {
242-
msg["tool_calls"] = m.ToolCalls
243-
}
244-
if m.ReasoningContent != "" {
245-
msg["reasoning_content"] = m.ReasoningContent
246-
}
247-
result = append(result, msg)
248-
}
249-
return result
250-
}
251-
252198
func parseResponse(body []byte) (*LLMResponse, error) {
253199
var apiResponse struct {
254200
Choices []struct {

0 commit comments

Comments
 (0)