sipeed · as3k · Feb 20, 2026 · Feb 20, 2026 · Copilot · Feb 20, 2026
diff --git a/pkg/agent/context.go b/pkg/agent/context.go
@@ -198,10 +198,11 @@ func (cb *ContextBuilder) BuildMessages(history []providers.Message, summary str
 
 	messages = append(messages, history...)
 
-	if strings.TrimSpace(currentMessage) != "" {
+	if strings.TrimSpace(currentMessage) != "" || len(media) > 0 {
 		messages = append(messages, providers.Message{
 			Role:    "user",
 			Content: currentMessage,
+			Media:   media,
 		})
 	}
 

diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go
@@ -42,14 +42,15 @@ type AgentLoop struct {
 
 // processOptions configures how a message is processed
 type processOptions struct {
-	SessionKey      string // Session identifier for history/context
-	Channel         string // Target channel for tool execution
-	ChatID          string // Target chat ID for tool execution
-	UserMessage     string // User message content (may include prefix)
-	DefaultResponse string // Response when LLM returns empty
-	EnableSummary   bool   // Whether to trigger summarization
-	SendResponse    bool   // Whether to send response via bus
-	NoHistory       bool   // If true, don't load session history (for heartbeat)
+	SessionKey      string   // Session identifier for history/context
+	Channel         string   // Target channel for tool execution
+	ChatID          string   // Target chat ID for tool execution
+	UserMessage     string   // User message content (may include prefix)
+	Media           []string // Media URLs attached to the user message; processMessage fills it from msg.Media
+	DefaultResponse string   // Response when LLM returns empty
+	EnableSummary   bool     // Whether to trigger summarization
+	SendResponse    bool     // Whether to send response via bus
+	NoHistory       bool     // If true, don't load session history (for heartbeat)
 }
 
 func NewAgentLoop(cfg *config.Config, msgBus *bus.MessageBus, provider providers.LLMProvider) *AgentLoop {
@@ -313,6 +314,7 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
 		Channel:         msg.Channel,
 		ChatID:          msg.ChatID,
 		UserMessage:     msg.Content,
+		Media:           msg.Media,
 		DefaultResponse: "I've completed processing but have no response to give.",
 		EnableSummary:   true,
 		SendResponse:    false,
@@ -402,7 +404,7 @@ func (al *AgentLoop) runAgentLoop(ctx context.Context, agent *AgentInstance, opt
 		history,
 		summary,
 		opts.UserMessage,
-		nil,
+		opts.Media,
 		opts.Channel,
 		opts.ChatID,
 	)

diff --git a/pkg/providers/openai_compat/provider.go b/pkg/providers/openai_compat/provider.go
@@ -69,7 +69,7 @@ func (p *Provider) Chat(ctx context.Context, messages []Message, tools []ToolDef
 
 	requestBody := map[string]interface{}{
 		"model":    model,
-		"messages": messages,
+		"messages": serializeMessages(messages),
 	}
 
 	if len(tools) > 0 {
@@ -135,6 +135,47 @@ func (p *Provider) Chat(ctx context.Context, messages []Message, tools []ToolDef
 	return parseResponse(body)
 }
 
+// serializeMessages converts messages into OpenAI-style request maps. A message
+// with media gets a content array of text and image_url parts; otherwise content
+// stays a plain string. tool_call_id and tool_calls are kept in both cases.
+func serializeMessages(messages []Message) []map[string]interface{} {
+	result := make([]map[string]interface{}, 0, len(messages))
+	for _, m := range messages {
+		// Start from the plain string form; swapped for parts when media exists.
+		var content interface{} = m.Content
+		if len(m.Media) > 0 {
+			parts := make([]map[string]interface{}, 0, 1+len(m.Media))
+			if m.Content != "" {
+				parts = append(parts, map[string]interface{}{
+					"type": "text",
+					"text": m.Content,
+				})
+			}
+			for _, mediaURL := range m.Media {
+				parts = append(parts, map[string]interface{}{
+					"type": "image_url",
+					"image_url": map[string]interface{}{
+						"url": mediaURL,
+					},
+				})
+			}
+			content = parts
+		}
+		msg := map[string]interface{}{
+			"role":    m.Role,
+			"content": content,
+		}
+		if m.ToolCallID != "" {
+			msg["tool_call_id"] = m.ToolCallID
+		}
+		if len(m.ToolCalls) > 0 {
+			msg["tool_calls"] = m.ToolCalls
+		}
+		result = append(result, msg)
+	}
+	return result
+}
+
 func parseResponse(body []byte) (*LLMResponse, error) {
 	var apiResponse struct {
 		Choices []struct {

diff --git a/pkg/providers/protocoltypes/types.go b/pkg/providers/protocoltypes/types.go
@@ -40,6 +40,7 @@ type UsageInfo struct {
 type Message struct {
 	Role       string     `json:"role"`
 	Content    string     `json:"content"`
+	Media      []string   `json:"media,omitempty"` // URLs of images or other media attachments; omitted from JSON when empty
 	ToolCalls  []ToolCall `json:"tool_calls,omitempty"`
 	ToolCallID string     `json:"tool_call_id,omitempty"`
 }