-
Notifications
You must be signed in to change notification settings - Fork 3.8k
feat(agent): add vision/image support to agent pipeline #990
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3d54a77
6997edc
a4e5c39
18b36af
8ebeefc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,9 +8,11 @@ package agent | |
|
|
||
| import ( | ||
| "context" | ||
| "encoding/base64" | ||
| "encoding/json" | ||
| "errors" | ||
| "fmt" | ||
| "os" | ||
| "path/filepath" | ||
| "strings" | ||
| "sync" | ||
|
|
@@ -46,11 +48,12 @@ type AgentLoop struct { | |
|
|
||
| // processOptions configures how a message is processed | ||
| type processOptions struct { | ||
| SessionKey string // Session identifier for history/context | ||
| Channel string // Target channel for tool execution | ||
| ChatID string // Target chat ID for tool execution | ||
| UserMessage string // User message content (may include prefix) | ||
| DefaultResponse string // Response when LLM returns empty | ||
| SessionKey string // Session identifier for history/context | ||
| Channel string // Target channel for tool execution | ||
| ChatID string // Target chat ID for tool execution | ||
| UserMessage string // User message content (may include prefix) | ||
| Media []string // Media URLs attached to the user message | ||
| DefaultResponse string // Response when LLM returns empty | ||
| EnableSummary bool // Whether to trigger summarization | ||
| SendResponse bool // Whether to send response via bus | ||
| NoHistory bool // If true, don't load session history (for heartbeat) | ||
|
|
@@ -417,6 +420,7 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage) | |
| Channel: msg.Channel, | ||
| ChatID: msg.ChatID, | ||
| UserMessage: msg.Content, | ||
| Media: msg.Media, | ||
| DefaultResponse: defaultResponse, | ||
| EnableSummary: true, | ||
| SendResponse: false, | ||
|
|
@@ -509,10 +513,11 @@ func (al *AgentLoop) runAgentLoop(ctx context.Context, agent *AgentInstance, opt | |
| history, | ||
| summary, | ||
| opts.UserMessage, | ||
| nil, | ||
| opts.Media, | ||
| opts.Channel, | ||
| opts.ChatID, | ||
| ) | ||
| messages = resolveMediaRefs(messages, al.mediaStore) | ||
|
|
||
| // 3. Save user message to session | ||
| agent.Sessions.AddMessage(opts.SessionKey, "user", opts.UserMessage) | ||
|
|
@@ -1350,3 +1355,105 @@ func extractParentPeer(msg bus.InboundMessage) *routing.RoutePeer { | |
| } | ||
| return &routing.RoutePeer{Kind: parentKind, ID: parentID} | ||
| } | ||
|
|
||
| // maxMediaFileSize is the maximum file size (20 MB) for media resolution. | ||
| // Files larger than this are skipped to prevent OOM under concurrent load. | ||
| const maxMediaFileSize = 20 * 1024 * 1024 | ||
|
|
||
| // resolveMediaRefs replaces media:// refs in message Media fields with base64 data URLs. | ||
| // Returns a new slice with resolved URLs; original messages are not mutated. | ||
| func resolveMediaRefs(messages []providers.Message, store media.MediaStore) []providers.Message { | ||
| if store == nil { | ||
| return messages | ||
| } | ||
|
|
||
| result := make([]providers.Message, len(messages)) | ||
| copy(result, messages) | ||
|
|
||
| for i, m := range result { | ||
| if len(m.Media) == 0 { | ||
| continue | ||
| } | ||
|
|
||
| resolved := make([]string, 0, len(m.Media)) | ||
| for _, ref := range m.Media { | ||
| if !strings.HasPrefix(ref, "media://") { | ||
| resolved = append(resolved, ref) | ||
| continue | ||
| } | ||
|
|
||
| localPath, meta, err := store.ResolveWithMeta(ref) | ||
| if err != nil { | ||
| logger.WarnCF("agent", "Failed to resolve media ref", map[string]any{ | ||
| "ref": ref, | ||
| "error": err.Error(), | ||
| }) | ||
| continue | ||
| } | ||
|
|
||
| info, err := os.Stat(localPath) | ||
| if err != nil { | ||
| logger.WarnCF("agent", "Failed to stat media file", map[string]any{ | ||
| "path": localPath, | ||
| "error": err.Error(), | ||
| }) | ||
| continue | ||
| } | ||
| if info.Size() > maxMediaFileSize { | ||
| logger.WarnCF("agent", "Media file too large, skipping", map[string]any{ | ||
| "path": localPath, | ||
| "size": info.Size(), | ||
| "max_size": maxMediaFileSize, | ||
| }) | ||
| continue | ||
| } | ||
|
|
||
| data, err := os.ReadFile(localPath) | ||
| if err != nil { | ||
| logger.WarnCF("agent", "Failed to read media file", map[string]any{ | ||
| "path": localPath, | ||
| "error": err.Error(), | ||
| }) | ||
| continue | ||
| } | ||
|
|
||
| mime := meta.ContentType | ||
| if mime == "" { | ||
| mime = mimeFromExtension(filepath.Ext(localPath)) | ||
| } | ||
| if mime == "" { | ||
| logger.WarnCF("agent", "Unknown media type, skipping", map[string]any{ | ||
| "path": localPath, | ||
| "ext": filepath.Ext(localPath), | ||
| }) | ||
| continue | ||
| } | ||
|
|
||
| dataURL := "data:" + mime + ";base64," + base64.StdEncoding.EncodeToString(data) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could we keep a file handle and use a streaming encoder for later use, instead of allocating 2x the memory for media files? |
||
| resolved = append(resolved, dataURL) | ||
| } | ||
|
|
||
| result[i].Media = resolved | ||
| } | ||
|
|
||
| return result | ||
| } | ||
|
|
||
| // mimeFromExtension returns a MIME type for common image extensions. | ||
| // Returns empty string for unrecognized extensions. | ||
| func mimeFromExtension(ext string) string { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please use github.com/h2non/filetype for file type detection. |
||
| switch strings.ToLower(ext) { | ||
| case ".jpg", ".jpeg": | ||
| return "image/jpeg" | ||
| case ".png": | ||
| return "image/png" | ||
| case ".gif": | ||
| return "image/gif" | ||
| case ".webp": | ||
| return "image/webp" | ||
| case ".bmp": | ||
| return "image/bmp" | ||
| default: | ||
| return "" | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move the size limit into the config instead of hard-coding it.