// BrowserConfig holds the settings for the agent-browser backed "browser" tool.
// The values are handed to tools.BrowserToolOptions when the tool is registered.
type BrowserConfig struct {
	// Enabled controls whether the browser tool is registered at all.
	// DefaultConfig leaves it off.
	Enabled bool `json:"enabled" env:"PICOCLAW_TOOLS_BROWSER_ENABLED"`

	// Session is an optional agent-browser session name; an empty value means
	// no --session flag is passed.
	Session string `json:"session" env:"PICOCLAW_TOOLS_BROWSER_SESSION"`

	// Headless selects headless mode. DefaultConfig sets it to true; when it
	// is false the tool passes --headed to agent-browser.
	Headless bool `json:"headless" env:"PICOCLAW_TOOLS_BROWSER_HEADLESS"`

	// Timeout is the per-command timeout in seconds. DefaultConfig uses 30,
	// and non-positive values fall back to 30 inside the tool.
	Timeout int `json:"timeout" env:"PICOCLAW_TOOLS_BROWSER_TIMEOUT"`

	// CDPPort is the port passed to agent-browser's --cdp flag. DefaultConfig
	// uses 9222, and non-positive values fall back to 9222 inside the tool.
	CDPPort int `json:"cdp_port" env:"PICOCLAW_TOOLS_BROWSER_CDP_PORT"`
}
// BrowserToolOptions carries the caller-supplied settings for a BrowserTool.
type BrowserToolOptions struct {
	// Session is an optional agent-browser session name used for isolation.
	Session string

	// Headless requests headless mode. The zero value is false, which makes
	// the tool pass --headed; callers wanting headless must set it explicitly.
	Headless bool

	// Timeout is the per-command timeout in seconds; values <= 0 select 30.
	Timeout int

	// CDPPort is the Chrome DevTools Protocol port; values <= 0 select 9222.
	CDPPort int
}

// BrowserTool exposes the external agent-browser CLI as a tool. All browser
// handling is delegated to that binary; this type only stores the settings
// used to build each invocation.
type BrowserTool struct {
	session  string
	headless bool
	timeout  time.Duration
	cdpPort  int
}

// NewBrowserTool builds a BrowserTool from opts, starting from the defaults
// (30 second timeout, CDP port 9222) and overriding each one only when the
// corresponding option is positive.
func NewBrowserTool(opts BrowserToolOptions) *BrowserTool {
	tool := &BrowserTool{
		session:  opts.Session,
		headless: opts.Headless,
		timeout:  30 * time.Second,
		cdpPort:  9222,
	}
	if opts.Timeout > 0 {
		tool.timeout = time.Duration(opts.Timeout) * time.Second
	}
	if opts.CDPPort > 0 {
		tool.cdpPort = opts.CDPPort
	}
	return tool
}

// Name returns the identifier under which the tool is registered.
func (t *BrowserTool) Name() string {
	return "browser"
}
+ browser click @e2 → Click element by ref + browser fill @e3 "text" → Fill input by ref + browser type @e3 "text" → Type into element + browser press Enter → Press a key + browser screenshot [path] → Take screenshot + browser get text @e1 → Get text content of element + browser get title → Get page title + browser get url → Get current URL + browser eval "js code" → Run JavaScript + browser scroll down [px] → Scroll page + browser wait → Wait for element or time + browser close → Close browser + +CSS selectors also work: browser click "#submit" + +Examples: + command: "open https://example.com" + command: "snapshot -i" + command: "click @e2" + command: "fill @e3 \"user@example.com\"" + command: "get title" + command: "screenshot /tmp/page.png" + command: "close"` +} + +func (t *BrowserTool) Parameters() map[string]interface{} { + return map[string]interface{}{ + "type": "object", + "properties": map[string]interface{}{ + "command": map[string]interface{}{ + "type": "string", + "description": "The agent-browser subcommand to execute (e.g. 'open https://example.com', 'snapshot -i', 'click @e2')", + }, + }, + "required": []string{"command"}, + } +} + +func (t *BrowserTool) Execute(ctx context.Context, args map[string]interface{}) *ToolResult { + command, ok := args["command"].(string) + if !ok || strings.TrimSpace(command) == "" { + return ErrorResult("command is required (e.g. 'open https://example.com')") + } + + // Build the full agent-browser command line + cmdArgs := t.buildArgs(command) + + cmdCtx, cancel := context.WithTimeout(ctx, t.timeout) + defer cancel() + + cmd := exec.CommandContext(cmdCtx, "agent-browser", cmdArgs...) + + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + output := stdout.String() + if stderr.Len() > 0 { + errOut := stderr.String() + // Filter out noise from stderr (daemon startup messages, etc.) 
+ if !strings.Contains(errOut, "Daemon started") { + if output != "" { + output += "\n" + } + output += errOut + } + } + + if err != nil { + if cmdCtx.Err() == context.DeadlineExceeded { + msg := fmt.Sprintf("Browser command timed out after %v: %s", t.timeout, command) + return &ToolResult{ + ForLLM: msg, + ForUser: msg, + IsError: true, + } + } + // Include output even on error — agent-browser often puts useful info in stdout + if output == "" { + output = fmt.Sprintf("command failed: %v", err) + } else { + output += fmt.Sprintf("\nExit code: %v", err) + } + } + + if output == "" { + output = "(no output)" + } + + // Truncate long output + maxLen := 10000 + if len(output) > maxLen { + output = output[:maxLen] + fmt.Sprintf("\n... (truncated, %d more chars)", len(output)-maxLen) + } + + if err != nil { + return &ToolResult{ + ForLLM: output, + ForUser: output, + IsError: true, + } + } + + return &ToolResult{ + ForLLM: output, + ForUser: output, + IsError: false, + } +} + +// buildArgs constructs the argument list for the agent-browser command. +// It splits the user command string and prepends global flags. +func (t *BrowserTool) buildArgs(command string) []string { + var globalArgs []string + + // Add CDP port + globalArgs = append(globalArgs, "--cdp", fmt.Sprintf("%d", t.cdpPort)) + + // Add session flag if configured + if t.session != "" { + globalArgs = append(globalArgs, "--session", t.session) + } + + // Add --headed if not headless (agent-browser defaults to headless) + if !t.headless { + globalArgs = append(globalArgs, "--headed") + } + + // Add --json for machine-readable output + globalArgs = append(globalArgs, "--json") + + // Parse the command string into arguments, respecting quotes + cmdArgs := splitCommand(command) + + return append(globalArgs, cmdArgs...) +} + +// splitCommand splits a command string into arguments, respecting quoted strings. 
// splitCommand splits a command string into arguments, respecting quotes.
//
// Rules:
//   - Unquoted spaces, tabs, newlines and carriage returns separate arguments.
//   - Text inside '...' or "..." is kept verbatim, and the quotes are removed.
//     Quoted and unquoted parts that touch form one argument (a"b c" -> "ab c").
//   - An explicitly empty quoted string ("" or '') yields an empty argument
//     instead of being dropped, so commands such as `fill @e3 ""` work.
//   - Inside double quotes, \" and \\ produce a literal quote and backslash.
//     Every other backslash, including all backslashes outside quotes or
//     inside single quotes, is kept as written.
//   - An unterminated quote is tolerated: the text collected so far becomes
//     the final argument.
func splitCommand(command string) []string {
	var args []string
	var current strings.Builder
	// quote is the active quote character, or 0 outside quotes.
	var quote byte
	// hasToken records that a quoted section was closed for the pending
	// argument, so that an empty "" still produces an argument.
	hasToken := false

	flush := func() {
		if hasToken || current.Len() > 0 {
			args = append(args, current.String())
		}
		current.Reset()
		hasToken = false
	}

	for i := 0; i < len(command); i++ {
		ch := command[i]

		if quote != 0 {
			switch {
			case ch == quote:
				quote = 0
				hasToken = true
			case ch == '\\' && quote == '"' && i+1 < len(command) &&
				(command[i+1] == '"' || command[i+1] == '\\'):
				// Escaped quote or backslash: emit the escaped byte only.
				i++
				current.WriteByte(command[i])
			default:
				current.WriteByte(ch)
			}
			continue
		}

		switch ch {
		case '"', '\'':
			quote = ch
		case ' ', '\t', '\n', '\r':
			flush()
		default:
			current.WriteByte(ch)
		}
	}
	flush()

	return args
}
*testing.T) { + tool := NewBrowserTool(BrowserToolOptions{}) + ctx := context.Background() + + // Empty args + result := tool.Execute(ctx, map[string]interface{}{}) + if !result.IsError { + t.Error("Expected error for missing command") + } + + // Empty string + result = tool.Execute(ctx, map[string]interface{}{"command": ""}) + if !result.IsError { + t.Error("Expected error for empty command") + } + + // Whitespace only + result = tool.Execute(ctx, map[string]interface{}{"command": " "}) + if !result.IsError { + t.Error("Expected error for whitespace-only command") + } +} + +func TestBrowserTool_BuildArgs(t *testing.T) { + tests := []struct { + name string + session string + command string + wantArgs []string + }{ + { + name: "simple command", + command: "open https://example.com", + wantArgs: []string{"--cdp", "9222", "--headed", "--json", "open", "https://example.com"}, + }, + { + name: "with session", + session: "test-session", + command: "snapshot -i", + wantArgs: []string{"--cdp", "9222", "--session", "test-session", "--headed", "--json", "snapshot", "-i"}, + }, + { + name: "quoted arguments", + command: `fill @e3 "hello world"`, + wantArgs: []string{"--cdp", "9222", "--headed", "--json", "fill", "@e3", "hello world"}, + }, + { + name: "single quoted", + command: `fill @e3 'hello world'`, + wantArgs: []string{"--cdp", "9222", "--headed", "--json", "fill", "@e3", "hello world"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tool := NewBrowserTool(BrowserToolOptions{Session: tt.session}) + got := tool.buildArgs(tt.command) + + if len(got) != len(tt.wantArgs) { + t.Errorf("buildArgs(%q) = %v (len %d), want %v (len %d)", + tt.command, got, len(got), tt.wantArgs, len(tt.wantArgs)) + return + } + + for i := range got { + if got[i] != tt.wantArgs[i] { + t.Errorf("buildArgs(%q)[%d] = %q, want %q", + tt.command, i, got[i], tt.wantArgs[i]) + } + } + }) + } +} + +func TestSplitCommand(t *testing.T) { + tests := []struct { + input string 
+ want []string + }{ + {"open https://example.com", []string{"open", "https://example.com"}}, + {`fill @e3 "test@example.com"`, []string{"fill", "@e3", "test@example.com"}}, + {"snapshot -i -c -d 3", []string{"snapshot", "-i", "-c", "-d", "3"}}, + {`eval "document.title"`, []string{"eval", "document.title"}}, + {" click @e2 ", []string{"click", "@e2"}}, + {`get text @e1`, []string{"get", "text", "@e1"}}, + } + + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + got := splitCommand(tt.input) + if len(got) != len(tt.want) { + t.Errorf("splitCommand(%q) = %v, want %v", tt.input, got, tt.want) + return + } + for i := range got { + if got[i] != tt.want[i] { + t.Errorf("splitCommand(%q)[%d] = %q, want %q", tt.input, i, got[i], tt.want[i]) + } + } + }) + } +}