feat: add browser automation tool via agent-browser CLI

Integrate agent-browser CLI as a lightweight browser automation tool. Instead of embedding browser dependencies, this wraps the external agent-browser binary via exec.CommandContext, keeping PicoClaw lean.

Changes:
- Add BrowserTool (pkg/tools/browser.go) wrapping agent-browser CLI
- Add BrowserConfig to config with enabled, session, headless, timeout, cdp_port
- Register browser tool conditionally in agent loop
- Add unit tests for argument building, command splitting, error handling

The tool accepts a single 'command' parameter and delegates to agent-browser. Default CDP port is 9222. Zero new Go dependencies - all stdlib imports.
2026-05-25 16:00:35 +00:00 · 2026-02-16 22:38:02 +07:00
4 changed files with 405 additions and 1 deletions
@@ -84,6 +84,16 @@ func createToolRegistry(workspace string, restrict bool, cfg *config.Config, msg
 	}
 	registry.Register(tools.NewWebFetchTool(50000))

+	// Browser automation tool (agent-browser CLI)
+	if cfg.Tools.Browser.Enabled {
+		registry.Register(tools.NewBrowserTool(tools.BrowserToolOptions{
+			Session:  cfg.Tools.Browser.Session,
+			Headless: cfg.Tools.Browser.Headless,
+			Timeout:  cfg.Tools.Browser.Timeout,
+			CDPPort:  cfg.Tools.Browser.CDPPort,
+		}))
+	}
+
 	// Hardware tools (I2C, SPI) - Linux only, returns error on other platforms
 	registry.Register(tools.NewI2CTool())
 	registry.Register(tools.NewSPITool())
@@ -211,8 +211,17 @@ type WebToolsConfig struct {
 	DuckDuckGo DuckDuckGoConfig `json:"duckduckgo"`
 }

+// BrowserConfig holds the settings for the agent-browser based browser tool.
+// The struct tags give the JSON key and the environment variable name for
+// each field.
+//
+// Enabled gates registration of the tool. Session, Headless, Timeout and
+// CDPPort are handed to tools.NewBrowserTool unchanged; Timeout is in seconds.
+// DefaultConfig sets Enabled=false, Headless=true, Timeout=30, CDPPort=9222.
+type BrowserConfig struct {
+	Enabled  bool   `json:"enabled" env:"PICOCLAW_TOOLS_BROWSER_ENABLED"`
+	Session  string `json:"session" env:"PICOCLAW_TOOLS_BROWSER_SESSION"`
+	Headless bool   `json:"headless" env:"PICOCLAW_TOOLS_BROWSER_HEADLESS"`
+	Timeout  int    `json:"timeout" env:"PICOCLAW_TOOLS_BROWSER_TIMEOUT"`
+	CDPPort  int    `json:"cdp_port" env:"PICOCLAW_TOOLS_BROWSER_CDP_PORT"`
+}
+
+// ToolsConfig groups the configuration sections for the web tools and the
+// browser tool.
 type ToolsConfig struct {
-	Web WebToolsConfig `json:"web"`
+	Web     WebToolsConfig `json:"web"`
+	Browser BrowserConfig  `json:"browser"`
 }

 func DefaultConfig() *Config {
@@ -322,6 +331,12 @@ func DefaultConfig() *Config {
 					MaxResults: 5,
 				},
 			},
+			Browser: BrowserConfig{
+				Enabled:  false,
+				Headless: true,
+				Timeout:  30,
+				CDPPort:  9222,
+			},
 		},
 		Heartbeat: HeartbeatConfig{
 			Enabled:  true,
@@ -0,0 +1,229 @@
+package tools
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"os/exec"
+	"strings"
+	"time"
+)
+
+// BrowserToolOptions configures the BrowserTool.
+// NewBrowserTool substitutes defaults for a non-positive Timeout or CDPPort.
+// Headless has no such fallback: left at its zero value the browser runs
+// headed, so callers that want headless mode must set it explicitly.
+type BrowserToolOptions struct {
+	Session  string // Session name for isolation; empty omits the --session flag
+	Headless bool   // Run in headless mode; false (the zero value) passes --headed
+	Timeout  int    // Command timeout in seconds (values <= 0 fall back to 30)
+	CDPPort  int    // Chrome DevTools Protocol port (values <= 0 fall back to 9222)
+}
+
+// BrowserTool wraps the agent-browser CLI for browser automation.
+// It delegates all browser complexity to the external `agent-browser` binary,
+// which Execute invokes once per call.
+//
+// Fields: session is the value passed with --session (the flag is omitted
+// when it is empty); headless false adds --headed; timeout bounds a single
+// invocation; cdpPort is the value passed with --cdp.
+type BrowserTool struct {
+	session  string
+	headless bool
+	timeout  time.Duration
+	cdpPort  int
+}
+
+// NewBrowserTool returns a BrowserTool configured from opts.
+// Session and Headless are copied as given. A Timeout or CDPPort that is not
+// positive is replaced by the default of 30 seconds or port 9222.
+func NewBrowserTool(opts BrowserToolOptions) *BrowserTool {
+	tool := &BrowserTool{
+		session:  opts.Session,
+		headless: opts.Headless,
+		timeout:  30 * time.Second,
+		cdpPort:  9222,
+	}
+	if opts.Timeout > 0 {
+		tool.timeout = time.Duration(opts.Timeout) * time.Second
+	}
+	if opts.CDPPort > 0 {
+		tool.cdpPort = opts.CDPPort
+	}
+	return tool
+}
+
+// Name returns the tool's name, "browser".
+func (t *BrowserTool) Name() string {
+	const toolName = "browser"
+	return toolName
+}
+
+// Description returns the usage text for the tool: a summary of the core
+// agent-browser subcommands followed by example values for the 'command'
+// parameter. The workflow lines are written with a leading "browser", while
+// the examples show the form actually passed in 'command' (without it).
+func (t *BrowserTool) Description() string {
+	return `Automate a headless browser via agent-browser CLI. Pass the subcommand as 'command'.
+The browser daemon persists between calls — open a page first, then interact with it.
+
+Core workflow:
+  browser open <url>           → Navigate to URL
+  browser snapshot -i          → Get interactive elements with refs (@e1, @e2, ...)
+  browser click @e2            → Click element by ref
+  browser fill @e3 "text"      → Fill input by ref
+  browser type @e3 "text"      → Type into element
+  browser press Enter          → Press a key
+  browser screenshot [path]    → Take screenshot
+  browser get text @e1         → Get text content of element
+  browser get title            → Get page title
+  browser get url              → Get current URL
+  browser eval "js code"       → Run JavaScript
+  browser scroll down [px]     → Scroll page
+  browser wait <selector|ms>   → Wait for element or time
+  browser close                → Close browser
+
+CSS selectors also work: browser click "#submit"
+
+Examples:
+  command: "open https://example.com"
+  command: "snapshot -i"
+  command: "click @e2"
+  command: "fill @e3 \"user@example.com\""
+  command: "get title"
+  command: "screenshot /tmp/page.png"
+  command: "close"`
+}
+
+// Parameters returns the JSON-schema-style description of the tool's input:
+// an object with a single required string property, "command".
+func (t *BrowserTool) Parameters() map[string]interface{} {
+	commandProp := map[string]interface{}{
+		"type":        "string",
+		"description": "The agent-browser subcommand to execute (e.g. 'open https://example.com', 'snapshot -i', 'click @e2')",
+	}
+	schema := map[string]interface{}{
+		"type":       "object",
+		"properties": map[string]interface{}{"command": commandProp},
+		"required":   []string{"command"},
+	}
+	return schema
+}
+
+// Execute runs one agent-browser subcommand and returns its combined output.
+//
+// args must contain a non-blank string under "command"; otherwise an error
+// result is returned without running anything. The command is turned into an
+// argument list by buildArgs and passed to the "agent-browser" binary. The
+// call is bounded by the tool's timeout in addition to any deadline already
+// carried by ctx.
+//
+// The result text is stdout followed by the filtered stderr, capped at about
+// 10000 bytes. IsError is set when the binary cannot be run, exits with a
+// failure, or the deadline expires.
+func (t *BrowserTool) Execute(ctx context.Context, args map[string]interface{}) *ToolResult {
+	command, ok := args["command"].(string)
+	if !ok || strings.TrimSpace(command) == "" {
+		return ErrorResult("command is required (e.g. 'open https://example.com')")
+	}
+
+	cmdCtx, cancel := context.WithTimeout(ctx, t.timeout)
+	defer cancel()
+
+	// NOTE(review): stdout/stderr are captured through pipes. If agent-browser
+	// leaves a daemon holding those pipes open, Run may not return until they
+	// are closed; Cmd.WaitDelay (Go 1.20+) would bound that — confirm.
+	var stdout, stderr bytes.Buffer
+	cmd := exec.CommandContext(cmdCtx, "agent-browser", t.buildArgs(command)...)
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+	err := cmd.Run()
+
+	if err != nil && cmdCtx.Err() == context.DeadlineExceeded {
+		msg := fmt.Sprintf("Browser command timed out after %v: %s", t.timeout, command)
+		return &ToolResult{ForLLM: msg, ForUser: msg, IsError: true}
+	}
+
+	output := stdout.String()
+	if errOut := filterBrowserStderr(stderr.String()); errOut != "" {
+		if output != "" {
+			output += "\n"
+		}
+		output += errOut
+	}
+
+	if err != nil {
+		// Include output even on error — agent-browser often puts useful info in stdout
+		if output == "" {
+			output = fmt.Sprintf("command failed: %v", err)
+		} else {
+			output += fmt.Sprintf("\nExit code: %v", err)
+		}
+	}
+	if output == "" {
+		output = "(no output)"
+	}
+	output = truncateBrowserOutput(output, 10000)
+
+	return &ToolResult{ForLLM: output, ForUser: output, IsError: err != nil}
+}
+
+// filterBrowserStderr removes daemon start-up noise from stderr text. Text
+// without the "Daemon started" marker is returned unchanged. Otherwise only
+// the lines containing the marker are dropped, so diagnostics printed
+// alongside the banner still reach the caller. If nothing but whitespace
+// remains, the empty string is returned.
+func filterBrowserStderr(errOut string) string {
+	const marker = "Daemon started"
+	if !strings.Contains(errOut, marker) {
+		return errOut
+	}
+	var kept []string
+	for _, line := range strings.Split(errOut, "\n") {
+		if !strings.Contains(line, marker) {
+			kept = append(kept, line)
+		}
+	}
+	filtered := strings.Join(kept, "\n")
+	if strings.TrimSpace(filtered) == "" {
+		return ""
+	}
+	return filtered
+}
+
+// truncateBrowserOutput caps s at maxLen bytes and appends a note giving the
+// number of bytes dropped. The cut point is moved back by up to three bytes
+// so that it never lands inside a multi-byte UTF-8 sequence.
+func truncateBrowserOutput(s string, maxLen int) string {
+	if len(s) <= maxLen {
+		return s
+	}
+	cut := maxLen
+	// Bytes of the form 10xxxxxx are UTF-8 continuation bytes; step back to
+	// the lead byte so the partial character is dropped whole.
+	for cut > 0 && cut > maxLen-3 && s[cut]&0xC0 == 0x80 {
+		cut--
+	}
+	return s[:cut] + fmt.Sprintf("\n... (truncated, %d more chars)", len(s)-cut)
+}
+
+// buildArgs assembles the full agent-browser argument vector for command.
+// Global flags come first, in a fixed order: --cdp <port>, --session <name>
+// (only when a session is set), --headed (only when not headless) and --json.
+// They are followed by the words of command as split by splitCommand.
+func (t *BrowserTool) buildArgs(command string) []string {
+	// The CDP port is always passed.
+	argv := []string{"--cdp", fmt.Sprint(t.cdpPort)}
+
+	if name := t.session; name != "" {
+		argv = append(argv, "--session", name)
+	}
+
+	// agent-browser defaults to headless, so only the headed case needs a flag.
+	if !t.headless {
+		argv = append(argv, "--headed")
+	}
+
+	// --json asks for machine-readable output.
+	argv = append(argv, "--json")
+
+	return append(argv, splitCommand(command)...)
+}
+
+// splitCommand splits a command string into arguments, respecting quoted strings.
+//
+// Arguments are separated by runs of spaces, tabs, newlines or carriage
+// returns. Text inside single or double quotes is taken literally, separators
+// included, and the quotes themselves are removed. A quoted empty string
+// ("" or '') yields an empty argument rather than being dropped, so a command
+// such as `fill @e3 ""` keeps its third argument. Quoted and unquoted pieces
+// that touch are joined into one argument.
+//
+// Backslash escapes are not interpreted, and an unterminated quote extends to
+// the end of the input. A blank command returns nil.
+func splitCommand(command string) []string {
+	var args []string
+	var current strings.Builder
+	// started records that an argument has begun even if no byte has been
+	// written yet, which is what distinguishes "" from no argument at all.
+	started := false
+	quoteChar := byte(0) // 0 while outside quotes, otherwise the opening quote
+
+	flush := func() {
+		if started {
+			args = append(args, current.String())
+			current.Reset()
+			started = false
+		}
+	}
+
+	for i := 0; i < len(command); i++ {
+		ch := command[i]
+		switch {
+		case quoteChar != 0:
+			if ch == quoteChar {
+				quoteChar = 0
+			} else {
+				current.WriteByte(ch)
+			}
+		case ch == '"' || ch == '\'':
+			quoteChar = ch
+			started = true
+		case ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r':
+			flush()
+		default:
+			current.WriteByte(ch)
+			started = true
+		}
+	}
+	flush()
+
+	return args
+}
@@ -0,0 +1,150 @@
+package tools
+
+import (
+	"context"
+	"strings"
+	"testing"
+)
+
+// TestBrowserTool_Name checks that the tool reports the name "browser".
+func TestBrowserTool_Name(t *testing.T) {
+	const want = "browser"
+	if got := NewBrowserTool(BrowserToolOptions{}).Name(); got != want {
+		t.Errorf("Expected name 'browser', got %q", got)
+	}
+}
+
+// TestBrowserTool_Description checks that the description names the
+// underlying CLI and the snapshot subcommand.
+func TestBrowserTool_Description(t *testing.T) {
+	desc := NewBrowserTool(BrowserToolOptions{}).Description()
+
+	checks := []struct{ needle, failure string }{
+		{"agent-browser", "Description should mention agent-browser"},
+		{"snapshot", "Description should mention snapshot command"},
+	}
+	for _, c := range checks {
+		if !strings.Contains(desc, c.needle) {
+			t.Error(c.failure)
+		}
+	}
+}
+
+// TestBrowserTool_Parameters checks the shape of the parameter schema: a
+// "properties" map containing "command", and "required" listing exactly
+// "command".
+func TestBrowserTool_Parameters(t *testing.T) {
+	schema := NewBrowserTool(BrowserToolOptions{}).Parameters()
+
+	properties, isMap := schema["properties"].(map[string]interface{})
+	if !isMap {
+		t.Fatal("Expected properties map")
+	}
+	if _, found := properties["command"]; !found {
+		t.Error("Expected 'command' in properties")
+	}
+
+	required, isSlice := schema["required"].([]string)
+	if !isSlice {
+		t.Fatal("Expected required slice")
+	}
+	if !(len(required) == 1 && required[0] == "command") {
+		t.Errorf("Expected required=['command'], got %v", required)
+	}
+}
+
+// TestBrowserTool_MissingCommand checks that Execute reports an error when
+// the "command" argument is absent, empty, or only whitespace.
+func TestBrowserTool_MissingCommand(t *testing.T) {
+	tool := NewBrowserTool(BrowserToolOptions{})
+	ctx := context.Background()
+
+	cases := []struct {
+		args    map[string]interface{}
+		failure string
+	}{
+		{map[string]interface{}{}, "Expected error for missing command"},
+		{map[string]interface{}{"command": ""}, "Expected error for empty command"},
+		{map[string]interface{}{"command": "   "}, "Expected error for whitespace-only command"},
+	}
+	for _, c := range cases {
+		if result := tool.Execute(ctx, c.args); !result.IsError {
+			t.Error(c.failure)
+		}
+	}
+}
+
+// TestBrowserTool_BuildArgs checks the argument vector produced by buildArgs:
+// global flags first, then the split command. Options left at their zero
+// value give a headed browser on the default CDP port, so those cases expect
+// --headed. The headless and custom-port cases cover the remaining branches.
+func TestBrowserTool_BuildArgs(t *testing.T) {
+	tests := []struct {
+		name     string
+		opts     BrowserToolOptions
+		command  string
+		wantArgs []string
+	}{
+		{
+			name:     "simple command",
+			command:  "open https://example.com",
+			wantArgs: []string{"--cdp", "9222", "--headed", "--json", "open", "https://example.com"},
+		},
+		{
+			name:     "with session",
+			opts:     BrowserToolOptions{Session: "test-session"},
+			command:  "snapshot -i",
+			wantArgs: []string{"--cdp", "9222", "--session", "test-session", "--headed", "--json", "snapshot", "-i"},
+		},
+		{
+			name:     "quoted arguments",
+			command:  `fill @e3 "hello world"`,
+			wantArgs: []string{"--cdp", "9222", "--headed", "--json", "fill", "@e3", "hello world"},
+		},
+		{
+			name:     "single quoted",
+			command:  `fill @e3 'hello world'`,
+			wantArgs: []string{"--cdp", "9222", "--headed", "--json", "fill", "@e3", "hello world"},
+		},
+		{
+			name:     "headless omits --headed",
+			opts:     BrowserToolOptions{Headless: true},
+			command:  "get title",
+			wantArgs: []string{"--cdp", "9222", "--json", "get", "title"},
+		},
+		{
+			name:     "custom CDP port",
+			opts:     BrowserToolOptions{Headless: true, CDPPort: 9333},
+			command:  "close",
+			wantArgs: []string{"--cdp", "9333", "--json", "close"},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := NewBrowserTool(tt.opts).buildArgs(tt.command)
+
+			if len(got) != len(tt.wantArgs) {
+				t.Fatalf("buildArgs(%q) = %v (len %d), want %v (len %d)",
+					tt.command, got, len(got), tt.wantArgs, len(tt.wantArgs))
+			}
+
+			for i := range got {
+				if got[i] != tt.wantArgs[i] {
+					t.Errorf("buildArgs(%q)[%d] = %q, want %q",
+						tt.command, i, got[i], tt.wantArgs[i])
+				}
+			}
+		})
+	}
+}
+
+// TestSplitCommand checks splitCommand on plain, quoted and
+// whitespace-padded command strings.
+func TestSplitCommand(t *testing.T) {
+	cases := []struct {
+		input string
+		want  []string
+	}{
+		{"open https://example.com", []string{"open", "https://example.com"}},
+		{`fill @e3 "test@example.com"`, []string{"fill", "@e3", "test@example.com"}},
+		{"snapshot -i -c -d 3", []string{"snapshot", "-i", "-c", "-d", "3"}},
+		{`eval "document.title"`, []string{"eval", "document.title"}},
+		{"  click   @e2  ", []string{"click", "@e2"}},
+		{`get text @e1`, []string{"get", "text", "@e1"}},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.input, func(t *testing.T) {
+			got := splitCommand(tc.input)
+			if len(got) != len(tc.want) {
+				t.Fatalf("splitCommand(%q) = %v, want %v", tc.input, got, tc.want)
+			}
+			for idx, arg := range got {
+				if arg != tc.want[idx] {
+					t.Errorf("splitCommand(%q)[%d] = %q, want %q", tc.input, idx, arg, tc.want[idx])
+				}
+			}
+		})
+	}
+}