feat: add browser automation tool via agent-browser CLI

Integrate agent-browser CLI as a lightweight browser automation tool. Instead of embedding browser dependencies, this wraps the external agent-browser binary via exec.Command, keeping PicoClaw lean. Changes: - Add BrowserTool (pkg/tools/browser.go) wrapping agent-browser CLI - Add BrowserConfig to config with enabled, session, headless, timeout, cdp_port - Register browser tool conditionally in agent loop - Add unit tests for argument building, command splitting, error handling The tool accepts a single 'command' parameter and delegates to agent-browser. Default CDP port is 9222. Zero new Go dependencies - all stdlib imports.
2026-06-12 18:08:54 +00:00 · 2026-02-16 22:38:02 +07:00
4 changed files with 405 additions and 1 deletions
@@ -84,6 +84,16 @@ func createToolRegistry(workspace string, restrict bool, cfg *config.Config, msg
 	}
 	registry.Register(tools.NewWebFetchTool(50000))
 	// Browser automation tool (agent-browser CLI)
 	if cfg.Tools.Browser.Enabled {
 		registry.Register(tools.NewBrowserTool(tools.BrowserToolOptions{
 			Session:  cfg.Tools.Browser.Session,
 			Headless: cfg.Tools.Browser.Headless,
 			Timeout:  cfg.Tools.Browser.Timeout,
 			CDPPort:  cfg.Tools.Browser.CDPPort,
 		}))
 	}
 	// Hardware tools (I2C, SPI) - Linux only, returns error on other platforms
 	registry.Register(tools.NewI2CTool())
 	registry.Register(tools.NewSPITool())
@@ -211,8 +211,17 @@ type WebToolsConfig struct {
 	DuckDuckGo DuckDuckGoConfig `json:"duckduckgo"`
 }
 // BrowserConfig holds the settings for the agent-browser based browser tool.
 // The json tag gives each field's key in the config file and the env tag
 // names the environment variable associated with it.
 //
 // Enabled decides whether the tool is registered; Session is an optional
 // session name; Headless selects headless mode; Timeout is the per-command
 // timeout in seconds; CDPPort is the Chrome DevTools Protocol port.
 // DefaultConfig sets Enabled=false, Headless=true, Timeout=30, CDPPort=9222.
 type BrowserConfig struct {
 	Enabled  bool   `json:"enabled" env:"PICOCLAW_TOOLS_BROWSER_ENABLED"`
 	Session  string `json:"session" env:"PICOCLAW_TOOLS_BROWSER_SESSION"`
 	Headless bool   `json:"headless" env:"PICOCLAW_TOOLS_BROWSER_HEADLESS"`
 	Timeout  int    `json:"timeout" env:"PICOCLAW_TOOLS_BROWSER_TIMEOUT"`
 	CDPPort  int    `json:"cdp_port" env:"PICOCLAW_TOOLS_BROWSER_CDP_PORT"`
 }
 // ToolsConfig groups the per-tool configuration sections: the web tools
 // under "web" and the browser automation tool under "browser".
 type ToolsConfig struct {
-	Web WebToolsConfig `json:"web"`
+	Web     WebToolsConfig `json:"web"`
 	Browser BrowserConfig  `json:"browser"`
 }
 func DefaultConfig() *Config {
@@ -322,6 +331,12 @@ func DefaultConfig() *Config {
 					MaxResults: 5,
 				},
 			},
 			Browser: BrowserConfig{
 				Enabled:  false,
 				Headless: true,
 				Timeout:  30,
 				CDPPort:  9222,
 			},
 		},
 		Heartbeat: HeartbeatConfig{
 			Enabled:  true,
@@ -0,0 +1,229 @@
 package tools
 import (
 	"bytes"
 	"context"
 	"fmt"
 	"os/exec"
 	"strings"
 	"time"
 )
 // BrowserToolOptions configures the BrowserTool created by NewBrowserTool.
 // Note that Headless has no implicit default here: the zero value (false)
 // means the browser is started with --headed.
 type BrowserToolOptions struct {
 	Session  string // Session name for isolation; empty means no --session flag
 	Headless bool   // Run in headless mode; false adds --headed to every command
 	Timeout  int    // Command timeout in seconds; values <= 0 fall back to 30
 	CDPPort  int    // Chrome DevTools Protocol port; values <= 0 fall back to 9222
 }
 // BrowserTool wraps the agent-browser CLI for headless browser automation.
 // It delegates all browser complexity to the external `agent-browser` binary.
 // Its fields are filled in by NewBrowserTool from a BrowserToolOptions value.
 type BrowserTool struct {
 	session  string        // value for --session; empty omits the flag
 	headless bool          // when false, --headed is passed
 	timeout  time.Duration // upper bound for a single agent-browser invocation
 	cdpPort  int           // value passed with --cdp
 }
 // NewBrowserTool builds a BrowserTool from opts. Session and Headless are
 // copied as given; a non-positive Timeout or CDPPort is replaced by the
 // default of 30 seconds or port 9222 respectively.
 func NewBrowserTool(opts BrowserToolOptions) *BrowserTool {
 	tool := &BrowserTool{
 		session:  opts.Session,
 		headless: opts.Headless,
 		timeout:  30 * time.Second,
 		cdpPort:  9222,
 	}
 	if opts.Timeout > 0 {
 		tool.timeout = time.Duration(opts.Timeout) * time.Second
 	}
 	if opts.CDPPort > 0 {
 		tool.cdpPort = opts.CDPPort
 	}
 	return tool
 }
 // Name returns the identifier of this tool, "browser".
 func (t *BrowserTool) Name() string {
 	return "browser"
 }
 // Description returns the usage text for the tool: a summary of the
 // agent-browser subcommands accepted through the 'command' parameter,
 // followed by example command values.
 func (t *BrowserTool) Description() string {
 	return `Automate a headless browser via agent-browser CLI. Pass the subcommand as 'command'.
 The browser daemon persists between calls — open a page first, then interact with it.
 Core workflow:
  browser open <url>           → Navigate to URL
  browser snapshot -i          → Get interactive elements with refs (@e1, @e2, ...)
  browser click @e2            → Click element by ref
  browser fill @e3 "text"      → Fill input by ref
  browser type @e3 "text"      → Type into element
  browser press Enter          → Press a key
  browser screenshot [path]    → Take screenshot
  browser get text @e1         → Get text content of element
  browser get title            → Get page title
  browser get url              → Get current URL
  browser eval "js code"       → Run JavaScript
  browser scroll down [px]     → Scroll page
  browser wait <selector|ms>   → Wait for element or time
  browser close                → Close browser
 CSS selectors also work: browser click "#submit"
 Examples:
  command: "open https://example.com"
  command: "snapshot -i"
  command: "click @e2"
  command: "fill @e3 \"user@example.com\""
  command: "get title"
  command: "screenshot /tmp/page.png"
  command: "close"`
 }
 // Parameters returns the schema of the tool input: an object with a single
 // required string property named "command".
 func (t *BrowserTool) Parameters() map[string]interface{} {
 	commandSchema := map[string]interface{}{
 		"type":        "string",
 		"description": "The agent-browser subcommand to execute (e.g. 'open https://example.com', 'snapshot -i', 'click @e2')",
 	}
 	schema := map[string]interface{}{
 		"type":       "object",
 		"properties": map[string]interface{}{"command": commandSchema},
 		"required":   []string{"command"},
 	}
 	return schema
 }
 // Execute runs one agent-browser subcommand and reports its output.
 //
 // args must hold a non-blank string under "command"; buildArgs turns it into
 // the argument list for the agent-browser binary. The call is bounded by the
 // tool's timeout in addition to ctx. Stdout plus any stderr lines that are
 // not daemon start-up noise form the result text, which is capped at 10000
 // bytes. The result is marked as an error when the command is missing, the
 // process cannot be run or exits with an error, or the timeout expires.
 func (t *BrowserTool) Execute(ctx context.Context, args map[string]interface{}) *ToolResult {
 	command, ok := args["command"].(string)
 	if !ok || strings.TrimSpace(command) == "" {
 		return ErrorResult("command is required (e.g. 'open https://example.com')")
 	}
 	cmdCtx, cancel := context.WithTimeout(ctx, t.timeout)
 	defer cancel()
 	var stdout, stderr bytes.Buffer
 	cmd := exec.CommandContext(cmdCtx, "agent-browser", t.buildArgs(command)...)
 	cmd.Stdout = &stdout
 	cmd.Stderr = &stderr
 	err := cmd.Run()
 	if err != nil && cmdCtx.Err() == context.DeadlineExceeded {
 		msg := fmt.Sprintf("Browser command timed out after %v: %s", t.timeout, command)
 		return &ToolResult{
 			ForLLM:  msg,
 			ForUser: msg,
 			IsError: true,
 		}
 	}
 	output := stdout.String()
 	// Only the noise lines are removed from stderr; real diagnostics that
 	// happen to arrive together with a daemon start-up message are kept.
 	if errOut := stripBrowserDaemonNoise(stderr.String()); errOut != "" {
 		if output != "" {
 			output += "\n"
 		}
 		output += errOut
 	}
 	if err != nil {
 		// Include output even on error — agent-browser often puts useful info in stdout
 		if output == "" {
 			output = fmt.Sprintf("command failed: %v", err)
 		} else {
 			output += fmt.Sprintf("\nExit code: %v", err)
 		}
 	}
 	if output == "" {
 		output = "(no output)"
 	}
 	// Truncate long output
 	const maxLen = 10000
 	if len(output) > maxLen {
 		cut := maxLen
 		// Step back over UTF-8 continuation bytes (10xxxxxx) so the cut never
 		// lands inside a multi-byte character. A rune has at most three
 		// continuation bytes, which bounds the loop even for invalid input.
 		for back := 0; back < 3 && cut > 0 && output[cut]&0xC0 == 0x80; back++ {
 			cut--
 		}
 		output = output[:cut] + fmt.Sprintf("\n... (truncated, %d more chars)", len(output)-cut)
 	}
 	return &ToolResult{
 		ForLLM:  output,
 		ForUser: output,
 		IsError: err != nil,
 	}
 }
 // stripBrowserDaemonNoise removes the lines of stderr that contain the
 // "Daemon started" start-up message and returns the remaining text unchanged.
 func stripBrowserDaemonNoise(stderr string) string {
 	if !strings.Contains(stderr, "Daemon started") {
 		return stderr
 	}
 	var kept strings.Builder
 	for _, line := range strings.SplitAfter(stderr, "\n") {
 		if !strings.Contains(line, "Daemon started") {
 			kept.WriteString(line)
 		}
 	}
 	return kept.String()
 }
 // buildArgs returns the complete agent-browser argument list for command.
 // The global flags come first (--cdp <port>, --session <name> when a session
 // is configured, --headed when not headless, --json), followed by the tokens
 // that splitCommand extracts from command.
 func (t *BrowserTool) buildArgs(command string) []string {
 	args := []string{"--cdp", fmt.Sprintf("%d", t.cdpPort)}
 	if t.session != "" {
 		args = append(args, "--session", t.session)
 	}
 	if !t.headless {
 		// agent-browser defaults to headless, so only the headed case needs a flag.
 		args = append(args, "--headed")
 	}
 	// --json requests machine-readable output.
 	args = append(args, "--json")
 	return append(args, splitCommand(command)...)
 }
 // splitCommand splits a command string into arguments, respecting quoted strings.
 //
 // Arguments are separated by spaces, tabs, newlines or carriage returns.
 // Text inside matching single or double quotes is taken literally (no escape
 // sequences) and the quotes themselves are removed; quoted and unquoted parts
 // that touch each other form one argument. An explicitly empty quoted string
 // ("" or '') yields an empty argument. An unterminated quote extends to the
 // end of the input.
 func splitCommand(command string) []string {
 	var args []string
 	var current strings.Builder
 	inQuote := false
 	quoteChar := byte(0)
 	// quoted is set once a quoted section has been closed inside the current
 	// token, so that an empty pair of quotes still produces an argument.
 	quoted := false
 	flush := func() {
 		if current.Len() > 0 || quoted {
 			args = append(args, current.String())
 		}
 		current.Reset()
 		quoted = false
 	}
 	for i := 0; i < len(command); i++ {
 		ch := command[i]
 		switch {
 		case inQuote:
 			if ch == quoteChar {
 				inQuote = false
 				quoted = true
 			} else {
 				current.WriteByte(ch)
 			}
 		case ch == '"' || ch == '\'':
 			inQuote = true
 			quoteChar = ch
 		case ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r':
 			flush()
 		default:
 			current.WriteByte(ch)
 		}
 	}
 	flush()
 	return args
 }
@@ -0,0 +1,150 @@
 package tools
 import (
 	"context"
 	"strings"
 	"testing"
 )
 // TestBrowserTool_Name checks that the tool reports the name "browser".
 func TestBrowserTool_Name(t *testing.T) {
 	got := NewBrowserTool(BrowserToolOptions{}).Name()
 	if got != "browser" {
 		t.Errorf("Expected name 'browser', got %q", got)
 	}
 }
 // TestBrowserTool_Description checks that the description names the
 // agent-browser CLI and the snapshot subcommand.
 func TestBrowserTool_Description(t *testing.T) {
 	desc := NewBrowserTool(BrowserToolOptions{}).Description()
 	checks := []struct {
 		needle  string
 		failure string
 	}{
 		{"agent-browser", "Description should mention agent-browser"},
 		{"snapshot", "Description should mention snapshot command"},
 	}
 	for _, c := range checks {
 		if !strings.Contains(desc, c.needle) {
 			t.Error(c.failure)
 		}
 	}
 }
 // TestBrowserTool_Parameters checks that the schema declares a "command"
 // property and lists it as the only required field.
 func TestBrowserTool_Parameters(t *testing.T) {
 	schema := NewBrowserTool(BrowserToolOptions{}).Parameters()
 	props, isMap := schema["properties"].(map[string]interface{})
 	if !isMap {
 		t.Fatal("Expected properties map")
 	}
 	if _, found := props["command"]; !found {
 		t.Error("Expected 'command' in properties")
 	}
 	required, isSlice := schema["required"].([]string)
 	if !isSlice {
 		t.Fatal("Expected required slice")
 	}
 	if !(len(required) == 1 && required[0] == "command") {
 		t.Errorf("Expected required=['command'], got %v", required)
 	}
 }
 // TestBrowserTool_MissingCommand checks that Execute reports an error when
 // the command argument is absent, empty, or only whitespace.
 func TestBrowserTool_MissingCommand(t *testing.T) {
 	tool := NewBrowserTool(BrowserToolOptions{})
 	ctx := context.Background()
 	cases := []struct {
 		args    map[string]interface{}
 		failure string
 	}{
 		{map[string]interface{}{}, "Expected error for missing command"},
 		{map[string]interface{}{"command": ""}, "Expected error for empty command"},
 		{map[string]interface{}{"command": "   "}, "Expected error for whitespace-only command"},
 	}
 	for _, c := range cases {
 		if result := tool.Execute(ctx, c.args); !result.IsError {
 			t.Error(c.failure)
 		}
 	}
 }
 func TestBrowserTool_BuildArgs(t *testing.T) {
 	tests := []struct {
 		name     string
 		session  string
 		command  string
 		wantArgs []string
 	}{
 		{
 			name:     "simple command",
 			command:  "open https://example.com",
 			wantArgs: []string{"--cdp", "9222", "--headed", "--json", "open", "https://example.com"},
 		},
 		{
 			name:     "with session",
 			session:  "test-session",
 			command:  "snapshot -i",
 			wantArgs: []string{"--cdp", "9222", "--session", "test-session", "--headed", "--json", "snapshot", "-i"},
 		},
 		{
 			name:     "quoted arguments",
 			command:  `fill @e3 "hello world"`,
 			wantArgs: []string{"--cdp", "9222", "--headed", "--json", "fill", "@e3", "hello world"},
 		},
 		{
 			name:     "single quoted",
 			command:  `fill @e3 'hello world'`,
 			wantArgs: []string{"--cdp", "9222", "--headed", "--json", "fill", "@e3", "hello world"},
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			tool := NewBrowserTool(BrowserToolOptions{Session: tt.session})
 			got := tool.buildArgs(tt.command)
 			if len(got) != len(tt.wantArgs) {
 				t.Errorf("buildArgs(%q) = %v (len %d), want %v (len %d)",
 					tt.command, got, len(got), tt.wantArgs, len(tt.wantArgs))
 				return
 			}
 			for i := range got {
 				if got[i] != tt.wantArgs[i] {
 					t.Errorf("buildArgs(%q)[%d] = %q, want %q",
 						tt.command, i, got[i], tt.wantArgs[i])
 				}
 			}
 		})
 	}
 }
 // TestSplitCommand checks splitCommand against plain, quoted, flag-style and
 // whitespace-padded inputs, one subtest per input string.
 func TestSplitCommand(t *testing.T) {
 	cases := []struct {
 		input string
 		want  []string
 	}{
 		{"open https://example.com", []string{"open", "https://example.com"}},
 		{`fill @e3 "test@example.com"`, []string{"fill", "@e3", "test@example.com"}},
 		{"snapshot -i -c -d 3", []string{"snapshot", "-i", "-c", "-d", "3"}},
 		{`eval "document.title"`, []string{"eval", "document.title"}},
 		{"  click   @e2  ", []string{"click", "@e2"}},
 		{`get text @e1`, []string{"get", "text", "@e1"}},
 	}
 	for _, tc := range cases {
 		t.Run(tc.input, func(t *testing.T) {
 			got := splitCommand(tc.input)
 			if len(got) != len(tc.want) {
 				t.Errorf("splitCommand(%q) = %v, want %v", tc.input, got, tc.want)
 				return
 			}
 			for i, want := range tc.want {
 				if got[i] != want {
 					t.Errorf("splitCommand(%q)[%d] = %q, want %q", tc.input, i, got[i], want)
 				}
 			}
 		})
 	}
 }