From 520391643b7062f62cd58e987433d79a81ea0f3b Mon Sep 17 00:00:00 2001
From: Amir Mamaghani <hello@amirmamaghani.com>
Date: Sat, 21 Mar 2026 15:14:32 +0100
Subject: [PATCH] feat: add agent-browser skill and Dockerfile.heavy with full
 runtime
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add agent-browser skill to the default workspace with complete CLI
reference for browser automation via Chrome/Chromium CDP. The skill
includes a runtime guard that checks for the binary before use.

Add Dockerfile.heavy — a batteries-included container image with:
- Node.js 24 + npm
- Python 3 + pip + uv
- Chromium + Playwright (for agent-browser)
- agent-browser CLI pre-installed
- Non-root picoclaw user (UID/GID 1000)
- Default workspace with all skills
- Persistent workspace volume

This complements the existing minimal Dockerfile and Dockerfile.full
for deployments that need browser automation and rich tool support.
---
 docker/Dockerfile.heavy                 |  67 ++++++++++++
 workspace/skills/agent-browser/SKILL.md | 129 ++++++++++++++++++++++++
 2 files changed, 196 insertions(+)
 create mode 100644 docker/Dockerfile.heavy
 create mode 100644 workspace/skills/agent-browser/SKILL.md

diff --git a/docker/Dockerfile.heavy b/docker/Dockerfile.heavy
new file mode 100644
index 000000000..cbc243e39
--- /dev/null
+++ b/docker/Dockerfile.heavy
@@ -0,0 +1,67 @@
+# ============================================================
+# Stage 1: Build the picoclaw binary
+# ============================================================
+FROM golang:1.26.0-alpine AS builder
+
+RUN apk add --no-cache git make
+
+WORKDIR /src
+
+# Cache dependencies
+COPY go.mod go.sum ./
+RUN go mod download
+
+# Copy source and build
+COPY . .
+RUN make build
+
+# ============================================================
+# Stage 2: Node.js runtime with Python + MCP support
+# ============================================================
+FROM node:24-alpine3.23
+
+RUN apk add --no-cache \
+  ca-certificates \
+  curl \
+  git \
+  python3 \
+  py3-pip \
+  chromium \
+  jq
+
+# Install Playwright browsers for agent-browser
+ENV PLAYWRIGHT_BROWSERS_PATH=/opt/playwright-browsers
+RUN npm install -g agent-browser && \
+    npx playwright install chromium && \
+    chmod -R o+rx $PLAYWRIGHT_BROWSERS_PATH
+
+# Install uv
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
+  ln -s /root/.local/bin/uv /usr/local/bin/uv && \
+  ln -s /root/.local/bin/uvx /usr/local/bin/uvx && \
+  uv --version
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+  CMD wget -q --spider http://localhost:18790/health || exit 1
+
+# Copy binary
+COPY --from=builder /src/build/picoclaw /usr/local/bin/picoclaw
+
+# Reuse existing node user (UID/GID 1000) — rename to picoclaw
+RUN deluser node 2>/dev/null; delgroup node 2>/dev/null; \
+    addgroup -g 1000 picoclaw 2>/dev/null; \
+    adduser -D -u 1000 -G picoclaw -h /home/picoclaw picoclaw 2>/dev/null || true
+
+USER picoclaw
+
+# Run onboard to create initial directories and config
+RUN /usr/local/bin/picoclaw onboard
+
+# Copy default workspace
+COPY --chown=picoclaw:picoclaw workspace/ /home/picoclaw/.picoclaw/workspace/
+
+VOLUME /home/picoclaw/.picoclaw/workspace
+
+ENTRYPOINT ["picoclaw"]
+CMD ["gateway"]
diff --git a/workspace/skills/agent-browser/SKILL.md b/workspace/skills/agent-browser/SKILL.md
new file mode 100644
index 000000000..43505996d
--- /dev/null
+++ b/workspace/skills/agent-browser/SKILL.md
@@ -0,0 +1,129 @@
+---
+name: agent-browser
+description: "Browser automation via agent-browser CLI. Use when the user needs to navigate websites, fill forms, click buttons, take screenshots, extract data, or test web apps."
+metadata: {"nanobot":{"emoji":"🌐","requires":{"bins":["agent-browser"]},"install":[{"id":"npm","kind":"npm","package":"agent-browser","global":true,"bins":["agent-browser"],"label":"Install agent-browser (npm)"}]}}
+---
+
+# Agent Browser
+
+CLI browser automation via Chrome/Chromium CDP. Install: `npm i -g agent-browser && agent-browser install`.
+
+**Before using this skill**, verify the tool is available by running `which agent-browser`. If the command is not found, tell the user that browser automation requires the `agent-browser` CLI and Chromium, which are only available in the heavy container image. Do not attempt to install it at runtime.
+
+## Core Workflow
+
+1. `agent-browser open <url>` — navigate
+2. `agent-browser snapshot -i` — get interactive elements with refs (`@e1`, `@e2`, ...)
+3. Interact using refs — `click @e1`, `fill @e2 "text"`
+4. Re-snapshot after any navigation or DOM change — refs are invalidated
+
+```bash
+agent-browser open https://example.com/form
+agent-browser snapshot -i
+# @e1 [input] "Email", @e2 [input] "Password", @e3 [button] "Submit"
+agent-browser fill @e1 "user@example.com"
+agent-browser fill @e2 "secret"
+agent-browser click @e3
+agent-browser wait --load networkidle
+agent-browser snapshot -i
+```
+
+Chain commands with `&&` when you don't need intermediate output:
+```bash
+agent-browser open https://example.com && agent-browser wait --load networkidle && agent-browser snapshot -i
+```
+
+## Commands
+
+```bash
+# Navigation
+agent-browser open <url>
+agent-browser close
+
+# Snapshot
+agent-browser snapshot -i                # Interactive elements with refs
+agent-browser snapshot -s "#selector"    # Scope to CSS selector
+
+# Interaction (use @refs from snapshot)
+agent-browser click @e1
+agent-browser fill @e2 "text"            # Clear + type
+agent-browser type @e2 "text"            # Type without clearing
+agent-browser select @e1 "option"
+agent-browser check @e1
+agent-browser press Enter
+agent-browser scroll down 500
+
+# Get info
+agent-browser get text @e1
+agent-browser get url
+agent-browser get title
+
+# Wait
+agent-browser wait @e1                   # Wait for element
+agent-browser wait --load networkidle    # Wait for network idle
+agent-browser wait --url "**/dashboard"  # Wait for URL pattern
+agent-browser wait --text "Welcome"      # Wait for text
+agent-browser wait 2000                  # Wait ms
+
+# Capture
+agent-browser screenshot                 # Screenshot to temp dir
+agent-browser screenshot --full          # Full page
+agent-browser screenshot --annotate      # With numbered element labels ([N] -> @eN)
+agent-browser pdf output.pdf
+
+# Semantic locators (when refs unavailable)
+agent-browser find text "Sign In" click
+agent-browser find label "Email" fill "user@test.com"
+agent-browser find role button click --name "Submit"
+```
+
+## Authentication
+
+```bash
+# Option 1: Import from user's running Chrome
+agent-browser --auto-connect state save ./auth.json
+agent-browser --state ./auth.json open https://app.example.com
+
+# Option 2: Persistent profile
+agent-browser --profile ~/.myapp open https://app.example.com/login
+# ... login once, all future runs are authenticated
+
+# Option 3: Session name (auto-save/restore)
+agent-browser --session-name myapp open https://app.example.com/login
+# ... login, close, next run state is restored
+
+# Option 4: State file
+agent-browser state save auth.json
+agent-browser state load auth.json
+```
+
+## Iframes
+
+Iframe content is inlined in snapshots. Interact with iframe refs directly — no frame switch needed.
+
+## Parallel Sessions
+
+```bash
+agent-browser --session s1 open https://site-a.com
+agent-browser --session s2 open https://site-b.com
+agent-browser session list
+```
+
+## JavaScript Eval
+
+```bash
+agent-browser eval 'document.title'
+
+# Complex JS — use --stdin to avoid shell quoting issues
+agent-browser eval --stdin <<'EVALEOF'
+JSON.stringify(Array.from(document.querySelectorAll("a")).map(a => a.href))
+EVALEOF
+```
+
+## Cleanup
+
+Always close sessions when done:
+```bash
+agent-browser close
+agent-browser --session s1 close
+```