From 31afad6e87b48d7d863ffd39d8b42ed6da762d0c Mon Sep 17 00:00:00 2001 From: reusu Date: Wed, 1 Apr 2026 21:32:10 +0800 Subject: [PATCH] feat: add load_image tool for local file vision (#2116) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add load_image tool for local file vision * fix: address load_image PR review feedback - Exclude load_image from sub-agent tools via Unregister after Clone, since RunToolLoop does not call resolveMediaRefs - Add ToolRegistry.Unregister() method - Fix scope collision: use channel:chatID instead of filename - Add channel/chatID context resolution matching send_file pattern - Add comment explaining iteration > 1 guard on resolveMediaRefs - Remove emoji from ForUser for consistency with send_file - Add load_image_test.go * feat: enable load_image for subagents via MediaResolver in RunToolLoop Instead of removing load_image from sub-agent tools (28f69e71), inject a MediaResolver into the legacy RunToolLoop fallback path so media:// refs are resolved to base64 before each LLM call — matching the main agent loop behavior. - Add MediaResolver field to ToolLoopConfig and call it on iteration > 1 - Add SubagentManager.SetMediaResolver() and wire it through runTask - Remove ToolRegistry.Unregister() (no longer needed) - Restore load_image in sub-agent tool set (revert Clone+Unregister) - Add TestSubagentManager_SetMediaResolver_StoresResolver * refactor(load_image): remove prompt parameter from tool schema * test(tools): add success-path test for LoadImageTool Add TestLoadImage_SuccessPath that creates a real PNG file with valid magic bytes, calls Execute with WithToolContext, and verifies: - result.IsError == false - ToolResult.Media contains a media:// ref - ToolResult.ForLLM contains the [image: marker - media ref is resolvable in the store Add explanatory comment in loop.go for why Media and ArtifactTags coexist on non-ResponseHandled tool results (e.g. load_image). * fix: preallocate slice in tests and add ResponseHandled guard in toolloop Fix prealloc linter failure in load_image_test.go. Prevent double-resolving media by checking ResponseHandled in toolloop.go. * Register TTS tool if provider is available --------- Co-authored-by: Reusu Co-authored-by: 美電球 --- pkg/agent/loop.go | 37 ++++++++ pkg/tools/load_image.go | 163 ++++++++++++++++++++++++++++++++ pkg/tools/load_image_test.go | 174 +++++++++++++++++++++++++++++++++++ pkg/tools/subagent.go | 19 ++++ pkg/tools/toolloop.go | 36 +++++++- 5 files changed, 425 insertions(+), 4 deletions(-) create mode 100644 pkg/tools/load_image.go create mode 100644 pkg/tools/load_image_test.go diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index b376ed0af..19c1f0369 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -281,6 +281,17 @@ func registerSharedTools( agent.Tools.Register(tools.NewSendTTSTool(ttsProvider, nil)) } + if cfg.Tools.IsToolEnabled("load_image") { + loadImageTool := tools.NewLoadImageTool( + agent.Workspace, + cfg.Agents.Defaults.RestrictToWorkspace, + cfg.Agents.Defaults.GetMaxMediaSize(), + nil, + allowReadPaths, + ) + agent.Tools.Register(loadImageTool) + } + // Skill discovery and installation tools skills_enabled := cfg.Tools.IsToolEnabled("skills") find_skills_enable := cfg.Tools.IsToolEnabled("find_skills") @@ -323,6 +334,14 @@ func registerSharedTools( subagentManager := tools.NewSubagentManager(provider, agent.Model, agent.Workspace) subagentManager.SetLLMOptions(agent.MaxTokens, agent.Temperature) + // Inject a media resolver so the legacy RunToolLoop fallback path can + // resolve media:// refs in the same way the main AgentLoop does. + // This keeps subagent vision support working even when the optimized + // sub-turn spawner path is unavailable. + subagentManager.SetMediaResolver(func(msgs []providers.Message) []providers.Message { + return resolveMediaRefs(msgs, al.mediaStore, cfg.Agents.Defaults.GetMaxMediaSize()) + }) + // Set the spawner that links into AgentLoop's turnState subagentManager.SetSpawner(func( ctx context.Context, @@ -1861,6 +1880,14 @@ turnLoop: providerToolDefs = filtered } + // Resolve media:// refs produced by tool results (e.g. load_image). + // Skipped on iteration 1 because inbound user media is already resolved + // before entering the loop; only subsequent iterations can contain new + // tool-generated media refs that need base64 encoding. + if iteration > 1 { + messages = resolveMediaRefs(messages, al.mediaStore, maxMediaSize) + } + callMessages := messages if gracefulTerminal { callMessages = append(append([]providers.Message(nil), messages...), ts.interruptHintMessage()) @@ -2551,6 +2578,13 @@ turnLoop: } if len(toolResult.Media) > 0 && !toolResult.ResponseHandled { + // For tools like load_image that produce media refs without sending them + // to the user channel (ResponseHandled == false), both Media and ArtifactTags + // coexist on the result: + // - Media: carries media:// refs that resolveMediaRefs will base64-encode + // into image_url parts in the next LLM iteration (enabling vision). + // - ArtifactTags: exposes the local file path as a structured [file:…] tag + // in the tool result text, so the LLM knows an artifact was produced. toolResult.ArtifactTags = buildArtifactTags(al.mediaStore, toolResult.Media) } @@ -2570,6 +2604,9 @@ turnLoop: Content: contentForLLM, ToolCallID: toolCallID, } + if len(toolResult.Media) > 0 && !toolResult.ResponseHandled { + toolResultMsg.Media = append(toolResultMsg.Media, toolResult.Media...) + } al.emitEvent( EventKindToolExecEnd, ts.eventMeta("runTurn", "turn.tool.end"), diff --git a/pkg/tools/load_image.go b/pkg/tools/load_image.go new file mode 100644 index 000000000..41ea6d054 --- /dev/null +++ b/pkg/tools/load_image.go @@ -0,0 +1,163 @@ +package tools + +import ( + "context" + "fmt" + "os" + "path/filepath" + "regexp" + "strings" + + "github.com/sipeed/picoclaw/pkg/config" + "github.com/sipeed/picoclaw/pkg/media" +) + +// LoadImageTool loads a local image file into the MediaStore and returns a +// media:// reference. The agent loop's resolveMediaRefs will then base64-encode +// it and attach it as an image_url part in the next LLM request, enabling +// vision on local files — the same pipeline used when a user sends an image +// through a chat channel. +// +// This is intentionally different from SendFileTool: +// - SendFileTool → MediaResult + WithResponseHandled() → sends file to user, ends turn +// - LoadImageTool → plain ToolResult with media:// in ForLLM → LLM sees the image next turn +type LoadImageTool struct { + workspace string + restrict bool + maxFileSize int + mediaStore media.MediaStore + allowPaths []*regexp.Regexp + + defaultChannel string + defaultChatID string +} + +func NewLoadImageTool( + workspace string, + restrict bool, + maxFileSize int, + store media.MediaStore, + allowPaths ...[]*regexp.Regexp, +) *LoadImageTool { + if maxFileSize <= 0 { + maxFileSize = config.DefaultMaxMediaSize + } + var patterns []*regexp.Regexp + if len(allowPaths) > 0 { + patterns = allowPaths[0] + } + return &LoadImageTool{ + workspace: workspace, + restrict: restrict, + maxFileSize: maxFileSize, + mediaStore: store, + allowPaths: patterns, + } +} + +func (t *LoadImageTool) Name() string { return "load_image" } + +func (t *LoadImageTool) Description() string { + return "Load a local image file so you can analyze its contents with vision. " + + "Supported formats: JPEG, PNG, GIF, WebP, BMP. " + + "After calling this tool, describe or analyze the image in your next response." +} + +func (t *LoadImageTool) Parameters() map[string]any { + return map[string]any{ + "type": "object", + "properties": map[string]any{ + "path": map[string]any{ + "type": "string", + "description": "Path to the local image file. Relative paths are resolved from workspace.", + }, + }, + "required": []string{"path"}, + } +} + +func (t *LoadImageTool) SetContext(channel, chatID string) { + t.defaultChannel = channel + t.defaultChatID = chatID +} + +func (t *LoadImageTool) SetMediaStore(store media.MediaStore) { + t.mediaStore = store +} + +func (t *LoadImageTool) Execute(ctx context.Context, args map[string]any) *ToolResult { + path, _ := args["path"].(string) + if strings.TrimSpace(path) == "" { + return ErrorResult("path is required") + } + + // Prefer context-injected channel/chatID (set by ExecuteWithContext), fall back to SetContext values. + channel := ToolChannel(ctx) + if channel == "" { + channel = t.defaultChannel + } + chatID := ToolChatID(ctx) + if chatID == "" { + chatID = t.defaultChatID + } + if channel == "" || chatID == "" { + return ErrorResult("no target channel/chat available") + } + + if t.mediaStore == nil { + return ErrorResult("media store not configured") + } + + resolved, err := validatePathWithAllowPaths(path, t.workspace, t.restrict, t.allowPaths) + if err != nil { + return ErrorResult(fmt.Sprintf("invalid path: %v", err)) + } + + info, err := os.Stat(resolved) + if err != nil { + return ErrorResult(fmt.Sprintf("file not found: %v", err)) + } + if info.IsDir() { + return ErrorResult("path is a directory, expected an image file") + } + if info.Size() > int64(t.maxFileSize) { + return ErrorResult(fmt.Sprintf( + "file too large: %d bytes (max %d bytes)", info.Size(), t.maxFileSize, + )) + } + + // Detect MIME type — reuse the helper already in send_file.go + mediaType := detectMediaType(resolved) + if !strings.HasPrefix(mediaType, "image/") { + return ErrorResult(fmt.Sprintf( + "file does not appear to be an image (detected type: %s)", mediaType, + )) + } + + filename := filepath.Base(resolved) + scope := fmt.Sprintf("tool:load_image:%s:%s", channel, chatID) + + ref, err := t.mediaStore.Store(resolved, media.MediaMeta{ + Filename: filename, + ContentType: mediaType, + Source: "tool:load_image", + CleanupPolicy: media.CleanupPolicyForgetOnly, + }, scope) + if err != nil { + return ErrorResult(fmt.Sprintf("failed to register image in media store: %v", err)) + } + + // Build the tool result text. The media:// ref will be picked up by + // resolveMediaRefs in loop_media.go and converted to a base64 data URL + // before the next LLM call, exactly like channel-received images. + msg := fmt.Sprintf("Image loaded: %s\n[image: %s]", filename, ref) + + return &ToolResult{ + ForLLM: msg, + ForUser: fmt.Sprintf("Loaded image: %s", filename), + // Media refs inside ForLLM are resolved by resolveMediaRefs in the + // agent loop before the next LLM call. Do NOT use MediaResult here — + // that would send the file to the user channel instead. + Media: []string{ref}, + } +} diff --git a/pkg/tools/load_image_test.go b/pkg/tools/load_image_test.go new file mode 100644 index 000000000..91118f93e --- /dev/null +++ b/pkg/tools/load_image_test.go @@ -0,0 +1,174 @@ +package tools + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/sipeed/picoclaw/pkg/config" + "github.com/sipeed/picoclaw/pkg/media" + "github.com/sipeed/picoclaw/pkg/providers" +) + +func TestLoadImage_PathRequired(t *testing.T) { + tool := NewLoadImageTool("/tmp", false, 0, nil) + ctx := WithToolContext(context.Background(), "test", "chat1") + result := tool.Execute(ctx, map[string]any{}) + if !result.IsError { + t.Fatal("expected error for missing path") + } +} + +func TestLoadImage_NilMediaStore(t *testing.T) { + tool := NewLoadImageTool("/tmp", false, 0, nil) + ctx := WithToolContext(context.Background(), "test", "chat1") + result := tool.Execute(ctx, map[string]any{"path": "test.png"}) + if !result.IsError || result.ForLLM != "media store not configured" { + t.Fatalf("expected media store error, got: %s", result.ForLLM) + } +} + +func TestLoadImage_NoChannelContext(t *testing.T) { + store := media.NewFileMediaStore() + tool := NewLoadImageTool("/tmp", false, 0, store) + // No WithToolContext — should fail + result := tool.Execute(context.Background(), map[string]any{"path": "test.png"}) + if !result.IsError || result.ForLLM != "no target channel/chat available" { + t.Fatalf("expected channel error, got: %s", result.ForLLM) + } +} + +func TestLoadImage_NonImageFile(t *testing.T) { + dir := t.TempDir() + txtFile := filepath.Join(dir, "readme.txt") + os.WriteFile(txtFile, []byte("hello"), 0o644) + + store := media.NewFileMediaStore() + tool := NewLoadImageTool(dir, false, 0, store) + ctx := WithToolContext(context.Background(), "test", "chat1") + result := tool.Execute(ctx, map[string]any{"path": txtFile}) + if !result.IsError { + t.Fatal("expected error for non-image file") + } +} + +func TestLoadImage_DefaultMaxSize(t *testing.T) { + tool := NewLoadImageTool("/tmp", false, 0, nil) + if tool.maxFileSize != config.DefaultMaxMediaSize { + t.Errorf("expected default max size %d, got %d", config.DefaultMaxMediaSize, tool.maxFileSize) + } +} + +func TestLoadImage_FileTooLarge(t *testing.T) { + dir := t.TempDir() + bigFile := filepath.Join(dir, "big.png") + // Create a file with PNG header but exceeding max size + data := make([]byte, 1024) + copy(data, []byte{0x89, 0x50, 0x4E, 0x47}) // PNG magic bytes + os.WriteFile(bigFile, data, 0o644) + + store := media.NewFileMediaStore() + tool := NewLoadImageTool(dir, false, 512, store) // maxSize = 512 + ctx := WithToolContext(context.Background(), "test", "chat1") + result := tool.Execute(ctx, map[string]any{"path": bigFile}) + if !result.IsError { + t.Fatal("expected error for oversized file") + } +} + +func TestSubagentManager_SetMediaResolver_StoresResolver(t *testing.T) { + manager := NewSubagentManager(nil, "gpt-test", "/tmp") + + called := false + manager.SetMediaResolver(func(msgs []providers.Message) []providers.Message { + called = true + return msgs + }) + + manager.mu.RLock() + got := manager.mediaResolver + manager.mu.RUnlock() + + if got == nil { + t.Fatal("expected mediaResolver to be set") + } + + if called { + t.Fatal("resolver should not be called during SetMediaResolver") + } +} + +func TestLoadImage_SuccessPath(t *testing.T) { + dir := t.TempDir() + + // Create a minimal valid PNG file (8-byte signature + minimal IHDR + IEND). + // The PNG spec requires the 8-byte magic header: 0x89 P N G \r \n 0x1a \n + pngSignature := []byte{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A} + // IHDR chunk: length(13) + "IHDR" + 1x1 px, 8-bit RGB, no interlace + CRC + ihdr := []byte{ + 0x00, 0x00, 0x00, 0x0D, // chunk length = 13 + 0x49, 0x48, 0x44, 0x52, // "IHDR" + 0x00, 0x00, 0x00, 0x01, // width = 1 + 0x00, 0x00, 0x00, 0x01, // height = 1 + 0x08, // bit depth = 8 + 0x02, // color type = RGB + 0x00, 0x00, 0x00, // compression, filter, interlace + 0x90, 0x77, 0x53, 0xDE, // CRC (valid for this IHDR) + } + // IEND chunk + iend := []byte{ + 0x00, 0x00, 0x00, 0x00, // chunk length = 0 + 0x49, 0x45, 0x4E, 0x44, // "IEND" + 0xAE, 0x42, 0x60, 0x82, // CRC + } + + pngData := make([]byte, 0, len(pngSignature)+len(ihdr)+len(iend)) + pngData = append(pngData, pngSignature...) + pngData = append(pngData, ihdr...) + pngData = append(pngData, iend...) + + imgPath := filepath.Join(dir, "test_image.png") + if err := os.WriteFile(imgPath, pngData, 0o644); err != nil { + t.Fatalf("failed to create test PNG: %v", err) + } + + store := media.NewFileMediaStore() + tool := NewLoadImageTool(dir, false, 0, store) + ctx := WithToolContext(context.Background(), "test", "chat1") + + result := tool.Execute(ctx, map[string]any{"path": imgPath}) + + // 1. Must not be an error + if result.IsError { + t.Fatalf("expected success, got error: %s", result.ForLLM) + } + + // 2. Media must contain exactly one media:// ref + if len(result.Media) != 1 { + t.Fatalf("expected 1 media ref, got %d", len(result.Media)) + } + if !strings.HasPrefix(result.Media[0], "media://") { + t.Errorf("expected media ref to start with 'media://', got: %s", result.Media[0]) + } + + // 3. ForLLM must contain the [image: marker + if !strings.Contains(result.ForLLM, "[image:") { + t.Errorf("expected ForLLM to contain '[image:' marker, got: %s", result.ForLLM) + } + + // 4. ForLLM should also contain the media:// ref + if !strings.Contains(result.ForLLM, result.Media[0]) { + t.Errorf("expected ForLLM to contain media ref %q, got: %s", result.Media[0], result.ForLLM) + } + + // 5. Verify the ref is resolvable in the store + resolved, err := store.Resolve(result.Media[0]) + if err != nil { + t.Fatalf("media ref not resolvable: %v", err) + } + if resolved != imgPath { + t.Errorf("expected resolved path %q, got %q", imgPath, resolved) + } +} diff --git a/pkg/tools/subagent.go b/pkg/tools/subagent.go index 9a1a8b802..ada89efb7 100644 --- a/pkg/tools/subagent.go +++ b/pkg/tools/subagent.go @@ -67,6 +67,12 @@ type SubagentManager struct { hasTemperature bool nextID int spawner SpawnSubTurnFunc + + // mediaResolver resolves media:// refs in tool-loop messages before + // each LLM call in the legacy RunToolLoop fallback path. + // This lets subagents reuse the same media handling behavior as the + // main agent loop without importing pkg/agent and creating a cycle. + mediaResolver func([]providers.Message) []providers.Message } func NewSubagentManager( @@ -90,6 +96,17 @@ func (sm *SubagentManager) SetSpawner(spawner SpawnSubTurnFunc) { sm.spawner = spawner } +// SetMediaResolver injects a message preprocessor that resolves media:// refs +// into LLM-ready content before each tool-loop iteration. +// This is only used by the legacy RunToolLoop fallback path. +func (sm *SubagentManager) SetMediaResolver( + resolver func([]providers.Message) []providers.Message, +) { + sm.mu.Lock() + defer sm.mu.Unlock() + sm.mediaResolver = resolver +} + // SetLLMOptions sets max tokens and temperature for subagent LLM calls. func (sm *SubagentManager) SetLLMOptions(maxTokens int, temperature float64) { sm.mu.Lock() @@ -177,6 +194,7 @@ func (sm *SubagentManager) runTask( temperature := sm.temperature hasMaxTokens := sm.hasMaxTokens hasTemperature := sm.hasTemperature + mediaResolver := sm.mediaResolver sm.mu.RUnlock() var result *ToolResult @@ -223,6 +241,7 @@ After completing the task, provide a clear summary of what was done.` Tools: tools, MaxIterations: maxIter, LLMOptions: llmOptions, + MediaResolver: mediaResolver, }, messages, task.OriginChannel, task.OriginChatID) if err == nil { diff --git a/pkg/tools/toolloop.go b/pkg/tools/toolloop.go index 387813e94..ac568f598 100644 --- a/pkg/tools/toolloop.go +++ b/pkg/tools/toolloop.go @@ -24,6 +24,11 @@ type ToolLoopConfig struct { Tools *ToolRegistry MaxIterations int LLMOptions map[string]any + + // MediaResolver resolves media:// refs in messages before each LLM call. + // This is optional and is mainly used by subagent legacy fallback execution + // so subagents can reuse the same multimodal media handling as the main loop. + MediaResolver func(messages []providers.Message) []providers.Message } // ToolLoopResult contains the result of running the tool loop. @@ -63,8 +68,27 @@ func RunToolLoop( if llmOpts == nil { llmOpts = map[string]any{} } - // 3. Call LLM - response, err := config.Provider.Chat(ctx, messages, providerToolDefs, config.Model, llmOpts) + + // 3. Resolve media:// refs and Call LLM. + // Tools like load_image produce media:// refs in their result messages. + // Without this step, the LLM would receive raw "media://uuid" strings + // instead of base64-encoded image data URLs. + // + // We build a separate callMessages slice so that: + // (a) the resolver output is used for the LLM call only, + // (b) the original `messages` slice keeps the unresolved refs for + // subsequent iterations — the resolver is idempotent but working + // on the original avoids double-encoding issues. + // + // On iteration 1 the initial user messages typically have no media:// + // refs (they come from plain text), so this is effectively a no-op; + // it becomes relevant from iteration 2 onward when tool results may + // contain media refs. + callMessages := messages + if config.MediaResolver != nil && iteration > 1 { + callMessages = config.MediaResolver(messages) + } + response, err := config.Provider.Chat(ctx, callMessages, providerToolDefs, config.Model, llmOpts) if err != nil { logger.ErrorCF("toolloop", "LLM call failed", map[string]any{ @@ -161,11 +185,15 @@ func RunToolLoop( for _, r := range results { contentForLLM := r.result.ContentForLLM() - messages = append(messages, providers.Message{ + toolMsg := providers.Message{ Role: "tool", Content: contentForLLM, ToolCallID: r.tc.ID, - }) + } + if len(r.result.Media) > 0 && !r.result.ResponseHandled { + toolMsg.Media = append(toolMsg.Media, r.result.Media...) + } + messages = append(messages, toolMsg) } }