From 66924457bc8067668a8b8fff866c9f014f2712b4 Mon Sep 17 00:00:00 2001 From: Reusu Date: Sat, 28 Mar 2026 16:05:30 +0800 Subject: [PATCH] feat: add load_image tool for local file vision --- pkg/agent/loop.go | 18 +++++ pkg/tools/load_image.go | 152 ++++++++++++++++++++++++++++++++++++++++ pkg/tools/toolloop.go | 8 ++- 3 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 pkg/tools/load_image.go diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index 48932b10b..819eeb4e6 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -246,6 +246,17 @@ func registerSharedTools( agent.Tools.Register(sendFileTool) } + if cfg.Tools.IsToolEnabled("load_image") { + loadImageTool := tools.NewLoadImageTool( + agent.Workspace, + cfg.Agents.Defaults.RestrictToWorkspace, + cfg.Agents.Defaults.GetMaxMediaSize(), + nil, + allowReadPaths, + ) + agent.Tools.Register(loadImageTool) + } + // Skill discovery and installation tools skills_enabled := cfg.Tools.IsToolEnabled("skills") find_skills_enable := cfg.Tools.IsToolEnabled("find_skills") @@ -1806,6 +1817,10 @@ turnLoop: providerToolDefs = filtered } + if iteration > 1 { + messages = resolveMediaRefs(messages, al.mediaStore, maxMediaSize) + } + callMessages := messages if gracefulTerminal { callMessages = append(append([]providers.Message(nil), messages...), ts.interruptHintMessage()) @@ -2499,6 +2514,9 @@ turnLoop: Content: contentForLLM, ToolCallID: toolCallID, } + if len(toolResult.Media) > 0 && !toolResult.ResponseHandled { + toolResultMsg.Media = append(toolResultMsg.Media, toolResult.Media...) 
+ } al.emitEvent( EventKindToolExecEnd, ts.eventMeta("runTurn", "turn.tool.end"), diff --git a/pkg/tools/load_image.go b/pkg/tools/load_image.go new file mode 100644 index 000000000..bd386346c --- /dev/null +++ b/pkg/tools/load_image.go @@ -0,0 +1,152 @@ +package tools + +import ( + "context" + "fmt" + "os" + "path/filepath" + "regexp" + "strings" + + "github.com/sipeed/picoclaw/pkg/config" + "github.com/sipeed/picoclaw/pkg/media" +) + +// LoadImageTool loads a local image file into the MediaStore and returns a +// media:// reference. The agent loop's resolveMediaRefs will then base64-encode +// it and attach it as an image_url part in the next LLM request, enabling +// vision on local files — the same pipeline used when a user sends an image +// through a chat channel. +// +// This is intentionally different from SendFileTool: +// - SendFileTool → MediaResult + WithResponseHandled() → sends file to user, ends turn +// - LoadImageTool → plain ToolResult with media:// in ForLLM → LLM sees the image next turn +type LoadImageTool struct { + workspace string + restrict bool + maxSize int + mediaStore media.MediaStore + allowPaths []*regexp.Regexp +} + +func NewLoadImageTool( + workspace string, + restrict bool, + maxSize int, + store media.MediaStore, + allowPaths ...[]*regexp.Regexp, +) *LoadImageTool { + if maxSize <= 0 { + maxSize = config.DefaultMaxMediaSize + } + var patterns []*regexp.Regexp + if len(allowPaths) > 0 { + patterns = allowPaths[0] + } + return &LoadImageTool{ + workspace: workspace, + restrict: restrict, + maxSize: maxSize, + mediaStore: store, + allowPaths: patterns, + } +} + +func (t *LoadImageTool) Name() string { return "load_image" } + +func (t *LoadImageTool) Description() string { + return "Load a local image file so you can analyze its contents with vision. " + + "Supported formats: JPEG, PNG, GIF, WebP, BMP. " + + "After calling this tool, describe or analyze the image in your next response." 
+} + +func (t *LoadImageTool) Parameters() map[string]any { + return map[string]any{ + "type": "object", + "properties": map[string]any{ + "path": map[string]any{ + "type": "string", + "description": "Path to the local image file. Relative paths are resolved from workspace.", + }, + "prompt": map[string]any{ + "type": "string", + "description": "Optional question or instruction about the image, e.g. 'What objects are in this image?'", + }, + }, + "required": []string{"path"}, + } +} + +func (t *LoadImageTool) SetMediaStore(store media.MediaStore) { + t.mediaStore = store +} + +func (t *LoadImageTool) Execute(ctx context.Context, args map[string]any) *ToolResult { + path, _ := args["path"].(string) + if strings.TrimSpace(path) == "" { + return ErrorResult("path is required") + } + + if t.mediaStore == nil { + return ErrorResult("media store not configured") + } + + resolved, err := validatePathWithAllowPaths(path, t.workspace, t.restrict, t.allowPaths) + if err != nil { + return ErrorResult(fmt.Sprintf("invalid path: %v", err)) + } + + info, err := os.Stat(resolved) + if err != nil { + return ErrorResult(fmt.Sprintf("file not found: %v", err)) + } + if info.IsDir() { + return ErrorResult("path is a directory, expected an image file") + } + if info.Size() > int64(t.maxSize) { + return ErrorResult(fmt.Sprintf( + "file too large: %d bytes (max %d bytes)", info.Size(), t.maxSize, + )) + } + + // Detect MIME type — reuse the helper already in send_file.go + mediaType := detectMediaType(resolved) + if !strings.HasPrefix(mediaType, "image/") { + return ErrorResult(fmt.Sprintf( + "file does not appear to be an image (detected type: %s)", mediaType, + )) + } + + filename := filepath.Base(resolved) + scope := fmt.Sprintf("tool:load_image:%s", filename) + + ref, err := t.mediaStore.Store(resolved, media.MediaMeta{ + Filename: filename, + ContentType: mediaType, + Source: "tool:load_image", + CleanupPolicy: media.CleanupPolicyForgetOnly, + }, scope) + if err != nil { + return 
ErrorResult(fmt.Sprintf("failed to register image in media store: %v", err)) + } + + // Build the tool result text. The media:// ref will be picked up by + // resolveMediaRefs in loop_media.go and converted to a base64 data URL + // before the next LLM call, exactly like channel-received images. + prompt, _ := args["prompt"].(string) + var msg string + if prompt != "" { + msg = fmt.Sprintf("Image loaded: %s\n%s\n[image: %s]", filename, prompt, ref) + } else { + msg = fmt.Sprintf("Image loaded: %s\n[image: %s]", filename, ref) + } + + return &ToolResult{ + ForLLM: msg, + ForUser: fmt.Sprintf("📷 Loaded image: %s", filename), + // The ref is also returned in Media: the agent loop copies it onto the + // tool message and runs resolveMediaRefs before the next LLM call. Do NOT + // use MediaResult here — that would send the file to the user channel instead. + Media: []string{ref}, + } +} diff --git a/pkg/tools/toolloop.go b/pkg/tools/toolloop.go index 387813e94..1704efeff 100644 --- a/pkg/tools/toolloop.go +++ b/pkg/tools/toolloop.go @@ -161,11 +161,15 @@ func RunToolLoop( for _, r := range results { contentForLLM := r.result.ContentForLLM() - messages = append(messages, providers.Message{ + toolMsg := providers.Message{ Role: "tool", Content: contentForLLM, ToolCallID: r.tc.ID, - }) + } + if len(r.result.Media) > 0 { + toolMsg.Media = append(toolMsg.Media, r.result.Media...) + } + messages = append(messages, toolMsg) } }