feat: add load_image tool for local file vision

This commit is contained in:
Reusu
2026-03-28 16:05:30 +08:00
parent e011284d8f
commit 66924457bc
3 changed files with 176 additions and 2 deletions
+18
View File
@@ -246,6 +246,17 @@ func registerSharedTools(
agent.Tools.Register(sendFileTool)
}
if cfg.Tools.IsToolEnabled("load_image") {
loadImageTool := tools.NewLoadImageTool(
agent.Workspace,
cfg.Agents.Defaults.RestrictToWorkspace,
cfg.Agents.Defaults.GetMaxMediaSize(),
nil,
allowReadPaths,
)
agent.Tools.Register(loadImageTool)
}
// Skill discovery and installation tools
skills_enabled := cfg.Tools.IsToolEnabled("skills")
find_skills_enable := cfg.Tools.IsToolEnabled("find_skills")
@@ -1806,6 +1817,10 @@ turnLoop:
providerToolDefs = filtered
}
if iteration > 1 {
messages = resolveMediaRefs(messages, al.mediaStore, maxMediaSize)
}
callMessages := messages
if gracefulTerminal {
callMessages = append(append([]providers.Message(nil), messages...), ts.interruptHintMessage())
@@ -2499,6 +2514,9 @@ turnLoop:
Content: contentForLLM,
ToolCallID: toolCallID,
}
if len(toolResult.Media) > 0 && !toolResult.ResponseHandled {
toolResultMsg.Media = append(toolResultMsg.Media, toolResult.Media...)
}
al.emitEvent(
EventKindToolExecEnd,
ts.eventMeta("runTurn", "turn.tool.end"),
+152
View File
@@ -0,0 +1,152 @@
package tools
import (
"context"
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
"github.com/sipeed/picoclaw/pkg/config"
"github.com/sipeed/picoclaw/pkg/media"
)
// LoadImageTool is the tool that makes a local image file available to the
// model. Execute registers the file with a media.MediaStore and returns the
// resulting reference both inside the LLM-facing text and in ToolResult.Media.
//
// According to the original design notes, the agent loop's resolveMediaRefs is
// expected to turn that reference into an inline image for the next LLM
// request, reusing the pipeline for images received through a chat channel.
//
// It is deliberately not the same thing as SendFileTool:
//   - SendFileTool delivers a file to the user and marks the response handled.
//   - LoadImageTool returns an ordinary ToolResult, so the image is meant for
//     the LLM rather than for the user channel.
type LoadImageTool struct {
	// workspace is the base directory passed to path validation.
	workspace string
	// restrict is forwarded to path validation together with workspace.
	restrict bool
	// maxSize is the largest accepted file size, in bytes.
	maxSize int
	// mediaStore receives the image; Execute refuses to run while it is nil.
	mediaStore media.MediaStore
	// allowPaths is the optional pattern list forwarded to path validation.
	allowPaths []*regexp.Regexp
}
// NewLoadImageTool builds a LoadImageTool from the given settings.
//
// A non-positive maxSize is replaced by config.DefaultMaxMediaSize. allowPaths
// is variadic only so that callers may omit it: when at least one pattern list
// is supplied the first one is used and any further ones are ignored. store may
// be nil at construction time and provided later through SetMediaStore.
func NewLoadImageTool(
	workspace string,
	restrict bool,
	maxSize int,
	store media.MediaStore,
	allowPaths ...[]*regexp.Regexp,
) *LoadImageTool {
	tool := &LoadImageTool{
		workspace:  workspace,
		restrict:   restrict,
		maxSize:    maxSize,
		mediaStore: store,
	}
	if tool.maxSize <= 0 {
		tool.maxSize = config.DefaultMaxMediaSize
	}
	if len(allowPaths) != 0 {
		tool.allowPaths = allowPaths[0]
	}
	return tool
}
// Name reports the tool's identifier, "load_image".
func (t *LoadImageTool) Name() string {
	const toolName = "load_image"
	return toolName
}
// Description returns the natural-language summary of the tool: what it is
// for, which image formats it advertises, and what the model should do after
// calling it.
func (t *LoadImageTool) Description() string {
	const (
		purpose  = "Load a local image file so you can analyze its contents with vision. "
		formats  = "Supported formats: JPEG, PNG, GIF, WebP, BMP. "
		followUp = "After calling this tool, describe or analyze the image in your next response."
	)
	return purpose + formats + followUp
}
// Parameters returns the argument schema of the tool as a nested map: an
// object with a required string "path" and an optional string "prompt".
func (t *LoadImageTool) Parameters() map[string]any {
	pathParam := map[string]any{
		"type":        "string",
		"description": "Path to the local image file. Relative paths are resolved from workspace.",
	}
	promptParam := map[string]any{
		"type":        "string",
		"description": "Optional question or instruction about the image, e.g. 'What objects are in this image?'",
	}

	schema := map[string]any{
		"type": "object",
		"properties": map[string]any{
			"path":   pathParam,
			"prompt": promptParam,
		},
		"required": []string{"path"},
	}
	return schema
}
// SetMediaStore installs (or replaces) the media store that Execute registers
// images with. Until a non-nil store is set, Execute returns the
// "media store not configured" error.
func (t *LoadImageTool) SetMediaStore(store media.MediaStore) {
	t.mediaStore = store
}
// Execute validates the requested path, registers the image with the media
// store and returns a ToolResult carrying the resulting media reference.
//
// Recognised entries in args:
//   - "path" (string, required): location of the image; an empty or
//     whitespace-only value is rejected.
//   - "prompt" (string, optional): text echoed into the LLM-facing message.
//
// Every failure is reported through ErrorResult: missing path, unconfigured
// media store, path rejected by validation, file that cannot be stat'ed,
// directory or other non-regular file, file larger than maxSize, detected type
// that is not image/*, or a media store error.
//
// ctx is currently unused.
func (t *LoadImageTool) Execute(ctx context.Context, args map[string]any) *ToolResult {
	// A missing or non-string "path" yields "", which the check below rejects.
	path, _ := args["path"].(string)
	if strings.TrimSpace(path) == "" {
		return ErrorResult("path is required")
	}
	if t.mediaStore == nil {
		return ErrorResult("media store not configured")
	}

	resolved, err := validatePathWithAllowPaths(path, t.workspace, t.restrict, t.allowPaths)
	if err != nil {
		return ErrorResult(fmt.Sprintf("invalid path: %v", err))
	}

	info, err := os.Stat(resolved)
	if err != nil {
		// Only claim "not found" when that is what happened; permission and
		// other I/O errors get a message that does not mislead the caller.
		if os.IsNotExist(err) {
			return ErrorResult(fmt.Sprintf("file not found: %v", err))
		}
		return ErrorResult(fmt.Sprintf("cannot access file: %v", err))
	}
	if info.IsDir() {
		return ErrorResult("path is a directory, expected an image file")
	}
	// Size() is only well defined for regular files, so devices, FIFOs and
	// sockets must be rejected before the size limit can be trusted.
	if !info.Mode().IsRegular() {
		return ErrorResult("path is not a regular file, expected an image file")
	}
	if info.Size() > int64(t.maxSize) {
		return ErrorResult(fmt.Sprintf(
			"file too large: %d bytes (max %d bytes)", info.Size(), t.maxSize,
		))
	}

	// detectMediaType is a helper defined elsewhere in this package (the
	// original note says it lives in send_file.go).
	mediaType := detectMediaType(resolved)
	if !strings.HasPrefix(mediaType, "image/") {
		return ErrorResult(fmt.Sprintf(
			"file does not appear to be an image (detected type: %s)", mediaType,
		))
	}

	filename := filepath.Base(resolved)
	scope := fmt.Sprintf("tool:load_image:%s", filename)
	ref, err := t.mediaStore.Store(resolved, media.MediaMeta{
		Filename:      filename,
		ContentType:   mediaType,
		Source:        "tool:load_image",
		CleanupPolicy: media.CleanupPolicyForgetOnly,
	}, scope)
	if err != nil {
		return ErrorResult(fmt.Sprintf("failed to register image in media store: %v", err))
	}

	// The reference is embedded in the text as "[image: <ref>]". Per the
	// original design note, resolveMediaRefs in the agent loop is expected to
	// pick it up before the next LLM call.
	prompt, _ := args["prompt"].(string)
	var msg string
	if prompt != "" {
		msg = fmt.Sprintf("Image loaded: %s\n%s\n[image: %s]", filename, prompt, ref)
	} else {
		msg = fmt.Sprintf("Image loaded: %s\n[image: %s]", filename, ref)
	}

	// A plain ToolResult is returned on purpose: the original note warns that
	// MediaResult would send the file to the user channel instead.
	return &ToolResult{
		ForLLM:  msg,
		ForUser: fmt.Sprintf("📷 Loaded image: %s", filename),
		Media:   []string{ref},
	}
}
+6 -2
View File
@@ -161,11 +161,15 @@ func RunToolLoop(
for _, r := range results {
contentForLLM := r.result.ContentForLLM()
messages = append(messages, providers.Message{
toolMsg := providers.Message{
Role: "tool",
Content: contentForLLM,
ToolCallID: r.tc.ID,
})
}
if len(r.result.Media) > 0 {
toolMsg.Media = append(toolMsg.Media, r.result.Media...)
}
messages = append(messages, toolMsg)
}
}