Files
picoclaw/pkg/tools/load_image.go
T
reusu 28f69e71cc fix: address load_image PR review feedback
- Exclude load_image from sub-agent tools via Unregister after Clone,
  since RunToolLoop does not call resolveMediaRefs
- Add ToolRegistry.Unregister() method
- Fix scope collision: use channel:chatID instead of filename
- Add channel/chatID context resolution matching send_file pattern
- Add comment explaining iteration > 1 guard on resolveMediaRefs
- Remove emoji from ForUser for consistency with send_file
- Add load_image_test.go
2026-03-28 22:49:54 +08:00

174 lines
5.0 KiB
Go

package tools
import (
"context"
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
"github.com/sipeed/picoclaw/pkg/config"
"github.com/sipeed/picoclaw/pkg/media"
)
// LoadImageTool loads a local image file into the MediaStore and returns a
// media:// reference. The agent loop's resolveMediaRefs will then base64-encode
// it and attach it as an image_url part in the next LLM request, enabling
// vision on local files — the same pipeline used when a user sends an image
// through a chat channel.
//
// This is intentionally different from SendFileTool:
// - SendFileTool → MediaResult + WithResponseHandled() → sends file to user, ends turn
// - LoadImageTool → plain ToolResult with media:// in ForLLM → LLM sees the image next turn
type LoadImageTool struct {
// workspace is handed to validatePathWithAllowPaths when resolving the
// requested path; the tool's parameter description states that relative
// paths are resolved from it.
workspace string
// restrict is forwarded unchanged to validatePathWithAllowPaths.
// NOTE(review): presumably confines access to the workspace — confirm
// against that helper.
restrict bool
// maxFileSize is the largest file size, in bytes, that Execute accepts.
// NewLoadImageTool replaces non-positive values with
// config.DefaultMaxMediaSize.
maxFileSize int
// mediaStore is where Execute registers the image. Execute returns an
// error result while it is nil.
mediaStore media.MediaStore
// allowPaths is forwarded unchanged to validatePathWithAllowPaths.
// NOTE(review): presumably patterns for paths permitted despite restrict —
// confirm against that helper.
allowPaths []*regexp.Regexp
// defaultChannel is the channel Execute falls back to when the call
// context does not provide one. Set via SetContext.
defaultChannel string
// defaultChatID is the chat ID Execute falls back to when the call context
// does not provide one. Set via SetContext.
defaultChatID string
}
// NewLoadImageTool constructs a LoadImageTool.
//
// A maxFileSize of zero or less is replaced by config.DefaultMaxMediaSize.
// allowPaths is an optional trailing argument: only its first element is
// used, and any further elements are ignored. The default channel and chat ID
// are left empty; callers provide them later through SetContext.
func NewLoadImageTool(
	workspace string,
	restrict bool,
	maxFileSize int,
	store media.MediaStore,
	allowPaths ...[]*regexp.Regexp,
) *LoadImageTool {
	tool := &LoadImageTool{
		workspace:   workspace,
		restrict:    restrict,
		maxFileSize: maxFileSize,
		mediaStore:  store,
	}

	// Fall back to the package-wide limit when no usable limit was given.
	if tool.maxFileSize <= 0 {
		tool.maxFileSize = config.DefaultMaxMediaSize
	}

	// The variadic parameter only emulates an optional argument.
	if len(allowPaths) != 0 {
		tool.allowPaths = allowPaths[0]
	}

	return tool
}
// Name reports the tool's identifier, "load_image".
func (t *LoadImageTool) Name() string {
	return "load_image"
}
// Description returns the human-readable summary of the tool: what it does,
// which image formats it names as supported, and the instruction to analyze
// the image in the following response.
func (t *LoadImageTool) Description() string {
	const text = "Load a local image file so you can analyze its contents with vision. " +
		"Supported formats: JPEG, PNG, GIF, WebP, BMP. " +
		"After calling this tool, describe or analyze the image in your next response."
	return text
}
// Parameters returns the argument schema for the tool as a nested map: an
// object with a required string "path" and an optional string "prompt".
// A fresh map is built on every call.
func (t *LoadImageTool) Parameters() map[string]any {
	pathSchema := map[string]any{
		"type":        "string",
		"description": "Path to the local image file. Relative paths are resolved from workspace.",
	}
	promptSchema := map[string]any{
		"type":        "string",
		"description": "Optional question or instruction about the image, e.g. 'What objects are in this image?'",
	}

	schema := map[string]any{
		"type": "object",
		"properties": map[string]any{
			"path":   pathSchema,
			"prompt": promptSchema,
		},
		"required": []string{"path"},
	}
	return schema
}
// SetContext records the channel and chat ID that Execute falls back to when
// the call context does not supply them. Both values are overwritten
// unconditionally, including with empty strings.
func (t *LoadImageTool) SetContext(channel, chatID string) {
	t.defaultChannel, t.defaultChatID = channel, chatID
}
// SetMediaStore replaces the media store that Execute registers images in.
// Passing nil leaves the tool without a store, in which case Execute returns
// an error result.
func (t *LoadImageTool) SetMediaStore(store media.MediaStore) {
	t.mediaStore = store
}
// Execute validates the requested file, registers it with the media store,
// and returns a result carrying the resulting media reference.
//
// args["path"] is required. args["prompt"] is optional and, when non-empty,
// is echoed into the LLM-facing text ahead of the image marker. The channel
// and chat ID are taken from ctx via ToolChannel / ToolChatID, falling back to
// the values stored by SetContext.
//
// Failures are reported through ErrorResult, checked in this order: blank
// path, missing channel or chat ID, nil media store, path validation error,
// stat error, path is a directory, file larger than maxFileSize, detected
// type not starting with "image/", media store error.
func (t *LoadImageTool) Execute(ctx context.Context, args map[string]any) *ToolResult {
	rawPath, _ := args["path"].(string)
	if len(strings.TrimSpace(rawPath)) == 0 {
		return ErrorResult("path is required")
	}

	// Values injected into ctx (set by ExecuteWithContext) take precedence;
	// the SetContext defaults only fill in what ctx leaves empty.
	channel, chatID := ToolChannel(ctx), ToolChatID(ctx)
	if channel == "" {
		channel = t.defaultChannel
	}
	if chatID == "" {
		chatID = t.defaultChatID
	}

	switch {
	case channel == "" || chatID == "":
		return ErrorResult("no target channel/chat available")
	case t.mediaStore == nil:
		return ErrorResult("media store not configured")
	}

	absPath, err := validatePathWithAllowPaths(rawPath, t.workspace, t.restrict, t.allowPaths)
	if err != nil {
		return ErrorResult(fmt.Sprintf("invalid path: %v", err))
	}

	// Cases are evaluated top to bottom, so stat is only inspected once the
	// error case has been ruled out.
	stat, err := os.Stat(absPath)
	switch {
	case err != nil:
		return ErrorResult(fmt.Sprintf("file not found: %v", err))
	case stat.IsDir():
		return ErrorResult("path is a directory, expected an image file")
	case stat.Size() > int64(t.maxFileSize):
		return ErrorResult(fmt.Sprintf(
			"file too large: %d bytes (max %d bytes)", stat.Size(), t.maxFileSize,
		))
	}

	// MIME detection reuses the helper that lives in send_file.go.
	contentType := detectMediaType(absPath)
	if !strings.HasPrefix(contentType, "image/") {
		return ErrorResult(fmt.Sprintf(
			"file does not appear to be an image (detected type: %s)", contentType,
		))
	}

	name := filepath.Base(absPath)
	meta := media.MediaMeta{
		Filename:      name,
		ContentType:   contentType,
		Source:        "tool:load_image",
		CleanupPolicy: media.CleanupPolicyForgetOnly,
	}
	// The scope is keyed by channel and chat ID rather than by filename.
	scope := fmt.Sprintf("tool:load_image:%s:%s", channel, chatID)
	ref, err := t.mediaStore.Store(absPath, meta, scope)
	if err != nil {
		return ErrorResult(fmt.Sprintf("failed to register image in media store: %v", err))
	}

	// The media:// ref embedded in this text is what resolveMediaRefs in
	// loop_media.go picks up and converts to a base64 data URL before the
	// next LLM call, exactly like channel-received images.
	llmText := fmt.Sprintf("Image loaded: %s\n[image: %s]", name, ref)
	if question, _ := args["prompt"].(string); question != "" {
		llmText = fmt.Sprintf("Image loaded: %s\n%s\n[image: %s]", name, question, ref)
	}

	// Do NOT switch this to MediaResult: that would send the file to the
	// user channel instead of making it visible to the LLM.
	return &ToolResult{
		ForLLM:  llmText,
		ForUser: fmt.Sprintf("Loaded image: %s", name),
		Media:   []string{ref},
	}
}