mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
31afad6e87
* feat: add load_image tool for local file vision
* fix: address load_image PR review feedback
- Exclude load_image from sub-agent tools via Unregister after Clone,
since RunToolLoop does not call resolveMediaRefs
- Add ToolRegistry.Unregister() method
- Fix scope collision: use channel:chatID instead of filename
- Add channel/chatID context resolution matching send_file pattern
- Add comment explaining iteration > 1 guard on resolveMediaRefs
- Remove emoji from ForUser for consistency with send_file
- Add load_image_test.go
* feat: enable load_image for subagents via MediaResolver in RunToolLoop
Instead of removing load_image from sub-agent tools (28f69e71), inject a
MediaResolver into the legacy RunToolLoop fallback path so media:// refs
are resolved to base64 before each LLM call — matching the main agent
loop behavior.
- Add MediaResolver field to ToolLoopConfig and call it on iteration > 1
- Add SubagentManager.SetMediaResolver() and wire it through runTask
- Remove ToolRegistry.Unregister() (no longer needed)
- Restore load_image in sub-agent tool set (revert Clone+Unregister)
- Add TestSubagentManager_SetMediaResolver_StoresResolver
* refactor(load_image): remove prompt parameter from tool schema
* test(tools): add success-path test for LoadImageTool
Add TestLoadImage_SuccessPath that creates a real PNG file with valid
magic bytes, calls Execute with WithToolContext, and verifies:
- result.IsError == false
- ToolResult.Media contains a media:// ref
- ToolResult.ForLLM contains the [image: marker
- media ref is resolvable in the store
Add explanatory comment in loop.go for why Media and ArtifactTags
coexist on non-ResponseHandled tool results (e.g. load_image).
* fix: preallocate slice in tests and add ResponseHandled guard in toolloop
Fix prealloc linter failure in load_image_test.go.
Prevent double-resolving media by checking ResponseHandled in toolloop.go.
* Register TTS tool if provider is available
---------
Co-authored-by: Reusu <admin@yumao.name>
Co-authored-by: 美電球 <hoshina@evaz.org>
164 lines
4.6 KiB
Go
164 lines
4.6 KiB
Go
package tools
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"github.com/sipeed/picoclaw/pkg/config"
|
|
"github.com/sipeed/picoclaw/pkg/media"
|
|
)
|
|
|
|
// LoadImageTool loads a local image file into the MediaStore and returns a
|
|
// media:// reference. The agent loop's resolveMediaRefs will then base64-encode
|
|
// it and attach it as an image_url part in the next LLM request, enabling
|
|
// vision on local files — the same pipeline used when a user sends an image
|
|
// through a chat channel.
|
|
//
|
|
// This is intentionally different from SendFileTool:
|
|
// - SendFileTool → MediaResult + WithResponseHandled() → sends file to user, ends turn
|
|
// - LoadImageTool → plain ToolResult with media:// in ForLLM → LLM sees the image next turn
|
|
type LoadImageTool struct {
|
|
workspace string
|
|
restrict bool
|
|
maxFileSize int
|
|
mediaStore media.MediaStore
|
|
allowPaths []*regexp.Regexp
|
|
|
|
defaultChannel string
|
|
defaultChatID string
|
|
}
|
|
|
|
func NewLoadImageTool(
|
|
workspace string,
|
|
restrict bool,
|
|
maxFileSize int,
|
|
store media.MediaStore,
|
|
allowPaths ...[]*regexp.Regexp,
|
|
) *LoadImageTool {
|
|
if maxFileSize <= 0 {
|
|
maxFileSize = config.DefaultMaxMediaSize
|
|
}
|
|
var patterns []*regexp.Regexp
|
|
if len(allowPaths) > 0 {
|
|
patterns = allowPaths[0]
|
|
}
|
|
return &LoadImageTool{
|
|
workspace: workspace,
|
|
restrict: restrict,
|
|
maxFileSize: maxFileSize,
|
|
mediaStore: store,
|
|
allowPaths: patterns,
|
|
}
|
|
}
|
|
|
|
// Name returns the identifier of this tool, "load_image".
func (t *LoadImageTool) Name() string {
	const toolName = "load_image"
	return toolName
}
|
|
|
|
func (t *LoadImageTool) Description() string {
|
|
return "Load a local image file so you can analyze its contents with vision. " +
|
|
"Supported formats: JPEG, PNG, GIF, WebP, BMP. " +
|
|
"After calling this tool, describe or analyze the image in your next response."
|
|
}
|
|
|
|
func (t *LoadImageTool) Parameters() map[string]any {
|
|
return map[string]any{
|
|
"type": "object",
|
|
"properties": map[string]any{
|
|
"path": map[string]any{
|
|
"type": "string",
|
|
"description": "Path to the local image file. Relative paths are resolved from workspace.",
|
|
},
|
|
},
|
|
"required": []string{"path"},
|
|
}
|
|
}
|
|
|
|
func (t *LoadImageTool) SetContext(channel, chatID string) {
|
|
t.defaultChannel = channel
|
|
t.defaultChatID = chatID
|
|
}
|
|
|
|
func (t *LoadImageTool) SetMediaStore(store media.MediaStore) {
|
|
t.mediaStore = store
|
|
}
|
|
|
|
func (t *LoadImageTool) Execute(ctx context.Context, args map[string]any) *ToolResult {
|
|
path, _ := args["path"].(string)
|
|
if strings.TrimSpace(path) == "" {
|
|
return ErrorResult("path is required")
|
|
}
|
|
|
|
// Prefer context-injected channel/chatID (set by ExecuteWithContext), fall back to SetContext values.
|
|
channel := ToolChannel(ctx)
|
|
if channel == "" {
|
|
channel = t.defaultChannel
|
|
}
|
|
chatID := ToolChatID(ctx)
|
|
if chatID == "" {
|
|
chatID = t.defaultChatID
|
|
}
|
|
if channel == "" || chatID == "" {
|
|
return ErrorResult("no target channel/chat available")
|
|
}
|
|
|
|
if t.mediaStore == nil {
|
|
return ErrorResult("media store not configured")
|
|
}
|
|
|
|
resolved, err := validatePathWithAllowPaths(path, t.workspace, t.restrict, t.allowPaths)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("invalid path: %v", err))
|
|
}
|
|
|
|
info, err := os.Stat(resolved)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("file not found: %v", err))
|
|
}
|
|
if info.IsDir() {
|
|
return ErrorResult("path is a directory, expected an image file")
|
|
}
|
|
if info.Size() > int64(t.maxFileSize) {
|
|
return ErrorResult(fmt.Sprintf(
|
|
"file too large: %d bytes (max %d bytes)", info.Size(), t.maxFileSize,
|
|
))
|
|
}
|
|
|
|
// Detect MIME type — reuse the helper already in send_file.go
|
|
mediaType := detectMediaType(resolved)
|
|
if !strings.HasPrefix(mediaType, "image/") {
|
|
return ErrorResult(fmt.Sprintf(
|
|
"file does not appear to be an image (detected type: %s)", mediaType,
|
|
))
|
|
}
|
|
|
|
filename := filepath.Base(resolved)
|
|
scope := fmt.Sprintf("tool:load_image:%s:%s", channel, chatID)
|
|
|
|
ref, err := t.mediaStore.Store(resolved, media.MediaMeta{
|
|
Filename: filename,
|
|
ContentType: mediaType,
|
|
Source: "tool:load_image",
|
|
CleanupPolicy: media.CleanupPolicyForgetOnly,
|
|
}, scope)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("failed to register image in media store: %v", err))
|
|
}
|
|
|
|
// Build the tool result text. The media:// ref will be picked up by
|
|
// resolveMediaRefs in loop_media.go and converted to a base64 data URL
|
|
// before the next LLM call, exactly like channel-received images.
|
|
msg := fmt.Sprintf("Image loaded: %s\n[image: %s]", filename, ref)
|
|
|
|
return &ToolResult{
|
|
ForLLM: msg,
|
|
ForUser: fmt.Sprintf("Loaded image: %s", filename),
|
|
// Media refs inside ForLLM are resolved by resolveMediaRefs in the
|
|
// agent loop before the next LLM call. Do NOT use MediaResult here —
|
|
// that would send the file to the user channel instead.
|
|
Media: []string{ref},
|
|
}
|
|
}
|