mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
feat: add load_image tool for local file vision
This commit is contained in:
@@ -246,6 +246,17 @@ func registerSharedTools(
|
||||
agent.Tools.Register(sendFileTool)
|
||||
}
|
||||
|
||||
if cfg.Tools.IsToolEnabled("load_image") {
|
||||
loadImageTool := tools.NewLoadImageTool(
|
||||
agent.Workspace,
|
||||
cfg.Agents.Defaults.RestrictToWorkspace,
|
||||
cfg.Agents.Defaults.GetMaxMediaSize(),
|
||||
nil,
|
||||
allowReadPaths,
|
||||
)
|
||||
agent.Tools.Register(loadImageTool)
|
||||
}
|
||||
|
||||
// Skill discovery and installation tools
|
||||
skills_enabled := cfg.Tools.IsToolEnabled("skills")
|
||||
find_skills_enable := cfg.Tools.IsToolEnabled("find_skills")
|
||||
@@ -1806,6 +1817,10 @@ turnLoop:
|
||||
providerToolDefs = filtered
|
||||
}
|
||||
|
||||
if iteration > 1 {
|
||||
messages = resolveMediaRefs(messages, al.mediaStore, maxMediaSize)
|
||||
}
|
||||
|
||||
callMessages := messages
|
||||
if gracefulTerminal {
|
||||
callMessages = append(append([]providers.Message(nil), messages...), ts.interruptHintMessage())
|
||||
@@ -2499,6 +2514,9 @@ turnLoop:
|
||||
Content: contentForLLM,
|
||||
ToolCallID: toolCallID,
|
||||
}
|
||||
if len(toolResult.Media) > 0 && !toolResult.ResponseHandled {
|
||||
toolResultMsg.Media = append(toolResultMsg.Media, toolResult.Media...)
|
||||
}
|
||||
al.emitEvent(
|
||||
EventKindToolExecEnd,
|
||||
ts.eventMeta("runTurn", "turn.tool.end"),
|
||||
|
||||
@@ -0,0 +1,152 @@
|
||||
package tools
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/sipeed/picoclaw/pkg/config"
|
||||
"github.com/sipeed/picoclaw/pkg/media"
|
||||
)
|
||||
|
||||
// LoadImageTool loads a local image file into the MediaStore and returns a
|
||||
// media:// reference. The agent loop's resolveMediaRefs will then base64-encode
|
||||
// it and attach it as an image_url part in the next LLM request, enabling
|
||||
// vision on local files — the same pipeline used when a user sends an image
|
||||
// through a chat channel.
|
||||
//
|
||||
// This is intentionally different from SendFileTool:
|
||||
// - SendFileTool → MediaResult + WithResponseHandled() → sends file to user, ends turn
|
||||
// - LoadImageTool → plain ToolResult with media:// in ForLLM → LLM sees the image next turn
|
||||
type LoadImageTool struct {
|
||||
workspace string
|
||||
restrict bool
|
||||
maxSize int
|
||||
mediaStore media.MediaStore
|
||||
allowPaths []*regexp.Regexp
|
||||
}
|
||||
|
||||
func NewLoadImageTool(
|
||||
workspace string,
|
||||
restrict bool,
|
||||
maxSize int,
|
||||
store media.MediaStore,
|
||||
allowPaths ...[]*regexp.Regexp,
|
||||
) *LoadImageTool {
|
||||
if maxSize <= 0 {
|
||||
maxSize = config.DefaultMaxMediaSize
|
||||
}
|
||||
var patterns []*regexp.Regexp
|
||||
if len(allowPaths) > 0 {
|
||||
patterns = allowPaths[0]
|
||||
}
|
||||
return &LoadImageTool{
|
||||
workspace: workspace,
|
||||
restrict: restrict,
|
||||
maxSize: maxSize,
|
||||
mediaStore: store,
|
||||
allowPaths: patterns,
|
||||
}
|
||||
}
|
||||
|
||||
// Name returns the identifier under which this tool is exposed.
func (t *LoadImageTool) Name() string {
	return "load_image"
}
|
||||
|
||||
func (t *LoadImageTool) Description() string {
|
||||
return "Load a local image file so you can analyze its contents with vision. " +
|
||||
"Supported formats: JPEG, PNG, GIF, WebP, BMP. " +
|
||||
"After calling this tool, describe or analyze the image in your next response."
|
||||
}
|
||||
|
||||
func (t *LoadImageTool) Parameters() map[string]any {
|
||||
return map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"path": map[string]any{
|
||||
"type": "string",
|
||||
"description": "Path to the local image file. Relative paths are resolved from workspace.",
|
||||
},
|
||||
"prompt": map[string]any{
|
||||
"type": "string",
|
||||
"description": "Optional question or instruction about the image, e.g. 'What objects are in this image?'",
|
||||
},
|
||||
},
|
||||
"required": []string{"path"},
|
||||
}
|
||||
}
|
||||
|
||||
func (t *LoadImageTool) SetMediaStore(store media.MediaStore) {
|
||||
t.mediaStore = store
|
||||
}
|
||||
|
||||
func (t *LoadImageTool) Execute(ctx context.Context, args map[string]any) *ToolResult {
|
||||
path, _ := args["path"].(string)
|
||||
if strings.TrimSpace(path) == "" {
|
||||
return ErrorResult("path is required")
|
||||
}
|
||||
|
||||
if t.mediaStore == nil {
|
||||
return ErrorResult("media store not configured")
|
||||
}
|
||||
|
||||
resolved, err := validatePathWithAllowPaths(path, t.workspace, t.restrict, t.allowPaths)
|
||||
if err != nil {
|
||||
return ErrorResult(fmt.Sprintf("invalid path: %v", err))
|
||||
}
|
||||
|
||||
info, err := os.Stat(resolved)
|
||||
if err != nil {
|
||||
return ErrorResult(fmt.Sprintf("file not found: %v", err))
|
||||
}
|
||||
if info.IsDir() {
|
||||
return ErrorResult("path is a directory, expected an image file")
|
||||
}
|
||||
if info.Size() > int64(t.maxSize) {
|
||||
return ErrorResult(fmt.Sprintf(
|
||||
"file too large: %d bytes (max %d bytes)", info.Size(), t.maxSize,
|
||||
))
|
||||
}
|
||||
|
||||
// Detect MIME type — reuse the helper already in send_file.go
|
||||
mediaType := detectMediaType(resolved)
|
||||
if !strings.HasPrefix(mediaType, "image/") {
|
||||
return ErrorResult(fmt.Sprintf(
|
||||
"file does not appear to be an image (detected type: %s)", mediaType,
|
||||
))
|
||||
}
|
||||
|
||||
filename := filepath.Base(resolved)
|
||||
scope := fmt.Sprintf("tool:load_image:%s", filename)
|
||||
|
||||
ref, err := t.mediaStore.Store(resolved, media.MediaMeta{
|
||||
Filename: filename,
|
||||
ContentType: mediaType,
|
||||
Source: "tool:load_image",
|
||||
CleanupPolicy: media.CleanupPolicyForgetOnly,
|
||||
}, scope)
|
||||
if err != nil {
|
||||
return ErrorResult(fmt.Sprintf("failed to register image in media store: %v", err))
|
||||
}
|
||||
|
||||
// Build the tool result text. The media:// ref will be picked up by
|
||||
// resolveMediaRefs in loop_media.go and converted to a base64 data URL
|
||||
// before the next LLM call, exactly like channel-received images.
|
||||
prompt, _ := args["prompt"].(string)
|
||||
var msg string
|
||||
if prompt != "" {
|
||||
msg = fmt.Sprintf("Image loaded: %s\n%s\n[image: %s]", filename, prompt, ref)
|
||||
} else {
|
||||
msg = fmt.Sprintf("Image loaded: %s\n[image: %s]", filename, ref)
|
||||
}
|
||||
|
||||
return &ToolResult{
|
||||
ForLLM: msg,
|
||||
ForUser: fmt.Sprintf("📷 Loaded image: %s", filename),
|
||||
// Media refs inside ForLLM are resolved by resolveMediaRefs in the
|
||||
// agent loop before the next LLM call. Do NOT use MediaResult here —
|
||||
// that would send the file to the user channel instead.
|
||||
Media: []string{ref},
|
||||
}
|
||||
}
|
||||
@@ -161,11 +161,15 @@ func RunToolLoop(
|
||||
for _, r := range results {
|
||||
contentForLLM := r.result.ContentForLLM()
|
||||
|
||||
messages = append(messages, providers.Message{
|
||||
toolMsg := providers.Message{
|
||||
Role: "tool",
|
||||
Content: contentForLLM,
|
||||
ToolCallID: r.tc.ID,
|
||||
})
|
||||
}
|
||||
if len(r.result.Media) > 0 {
|
||||
toolMsg.Media = append(toolMsg.Media, r.result.Media...)
|
||||
}
|
||||
messages = append(messages, toolMsg)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user