mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
164 lines
4.6 KiB
Go
164 lines
4.6 KiB
Go
package fstools
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"github.com/sipeed/picoclaw/pkg/config"
|
|
"github.com/sipeed/picoclaw/pkg/media"
|
|
)
|
|
|
|
// LoadImageTool loads a local image file into the MediaStore and returns a
|
|
// media:// reference. The agent loop's resolveMediaRefs will then base64-encode
|
|
// it and attach it as an image_url part in the next LLM request, enabling
|
|
// vision on local files — the same pipeline used when a user sends an image
|
|
// through a chat channel.
|
|
//
|
|
// This is intentionally different from SendFileTool:
|
|
// - SendFileTool → MediaResult + WithResponseHandled() → sends file to user, ends turn
|
|
// - LoadImageTool → plain ToolResult with media:// in ForLLM → LLM sees the image next turn
|
|
type LoadImageTool struct {
|
|
workspace string
|
|
restrict bool
|
|
maxFileSize int
|
|
mediaStore media.MediaStore
|
|
allowPaths []*regexp.Regexp
|
|
|
|
defaultChannel string
|
|
defaultChatID string
|
|
}
|
|
|
|
func NewLoadImageTool(
|
|
workspace string,
|
|
restrict bool,
|
|
maxFileSize int,
|
|
store media.MediaStore,
|
|
allowPaths ...[]*regexp.Regexp,
|
|
) *LoadImageTool {
|
|
if maxFileSize <= 0 {
|
|
maxFileSize = config.DefaultMaxMediaSize
|
|
}
|
|
var patterns []*regexp.Regexp
|
|
if len(allowPaths) > 0 {
|
|
patterns = allowPaths[0]
|
|
}
|
|
return &LoadImageTool{
|
|
workspace: workspace,
|
|
restrict: restrict,
|
|
maxFileSize: maxFileSize,
|
|
mediaStore: store,
|
|
allowPaths: patterns,
|
|
}
|
|
}
|
|
|
|
func (t *LoadImageTool) Name() string { return "load_image" }
|
|
|
|
func (t *LoadImageTool) Description() string {
|
|
return "Load a local image file so you can analyze its contents with vision. " +
|
|
"Supported formats: JPEG, PNG, GIF, WebP, BMP. " +
|
|
"After calling this tool, describe or analyze the image in your next response."
|
|
}
|
|
|
|
func (t *LoadImageTool) Parameters() map[string]any {
|
|
return map[string]any{
|
|
"type": "object",
|
|
"properties": map[string]any{
|
|
"path": map[string]any{
|
|
"type": "string",
|
|
"description": "Path to the local image file. Relative paths are resolved from workspace.",
|
|
},
|
|
},
|
|
"required": []string{"path"},
|
|
}
|
|
}
|
|
|
|
func (t *LoadImageTool) SetContext(channel, chatID string) {
|
|
t.defaultChannel = channel
|
|
t.defaultChatID = chatID
|
|
}
|
|
|
|
func (t *LoadImageTool) SetMediaStore(store media.MediaStore) {
|
|
t.mediaStore = store
|
|
}
|
|
|
|
func (t *LoadImageTool) Execute(ctx context.Context, args map[string]any) *ToolResult {
|
|
path, _ := args["path"].(string)
|
|
if strings.TrimSpace(path) == "" {
|
|
return ErrorResult("path is required")
|
|
}
|
|
|
|
// Prefer context-injected channel/chatID (set by ExecuteWithContext), fall back to SetContext values.
|
|
channel := ToolChannel(ctx)
|
|
if channel == "" {
|
|
channel = t.defaultChannel
|
|
}
|
|
chatID := ToolChatID(ctx)
|
|
if chatID == "" {
|
|
chatID = t.defaultChatID
|
|
}
|
|
if channel == "" || chatID == "" {
|
|
return ErrorResult("no target channel/chat available")
|
|
}
|
|
|
|
if t.mediaStore == nil {
|
|
return ErrorResult("media store not configured")
|
|
}
|
|
|
|
resolved, err := validatePathWithAllowPaths(path, t.workspace, t.restrict, t.allowPaths)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("invalid path: %v", err))
|
|
}
|
|
|
|
info, err := os.Stat(resolved)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("file not found: %v", err))
|
|
}
|
|
if info.IsDir() {
|
|
return ErrorResult("path is a directory, expected an image file")
|
|
}
|
|
if info.Size() > int64(t.maxFileSize) {
|
|
return ErrorResult(fmt.Sprintf(
|
|
"file too large: %d bytes (max %d bytes)", info.Size(), t.maxFileSize,
|
|
))
|
|
}
|
|
|
|
// Detect MIME type — reuse the helper already in send_file.go
|
|
mediaType := detectMediaType(resolved)
|
|
if !strings.HasPrefix(mediaType, "image/") {
|
|
return ErrorResult(fmt.Sprintf(
|
|
"file does not appear to be an image (detected type: %s)", mediaType,
|
|
))
|
|
}
|
|
|
|
filename := filepath.Base(resolved)
|
|
scope := fmt.Sprintf("tool:load_image:%s:%s", channel, chatID)
|
|
|
|
ref, err := t.mediaStore.Store(resolved, media.MediaMeta{
|
|
Filename: filename,
|
|
ContentType: mediaType,
|
|
Source: "tool:load_image",
|
|
CleanupPolicy: media.CleanupPolicyForgetOnly,
|
|
}, scope)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("failed to register image in media store: %v", err))
|
|
}
|
|
|
|
// Build the tool result text. The media:// ref will be picked up by
|
|
// resolveMediaRefs in loop_media.go and converted to a base64 data URL
|
|
// before the next LLM call, exactly like channel-received images.
|
|
msg := fmt.Sprintf("Image loaded: %s\n[image: %s]", filename, ref)
|
|
|
|
return &ToolResult{
|
|
ForLLM: msg,
|
|
ForUser: fmt.Sprintf("Loaded image: %s", filename),
|
|
// Media refs inside ForLLM are resolved by resolveMediaRefs in the
|
|
// agent loop before the next LLM call. Do NOT use MediaResult here —
|
|
// that would send the file to the user channel instead.
|
|
Media: []string{ref},
|
|
}
|
|
}
|