mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
feat: add load_image tool for local file vision (#2116)
* feat: add load_image tool for local file vision
* fix: address load_image PR review feedback
- Exclude load_image from sub-agent tools via Unregister after Clone,
since RunToolLoop does not call resolveMediaRefs
- Add ToolRegistry.Unregister() method
- Fix scope collision: use channel:chatID instead of filename
- Add channel/chatID context resolution matching send_file pattern
- Add comment explaining iteration > 1 guard on resolveMediaRefs
- Remove emoji from ForUser for consistency with send_file
- Add load_image_test.go
* feat: enable load_image for subagents via MediaResolver in RunToolLoop
Instead of removing load_image from sub-agent tools (28f69e71), inject a
MediaResolver into the legacy RunToolLoop fallback path so media:// refs
are resolved to base64 before each LLM call — matching the main agent
loop behavior.
- Add MediaResolver field to ToolLoopConfig and call it on iteration > 1
- Add SubagentManager.SetMediaResolver() and wire it through runTask
- Remove ToolRegistry.Unregister() (no longer needed)
- Restore load_image in sub-agent tool set (revert Clone+Unregister)
- Add TestSubagentManager_SetMediaResolver_StoresResolver
* refactor(load_image): remove prompt parameter from tool schema
* test(tools): add success-path test for LoadImageTool
Add TestLoadImage_SuccessPath that creates a real PNG file with valid
magic bytes, calls Execute with WithToolContext, and verifies:
- result.IsError == false
- ToolResult.Media contains a media:// ref
- ToolResult.ForLLM contains the [image: marker
- media ref is resolvable in the store
Add explanatory comment in loop.go for why Media and ArtifactTags
coexist on non-ResponseHandled tool results (e.g. load_image).
* fix: preallocate slice in tests and add ResponseHandled guard in toolloop
Fix prealloc linter failure in load_image_test.go.
Prevent double-resolving media by checking ResponseHandled in toolloop.go.
* Register TTS tool if provider is available
---------
Co-authored-by: Reusu <admin@yumao.name>
Co-authored-by: 美電球 <hoshina@evaz.org>
This commit is contained in:
@@ -281,6 +281,17 @@ func registerSharedTools(
|
||||
agent.Tools.Register(tools.NewSendTTSTool(ttsProvider, nil))
|
||||
}
|
||||
|
||||
if cfg.Tools.IsToolEnabled("load_image") {
|
||||
loadImageTool := tools.NewLoadImageTool(
|
||||
agent.Workspace,
|
||||
cfg.Agents.Defaults.RestrictToWorkspace,
|
||||
cfg.Agents.Defaults.GetMaxMediaSize(),
|
||||
nil,
|
||||
allowReadPaths,
|
||||
)
|
||||
agent.Tools.Register(loadImageTool)
|
||||
}
|
||||
|
||||
// Skill discovery and installation tools
|
||||
skills_enabled := cfg.Tools.IsToolEnabled("skills")
|
||||
find_skills_enable := cfg.Tools.IsToolEnabled("find_skills")
|
||||
@@ -323,6 +334,14 @@ func registerSharedTools(
|
||||
subagentManager := tools.NewSubagentManager(provider, agent.Model, agent.Workspace)
|
||||
subagentManager.SetLLMOptions(agent.MaxTokens, agent.Temperature)
|
||||
|
||||
// Inject a media resolver so the legacy RunToolLoop fallback path can
|
||||
// resolve media:// refs in the same way the main AgentLoop does.
|
||||
// This keeps subagent vision support working even when the optimized
|
||||
// sub-turn spawner path is unavailable.
|
||||
subagentManager.SetMediaResolver(func(msgs []providers.Message) []providers.Message {
|
||||
return resolveMediaRefs(msgs, al.mediaStore, cfg.Agents.Defaults.GetMaxMediaSize())
|
||||
})
|
||||
|
||||
// Set the spawner that links into AgentLoop's turnState
|
||||
subagentManager.SetSpawner(func(
|
||||
ctx context.Context,
|
||||
@@ -1861,6 +1880,14 @@ turnLoop:
|
||||
providerToolDefs = filtered
|
||||
}
|
||||
|
||||
// Resolve media:// refs produced by tool results (e.g. load_image).
|
||||
// Skipped on iteration 1 because inbound user media is already resolved
|
||||
// before entering the loop; only subsequent iterations can contain new
|
||||
// tool-generated media refs that need base64 encoding.
|
||||
if iteration > 1 {
|
||||
messages = resolveMediaRefs(messages, al.mediaStore, maxMediaSize)
|
||||
}
|
||||
|
||||
callMessages := messages
|
||||
if gracefulTerminal {
|
||||
callMessages = append(append([]providers.Message(nil), messages...), ts.interruptHintMessage())
|
||||
@@ -2551,6 +2578,13 @@ turnLoop:
|
||||
}
|
||||
|
||||
if len(toolResult.Media) > 0 && !toolResult.ResponseHandled {
|
||||
// For tools like load_image that produce media refs without sending them
|
||||
// to the user channel (ResponseHandled == false), both Media and ArtifactTags
|
||||
// coexist on the result:
|
||||
// - Media: carries media:// refs that resolveMediaRefs will base64-encode
|
||||
// into image_url parts in the next LLM iteration (enabling vision).
|
||||
// - ArtifactTags: exposes the local file path as a structured [file:…] tag
|
||||
// in the tool result text, so the LLM knows an artifact was produced.
|
||||
toolResult.ArtifactTags = buildArtifactTags(al.mediaStore, toolResult.Media)
|
||||
}
|
||||
|
||||
@@ -2570,6 +2604,9 @@ turnLoop:
|
||||
Content: contentForLLM,
|
||||
ToolCallID: toolCallID,
|
||||
}
|
||||
if len(toolResult.Media) > 0 && !toolResult.ResponseHandled {
|
||||
toolResultMsg.Media = append(toolResultMsg.Media, toolResult.Media...)
|
||||
}
|
||||
al.emitEvent(
|
||||
EventKindToolExecEnd,
|
||||
ts.eventMeta("runTurn", "turn.tool.end"),
|
||||
|
||||
@@ -0,0 +1,163 @@
|
||||
package tools
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/sipeed/picoclaw/pkg/config"
|
||||
"github.com/sipeed/picoclaw/pkg/media"
|
||||
)
|
||||
|
||||
// LoadImageTool loads a local image file into the MediaStore and returns a
|
||||
// media:// reference. The agent loop's resolveMediaRefs will then base64-encode
|
||||
// it and attach it as an image_url part in the next LLM request, enabling
|
||||
// vision on local files — the same pipeline used when a user sends an image
|
||||
// through a chat channel.
|
||||
//
|
||||
// This is intentionally different from SendFileTool:
|
||||
// - SendFileTool → MediaResult + WithResponseHandled() → sends file to user, ends turn
|
||||
// - LoadImageTool → plain ToolResult with media:// in ForLLM → LLM sees the image next turn
|
||||
type LoadImageTool struct {
|
||||
workspace string
|
||||
restrict bool
|
||||
maxFileSize int
|
||||
mediaStore media.MediaStore
|
||||
allowPaths []*regexp.Regexp
|
||||
|
||||
defaultChannel string
|
||||
defaultChatID string
|
||||
}
|
||||
|
||||
func NewLoadImageTool(
|
||||
workspace string,
|
||||
restrict bool,
|
||||
maxFileSize int,
|
||||
store media.MediaStore,
|
||||
allowPaths ...[]*regexp.Regexp,
|
||||
) *LoadImageTool {
|
||||
if maxFileSize <= 0 {
|
||||
maxFileSize = config.DefaultMaxMediaSize
|
||||
}
|
||||
var patterns []*regexp.Regexp
|
||||
if len(allowPaths) > 0 {
|
||||
patterns = allowPaths[0]
|
||||
}
|
||||
return &LoadImageTool{
|
||||
workspace: workspace,
|
||||
restrict: restrict,
|
||||
maxFileSize: maxFileSize,
|
||||
mediaStore: store,
|
||||
allowPaths: patterns,
|
||||
}
|
||||
}
|
||||
|
||||
func (t *LoadImageTool) Name() string { return "load_image" }
|
||||
|
||||
func (t *LoadImageTool) Description() string {
|
||||
return "Load a local image file so you can analyze its contents with vision. " +
|
||||
"Supported formats: JPEG, PNG, GIF, WebP, BMP. " +
|
||||
"After calling this tool, describe or analyze the image in your next response."
|
||||
}
|
||||
|
||||
func (t *LoadImageTool) Parameters() map[string]any {
|
||||
return map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"path": map[string]any{
|
||||
"type": "string",
|
||||
"description": "Path to the local image file. Relative paths are resolved from workspace.",
|
||||
},
|
||||
},
|
||||
"required": []string{"path"},
|
||||
}
|
||||
}
|
||||
|
||||
func (t *LoadImageTool) SetContext(channel, chatID string) {
|
||||
t.defaultChannel = channel
|
||||
t.defaultChatID = chatID
|
||||
}
|
||||
|
||||
func (t *LoadImageTool) SetMediaStore(store media.MediaStore) {
|
||||
t.mediaStore = store
|
||||
}
|
||||
|
||||
func (t *LoadImageTool) Execute(ctx context.Context, args map[string]any) *ToolResult {
|
||||
path, _ := args["path"].(string)
|
||||
if strings.TrimSpace(path) == "" {
|
||||
return ErrorResult("path is required")
|
||||
}
|
||||
|
||||
// Prefer context-injected channel/chatID (set by ExecuteWithContext), fall back to SetContext values.
|
||||
channel := ToolChannel(ctx)
|
||||
if channel == "" {
|
||||
channel = t.defaultChannel
|
||||
}
|
||||
chatID := ToolChatID(ctx)
|
||||
if chatID == "" {
|
||||
chatID = t.defaultChatID
|
||||
}
|
||||
if channel == "" || chatID == "" {
|
||||
return ErrorResult("no target channel/chat available")
|
||||
}
|
||||
|
||||
if t.mediaStore == nil {
|
||||
return ErrorResult("media store not configured")
|
||||
}
|
||||
|
||||
resolved, err := validatePathWithAllowPaths(path, t.workspace, t.restrict, t.allowPaths)
|
||||
if err != nil {
|
||||
return ErrorResult(fmt.Sprintf("invalid path: %v", err))
|
||||
}
|
||||
|
||||
info, err := os.Stat(resolved)
|
||||
if err != nil {
|
||||
return ErrorResult(fmt.Sprintf("file not found: %v", err))
|
||||
}
|
||||
if info.IsDir() {
|
||||
return ErrorResult("path is a directory, expected an image file")
|
||||
}
|
||||
if info.Size() > int64(t.maxFileSize) {
|
||||
return ErrorResult(fmt.Sprintf(
|
||||
"file too large: %d bytes (max %d bytes)", info.Size(), t.maxFileSize,
|
||||
))
|
||||
}
|
||||
|
||||
// Detect MIME type — reuse the helper already in send_file.go
|
||||
mediaType := detectMediaType(resolved)
|
||||
if !strings.HasPrefix(mediaType, "image/") {
|
||||
return ErrorResult(fmt.Sprintf(
|
||||
"file does not appear to be an image (detected type: %s)", mediaType,
|
||||
))
|
||||
}
|
||||
|
||||
filename := filepath.Base(resolved)
|
||||
scope := fmt.Sprintf("tool:load_image:%s:%s", channel, chatID)
|
||||
|
||||
ref, err := t.mediaStore.Store(resolved, media.MediaMeta{
|
||||
Filename: filename,
|
||||
ContentType: mediaType,
|
||||
Source: "tool:load_image",
|
||||
CleanupPolicy: media.CleanupPolicyForgetOnly,
|
||||
}, scope)
|
||||
if err != nil {
|
||||
return ErrorResult(fmt.Sprintf("failed to register image in media store: %v", err))
|
||||
}
|
||||
|
||||
// Build the tool result text. The media:// ref will be picked up by
|
||||
// resolveMediaRefs in loop_media.go and converted to a base64 data URL
|
||||
// before the next LLM call, exactly like channel-received images.
|
||||
msg := fmt.Sprintf("Image loaded: %s\n[image: %s]", filename, ref)
|
||||
|
||||
return &ToolResult{
|
||||
ForLLM: msg,
|
||||
ForUser: fmt.Sprintf("Loaded image: %s", filename),
|
||||
// Media refs inside ForLLM are resolved by resolveMediaRefs in the
|
||||
// agent loop before the next LLM call. Do NOT use MediaResult here —
|
||||
// that would send the file to the user channel instead.
|
||||
Media: []string{ref},
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,174 @@
|
||||
package tools
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/sipeed/picoclaw/pkg/config"
|
||||
"github.com/sipeed/picoclaw/pkg/media"
|
||||
"github.com/sipeed/picoclaw/pkg/providers"
|
||||
)
|
||||
|
||||
func TestLoadImage_PathRequired(t *testing.T) {
|
||||
tool := NewLoadImageTool("/tmp", false, 0, nil)
|
||||
ctx := WithToolContext(context.Background(), "test", "chat1")
|
||||
result := tool.Execute(ctx, map[string]any{})
|
||||
if !result.IsError {
|
||||
t.Fatal("expected error for missing path")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadImage_NilMediaStore(t *testing.T) {
|
||||
tool := NewLoadImageTool("/tmp", false, 0, nil)
|
||||
ctx := WithToolContext(context.Background(), "test", "chat1")
|
||||
result := tool.Execute(ctx, map[string]any{"path": "test.png"})
|
||||
if !result.IsError || result.ForLLM != "media store not configured" {
|
||||
t.Fatalf("expected media store error, got: %s", result.ForLLM)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadImage_NoChannelContext(t *testing.T) {
|
||||
store := media.NewFileMediaStore()
|
||||
tool := NewLoadImageTool("/tmp", false, 0, store)
|
||||
// No WithToolContext — should fail
|
||||
result := tool.Execute(context.Background(), map[string]any{"path": "test.png"})
|
||||
if !result.IsError || result.ForLLM != "no target channel/chat available" {
|
||||
t.Fatalf("expected channel error, got: %s", result.ForLLM)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadImage_NonImageFile(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
txtFile := filepath.Join(dir, "readme.txt")
|
||||
os.WriteFile(txtFile, []byte("hello"), 0o644)
|
||||
|
||||
store := media.NewFileMediaStore()
|
||||
tool := NewLoadImageTool(dir, false, 0, store)
|
||||
ctx := WithToolContext(context.Background(), "test", "chat1")
|
||||
result := tool.Execute(ctx, map[string]any{"path": txtFile})
|
||||
if !result.IsError {
|
||||
t.Fatal("expected error for non-image file")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadImage_DefaultMaxSize(t *testing.T) {
|
||||
tool := NewLoadImageTool("/tmp", false, 0, nil)
|
||||
if tool.maxFileSize != config.DefaultMaxMediaSize {
|
||||
t.Errorf("expected default max size %d, got %d", config.DefaultMaxMediaSize, tool.maxFileSize)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadImage_FileTooLarge(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
bigFile := filepath.Join(dir, "big.png")
|
||||
// Create a file with PNG header but exceeding max size
|
||||
data := make([]byte, 1024)
|
||||
copy(data, []byte{0x89, 0x50, 0x4E, 0x47}) // PNG magic bytes
|
||||
os.WriteFile(bigFile, data, 0o644)
|
||||
|
||||
store := media.NewFileMediaStore()
|
||||
tool := NewLoadImageTool(dir, false, 512, store) // maxSize = 512
|
||||
ctx := WithToolContext(context.Background(), "test", "chat1")
|
||||
result := tool.Execute(ctx, map[string]any{"path": bigFile})
|
||||
if !result.IsError {
|
||||
t.Fatal("expected error for oversized file")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSubagentManager_SetMediaResolver_StoresResolver(t *testing.T) {
|
||||
manager := NewSubagentManager(nil, "gpt-test", "/tmp")
|
||||
|
||||
called := false
|
||||
manager.SetMediaResolver(func(msgs []providers.Message) []providers.Message {
|
||||
called = true
|
||||
return msgs
|
||||
})
|
||||
|
||||
manager.mu.RLock()
|
||||
got := manager.mediaResolver
|
||||
manager.mu.RUnlock()
|
||||
|
||||
if got == nil {
|
||||
t.Fatal("expected mediaResolver to be set")
|
||||
}
|
||||
|
||||
if called {
|
||||
t.Fatal("resolver should not be called during SetMediaResolver")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadImage_SuccessPath(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
// Create a minimal valid PNG file (8-byte signature + minimal IHDR + IEND).
|
||||
// The PNG spec requires the 8-byte magic header: 0x89 P N G \r \n 0x1a \n
|
||||
pngSignature := []byte{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A}
|
||||
// IHDR chunk: length(13) + "IHDR" + 1x1 px, 8-bit RGB, no interlace + CRC
|
||||
ihdr := []byte{
|
||||
0x00, 0x00, 0x00, 0x0D, // chunk length = 13
|
||||
0x49, 0x48, 0x44, 0x52, // "IHDR"
|
||||
0x00, 0x00, 0x00, 0x01, // width = 1
|
||||
0x00, 0x00, 0x00, 0x01, // height = 1
|
||||
0x08, // bit depth = 8
|
||||
0x02, // color type = RGB
|
||||
0x00, 0x00, 0x00, // compression, filter, interlace
|
||||
0x90, 0x77, 0x53, 0xDE, // CRC (valid for this IHDR)
|
||||
}
|
||||
// IEND chunk
|
||||
iend := []byte{
|
||||
0x00, 0x00, 0x00, 0x00, // chunk length = 0
|
||||
0x49, 0x45, 0x4E, 0x44, // "IEND"
|
||||
0xAE, 0x42, 0x60, 0x82, // CRC
|
||||
}
|
||||
|
||||
pngData := make([]byte, 0, len(pngSignature)+len(ihdr)+len(iend))
|
||||
pngData = append(pngData, pngSignature...)
|
||||
pngData = append(pngData, ihdr...)
|
||||
pngData = append(pngData, iend...)
|
||||
|
||||
imgPath := filepath.Join(dir, "test_image.png")
|
||||
if err := os.WriteFile(imgPath, pngData, 0o644); err != nil {
|
||||
t.Fatalf("failed to create test PNG: %v", err)
|
||||
}
|
||||
|
||||
store := media.NewFileMediaStore()
|
||||
tool := NewLoadImageTool(dir, false, 0, store)
|
||||
ctx := WithToolContext(context.Background(), "test", "chat1")
|
||||
|
||||
result := tool.Execute(ctx, map[string]any{"path": imgPath})
|
||||
|
||||
// 1. Must not be an error
|
||||
if result.IsError {
|
||||
t.Fatalf("expected success, got error: %s", result.ForLLM)
|
||||
}
|
||||
|
||||
// 2. Media must contain exactly one media:// ref
|
||||
if len(result.Media) != 1 {
|
||||
t.Fatalf("expected 1 media ref, got %d", len(result.Media))
|
||||
}
|
||||
if !strings.HasPrefix(result.Media[0], "media://") {
|
||||
t.Errorf("expected media ref to start with 'media://', got: %s", result.Media[0])
|
||||
}
|
||||
|
||||
// 3. ForLLM must contain the [image: marker
|
||||
if !strings.Contains(result.ForLLM, "[image:") {
|
||||
t.Errorf("expected ForLLM to contain '[image:' marker, got: %s", result.ForLLM)
|
||||
}
|
||||
|
||||
// 4. ForLLM should also contain the media:// ref
|
||||
if !strings.Contains(result.ForLLM, result.Media[0]) {
|
||||
t.Errorf("expected ForLLM to contain media ref %q, got: %s", result.Media[0], result.ForLLM)
|
||||
}
|
||||
|
||||
// 5. Verify the ref is resolvable in the store
|
||||
resolved, err := store.Resolve(result.Media[0])
|
||||
if err != nil {
|
||||
t.Fatalf("media ref not resolvable: %v", err)
|
||||
}
|
||||
if resolved != imgPath {
|
||||
t.Errorf("expected resolved path %q, got %q", imgPath, resolved)
|
||||
}
|
||||
}
|
||||
@@ -67,6 +67,12 @@ type SubagentManager struct {
|
||||
hasTemperature bool
|
||||
nextID int
|
||||
spawner SpawnSubTurnFunc
|
||||
|
||||
// mediaResolver resolves media:// refs in tool-loop messages before
|
||||
// each LLM call in the legacy RunToolLoop fallback path.
|
||||
// This lets subagents reuse the same media handling behavior as the
|
||||
// main agent loop without importing pkg/agent and creating a cycle.
|
||||
mediaResolver func([]providers.Message) []providers.Message
|
||||
}
|
||||
|
||||
func NewSubagentManager(
|
||||
@@ -90,6 +96,17 @@ func (sm *SubagentManager) SetSpawner(spawner SpawnSubTurnFunc) {
|
||||
sm.spawner = spawner
|
||||
}
|
||||
|
||||
// SetMediaResolver injects a message preprocessor that resolves media:// refs
|
||||
// into LLM-ready content before each tool-loop iteration.
|
||||
// This is only used by the legacy RunToolLoop fallback path.
|
||||
func (sm *SubagentManager) SetMediaResolver(
|
||||
resolver func([]providers.Message) []providers.Message,
|
||||
) {
|
||||
sm.mu.Lock()
|
||||
defer sm.mu.Unlock()
|
||||
sm.mediaResolver = resolver
|
||||
}
|
||||
|
||||
// SetLLMOptions sets max tokens and temperature for subagent LLM calls.
|
||||
func (sm *SubagentManager) SetLLMOptions(maxTokens int, temperature float64) {
|
||||
sm.mu.Lock()
|
||||
@@ -177,6 +194,7 @@ func (sm *SubagentManager) runTask(
|
||||
temperature := sm.temperature
|
||||
hasMaxTokens := sm.hasMaxTokens
|
||||
hasTemperature := sm.hasTemperature
|
||||
mediaResolver := sm.mediaResolver
|
||||
sm.mu.RUnlock()
|
||||
|
||||
var result *ToolResult
|
||||
@@ -223,6 +241,7 @@ After completing the task, provide a clear summary of what was done.`
|
||||
Tools: tools,
|
||||
MaxIterations: maxIter,
|
||||
LLMOptions: llmOptions,
|
||||
MediaResolver: mediaResolver,
|
||||
}, messages, task.OriginChannel, task.OriginChatID)
|
||||
|
||||
if err == nil {
|
||||
|
||||
+32
-4
@@ -24,6 +24,11 @@ type ToolLoopConfig struct {
|
||||
Tools *ToolRegistry
|
||||
MaxIterations int
|
||||
LLMOptions map[string]any
|
||||
|
||||
// MediaResolver resolves media:// refs in messages before each LLM call.
|
||||
// This is optional and is mainly used by subagent legacy fallback execution
|
||||
// so subagents can reuse the same multimodal media handling as the main loop.
|
||||
MediaResolver func(messages []providers.Message) []providers.Message
|
||||
}
|
||||
|
||||
// ToolLoopResult contains the result of running the tool loop.
|
||||
@@ -63,8 +68,27 @@ func RunToolLoop(
|
||||
if llmOpts == nil {
|
||||
llmOpts = map[string]any{}
|
||||
}
|
||||
// 3. Call LLM
|
||||
response, err := config.Provider.Chat(ctx, messages, providerToolDefs, config.Model, llmOpts)
|
||||
|
||||
// 3. Resolve media:// refs and Call LLM.
|
||||
// Tools like load_image produce media:// refs in their result messages.
|
||||
// Without this step, the LLM would receive raw "media://uuid" strings
|
||||
// instead of base64-encoded image data URLs.
|
||||
//
|
||||
// We build a separate callMessages slice so that:
|
||||
// (a) the resolver output is used for the LLM call only,
|
||||
// (b) the original `messages` slice keeps the unresolved refs for
|
||||
// subsequent iterations — the resolver is idempotent but working
|
||||
// on the original avoids double-encoding issues.
|
||||
//
|
||||
// On iteration 1 the initial user messages typically have no media://
|
||||
// refs (they come from plain text), so this is effectively a no-op;
|
||||
// it becomes relevant from iteration 2 onward when tool results may
|
||||
// contain media refs.
|
||||
callMessages := messages
|
||||
if config.MediaResolver != nil && iteration > 1 {
|
||||
callMessages = config.MediaResolver(messages)
|
||||
}
|
||||
response, err := config.Provider.Chat(ctx, callMessages, providerToolDefs, config.Model, llmOpts)
|
||||
if err != nil {
|
||||
logger.ErrorCF("toolloop", "LLM call failed",
|
||||
map[string]any{
|
||||
@@ -161,11 +185,15 @@ func RunToolLoop(
|
||||
for _, r := range results {
|
||||
contentForLLM := r.result.ContentForLLM()
|
||||
|
||||
messages = append(messages, providers.Message{
|
||||
toolMsg := providers.Message{
|
||||
Role: "tool",
|
||||
Content: contentForLLM,
|
||||
ToolCallID: r.tc.ID,
|
||||
})
|
||||
}
|
||||
if len(r.result.Media) > 0 && !r.result.ResponseHandled {
|
||||
toolMsg.Media = append(toolMsg.Media, r.result.Media...)
|
||||
}
|
||||
messages = append(messages, toolMsg)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user