Files
picoclaw/pkg/tools/toolloop.go
T
reusu 31afad6e87 feat: add load_image tool for local file vision (#2116)
* feat: add load_image tool for local file vision

* fix: address load_image PR review feedback

- Exclude load_image from sub-agent tools via Unregister after Clone,
  since RunToolLoop does not call resolveMediaRefs
- Add ToolRegistry.Unregister() method
- Fix scope collision: use channel:chatID instead of filename
- Add channel/chatID context resolution matching send_file pattern
- Add comment explaining iteration > 1 guard on resolveMediaRefs
- Remove emoji from ForUser for consistency with send_file
- Add load_image_test.go

* feat: enable load_image for subagents via MediaResolver in RunToolLoop

Instead of removing load_image from sub-agent tools (28f69e71), inject a
MediaResolver into the legacy RunToolLoop fallback path so media:// refs
are resolved to base64 before each LLM call — matching the main agent
loop behavior.

- Add MediaResolver field to ToolLoopConfig and call it on iteration > 1
- Add SubagentManager.SetMediaResolver() and wire it through runTask
- Remove ToolRegistry.Unregister() (no longer needed)
- Restore load_image in sub-agent tool set (revert Clone+Unregister)
- Add TestSubagentManager_SetMediaResolver_StoresResolver

* refactor(load_image): remove prompt parameter from tool schema

* test(tools): add success-path test for LoadImageTool

Add TestLoadImage_SuccessPath that creates a real PNG file with valid
magic bytes, calls Execute with WithToolContext, and verifies:
- result.IsError == false
- ToolResult.Media contains a media:// ref
- ToolResult.ForLLM contains the [image: marker
- media ref is resolvable in the store

Add explanatory comment in loop.go for why Media and ArtifactTags
coexist on non-ResponseHandled tool results (e.g. load_image).

* fix: preallocate slice in tests and add ResponseHandled guard in toolloop

Fix prealloc linter failure in load_image_test.go.

Prevent double-resolving media by checking ResponseHandled in toolloop.go.

* Register TTS tool if provider is available

---------

Co-authored-by: Reusu <admin@yumao.name>
Co-authored-by: 美電球 <hoshina@evaz.org>
2026-04-01 21:32:10 +08:00

205 lines
5.8 KiB
Go

// PicoClaw - Ultra-lightweight personal AI agent
// Inspired by and based on nanobot: https://github.com/HKUDS/nanobot
// License: MIT
//
// Copyright (c) 2026 PicoClaw contributors
package tools
import (
"context"
"encoding/json"
"fmt"
"sync"
"github.com/sipeed/picoclaw/pkg/logger"
"github.com/sipeed/picoclaw/pkg/providers"
"github.com/sipeed/picoclaw/pkg/utils"
)
// ToolLoopConfig bundles the dependencies and limits that RunToolLoop needs
// to drive one LLM + tool-call session.
type ToolLoopConfig struct {
	// Provider is the LLM backend; RunToolLoop calls its Chat method once
	// per iteration.
	Provider providers.LLMProvider

	// Model is handed to Provider.Chat unchanged.
	Model string

	// Tools supplies the tool definitions offered to the LLM and executes the
	// tool calls it requests. When nil, no definitions are offered and every
	// requested tool call yields a "No tools available" error result.
	Tools *ToolRegistry

	// MaxIterations is the upper bound on the number of LLM calls made by a
	// single RunToolLoop invocation.
	MaxIterations int

	// LLMOptions is forwarded to Provider.Chat; a nil map is replaced by an
	// empty one before the call.
	LLMOptions map[string]any

	// MediaResolver resolves media:// refs in messages ahead of an LLM call.
	// It is optional and is mainly used by the subagent legacy fallback
	// execution so subagents can reuse the same multimodal media handling as
	// the main loop. RunToolLoop invokes it from the second iteration onward
	// and uses its return value for that LLM call only.
	MediaResolver func(messages []providers.Message) []providers.Message
}
// ToolLoopResult is what RunToolLoop hands back once the loop has stopped.
type ToolLoopResult struct {
	// Content is the text of the LLM response that carried no tool calls.
	// It stays empty when the loop ended without such a response.
	Content string

	// Iterations is the number of LLM iterations that were started.
	Iterations int
}
// RunToolLoop executes the LLM + tool call iteration loop.
// This is the core agent logic that can be reused by both main agent and subagents.
//
// Each iteration sends the conversation to config.Provider. If the response
// carries no tool calls, its content becomes the final result and the loop
// stops. Otherwise the requested tools are executed concurrently, their
// results are appended to the conversation in the order the calls were
// requested, and the loop continues.
//
// Parameters:
//   - ctx is passed to the provider and to every tool execution.
//   - config supplies the provider, model, tool registry, iteration cap,
//     LLM options and optional media resolver.
//   - messages is the initial conversation; assistant and tool messages are
//     appended to a local view of it as the loop progresses.
//   - channel and chatID are forwarded to each tool execution.
//
// Returns the final content together with the number of iterations started.
// If config.MaxIterations is reached (or is <= 0) before the LLM answers
// without tool calls, the result has empty Content and a nil error. A failed
// LLM call aborts the loop and is returned wrapped.
func RunToolLoop(
	ctx context.Context,
	config ToolLoopConfig,
	messages []providers.Message,
	channel, chatID string,
) (*ToolLoopResult, error) {
	iteration := 0
	var finalContent string

	for iteration < config.MaxIterations {
		iteration++

		logger.DebugCF("toolloop", "LLM iteration",
			map[string]any{
				"iteration": iteration,
				"max":       config.MaxIterations,
			})

		// 1. Build tool definitions
		var providerToolDefs []providers.ToolDefinition
		if config.Tools != nil {
			providerToolDefs = config.Tools.ToProviderDefs()
		}

		// 2. Set default LLM options
		llmOpts := config.LLMOptions
		if llmOpts == nil {
			llmOpts = map[string]any{}
		}

		// 3. Resolve media:// refs and call the LLM.
		// Tools like load_image produce media:// refs in their result messages.
		// Without this step, the LLM would receive raw "media://uuid" strings
		// instead of base64-encoded image data URLs.
		//
		// We build a separate callMessages slice so that:
		//   (a) the resolver output is used for the LLM call only,
		//   (b) the original `messages` slice keeps the unresolved refs for
		//       subsequent iterations — the resolver is idempotent but working
		//       on the original avoids double-encoding issues.
		//
		// On iteration 1 the initial user messages typically have no media://
		// refs (they come from plain text), so the resolver is skipped; it
		// becomes relevant from iteration 2 onward when tool results may
		// contain media refs.
		callMessages := messages
		if config.MediaResolver != nil && iteration > 1 {
			callMessages = config.MediaResolver(messages)
		}

		response, err := config.Provider.Chat(ctx, callMessages, providerToolDefs, config.Model, llmOpts)
		if err != nil {
			logger.ErrorCF("toolloop", "LLM call failed",
				map[string]any{
					"iteration": iteration,
					"error":     err.Error(),
				})
			return nil, fmt.Errorf("LLM call failed: %w", err)
		}

		// 4. If no tool calls, we're done
		if len(response.ToolCalls) == 0 {
			finalContent = response.Content
			logger.InfoCF("toolloop", "LLM response without tool calls (direct answer)",
				map[string]any{
					"iteration":     iteration,
					"content_chars": len(finalContent),
				})
			break
		}

		normalizedToolCalls := make([]providers.ToolCall, 0, len(response.ToolCalls))
		for _, tc := range response.ToolCalls {
			normalizedToolCalls = append(normalizedToolCalls, providers.NormalizeToolCall(tc))
		}

		// Encode each call's arguments exactly once; the string is reused for
		// the assistant message and for the log preview. An encoding failure
		// is logged and replaced by an empty JSON object so the assistant
		// message never carries an empty (invalid JSON) arguments string.
		encodedArgs := make([]string, len(normalizedToolCalls))
		for i, tc := range normalizedToolCalls {
			raw, marshalErr := json.Marshal(tc.Arguments)
			if marshalErr != nil {
				logger.ErrorCF("toolloop", "Failed to encode tool call arguments",
					map[string]any{
						"tool":      tc.Name,
						"iteration": iteration,
						"error":     marshalErr.Error(),
					})
				raw = []byte("{}")
			}
			encodedArgs[i] = string(raw)
		}

		// 5. Log tool calls
		toolNames := make([]string, 0, len(normalizedToolCalls))
		for _, tc := range normalizedToolCalls {
			toolNames = append(toolNames, tc.Name)
		}
		logger.InfoCF("toolloop", "LLM requested tool calls",
			map[string]any{
				"tools":     toolNames,
				"count":     len(normalizedToolCalls),
				"iteration": iteration,
			})

		// 6. Build assistant message with tool calls
		assistantMsg := providers.Message{
			Role:    "assistant",
			Content: response.Content,
		}
		for i, tc := range normalizedToolCalls {
			assistantMsg.ToolCalls = append(assistantMsg.ToolCalls, providers.ToolCall{
				ID:        tc.ID,
				Type:      "function",
				Name:      tc.Name,
				Arguments: tc.Arguments,
				Function: &providers.FunctionCall{
					Name:      tc.Name,
					Arguments: encodedArgs[i],
				},
			})
		}
		messages = append(messages, assistantMsg)

		// 7. Execute tool calls in parallel
		type indexedResult struct {
			result *ToolResult
			tc     providers.ToolCall
		}

		results := make([]indexedResult, len(normalizedToolCalls))
		var wg sync.WaitGroup

		for i, tc := range normalizedToolCalls {
			results[i].tc = tc

			wg.Add(1)
			// Each goroutine writes only to its own results[idx] slot, so no
			// additional locking is needed; wg.Wait() below orders the writes
			// before the reads.
			go func(idx int, tc providers.ToolCall, argsJSON string) {
				defer wg.Done()

				argsPreview := utils.Truncate(argsJSON, 200)
				logger.InfoCF("toolloop", fmt.Sprintf("Tool call: %s(%s)", tc.Name, argsPreview),
					map[string]any{
						"tool":      tc.Name,
						"iteration": iteration,
					})

				var toolResult *ToolResult
				if config.Tools != nil {
					toolResult = config.Tools.ExecuteWithContext(ctx, tc.Name, tc.Arguments, channel, chatID, nil)
				} else {
					toolResult = ErrorResult("No tools available")
				}
				// Guard against a nil result so the append step below cannot
				// dereference a nil pointer and crash the whole loop.
				if toolResult == nil {
					toolResult = ErrorResult(fmt.Sprintf("tool %q returned no result", tc.Name))
				}
				results[idx].result = toolResult
			}(i, tc, encodedArgs[i])
		}
		wg.Wait()

		// Append results in original order
		for _, r := range results {
			toolMsg := providers.Message{
				Role:       "tool",
				Content:    r.result.ContentForLLM(),
				ToolCallID: r.tc.ID,
			}
			// Media is attached only when the tool result is not marked
			// ResponseHandled (per the originating change, this guard prevents
			// media from being resolved twice).
			if len(r.result.Media) > 0 && !r.result.ResponseHandled {
				toolMsg.Media = append(toolMsg.Media, r.result.Media...)
			}
			messages = append(messages, toolMsg)
		}
	}

	return &ToolLoopResult{
		Content:    finalContent,
		Iterations: iteration,
	}, nil
}