mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
31afad6e87
* feat: add load_image tool for local file vision
* fix: address load_image PR review feedback
- Exclude load_image from sub-agent tools via Unregister after Clone,
since RunToolLoop does not call resolveMediaRefs
- Add ToolRegistry.Unregister() method
- Fix scope collision: use channel:chatID instead of filename
- Add channel/chatID context resolution matching send_file pattern
- Add comment explaining iteration > 1 guard on resolveMediaRefs
- Remove emoji from ForUser for consistency with send_file
- Add load_image_test.go
* feat: enable load_image for subagents via MediaResolver in RunToolLoop
Instead of removing load_image from sub-agent tools (28f69e71), inject a
MediaResolver into the legacy RunToolLoop fallback path so media:// refs
are resolved to base64 before each LLM call — matching the main agent
loop behavior.
- Add MediaResolver field to ToolLoopConfig and call it on iteration > 1
- Add SubagentManager.SetMediaResolver() and wire it through runTask
- Remove ToolRegistry.Unregister() (no longer needed)
- Restore load_image in sub-agent tool set (revert Clone+Unregister)
- Add TestSubagentManager_SetMediaResolver_StoresResolver
* refactor(load_image): remove prompt parameter from tool schema
* test(tools): add success-path test for LoadImageTool
Add TestLoadImage_SuccessPath that creates a real PNG file with valid
magic bytes, calls Execute with WithToolContext, and verifies:
- result.IsError == false
- ToolResult.Media contains a media:// ref
- ToolResult.ForLLM contains the [image: marker
- media ref is resolvable in the store
Add explanatory comment in loop.go for why Media and ArtifactTags
coexist on non-ResponseHandled tool results (e.g. load_image).
* fix: preallocate slice in tests and add ResponseHandled guard in toolloop
Fix prealloc linter failure in load_image_test.go.
Prevent double-resolving media by checking ResponseHandled in toolloop.go.
* Register TTS tool if provider is available
---------
Co-authored-by: Reusu <admin@yumao.name>
Co-authored-by: 美電球 <hoshina@evaz.org>
205 lines
5.8 KiB
Go
205 lines
5.8 KiB
Go
// PicoClaw - Ultra-lightweight personal AI agent
|
|
// Inspired by and based on nanobot: https://github.com/HKUDS/nanobot
|
|
// License: MIT
|
|
//
|
|
// Copyright (c) 2026 PicoClaw contributors
|
|
|
|
package tools
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"sync"
|
|
|
|
"github.com/sipeed/picoclaw/pkg/logger"
|
|
"github.com/sipeed/picoclaw/pkg/providers"
|
|
"github.com/sipeed/picoclaw/pkg/utils"
|
|
)
|
|
|
|
// ToolLoopConfig configures the tool execution loop.
|
|
type ToolLoopConfig struct {
|
|
Provider providers.LLMProvider
|
|
Model string
|
|
Tools *ToolRegistry
|
|
MaxIterations int
|
|
LLMOptions map[string]any
|
|
|
|
// MediaResolver resolves media:// refs in messages before each LLM call.
|
|
// This is optional and is mainly used by subagent legacy fallback execution
|
|
// so subagents can reuse the same multimodal media handling as the main loop.
|
|
MediaResolver func(messages []providers.Message) []providers.Message
|
|
}
|
|
|
|
// ToolLoopResult reports the outcome of a RunToolLoop run.
type ToolLoopResult struct {
	// Content is the text of the LLM response that ended the loop, i.e. the
	// first response that requested no tool calls. It stays empty when the
	// loop stops without receiving such a response.
	Content string
	// Iterations is the number of loop iterations (LLM calls) performed.
	Iterations int
}
|
|
|
|
// RunToolLoop executes the LLM + tool call iteration loop.
|
|
// This is the core agent logic that can be reused by both main agent and subagents.
|
|
func RunToolLoop(
|
|
ctx context.Context,
|
|
config ToolLoopConfig,
|
|
messages []providers.Message,
|
|
channel, chatID string,
|
|
) (*ToolLoopResult, error) {
|
|
iteration := 0
|
|
var finalContent string
|
|
|
|
for iteration < config.MaxIterations {
|
|
iteration++
|
|
|
|
logger.DebugCF("toolloop", "LLM iteration",
|
|
map[string]any{
|
|
"iteration": iteration,
|
|
"max": config.MaxIterations,
|
|
})
|
|
|
|
// 1. Build tool definitions
|
|
var providerToolDefs []providers.ToolDefinition
|
|
if config.Tools != nil {
|
|
providerToolDefs = config.Tools.ToProviderDefs()
|
|
}
|
|
|
|
// 2. Set default LLM options
|
|
llmOpts := config.LLMOptions
|
|
if llmOpts == nil {
|
|
llmOpts = map[string]any{}
|
|
}
|
|
|
|
// 3. Resolve media:// refs and Call LLM.
|
|
// Tools like load_image produce media:// refs in their result messages.
|
|
// Without this step, the LLM would receive raw "media://uuid" strings
|
|
// instead of base64-encoded image data URLs.
|
|
//
|
|
// We build a separate callMessages slice so that:
|
|
// (a) the resolver output is used for the LLM call only,
|
|
// (b) the original `messages` slice keeps the unresolved refs for
|
|
// subsequent iterations — the resolver is idempotent but working
|
|
// on the original avoids double-encoding issues.
|
|
//
|
|
// On iteration 1 the initial user messages typically have no media://
|
|
// refs (they come from plain text), so this is effectively a no-op;
|
|
// it becomes relevant from iteration 2 onward when tool results may
|
|
// contain media refs.
|
|
callMessages := messages
|
|
if config.MediaResolver != nil && iteration > 1 {
|
|
callMessages = config.MediaResolver(messages)
|
|
}
|
|
response, err := config.Provider.Chat(ctx, callMessages, providerToolDefs, config.Model, llmOpts)
|
|
if err != nil {
|
|
logger.ErrorCF("toolloop", "LLM call failed",
|
|
map[string]any{
|
|
"iteration": iteration,
|
|
"error": err.Error(),
|
|
})
|
|
return nil, fmt.Errorf("LLM call failed: %w", err)
|
|
}
|
|
|
|
// 4. If no tool calls, we're done
|
|
if len(response.ToolCalls) == 0 {
|
|
finalContent = response.Content
|
|
logger.InfoCF("toolloop", "LLM response without tool calls (direct answer)",
|
|
map[string]any{
|
|
"iteration": iteration,
|
|
"content_chars": len(finalContent),
|
|
})
|
|
break
|
|
}
|
|
|
|
normalizedToolCalls := make([]providers.ToolCall, 0, len(response.ToolCalls))
|
|
for _, tc := range response.ToolCalls {
|
|
normalizedToolCalls = append(normalizedToolCalls, providers.NormalizeToolCall(tc))
|
|
}
|
|
|
|
// 5. Log tool calls
|
|
toolNames := make([]string, 0, len(normalizedToolCalls))
|
|
for _, tc := range normalizedToolCalls {
|
|
toolNames = append(toolNames, tc.Name)
|
|
}
|
|
logger.InfoCF("toolloop", "LLM requested tool calls",
|
|
map[string]any{
|
|
"tools": toolNames,
|
|
"count": len(normalizedToolCalls),
|
|
"iteration": iteration,
|
|
})
|
|
|
|
// 6. Build assistant message with tool calls
|
|
assistantMsg := providers.Message{
|
|
Role: "assistant",
|
|
Content: response.Content,
|
|
}
|
|
for _, tc := range normalizedToolCalls {
|
|
argumentsJSON, _ := json.Marshal(tc.Arguments)
|
|
assistantMsg.ToolCalls = append(assistantMsg.ToolCalls, providers.ToolCall{
|
|
ID: tc.ID,
|
|
Type: "function",
|
|
Name: tc.Name,
|
|
Arguments: tc.Arguments,
|
|
Function: &providers.FunctionCall{
|
|
Name: tc.Name,
|
|
Arguments: string(argumentsJSON),
|
|
},
|
|
})
|
|
}
|
|
messages = append(messages, assistantMsg)
|
|
|
|
// 7. Execute tool calls in parallel
|
|
type indexedResult struct {
|
|
result *ToolResult
|
|
tc providers.ToolCall
|
|
}
|
|
|
|
results := make([]indexedResult, len(normalizedToolCalls))
|
|
var wg sync.WaitGroup
|
|
|
|
for i, tc := range normalizedToolCalls {
|
|
results[i].tc = tc
|
|
|
|
wg.Add(1)
|
|
go func(idx int, tc providers.ToolCall) {
|
|
defer wg.Done()
|
|
|
|
argsJSON, _ := json.Marshal(tc.Arguments)
|
|
argsPreview := utils.Truncate(string(argsJSON), 200)
|
|
logger.InfoCF("toolloop", fmt.Sprintf("Tool call: %s(%s)", tc.Name, argsPreview),
|
|
map[string]any{
|
|
"tool": tc.Name,
|
|
"iteration": iteration,
|
|
})
|
|
|
|
var toolResult *ToolResult
|
|
if config.Tools != nil {
|
|
toolResult = config.Tools.ExecuteWithContext(ctx, tc.Name, tc.Arguments, channel, chatID, nil)
|
|
} else {
|
|
toolResult = ErrorResult("No tools available")
|
|
}
|
|
results[idx].result = toolResult
|
|
}(i, tc)
|
|
}
|
|
wg.Wait()
|
|
|
|
// Append results in original order
|
|
for _, r := range results {
|
|
contentForLLM := r.result.ContentForLLM()
|
|
|
|
toolMsg := providers.Message{
|
|
Role: "tool",
|
|
Content: contentForLLM,
|
|
ToolCallID: r.tc.ID,
|
|
}
|
|
if len(r.result.Media) > 0 && !r.result.ResponseHandled {
|
|
toolMsg.Media = append(toolMsg.Media, r.result.Media...)
|
|
}
|
|
messages = append(messages, toolMsg)
|
|
}
|
|
}
|
|
|
|
return &ToolLoopResult{
|
|
Content: finalContent,
|
|
Iterations: iteration,
|
|
}, nil
|
|
}
|