mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
feat: add load_image tool for local file vision (#2116)
* feat: add load_image tool for local file vision
* fix: address load_image PR review feedback
- Exclude load_image from sub-agent tools via Unregister after Clone,
since RunToolLoop does not call resolveMediaRefs
- Add ToolRegistry.Unregister() method
- Fix scope collision: use channel:chatID instead of filename
- Add channel/chatID context resolution matching send_file pattern
- Add comment explaining iteration > 1 guard on resolveMediaRefs
- Remove emoji from ForUser for consistency with send_file
- Add load_image_test.go
* feat: enable load_image for subagents via MediaResolver in RunToolLoop
Instead of removing load_image from sub-agent tools (28f69e71), inject a
MediaResolver into the legacy RunToolLoop fallback path so media:// refs
are resolved to base64 before each LLM call — matching the main agent
loop behavior.
- Add MediaResolver field to ToolLoopConfig and call it on iteration > 1
- Add SubagentManager.SetMediaResolver() and wire it through runTask
- Remove ToolRegistry.Unregister() (no longer needed)
- Restore load_image in sub-agent tool set (revert Clone+Unregister)
- Add TestSubagentManager_SetMediaResolver_StoresResolver
* refactor(load_image): remove prompt parameter from tool schema
* test(tools): add success-path test for LoadImageTool
Add TestLoadImage_SuccessPath that creates a real PNG file with valid
magic bytes, calls Execute with WithToolContext, and verifies:
- result.IsError == false
- ToolResult.Media contains a media:// ref
- ToolResult.ForLLM contains the `[image:` marker
- media ref is resolvable in the store
Add explanatory comment in loop.go for why Media and ArtifactTags
coexist on non-ResponseHandled tool results (e.g. load_image).
* fix: preallocate slice in tests and add ResponseHandled guard in toolloop
Fix prealloc linter failure in load_image_test.go.
Prevent double-resolving media by checking ResponseHandled in toolloop.go.
* Register TTS tool if provider is available
---------
Co-authored-by: Reusu <admin@yumao.name>
Co-authored-by: 美電球 <hoshina@evaz.org>
This commit is contained in:
@@ -281,6 +281,17 @@ func registerSharedTools(
|
||||
agent.Tools.Register(tools.NewSendTTSTool(ttsProvider, nil))
|
||||
}
|
||||
|
||||
if cfg.Tools.IsToolEnabled("load_image") {
|
||||
loadImageTool := tools.NewLoadImageTool(
|
||||
agent.Workspace,
|
||||
cfg.Agents.Defaults.RestrictToWorkspace,
|
||||
cfg.Agents.Defaults.GetMaxMediaSize(),
|
||||
nil,
|
||||
allowReadPaths,
|
||||
)
|
||||
agent.Tools.Register(loadImageTool)
|
||||
}
|
||||
|
||||
// Skill discovery and installation tools
|
||||
skills_enabled := cfg.Tools.IsToolEnabled("skills")
|
||||
find_skills_enable := cfg.Tools.IsToolEnabled("find_skills")
|
||||
@@ -323,6 +334,14 @@ func registerSharedTools(
|
||||
subagentManager := tools.NewSubagentManager(provider, agent.Model, agent.Workspace)
|
||||
subagentManager.SetLLMOptions(agent.MaxTokens, agent.Temperature)
|
||||
|
||||
// Inject a media resolver so the legacy RunToolLoop fallback path can
|
||||
// resolve media:// refs in the same way the main AgentLoop does.
|
||||
// This keeps subagent vision support working even when the optimized
|
||||
// sub-turn spawner path is unavailable.
|
||||
subagentManager.SetMediaResolver(func(msgs []providers.Message) []providers.Message {
|
||||
return resolveMediaRefs(msgs, al.mediaStore, cfg.Agents.Defaults.GetMaxMediaSize())
|
||||
})
|
||||
|
||||
// Set the spawner that links into AgentLoop's turnState
|
||||
subagentManager.SetSpawner(func(
|
||||
ctx context.Context,
|
||||
@@ -1861,6 +1880,14 @@ turnLoop:
|
||||
providerToolDefs = filtered
|
||||
}
|
||||
|
||||
// Resolve media:// refs produced by tool results (e.g. load_image).
|
||||
// Skipped on iteration 1 because inbound user media is already resolved
|
||||
// before entering the loop; only subsequent iterations can contain new
|
||||
// tool-generated media refs that need base64 encoding.
|
||||
if iteration > 1 {
|
||||
messages = resolveMediaRefs(messages, al.mediaStore, maxMediaSize)
|
||||
}
|
||||
|
||||
callMessages := messages
|
||||
if gracefulTerminal {
|
||||
callMessages = append(append([]providers.Message(nil), messages...), ts.interruptHintMessage())
|
||||
@@ -2551,6 +2578,13 @@ turnLoop:
|
||||
}
|
||||
|
||||
if len(toolResult.Media) > 0 && !toolResult.ResponseHandled {
|
||||
// For tools like load_image that produce media refs without sending them
|
||||
// to the user channel (ResponseHandled == false), both Media and ArtifactTags
|
||||
// coexist on the result:
|
||||
// - Media: carries media:// refs that resolveMediaRefs will base64-encode
|
||||
// into image_url parts in the next LLM iteration (enabling vision).
|
||||
// - ArtifactTags: exposes the local file path as a structured [file:…] tag
|
||||
// in the tool result text, so the LLM knows an artifact was produced.
|
||||
toolResult.ArtifactTags = buildArtifactTags(al.mediaStore, toolResult.Media)
|
||||
}
|
||||
|
||||
@@ -2570,6 +2604,9 @@ turnLoop:
|
||||
Content: contentForLLM,
|
||||
ToolCallID: toolCallID,
|
||||
}
|
||||
if len(toolResult.Media) > 0 && !toolResult.ResponseHandled {
|
||||
toolResultMsg.Media = append(toolResultMsg.Media, toolResult.Media...)
|
||||
}
|
||||
al.emitEvent(
|
||||
EventKindToolExecEnd,
|
||||
ts.eventMeta("runTurn", "turn.tool.end"),
|
||||
|
||||
Reference in New Issue
Block a user