mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
fix(tool): route binary outputs through the media pipeline.
This commit is contained in:
+111
-34
@@ -492,13 +492,13 @@ func (al *AgentLoop) GetConfig() *config.Config {
|
||||
func (al *AgentLoop) SetMediaStore(s media.MediaStore) {
|
||||
al.mediaStore = s
|
||||
|
||||
// Propagate store to send_file tools in all agents.
|
||||
// Propagate store to all registered tools that can emit media.
|
||||
registry := al.GetRegistry()
|
||||
registry.ForEachTool("send_file", func(t tools.Tool) {
|
||||
if sf, ok := t.(*tools.SendFileTool); ok {
|
||||
sf.SetMediaStore(s)
|
||||
for _, agentID := range registry.ListAgentIDs() {
|
||||
if agent, ok := registry.GetAgent(agentID); ok {
|
||||
agent.Tools.SetMediaStore(s)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// SetTranscriber injects a voice transcriber for agent-level audio transcription.
|
||||
@@ -926,13 +926,26 @@ func (al *AgentLoop) runAgentLoop(
|
||||
agent.Sessions.AddMessage(opts.SessionKey, "user", opts.UserMessage)
|
||||
|
||||
// 3. Run LLM iteration loop
|
||||
finalContent, iteration, err := al.runLLMIteration(ctx, agent, messages, opts)
|
||||
finalContent, iteration, responseHandled, err := al.runLLMIteration(ctx, agent, messages, opts)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// If last tool had ForUser content and we already sent it, we might not need to send final response
|
||||
// This is controlled by the tool's Silent flag and ForUser content
|
||||
if responseHandled {
|
||||
agent.Sessions.Save(opts.SessionKey)
|
||||
|
||||
if opts.EnableSummary {
|
||||
al.maybeSummarize(agent, opts.SessionKey, opts.Channel, opts.ChatID)
|
||||
}
|
||||
|
||||
logger.InfoCF("agent", "Response already handled by tool output",
|
||||
map[string]any{
|
||||
"agent_id": agent.ID,
|
||||
"session_key": opts.SessionKey,
|
||||
"iterations": iteration,
|
||||
})
|
||||
return "", nil
|
||||
}
|
||||
|
||||
// 4. Handle empty response
|
||||
if finalContent == "" {
|
||||
@@ -1030,14 +1043,57 @@ func (al *AgentLoop) handleReasoning(
|
||||
}
|
||||
}
|
||||
|
||||
const handledToolResponseSummary = "Requested output delivered via tool attachment."
|
||||
|
||||
func (al *AgentLoop) buildOutboundMediaMessage(
|
||||
channel string,
|
||||
chatID string,
|
||||
refs []string,
|
||||
) bus.OutboundMediaMessage {
|
||||
parts := make([]bus.MediaPart, 0, len(refs))
|
||||
for _, ref := range refs {
|
||||
part := bus.MediaPart{Ref: ref}
|
||||
if al.mediaStore != nil {
|
||||
if _, meta, err := al.mediaStore.ResolveWithMeta(ref); err == nil {
|
||||
part.Filename = meta.Filename
|
||||
part.ContentType = meta.ContentType
|
||||
part.Type = inferMediaType(meta.Filename, meta.ContentType)
|
||||
}
|
||||
}
|
||||
parts = append(parts, part)
|
||||
}
|
||||
return bus.OutboundMediaMessage{
|
||||
Channel: channel,
|
||||
ChatID: chatID,
|
||||
Parts: parts,
|
||||
}
|
||||
}
|
||||
|
||||
func (al *AgentLoop) buildArtifactTags(refs []string) []string {
|
||||
if al.mediaStore == nil || len(refs) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
tags := make([]string, 0, len(refs))
|
||||
for _, ref := range refs {
|
||||
localPath, meta, err := al.mediaStore.ResolveWithMeta(ref)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
mime := detectMIME(localPath, meta)
|
||||
tags = append(tags, buildPathTag(mime, localPath))
|
||||
}
|
||||
return tags
|
||||
}
|
||||
|
||||
// runLLMIteration executes the LLM call loop with tool handling.
|
||||
// Returns (finalContent, iteration, error).
|
||||
// Returns (finalContent, iteration, responseHandled, error).
|
||||
func (al *AgentLoop) runLLMIteration(
|
||||
ctx context.Context,
|
||||
agent *AgentInstance,
|
||||
messages []providers.Message,
|
||||
opts processOptions,
|
||||
) (string, int, error) {
|
||||
) (string, int, bool, error) {
|
||||
iteration := 0
|
||||
var finalContent string
|
||||
|
||||
@@ -1240,7 +1296,7 @@ func (al *AgentLoop) runLLMIteration(
|
||||
"model": activeModel,
|
||||
"error": err.Error(),
|
||||
})
|
||||
return "", iteration, fmt.Errorf("LLM call failed after retries: %w", err)
|
||||
return "", iteration, false, fmt.Errorf("LLM call failed after retries: %w", err)
|
||||
}
|
||||
|
||||
go al.handleReasoning(
|
||||
@@ -1401,10 +1457,7 @@ func (al *AgentLoop) runLLMIteration(
|
||||
}
|
||||
|
||||
// Determine content for the agent loop (ForLLM or error).
|
||||
content := result.ForLLM
|
||||
if content == "" && result.Err != nil {
|
||||
content = result.Err.Error()
|
||||
}
|
||||
content := result.ContentForLLM()
|
||||
if content == "" {
|
||||
return
|
||||
}
|
||||
@@ -1439,8 +1492,14 @@ func (al *AgentLoop) runLLMIteration(
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
allResponsesHandled := len(agentResults) > 0
|
||||
|
||||
// Process results in original order (send to user, save to session)
|
||||
for _, r := range agentResults {
|
||||
if !r.result.ResponseHandled {
|
||||
allResponsesHandled = false
|
||||
}
|
||||
|
||||
// Send ForUser content to user immediately if not Silent
|
||||
if !r.result.Silent && r.result.ForUser != "" && opts.SendResponse {
|
||||
al.bus.PublishOutbound(ctx, bus.OutboundMessage{
|
||||
@@ -1455,32 +1514,33 @@ func (al *AgentLoop) runLLMIteration(
|
||||
})
|
||||
}
|
||||
|
||||
// If tool returned media refs, publish them as outbound media
|
||||
// If tool returned media refs, publish them as outbound media only when the
|
||||
// tool explicitly marked the user-visible delivery as already handled.
|
||||
if len(r.result.Media) > 0 {
|
||||
parts := make([]bus.MediaPart, 0, len(r.result.Media))
|
||||
for _, ref := range r.result.Media {
|
||||
part := bus.MediaPart{Ref: ref}
|
||||
if al.mediaStore != nil {
|
||||
if _, meta, err := al.mediaStore.ResolveWithMeta(ref); err == nil {
|
||||
part.Filename = meta.Filename
|
||||
part.ContentType = meta.ContentType
|
||||
part.Type = inferMediaType(meta.Filename, meta.ContentType)
|
||||
outboundMedia := al.buildOutboundMediaMessage(opts.Channel, opts.ChatID, r.result.Media)
|
||||
if r.result.ResponseHandled {
|
||||
if al.channelManager != nil {
|
||||
if err := al.channelManager.SendMedia(ctx, outboundMedia); err != nil {
|
||||
allResponsesHandled = false
|
||||
logger.WarnCF("agent", "Synchronous media send failed, falling back to bus delivery",
|
||||
map[string]any{
|
||||
"agent_id": agent.ID,
|
||||
"tool": r.tc.Name,
|
||||
"error": err.Error(),
|
||||
})
|
||||
al.bus.PublishOutboundMedia(ctx, outboundMedia)
|
||||
}
|
||||
} else {
|
||||
al.bus.PublishOutboundMedia(ctx, outboundMedia)
|
||||
}
|
||||
parts = append(parts, part)
|
||||
}
|
||||
al.bus.PublishOutboundMedia(ctx, bus.OutboundMediaMessage{
|
||||
Channel: opts.Channel,
|
||||
ChatID: opts.ChatID,
|
||||
Parts: parts,
|
||||
})
|
||||
}
|
||||
|
||||
// Determine content for LLM based on tool result
|
||||
contentForLLM := r.result.ForLLM
|
||||
if contentForLLM == "" && r.result.Err != nil {
|
||||
contentForLLM = r.result.Err.Error()
|
||||
if len(r.result.Media) > 0 && !r.result.ResponseHandled {
|
||||
r.result.ArtifactTags = al.buildArtifactTags(r.result.Media)
|
||||
}
|
||||
contentForLLM := r.result.ContentForLLM()
|
||||
|
||||
toolResultMsg := providers.Message{
|
||||
Role: "tool",
|
||||
@@ -1493,6 +1553,23 @@ func (al *AgentLoop) runLLMIteration(
|
||||
agent.Sessions.AddFullMessage(opts.SessionKey, toolResultMsg)
|
||||
}
|
||||
|
||||
if allResponsesHandled {
|
||||
summaryMsg := providers.Message{
|
||||
Role: "assistant",
|
||||
Content: handledToolResponseSummary,
|
||||
}
|
||||
messages = append(messages, summaryMsg)
|
||||
agent.Sessions.AddFullMessage(opts.SessionKey, summaryMsg)
|
||||
|
||||
logger.InfoCF("agent", "Tool output satisfied delivery; ending turn without follow-up LLM",
|
||||
map[string]any{
|
||||
"agent_id": agent.ID,
|
||||
"iteration": iteration,
|
||||
"tool_count": len(agentResults),
|
||||
})
|
||||
return "", iteration, true, nil
|
||||
}
|
||||
|
||||
// Tick down TTL of discovered tools after processing tool results.
|
||||
// Only reached when tool calls were made (the loop continues);
|
||||
// the break on no-tool-call responses skips this.
|
||||
@@ -1505,7 +1582,7 @@ func (al *AgentLoop) runLLMIteration(
|
||||
})
|
||||
}
|
||||
|
||||
return finalContent, iteration, nil
|
||||
return finalContent, iteration, false, nil
|
||||
}
|
||||
|
||||
// selectCandidates returns the model candidates and resolved model name to use
|
||||
|
||||
@@ -298,6 +298,152 @@ func TestToolRegistry_GetDefinitions(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestProcessMessage_MediaToolHandledSkipsFollowUpLLMAndFinalText(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
cfg := &config.Config{
|
||||
Agents: config.AgentsConfig{
|
||||
Defaults: config.AgentDefaults{
|
||||
Workspace: tmpDir,
|
||||
Model: "test-model",
|
||||
MaxTokens: 4096,
|
||||
MaxToolIterations: 10,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
msgBus := bus.NewMessageBus()
|
||||
provider := &handledMediaProvider{}
|
||||
al := NewAgentLoop(cfg, msgBus, provider)
|
||||
|
||||
store := media.NewFileMediaStore()
|
||||
al.SetMediaStore(store)
|
||||
|
||||
imagePath := filepath.Join(tmpDir, "screen.png")
|
||||
if err := os.WriteFile(imagePath, []byte("fake screenshot"), 0o644); err != nil {
|
||||
t.Fatalf("WriteFile(imagePath) error = %v", err)
|
||||
}
|
||||
|
||||
al.RegisterTool(&handledMediaTool{
|
||||
store: store,
|
||||
path: imagePath,
|
||||
})
|
||||
|
||||
response, err := al.processMessage(context.Background(), bus.InboundMessage{
|
||||
Channel: "telegram",
|
||||
ChatID: "chat1",
|
||||
SenderID: "user1",
|
||||
Content: "take a screenshot of the screen and send it to me",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("processMessage() error = %v", err)
|
||||
}
|
||||
if response != "" {
|
||||
t.Fatalf("expected no final response when media tool already handled delivery, got %q", response)
|
||||
}
|
||||
if provider.calls != 1 {
|
||||
t.Fatalf("expected exactly 1 LLM call, got %d", provider.calls)
|
||||
}
|
||||
if len(provider.toolCounts) != 1 {
|
||||
t.Fatalf("expected tool counts for 1 provider call, got %d", len(provider.toolCounts))
|
||||
}
|
||||
if provider.toolCounts[0] == 0 {
|
||||
t.Fatal("expected tools to be available on the first LLM call")
|
||||
}
|
||||
|
||||
select {
|
||||
case mediaMsg := <-msgBus.OutboundMediaChan():
|
||||
if mediaMsg.Channel != "telegram" || mediaMsg.ChatID != "chat1" {
|
||||
t.Fatalf("unexpected outbound media target: %+v", mediaMsg)
|
||||
}
|
||||
if len(mediaMsg.Parts) != 1 {
|
||||
t.Fatalf("expected exactly 1 outbound media part, got %d", len(mediaMsg.Parts))
|
||||
}
|
||||
default:
|
||||
t.Fatal("expected outbound media message to be published")
|
||||
}
|
||||
|
||||
defaultAgent := al.GetRegistry().GetDefaultAgent()
|
||||
if defaultAgent == nil {
|
||||
t.Fatal("expected default agent")
|
||||
}
|
||||
route, _, err := al.resolveMessageRoute(bus.InboundMessage{
|
||||
Channel: "telegram",
|
||||
ChatID: "chat1",
|
||||
SenderID: "user1",
|
||||
Content: "take a screenshot of the screen and send it to me",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("resolveMessageRoute() error = %v", err)
|
||||
}
|
||||
sessionKey := resolveScopeKey(route, "")
|
||||
history := defaultAgent.Sessions.GetHistory(sessionKey)
|
||||
if len(history) == 0 {
|
||||
t.Fatal("expected session history to be saved")
|
||||
}
|
||||
last := history[len(history)-1]
|
||||
if last.Role != "assistant" || last.Content != handledToolResponseSummary {
|
||||
t.Fatalf("expected handled assistant summary in history, got %+v", last)
|
||||
}
|
||||
}
|
||||
|
||||
func TestProcessMessage_MediaArtifactCanBeForwardedBySendFile(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
cfg := config.DefaultConfig()
|
||||
cfg.Agents.Defaults.Workspace = tmpDir
|
||||
cfg.Agents.Defaults.Model = "test-model"
|
||||
cfg.Agents.Defaults.MaxTokens = 4096
|
||||
cfg.Agents.Defaults.MaxToolIterations = 10
|
||||
|
||||
msgBus := bus.NewMessageBus()
|
||||
provider := &artifactThenSendProvider{}
|
||||
al := NewAgentLoop(cfg, msgBus, provider)
|
||||
|
||||
store := media.NewFileMediaStore()
|
||||
al.SetMediaStore(store)
|
||||
|
||||
mediaDir := media.TempDir()
|
||||
if err := os.MkdirAll(mediaDir, 0o700); err != nil {
|
||||
t.Fatalf("MkdirAll(mediaDir) error = %v", err)
|
||||
}
|
||||
imagePath := filepath.Join(mediaDir, "artifact-screen.png")
|
||||
if err := os.WriteFile(imagePath, []byte("fake screenshot"), 0o644); err != nil {
|
||||
t.Fatalf("WriteFile(imagePath) error = %v", err)
|
||||
}
|
||||
|
||||
al.RegisterTool(&mediaArtifactTool{
|
||||
store: store,
|
||||
path: imagePath,
|
||||
})
|
||||
|
||||
response, err := al.processMessage(context.Background(), bus.InboundMessage{
|
||||
Channel: "telegram",
|
||||
ChatID: "chat1",
|
||||
SenderID: "user1",
|
||||
Content: "take a screenshot of the screen and send it to me",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("processMessage() error = %v", err)
|
||||
}
|
||||
if response != "" {
|
||||
t.Fatalf("expected no final response after send_file handled delivery, got %q", response)
|
||||
}
|
||||
if provider.calls != 2 {
|
||||
t.Fatalf("expected 2 LLM calls (artifact + send_file), got %d", provider.calls)
|
||||
}
|
||||
|
||||
select {
|
||||
case mediaMsg := <-msgBus.OutboundMediaChan():
|
||||
if mediaMsg.Channel != "telegram" || mediaMsg.ChatID != "chat1" {
|
||||
t.Fatalf("unexpected outbound media target: %+v", mediaMsg)
|
||||
}
|
||||
if len(mediaMsg.Parts) != 1 {
|
||||
t.Fatalf("expected exactly 1 outbound media part, got %d", len(mediaMsg.Parts))
|
||||
}
|
||||
default:
|
||||
t.Fatal("expected outbound media from send_file")
|
||||
}
|
||||
}
|
||||
|
||||
// TestAgentLoop_GetStartupInfo verifies startup info contains tools
|
||||
func TestAgentLoop_GetStartupInfo(t *testing.T) {
|
||||
tmpDir, err := os.MkdirTemp("", "agent-test-*")
|
||||
@@ -420,6 +566,98 @@ func (m *countingMockProvider) GetDefaultModel() string {
|
||||
return "counting-mock-model"
|
||||
}
|
||||
|
||||
type handledMediaProvider struct {
|
||||
calls int
|
||||
toolCounts []int
|
||||
}
|
||||
|
||||
func (m *handledMediaProvider) Chat(
|
||||
ctx context.Context,
|
||||
messages []providers.Message,
|
||||
tools []providers.ToolDefinition,
|
||||
model string,
|
||||
opts map[string]any,
|
||||
) (*providers.LLMResponse, error) {
|
||||
m.calls++
|
||||
m.toolCounts = append(m.toolCounts, len(tools))
|
||||
if m.calls == 1 {
|
||||
return &providers.LLMResponse{
|
||||
Content: "Taking the screenshot now.",
|
||||
ToolCalls: []providers.ToolCall{{
|
||||
ID: "call_handled_media",
|
||||
Type: "function",
|
||||
Name: "handled_media_tool",
|
||||
Arguments: map[string]any{},
|
||||
}},
|
||||
}, nil
|
||||
}
|
||||
return &providers.LLMResponse{}, nil
|
||||
}
|
||||
|
||||
func (m *handledMediaProvider) GetDefaultModel() string {
|
||||
return "handled-media-model"
|
||||
}
|
||||
|
||||
type artifactThenSendProvider struct {
|
||||
calls int
|
||||
}
|
||||
|
||||
func (m *artifactThenSendProvider) Chat(
|
||||
ctx context.Context,
|
||||
messages []providers.Message,
|
||||
tools []providers.ToolDefinition,
|
||||
model string,
|
||||
opts map[string]any,
|
||||
) (*providers.LLMResponse, error) {
|
||||
m.calls++
|
||||
if m.calls == 1 {
|
||||
return &providers.LLMResponse{
|
||||
Content: "Taking the screenshot now.",
|
||||
ToolCalls: []providers.ToolCall{{
|
||||
ID: "call_artifact_media",
|
||||
Type: "function",
|
||||
Name: "media_artifact_tool",
|
||||
Arguments: map[string]any{},
|
||||
}},
|
||||
}, nil
|
||||
}
|
||||
|
||||
var artifactPath string
|
||||
for i := len(messages) - 1; i >= 0; i-- {
|
||||
if messages[i].Role != "tool" {
|
||||
continue
|
||||
}
|
||||
start := strings.Index(messages[i].Content, "[file:")
|
||||
if start < 0 {
|
||||
continue
|
||||
}
|
||||
rest := messages[i].Content[start+len("[file:"):]
|
||||
end := strings.Index(rest, "]")
|
||||
if end < 0 {
|
||||
continue
|
||||
}
|
||||
artifactPath = rest[:end]
|
||||
break
|
||||
}
|
||||
if artifactPath == "" {
|
||||
return nil, fmt.Errorf("provider did not receive artifact path in tool result")
|
||||
}
|
||||
|
||||
return &providers.LLMResponse{
|
||||
Content: "",
|
||||
ToolCalls: []providers.ToolCall{{
|
||||
ID: "call_send_file",
|
||||
Type: "function",
|
||||
Name: "send_file",
|
||||
Arguments: map[string]any{"path": artifactPath},
|
||||
}},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (m *artifactThenSendProvider) GetDefaultModel() string {
|
||||
return "artifact-then-send-model"
|
||||
}
|
||||
|
||||
type toolLimitOnlyProvider struct{}
|
||||
|
||||
func (m *toolLimitOnlyProvider) Chat(
|
||||
@@ -465,6 +703,64 @@ func (m *mockCustomTool) Execute(ctx context.Context, args map[string]any) *tool
|
||||
return tools.SilentResult("Custom tool executed")
|
||||
}
|
||||
|
||||
type handledMediaTool struct {
|
||||
store media.MediaStore
|
||||
path string
|
||||
}
|
||||
|
||||
func (m *handledMediaTool) Name() string { return "handled_media_tool" }
|
||||
func (m *handledMediaTool) Description() string {
|
||||
return "Returns a media attachment and fully handles the user response"
|
||||
}
|
||||
|
||||
func (m *handledMediaTool) Parameters() map[string]any {
|
||||
return map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{},
|
||||
}
|
||||
}
|
||||
|
||||
func (m *handledMediaTool) Execute(ctx context.Context, args map[string]any) *tools.ToolResult {
|
||||
ref, err := m.store.Store(m.path, media.MediaMeta{
|
||||
Filename: filepath.Base(m.path),
|
||||
ContentType: "image/png",
|
||||
Source: "test:handled_media_tool",
|
||||
}, "test:handled_media")
|
||||
if err != nil {
|
||||
return tools.ErrorResult(err.Error()).WithError(err)
|
||||
}
|
||||
return tools.MediaResult("Attachment delivered by tool.", []string{ref}).WithResponseHandled()
|
||||
}
|
||||
|
||||
type mediaArtifactTool struct {
|
||||
store media.MediaStore
|
||||
path string
|
||||
}
|
||||
|
||||
func (m *mediaArtifactTool) Name() string { return "media_artifact_tool" }
|
||||
func (m *mediaArtifactTool) Description() string {
|
||||
return "Returns a media artifact that the agent can forward or save later"
|
||||
}
|
||||
|
||||
func (m *mediaArtifactTool) Parameters() map[string]any {
|
||||
return map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{},
|
||||
}
|
||||
}
|
||||
|
||||
func (m *mediaArtifactTool) Execute(ctx context.Context, args map[string]any) *tools.ToolResult {
|
||||
ref, err := m.store.Store(m.path, media.MediaMeta{
|
||||
Filename: filepath.Base(m.path),
|
||||
ContentType: "image/png",
|
||||
Source: "test:media_artifact_tool",
|
||||
}, "test:media_artifact")
|
||||
if err != nil {
|
||||
return tools.ErrorResult(err.Error()).WithError(err)
|
||||
}
|
||||
return tools.MediaResult("Artifact created.", []string{ref})
|
||||
}
|
||||
|
||||
type toolLimitTestTool struct{}
|
||||
|
||||
func (m *toolLimitTestTool) Name() string {
|
||||
|
||||
Reference in New Issue
Block a user