From 1245f2ddf6a2126de087dded1b81f13a9086a5fd Mon Sep 17 00:00:00 2001 From: afjcjsbx Date: Tue, 14 Apr 2026 22:15:28 +0200 Subject: [PATCH 1/4] fix(agent): recover after image-input-unsupported failures --- pkg/agent/llm_media.go | 60 ++++++++++++++++++++++++++++++++++++++++++ pkg/agent/loop.go | 41 +++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 pkg/agent/llm_media.go diff --git a/pkg/agent/llm_media.go b/pkg/agent/llm_media.go new file mode 100644 index 000000000..eb1908777 --- /dev/null +++ b/pkg/agent/llm_media.go @@ -0,0 +1,60 @@ +package agent + +import ( + "strings" + + "github.com/sipeed/picoclaw/pkg/providers" +) + +func messagesContainMedia(messages []providers.Message) bool { + for _, msg := range messages { + for _, ref := range msg.Media { + if strings.TrimSpace(ref) != "" { + return true + } + } + } + return false +} + +func stripMessageMedia(messages []providers.Message) []providers.Message { + if !messagesContainMedia(messages) { + return messages + } + stripped := make([]providers.Message, len(messages)) + for i, msg := range messages { + stripped[i] = msg + stripped[i].Media = nil + } + return stripped +} + +func isVisionUnsupportedError(err error) bool { + if err == nil { + return false + } + msg := strings.ToLower(err.Error()) + + // OpenRouter (and OpenAI-compatible) style. + if strings.Contains(msg, "no endpoints found that support image input") { + return true + } + + // Common provider variants. + if strings.Contains(msg, "does not support image input") || + strings.Contains(msg, "does not support image inputs") || + strings.Contains(msg, "does not support images") || + strings.Contains(msg, "image input is not supported") || + strings.Contains(msg, "images are not supported") || + strings.Contains(msg, "does not support vision") || + strings.Contains(msg, "unsupported content type: image_url") { + return true + } + + // Some providers return a generic "invalid" message that still mentions image_url. + if strings.Contains(msg, "image_url") && strings.Contains(msg, "invalid") { + return true + } + + return false +} diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index bc71fa088..11d8c7a85 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -2360,6 +2360,8 @@ turnLoop: var response *providers.LLMResponse var err error maxRetries := 2 + callHasMedia := messagesContainMedia(callMessages) + didStripMedia := false for retry := 0; retry <= maxRetries; retry++ { response, err = callLLM(callMessages, providerToolDefs) if err == nil { @@ -2370,6 +2372,45 @@ turnLoop: return al.abortTurn(ts) } + // If the provider/model doesn't support multimodal inputs, retry once with media stripped + // so the session doesn't get "stuck" after a user sends an image. + if callHasMedia && !didStripMedia && isVisionUnsupportedError(err) { + didStripMedia = true + if !ts.opts.NoHistory { + history := ts.agent.Sessions.GetHistory(ts.sessionKey) + ts.agent.Sessions.SetHistory(ts.sessionKey, stripMessageMedia(history)) + + // Keep persistedMessages aligned so abort restore-point trimming remains correct. + ts.mu.Lock() + for i := range ts.persistedMessages { + ts.persistedMessages[i].Media = nil + } + ts.mu.Unlock() + + ts.refreshRestorePointFromSession(ts.agent) + } + + messages = stripMessageMedia(messages) + callMessages = stripMessageMedia(callMessages) + callHasMedia = false + + al.emitEvent( + EventKindLLMRetry, + ts.eventMeta("runTurn", "turn.llm.retry"), + LLMRetryPayload{ + Attempt: 1, + MaxRetries: 1, + Reason: "vision_unsupported", + Error: err.Error(), + Backoff: 0, + }, + ) + response, err = callLLM(callMessages, providerToolDefs) + if err == nil { + break + } + } + errMsg := strings.ToLower(err.Error()) isTimeoutError := errors.Is(err, context.DeadlineExceeded) || strings.Contains(errMsg, "deadline exceeded") || From d3d639cb7d67556fec9c5d2fdadb01af21b60feb Mon Sep 17 00:00:00 2001 From: afjcjsbx Date: Tue, 14 Apr 2026 22:21:33 +0200 Subject: [PATCH 2/4] fix lint --- pkg/agent/loop.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index 11d8c7a85..2dd0144fc 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -2377,7 +2377,7 @@ turnLoop: if callHasMedia && !didStripMedia && isVisionUnsupportedError(err) { didStripMedia = true if !ts.opts.NoHistory { - history := ts.agent.Sessions.GetHistory(ts.sessionKey) + history = ts.agent.Sessions.GetHistory(ts.sessionKey) ts.agent.Sessions.SetHistory(ts.sessionKey, stripMessageMedia(history)) // Keep persistedMessages aligned so abort restore-point trimming remains correct. From 7824bc715f2c219403ccd38d77a155af41ad1dc8 Mon Sep 17 00:00:00 2001 From: afjcjsbx Date: Tue, 14 Apr 2026 22:31:30 +0200 Subject: [PATCH 3/4] add test --- pkg/agent/loop_test.go | 129 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/pkg/agent/loop_test.go b/pkg/agent/loop_test.go index 9cca84b6b..183d65afb 100644 --- a/pkg/agent/loop_test.go +++ b/pkg/agent/loop_test.go @@ -2565,6 +2565,135 @@ func TestAgentLoop_ContextExhaustionRetry(t *testing.T) { } } +type visionUnsupportedMediaProvider struct { + calls int + mediaSeen []bool +} + +func (p *visionUnsupportedMediaProvider) Chat( + ctx context.Context, + messages []providers.Message, + tools []providers.ToolDefinition, + model string, + opts map[string]any, +) (*providers.LLMResponse, error) { + p.calls++ + + hasMedia := false + for _, msg := range messages { + for _, ref := range msg.Media { + if strings.TrimSpace(ref) != "" { + hasMedia = true + break + } + } + if hasMedia { + break + } + } + p.mediaSeen = append(p.mediaSeen, hasMedia) + + if hasMedia { + return nil, fmt.Errorf("API request failed: Status: 404 Body: {\"error\":{\"message\":\"No endpoints found that support image input\"}}") + } + + return &providers.LLMResponse{ + Content: "ok", + ToolCalls: []providers.ToolCall{}, + }, nil +} + +func (p *visionUnsupportedMediaProvider) GetDefaultModel() string { + return "mock-fail-model" +} + +func TestAgentLoop_VisionUnsupportedErrorStripsSessionMedia(t *testing.T) { + workspace := t.TempDir() + + cfg := &config.Config{ + Agents: config.AgentsConfig{ + Defaults: config.AgentDefaults{ + Workspace: workspace, + ModelName: "test-model", + MaxTokens: 4096, + MaxToolIterations: 3, + }, + }, + } + + msgBus := bus.NewMessageBus() + provider := &visionUnsupportedMediaProvider{} + al := NewAgentLoop(cfg, msgBus, provider) + + sessionKey := "agent:main:telegram:direct:user1" + + timeoutCtx, cancel := context.WithTimeout(context.Background(), responseTimeout) + defer cancel() + + resp, err := al.processMessage(timeoutCtx, testInboundMessage(bus.InboundMessage{ + Context: bus.InboundContext{ + Channel: "telegram", + ChatID: "chat1", + ChatType: "direct", + SenderID: "user1", + MessageID: "m1", + }, + Content: "describe this", + Media: []string{"data:image/png;base64,abc123"}, + SessionKey: sessionKey, + })) + if err != nil { + t.Fatalf("processMessage() error = %v", err) + } + if resp != "ok" { + t.Fatalf("response = %q, want %q", resp, "ok") + } + if provider.calls != 2 { + t.Fatalf("calls = %d, want %d (fail with media, then retry without media)", provider.calls, 2) + } + if !slices.Equal(provider.mediaSeen, []bool{true, false}) { + t.Fatalf("mediaSeen = %v, want %v", provider.mediaSeen, []bool{true, false}) + } + + agent := al.registry.GetDefaultAgent() + if agent == nil { + t.Fatal("expected default agent") + } + history := agent.Sessions.GetHistory(sessionKey) + for i, msg := range history { + if len(msg.Media) > 0 { + t.Fatalf("history[%d].Media = %v, want no media after stripping", i, msg.Media) + } + } + + timeoutCtx2, cancel2 := context.WithTimeout(context.Background(), responseTimeout) + defer cancel2() + + resp2, err := al.processMessage(timeoutCtx2, testInboundMessage(bus.InboundMessage{ + Context: bus.InboundContext{ + Channel: "telegram", + ChatID: "chat1", + ChatType: "direct", + SenderID: "user1", + MessageID: "m2", + }, + Content: "hello again", + SessionKey: sessionKey, + })) + if err != nil { + t.Fatalf("processMessage() second call error = %v", err) + } + if resp2 != "ok" { + t.Fatalf("second response = %q, want %q", resp2, "ok") + } + if provider.calls != 3 { + t.Fatalf("calls after second turn = %d, want %d", provider.calls, 3) + } + if !slices.Equal(provider.mediaSeen, []bool{true, false, false}) { + t.Fatalf("mediaSeen = %v, want %v", provider.mediaSeen, []bool{true, false, false}) + } +} + func TestAgentLoop_EmptyModelResponseUsesAccurateFallback(t *testing.T) { tmpDir, err := os.MkdirTemp("", "agent-test-*") if err != nil { From e60a687387cf96e698f9188475e73f2312f03fa2 Mon Sep 17 00:00:00 2001 From: afjcjsbx Date: Tue, 14 Apr 2026 22:35:02 +0200 Subject: [PATCH 4/4] fix lint --- pkg/agent/loop_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/agent/loop_test.go b/pkg/agent/loop_test.go index 183d65afb..e01f74e46 100644 --- a/pkg/agent/loop_test.go +++ b/pkg/agent/loop_test.go @@ -2594,7 +2594,8 @@ func (p *visionUnsupportedMediaProvider) Chat( p.mediaSeen = append(p.mediaSeen, hasMedia) if hasMedia { - return nil, fmt.Errorf("API request failed: Status: 404 Body: {\"error\":{\"message\":\"No endpoints found that support image input\"}}") + return nil, fmt.Errorf("API request failed: " + + "Status: 404 Body: {\"error\":{\"message\":\"No endpoints found that support image input\"}}") } return &providers.LLMResponse{