From 1245f2ddf6a2126de087dded1b81f13a9086a5fd Mon Sep 17 00:00:00 2001
From: afjcjsbx
Date: Tue, 14 Apr 2026 22:15:28 +0200
Subject: [PATCH] fix(agent): recover after image-input-unsupported failures

---
 pkg/agent/llm_media.go | 60 ++++++++++++++++++++++++++++++++++++++++++
 pkg/agent/loop.go      | 41 +++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 pkg/agent/llm_media.go

diff --git a/pkg/agent/llm_media.go b/pkg/agent/llm_media.go
new file mode 100644
index 000000000..eb1908777
--- /dev/null
+++ b/pkg/agent/llm_media.go
@@ -0,0 +1,60 @@
+package agent
+
+import (
+	"strings"
+
+	"github.com/sipeed/picoclaw/pkg/providers"
+)
+
+func messagesContainMedia(messages []providers.Message) bool {
+	for _, msg := range messages {
+		for _, ref := range msg.Media {
+			if strings.TrimSpace(ref) != "" {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+func stripMessageMedia(messages []providers.Message) []providers.Message {
+	if !messagesContainMedia(messages) {
+		return messages
+	}
+	stripped := make([]providers.Message, len(messages))
+	for i, msg := range messages {
+		stripped[i] = msg
+		stripped[i].Media = nil
+	}
+	return stripped
+}
+
+func isVisionUnsupportedError(err error) bool {
+	if err == nil {
+		return false
+	}
+	msg := strings.ToLower(err.Error())
+
+	// OpenRouter (and OpenAI-compatible) style.
+	if strings.Contains(msg, "no endpoints found that support image input") {
+		return true
+	}
+
+	// Common provider variants.
+	if strings.Contains(msg, "does not support image input") ||
+		strings.Contains(msg, "does not support image inputs") ||
+		strings.Contains(msg, "does not support images") ||
+		strings.Contains(msg, "image input is not supported") ||
+		strings.Contains(msg, "images are not supported") ||
+		strings.Contains(msg, "does not support vision") ||
+		strings.Contains(msg, "unsupported content type: image_url") {
+		return true
+	}
+
+	// Some providers return a generic "invalid" message that still mentions image_url.
+	if strings.Contains(msg, "image_url") && strings.Contains(msg, "invalid") {
+		return true
+	}
+
+	return false
+}
diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go
index bc71fa088..11d8c7a85 100644
--- a/pkg/agent/loop.go
+++ b/pkg/agent/loop.go
@@ -2360,6 +2360,8 @@ turnLoop:
 		var response *providers.LLMResponse
 		var err error
 		maxRetries := 2
+		callHasMedia := messagesContainMedia(callMessages)
+		didStripMedia := false
 		for retry := 0; retry <= maxRetries; retry++ {
 			response, err = callLLM(callMessages, providerToolDefs)
 			if err == nil {
@@ -2370,6 +2372,45 @@ turnLoop:
 				return al.abortTurn(ts)
 			}
 
+			// If the provider/model doesn't support multimodal inputs, retry once with media stripped
+			// so the session doesn't get "stuck" after a user sends an image.
+			if callHasMedia && !didStripMedia && isVisionUnsupportedError(err) {
+				didStripMedia = true
+				if !ts.opts.NoHistory {
+					history := ts.agent.Sessions.GetHistory(ts.sessionKey)
+					ts.agent.Sessions.SetHistory(ts.sessionKey, stripMessageMedia(history))
+
+					// Keep persistedMessages aligned so abort restore-point trimming remains correct.
+					ts.mu.Lock()
+					for i := range ts.persistedMessages {
+						ts.persistedMessages[i].Media = nil
+					}
+					ts.mu.Unlock()
+
+					ts.refreshRestorePointFromSession(ts.agent)
+				}
+
+				messages = stripMessageMedia(messages)
+				callMessages = stripMessageMedia(callMessages)
+				callHasMedia = false
+
+				al.emitEvent(
+					EventKindLLMRetry,
+					ts.eventMeta("runTurn", "turn.llm.retry"),
+					LLMRetryPayload{
+						Attempt:    1,
+						MaxRetries: 1,
+						Reason:     "vision_unsupported",
+						Error:      err.Error(),
+						Backoff:    0,
+					},
+				)
+				response, err = callLLM(callMessages, providerToolDefs)
+				if err == nil {
+					break
+				}
+			}
+
 			errMsg := strings.ToLower(err.Error())
 			isTimeoutError := errors.Is(err, context.DeadlineExceeded) ||
 				strings.Contains(errMsg, "deadline exceeded") ||