fix(agent): recover after image-input-unsupported failures

This commit is contained in:
afjcjsbx
2026-04-14 22:15:28 +02:00
parent c0fadc5918
commit 1245f2ddf6
2 changed files with 101 additions and 0 deletions
+60
View File
@@ -0,0 +1,60 @@
package agent
import (
"strings"
"github.com/sipeed/picoclaw/pkg/providers"
)
// messagesContainMedia reports whether at least one message carries a media
// entry that is non-empty after trimming surrounding whitespace. Blank or
// whitespace-only entries do not count as media.
func messagesContainMedia(messages []providers.Message) bool {
	for i := range messages {
		for _, mediaRef := range messages[i].Media {
			if strings.TrimSpace(mediaRef) == "" {
				// Skip placeholders that carry no actual reference.
				continue
			}
			return true
		}
	}
	return false
}
// stripMessageMedia returns the messages with every Media field cleared.
//
// When messagesContainMedia reports no media, the input slice itself is
// returned (no copy is made). Otherwise a new slice of the same length is
// built from shallow copies of the messages, so the caller's slice elements
// are left untouched.
func stripMessageMedia(messages []providers.Message) []providers.Message {
	if messagesContainMedia(messages) {
		cleaned := make([]providers.Message, len(messages))
		copy(cleaned, messages)
		for i := range cleaned {
			cleaned[i].Media = nil
		}
		return cleaned
	}
	return messages
}
// isVisionUnsupportedError reports whether err's message looks like a
// provider/model rejecting image input. Matching is done on the lower-cased
// error text, so it is case-insensitive. A nil error yields false.
func isVisionUnsupportedError(err error) bool {
	if err == nil {
		return false
	}
	text := strings.ToLower(err.Error())

	// Known phrasings; any single substring match is sufficient.
	knownPhrases := [...]string{
		// OpenRouter (and OpenAI-compatible) style.
		"no endpoints found that support image input",
		// Common provider variants.
		"does not support image input",
		"does not support image inputs",
		"does not support images",
		"image input is not supported",
		"images are not supported",
		"does not support vision",
		"unsupported content type: image_url",
	}
	for _, phrase := range knownPhrases {
		if strings.Contains(text, phrase) {
			return true
		}
	}

	// Some providers return a generic "invalid" message that still mentions
	// image_url; treat the combination of both words as a match.
	return strings.Contains(text, "image_url") && strings.Contains(text, "invalid")
}
+41
View File
@@ -2360,6 +2360,8 @@ turnLoop:
var response *providers.LLMResponse
var err error
maxRetries := 2
callHasMedia := messagesContainMedia(callMessages)
didStripMedia := false
for retry := 0; retry <= maxRetries; retry++ {
response, err = callLLM(callMessages, providerToolDefs)
if err == nil {
@@ -2370,6 +2372,45 @@ turnLoop:
return al.abortTurn(ts)
}
// If the provider/model doesn't support multimodal inputs, retry once with media stripped
// so the session doesn't get "stuck" after a user sends an image.
if callHasMedia && !didStripMedia && isVisionUnsupportedError(err) {
didStripMedia = true
if !ts.opts.NoHistory {
history := ts.agent.Sessions.GetHistory(ts.sessionKey)
ts.agent.Sessions.SetHistory(ts.sessionKey, stripMessageMedia(history))
// Keep persistedMessages aligned so abort restore-point trimming remains correct.
ts.mu.Lock()
for i := range ts.persistedMessages {
ts.persistedMessages[i].Media = nil
}
ts.mu.Unlock()
ts.refreshRestorePointFromSession(ts.agent)
}
messages = stripMessageMedia(messages)
callMessages = stripMessageMedia(callMessages)
callHasMedia = false
al.emitEvent(
EventKindLLMRetry,
ts.eventMeta("runTurn", "turn.llm.retry"),
LLMRetryPayload{
Attempt: 1,
MaxRetries: 1,
Reason: "vision_unsupported",
Error: err.Error(),
Backoff: 0,
},
)
response, err = callLLM(callMessages, providerToolDefs)
if err == nil {
break
}
}
errMsg := strings.ToLower(err.Error())
isTimeoutError := errors.Is(err, context.DeadlineExceeded) ||
strings.Contains(errMsg, "deadline exceeded") ||