mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
fix(agent): recover after image-input-unsupported failures
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/sipeed/picoclaw/pkg/providers"
|
||||
)
|
||||
|
||||
func messagesContainMedia(messages []providers.Message) bool {
|
||||
for _, msg := range messages {
|
||||
for _, ref := range msg.Media {
|
||||
if strings.TrimSpace(ref) != "" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func stripMessageMedia(messages []providers.Message) []providers.Message {
|
||||
if !messagesContainMedia(messages) {
|
||||
return messages
|
||||
}
|
||||
stripped := make([]providers.Message, len(messages))
|
||||
for i, msg := range messages {
|
||||
stripped[i] = msg
|
||||
stripped[i].Media = nil
|
||||
}
|
||||
return stripped
|
||||
}
|
||||
|
||||
// isVisionUnsupportedError reports whether err's message matches one of the
// known phrasings for an image-input-unsupported failure. Matching is a
// case-insensitive substring search over err.Error(). A nil err yields false.
func isVisionUnsupportedError(err error) bool {
	if err == nil {
		return false
	}
	lowered := strings.ToLower(err.Error())

	phrases := [...]string{
		// OpenRouter (and OpenAI-compatible) style.
		"no endpoints found that support image input",
		// Common provider variants.
		"does not support image input",
		"does not support image inputs",
		"does not support images",
		"image input is not supported",
		"images are not supported",
		"does not support vision",
		"unsupported content type: image_url",
	}
	for _, phrase := range phrases {
		if strings.Contains(lowered, phrase) {
			return true
		}
	}

	// Some providers return a generic "invalid" message that still mentions image_url.
	return strings.Contains(lowered, "image_url") && strings.Contains(lowered, "invalid")
}
|
||||
@@ -2360,6 +2360,8 @@ turnLoop:
|
||||
var response *providers.LLMResponse
|
||||
var err error
|
||||
maxRetries := 2
|
||||
callHasMedia := messagesContainMedia(callMessages)
|
||||
didStripMedia := false
|
||||
for retry := 0; retry <= maxRetries; retry++ {
|
||||
response, err = callLLM(callMessages, providerToolDefs)
|
||||
if err == nil {
|
||||
@@ -2370,6 +2372,45 @@ turnLoop:
|
||||
return al.abortTurn(ts)
|
||||
}
|
||||
|
||||
// If the provider/model doesn't support multimodal inputs, retry once with media stripped
|
||||
// so the session doesn't get "stuck" after a user sends an image.
|
||||
if callHasMedia && !didStripMedia && isVisionUnsupportedError(err) {
|
||||
didStripMedia = true
|
||||
if !ts.opts.NoHistory {
|
||||
history := ts.agent.Sessions.GetHistory(ts.sessionKey)
|
||||
ts.agent.Sessions.SetHistory(ts.sessionKey, stripMessageMedia(history))
|
||||
|
||||
// Keep persistedMessages aligned so abort restore-point trimming remains correct.
|
||||
ts.mu.Lock()
|
||||
for i := range ts.persistedMessages {
|
||||
ts.persistedMessages[i].Media = nil
|
||||
}
|
||||
ts.mu.Unlock()
|
||||
|
||||
ts.refreshRestorePointFromSession(ts.agent)
|
||||
}
|
||||
|
||||
messages = stripMessageMedia(messages)
|
||||
callMessages = stripMessageMedia(callMessages)
|
||||
callHasMedia = false
|
||||
|
||||
al.emitEvent(
|
||||
EventKindLLMRetry,
|
||||
ts.eventMeta("runTurn", "turn.llm.retry"),
|
||||
LLMRetryPayload{
|
||||
Attempt: 1,
|
||||
MaxRetries: 1,
|
||||
Reason: "vision_unsupported",
|
||||
Error: err.Error(),
|
||||
Backoff: 0,
|
||||
},
|
||||
)
|
||||
response, err = callLLM(callMessages, providerToolDefs)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
errMsg := strings.ToLower(err.Error())
|
||||
isTimeoutError := errors.Is(err, context.DeadlineExceeded) ||
|
||||
strings.Contains(errMsg, "deadline exceeded") ||
|
||||
|
||||
Reference in New Issue
Block a user