feat(agent): add network error retry with configurable max retries and backoff

- Add isNetworkError detection for connection reset, broken pipe, read/write tcp, EOF
- Add retry logic with configurable exponential backoff for network errors
- Add config options max_llm_retries and llm_retry_backoff_secs in agents.defaults
- Network errors now retry with backoff (was previously not retried)
- Timeout errors now use configurable backoff instead of hardcoded 5s
- Default: 2 retries with 2s backoff (3 total attempts)
This commit is contained in:
David Siewert
2026-04-25 19:08:46 +06:00
parent 71c877a67f
commit 06fad95719
2 changed files with 49 additions and 2 deletions
+47 -2
View File
@@ -185,7 +185,14 @@ func (p *Pipeline) CallLLM(
// Retry loop
var err error
maxRetries := 2
maxRetries := p.Cfg.Agents.Defaults.MaxLLMRetries
if maxRetries <= 0 {
maxRetries = 2
}
backoffSecs := p.Cfg.Agents.Defaults.LLMRetryBackoffSecs
if backoffSecs <= 0 {
backoffSecs = 2
}
for retry := 0; retry <= maxRetries; retry++ {
exec.response, err = callLLM(exec.callMessages, exec.providerToolDefs)
if err == nil {
@@ -233,6 +240,15 @@ func (p *Pipeline) CallLLM(
strings.Contains(errMsg, "timed out") ||
strings.Contains(errMsg, "timeout exceeded")
isNetworkError := !isTimeoutError && (strings.Contains(errMsg, "connection reset") ||
strings.Contains(errMsg, "connection refused") ||
strings.Contains(errMsg, "broken pipe") ||
strings.Contains(errMsg, "no such host") ||
strings.Contains(errMsg, "network is unreachable") ||
strings.Contains(errMsg, "read tcp") ||
strings.Contains(errMsg, "write tcp") ||
strings.Contains(errMsg, "eof"))
isContextError := !isTimeoutError && (strings.Contains(errMsg, "context_length_exceeded") ||
strings.Contains(errMsg, "context window") ||
strings.Contains(errMsg, "context_window") ||
@@ -245,7 +261,7 @@ func (p *Pipeline) CallLLM(
strings.Contains(errMsg, "request too large"))
if isTimeoutError && retry < maxRetries {
backoff := time.Duration(retry+1) * 5 * time.Second
backoff := time.Duration(retry+1) * time.Duration(backoffSecs) * time.Second
al.emitEvent(
EventKindLLMRetry,
ts.eventMeta("runTurn", "turn.llm.retry"),
@@ -273,6 +289,35 @@ func (p *Pipeline) CallLLM(
continue
}
if isNetworkError && retry < maxRetries {
backoff := time.Duration(retry+1) * time.Duration(backoffSecs) * time.Second
al.emitEvent(
EventKindLLMRetry,
ts.eventMeta("runTurn", "turn.llm.retry"),
LLMRetryPayload{
Attempt: retry + 1,
MaxRetries: maxRetries,
Reason: "network",
Error: err.Error(),
Backoff: backoff,
},
)
logger.WarnCF("agent", "Network error, retrying after backoff", map[string]any{
"error": err.Error(),
"retry": retry,
"backoff": backoff.String(),
})
if sleepErr := sleepWithContext(turnCtx, backoff); sleepErr != nil {
if ts.hardAbortRequested() {
_ = ts.requestHardAbort()
return ControlBreak, nil
}
err = sleepErr
break
}
continue
}
if isContextError && retry < maxRetries && !ts.opts.NoHistory {
al.emitEvent(
EventKindLLMRetry,
+2
View File
@@ -275,6 +275,8 @@ type AgentDefaults struct {
SplitOnMarker bool `json:"split_on_marker" env:"PICOCLAW_AGENTS_DEFAULTS_SPLIT_ON_MARKER"` // split messages on <|[SPLIT]|> marker
ContextManager string `json:"context_manager,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER"`
ContextManagerConfig json.RawMessage `json:"context_manager_config,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER_CONFIG"`
MaxLLMRetries int `json:"max_llm_retries,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_MAX_LLM_RETRIES"`
LLMRetryBackoffSecs int `json:"llm_retry_backoff_secs,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_LLM_RETRY_BACKOFF_SECS"`
}
const DefaultMaxMediaSize = 20 * 1024 * 1024 // 20 MB