From 06fad9571959df0d908c8a7dc48d70b3c39a3326 Mon Sep 17 00:00:00 2001 From: David Siewert Date: Sat, 25 Apr 2026 19:08:46 +0600 Subject: [PATCH] feat(agent): add network error retry with configurable max retries and backoff - Add isNetworkError detection for connection reset, connection refused, broken pipe, no such host, network is unreachable, read/write tcp, EOF - Add retry logic with configurable, linearly increasing backoff (attempt number x llm_retry_backoff_secs) for network errors - Add config options max_llm_retries and llm_retry_backoff_secs in agents.defaults - Network errors now retry with backoff (were previously not retried) - Timeout errors now use the configurable backoff instead of the hardcoded 5s step (the default step therefore drops from 5s to 2s) - Default: 2 retries with 2s backoff (3 total attempts) --- pkg/agent/pipeline_llm.go | 49 +++++++++++++++++++++++++++++++++++++-- pkg/config/config.go | 2 ++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/pkg/agent/pipeline_llm.go b/pkg/agent/pipeline_llm.go index c426c25c9..95535ed9b 100644 --- a/pkg/agent/pipeline_llm.go +++ b/pkg/agent/pipeline_llm.go @@ -185,7 +185,14 @@ func (p *Pipeline) CallLLM( // Retry loop var err error - maxRetries := 2 + maxRetries := p.Cfg.Agents.Defaults.MaxLLMRetries + if maxRetries <= 0 { + maxRetries = 2 + } + backoffSecs := p.Cfg.Agents.Defaults.LLMRetryBackoffSecs + if backoffSecs <= 0 { + backoffSecs = 2 + } for retry := 0; retry <= maxRetries; retry++ { exec.response, err = callLLM(exec.callMessages, exec.providerToolDefs) if err == nil { @@ -233,6 +240,15 @@ func (p *Pipeline) CallLLM( strings.Contains(errMsg, "timed out") || strings.Contains(errMsg, "timeout exceeded") + isNetworkError := !isTimeoutError && (strings.Contains(errMsg, "connection reset") || + strings.Contains(errMsg, "connection refused") || + strings.Contains(errMsg, "broken pipe") || + strings.Contains(errMsg, "no such host") || + strings.Contains(errMsg, "network is unreachable") || + strings.Contains(errMsg, "read tcp") || + strings.Contains(errMsg, "write tcp") || + strings.Contains(errMsg, "eof")) + isContextError := !isTimeoutError &&
(strings.Contains(errMsg, "context_length_exceeded") || strings.Contains(errMsg, "context window") || strings.Contains(errMsg, "context_window") || @@ -245,7 +261,7 @@ func (p *Pipeline) CallLLM( strings.Contains(errMsg, "request too large")) if isTimeoutError && retry < maxRetries { - backoff := time.Duration(retry+1) * 5 * time.Second + backoff := time.Duration(retry+1) * time.Duration(backoffSecs) * time.Second al.emitEvent( EventKindLLMRetry, ts.eventMeta("runTurn", "turn.llm.retry"), @@ -273,6 +289,35 @@ func (p *Pipeline) CallLLM( continue } + if isNetworkError && retry < maxRetries { + backoff := time.Duration(retry+1) * time.Duration(backoffSecs) * time.Second + al.emitEvent( + EventKindLLMRetry, + ts.eventMeta("runTurn", "turn.llm.retry"), + LLMRetryPayload{ + Attempt: retry + 1, + MaxRetries: maxRetries, + Reason: "network", + Error: err.Error(), + Backoff: backoff, + }, + ) + logger.WarnCF("agent", "Network error, retrying after backoff", map[string]any{ + "error": err.Error(), + "retry": retry, + "backoff": backoff.String(), + }) + if sleepErr := sleepWithContext(turnCtx, backoff); sleepErr != nil { + if ts.hardAbortRequested() { + _ = ts.requestHardAbort() + return ControlBreak, nil + } + err = sleepErr + break + } + continue + } + if isContextError && retry < maxRetries && !ts.opts.NoHistory { al.emitEvent( EventKindLLMRetry, diff --git a/pkg/config/config.go b/pkg/config/config.go index 5bc96fb12..804f4c67b 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -275,6 +275,8 @@ type AgentDefaults struct { SplitOnMarker bool `json:"split_on_marker" env:"PICOCLAW_AGENTS_DEFAULTS_SPLIT_ON_MARKER"` // split messages on <|[SPLIT]|> marker ContextManager string `json:"context_manager,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER"` ContextManagerConfig json.RawMessage `json:"context_manager_config,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER_CONFIG"` + MaxLLMRetries int `json:"max_llm_retries,omitempty" 
env:"PICOCLAW_AGENTS_DEFAULTS_MAX_LLM_RETRIES"` + LLMRetryBackoffSecs int `json:"llm_retry_backoff_secs,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_LLM_RETRY_BACKOFF_SECS"` } const DefaultMaxMediaSize = 20 * 1024 * 1024 // 20 MB