From 06fad9571959df0d908c8a7dc48d70b3c39a3326 Mon Sep 17 00:00:00 2001 From: David Siewert Date: Sat, 25 Apr 2026 19:08:46 +0600 Subject: [PATCH 1/8] feat(agent): add network error retry with configurable max retries and backoff - Add isNetworkError detection for connection reset, broken pipe, read/write tcp, EOF - Add retry logic with configurable linear backoff for network errors - Add config options max_llm_retries and llm_retry_backoff_secs in agents.defaults - Network errors now retry with backoff (were previously not retried) - Timeout errors now use configurable backoff instead of hardcoded 5s - Default: 2 retries with 2s backoff (3 total attempts) --- pkg/agent/pipeline_llm.go | 49 +++++++++++++++++++++++++++++++++++++-- pkg/config/config.go | 2 ++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/pkg/agent/pipeline_llm.go b/pkg/agent/pipeline_llm.go index c426c25c9..95535ed9b 100644 --- a/pkg/agent/pipeline_llm.go +++ b/pkg/agent/pipeline_llm.go @@ -185,7 +185,14 @@ func (p *Pipeline) CallLLM( // Retry loop var err error - maxRetries := 2 + maxRetries := p.Cfg.Agents.Defaults.MaxLLMRetries + if maxRetries <= 0 { + maxRetries = 2 + } + backoffSecs := p.Cfg.Agents.Defaults.LLMRetryBackoffSecs + if backoffSecs <= 0 { + backoffSecs = 2 + } for retry := 0; retry <= maxRetries; retry++ { exec.response, err = callLLM(exec.callMessages, exec.providerToolDefs) if err == nil { @@ -233,6 +240,15 @@ func (p *Pipeline) CallLLM( strings.Contains(errMsg, "timed out") || strings.Contains(errMsg, "timeout exceeded") + isNetworkError := !isTimeoutError && (strings.Contains(errMsg, "connection reset") || + strings.Contains(errMsg, "connection refused") || + strings.Contains(errMsg, "broken pipe") || + strings.Contains(errMsg, "no such host") || + strings.Contains(errMsg, "network is unreachable") || + strings.Contains(errMsg, "read tcp") || + strings.Contains(errMsg, "write tcp") || + strings.Contains(errMsg, "eof")) + isContextError := !isTimeoutError && 
(strings.Contains(errMsg, "context_length_exceeded") || strings.Contains(errMsg, "context window") || strings.Contains(errMsg, "context_window") || @@ -245,7 +261,7 @@ func (p *Pipeline) CallLLM( strings.Contains(errMsg, "request too large")) if isTimeoutError && retry < maxRetries { - backoff := time.Duration(retry+1) * 5 * time.Second + backoff := time.Duration(retry+1) * time.Duration(backoffSecs) * time.Second al.emitEvent( EventKindLLMRetry, ts.eventMeta("runTurn", "turn.llm.retry"), @@ -273,6 +289,35 @@ func (p *Pipeline) CallLLM( continue } + if isNetworkError && retry < maxRetries { + backoff := time.Duration(retry+1) * time.Duration(backoffSecs) * time.Second + al.emitEvent( + EventKindLLMRetry, + ts.eventMeta("runTurn", "turn.llm.retry"), + LLMRetryPayload{ + Attempt: retry + 1, + MaxRetries: maxRetries, + Reason: "network", + Error: err.Error(), + Backoff: backoff, + }, + ) + logger.WarnCF("agent", "Network error, retrying after backoff", map[string]any{ + "error": err.Error(), + "retry": retry, + "backoff": backoff.String(), + }) + if sleepErr := sleepWithContext(turnCtx, backoff); sleepErr != nil { + if ts.hardAbortRequested() { + _ = ts.requestHardAbort() + return ControlBreak, nil + } + err = sleepErr + break + } + continue + } + if isContextError && retry < maxRetries && !ts.opts.NoHistory { al.emitEvent( EventKindLLMRetry, diff --git a/pkg/config/config.go b/pkg/config/config.go index 5bc96fb12..804f4c67b 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -275,6 +275,8 @@ type AgentDefaults struct { SplitOnMarker bool `json:"split_on_marker" env:"PICOCLAW_AGENTS_DEFAULTS_SPLIT_ON_MARKER"` // split messages on <|[SPLIT]|> marker ContextManager string `json:"context_manager,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER"` ContextManagerConfig json.RawMessage `json:"context_manager_config,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER_CONFIG"` + MaxLLMRetries int `json:"max_llm_retries,omitempty" 
env:"PICOCLAW_AGENTS_DEFAULTS_MAX_LLM_RETRIES"` + LLMRetryBackoffSecs int `json:"llm_retry_backoff_secs,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_LLM_RETRY_BACKOFF_SECS"` } const DefaultMaxMediaSize = 20 * 1024 * 1024 // 20 MB From 3c4523e7aaeb969d2530f4cb812219c32b65c946 Mon Sep 17 00:00:00 2001 From: David Siewert Date: Sat, 25 Apr 2026 21:19:13 +0600 Subject: [PATCH 2/8] test(agent): add unit tests for network error retry backoff strategy - Test all network error types trigger retry (connection_reset, broken_pipe, read_tcp, eof, connection_refused) - Test custom MaxLLMRetries and LLMRetryBackoffSecs config is respected - Test retry count limit (1 initial + maxRetries retries) - Add countingErrorProvider mock for deterministic call count verification --- pkg/agent/turn_coord_test.go | 167 +++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) diff --git a/pkg/agent/turn_coord_test.go b/pkg/agent/turn_coord_test.go index 7a362a662..9e1eaaf40 100644 --- a/pkg/agent/turn_coord_test.go +++ b/pkg/agent/turn_coord_test.go @@ -106,6 +106,16 @@ func (p *errorProvider) Chat( return nil, errors.New("context_length_exceeded") case "vision": return nil, errors.New("vision_unsupported") + case "connection_reset": + return nil, errors.New("connection reset by peer") + case "broken_pipe": + return nil, errors.New("broken pipe") + case "read_tcp": + return nil, errors.New("read tcp 127.0.0.1:8080: connection reset") + case "eof": + return nil, errors.New("EOF") + case "connection_refused": + return nil, errors.New("connection refused") default: return nil, errors.New("unknown error") } @@ -302,6 +312,163 @@ func TestPipeline_CallLLM_ContextLengthError(t *testing.T) { t.Logf("CallLLM result after context error: err=%v", err) } +func TestPipeline_CallLLM_NetworkErrorRetry(t *testing.T) { + testCases := []struct { + name string + errType string + }{ + {"connection_reset", "connection_reset"}, + {"broken_pipe", "broken_pipe"}, + {"read_tcp", "read_tcp"}, + {"eof", 
"eof"}, + {"connection_refused", "connection_refused"}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + errorPrv := &errorProvider{errType: tc.errType} + al, agent, cleanup := newTurnCoordTestLoop(t, errorPrv) + defer cleanup() + + pipeline := NewPipeline(al) + ts := newTurnState(agent, makeTestProcessOpts("test-session"), turnEventScope{ + turnID: "turn-1", + context: newTurnContext(nil, nil, nil), + }) + + exec, err := pipeline.SetupTurn(context.Background(), ts) + if err != nil { + t.Fatalf("SetupTurn failed: %v", err) + } + + _, err = pipeline.CallLLM(context.Background(), context.Background(), ts, exec, 1) + if err == nil { + t.Error("expected error after network error retries") + } + }) + } +} + +func TestPipeline_CallLLM_RetryConfigRespected(t *testing.T) { + tmpDir := t.TempDir() + + cfg := &config.Config{ + Agents: config.AgentsConfig{ + Defaults: config.AgentDefaults{ + Workspace: tmpDir, + ModelName: "test-model", + MaxTokens: 4096, + MaxToolIterations: 10, + MaxLLMRetries: 3, + LLMRetryBackoffSecs: 1, + }, + }, + } + + msgBus := bus.NewMessageBus() + provider := &errorProvider{errType: "connection_reset"} + al := NewAgentLoop(cfg, msgBus, provider) + defer al.Close() + agent := al.registry.GetDefaultAgent() + if agent == nil { + t.Fatal("expected default agent") + } + + pipeline := NewPipeline(al) + ts := newTurnState(agent, makeTestProcessOpts("test-session"), turnEventScope{ + turnID: "turn-1", + context: newTurnContext(nil, nil, nil), + }) + + exec, err := pipeline.SetupTurn(context.Background(), ts) + if err != nil { + t.Fatalf("SetupTurn failed: %v", err) + } + + start := time.Now() + _, err = pipeline.CallLLM(context.Background(), context.Background(), ts, exec, 1) + elapsed := time.Since(start) + + if err == nil { + t.Error("expected error after retries") + } + + expectedMinTime := 3 * time.Second + if elapsed < expectedMinTime { + t.Errorf("expected at least %v of backoff, got %v", expectedMinTime, elapsed) + } +} 
+ +func TestPipeline_CallLLM_RetryCountLimit(t *testing.T) { + tmpDir := t.TempDir() + + counterPrv := &countingErrorProvider{errType: "connection_reset", targetCalls: 5} + cfg := &config.Config{ + Agents: config.AgentsConfig{ + Defaults: config.AgentDefaults{ + Workspace: tmpDir, + ModelName: "test-model", + MaxTokens: 4096, + MaxToolIterations: 10, + MaxLLMRetries: 2, + LLMRetryBackoffSecs: 0, + }, + }, + } + + msgBus := bus.NewMessageBus() + al := NewAgentLoop(cfg, msgBus, counterPrv) + defer al.Close() + agent := al.registry.GetDefaultAgent() + if agent == nil { + t.Fatal("expected default agent") + } + + pipeline := NewPipeline(al) + ts := newTurnState(agent, makeTestProcessOpts("test-session"), turnEventScope{ + turnID: "turn-1", + context: newTurnContext(nil, nil, nil), + }) + + exec, err := pipeline.SetupTurn(context.Background(), ts) + if err != nil { + t.Fatalf("SetupTurn failed: %v", err) + } + + _, err = pipeline.CallLLM(context.Background(), context.Background(), ts, exec, 1) + if err == nil { + t.Error("expected error after retries") + } + + if counterPrv.callCount != 3 { + t.Errorf("expected exactly 3 calls (1 initial + 2 retries), got %d", counterPrv.callCount) + } +} + +type countingErrorProvider struct { + errType string + targetCalls int + callCount int + mu sync.Mutex +} + +func (p *countingErrorProvider) Chat( + ctx context.Context, + messages []providers.Message, + tools []providers.ToolDefinition, + model string, + opts map[string]any, +) (*providers.LLMResponse, error) { + p.mu.Lock() + p.callCount++ + p.mu.Unlock() + return nil, errors.New("connection reset by peer") +} + +func (p *countingErrorProvider) GetDefaultModel() string { + return "counting-error-model" +} + // ============================================================================= // Pipeline Method Tests: ExecuteTools // ============================================================================= From d2f6a089818dd3207f8007f9af471dd59f5a8eca Mon Sep 17 00:00:00 2001 From: 
David Siewert Date: Sat, 25 Apr 2026 22:07:16 +0600 Subject: [PATCH 3/8] fix(config): align gci formatting for MaxLLMRetries field --- pkg/config/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 804f4c67b..10d9da7e0 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -275,7 +275,7 @@ type AgentDefaults struct { SplitOnMarker bool `json:"split_on_marker" env:"PICOCLAW_AGENTS_DEFAULTS_SPLIT_ON_MARKER"` // split messages on <|[SPLIT]|> marker ContextManager string `json:"context_manager,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER"` ContextManagerConfig json.RawMessage `json:"context_manager_config,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER_CONFIG"` - MaxLLMRetries int `json:"max_llm_retries,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_MAX_LLM_RETRIES"` + MaxLLMRetries int `json:"max_llm_retries,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_MAX_LLM_RETRIES"` LLMRetryBackoffSecs int `json:"llm_retry_backoff_secs,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_LLM_RETRY_BACKOFF_SECS"` } From 32c8b8ce6ae523f8ee29e50c99fd68f735240fd4 Mon Sep 17 00:00:00 2001 From: David Siewert Date: Sat, 25 Apr 2026 22:09:44 +0600 Subject: [PATCH 4/8] chore(config): add default values for max_llm_retries and llm_retry_backoff_secs --- config/config.example.json | 2 ++ pkg/config/defaults.go | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/config/config.example.json b/config/config.example.json index 858472488..cf96e19a8 100644 --- a/config/config.example.json +++ b/config/config.example.json @@ -11,6 +11,8 @@ "summarize_message_threshold": 20, "summarize_token_percent": 75, "split_on_marker": false, + "max_llm_retries": 2, + "llm_retry_backoff_secs": 2, "tool_feedback": { "enabled": false, "max_args_length": 300 diff --git a/pkg/config/defaults.go b/pkg/config/defaults.go index 3d12c6ba5..0d24bebe3 100644 --- a/pkg/config/defaults.go +++ b/pkg/config/defaults.go @@ 
-38,10 +38,12 @@ func DefaultConfig() *Config { Enabled: false, MaxArgsLength: 300, }, - SplitOnMarker: false, - }, + SplitOnMarker: false, + MaxLLMRetries: 2, + LLMRetryBackoffSecs: 2, }, - Session: SessionConfig{ + }, + Session: SessionConfig{ Dimensions: []string{"chat"}, }, Channels: defaultChannels(), From 1b2f8aac7998a72c51d2081941fdebf7349a7fc5 Mon Sep 17 00:00:00 2001 From: David Siewert Date: Sat, 25 Apr 2026 22:12:41 +0600 Subject: [PATCH 5/8] fix(config): align indentation for new LLM retry default fields --- pkg/config/defaults.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/config/defaults.go b/pkg/config/defaults.go index 0d24bebe3..959877e60 100644 --- a/pkg/config/defaults.go +++ b/pkg/config/defaults.go @@ -38,12 +38,12 @@ func DefaultConfig() *Config { Enabled: false, MaxArgsLength: 300, }, - SplitOnMarker: false, - MaxLLMRetries: 2, - LLMRetryBackoffSecs: 2, + SplitOnMarker: false, + MaxLLMRetries: 2, + LLMRetryBackoffSecs: 2, + }, }, - }, - Session: SessionConfig{ + Session: SessionConfig{ Dimensions: []string{"chat"}, }, Channels: defaultChannels(), From 612097b41139a6a78bb3d44aa5cf031d21d786be Mon Sep 17 00:00:00 2001 From: David Siewert Date: Sat, 25 Apr 2026 23:01:45 +0600 Subject: [PATCH 6/8] fix(config): align gci formatting for LLM retry fields --- pkg/config/config.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 10d9da7e0..b8d92b57f 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -275,8 +275,8 @@ type AgentDefaults struct { SplitOnMarker bool `json:"split_on_marker" env:"PICOCLAW_AGENTS_DEFAULTS_SPLIT_ON_MARKER"` // split messages on <|[SPLIT]|> marker ContextManager string `json:"context_manager,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER"` ContextManagerConfig json.RawMessage `json:"context_manager_config,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER_CONFIG"` - MaxLLMRetries int 
`json:"max_llm_retries,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_MAX_LLM_RETRIES"` - LLMRetryBackoffSecs int `json:"llm_retry_backoff_secs,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_LLM_RETRY_BACKOFF_SECS"` + MaxLLMRetries int `json:"max_llm_retries,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_MAX_LLM_RETRIES"` + LLMRetryBackoffSecs int `json:"llm_retry_backoff_secs,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_LLM_RETRY_BACKOFF_SECS"` } const DefaultMaxMediaSize = 20 * 1024 * 1024 // 20 MB From f0dc709b17de4e358e36327992346e5ee963427a Mon Sep 17 00:00:00 2001 From: David Siewert Date: Sun, 26 Apr 2026 07:07:19 +0600 Subject: [PATCH 7/8] fix(config): fix golines max-len for MaxLLMRetries field --- pkg/config/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index b8d92b57f..80a7a79a7 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -275,7 +275,7 @@ type AgentDefaults struct { SplitOnMarker bool `json:"split_on_marker" env:"PICOCLAW_AGENTS_DEFAULTS_SPLIT_ON_MARKER"` // split messages on <|[SPLIT]|> marker ContextManager string `json:"context_manager,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER"` ContextManagerConfig json.RawMessage `json:"context_manager_config,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER_CONFIG"` - MaxLLMRetries int `json:"max_llm_retries,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_MAX_LLM_RETRIES"` + MaxLLMRetries int `json:"max_llm_retries,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_MAX_LLM_RETRIES"` LLMRetryBackoffSecs int `json:"llm_retry_backoff_secs,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_LLM_RETRY_BACKOFF_SECS"` } From e656ddf5bbe8fa197c31ec310c019f14932585cb Mon Sep 17 00:00:00 2001 From: David Siewert Date: Mon, 27 Apr 2026 16:47:28 +0600 Subject: [PATCH 8/8] fix: align struct tag spacing in AgentDefaults config --- pkg/config/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/config/config.go b/pkg/config/config.go 
index 80a7a79a7..c488dff40 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -275,7 +275,7 @@ type AgentDefaults struct { SplitOnMarker bool `json:"split_on_marker" env:"PICOCLAW_AGENTS_DEFAULTS_SPLIT_ON_MARKER"` // split messages on <|[SPLIT]|> marker ContextManager string `json:"context_manager,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER"` ContextManagerConfig json.RawMessage `json:"context_manager_config,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_CONTEXT_MANAGER_CONFIG"` - MaxLLMRetries int `json:"max_llm_retries,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_MAX_LLM_RETRIES"` + MaxLLMRetries int `json:"max_llm_retries,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_MAX_LLM_RETRIES"` LLMRetryBackoffSecs int `json:"llm_retry_backoff_secs,omitempty" env:"PICOCLAW_AGENTS_DEFAULTS_LLM_RETRY_BACKOFF_SECS"` }