From f93d2b453325f8529805c75f83717dbdd318b4be Mon Sep 17 00:00:00 2001 From: linhaolin1 Date: Thu, 19 Mar 2026 00:10:26 +0800 Subject: [PATCH] fix: Avoid failure of the main agent process due to tool call failures (#1023) * Avoid failure of the main agent process due to tool call failures or abnormal returns * rename recover --- pkg/tools/registry.go | 49 +++++++++-- pkg/tools/registry_test.go | 173 +++++++++++++++++++++++++++++++++++++ pkg/tools/shell.go | 19 +++- pkg/tools/shell_test.go | 63 ++++++++++++++ 4 files changed, 295 insertions(+), 9 deletions(-) diff --git a/pkg/tools/registry.go b/pkg/tools/registry.go index 0635f47d7..60effc292 100644 --- a/pkg/tools/registry.go +++ b/pkg/tools/registry.go @@ -188,15 +188,48 @@ func (r *ToolRegistry) ExecuteWithContext( // The callback is a call parameter, not mutable state on the tool instance. var result *ToolResult start := time.Now() - if asyncExec, ok := tool.(AsyncExecutor); ok && asyncCallback != nil { - logger.DebugCF("tool", "Executing async tool via ExecuteAsync", - map[string]any{ - "tool": name, - }) - result = asyncExec.ExecuteAsync(ctx, args, asyncCallback) - } else { - result = tool.Execute(ctx, args) + + // Use recover to catch any panics during tool execution + // This prevents tool crashes from killing the entire agent + func() { + defer func() { + if re := recover(); re != nil { + errMsg := fmt.Sprintf("Tool '%s' crashed with panic: %v", name, re) + logger.ErrorCF("tool", "Tool execution panic recovered", + map[string]any{ + "tool": name, + "panic": fmt.Sprintf("%v", re), + }) + result = &ToolResult{ + ForLLM: errMsg, + ForUser: errMsg, + IsError: true, + Err: fmt.Errorf("panic: %v", re), + } + } + }() + + if asyncExec, ok := tool.(AsyncExecutor); ok && asyncCallback != nil { + logger.DebugCF("tool", "Executing async tool via ExecuteAsync", + map[string]any{ + "tool": name, + }) + result = asyncExec.ExecuteAsync(ctx, args, asyncCallback) + } else { + result = tool.Execute(ctx, args) + } + }() + + // Handle nil result (should not happen, but defensive) + if result == nil { + result = &ToolResult{ + ForLLM: fmt.Sprintf("Tool '%s' returned nil result unexpectedly", name), + ForUser: fmt.Sprintf("Tool '%s' returned nil result unexpectedly", name), + IsError: true, + Err: fmt.Errorf("nil result from tool"), + } } + duration := time.Since(start) // Log based on result type diff --git a/pkg/tools/registry_test.go b/pkg/tools/registry_test.go index 92d7d5abd..5fe681389 100644 --- a/pkg/tools/registry_test.go +++ b/pkg/tools/registry_test.go @@ -2,6 +2,7 @@ package tools import ( "context" + "errors" "strings" "sync" "testing" @@ -358,3 +359,175 @@ func TestToolRegistry_ConcurrentAccess(t *testing.T) { t.Error("expected tools to be registered after concurrent access") } } + +// --- Panic and abnormal exit tests --- + +// mockPanicTool is a tool that panics during execution +type mockPanicTool struct { + name string + panicValue any +} + +func (m *mockPanicTool) Name() string { return m.name } +func (m *mockPanicTool) Description() string { return "a tool that panics" } +func (m *mockPanicTool) Parameters() map[string]any { return map[string]any{"type": "object"} } +func (m *mockPanicTool) Execute(_ context.Context, _ map[string]any) *ToolResult { + panic(m.panicValue) +} + +// mockNilResultTool is a tool that returns nil +type mockNilResultTool struct { + name string +} + +func (m *mockNilResultTool) Name() string { return m.name } +func (m *mockNilResultTool) Description() string { return "a tool that returns nil" } +func (m *mockNilResultTool) Parameters() map[string]any { return map[string]any{"type": "object"} } +func (m *mockNilResultTool) Execute(_ context.Context, _ map[string]any) *ToolResult { + return nil +} + +func TestToolRegistry_Execute_PanicRecovery(t *testing.T) { + r := NewToolRegistry() + r.Register(&mockPanicTool{ + name: "panic_tool", + panicValue: "something went terribly wrong", + }) + + // Should not panic, should return error result + result := r.Execute(context.Background(), "panic_tool", nil) + + if result == nil { + t.Fatal("expected non-nil result after panic recovery") + } + if !result.IsError { + t.Error("expected IsError=true after panic") + } + if !strings.Contains(result.ForLLM, "panic") { + t.Errorf("expected 'panic' in error message, got %q", result.ForLLM) + } + if !strings.Contains(result.ForLLM, "panic_tool") { + t.Errorf("expected tool name in error message, got %q", result.ForLLM) + } + if !strings.Contains(result.ForLLM, "something went terribly wrong") { + t.Errorf("expected panic value in error message, got %q", result.ForLLM) + } + if result.Err == nil { + t.Error("expected Err to be set") + } +} + +func TestToolRegistry_Execute_PanicRecovery_ErrorType(t *testing.T) { + r := NewToolRegistry() + + // Test with error type panic + r.Register(&mockPanicTool{ + name: "error_panic_tool", + panicValue: errors.New("custom error panic"), + }) + + result := r.Execute(context.Background(), "error_panic_tool", nil) + + if !result.IsError { + t.Error("expected IsError=true") + } + if !strings.Contains(result.ForLLM, "custom error panic") { + t.Errorf("expected error message in ForLLM, got %q", result.ForLLM) + } +} + +func TestToolRegistry_Execute_PanicRecovery_IntType(t *testing.T) { + r := NewToolRegistry() + + // Test with int type panic + r.Register(&mockPanicTool{ + name: "int_panic_tool", + panicValue: 42, + }) + + result := r.Execute(context.Background(), "int_panic_tool", nil) + + if !result.IsError { + t.Error("expected IsError=true") + } + if !strings.Contains(result.ForLLM, "42") { + t.Errorf("expected panic value '42' in ForLLM, got %q", result.ForLLM) + } +} + +func TestToolRegistry_Execute_NilResultHandling(t *testing.T) { + r := NewToolRegistry() + r.Register(&mockNilResultTool{name: "nil_tool"}) + + result := r.Execute(context.Background(), "nil_tool", nil) + + if result == nil { + t.Fatal("expected non-nil result when tool returns nil") + } + if !result.IsError { + t.Error("expected IsError=true for nil result") + } + if !strings.Contains(result.ForLLM, "nil_tool") { + t.Errorf("expected tool name in error message, got %q", result.ForLLM) + } + if !strings.Contains(result.ForLLM, "nil result") { + t.Errorf("expected 'nil result' in error message, got %q", result.ForLLM) + } + if result.Err == nil { + t.Error("expected Err to be set") + } +} + +func TestToolRegistry_ExecuteWithContext_PanicRecovery(t *testing.T) { + r := NewToolRegistry() + r.Register(&mockPanicTool{ + name: "ctx_panic_tool", + panicValue: "context panic test", + }) + + // Should not panic even with context + result := r.ExecuteWithContext( + context.Background(), + "ctx_panic_tool", + map[string]any{"key": "value"}, + "telegram", + "chat-123", + nil, + ) + + if result == nil { + t.Fatal("expected non-nil result") + } + if !result.IsError { + t.Error("expected IsError=true") + } + if !strings.Contains(result.ForLLM, "context panic test") { + t.Errorf("expected panic message, got %q", result.ForLLM) + } +} + +func TestToolRegistry_Execute_PanicDoesNotAffectOtherTools(t *testing.T) { + r := NewToolRegistry() + r.Register(&mockPanicTool{name: "bad_tool", panicValue: "boom"}) + r.Register(&mockRegistryTool{ + name: "good_tool", + desc: "works fine", + params: map[string]any{}, + result: SilentResult("success"), + }) + + // First, trigger the panic + result1 := r.Execute(context.Background(), "bad_tool", nil) + if !result1.IsError { + t.Error("expected error from panic tool") + } + + // Then, verify the good tool still works + result2 := r.Execute(context.Background(), "good_tool", nil) + if result2.IsError { + t.Errorf("expected success from good tool, got error: %s", result2.ForLLM) + } + if result2.ForLLM != "success" { + t.Errorf("expected 'success', got %q", result2.ForLLM) + } +} diff --git a/pkg/tools/shell.go b/pkg/tools/shell.go index 0dc85ae21..78ad2b26d 100644 --- a/pkg/tools/shell.go +++ b/pkg/tools/shell.go @@ -311,13 +311,30 @@ func (t *ExecTool) Execute(ctx context.Context, args map[string]any) *ToolResult if err != nil { if errors.Is(cmdCtx.Err(), context.DeadlineExceeded) { msg := fmt.Sprintf("Command timed out after %v", t.timeout) + if output != "" { + msg += "\n\nPartial output before timeout:\n" + output + } return &ToolResult{ ForLLM: msg, ForUser: msg, IsError: true, + Err: fmt.Errorf("command timeout: %w", err), } } - output += fmt.Sprintf("\nExit code: %v", err) + + // Extract detailed exit information + var exitErr *exec.ExitError + if errors.As(err, &exitErr) { + exitCode := exitErr.ExitCode() + output += fmt.Sprintf("\n\n[Command exited with code %d]", exitCode) + + // Add signal information if killed by signal (Unix) + if exitCode == -1 { + output += " (killed by signal)" + } + } else { + output += fmt.Sprintf("\n\n[Command failed: %v]", err) + } } if output == "" { diff --git a/pkg/tools/shell_test.go b/pkg/tools/shell_test.go index c4553020f..f8f83ea74 100644 --- a/pkg/tools/shell_test.go +++ b/pkg/tools/shell_test.go @@ -489,6 +489,69 @@ func TestShellTool_SafePathsInWorkspaceRestriction(t *testing.T) { } } +// TestShellTool_ExitCodeDetails verifies that exit codes are captured with details +func TestShellTool_ExitCodeDetails(t *testing.T) { + tool, err := NewExecTool("", false) + if err != nil { + t.Fatalf("unable to configure exec tool: %s", err) + } + + ctx := context.Background() + args := map[string]any{ + "command": "sh -c 'exit 42'", + } + + result := tool.Execute(ctx, args) + + if !result.IsError { + t.Error("expected error for non-zero exit code") + } + + // Should contain the exit code in the message (new format: "exited with code 42") + if !strings.Contains(result.ForLLM, "42") { + t.Errorf("expected exit code 42 in error message, got: %s", result.ForLLM) + } + + // Verify the new detailed message format + if !strings.Contains(result.ForLLM, "exited with code") { + t.Errorf("expected 'exited with code' in message, got: %s", result.ForLLM) + } + + // Err field is set by the exec system (may or may not be set depending on implementation) + // The important thing is that IsError=true + t.Logf("Exit code result: %s", result.ForLLM) +} + +// TestShellTool_TimeoutWithPartialOutput verifies timeout includes partial output +func TestShellTool_TimeoutWithPartialOutput(t *testing.T) { + tool, err := NewExecTool("", false) + if err != nil { + t.Fatalf("unable to configure exec tool: %s", err) + } + + tool.SetTimeout(1 * time.Second) // Give more time for echo to complete + + ctx := context.Background() + // Use a command that outputs immediately then sleeps + args := map[string]any{ + "command": "echo 'partial output before timeout' && sleep 30", + } + + result := tool.Execute(ctx, args) + + if !result.IsError { + t.Error("expected error for timeout") + } + + // Should mention timeout + if !strings.Contains(result.ForLLM, "timed out") { + t.Errorf("expected 'timed out' in message, got: %s", result.ForLLM) + } + + // Log the result for debugging (partial output depends on shell behavior) + t.Logf("Timeout result: %s", result.ForLLM) +} + // TestShellTool_CustomAllowPatterns verifies that custom allow patterns exempt // commands from deny pattern checks. func TestShellTool_CustomAllowPatterns(t *testing.T) {