From f93d2b453325f8529805c75f83717dbdd318b4be Mon Sep 17 00:00:00 2001
From: linhaolin1 <linhaolin1@gmail.com>
Date: Thu, 19 Mar 2026 00:10:26 +0800
Subject: [PATCH] fix: Avoid failure of the main agent process due to tool call
 failures  (#1023)

* Avoid failure of the main agent process due to tool call failures or abnormal returns

* rename recover
---
 pkg/tools/registry.go      |  49 +++++++++--
 pkg/tools/registry_test.go | 173 +++++++++++++++++++++++++++++++++++++
 pkg/tools/shell.go         |  19 +++-
 pkg/tools/shell_test.go    |  63 ++++++++++++++
 4 files changed, 295 insertions(+), 9 deletions(-)

diff --git a/pkg/tools/registry.go b/pkg/tools/registry.go
index 0635f47d7..60effc292 100644
--- a/pkg/tools/registry.go
+++ b/pkg/tools/registry.go
@@ -188,15 +188,48 @@ func (r *ToolRegistry) ExecuteWithContext(
 	// The callback is a call parameter, not mutable state on the tool instance.
 	var result *ToolResult
 	start := time.Now()
-	if asyncExec, ok := tool.(AsyncExecutor); ok && asyncCallback != nil {
-		logger.DebugCF("tool", "Executing async tool via ExecuteAsync",
-			map[string]any{
-				"tool": name,
-			})
-		result = asyncExec.ExecuteAsync(ctx, args, asyncCallback)
-	} else {
-		result = tool.Execute(ctx, args)
+
+	// Use recover to catch any panics during tool execution
+	// This prevents tool crashes from killing the entire agent
+	func() {
+		defer func() {
+			if re := recover(); re != nil {
+				errMsg := fmt.Sprintf("Tool '%s' crashed with panic: %v", name, re)
+				logger.ErrorCF("tool", "Tool execution panic recovered",
+					map[string]any{
+						"tool":  name,
+						"panic": fmt.Sprintf("%v", re),
+					})
+				result = &ToolResult{
+					ForLLM:  errMsg,
+					ForUser: errMsg,
+					IsError: true,
+					Err:     fmt.Errorf("panic: %v", re),
+				}
+			}
+		}()
+
+		if asyncExec, ok := tool.(AsyncExecutor); ok && asyncCallback != nil {
+			logger.DebugCF("tool", "Executing async tool via ExecuteAsync",
+				map[string]any{
+					"tool": name,
+				})
+			result = asyncExec.ExecuteAsync(ctx, args, asyncCallback)
+		} else {
+			result = tool.Execute(ctx, args)
+		}
+	}()
+
+	// Handle nil result (should not happen, but defensive)
+	if result == nil {
+		result = &ToolResult{
+			ForLLM:  fmt.Sprintf("Tool '%s' returned nil result unexpectedly", name),
+			ForUser: fmt.Sprintf("Tool '%s' returned nil result unexpectedly", name),
+			IsError: true,
+			Err:     fmt.Errorf("nil result from tool"),
+		}
 	}
+
 	duration := time.Since(start)
 
 	// Log based on result type
diff --git a/pkg/tools/registry_test.go b/pkg/tools/registry_test.go
index 92d7d5abd..5fe681389 100644
--- a/pkg/tools/registry_test.go
+++ b/pkg/tools/registry_test.go
@@ -2,6 +2,7 @@ package tools
 
 import (
 	"context"
+	"errors"
 	"strings"
 	"sync"
 	"testing"
@@ -358,3 +359,175 @@ func TestToolRegistry_ConcurrentAccess(t *testing.T) {
 		t.Error("expected tools to be registered after concurrent access")
 	}
 }
+
+// --- Panic and abnormal exit tests ---
+
+// mockPanicTool is a tool that panics during execution
+type mockPanicTool struct {
+	name       string
+	panicValue any
+}
+
+func (m *mockPanicTool) Name() string               { return m.name }
+func (m *mockPanicTool) Description() string        { return "a tool that panics" }
+func (m *mockPanicTool) Parameters() map[string]any { return map[string]any{"type": "object"} }
+func (m *mockPanicTool) Execute(_ context.Context, _ map[string]any) *ToolResult {
+	panic(m.panicValue)
+}
+
+// mockNilResultTool is a tool that returns nil
+type mockNilResultTool struct {
+	name string
+}
+
+func (m *mockNilResultTool) Name() string               { return m.name }
+func (m *mockNilResultTool) Description() string        { return "a tool that returns nil" }
+func (m *mockNilResultTool) Parameters() map[string]any { return map[string]any{"type": "object"} }
+func (m *mockNilResultTool) Execute(_ context.Context, _ map[string]any) *ToolResult {
+	return nil
+}
+
+func TestToolRegistry_Execute_PanicRecovery(t *testing.T) {
+	r := NewToolRegistry()
+	r.Register(&mockPanicTool{
+		name:       "panic_tool",
+		panicValue: "something went terribly wrong",
+	})
+
+	// Should not panic, should return error result
+	result := r.Execute(context.Background(), "panic_tool", nil)
+
+	if result == nil {
+		t.Fatal("expected non-nil result after panic recovery")
+	}
+	if !result.IsError {
+		t.Error("expected IsError=true after panic")
+	}
+	if !strings.Contains(result.ForLLM, "panic") {
+		t.Errorf("expected 'panic' in error message, got %q", result.ForLLM)
+	}
+	if !strings.Contains(result.ForLLM, "panic_tool") {
+		t.Errorf("expected tool name in error message, got %q", result.ForLLM)
+	}
+	if !strings.Contains(result.ForLLM, "something went terribly wrong") {
+		t.Errorf("expected panic value in error message, got %q", result.ForLLM)
+	}
+	if result.Err == nil {
+		t.Error("expected Err to be set")
+	}
+}
+
+func TestToolRegistry_Execute_PanicRecovery_ErrorType(t *testing.T) {
+	r := NewToolRegistry()
+
+	// Test with error type panic
+	r.Register(&mockPanicTool{
+		name:       "error_panic_tool",
+		panicValue: errors.New("custom error panic"),
+	})
+
+	result := r.Execute(context.Background(), "error_panic_tool", nil)
+
+	if !result.IsError {
+		t.Error("expected IsError=true")
+	}
+	if !strings.Contains(result.ForLLM, "custom error panic") {
+		t.Errorf("expected error message in ForLLM, got %q", result.ForLLM)
+	}
+}
+
+func TestToolRegistry_Execute_PanicRecovery_IntType(t *testing.T) {
+	r := NewToolRegistry()
+
+	// Test with int type panic
+	r.Register(&mockPanicTool{
+		name:       "int_panic_tool",
+		panicValue: 42,
+	})
+
+	result := r.Execute(context.Background(), "int_panic_tool", nil)
+
+	if !result.IsError {
+		t.Error("expected IsError=true")
+	}
+	if !strings.Contains(result.ForLLM, "42") {
+		t.Errorf("expected panic value '42' in ForLLM, got %q", result.ForLLM)
+	}
+}
+
+func TestToolRegistry_Execute_NilResultHandling(t *testing.T) {
+	r := NewToolRegistry()
+	r.Register(&mockNilResultTool{name: "nil_tool"})
+
+	result := r.Execute(context.Background(), "nil_tool", nil)
+
+	if result == nil {
+		t.Fatal("expected non-nil result when tool returns nil")
+	}
+	if !result.IsError {
+		t.Error("expected IsError=true for nil result")
+	}
+	if !strings.Contains(result.ForLLM, "nil_tool") {
+		t.Errorf("expected tool name in error message, got %q", result.ForLLM)
+	}
+	if !strings.Contains(result.ForLLM, "nil result") {
+		t.Errorf("expected 'nil result' in error message, got %q", result.ForLLM)
+	}
+	if result.Err == nil {
+		t.Error("expected Err to be set")
+	}
+}
+
+func TestToolRegistry_ExecuteWithContext_PanicRecovery(t *testing.T) {
+	r := NewToolRegistry()
+	r.Register(&mockPanicTool{
+		name:       "ctx_panic_tool",
+		panicValue: "context panic test",
+	})
+
+	// Should not panic even with context
+	result := r.ExecuteWithContext(
+		context.Background(),
+		"ctx_panic_tool",
+		map[string]any{"key": "value"},
+		"telegram",
+		"chat-123",
+		nil,
+	)
+
+	if result == nil {
+		t.Fatal("expected non-nil result")
+	}
+	if !result.IsError {
+		t.Error("expected IsError=true")
+	}
+	if !strings.Contains(result.ForLLM, "context panic test") {
+		t.Errorf("expected panic message, got %q", result.ForLLM)
+	}
+}
+
+func TestToolRegistry_Execute_PanicDoesNotAffectOtherTools(t *testing.T) {
+	r := NewToolRegistry()
+	r.Register(&mockPanicTool{name: "bad_tool", panicValue: "boom"})
+	r.Register(&mockRegistryTool{
+		name:   "good_tool",
+		desc:   "works fine",
+		params: map[string]any{},
+		result: SilentResult("success"),
+	})
+
+	// First, trigger the panic
+	result1 := r.Execute(context.Background(), "bad_tool", nil)
+	if !result1.IsError {
+		t.Error("expected error from panic tool")
+	}
+
+	// Then, verify the good tool still works
+	result2 := r.Execute(context.Background(), "good_tool", nil)
+	if result2.IsError {
+		t.Errorf("expected success from good tool, got error: %s", result2.ForLLM)
+	}
+	if result2.ForLLM != "success" {
+		t.Errorf("expected 'success', got %q", result2.ForLLM)
+	}
+}
diff --git a/pkg/tools/shell.go b/pkg/tools/shell.go
index 0dc85ae21..78ad2b26d 100644
--- a/pkg/tools/shell.go
+++ b/pkg/tools/shell.go
@@ -311,13 +311,30 @@ func (t *ExecTool) Execute(ctx context.Context, args map[string]any) *ToolResult
 	if err != nil {
 		if errors.Is(cmdCtx.Err(), context.DeadlineExceeded) {
 			msg := fmt.Sprintf("Command timed out after %v", t.timeout)
+			if output != "" {
+				msg += "\n\nPartial output before timeout:\n" + output
+			}
 			return &ToolResult{
 				ForLLM:  msg,
 				ForUser: msg,
 				IsError: true,
+				Err:     fmt.Errorf("command timeout: %w", err),
 			}
 		}
-		output += fmt.Sprintf("\nExit code: %v", err)
+
+		// Extract detailed exit information
+		var exitErr *exec.ExitError
+		if errors.As(err, &exitErr) {
+			exitCode := exitErr.ExitCode()
+			output += fmt.Sprintf("\n\n[Command exited with code %d]", exitCode)
+
+			// Add signal information if killed by signal (Unix)
+			if exitCode == -1 {
+				output += " (killed by signal)"
+			}
+		} else {
+			output += fmt.Sprintf("\n\n[Command failed: %v]", err)
+		}
 	}
 
 	if output == "" {
diff --git a/pkg/tools/shell_test.go b/pkg/tools/shell_test.go
index c4553020f..f8f83ea74 100644
--- a/pkg/tools/shell_test.go
+++ b/pkg/tools/shell_test.go
@@ -489,6 +489,69 @@ func TestShellTool_SafePathsInWorkspaceRestriction(t *testing.T) {
 	}
 }
 
+// TestShellTool_ExitCodeDetails verifies that exit codes are captured with details
+func TestShellTool_ExitCodeDetails(t *testing.T) {
+	tool, err := NewExecTool("", false)
+	if err != nil {
+		t.Fatalf("unable to configure exec tool: %s", err)
+	}
+
+	ctx := context.Background()
+	args := map[string]any{
+		"command": "sh -c 'exit 42'",
+	}
+
+	result := tool.Execute(ctx, args)
+
+	if !result.IsError {
+		t.Error("expected error for non-zero exit code")
+	}
+
+	// Should contain the exit code in the message (new format: "exited with code 42")
+	if !strings.Contains(result.ForLLM, "42") {
+		t.Errorf("expected exit code 42 in error message, got: %s", result.ForLLM)
+	}
+
+	// Verify the new detailed message format
+	if !strings.Contains(result.ForLLM, "exited with code") {
+		t.Errorf("expected 'exited with code' in message, got: %s", result.ForLLM)
+	}
+
+	// Err field is set by the exec system (may or may not be set depending on implementation)
+	// The important thing is that IsError=true
+	t.Logf("Exit code result: %s", result.ForLLM)
+}
+
+// TestShellTool_TimeoutWithPartialOutput verifies timeout includes partial output
+func TestShellTool_TimeoutWithPartialOutput(t *testing.T) {
+	tool, err := NewExecTool("", false)
+	if err != nil {
+		t.Fatalf("unable to configure exec tool: %s", err)
+	}
+
+	tool.SetTimeout(1 * time.Second) // Give more time for echo to complete
+
+	ctx := context.Background()
+	// Use a command that outputs immediately then sleeps
+	args := map[string]any{
+		"command": "echo 'partial output before timeout' && sleep 30",
+	}
+
+	result := tool.Execute(ctx, args)
+
+	if !result.IsError {
+		t.Error("expected error for timeout")
+	}
+
+	// Should mention timeout
+	if !strings.Contains(result.ForLLM, "timed out") {
+		t.Errorf("expected 'timed out' in message, got: %s", result.ForLLM)
+	}
+
+	// Log the result for debugging (partial output depends on shell behavior)
+	t.Logf("Timeout result: %s", result.ForLLM)
+}
+
 // TestShellTool_CustomAllowPatterns verifies that custom allow patterns exempt
 // commands from deny pattern checks.
 func TestShellTool_CustomAllowPatterns(t *testing.T) {