fix(agent): resolve subturn deadlocks, panics and context retry state

This commit addresses several critical concurrency and state management bugs within the SubTurn execution and delivery logic.

1. Fix Goroutine Leak & Deadlock in deliverSubTurnResult:
   - Replaced the non-blocking select with a blocking select that either sends the result on `resultChan` or receives from the new `parentTS.Finished()` channel, whichever becomes ready first.
   - This ensures results are not arbitrarily dropped when the channel is full (preventing orphaned valid results), while also guaranteeing the child goroutine safely unblocks and exits if the parent finishes execution early.

2. Prevent "Send on Closed Channel" Fatal Panics:
   - Removed `close(pendingResults)` and `drainPendingResults` from `turnState.Finish()`.
   - The pendingResults channel is now naturally garbage collected, completely eliminating the race condition panic when a child attempts delivery at the exact moment the parent finishes.
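   - Added a deferred function that calls `recover()` inside deliverSubTurnResult as a failsafe: if the send ever panics, it logs a warning and (when the result is non-nil) emits a SubTurnOrphanResultEvent instead of crashing the process.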

3. Fix Truncation Recovery Prompt Drop:
   - Fixed the runTurn truncation retry logic by introducing an explicit `promptAlreadyAdded` boolean.
   - Ensures that the `recoveryPrompt` is added to the LLM message history on the iteration that follows a truncation, while retries that reuse an already-added prompt skip it, so the history keeps valid API message roles and never contains the same prompt twice.

4. Test Suite Stabilization:
   - Fixed TestDeliverSubTurnResultNoDeadlock to wait deterministically for each delivery instead of racing against timeouts.
   - Replaced defunct closed-channel tests with TestFinishedChannelClosedState matching the new Finished() mechanism.
   - Fixed the Finish(true) parameter in TestGrandchildAbort_CascadingCancellation to correctly validate Context cascade behavior.
   - All tests now pass cleanly without hanging or emitting false positives.
This commit is contained in:
Administrator
2026-03-18 13:10:36 +08:00
parent c7ea018a73
commit e20ff43f8b
3 changed files with 94 additions and 190 deletions
+32 -7
View File
@@ -344,7 +344,24 @@ func spawnSubTurn(ctx context.Context, al *AgentLoop, parentTS *turnState, cfg S
// - SubTurnResultDeliveredEvent: successful delivery to channel
// - SubTurnOrphanResultEvent: delivery failed (parent finished or channel full)
func deliverSubTurnResult(parentTS *turnState, childID string, result *tools.ToolResult) {
// Check parent state under lock, but don't hold lock while sending to channel
// Let GC clean up the pendingResults channel; parent Finish will no longer close it.
// We use defer/recover to catch any unlikely channel panics if it were ever closed.
defer func() {
if r := recover(); r != nil {
logger.WarnCF("subturn", "recovered panic sending to pendingResults", map[string]any{
"parent_id": parentTS.turnID,
"child_id": childID,
"recover": r,
})
if result != nil {
MockEventBus.Emit(SubTurnOrphanResultEvent{
ParentID: parentTS.turnID,
ChildID: childID,
Result: result,
})
}
}
}()
parentTS.mu.Lock()
isFinished := parentTS.isFinished
resultChan := parentTS.pendingResults
@@ -363,8 +380,9 @@ func deliverSubTurnResult(parentTS *turnState, childID string, result *tools.Too
}
// Parent Turn is still running → attempt to deliver result
// Note: There's still a small race window between the isFinished check above and the send below,
// but this is acceptable - worst case the result becomes an orphan, which is handled gracefully.
// We use a select statement with parentTS.Finished() to ensure that if the
// parent turn finishes while we are waiting to send the result (e.g. channel
// is full), we don't leak this goroutine by blocking forever.
select {
case resultChan <- result:
// Successfully delivered
@@ -373,9 +391,10 @@ func deliverSubTurnResult(parentTS *turnState, childID string, result *tools.Too
ChildID: childID,
Result: result,
})
default:
// Channel is full - treat as orphan result
logger.WarnCF("subturn", "pendingResults channel full", map[string]any{
case <-parentTS.Finished():
// Parent finished while we were waiting to deliver.
// The result cannot be delivered to the LLM, so it becomes an orphan.
logger.WarnCF("subturn", "parent finished before result could be delivered", map[string]any{
"parent_id": parentTS.turnID,
"child_id": childID,
})
@@ -474,6 +493,7 @@ func runTurn(ctx context.Context, al *AgentLoop, ts *turnState, cfg SubTurnConfi
truncationRetryCount := 0
contextRetryCount := 0
currentPrompt := cfg.SystemPrompt
promptAlreadyAdded := false
for {
// Soft context limit: check and truncate before LLM call
@@ -512,9 +532,13 @@ func runTurn(ctx context.Context, al *AgentLoop, ts *turnState, cfg SubTurnConfi
DefaultResponse: "",
EnableSummary: false,
SendResponse: false,
SkipAddUserMessage: contextRetryCount > 0,
SkipAddUserMessage: promptAlreadyAdded,
})
// Mark the prompt as added so subsequent truncation retries
// won't duplicate it in the history.
promptAlreadyAdded = true
// 1. Handle context length errors
if err != nil && isContextLengthError(err) {
if contextRetryCount >= maxContextRetries {
@@ -562,6 +586,7 @@ func runTurn(ctx context.Context, al *AgentLoop, ts *turnState, cfg SubTurnConfi
// Inject recovery prompt - it will be added by runAgentLoop on next iteration
recoveryPrompt := "Your previous response was truncated due to length. Please provide a shorter, complete response that finishes your thought."
currentPrompt = recoveryPrompt
promptAlreadyAdded = false // We need this new recovery prompt to be added
truncationRetryCount++
continue // Retry with recovery prompt