refactor(bus): fix deadlock and concurrency issues in MessageBus

PublishInbound/PublishOutbound held RLock during blocking channel sends,
deadlocking against Close() which needs a write lock when the buffer is
full. ConsumeInbound/SubscribeOutbound used bare receives instead of
comma-ok, causing zero-value processing or busy loops after close.

Replace sync.RWMutex+bool with atomic.Bool+done channel so Publish
methods use a lock-free 3-way select (send / done / ctx.Done). Add
context.Context parameter to both Publish methods so callers can cancel
or timeout blocked sends. Close() now only sets the atomic flag and
closes the done channel—never closes the data channels—eliminating
send-on-closed-channel panics.

- Remove dead code: RegisterHandler, GetHandler, handlers map,
  MessageHandler type (zero callers across the whole repo)
- Add ErrBusClosed sentinel error
- Update all 10 caller sites to pass context
- Add msgBus.Close() to gateway and agent shutdown flows
- Add pkg/bus/bus_test.go with 11 test cases covering basic round-trip,
  context cancellation, closed-bus behavior, concurrent publish+close,
  full-buffer timeout, and idempotent Close
This commit is contained in:
Hoshina
2026-02-23 00:44:45 +08:00
parent a32d98534c
commit 24e2ed79c0
11 changed files with 284 additions and 55 deletions
+6 -6
View File
@@ -120,7 +120,7 @@ func registerSharedTools(
// Message tool
messageTool := tools.NewMessageTool()
messageTool.SetSendCallback(func(channel, chatID, content string) error {
msgBus.PublishOutbound(bus.OutboundMessage{
msgBus.PublishOutbound(context.TODO(), bus.OutboundMessage{
Channel: channel,
ChatID: chatID,
Content: content,
@@ -202,7 +202,7 @@ func (al *AgentLoop) Run(ctx context.Context) error {
}
if !alreadySent {
al.bus.PublishOutbound(bus.OutboundMessage{
al.bus.PublishOutbound(ctx, bus.OutboundMessage{
Channel: msg.Channel,
ChatID: msg.ChatID,
Content: response,
@@ -471,7 +471,7 @@ func (al *AgentLoop) runAgentLoop(ctx context.Context, agent *AgentInstance, opt
// 8. Optional: send response via bus
if opts.SendResponse {
al.bus.PublishOutbound(bus.OutboundMessage{
al.bus.PublishOutbound(ctx, bus.OutboundMessage{
Channel: opts.Channel,
ChatID: opts.ChatID,
Content: finalContent,
@@ -586,7 +586,7 @@ func (al *AgentLoop) runLLMIteration(
})
if retry == 0 && !constants.IsInternalChannel(opts.Channel) {
al.bus.PublishOutbound(bus.OutboundMessage{
al.bus.PublishOutbound(ctx, bus.OutboundMessage{
Channel: opts.Channel,
ChatID: opts.ChatID,
Content: "Context window exceeded. Compressing history and retrying...",
@@ -715,7 +715,7 @@ func (al *AgentLoop) runLLMIteration(
// Send ForUser content to user immediately if not Silent
if !toolResult.Silent && toolResult.ForUser != "" && opts.SendResponse {
al.bus.PublishOutbound(bus.OutboundMessage{
al.bus.PublishOutbound(ctx, bus.OutboundMessage{
Channel: opts.Channel,
ChatID: opts.ChatID,
Content: toolResult.ForUser,
@@ -780,7 +780,7 @@ func (al *AgentLoop) maybeSummarize(agent *AgentInstance, sessionKey, channel, c
go func() {
defer al.summarizing.Delete(summarizeKey)
if !constants.IsInternalChannel(channel) {
al.bus.PublishOutbound(bus.OutboundMessage{
al.bus.PublishOutbound(context.TODO(), bus.OutboundMessage{
Channel: channel,
ChatID: chatID,
Content: "Memory threshold reached. Optimizing conversation history...",