diff --git a/config/config.example.json b/config/config.example.json
index 2f643d41b..21b65a479 100644
--- a/config/config.example.json
+++ b/config/config.example.json
@@ -419,6 +419,9 @@
     "enabled": false,
     "monitor_usb": true
   },
+  "voice": {
+    "echo_transcription": false
+  },
   "gateway": {
     "host": "127.0.0.1",
     "port": 18790
diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go
index 19d13b2bb..e91be71bc 100644
--- a/pkg/agent/loop.go
+++ b/pkg/agent/loop.go
@@ -438,6 +438,8 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
 		transcriptions = append(transcriptions, result.Text)
 	}
 
+	al.sendTranscriptionFeedback(msg.Channel, msg.ChatID, msg.MessageID, transcriptions)
+
 	if len(transcriptions) == 0 {
 		return msg
 	}
@@ -462,6 +464,37 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
 	return msg
 }
 
+// sendTranscriptionFeedback asynchronously publishes the audio transcription result
+// as a reply to the given message; it does nothing unless Voice.EchoTranscription is set.
+func (al *AgentLoop) sendTranscriptionFeedback(channel, chatID string, messageID string, validTexts []string) {
+	if !al.cfg.Voice.EchoTranscription {
+		return
+	}
+
+	go func() { // fire-and-forget: a publish failure is only logged
+		pubCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) // bounded independently of the caller's context
+		defer cancel()
+
+		var feedbackMsg string
+		if len(validTexts) > 0 {
+			feedbackMsg = "Transcript: " + strings.Join(validTexts, "\n")
+		} else {
+			feedbackMsg = "No voice detected in the audio"
+		}
+
+		err := al.bus.PublishOutbound(pubCtx, bus.OutboundMessage{
+			Channel:          channel,
+			ChatID:           chatID,
+			Content:          feedbackMsg,
+			ReplyToMessageID: messageID,
+			SkipPlaceholder:  true, // keep this message from consuming the pending "Thinking..." placeholder
+		})
+		if err != nil {
+			logger.WarnCF("voice", "Failed to send transcription feedback", map[string]any{"error": err.Error()})
+		}
+	}()
+}
+
 // inferMediaType determines the media type ("image", "audio", "video", "file")
 // from a filename and MIME content type.
 func inferMediaType(filename, contentType string) string {
@@ -764,6 +797,18 @@ func (al *AgentLoop) runAgentLoop(
 	// 2. Save user message to session
 	agent.Sessions.AddMessage(opts.SessionKey, "user", opts.UserMessage)
 
+	// Trigger the "Thinking..." placeholder only for real channels, not for background tasks
+	if opts.Channel != "" && opts.ChatID != "" && !constants.IsInternalChannel(opts.Channel) && !opts.NoHistory {
+		if err := al.bus.PublishOutbound(ctx, bus.OutboundMessage{
+			Channel:            opts.Channel,
+			ChatID:             opts.ChatID,
+			TriggerPlaceholder: true,
+		}); err != nil {
+			// Best-effort: a missing placeholder must not abort the run, but the failure should be visible.
+			logger.WarnCF("agent", "Failed to trigger placeholder", map[string]any{"error": err.Error()})
+		}
+	}
+
 	// 3. Run LLM iteration loop
 	finalContent, iteration, err := al.runLLMIteration(ctx, agent, messages, opts)
 	if err != nil {
diff --git a/pkg/bus/types.go b/pkg/bus/types.go
index 7ad8f0417..7b7335327 100644
--- a/pkg/bus/types.go
+++ b/pkg/bus/types.go
@@ -30,9 +30,12 @@ type InboundMessage struct {
 }
 
 type OutboundMessage struct {
-	Channel string `json:"channel"`
-	ChatID  string `json:"chat_id"`
-	Content string `json:"content"`
+	Channel            string `json:"channel"`
+	ChatID             string `json:"chat_id"`
+	Content            string `json:"content"`
+	ReplyToMessageID   string `json:"reply_to_message_id,omitempty"`
+	SkipPlaceholder    bool   `json:"skip_placeholder,omitempty"`    // Tells Manager not to edit the pending placeholder with this message
+	TriggerPlaceholder bool   `json:"trigger_placeholder,omitempty"` // Tells Manager to send a placeholder instead of delivering content
 }
 
 // MediaPart describes a single media attachment to send.
diff --git a/pkg/channels/base.go b/pkg/channels/base.go
index 063a66523..334dc9254 100644
--- a/pkg/channels/base.go
+++ b/pkg/channels/base.go
@@ -284,12 +284,6 @@ func (c *BaseChannel) HandleMessage(
 				c.placeholderRecorder.RecordReactionUndo(c.name, chatID, undo)
 			}
 		}
-		// Placeholder — independent pipeline
-		if pc, ok := c.owner.(PlaceholderCapable); ok {
-			if phID, err := pc.SendPlaceholder(ctx, chatID); err == nil && phID != "" {
-				c.placeholderRecorder.RecordPlaceholder(c.name, chatID, phID)
-			}
-		}
 	}
 
 	if err := c.bus.PublishInbound(ctx, msg); err != nil {
diff --git a/pkg/channels/manager.go b/pkg/channels/manager.go
index fdd6d0c1f..84def6393 100644
--- a/pkg/channels/manager.go
+++ b/pkg/channels/manager.go
@@ -133,13 +133,15 @@ func (m *Manager) preSend(ctx context.Context, name string, msg bus.OutboundMess
 	}
 
 	// 3. Try editing placeholder
-	if v, loaded := m.placeholders.LoadAndDelete(key); loaded {
-		if entry, ok := v.(placeholderEntry); ok && entry.id != "" {
-			if editor, ok := ch.(MessageEditor); ok {
-				if err := editor.EditMessage(ctx, msg.ChatID, entry.id, msg.Content); err == nil {
-					return true // edited successfully, skip Send
+	if !msg.SkipPlaceholder { // SkipPlaceholder messages leave the recorded placeholder in place
+		if v, loaded := m.placeholders.LoadAndDelete(key); loaded {
+			if entry, ok := v.(placeholderEntry); ok && entry.id != "" {
+				if editor, ok := ch.(MessageEditor); ok {
+					if err := editor.EditMessage(ctx, msg.ChatID, entry.id, msg.Content); err == nil {
+						return true // edited successfully, skip Send
+					}
+					// edit failed → fall through to normal Send
 				}
-				// edit failed → fall through to normal Send
 			}
 		}
 	}
@@ -493,6 +495,15 @@ func (m *Manager) sendWithRetry(ctx context.Context, name string, w *channelWork
 		return
 	}
 
+	if msg.TriggerPlaceholder { // control message: send and record a placeholder
+		if pc, ok := w.ch.(PlaceholderCapable); ok {
+			if phID, err := pc.SendPlaceholder(ctx, msg.ChatID); err == nil && phID != "" {
+				m.RecordPlaceholder(name, msg.ChatID, phID)
+			}
+		}
+		return // skip preSend and the regular Send for trigger messages
+	}
+
 	// Pre-send: stop typing and try to edit placeholder
 	if m.preSend(ctx, name, msg, w.ch) {
 		return // placeholder was edited successfully, skip Send
diff --git a/pkg/channels/telegram/telegram.go b/pkg/channels/telegram/telegram.go
index a2035853c..1d8757293 100644
--- a/pkg/channels/telegram/telegram.go
+++ b/pkg/channels/telegram/telegram.go
@@ -179,6 +179,14 @@ func (c *TelegramChannel) Send(ctx context.Context, msg bus.OutboundMessage) err
 	tgMsg := tu.Message(tu.ID(chatID), htmlContent)
 	tgMsg.ParseMode = telego.ModeHTML
 
+	if msg.ReplyToMessageID != "" {
+		if mid, err := strconv.Atoi(msg.ReplyToMessageID); err == nil { // a non-numeric ID is ignored and no reply target is set
+			tgMsg.ReplyParameters = &telego.ReplyParameters{
+				MessageID: mid,
+			}
+		}
+	}
+
 	if _, err = c.bot.SendMessage(ctx, tgMsg); err != nil {
 		logger.ErrorCF("telegram", "HTML parse failed, falling back to plain text", map[string]any{
 			"error": err.Error(),
diff --git a/pkg/config/config.go b/pkg/config/config.go
index 72af3e2fb..8d834229c 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -58,6 +58,7 @@ type Config struct {
 	Tools     ToolsConfig     `json:"tools"`
 	Heartbeat HeartbeatConfig `json:"heartbeat"`
 	Devices   DevicesConfig   `json:"devices"`
+	Voice     VoiceConfig     `json:"voice"`
 }
 
 // MarshalJSON implements custom JSON marshaling for Config
@@ -424,6 +425,10 @@ type DevicesConfig struct {
 	MonitorUSB bool `json:"monitor_usb" env:"PICOCLAW_DEVICES_MONITOR_USB"`
 }
 
+type VoiceConfig struct {
+	EchoTranscription bool `json:"echo_transcription" env:"PICOCLAW_VOICE_ECHO_TRANSCRIPTION"` // when true, the agent loop publishes the transcript back to the chat
+}
+
 type ProvidersConfig struct {
 	Anthropic ProviderConfig       `json:"anthropic"`
 	OpenAI    OpenAIProviderConfig `json:"openai"`
diff --git a/pkg/config/defaults.go b/pkg/config/defaults.go
index 1902480c5..7b690137a 100644
--- a/pkg/config/defaults.go
+++ b/pkg/config/defaults.go
@@ -461,5 +461,8 @@ func DefaultConfig() *Config {
 			Enabled:    false,
 			MonitorUSB: true,
 		},
+		Voice: VoiceConfig{
+			EchoTranscription: false,
+		},
 	}
 }