feat(channel): echo voice audio transcription

This commit is contained in:
afjcjsbx
2026-03-07 15:49:33 +01:00
parent 440d665baa
commit 0c117a073f
8 changed files with 84 additions and 15 deletions
+3
View File
@@ -419,6 +419,9 @@
"enabled": false,
"monitor_usb": true
},
"voice": {
"echo_transcription": false
},
"gateway": {
"host": "127.0.0.1",
"port": 18790
+42
View File
@@ -438,6 +438,8 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
transcriptions = append(transcriptions, result.Text)
}
al.sendTranscriptionFeedback(msg.Channel, msg.ChatID, msg.MessageID, transcriptions)
if len(transcriptions) == 0 {
return msg
}
@@ -462,6 +464,37 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
return msg
}
// sendTranscriptionFeedback asynchronously publishes the outcome of an audio
// transcription to the given channel/chat as a reply to messageID.
//
// It is a no-op unless al.cfg.Voice.EchoTranscription is enabled. When
// validTexts is empty the published text is "No voice detected in the audio";
// otherwise the texts are joined with newlines and prefixed with
// "Transcript: ". Publishing runs on its own goroutine with a 5-second
// timeout, and a publish failure is only logged as a warning.
func (al *AgentLoop) sendTranscriptionFeedback(channel, chatID string, messageID string, validTexts []string) {
	if !al.cfg.Voice.EchoTranscription {
		return
	}

	// Build the text before spawning the goroutine so the closure captures
	// only immutable strings and never reads the caller's slice after this
	// function has returned.
	feedbackMsg := "No voice detected in the audio"
	if len(validTexts) > 0 {
		feedbackMsg = "Transcript: " + strings.Join(validTexts, "\n")
	}

	go func() {
		// Fresh background context: the publish is bounded only by this timeout.
		pubCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()

		err := al.bus.PublishOutbound(pubCtx, bus.OutboundMessage{
			Channel:          channel,
			ChatID:           chatID,
			Content:          feedbackMsg,
			ReplyToMessageID: messageID,
			SkipPlaceholder:  true, // do not consume the pending "Thinking..." placeholder
		})
		if err != nil {
			logger.WarnCF("voice", "Failed to send transcription feedback", map[string]any{"error": err.Error()})
		}
	}()
}
// inferMediaType determines the media type ("image", "audio", "video", "file")
// from a filename and MIME content type.
func inferMediaType(filename, contentType string) string {
@@ -764,6 +797,15 @@ func (al *AgentLoop) runAgentLoop(
// 2. Save user message to session
agent.Sessions.AddMessage(opts.SessionKey, "user", opts.UserMessage)
// thinking message only for channels, not for background tasks
if opts.Channel != "" && opts.ChatID != "" && !constants.IsInternalChannel(opts.Channel) && !opts.NoHistory {
al.bus.PublishOutbound(ctx, bus.OutboundMessage{
Channel: opts.Channel,
ChatID: opts.ChatID,
TriggerPlaceholder: true,
})
}
// 3. Run LLM iteration loop
finalContent, iteration, err := al.runLLMIteration(ctx, agent, messages, opts)
if err != nil {
+6 -3
View File
@@ -30,9 +30,12 @@ type InboundMessage struct {
}
// OutboundMessage is a message published on the bus for delivery to a channel.
type OutboundMessage struct {
Channel string `json:"channel"`
ChatID string `json:"chat_id"`
Content string `json:"content"`
Channel string `json:"channel"`
ChatID string `json:"chat_id"`
Content string `json:"content"`
// ReplyToMessageID, when non-empty, identifies the message this one replies to.
// The Telegram channel parses it as an integer and sets it as the reply target.
ReplyToMessageID string `json:"reply_to_message_id,omitempty"`
// SkipPlaceholder, when true, makes Manager.preSend skip the attempt to edit
// a recorded placeholder with this message's content.
SkipPlaceholder bool `json:"skip_placeholder,omitempty"` // Tells Manager not to use Thinking
// TriggerPlaceholder, when true, makes Manager.sendWithRetry send and record a
// placeholder (if the channel is PlaceholderCapable) and return without
// sending Content.
TriggerPlaceholder bool `json:"trigger_placeholder,omitempty"`
}
// MediaPart describes a single media attachment to send.
-6
View File
@@ -284,12 +284,6 @@ func (c *BaseChannel) HandleMessage(
c.placeholderRecorder.RecordReactionUndo(c.name, chatID, undo)
}
}
// Placeholder — independent pipeline
if pc, ok := c.owner.(PlaceholderCapable); ok {
if phID, err := pc.SendPlaceholder(ctx, chatID); err == nil && phID != "" {
c.placeholderRecorder.RecordPlaceholder(c.name, chatID, phID)
}
}
}
if err := c.bus.PublishInbound(ctx, msg); err != nil {
+17 -6
View File
@@ -133,13 +133,15 @@ func (m *Manager) preSend(ctx context.Context, name string, msg bus.OutboundMess
}
// 3. Try editing placeholder
if v, loaded := m.placeholders.LoadAndDelete(key); loaded {
if entry, ok := v.(placeholderEntry); ok && entry.id != "" {
if editor, ok := ch.(MessageEditor); ok {
if err := editor.EditMessage(ctx, msg.ChatID, entry.id, msg.Content); err == nil {
return true // edited successfully, skip Send
if !msg.SkipPlaceholder {
if v, loaded := m.placeholders.LoadAndDelete(key); loaded {
if entry, ok := v.(placeholderEntry); ok && entry.id != "" {
if editor, ok := ch.(MessageEditor); ok {
if err := editor.EditMessage(ctx, msg.ChatID, entry.id, msg.Content); err == nil {
return true // edited successfully, skip Send
}
// edit failed → fall through to normal Send
}
// edit failed → fall through to normal Send
}
}
}
@@ -493,6 +495,15 @@ func (m *Manager) sendWithRetry(ctx context.Context, name string, w *channelWork
return
}
if msg.TriggerPlaceholder {
if pc, ok := w.ch.(PlaceholderCapable); ok {
if phID, err := pc.SendPlaceholder(ctx, msg.ChatID); err == nil && phID != "" {
m.RecordPlaceholder(name, msg.ChatID, phID)
}
}
return
}
// Pre-send: stop typing and try to edit placeholder
if m.preSend(ctx, name, msg, w.ch) {
return // placeholder was edited successfully, skip Send
+8
View File
@@ -179,6 +179,14 @@ func (c *TelegramChannel) Send(ctx context.Context, msg bus.OutboundMessage) err
tgMsg := tu.Message(tu.ID(chatID), htmlContent)
tgMsg.ParseMode = telego.ModeHTML
if msg.ReplyToMessageID != "" {
if mid, err := strconv.Atoi(msg.ReplyToMessageID); err == nil {
tgMsg.ReplyParameters = &telego.ReplyParameters{
MessageID: mid,
}
}
}
if _, err = c.bot.SendMessage(ctx, tgMsg); err != nil {
logger.ErrorCF("telegram", "HTML parse failed, falling back to plain text", map[string]any{
"error": err.Error(),
+5
View File
@@ -58,6 +58,7 @@ type Config struct {
Tools ToolsConfig `json:"tools"`
Heartbeat HeartbeatConfig `json:"heartbeat"`
Devices DevicesConfig `json:"devices"`
Voice VoiceConfig `json:"voice"`
}
// MarshalJSON implements custom JSON marshaling for Config
@@ -424,6 +425,10 @@ type DevicesConfig struct {
MonitorUSB bool `json:"monitor_usb" env:"PICOCLAW_DEVICES_MONITOR_USB"`
}
// VoiceConfig holds settings for voice/audio message handling.
type VoiceConfig struct {
// EchoTranscription, when true, makes the agent loop publish the audio
// transcription result (or a "no voice detected" notice) back to the chat.
// Configurable via JSON key "echo_transcription" or the
// PICOCLAW_VOICE_ECHO_TRANSCRIPTION environment variable.
EchoTranscription bool `json:"echo_transcription" env:"PICOCLAW_VOICE_ECHO_TRANSCRIPTION"`
}
type ProvidersConfig struct {
Anthropic ProviderConfig `json:"anthropic"`
OpenAI OpenAIProviderConfig `json:"openai"`
+3
View File
@@ -461,5 +461,8 @@ func DefaultConfig() *Config {
Enabled: false,
MonitorUSB: true,
},
Voice: VoiceConfig{
EchoTranscription: false,
},
}
}