mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
feat(channel): echo voice audio transcription
This commit is contained in:
@@ -419,6 +419,9 @@
|
||||
"enabled": false,
|
||||
"monitor_usb": true
|
||||
},
|
||||
"voice": {
|
||||
"echo_transcription": false
|
||||
},
|
||||
"gateway": {
|
||||
"host": "127.0.0.1",
|
||||
"port": 18790
|
||||
|
||||
@@ -438,6 +438,8 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
|
||||
transcriptions = append(transcriptions, result.Text)
|
||||
}
|
||||
|
||||
al.sendTranscriptionFeedback(msg.Channel, msg.ChatID, msg.MessageID, transcriptions)
|
||||
|
||||
if len(transcriptions) == 0 {
|
||||
return msg
|
||||
}
|
||||
@@ -462,6 +464,37 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
|
||||
return msg
|
||||
}
|
||||
|
||||
// sendTranscriptionFeedback Asynchronously sends feedback to the user
|
||||
// with the result of audio transcription if the option is enabled.
|
||||
func (al *AgentLoop) sendTranscriptionFeedback(channel, chatID string, messageID string, validTexts []string) {
|
||||
if !al.cfg.Voice.EchoTranscription {
|
||||
return
|
||||
}
|
||||
|
||||
go func() {
|
||||
pubCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var feedbackMsg string
|
||||
if len(validTexts) > 0 {
|
||||
feedbackMsg = "Transcript: " + strings.Join(validTexts, "\n")
|
||||
} else {
|
||||
feedbackMsg = "No voice detected in the audio"
|
||||
}
|
||||
|
||||
err := al.bus.PublishOutbound(pubCtx, bus.OutboundMessage{
|
||||
Channel: channel,
|
||||
ChatID: chatID,
|
||||
Content: feedbackMsg,
|
||||
ReplyToMessageID: messageID,
|
||||
SkipPlaceholder: true, // It serves to avoid consuming the message "Thinking..."
|
||||
})
|
||||
if err != nil {
|
||||
logger.WarnCF("voice", "Failed to send transcription feedback", map[string]any{"error": err.Error()})
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// inferMediaType determines the media type ("image", "audio", "video", "file")
|
||||
// from a filename and MIME content type.
|
||||
func inferMediaType(filename, contentType string) string {
|
||||
@@ -764,6 +797,15 @@ func (al *AgentLoop) runAgentLoop(
|
||||
// 2. Save user message to session
|
||||
agent.Sessions.AddMessage(opts.SessionKey, "user", opts.UserMessage)
|
||||
|
||||
// thinking message only for channels, not for background tasks
|
||||
if opts.Channel != "" && opts.ChatID != "" && !constants.IsInternalChannel(opts.Channel) && !opts.NoHistory {
|
||||
al.bus.PublishOutbound(ctx, bus.OutboundMessage{
|
||||
Channel: opts.Channel,
|
||||
ChatID: opts.ChatID,
|
||||
TriggerPlaceholder: true,
|
||||
})
|
||||
}
|
||||
|
||||
// 3. Run LLM iteration loop
|
||||
finalContent, iteration, err := al.runLLMIteration(ctx, agent, messages, opts)
|
||||
if err != nil {
|
||||
|
||||
+6
-3
@@ -30,9 +30,12 @@ type InboundMessage struct {
|
||||
}
|
||||
|
||||
// OutboundMessage is a message published on the bus for delivery to a chat
// on a channel.
type OutboundMessage struct {
	// Channel names the destination channel, ChatID identifies the chat
	// within it, and Content is the text to deliver.
	Channel string `json:"channel"`
	ChatID  string `json:"chat_id"`
	Content string `json:"content"`

	// ReplyToMessageID, when non-empty, asks the channel to send this message
	// as a reply to the given message. Channels that cannot interpret the ID
	// may ignore it.
	ReplyToMessageID string `json:"reply_to_message_id,omitempty"`

	// SkipPlaceholder tells the Manager not to edit the pending "Thinking..."
	// placeholder with this message, leaving the placeholder in place for a
	// later message.
	SkipPlaceholder bool `json:"skip_placeholder,omitempty"`

	// TriggerPlaceholder marks a control message: instead of delivering
	// Content, the Manager asks a placeholder-capable channel to send its
	// placeholder for ChatID and records it.
	TriggerPlaceholder bool `json:"trigger_placeholder,omitempty"`
}
|
||||
|
||||
// MediaPart describes a single media attachment to send.
|
||||
|
||||
@@ -284,12 +284,6 @@ func (c *BaseChannel) HandleMessage(
|
||||
c.placeholderRecorder.RecordReactionUndo(c.name, chatID, undo)
|
||||
}
|
||||
}
|
||||
// Placeholder — independent pipeline
|
||||
if pc, ok := c.owner.(PlaceholderCapable); ok {
|
||||
if phID, err := pc.SendPlaceholder(ctx, chatID); err == nil && phID != "" {
|
||||
c.placeholderRecorder.RecordPlaceholder(c.name, chatID, phID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := c.bus.PublishInbound(ctx, msg); err != nil {
|
||||
|
||||
+17
-6
@@ -133,13 +133,15 @@ func (m *Manager) preSend(ctx context.Context, name string, msg bus.OutboundMess
|
||||
}
|
||||
|
||||
// 3. Try editing placeholder
|
||||
if v, loaded := m.placeholders.LoadAndDelete(key); loaded {
|
||||
if entry, ok := v.(placeholderEntry); ok && entry.id != "" {
|
||||
if editor, ok := ch.(MessageEditor); ok {
|
||||
if err := editor.EditMessage(ctx, msg.ChatID, entry.id, msg.Content); err == nil {
|
||||
return true // edited successfully, skip Send
|
||||
if !msg.SkipPlaceholder {
|
||||
if v, loaded := m.placeholders.LoadAndDelete(key); loaded {
|
||||
if entry, ok := v.(placeholderEntry); ok && entry.id != "" {
|
||||
if editor, ok := ch.(MessageEditor); ok {
|
||||
if err := editor.EditMessage(ctx, msg.ChatID, entry.id, msg.Content); err == nil {
|
||||
return true // edited successfully, skip Send
|
||||
}
|
||||
// edit failed → fall through to normal Send
|
||||
}
|
||||
// edit failed → fall through to normal Send
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -493,6 +495,15 @@ func (m *Manager) sendWithRetry(ctx context.Context, name string, w *channelWork
|
||||
return
|
||||
}
|
||||
|
||||
if msg.TriggerPlaceholder {
|
||||
if pc, ok := w.ch.(PlaceholderCapable); ok {
|
||||
if phID, err := pc.SendPlaceholder(ctx, msg.ChatID); err == nil && phID != "" {
|
||||
m.RecordPlaceholder(name, msg.ChatID, phID)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Pre-send: stop typing and try to edit placeholder
|
||||
if m.preSend(ctx, name, msg, w.ch) {
|
||||
return // placeholder was edited successfully, skip Send
|
||||
|
||||
@@ -179,6 +179,14 @@ func (c *TelegramChannel) Send(ctx context.Context, msg bus.OutboundMessage) err
|
||||
tgMsg := tu.Message(tu.ID(chatID), htmlContent)
|
||||
tgMsg.ParseMode = telego.ModeHTML
|
||||
|
||||
if msg.ReplyToMessageID != "" {
|
||||
if mid, err := strconv.Atoi(msg.ReplyToMessageID); err == nil {
|
||||
tgMsg.ReplyParameters = &telego.ReplyParameters{
|
||||
MessageID: mid,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if _, err = c.bot.SendMessage(ctx, tgMsg); err != nil {
|
||||
logger.ErrorCF("telegram", "HTML parse failed, falling back to plain text", map[string]any{
|
||||
"error": err.Error(),
|
||||
|
||||
@@ -58,6 +58,7 @@ type Config struct {
|
||||
Tools ToolsConfig `json:"tools"`
|
||||
Heartbeat HeartbeatConfig `json:"heartbeat"`
|
||||
Devices DevicesConfig `json:"devices"`
|
||||
Voice VoiceConfig `json:"voice"`
|
||||
}
|
||||
|
||||
// MarshalJSON implements custom JSON marshaling for Config
|
||||
@@ -424,6 +425,10 @@ type DevicesConfig struct {
|
||||
MonitorUSB bool `json:"monitor_usb" env:"PICOCLAW_DEVICES_MONITOR_USB"`
|
||||
}
|
||||
|
||||
// VoiceConfig holds settings for handling voice (audio) messages.
type VoiceConfig struct {
	// EchoTranscription, when true, makes the agent send the transcription of
	// a received audio message back to the chat. It is false in DefaultConfig
	// and can be set through the "voice.echo_transcription" JSON key or the
	// PICOCLAW_VOICE_ECHO_TRANSCRIPTION environment variable.
	EchoTranscription bool `json:"echo_transcription" env:"PICOCLAW_VOICE_ECHO_TRANSCRIPTION"`
}
|
||||
|
||||
type ProvidersConfig struct {
|
||||
Anthropic ProviderConfig `json:"anthropic"`
|
||||
OpenAI OpenAIProviderConfig `json:"openai"`
|
||||
|
||||
@@ -461,5 +461,8 @@ func DefaultConfig() *Config {
|
||||
Enabled: false,
|
||||
MonitorUSB: true,
|
||||
},
|
||||
Voice: VoiceConfig{
|
||||
EchoTranscription: false,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user