Merge pull request #947 from dim/fix/transcription

Fix voice transcription
This commit is contained in:
Mauro
2026-03-04 18:37:24 +01:00
committed by GitHub
10 changed files with 261 additions and 10 deletions
+63
View File
@@ -12,6 +12,7 @@ import (
"errors"
"fmt"
"path/filepath"
"regexp"
"strings"
"sync"
"sync/atomic"
@@ -31,6 +32,7 @@ import (
"github.com/sipeed/picoclaw/pkg/state"
"github.com/sipeed/picoclaw/pkg/tools"
"github.com/sipeed/picoclaw/pkg/utils"
"github.com/sipeed/picoclaw/pkg/voice"
)
type AgentLoop struct {
@@ -43,6 +45,7 @@ type AgentLoop struct {
fallback *providers.FallbackChain
channelManager *channels.Manager
mediaStore media.MediaStore
transcriber voice.Transcriber
}
// processOptions configures how a message is processed
@@ -339,6 +342,64 @@ func (al *AgentLoop) SetMediaStore(s media.MediaStore) {
al.mediaStore = s
}
// SetTranscriber injects a voice transcriber for agent-level audio transcription.
func (al *AgentLoop) SetTranscriber(t voice.Transcriber) {
al.transcriber = t
}
var audioAnnotationRe = regexp.MustCompile(`\[(voice|audio)(?::[^\]]*)?\]`)
// transcribeAudioInMessage resolves audio media refs, transcribes them, and
// replaces audio annotations in msg.Content with the transcribed text.
func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.InboundMessage) bus.InboundMessage {
if al.transcriber == nil || al.mediaStore == nil || len(msg.Media) == 0 {
return msg
}
// Transcribe each audio media ref in order.
var transcriptions []string
for _, ref := range msg.Media {
path, meta, err := al.mediaStore.ResolveWithMeta(ref)
if err != nil {
logger.WarnCF("voice", "Failed to resolve media ref", map[string]any{"ref": ref, "error": err})
continue
}
if !utils.IsAudioFile(meta.Filename, meta.ContentType) {
continue
}
result, err := al.transcriber.Transcribe(ctx, path)
if err != nil {
logger.WarnCF("voice", "Transcription failed", map[string]any{"ref": ref, "error": err})
transcriptions = append(transcriptions, "")
continue
}
transcriptions = append(transcriptions, result.Text)
}
if len(transcriptions) == 0 {
return msg
}
// Replace audio annotations sequentially with transcriptions.
idx := 0
newContent := audioAnnotationRe.ReplaceAllStringFunc(msg.Content, func(match string) string {
if idx >= len(transcriptions) {
return match
}
text := transcriptions[idx]
idx++
return "[voice: " + text + "]"
})
// Append any remaining transcriptions not matched by an annotation.
for ; idx < len(transcriptions); idx++ {
newContent += "\n[voice: " + transcriptions[idx] + "]"
}
msg.Content = newContent
return msg
}
// inferMediaType determines the media type ("image", "audio", "video", "file")
// from a filename and MIME content type.
func inferMediaType(filename, contentType string) string {
@@ -450,6 +511,8 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
},
)
msg = al.transcribeAudioInMessage(ctx, msg)
// Route system messages to processSystemMessage
if msg.Channel == "system" {
return al.processSystemMessage(ctx, msg)