Refactor/asr tts (#1939)

* refactor: update ASR and TTS implementations

* fix lint

* Integrating asr/tts models w/ new security config

* update documents

* add arbitrary whisper transcriptor support

* update documents

* fix lint

* add mimo tts
This commit is contained in:
Hua Audio
2026-04-01 06:21:21 +02:00
committed by GitHub
parent ff90a65814
commit 0f395ce110
48 changed files with 3527 additions and 358 deletions
+53 -17
View File
@@ -18,6 +18,8 @@ import (
"sync/atomic"
"time"
"github.com/sipeed/picoclaw/pkg/audio/asr"
"github.com/sipeed/picoclaw/pkg/audio/tts"
"github.com/sipeed/picoclaw/pkg/bus"
"github.com/sipeed/picoclaw/pkg/channels"
"github.com/sipeed/picoclaw/pkg/commands"
@@ -31,7 +33,6 @@ import (
"github.com/sipeed/picoclaw/pkg/state"
"github.com/sipeed/picoclaw/pkg/tools"
"github.com/sipeed/picoclaw/pkg/utils"
"github.com/sipeed/picoclaw/pkg/voice"
)
type AgentLoop struct {
@@ -51,7 +52,7 @@ type AgentLoop struct {
fallback *providers.FallbackChain
channelManager *channels.Manager
mediaStore media.MediaStore
transcriber voice.Transcriber
transcriber asr.Transcriber
cmdRegistry *commands.Registry
mcp mcpRuntime
hookRuntime hookRuntime
@@ -159,6 +160,13 @@ func registerSharedTools(
provider providers.LLMProvider,
) {
allowReadPaths := buildAllowReadPatterns(cfg)
var ttsProvider tts.TTSProvider
if cfg.Tools.IsToolEnabled("send_tts") {
ttsProvider = tts.DetectTTS(cfg)
if ttsProvider == nil {
logger.WarnCF("voice-tts", "send_tts enabled but no TTS provider configured", nil)
}
}
for _, agentID := range registry.ListAgentIDs() {
agent, ok := registry.GetAgent(agentID)
@@ -269,6 +277,10 @@ func registerSharedTools(
agent.Tools.Register(sendFileTool)
}
if ttsProvider != nil {
agent.Tools.Register(tools.NewSendTTSTool(ttsProvider, nil))
}
// Skill discovery and installation tools
skills_enabled := cfg.Tools.IsToolEnabled("skills")
find_skills_enable := cfg.Tools.IsToolEnabled("find_skills")
@@ -1059,10 +1071,15 @@ func (al *AgentLoop) SetMediaStore(s media.MediaStore) {
agent.Tools.SetMediaStore(s)
}
}
registry.ForEachTool("send_tts", func(t tools.Tool) {
if st, ok := t.(*tools.SendTTSTool); ok {
st.SetMediaStore(s)
}
})
}
// SetTranscriber injects a voice transcriber for agent-level audio transcription.
func (al *AgentLoop) SetTranscriber(t voice.Transcriber) {
func (al *AgentLoop) SetTranscriber(t asr.Transcriber) {
al.transcriber = t
}
@@ -1083,19 +1100,23 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
// Transcribe each audio media ref in order.
var transcriptions []string
var keptMedia []string
for _, ref := range msg.Media {
path, meta, err := al.mediaStore.ResolveWithMeta(ref)
if err != nil {
logger.WarnCF("voice", "Failed to resolve media ref", map[string]any{"ref": ref, "error": err})
keptMedia = append(keptMedia, ref)
continue
}
if !utils.IsAudioFile(meta.Filename, meta.ContentType) {
keptMedia = append(keptMedia, ref)
continue
}
result, err := al.transcriber.Transcribe(ctx, path)
if err != nil {
logger.WarnCF("voice", "Transcription failed", map[string]any{"ref": ref, "error": err})
transcriptions = append(transcriptions, "")
keptMedia = append(keptMedia, ref)
continue
}
transcriptions = append(transcriptions, result.Text)
@@ -1115,15 +1136,21 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
}
text := transcriptions[idx]
idx++
if text == "" {
return match
}
return "[voice: " + text + "]"
})
// Append any remaining transcriptions not matched by an annotation.
for ; idx < len(transcriptions); idx++ {
newContent += "\n[voice: " + transcriptions[idx] + "]"
if transcriptions[idx] != "" {
newContent += "\n[voice: " + transcriptions[idx] + "]"
}
}
msg.Content = newContent
msg.Media = keptMedia
return msg, true
}
@@ -2464,6 +2491,28 @@ turnLoop:
if toolResult == nil {
toolResult = tools.ErrorResult("hook returned nil tool result")
}
// Send ForUser if not silent and has content.
// For ResponseHandled tools, send regardless of SendResponse setting,
// since they've already handled the response (e.g., send_tts, send_file).
shouldSendForUser := !toolResult.Silent && toolResult.ForUser != "" &&
(ts.opts.SendResponse || toolResult.ResponseHandled)
if shouldSendForUser {
al.bus.PublishOutbound(ctx, bus.OutboundMessage{
Channel: ts.channel,
ChatID: ts.chatID,
Content: toolResult.ForUser,
Metadata: map[string]string{
"is_tool_call": "true",
},
})
logger.DebugCF("agent", "Sent tool result to user",
map[string]any{
"tool": toolName,
"content_len": len(toolResult.ForUser),
})
}
if len(toolResult.Media) > 0 && toolResult.ResponseHandled {
parts := make([]bus.MediaPart, 0, len(toolResult.Media))
for _, ref := range toolResult.Media {
@@ -2509,19 +2558,6 @@ turnLoop:
allResponsesHandled = false
}
if !toolResult.Silent && toolResult.ForUser != "" && ts.opts.SendResponse {
al.bus.PublishOutbound(ctx, bus.OutboundMessage{
Channel: ts.channel,
ChatID: ts.chatID,
Content: toolResult.ForUser,
})
logger.DebugCF("agent", "Sent tool result to user",
map[string]any{
"tool": toolName,
"content_len": len(toolResult.ForUser),
})
}
contentForLLM := toolResult.ContentForLLM()
// Filter sensitive data (API keys, tokens, secrets) before sending to LLM