mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
Refactor/asr tts (#1939)
* refactor: update ASR and TTS implementations
* fix lint
* integrate ASR/TTS models with the new security config
* update documents
* add arbitrary Whisper transcriber support
* update documents
* fix lint
* add mimo tts
This commit is contained in:
+53
-17
@@ -18,6 +18,8 @@ import (
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/sipeed/picoclaw/pkg/audio/asr"
|
||||
"github.com/sipeed/picoclaw/pkg/audio/tts"
|
||||
"github.com/sipeed/picoclaw/pkg/bus"
|
||||
"github.com/sipeed/picoclaw/pkg/channels"
|
||||
"github.com/sipeed/picoclaw/pkg/commands"
|
||||
@@ -31,7 +33,6 @@ import (
|
||||
"github.com/sipeed/picoclaw/pkg/state"
|
||||
"github.com/sipeed/picoclaw/pkg/tools"
|
||||
"github.com/sipeed/picoclaw/pkg/utils"
|
||||
"github.com/sipeed/picoclaw/pkg/voice"
|
||||
)
|
||||
|
||||
type AgentLoop struct {
|
||||
@@ -51,7 +52,7 @@ type AgentLoop struct {
|
||||
fallback *providers.FallbackChain
|
||||
channelManager *channels.Manager
|
||||
mediaStore media.MediaStore
|
||||
transcriber voice.Transcriber
|
||||
transcriber asr.Transcriber
|
||||
cmdRegistry *commands.Registry
|
||||
mcp mcpRuntime
|
||||
hookRuntime hookRuntime
|
||||
@@ -159,6 +160,13 @@ func registerSharedTools(
|
||||
provider providers.LLMProvider,
|
||||
) {
|
||||
allowReadPaths := buildAllowReadPatterns(cfg)
|
||||
var ttsProvider tts.TTSProvider
|
||||
if cfg.Tools.IsToolEnabled("send_tts") {
|
||||
ttsProvider = tts.DetectTTS(cfg)
|
||||
if ttsProvider == nil {
|
||||
logger.WarnCF("voice-tts", "send_tts enabled but no TTS provider configured", nil)
|
||||
}
|
||||
}
|
||||
|
||||
for _, agentID := range registry.ListAgentIDs() {
|
||||
agent, ok := registry.GetAgent(agentID)
|
||||
@@ -269,6 +277,10 @@ func registerSharedTools(
|
||||
agent.Tools.Register(sendFileTool)
|
||||
}
|
||||
|
||||
if ttsProvider != nil {
|
||||
agent.Tools.Register(tools.NewSendTTSTool(ttsProvider, nil))
|
||||
}
|
||||
|
||||
// Skill discovery and installation tools
|
||||
skills_enabled := cfg.Tools.IsToolEnabled("skills")
|
||||
find_skills_enable := cfg.Tools.IsToolEnabled("find_skills")
|
||||
@@ -1059,10 +1071,15 @@ func (al *AgentLoop) SetMediaStore(s media.MediaStore) {
|
||||
agent.Tools.SetMediaStore(s)
|
||||
}
|
||||
}
|
||||
registry.ForEachTool("send_tts", func(t tools.Tool) {
|
||||
if st, ok := t.(*tools.SendTTSTool); ok {
|
||||
st.SetMediaStore(s)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// SetTranscriber injects a voice transcriber for agent-level audio transcription.
|
||||
func (al *AgentLoop) SetTranscriber(t voice.Transcriber) {
|
||||
func (al *AgentLoop) SetTranscriber(t asr.Transcriber) {
|
||||
al.transcriber = t
|
||||
}
|
||||
|
||||
@@ -1083,19 +1100,23 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
|
||||
|
||||
// Transcribe each audio media ref in order.
|
||||
var transcriptions []string
|
||||
var keptMedia []string
|
||||
for _, ref := range msg.Media {
|
||||
path, meta, err := al.mediaStore.ResolveWithMeta(ref)
|
||||
if err != nil {
|
||||
logger.WarnCF("voice", "Failed to resolve media ref", map[string]any{"ref": ref, "error": err})
|
||||
keptMedia = append(keptMedia, ref)
|
||||
continue
|
||||
}
|
||||
if !utils.IsAudioFile(meta.Filename, meta.ContentType) {
|
||||
keptMedia = append(keptMedia, ref)
|
||||
continue
|
||||
}
|
||||
result, err := al.transcriber.Transcribe(ctx, path)
|
||||
if err != nil {
|
||||
logger.WarnCF("voice", "Transcription failed", map[string]any{"ref": ref, "error": err})
|
||||
transcriptions = append(transcriptions, "")
|
||||
keptMedia = append(keptMedia, ref)
|
||||
continue
|
||||
}
|
||||
transcriptions = append(transcriptions, result.Text)
|
||||
@@ -1115,15 +1136,21 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
|
||||
}
|
||||
text := transcriptions[idx]
|
||||
idx++
|
||||
if text == "" {
|
||||
return match
|
||||
}
|
||||
return "[voice: " + text + "]"
|
||||
})
|
||||
|
||||
// Append any remaining transcriptions not matched by an annotation.
|
||||
for ; idx < len(transcriptions); idx++ {
|
||||
newContent += "\n[voice: " + transcriptions[idx] + "]"
|
||||
if transcriptions[idx] != "" {
|
||||
newContent += "\n[voice: " + transcriptions[idx] + "]"
|
||||
}
|
||||
}
|
||||
|
||||
msg.Content = newContent
|
||||
msg.Media = keptMedia
|
||||
return msg, true
|
||||
}
|
||||
|
||||
@@ -2464,6 +2491,28 @@ turnLoop:
|
||||
if toolResult == nil {
|
||||
toolResult = tools.ErrorResult("hook returned nil tool result")
|
||||
}
|
||||
|
||||
// Send ForUser if not silent and has content.
|
||||
// For ResponseHandled tools, send regardless of SendResponse setting,
|
||||
// since they've already handled the response (e.g., send_tts, send_file).
|
||||
shouldSendForUser := !toolResult.Silent && toolResult.ForUser != "" &&
|
||||
(ts.opts.SendResponse || toolResult.ResponseHandled)
|
||||
if shouldSendForUser {
|
||||
al.bus.PublishOutbound(ctx, bus.OutboundMessage{
|
||||
Channel: ts.channel,
|
||||
ChatID: ts.chatID,
|
||||
Content: toolResult.ForUser,
|
||||
Metadata: map[string]string{
|
||||
"is_tool_call": "true",
|
||||
},
|
||||
})
|
||||
logger.DebugCF("agent", "Sent tool result to user",
|
||||
map[string]any{
|
||||
"tool": toolName,
|
||||
"content_len": len(toolResult.ForUser),
|
||||
})
|
||||
}
|
||||
|
||||
if len(toolResult.Media) > 0 && toolResult.ResponseHandled {
|
||||
parts := make([]bus.MediaPart, 0, len(toolResult.Media))
|
||||
for _, ref := range toolResult.Media {
|
||||
@@ -2509,19 +2558,6 @@ turnLoop:
|
||||
allResponsesHandled = false
|
||||
}
|
||||
|
||||
if !toolResult.Silent && toolResult.ForUser != "" && ts.opts.SendResponse {
|
||||
al.bus.PublishOutbound(ctx, bus.OutboundMessage{
|
||||
Channel: ts.channel,
|
||||
ChatID: ts.chatID,
|
||||
Content: toolResult.ForUser,
|
||||
})
|
||||
logger.DebugCF("agent", "Sent tool result to user",
|
||||
map[string]any{
|
||||
"tool": toolName,
|
||||
"content_len": len(toolResult.ForUser),
|
||||
})
|
||||
}
|
||||
|
||||
contentForLLM := toolResult.ContentForLLM()
|
||||
|
||||
// Filter sensitive data (API keys, tokens, secrets) before sending to LLM
|
||||
|
||||
Reference in New Issue
Block a user