From b1386ad71fbe43142b96deca018d73294df06fcc Mon Sep 17 00:00:00 2001 From: Dimitrij Denissenko Date: Sun, 1 Mar 2026 08:31:04 +0000 Subject: [PATCH] Fix voice transcription --- README.fr.md | 2 +- README.ja.md | 2 +- README.md | 2 +- README.pt-br.md | 2 +- README.vi.md | 2 +- README.zh.md | 2 +- cmd/picoclaw/internal/gateway/helpers.go | 18 +++++ pkg/agent/loop.go | 64 ++++++++++++++++ pkg/voice/transcriber.go | 5 ++ pkg/voice/transcriber_test.go | 97 ++++++++++++++++++++++++ 10 files changed, 190 insertions(+), 6 deletions(-) create mode 100644 pkg/voice/transcriber_test.go diff --git a/README.fr.md b/README.fr.md index c452b71ac..87eaca0e8 100644 --- a/README.fr.md +++ b/README.fr.md @@ -772,7 +772,7 @@ Le sous-agent a accès aux outils (message, web_search, etc.) et peut communique ### Fournisseurs > [!NOTE] -> Groq fournit la transcription vocale gratuite via Whisper. Si configuré, les messages vocaux Telegram seront automatiquement transcrits. +> Groq fournit la transcription vocale gratuite via Whisper. Si configuré, les messages audio de n'importe quel canal seront automatiquement transcrits au niveau de l'agent. | Fournisseur | Utilisation | Obtenir une Clé API | | ------------------------ | ---------------------------------------- | ------------------------------------------------------ | diff --git a/README.ja.md b/README.ja.md index 6d5d09451..bb8d33fae 100644 --- a/README.ja.md +++ b/README.ja.md @@ -728,7 +728,7 @@ HEARTBEAT_OK 応答 ユーザーが直接結果を受け取る ### プロバイダー > [!NOTE] -> Groq は Whisper による無料の音声文字起こしを提供しています。設定すると、Telegram の音声メッセージが自動的に文字起こしされます。 +> Groq は Whisper による無料の音声文字起こしを提供しています。設定すると、あらゆるチャンネルからの音声メッセージがエージェントレベルで自動的に文字起こしされます。 | プロバイダー | 用途 | API キー取得先 | | --- | --- | --- | diff --git a/README.md b/README.md index b040d0605..5b39204c7 100644 --- a/README.md +++ b/README.md @@ -818,7 +818,7 @@ The subagent has access to tools (message, web_search, etc.) and can communicate ### Providers > [!NOTE] -> Groq provides free voice transcription via Whisper. If configured, Telegram voice messages will be automatically transcribed. +> Groq provides free voice transcription via Whisper. If configured, audio messages from any channel will be automatically transcribed at the agent level. | Provider | Purpose | Get API Key | | -------------------------- | --------------------------------------- | -------------------------------------------------------------------- | diff --git a/README.pt-br.md b/README.pt-br.md index 61663e363..6752124d0 100644 --- a/README.pt-br.md +++ b/README.pt-br.md @@ -766,7 +766,7 @@ O subagente tem acesso às ferramentas (message, web_search, etc.) e pode se com ### Provedores > [!NOTE] -> O Groq fornece transcrição de voz gratuita via Whisper. Se configurado, mensagens de voz do Telegram serão automaticamente transcritas. +> O Groq fornece transcrição de voz gratuita via Whisper. Se configurado, mensagens de áudio de qualquer canal serão automaticamente transcritas no nível do agente. | Provedor | Finalidade | Obter API Key | | --- | --- | --- | diff --git a/README.vi.md b/README.vi.md index f8ece7eda..161a96dd7 100644 --- a/README.vi.md +++ b/README.vi.md @@ -740,7 +740,7 @@ Subagent có quyền truy cập các công cụ (message, web_search, v.v.) và ### Nhà cung cấp (Providers) > [!NOTE] -> Groq cung cấp dịch vụ chuyển giọng nói thành văn bản miễn phí qua Whisper. Nếu đã cấu hình Groq, tin nhắn thoại trên Telegram sẽ được tự động chuyển thành văn bản. +> Groq cung cấp dịch vụ chuyển giọng nói thành văn bản miễn phí qua Whisper. Nếu đã cấu hình Groq, tin nhắn âm thanh từ bất kỳ kênh nào sẽ được tự động chuyển thành văn bản ở cấp độ agent. | Nhà cung cấp | Mục đích | Lấy API Key | | --- | --- | --- | diff --git a/README.zh.md b/README.zh.md index 7c9351cb4..f39526250 100644 --- a/README.zh.md +++ b/README.zh.md @@ -418,7 +418,7 @@ Agent 读取 HEARTBEAT.md ### 提供商 (Providers) > [!NOTE] -> Groq 通过 Whisper 提供免费的语音转录。如果配置了 Groq,Telegram 语音消息将被自动转录为文字。 +> Groq 通过 Whisper 提供免费的语音转录。如果配置了 Groq,任意渠道的音频消息都将在 Agent 层面自动转录为文字。 | 提供商 | 用途 | 获取 API Key | | -------------------- | ---------------------------- | -------------------------------------------------------------------- | diff --git a/cmd/picoclaw/internal/gateway/helpers.go b/cmd/picoclaw/internal/gateway/helpers.go index 747f7d44e..c4a6f59fe 100644 --- a/cmd/picoclaw/internal/gateway/helpers.go +++ b/cmd/picoclaw/internal/gateway/helpers.go @@ -7,6 +7,7 @@ import ( "os" "os/signal" "path/filepath" + "strings" "time" "github.com/sipeed/picoclaw/cmd/picoclaw/internal" @@ -36,6 +37,7 @@ import ( "github.com/sipeed/picoclaw/pkg/providers" "github.com/sipeed/picoclaw/pkg/state" "github.com/sipeed/picoclaw/pkg/tools" + "github.com/sipeed/picoclaw/pkg/voice" ) func gatewayCmd(debug bool) error { @@ -134,6 +136,22 @@ func gatewayCmd(debug bool) error { agentLoop.SetChannelManager(channelManager) agentLoop.SetMediaStore(mediaStore) + // Wire up voice transcription if Groq API key is available + groqAPIKey := cfg.Providers.Groq.APIKey + if groqAPIKey == "" { + for _, mc := range cfg.ModelList { + if strings.HasPrefix(mc.Model, "groq/") && mc.APIKey != "" { + groqAPIKey = mc.APIKey + break + } + } + } + if groqAPIKey != "" { + transcriber := voice.NewGroqTranscriber(groqAPIKey) + agentLoop.SetTranscriber(transcriber) + logger.InfoC("voice", "Groq voice transcription enabled (agent-level)") + } + enabledChannels := channelManager.GetEnabledChannels() if len(enabledChannels) > 0 { fmt.Printf("✓ Channels enabled: %s\n", enabledChannels) diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index a72f95bb1..0a2633d90 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -18,6 +18,8 @@ import ( "time" "unicode/utf8" + "regexp" + "github.com/sipeed/picoclaw/pkg/bus" "github.com/sipeed/picoclaw/pkg/channels" "github.com/sipeed/picoclaw/pkg/config" @@ -30,6 +32,7 @@ import ( "github.com/sipeed/picoclaw/pkg/state" "github.com/sipeed/picoclaw/pkg/tools" "github.com/sipeed/picoclaw/pkg/utils" + "github.com/sipeed/picoclaw/pkg/voice" ) type AgentLoop struct { @@ -42,6 +45,7 @@ type AgentLoop struct { fallback *providers.FallbackChain channelManager *channels.Manager mediaStore media.MediaStore + transcriber voice.Transcriber } // processOptions configures how a message is processed @@ -262,6 +266,64 @@ func (al *AgentLoop) SetMediaStore(s media.MediaStore) { al.mediaStore = s } +// SetTranscriber injects a voice transcriber for agent-level audio transcription. +func (al *AgentLoop) SetTranscriber(t voice.Transcriber) { + al.transcriber = t +} + +var audioAnnotationRe = regexp.MustCompile(`\[(voice|audio)(?::[^\]]*)?\]`) + +// transcribeAudioInMessage resolves audio media refs, transcribes them, and +// replaces audio annotations in msg.Content with the transcribed text. +func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.InboundMessage) bus.InboundMessage { + if al.transcriber == nil || !al.transcriber.IsAvailable() || al.mediaStore == nil || len(msg.Media) == 0 { + return msg + } + + // Transcribe each audio media ref in order. + var transcriptions []string + for _, ref := range msg.Media { + path, meta, err := al.mediaStore.ResolveWithMeta(ref) + if err != nil { + logger.WarnCF("voice", "Failed to resolve media ref", map[string]any{"ref": ref, "error": err}) + continue + } + if !utils.IsAudioFile(meta.Filename, meta.ContentType) { + continue + } + result, err := al.transcriber.Transcribe(ctx, path) + if err != nil { + logger.WarnCF("voice", "Transcription failed", map[string]any{"ref": ref, "error": err}) + transcriptions = append(transcriptions, "") + continue + } + transcriptions = append(transcriptions, result.Text) + } + + if len(transcriptions) == 0 { + return msg + } + + // Replace audio annotations sequentially with transcriptions. + idx := 0 + newContent := audioAnnotationRe.ReplaceAllStringFunc(msg.Content, func(match string) string { + if idx >= len(transcriptions) { + return match + } + text := transcriptions[idx] + idx++ + return "[voice: " + text + "]" + }) + + // Append any remaining transcriptions not matched by an annotation. + for ; idx < len(transcriptions); idx++ { + newContent += "\n[voice: " + transcriptions[idx] + "]" + } + + msg.Content = newContent + return msg +} + // inferMediaType determines the media type ("image", "audio", "video", "file") // from a filename and MIME content type. func inferMediaType(filename, contentType string) string { @@ -364,6 +426,8 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage) "session_key": msg.SessionKey, }) + msg = al.transcribeAudioInMessage(ctx, msg) + // Route system messages to processSystemMessage if msg.Channel == "system" { return al.processSystemMessage(ctx, msg) diff --git a/pkg/voice/transcriber.go b/pkg/voice/transcriber.go index f973e77fe..bf48d0fda 100644 --- a/pkg/voice/transcriber.go +++ b/pkg/voice/transcriber.go @@ -16,6 +16,11 @@ import ( "github.com/sipeed/picoclaw/pkg/utils" ) +type Transcriber interface { + Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error) + IsAvailable() bool +} + type GroqTranscriber struct { apiKey string apiBase string diff --git a/pkg/voice/transcriber_test.go b/pkg/voice/transcriber_test.go new file mode 100644 index 000000000..c4755dd54 --- /dev/null +++ b/pkg/voice/transcriber_test.go @@ -0,0 +1,97 @@ +package voice + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "testing" +) + +// Ensure GroqTranscriber satisfies the Transcriber interface at compile time. +var _ Transcriber = (*GroqTranscriber)(nil) + +func TestIsAvailable(t *testing.T) { + tests := []struct { + name string + apiKey string + want bool + }{ + {"with key", "sk-test-key", true}, + {"empty key", "", false}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + tr := NewGroqTranscriber(tc.apiKey) + if got := tr.IsAvailable(); got != tc.want { + t.Errorf("IsAvailable() = %v, want %v", got, tc.want) + } + }) + } +} + +func TestTranscribe(t *testing.T) { + // Write a minimal fake audio file so the transcriber can open and send it. + tmpDir := t.TempDir() + audioPath := filepath.Join(tmpDir, "clip.ogg") + if err := os.WriteFile(audioPath, []byte("fake-audio-data"), 0o644); err != nil { + t.Fatalf("failed to write fake audio file: %v", err) + } + + t.Run("success", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/audio/transcriptions" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if r.Header.Get("Authorization") != "Bearer sk-test" { + t.Errorf("unexpected Authorization header: %s", r.Header.Get("Authorization")) + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(TranscriptionResponse{ + Text: "hello world", + Language: "en", + Duration: 1.5, + }) + })) + defer srv.Close() + + tr := NewGroqTranscriber("sk-test") + tr.apiBase = srv.URL + + resp, err := tr.Transcribe(context.Background(), audioPath) + if err != nil { + t.Fatalf("Transcribe() error: %v", err) + } + if resp.Text != "hello world" { + t.Errorf("Text = %q, want %q", resp.Text, "hello world") + } + if resp.Language != "en" { + t.Errorf("Language = %q, want %q", resp.Language, "en") + } + }) + + t.Run("api error", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, `{"error":"invalid_api_key"}`, http.StatusUnauthorized) + })) + defer srv.Close() + + tr := NewGroqTranscriber("sk-bad") + tr.apiBase = srv.URL + + _, err := tr.Transcribe(context.Background(), audioPath) + if err == nil { + t.Fatal("expected error for non-200 response, got nil") + } + }) + + t.Run("missing file", func(t *testing.T) { + tr := NewGroqTranscriber("sk-test") + _, err := tr.Transcribe(context.Background(), filepath.Join(tmpDir, "nonexistent.ogg")) + if err == nil { + t.Fatal("expected error for missing file, got nil") + } + }) +}