Fix voice transcription

This commit is contained in:
Dimitrij Denissenko
2026-03-01 08:31:04 +00:00
parent f7136b6a5d
commit b1386ad71f
10 changed files with 190 additions and 6 deletions
+1 -1
View File
@@ -772,7 +772,7 @@ Le sous-agent a accès aux outils (message, web_search, etc.) et peut communique
### Fournisseurs
> [!NOTE]
> Groq fournit la transcription vocale gratuite via Whisper. Si configuré, les messages vocaux Telegram seront automatiquement transcrits.
> Groq fournit la transcription vocale gratuite via Whisper. Si configuré, les messages audio de n'importe quel canal seront automatiquement transcrits au niveau de l'agent.
| Fournisseur | Utilisation | Obtenir une Clé API |
| ------------------------ | ---------------------------------------- | ------------------------------------------------------ |
+1 -1
View File
@@ -728,7 +728,7 @@ HEARTBEAT_OK 応答 ユーザーが直接結果を受け取る
### プロバイダー
> [!NOTE]
> Groq は Whisper による無料の音声文字起こしを提供しています。設定すると、Telegram の音声メッセージが自動的に文字起こしされます。
> Groq は Whisper による無料の音声文字起こしを提供しています。設定すると、あらゆるチャンネルからの音声メッセージがエージェントレベルで自動的に文字起こしされます。
| プロバイダー | 用途 | API キー取得先 |
| --- | --- | --- |
+1 -1
View File
@@ -818,7 +818,7 @@ The subagent has access to tools (message, web_search, etc.) and can communicate
### Providers
> [!NOTE]
> Groq provides free voice transcription via Whisper. If configured, Telegram voice messages will be automatically transcribed.
> Groq provides free voice transcription via Whisper. If configured, audio messages from any channel will be automatically transcribed at the agent level.
| Provider | Purpose | Get API Key |
| -------------------------- | --------------------------------------- | -------------------------------------------------------------------- |
+1 -1
View File
@@ -766,7 +766,7 @@ O subagente tem acesso às ferramentas (message, web_search, etc.) e pode se com
### Provedores
> [!NOTE]
> O Groq fornece transcrição de voz gratuita via Whisper. Se configurado, mensagens de voz do Telegram serão automaticamente transcritas.
> O Groq fornece transcrição de voz gratuita via Whisper. Se configurado, mensagens de áudio de qualquer canal serão automaticamente transcritas no nível do agente.
| Provedor | Finalidade | Obter API Key |
| --- | --- | --- |
+1 -1
View File
@@ -740,7 +740,7 @@ Subagent có quyền truy cập các công cụ (message, web_search, v.v.) và
### Nhà cung cấp (Providers)
> [!NOTE]
> Groq cung cấp dịch vụ chuyển giọng nói thành văn bản miễn phí qua Whisper. Nếu đã cấu hình Groq, tin nhắn thoại trên Telegram sẽ được tự động chuyển thành văn bản.
> Groq cung cấp dịch vụ chuyển giọng nói thành văn bản miễn phí qua Whisper. Nếu đã cấu hình Groq, tin nhắn âm thanh từ bất kỳ kênh nào sẽ được tự động chuyển thành văn bản ở cấp độ agent.
| Nhà cung cấp | Mục đích | Lấy API Key |
| --- | --- | --- |
+1 -1
View File
@@ -418,7 +418,7 @@ Agent 读取 HEARTBEAT.md
### 提供商 (Providers)
> [!NOTE]
> Groq 通过 Whisper 提供免费的语音转录。如果配置了 Groq,Telegram 语音消息将被自动转录为文字。
> Groq 通过 Whisper 提供免费的语音转录。如果配置了 Groq,任意渠道的音频消息都将在 Agent 层面自动转录为文字。
| 提供商 | 用途 | 获取 API Key |
| -------------------- | ---------------------------- | -------------------------------------------------------------------- |
+18
View File
@@ -7,6 +7,7 @@ import (
"os"
"os/signal"
"path/filepath"
"strings"
"time"
"github.com/sipeed/picoclaw/cmd/picoclaw/internal"
@@ -36,6 +37,7 @@ import (
"github.com/sipeed/picoclaw/pkg/providers"
"github.com/sipeed/picoclaw/pkg/state"
"github.com/sipeed/picoclaw/pkg/tools"
"github.com/sipeed/picoclaw/pkg/voice"
)
func gatewayCmd(debug bool) error {
@@ -134,6 +136,22 @@ func gatewayCmd(debug bool) error {
agentLoop.SetChannelManager(channelManager)
agentLoop.SetMediaStore(mediaStore)
// Wire up voice transcription if Groq API key is available
groqAPIKey := cfg.Providers.Groq.APIKey
if groqAPIKey == "" {
for _, mc := range cfg.ModelList {
if strings.HasPrefix(mc.Model, "groq/") && mc.APIKey != "" {
groqAPIKey = mc.APIKey
break
}
}
}
if groqAPIKey != "" {
transcriber := voice.NewGroqTranscriber(groqAPIKey)
agentLoop.SetTranscriber(transcriber)
logger.InfoC("voice", "Groq voice transcription enabled (agent-level)")
}
enabledChannels := channelManager.GetEnabledChannels()
if len(enabledChannels) > 0 {
fmt.Printf("✓ Channels enabled: %s\n", enabledChannels)
+64
View File
@@ -18,6 +18,8 @@ import (
"time"
"unicode/utf8"
"regexp"
"github.com/sipeed/picoclaw/pkg/bus"
"github.com/sipeed/picoclaw/pkg/channels"
"github.com/sipeed/picoclaw/pkg/config"
@@ -30,6 +32,7 @@ import (
"github.com/sipeed/picoclaw/pkg/state"
"github.com/sipeed/picoclaw/pkg/tools"
"github.com/sipeed/picoclaw/pkg/utils"
"github.com/sipeed/picoclaw/pkg/voice"
)
type AgentLoop struct {
@@ -42,6 +45,7 @@ type AgentLoop struct {
fallback *providers.FallbackChain
channelManager *channels.Manager
mediaStore media.MediaStore
transcriber voice.Transcriber
}
// processOptions configures how a message is processed
@@ -262,6 +266,64 @@ func (al *AgentLoop) SetMediaStore(s media.MediaStore) {
al.mediaStore = s
}
// SetTranscriber injects a voice transcriber for agent-level audio transcription.
func (al *AgentLoop) SetTranscriber(t voice.Transcriber) {
al.transcriber = t
}
var audioAnnotationRe = regexp.MustCompile(`\[(voice|audio)(?::[^\]]*)?\]`)
// transcribeAudioInMessage resolves audio media refs, transcribes them, and
// replaces audio annotations in msg.Content with the transcribed text.
func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.InboundMessage) bus.InboundMessage {
if al.transcriber == nil || !al.transcriber.IsAvailable() || al.mediaStore == nil || len(msg.Media) == 0 {
return msg
}
// Transcribe each audio media ref in order.
var transcriptions []string
for _, ref := range msg.Media {
path, meta, err := al.mediaStore.ResolveWithMeta(ref)
if err != nil {
logger.WarnCF("voice", "Failed to resolve media ref", map[string]any{"ref": ref, "error": err})
continue
}
if !utils.IsAudioFile(meta.Filename, meta.ContentType) {
continue
}
result, err := al.transcriber.Transcribe(ctx, path)
if err != nil {
logger.WarnCF("voice", "Transcription failed", map[string]any{"ref": ref, "error": err})
transcriptions = append(transcriptions, "")
continue
}
transcriptions = append(transcriptions, result.Text)
}
if len(transcriptions) == 0 {
return msg
}
// Replace audio annotations sequentially with transcriptions.
idx := 0
newContent := audioAnnotationRe.ReplaceAllStringFunc(msg.Content, func(match string) string {
if idx >= len(transcriptions) {
return match
}
text := transcriptions[idx]
idx++
return "[voice: " + text + "]"
})
// Append any remaining transcriptions not matched by an annotation.
for ; idx < len(transcriptions); idx++ {
newContent += "\n[voice: " + transcriptions[idx] + "]"
}
msg.Content = newContent
return msg
}
// inferMediaType determines the media type ("image", "audio", "video", "file")
// from a filename and MIME content type.
func inferMediaType(filename, contentType string) string {
@@ -364,6 +426,8 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
"session_key": msg.SessionKey,
})
msg = al.transcribeAudioInMessage(ctx, msg)
// Route system messages to processSystemMessage
if msg.Channel == "system" {
return al.processSystemMessage(ctx, msg)
+5
View File
@@ -16,6 +16,11 @@ import (
"github.com/sipeed/picoclaw/pkg/utils"
)
type Transcriber interface {
Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error)
IsAvailable() bool
}
type GroqTranscriber struct {
apiKey string
apiBase string
+97
View File
@@ -0,0 +1,97 @@
package voice
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"testing"
)
// Ensure GroqTranscriber satisfies the Transcriber interface at compile time.
var _ Transcriber = (*GroqTranscriber)(nil)
func TestIsAvailable(t *testing.T) {
tests := []struct {
name string
apiKey string
want bool
}{
{"with key", "sk-test-key", true},
{"empty key", "", false},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
tr := NewGroqTranscriber(tc.apiKey)
if got := tr.IsAvailable(); got != tc.want {
t.Errorf("IsAvailable() = %v, want %v", got, tc.want)
}
})
}
}
func TestTranscribe(t *testing.T) {
// Write a minimal fake audio file so the transcriber can open and send it.
tmpDir := t.TempDir()
audioPath := filepath.Join(tmpDir, "clip.ogg")
if err := os.WriteFile(audioPath, []byte("fake-audio-data"), 0o644); err != nil {
t.Fatalf("failed to write fake audio file: %v", err)
}
t.Run("success", func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/audio/transcriptions" {
t.Errorf("unexpected path: %s", r.URL.Path)
}
if r.Header.Get("Authorization") != "Bearer sk-test" {
t.Errorf("unexpected Authorization header: %s", r.Header.Get("Authorization"))
}
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(TranscriptionResponse{
Text: "hello world",
Language: "en",
Duration: 1.5,
})
}))
defer srv.Close()
tr := NewGroqTranscriber("sk-test")
tr.apiBase = srv.URL
resp, err := tr.Transcribe(context.Background(), audioPath)
if err != nil {
t.Fatalf("Transcribe() error: %v", err)
}
if resp.Text != "hello world" {
t.Errorf("Text = %q, want %q", resp.Text, "hello world")
}
if resp.Language != "en" {
t.Errorf("Language = %q, want %q", resp.Language, "en")
}
})
t.Run("api error", func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
http.Error(w, `{"error":"invalid_api_key"}`, http.StatusUnauthorized)
}))
defer srv.Close()
tr := NewGroqTranscriber("sk-bad")
tr.apiBase = srv.URL
_, err := tr.Transcribe(context.Background(), audioPath)
if err == nil {
t.Fatal("expected error for non-200 response, got nil")
}
})
t.Run("missing file", func(t *testing.T) {
tr := NewGroqTranscriber("sk-test")
_, err := tr.Transcribe(context.Background(), filepath.Join(tmpDir, "nonexistent.ogg"))
if err == nil {
t.Fatal("expected error for missing file, got nil")
}
})
}