Merge pull request #947 from dim/fix/transcription

Fix voice transcription
This commit is contained in:
Mauro
2026-03-04 18:37:24 +01:00
committed by GitHub
10 changed files with 261 additions and 10 deletions
+1 -1
View File
@@ -827,7 +827,7 @@ Le sous-agent a accès aux outils (message, web_search, etc.) et peut communique
### Fournisseurs
> [!NOTE]
> Groq fournit la transcription vocale gratuite via Whisper. Si configuré, les messages vocaux Telegram seront automatiquement transcrits.
> Groq fournit la transcription vocale gratuite via Whisper. Si configuré, les messages audio de n'importe quel canal seront automatiquement transcrits au niveau de l'agent.
| Fournisseur | Utilisation | Obtenir une Clé API |
| ------------------------ | ---------------------------------------- | ------------------------------------------------------ |
+1 -1
View File
@@ -785,7 +785,7 @@ HEARTBEAT_OK 応答 ユーザーが直接結果を受け取る
### プロバイダー
> [!NOTE]
> Groq は Whisper による無料の音声文字起こしを提供しています。設定すると、Telegram の音声メッセージが自動的に文字起こしされます。
> Groq は Whisper による無料の音声文字起こしを提供しています。設定すると、あらゆるチャンネルからの音声メッセージがエージェントレベルで自動的に文字起こしされます。
| プロバイダー | 用途 | API キー取得先 |
| --- | --- | --- |
+1 -1
View File
@@ -911,7 +911,7 @@ The subagent has access to tools (message, web_search, etc.) and can communicate
### Providers
> [!NOTE]
> Groq provides free voice transcription via Whisper. If configured, Telegram voice messages will be automatically transcribed.
> Groq provides free voice transcription via Whisper. If configured, audio messages from any channel will be automatically transcribed at the agent level.
| Provider | Purpose | Get API Key |
| -------------------------- | --------------------------------------- | -------------------------------------------------------------------- |
+1 -1
View File
@@ -823,7 +823,7 @@ O subagente tem acesso às ferramentas (message, web_search, etc.) e pode se com
### Provedores
> [!NOTE]
> O Groq fornece transcrição de voz gratuita via Whisper. Se configurado, mensagens de voz do Telegram serão automaticamente transcritas.
> O Groq fornece transcrição de voz gratuita via Whisper. Se configurado, mensagens de áudio de qualquer canal serão automaticamente transcritas no nível do agente.
| Provedor | Finalidade | Obter API Key |
| --- | --- | --- |
+1 -1
View File
@@ -795,7 +795,7 @@ Subagent có quyền truy cập các công cụ (message, web_search, v.v.) và
### Nhà cung cấp (Providers)
> [!NOTE]
> Groq cung cấp dịch vụ chuyển giọng nói thành văn bản miễn phí qua Whisper. Nếu đã cấu hình Groq, tin nhắn thoại trên Telegram sẽ được tự động chuyển thành văn bản.
> Groq cung cấp dịch vụ chuyển giọng nói thành văn bản miễn phí qua Whisper. Nếu đã cấu hình Groq, tin nhắn âm thanh từ bất kỳ kênh nào sẽ được tự động chuyển thành văn bản ở cấp độ agent.
| Nhà cung cấp | Mục đích | Lấy API Key |
| --- | --- | --- |
+1 -1
View File
@@ -459,7 +459,7 @@ Agent 读取 HEARTBEAT.md
### 提供商 (Providers)
> [!NOTE]
> Groq 通过 Whisper 提供免费的语音转录。如果配置了 Groq,Telegram 语音消息将被自动转录为文字。
> Groq 通过 Whisper 提供免费的语音转录。如果配置了 Groq,任意渠道的音频消息都将在 Agent 层面自动转录为文字。
| 提供商 | 用途 | 获取 API Key |
| -------------------- | ---------------------------- | -------------------------------------------------------------------- |
+7
View File
@@ -36,6 +36,7 @@ import (
"github.com/sipeed/picoclaw/pkg/providers"
"github.com/sipeed/picoclaw/pkg/state"
"github.com/sipeed/picoclaw/pkg/tools"
"github.com/sipeed/picoclaw/pkg/voice"
)
func gatewayCmd(debug bool) error {
@@ -134,6 +135,12 @@ func gatewayCmd(debug bool) error {
agentLoop.SetChannelManager(channelManager)
agentLoop.SetMediaStore(mediaStore)
// Wire up voice transcription if a supported provider is configured.
if transcriber := voice.DetectTranscriber(cfg); transcriber != nil {
agentLoop.SetTranscriber(transcriber)
logger.InfoCF("voice", "Transcription enabled (agent-level)", map[string]any{"provider": transcriber.Name()})
}
enabledChannels := channelManager.GetEnabledChannels()
if len(enabledChannels) > 0 {
fmt.Printf("✓ Channels enabled: %s\n", enabledChannels)
+63
View File
@@ -12,6 +12,7 @@ import (
"errors"
"fmt"
"path/filepath"
"regexp"
"strings"
"sync"
"sync/atomic"
@@ -31,6 +32,7 @@ import (
"github.com/sipeed/picoclaw/pkg/state"
"github.com/sipeed/picoclaw/pkg/tools"
"github.com/sipeed/picoclaw/pkg/utils"
"github.com/sipeed/picoclaw/pkg/voice"
)
type AgentLoop struct {
@@ -43,6 +45,7 @@ type AgentLoop struct {
fallback *providers.FallbackChain
channelManager *channels.Manager
mediaStore media.MediaStore
transcriber voice.Transcriber
}
// processOptions configures how a message is processed
@@ -339,6 +342,64 @@ func (al *AgentLoop) SetMediaStore(s media.MediaStore) {
al.mediaStore = s
}
// SetTranscriber injects a voice transcriber for agent-level audio transcription.
// The value is stored on the loop with a plain assignment (no locking is done
// here). transcribeAudioInMessage returns messages unchanged while the stored
// transcriber is nil, so passing nil leaves transcription disabled.
func (al *AgentLoop) SetTranscriber(t voice.Transcriber) {
al.transcriber = t
}
// audioAnnotationRe matches bracketed audio placeholders in message content:
// "[voice]" or "[audio]", optionally followed by a colon and any text up to
// the closing bracket (for example "[voice: clip.ogg]").
var audioAnnotationRe = regexp.MustCompile(`\[(voice|audio)(?::[^\]]*)?\]`)
// transcribeAudioInMessage turns the audio attachments of msg into text.
//
// Every media ref is resolved through the media store; refs that cannot be
// resolved or that are not audio files are ignored. Each audio file is sent to
// the transcriber, and a failed transcription still occupies a slot, as an
// empty string. The collected texts then replace the audio annotations found
// in msg.Content, in order, as "[voice: <text>]"; texts left over once the
// annotations run out are appended on their own lines.
//
// The message is returned untouched when no transcriber or media store is set,
// when it carries no media, or when none of its media is audio.
func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.InboundMessage) bus.InboundMessage {
	if al.transcriber == nil || al.mediaStore == nil || len(msg.Media) == 0 {
		return msg
	}

	// Collect one text per audio attachment, preserving attachment order.
	var texts []string
	for _, mediaRef := range msg.Media {
		localPath, meta, resolveErr := al.mediaStore.ResolveWithMeta(mediaRef)
		if resolveErr != nil {
			logger.WarnCF("voice", "Failed to resolve media ref", map[string]any{"ref": mediaRef, "error": resolveErr})
			continue
		}
		if !utils.IsAudioFile(meta.Filename, meta.ContentType) {
			continue
		}

		text := ""
		if result, transcribeErr := al.transcriber.Transcribe(ctx, localPath); transcribeErr != nil {
			logger.WarnCF("voice", "Transcription failed", map[string]any{"ref": mediaRef, "error": transcribeErr})
		} else {
			text = result.Text
		}
		texts = append(texts, text)
	}
	if len(texts) == 0 {
		return msg
	}

	// Consume texts from the front as annotations are encountered.
	pending := texts
	rewritten := audioAnnotationRe.ReplaceAllStringFunc(msg.Content, func(annotation string) string {
		if len(pending) == 0 {
			return annotation
		}
		text := pending[0]
		pending = pending[1:]
		return "[voice: " + text + "]"
	})

	// Whatever is still pending had no annotation to replace.
	for _, text := range pending {
		rewritten += "\n[voice: " + text + "]"
	}

	msg.Content = rewritten
	return msg
}
// inferMediaType determines the media type ("image", "audio", "video", "file")
// from a filename and MIME content type.
func inferMediaType(filename, contentType string) string {
@@ -450,6 +511,8 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
},
)
msg = al.transcribeAudioInMessage(ctx, msg)
// Route system messages to processSystemMessage
if msg.Channel == "system" {
return al.processSystemMessage(ctx, msg)
+25 -4
View File
@@ -10,12 +10,19 @@ import (
"net/http"
"os"
"path/filepath"
"strings"
"time"
"github.com/sipeed/picoclaw/pkg/config"
"github.com/sipeed/picoclaw/pkg/logger"
"github.com/sipeed/picoclaw/pkg/utils"
)
// Transcriber converts an audio file into text. GroqTranscriber is the
// implementation visible in this package.
type Transcriber interface {
// Name returns a short identifier for the transcription provider
// (GroqTranscriber returns "groq").
Name() string
// Transcribe transcribes the audio file at audioFilePath and returns the
// transcription response, or an error if transcription did not succeed.
Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error)
}
type GroqTranscriber struct {
apiKey string
apiBase string
@@ -152,8 +159,22 @@ func (t *GroqTranscriber) Transcribe(ctx context.Context, audioFilePath string)
return &result, nil
}
func (t *GroqTranscriber) IsAvailable() bool {
available := t.apiKey != ""
logger.DebugCF("voice", "Checking transcriber availability", map[string]any{"available": available})
return available
// Name returns the provider identifier for this transcriber, always "groq".
func (t *GroqTranscriber) Name() string {
return "groq"
}
// DetectTranscriber inspects cfg and returns the appropriate Transcriber, or
// nil if no supported transcription provider is configured.
//
// Lookup order:
//  1. The API key of the direct Groq provider configuration.
//  2. The first model-list entry whose model name starts with "groq/" and
//     that carries a non-empty API key.
//
// A nil cfg is treated as "nothing configured" and yields nil instead of
// panicking on the field access.
func DetectTranscriber(cfg *config.Config) Transcriber {
	if cfg == nil {
		return nil
	}

	// Direct Groq provider config takes priority.
	if key := cfg.Providers.Groq.APIKey; key != "" {
		return NewGroqTranscriber(key)
	}

	// Fall back to any model-list entry that uses the groq/ protocol.
	for _, mc := range cfg.ModelList {
		if strings.HasPrefix(mc.Model, "groq/") && mc.APIKey != "" {
			return NewGroqTranscriber(mc.APIKey)
		}
	}

	// Return a literal nil so callers comparing the interface to nil see
	// "no transcriber" rather than a typed nil pointer.
	return nil
}
+160
View File
@@ -0,0 +1,160 @@
package voice
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"testing"
"github.com/sipeed/picoclaw/pkg/config"
)
// Ensure GroqTranscriber satisfies the Transcriber interface at compile time.
// The assignment to the blank identifier creates no usable variable; it only
// makes the build fail if *GroqTranscriber stops implementing Transcriber.
var _ Transcriber = (*GroqTranscriber)(nil)
// TestGroqTranscriberName checks that a Groq transcriber reports "groq" as its
// provider name.
func TestGroqTranscriberName(t *testing.T) {
	const want = "groq"

	got := NewGroqTranscriber("sk-test").Name()
	if got != want {
		t.Errorf("Name() = %q, want %q", got, want)
	}
}
// TestDetectTranscriber exercises DetectTranscriber against configurations
// with no Groq credentials, with a direct provider key, with a groq/ entry in
// the model list, and with both at once.
//
// Besides the provider name, each non-nil case pins the API key the returned
// transcriber carries. Without that check the "priority" case could not fail:
// both candidate keys produce a transcriber named "groq".
func TestDetectTranscriber(t *testing.T) {
	tests := []struct {
		name     string
		cfg      *config.Config
		wantNil  bool
		wantName string
		wantKey  string // API key the detected transcriber must have been built with
	}{
		{
			name:    "no config",
			cfg:     &config.Config{},
			wantNil: true,
		},
		{
			name: "groq provider key",
			cfg: &config.Config{
				Providers: config.ProvidersConfig{
					Groq: config.ProviderConfig{APIKey: "sk-groq-direct"},
				},
			},
			wantName: "groq",
			wantKey:  "sk-groq-direct",
		},
		{
			name: "groq via model list",
			cfg: &config.Config{
				ModelList: []config.ModelConfig{
					{Model: "openai/gpt-4o", APIKey: "sk-openai"},
					{Model: "groq/llama-3.3-70b", APIKey: "sk-groq-model"},
				},
			},
			wantName: "groq",
			wantKey:  "sk-groq-model",
		},
		{
			name: "groq model list entry without key is skipped",
			cfg: &config.Config{
				ModelList: []config.ModelConfig{
					{Model: "groq/llama-3.3-70b", APIKey: ""},
				},
			},
			wantNil: true,
		},
		{
			name: "provider key takes priority over model list",
			cfg: &config.Config{
				Providers: config.ProvidersConfig{
					Groq: config.ProviderConfig{APIKey: "sk-groq-direct"},
				},
				ModelList: []config.ModelConfig{
					{Model: "groq/llama-3.3-70b", APIKey: "sk-groq-model"},
				},
			},
			wantName: "groq",
			wantKey:  "sk-groq-direct",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			tr := DetectTranscriber(tc.cfg)
			if tc.wantNil {
				if tr != nil {
					t.Errorf("DetectTranscriber() = %v, want nil", tr)
				}
				return
			}
			if tr == nil {
				t.Fatal("DetectTranscriber() = nil, want non-nil")
			}
			if got := tr.Name(); got != tc.wantName {
				t.Errorf("Name() = %q, want %q", got, tc.wantName)
			}

			// Inspect the concrete type to confirm which credential was selected.
			groq, ok := tr.(*GroqTranscriber)
			if !ok {
				t.Fatalf("DetectTranscriber() returned %T, want *GroqTranscriber", tr)
			}
			if groq.apiKey != tc.wantKey {
				t.Errorf("apiKey = %q, want %q", groq.apiKey, tc.wantKey)
			}
		})
	}
}
// TestTranscribe drives GroqTranscriber.Transcribe against a local stub HTTP
// server and covers three outcomes: a successful JSON response, a non-200 API
// response, and an audio path that does not exist.
func TestTranscribe(t *testing.T) {
	// A small placeholder file is enough: the stub server never decodes audio.
	dir := t.TempDir()
	clip := filepath.Join(dir, "clip.ogg")
	if err := os.WriteFile(clip, []byte("fake-audio-data"), 0o644); err != nil {
		t.Fatalf("failed to write fake audio file: %v", err)
	}

	// startStub launches a stub server for the calling (sub)test, arranges for
	// it to be shut down when that test finishes, and returns its base URL.
	startStub := func(t *testing.T, handler http.HandlerFunc) string {
		srv := httptest.NewServer(handler)
		t.Cleanup(srv.Close)
		return srv.URL
	}

	t.Run("success", func(t *testing.T) {
		base := startStub(t, func(w http.ResponseWriter, r *http.Request) {
			if r.URL.Path != "/audio/transcriptions" {
				t.Errorf("unexpected path: %s", r.URL.Path)
			}
			if auth := r.Header.Get("Authorization"); auth != "Bearer sk-test" {
				t.Errorf("unexpected Authorization header: %s", auth)
			}
			w.Header().Set("Content-Type", "application/json")
			// The encode error is ignored: a broken response would surface as a
			// failed assertion on the client side below.
			_ = json.NewEncoder(w).Encode(TranscriptionResponse{
				Text:     "hello world",
				Language: "en",
				Duration: 1.5,
			})
		})

		tr := NewGroqTranscriber("sk-test")
		tr.apiBase = base

		resp, err := tr.Transcribe(context.Background(), clip)
		if err != nil {
			t.Fatalf("Transcribe() error: %v", err)
		}
		if resp.Text != "hello world" {
			t.Errorf("Text = %q, want %q", resp.Text, "hello world")
		}
		if resp.Language != "en" {
			t.Errorf("Language = %q, want %q", resp.Language, "en")
		}
	})

	t.Run("api error", func(t *testing.T) {
		base := startStub(t, func(w http.ResponseWriter, r *http.Request) {
			http.Error(w, `{"error":"invalid_api_key"}`, http.StatusUnauthorized)
		})

		tr := NewGroqTranscriber("sk-bad")
		tr.apiBase = base

		if _, err := tr.Transcribe(context.Background(), clip); err == nil {
			t.Fatal("expected error for non-200 response, got nil")
		}
	})

	t.Run("missing file", func(t *testing.T) {
		missing := filepath.Join(dir, "nonexistent.ogg")
		tr := NewGroqTranscriber("sk-test")

		if _, err := tr.Transcribe(context.Background(), missing); err == nil {
			t.Fatal("expected error for missing file, got nil")
		}
	})
}