diff --git a/README.fr.md b/README.fr.md index e537fc13a..320aa9e22 100644 --- a/README.fr.md +++ b/README.fr.md @@ -827,7 +827,7 @@ Le sous-agent a accès aux outils (message, web_search, etc.) et peut communique ### Fournisseurs > [!NOTE] -> Groq fournit la transcription vocale gratuite via Whisper. Si configuré, les messages vocaux Telegram seront automatiquement transcrits. +> Groq fournit la transcription vocale gratuite via Whisper. Si configuré, les messages audio de n'importe quel canal seront automatiquement transcrits au niveau de l'agent. | Fournisseur | Utilisation | Obtenir une Clé API | | ------------------------ | ---------------------------------------- | ------------------------------------------------------ | diff --git a/README.ja.md b/README.ja.md index 20ad5033b..ea6bc7e72 100644 --- a/README.ja.md +++ b/README.ja.md @@ -785,7 +785,7 @@ HEARTBEAT_OK 応答 ユーザーが直接結果を受け取る ### プロバイダー > [!NOTE] -> Groq は Whisper による無料の音声文字起こしを提供しています。設定すると、Telegram の音声メッセージが自動的に文字起こしされます。 +> Groq は Whisper による無料の音声文字起こしを提供しています。設定すると、あらゆるチャンネルからの音声メッセージがエージェントレベルで自動的に文字起こしされます。 | プロバイダー | 用途 | API キー取得先 | | --- | --- | --- | diff --git a/README.md b/README.md index 759ebbb82..c1ef72141 100644 --- a/README.md +++ b/README.md @@ -911,7 +911,7 @@ The subagent has access to tools (message, web_search, etc.) and can communicate ### Providers > [!NOTE] -> Groq provides free voice transcription via Whisper. If configured, Telegram voice messages will be automatically transcribed. +> Groq provides free voice transcription via Whisper. If configured, audio messages from any channel will be automatically transcribed at the agent level. | Provider | Purpose | Get API Key | | -------------------------- | --------------------------------------- | -------------------------------------------------------------------- | diff --git a/README.pt-br.md b/README.pt-br.md index bfe655770..67ce9e0d3 100644 --- a/README.pt-br.md +++ b/README.pt-br.md @@ -823,7 +823,7 @@ O subagente tem acesso às ferramentas (message, web_search, etc.) e pode se com ### Provedores > [!NOTE] -> O Groq fornece transcrição de voz gratuita via Whisper. Se configurado, mensagens de voz do Telegram serão automaticamente transcritas. +> O Groq fornece transcrição de voz gratuita via Whisper. Se configurado, mensagens de áudio de qualquer canal serão automaticamente transcritas no nível do agente. | Provedor | Finalidade | Obter API Key | | --- | --- | --- | diff --git a/README.vi.md b/README.vi.md index b30659614..5755896ed 100644 --- a/README.vi.md +++ b/README.vi.md @@ -795,7 +795,7 @@ Subagent có quyền truy cập các công cụ (message, web_search, v.v.) và ### Nhà cung cấp (Providers) > [!NOTE] -> Groq cung cấp dịch vụ chuyển giọng nói thành văn bản miễn phí qua Whisper. Nếu đã cấu hình Groq, tin nhắn thoại trên Telegram sẽ được tự động chuyển thành văn bản. +> Groq cung cấp dịch vụ chuyển giọng nói thành văn bản miễn phí qua Whisper. Nếu đã cấu hình Groq, tin nhắn âm thanh từ bất kỳ kênh nào sẽ được tự động chuyển thành văn bản ở cấp độ agent. | Nhà cung cấp | Mục đích | Lấy API Key | | --- | --- | --- | diff --git a/README.zh.md b/README.zh.md index db96ba555..bd90173f9 100644 --- a/README.zh.md +++ b/README.zh.md @@ -459,7 +459,7 @@ Agent 读取 HEARTBEAT.md ### 提供商 (Providers) > [!NOTE] -> Groq 通过 Whisper 提供免费的语音转录。如果配置了 Groq,Telegram 语音消息将被自动转录为文字。 +> Groq 通过 Whisper 提供免费的语音转录。如果配置了 Groq,任意渠道的音频消息都将在 Agent 层面自动转录为文字。 | 提供商 | 用途 | 获取 API Key | | -------------------- | ---------------------------- | -------------------------------------------------------------------- | diff --git a/cmd/picoclaw/internal/gateway/helpers.go b/cmd/picoclaw/internal/gateway/helpers.go index 747f7d44e..5225340c7 100644 --- a/cmd/picoclaw/internal/gateway/helpers.go +++ b/cmd/picoclaw/internal/gateway/helpers.go @@ -36,6 +36,7 @@ import ( "github.com/sipeed/picoclaw/pkg/providers" "github.com/sipeed/picoclaw/pkg/state" "github.com/sipeed/picoclaw/pkg/tools" + "github.com/sipeed/picoclaw/pkg/voice" ) func gatewayCmd(debug bool) error { @@ -134,6 +135,12 @@ func gatewayCmd(debug bool) error { agentLoop.SetChannelManager(channelManager) agentLoop.SetMediaStore(mediaStore) + // Wire up voice transcription if a supported provider is configured. + if transcriber := voice.DetectTranscriber(cfg); transcriber != nil { + agentLoop.SetTranscriber(transcriber) + logger.InfoCF("voice", "Transcription enabled (agent-level)", map[string]any{"provider": transcriber.Name()}) + } + enabledChannels := channelManager.GetEnabledChannels() if len(enabledChannels) > 0 { fmt.Printf("✓ Channels enabled: %s\n", enabledChannels) diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index db9efa2cf..7ce2a37a6 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -12,6 +12,7 @@ import ( "errors" "fmt" "path/filepath" + "regexp" "strings" "sync" "sync/atomic" @@ -31,6 +32,7 @@ import ( "github.com/sipeed/picoclaw/pkg/state" "github.com/sipeed/picoclaw/pkg/tools" "github.com/sipeed/picoclaw/pkg/utils" + "github.com/sipeed/picoclaw/pkg/voice" ) type AgentLoop struct { @@ -43,6 +45,7 @@ type AgentLoop struct { fallback *providers.FallbackChain channelManager *channels.Manager mediaStore media.MediaStore + transcriber voice.Transcriber } // processOptions configures how a message is processed @@ -339,6 +342,64 @@ func (al *AgentLoop) SetMediaStore(s media.MediaStore) { al.mediaStore = s } +// SetTranscriber injects a voice transcriber for agent-level audio transcription. +func (al *AgentLoop) SetTranscriber(t voice.Transcriber) { + al.transcriber = t +} + +var audioAnnotationRe = regexp.MustCompile(`\[(voice|audio)(?::[^\]]*)?\]`) + +// transcribeAudioInMessage resolves audio media refs, transcribes them, and +// replaces audio annotations in msg.Content with the transcribed text. +func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.InboundMessage) bus.InboundMessage { + if al.transcriber == nil || al.mediaStore == nil || len(msg.Media) == 0 { + return msg + } + + // Transcribe each audio media ref in order. + var transcriptions []string + for _, ref := range msg.Media { + path, meta, err := al.mediaStore.ResolveWithMeta(ref) + if err != nil { + logger.WarnCF("voice", "Failed to resolve media ref", map[string]any{"ref": ref, "error": err}) + continue + } + if !utils.IsAudioFile(meta.Filename, meta.ContentType) { + continue + } + result, err := al.transcriber.Transcribe(ctx, path) + if err != nil { + logger.WarnCF("voice", "Transcription failed", map[string]any{"ref": ref, "error": err}) + transcriptions = append(transcriptions, "") + continue + } + transcriptions = append(transcriptions, result.Text) + } + + if len(transcriptions) == 0 { + return msg + } + + // Replace audio annotations sequentially with transcriptions. + idx := 0 + newContent := audioAnnotationRe.ReplaceAllStringFunc(msg.Content, func(match string) string { + if idx >= len(transcriptions) { + return match + } + text := transcriptions[idx] + idx++ + return "[voice: " + text + "]" + }) + + // Append any remaining transcriptions not matched by an annotation. + for ; idx < len(transcriptions); idx++ { + newContent += "\n[voice: " + transcriptions[idx] + "]" + } + + msg.Content = newContent + return msg +} + // inferMediaType determines the media type ("image", "audio", "video", "file") // from a filename and MIME content type. func inferMediaType(filename, contentType string) string { @@ -450,6 +511,8 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage) }, ) + msg = al.transcribeAudioInMessage(ctx, msg) + // Route system messages to processSystemMessage if msg.Channel == "system" { return al.processSystemMessage(ctx, msg) diff --git a/pkg/voice/transcriber.go b/pkg/voice/transcriber.go index f973e77fe..e949d7a22 100644 --- a/pkg/voice/transcriber.go +++ b/pkg/voice/transcriber.go @@ -10,12 +10,19 @@ import ( "net/http" "os" "path/filepath" + "strings" "time" + "github.com/sipeed/picoclaw/pkg/config" "github.com/sipeed/picoclaw/pkg/logger" "github.com/sipeed/picoclaw/pkg/utils" ) +type Transcriber interface { + Name() string + Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error) +} + type GroqTranscriber struct { apiKey string apiBase string @@ -152,8 +159,22 @@ func (t *GroqTranscriber) Transcribe(ctx context.Context, audioFilePath string) return &result, nil } -func (t *GroqTranscriber) IsAvailable() bool { - available := t.apiKey != "" - logger.DebugCF("voice", "Checking transcriber availability", map[string]any{"available": available}) - return available +func (t *GroqTranscriber) Name() string { + return "groq" +} + +// DetectTranscriber inspects cfg and returns the appropriate Transcriber, or +// nil if no supported transcription provider is configured. +func DetectTranscriber(cfg *config.Config) Transcriber { + // Direct Groq provider config takes priority. + if key := cfg.Providers.Groq.APIKey; key != "" { + return NewGroqTranscriber(key) + } + // Fall back to any model-list entry that uses the groq/ protocol. + for _, mc := range cfg.ModelList { + if strings.HasPrefix(mc.Model, "groq/") && mc.APIKey != "" { + return NewGroqTranscriber(mc.APIKey) + } + } + return nil } diff --git a/pkg/voice/transcriber_test.go b/pkg/voice/transcriber_test.go new file mode 100644 index 000000000..9b6add333 --- /dev/null +++ b/pkg/voice/transcriber_test.go @@ -0,0 +1,160 @@ +package voice + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "testing" + + "github.com/sipeed/picoclaw/pkg/config" +) + +// Ensure GroqTranscriber satisfies the Transcriber interface at compile time. +var _ Transcriber = (*GroqTranscriber)(nil) + +func TestGroqTranscriberName(t *testing.T) { + tr := NewGroqTranscriber("sk-test") + if got := tr.Name(); got != "groq" { + t.Errorf("Name() = %q, want %q", got, "groq") + } +} + +func TestDetectTranscriber(t *testing.T) { + tests := []struct { + name string + cfg *config.Config + wantNil bool + wantName string + }{ + { + name: "no config", + cfg: &config.Config{}, + wantNil: true, + }, + { + name: "groq provider key", + cfg: &config.Config{ + Providers: config.ProvidersConfig{ + Groq: config.ProviderConfig{APIKey: "sk-groq-direct"}, + }, + }, + wantName: "groq", + }, + { + name: "groq via model list", + cfg: &config.Config{ + ModelList: []config.ModelConfig{ + {Model: "openai/gpt-4o", APIKey: "sk-openai"}, + {Model: "groq/llama-3.3-70b", APIKey: "sk-groq-model"}, + }, + }, + wantName: "groq", + }, + { + name: "groq model list entry without key is skipped", + cfg: &config.Config{ + ModelList: []config.ModelConfig{ + {Model: "groq/llama-3.3-70b", APIKey: ""}, + }, + }, + wantNil: true, + }, + { + name: "provider key takes priority over model list", + cfg: &config.Config{ + Providers: config.ProvidersConfig{ + Groq: config.ProviderConfig{APIKey: "sk-groq-direct"}, + }, + ModelList: []config.ModelConfig{ + {Model: "groq/llama-3.3-70b", APIKey: "sk-groq-model"}, + }, + }, + wantName: "groq", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + tr := DetectTranscriber(tc.cfg) + if tc.wantNil { + if tr != nil { + t.Errorf("DetectTranscriber() = %v, want nil", tr) + } + return + } + if tr == nil { + t.Fatal("DetectTranscriber() = nil, want non-nil") + } + if got := tr.Name(); got != tc.wantName { + t.Errorf("Name() = %q, want %q", got, tc.wantName) + } + }) + } +} + +func TestTranscribe(t *testing.T) { + // Write a minimal fake audio file so the transcriber can open and send it. + tmpDir := t.TempDir() + audioPath := filepath.Join(tmpDir, "clip.ogg") + if err := os.WriteFile(audioPath, []byte("fake-audio-data"), 0o644); err != nil { + t.Fatalf("failed to write fake audio file: %v", err) + } + + t.Run("success", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/audio/transcriptions" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if r.Header.Get("Authorization") != "Bearer sk-test" { + t.Errorf("unexpected Authorization header: %s", r.Header.Get("Authorization")) + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(TranscriptionResponse{ + Text: "hello world", + Language: "en", + Duration: 1.5, + }) + })) + defer srv.Close() + + tr := NewGroqTranscriber("sk-test") + tr.apiBase = srv.URL + + resp, err := tr.Transcribe(context.Background(), audioPath) + if err != nil { + t.Fatalf("Transcribe() error: %v", err) + } + if resp.Text != "hello world" { + t.Errorf("Text = %q, want %q", resp.Text, "hello world") + } + if resp.Language != "en" { + t.Errorf("Language = %q, want %q", resp.Language, "en") + } + }) + + t.Run("api error", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, `{"error":"invalid_api_key"}`, http.StatusUnauthorized) + })) + defer srv.Close() + + tr := NewGroqTranscriber("sk-bad") + tr.apiBase = srv.URL + + _, err := tr.Transcribe(context.Background(), audioPath) + if err == nil { + t.Fatal("expected error for non-200 response, got nil") + } + }) + + t.Run("missing file", func(t *testing.T) { + tr := NewGroqTranscriber("sk-test") + _, err := tr.Transcribe(context.Background(), filepath.Join(tmpDir, "nonexistent.ogg")) + if err == nil { + t.Fatal("expected error for missing file, got nil") + } + }) +}