Merge pull request #947 from dim/fix/transcription

Fix voice transcription
This commit is contained in:
Mauro
2026-03-04 18:37:24 +01:00
committed by GitHub
10 changed files with 261 additions and 10 deletions
+63
View File
@@ -12,6 +12,7 @@ import (
"errors"
"fmt"
"path/filepath"
"regexp"
"strings"
"sync"
"sync/atomic"
@@ -31,6 +32,7 @@ import (
"github.com/sipeed/picoclaw/pkg/state"
"github.com/sipeed/picoclaw/pkg/tools"
"github.com/sipeed/picoclaw/pkg/utils"
"github.com/sipeed/picoclaw/pkg/voice"
)
type AgentLoop struct {
@@ -43,6 +45,7 @@ type AgentLoop struct {
fallback *providers.FallbackChain
channelManager *channels.Manager
mediaStore media.MediaStore
transcriber voice.Transcriber
}
// processOptions configures how a message is processed
@@ -339,6 +342,64 @@ func (al *AgentLoop) SetMediaStore(s media.MediaStore) {
al.mediaStore = s
}
// SetTranscriber installs t as the transcriber the agent loop uses for
// audio attachments. The value is stored as given; when it is nil,
// transcribeAudioInMessage returns messages unchanged.
func (al *AgentLoop) SetTranscriber(t voice.Transcriber) {
	al.transcriber = t
}
// audioAnnotationRe matches the bracketed audio placeholders that appear in
// message content: "[voice]" or "[audio]", optionally followed by a colon and
// any text up to the closing bracket (for example "[voice: clip.ogg]").
var audioAnnotationRe = regexp.MustCompile(`\[(voice|audio)(?::[^\]]*)?\]`)
// transcribeAudioInMessage resolves the media refs attached to msg,
// transcribes every ref that is an audio file, and rewrites the audio
// annotations in msg.Content with the transcribed text.
//
// The message is returned unchanged when no transcriber or media store is
// configured, when the message carries no media, or when none of the refs
// resolves to an audio file.
//
// Annotations matching audioAnnotationRe are paired with audio refs in order:
// the first annotation receives the first audio ref's transcript, and so on.
// A successful transcript replaces its annotation with "[voice: <text>]".
// When a transcription fails, the ref still consumes its annotation slot (so
// later transcripts stay aligned) but the original annotation text is kept
// rather than being overwritten with an empty "[voice: ]". Successful
// transcripts that have no annotation left to replace are appended to the
// content, one per line.
func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.InboundMessage) bus.InboundMessage {
	if al.transcriber == nil || al.mediaStore == nil || len(msg.Media) == 0 {
		return msg
	}

	// transcript records the outcome for one audio ref. Failed entries are
	// kept (ok == false) purely to preserve positional alignment.
	type transcript struct {
		text string
		ok   bool
	}

	// Transcribe each audio media ref in order.
	var transcripts []transcript
	for _, ref := range msg.Media {
		path, meta, err := al.mediaStore.ResolveWithMeta(ref)
		if err != nil {
			logger.WarnCF("voice", "Failed to resolve media ref", map[string]any{"ref": ref, "error": err})
			continue
		}
		if !utils.IsAudioFile(meta.Filename, meta.ContentType) {
			continue
		}
		result, err := al.transcriber.Transcribe(ctx, path)
		if err != nil {
			logger.WarnCF("voice", "Transcription failed", map[string]any{"ref": ref, "error": err})
			transcripts = append(transcripts, transcript{})
			continue
		}
		transcripts = append(transcripts, transcript{text: result.Text, ok: true})
	}

	if len(transcripts) == 0 {
		return msg
	}

	// Replace audio annotations sequentially with transcriptions.
	idx := 0
	replaced := audioAnnotationRe.ReplaceAllStringFunc(msg.Content, func(match string) string {
		if idx >= len(transcripts) {
			return match
		}
		tr := transcripts[idx]
		idx++
		if !tr.ok {
			// Keep the original placeholder instead of an empty transcript.
			return match
		}
		return "[voice: " + tr.text + "]"
	})

	// Append any remaining successful transcriptions not matched by an
	// annotation.
	var b strings.Builder
	b.WriteString(replaced)
	for ; idx < len(transcripts); idx++ {
		if !transcripts[idx].ok {
			continue
		}
		b.WriteString("\n[voice: ")
		b.WriteString(transcripts[idx].text)
		b.WriteString("]")
	}

	msg.Content = b.String()
	return msg
}
// inferMediaType determines the media type ("image", "audio", "video", "file")
// from a filename and MIME content type.
func inferMediaType(filename, contentType string) string {
@@ -450,6 +511,8 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
},
)
msg = al.transcribeAudioInMessage(ctx, msg)
// Route system messages to processSystemMessage
if msg.Channel == "system" {
return al.processSystemMessage(ctx, msg)
+25 -4
View File
@@ -10,12 +10,19 @@ import (
"net/http"
"os"
"path/filepath"
"strings"
"time"
"github.com/sipeed/picoclaw/pkg/config"
"github.com/sipeed/picoclaw/pkg/logger"
"github.com/sipeed/picoclaw/pkg/utils"
)
// Transcriber is the interface implemented by speech-to-text backends.
// GroqTranscriber is the implementation provided in this package.
type Transcriber interface {
	// Name returns the identifier of the transcriber.
	Name() string

	// Transcribe processes the audio file at audioFilePath and returns the
	// resulting TranscriptionResponse, or an error.
	Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error)
}
type GroqTranscriber struct {
apiKey string
apiBase string
@@ -152,8 +159,22 @@ func (t *GroqTranscriber) Transcribe(ctx context.Context, audioFilePath string)
return &result, nil
}
func (t *GroqTranscriber) IsAvailable() bool {
available := t.apiKey != ""
logger.DebugCF("voice", "Checking transcriber availability", map[string]any{"available": available})
return available
// Name reports the fixed identifier of this transcriber, "groq".
func (t *GroqTranscriber) Name() string {
	const name = "groq"
	return name
}
// DetectTranscriber inspects cfg and returns the appropriate Transcriber, or
// nil if no supported transcription provider is configured.
//
// Lookup order:
//  1. cfg.Providers.Groq.APIKey, when non-empty.
//  2. The first cfg.ModelList entry whose Model starts with "groq/" and whose
//     APIKey is non-empty.
//
// A nil cfg is treated as "nothing configured" and yields nil.
func DetectTranscriber(cfg *config.Config) Transcriber {
	if cfg == nil {
		return nil
	}

	// Direct Groq provider config takes priority.
	if key := cfg.Providers.Groq.APIKey; key != "" {
		return NewGroqTranscriber(key)
	}

	// Fall back to any model-list entry that uses the groq/ protocol.
	for _, mc := range cfg.ModelList {
		if strings.HasPrefix(mc.Model, "groq/") && mc.APIKey != "" {
			return NewGroqTranscriber(mc.APIKey)
		}
	}

	// Return a literal nil so callers comparing against nil see an empty
	// interface rather than a typed nil pointer.
	return nil
}
+160
View File
@@ -0,0 +1,160 @@
package voice
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"testing"
"github.com/sipeed/picoclaw/pkg/config"
)
// Compile-time check: the build fails if *GroqTranscriber stops implementing
// Transcriber.
var _ Transcriber = (*GroqTranscriber)(nil)
// TestGroqTranscriberName checks that a Groq transcriber identifies itself
// as "groq".
func TestGroqTranscriberName(t *testing.T) {
	const want = "groq"

	got := NewGroqTranscriber("sk-test").Name()
	if got != want {
		t.Errorf("Name() = %q, want %q", got, want)
	}
}
// TestDetectTranscriber covers the provider-selection rules of
// DetectTranscriber: no configuration, a direct Groq provider key, a groq/
// model-list entry, a groq/ entry without a key, and both sources at once.
//
// Every Groq path reports the name "groq", so the name alone cannot tell which
// source was chosen. Each non-nil case therefore also asserts the API key held
// by the returned transcriber.
// NOTE(review): this assumes NewGroqTranscriber stores its argument in the
// apiKey field — confirm against the constructor.
func TestDetectTranscriber(t *testing.T) {
	tests := []struct {
		name     string
		cfg      *config.Config
		wantNil  bool
		wantName string
		wantKey  string // API key the selected transcriber must carry
	}{
		{
			name:    "no config",
			cfg:     &config.Config{},
			wantNil: true,
		},
		{
			name: "groq provider key",
			cfg: &config.Config{
				Providers: config.ProvidersConfig{
					Groq: config.ProviderConfig{APIKey: "sk-groq-direct"},
				},
			},
			wantName: "groq",
			wantKey:  "sk-groq-direct",
		},
		{
			name: "groq via model list",
			cfg: &config.Config{
				ModelList: []config.ModelConfig{
					{Model: "openai/gpt-4o", APIKey: "sk-openai"},
					{Model: "groq/llama-3.3-70b", APIKey: "sk-groq-model"},
				},
			},
			wantName: "groq",
			wantKey:  "sk-groq-model",
		},
		{
			name: "groq model list entry without key is skipped",
			cfg: &config.Config{
				ModelList: []config.ModelConfig{
					{Model: "groq/llama-3.3-70b", APIKey: ""},
				},
			},
			wantNil: true,
		},
		{
			name: "provider key takes priority over model list",
			cfg: &config.Config{
				Providers: config.ProvidersConfig{
					Groq: config.ProviderConfig{APIKey: "sk-groq-direct"},
				},
				ModelList: []config.ModelConfig{
					{Model: "groq/llama-3.3-70b", APIKey: "sk-groq-model"},
				},
			},
			wantName: "groq",
			wantKey:  "sk-groq-direct",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			tr := DetectTranscriber(tc.cfg)

			if tc.wantNil {
				if tr != nil {
					t.Errorf("DetectTranscriber() = %v, want nil", tr)
				}
				return
			}
			if tr == nil {
				t.Fatal("DetectTranscriber() = nil, want non-nil")
			}
			if got := tr.Name(); got != tc.wantName {
				t.Errorf("Name() = %q, want %q", got, tc.wantName)
			}

			// Verify which credential was selected, not just the provider.
			groq, ok := tr.(*GroqTranscriber)
			if !ok {
				t.Fatalf("DetectTranscriber() returned %T, want *GroqTranscriber", tr)
			}
			if groq.apiKey != tc.wantKey {
				t.Errorf("apiKey = %q, want %q", groq.apiKey, tc.wantKey)
			}
		})
	}
}
// TestTranscribe exercises GroqTranscriber.Transcribe against local
// httptest servers: a successful JSON response, a non-200 API response, and
// an audio path that does not exist.
func TestTranscribe(t *testing.T) {
	// A throwaway file stands in for real audio; the transcriber only needs
	// something it can open and upload.
	dir := t.TempDir()
	clip := filepath.Join(dir, "clip.ogg")
	if err := os.WriteFile(clip, []byte("fake-audio-data"), 0o644); err != nil {
		t.Fatalf("failed to write fake audio file: %v", err)
	}

	t.Run("success", func(t *testing.T) {
		// The fake endpoint validates the request path and bearer token,
		// then answers with a canned transcription.
		handler := func(w http.ResponseWriter, r *http.Request) {
			if r.URL.Path != "/audio/transcriptions" {
				t.Errorf("unexpected path: %s", r.URL.Path)
			}
			if auth := r.Header.Get("Authorization"); auth != "Bearer sk-test" {
				t.Errorf("unexpected Authorization header: %s", auth)
			}
			w.Header().Set("Content-Type", "application/json")
			canned := TranscriptionResponse{
				Text:     "hello world",
				Language: "en",
				Duration: 1.5,
			}
			_ = json.NewEncoder(w).Encode(canned)
		}
		server := httptest.NewServer(http.HandlerFunc(handler))
		defer server.Close()

		groq := NewGroqTranscriber("sk-test")
		groq.apiBase = server.URL

		got, err := groq.Transcribe(context.Background(), clip)
		if err != nil {
			t.Fatalf("Transcribe() error: %v", err)
		}
		if got.Text != "hello world" {
			t.Errorf("Text = %q, want %q", got.Text, "hello world")
		}
		if got.Language != "en" {
			t.Errorf("Language = %q, want %q", got.Language, "en")
		}
	})

	t.Run("api error", func(t *testing.T) {
		// Any non-200 status must surface as an error from Transcribe.
		reject := func(w http.ResponseWriter, r *http.Request) {
			http.Error(w, `{"error":"invalid_api_key"}`, http.StatusUnauthorized)
		}
		server := httptest.NewServer(http.HandlerFunc(reject))
		defer server.Close()

		groq := NewGroqTranscriber("sk-bad")
		groq.apiBase = server.URL

		if _, err := groq.Transcribe(context.Background(), clip); err == nil {
			t.Fatal("expected error for non-200 response, got nil")
		}
	})

	t.Run("missing file", func(t *testing.T) {
		missing := filepath.Join(dir, "nonexistent.ogg")
		groq := NewGroqTranscriber("sk-test")

		if _, err := groq.Transcribe(context.Background(), missing); err == nil {
			t.Fatal("expected error for missing file, got nil")
		}
	})
}