mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
Fix voice transcription
This commit is contained in:
@@ -18,6 +18,8 @@ import (
|
||||
"time"
|
||||
"unicode/utf8"
|
||||
|
||||
"regexp"
|
||||
|
||||
"github.com/sipeed/picoclaw/pkg/bus"
|
||||
"github.com/sipeed/picoclaw/pkg/channels"
|
||||
"github.com/sipeed/picoclaw/pkg/config"
|
||||
@@ -30,6 +32,7 @@ import (
|
||||
"github.com/sipeed/picoclaw/pkg/state"
|
||||
"github.com/sipeed/picoclaw/pkg/tools"
|
||||
"github.com/sipeed/picoclaw/pkg/utils"
|
||||
"github.com/sipeed/picoclaw/pkg/voice"
|
||||
)
|
||||
|
||||
type AgentLoop struct {
|
||||
@@ -42,6 +45,7 @@ type AgentLoop struct {
|
||||
fallback *providers.FallbackChain
|
||||
channelManager *channels.Manager
|
||||
mediaStore media.MediaStore
|
||||
transcriber voice.Transcriber
|
||||
}
|
||||
|
||||
// processOptions configures how a message is processed
|
||||
@@ -262,6 +266,64 @@ func (al *AgentLoop) SetMediaStore(s media.MediaStore) {
|
||||
al.mediaStore = s
|
||||
}
|
||||
|
||||
// SetTranscriber injects a voice transcriber for agent-level audio transcription.
|
||||
func (al *AgentLoop) SetTranscriber(t voice.Transcriber) {
|
||||
al.transcriber = t
|
||||
}
|
||||
|
||||
// audioAnnotationRe matches the bracketed audio placeholders "[voice]" and
// "[audio]", optionally followed by a ":"-prefixed payload that runs up to the
// closing bracket (for example "[voice:clip.ogg]"). It is compiled once at
// package scope.
var audioAnnotationRe = regexp.MustCompile(`\[(voice|audio)(?::[^\]]*)?\]`)
|
||||
|
||||
// transcribeAudioInMessage resolves audio media refs, transcribes them, and
|
||||
// replaces audio annotations in msg.Content with the transcribed text.
|
||||
func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.InboundMessage) bus.InboundMessage {
|
||||
if al.transcriber == nil || !al.transcriber.IsAvailable() || al.mediaStore == nil || len(msg.Media) == 0 {
|
||||
return msg
|
||||
}
|
||||
|
||||
// Transcribe each audio media ref in order.
|
||||
var transcriptions []string
|
||||
for _, ref := range msg.Media {
|
||||
path, meta, err := al.mediaStore.ResolveWithMeta(ref)
|
||||
if err != nil {
|
||||
logger.WarnCF("voice", "Failed to resolve media ref", map[string]any{"ref": ref, "error": err})
|
||||
continue
|
||||
}
|
||||
if !utils.IsAudioFile(meta.Filename, meta.ContentType) {
|
||||
continue
|
||||
}
|
||||
result, err := al.transcriber.Transcribe(ctx, path)
|
||||
if err != nil {
|
||||
logger.WarnCF("voice", "Transcription failed", map[string]any{"ref": ref, "error": err})
|
||||
transcriptions = append(transcriptions, "")
|
||||
continue
|
||||
}
|
||||
transcriptions = append(transcriptions, result.Text)
|
||||
}
|
||||
|
||||
if len(transcriptions) == 0 {
|
||||
return msg
|
||||
}
|
||||
|
||||
// Replace audio annotations sequentially with transcriptions.
|
||||
idx := 0
|
||||
newContent := audioAnnotationRe.ReplaceAllStringFunc(msg.Content, func(match string) string {
|
||||
if idx >= len(transcriptions) {
|
||||
return match
|
||||
}
|
||||
text := transcriptions[idx]
|
||||
idx++
|
||||
return "[voice: " + text + "]"
|
||||
})
|
||||
|
||||
// Append any remaining transcriptions not matched by an annotation.
|
||||
for ; idx < len(transcriptions); idx++ {
|
||||
newContent += "\n[voice: " + transcriptions[idx] + "]"
|
||||
}
|
||||
|
||||
msg.Content = newContent
|
||||
return msg
|
||||
}
|
||||
|
||||
// inferMediaType determines the media type ("image", "audio", "video", "file")
|
||||
// from a filename and MIME content type.
|
||||
func inferMediaType(filename, contentType string) string {
|
||||
@@ -364,6 +426,8 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
|
||||
"session_key": msg.SessionKey,
|
||||
})
|
||||
|
||||
msg = al.transcribeAudioInMessage(ctx, msg)
|
||||
|
||||
// Route system messages to processSystemMessage
|
||||
if msg.Channel == "system" {
|
||||
return al.processSystemMessage(ctx, msg)
|
||||
|
||||
@@ -16,6 +16,11 @@ import (
|
||||
"github.com/sipeed/picoclaw/pkg/utils"
|
||||
)
|
||||
|
||||
type Transcriber interface {
|
||||
Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error)
|
||||
IsAvailable() bool
|
||||
}
|
||||
|
||||
type GroqTranscriber struct {
|
||||
apiKey string
|
||||
apiBase string
|
||||
|
||||
@@ -0,0 +1,97 @@
|
||||
package voice
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// Ensure GroqTranscriber satisfies the Transcriber interface at compile time.
|
||||
var _ Transcriber = (*GroqTranscriber)(nil)
|
||||
|
||||
func TestIsAvailable(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
apiKey string
|
||||
want bool
|
||||
}{
|
||||
{"with key", "sk-test-key", true},
|
||||
{"empty key", "", false},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
tr := NewGroqTranscriber(tc.apiKey)
|
||||
if got := tr.IsAvailable(); got != tc.want {
|
||||
t.Errorf("IsAvailable() = %v, want %v", got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestTranscribe(t *testing.T) {
|
||||
// Write a minimal fake audio file so the transcriber can open and send it.
|
||||
tmpDir := t.TempDir()
|
||||
audioPath := filepath.Join(tmpDir, "clip.ogg")
|
||||
if err := os.WriteFile(audioPath, []byte("fake-audio-data"), 0o644); err != nil {
|
||||
t.Fatalf("failed to write fake audio file: %v", err)
|
||||
}
|
||||
|
||||
t.Run("success", func(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/audio/transcriptions" {
|
||||
t.Errorf("unexpected path: %s", r.URL.Path)
|
||||
}
|
||||
if r.Header.Get("Authorization") != "Bearer sk-test" {
|
||||
t.Errorf("unexpected Authorization header: %s", r.Header.Get("Authorization"))
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(TranscriptionResponse{
|
||||
Text: "hello world",
|
||||
Language: "en",
|
||||
Duration: 1.5,
|
||||
})
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
tr := NewGroqTranscriber("sk-test")
|
||||
tr.apiBase = srv.URL
|
||||
|
||||
resp, err := tr.Transcribe(context.Background(), audioPath)
|
||||
if err != nil {
|
||||
t.Fatalf("Transcribe() error: %v", err)
|
||||
}
|
||||
if resp.Text != "hello world" {
|
||||
t.Errorf("Text = %q, want %q", resp.Text, "hello world")
|
||||
}
|
||||
if resp.Language != "en" {
|
||||
t.Errorf("Language = %q, want %q", resp.Language, "en")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("api error", func(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, `{"error":"invalid_api_key"}`, http.StatusUnauthorized)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
tr := NewGroqTranscriber("sk-bad")
|
||||
tr.apiBase = srv.URL
|
||||
|
||||
_, err := tr.Transcribe(context.Background(), audioPath)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for non-200 response, got nil")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("missing file", func(t *testing.T) {
|
||||
tr := NewGroqTranscriber("sk-test")
|
||||
_, err := tr.Transcribe(context.Background(), filepath.Join(tmpDir, "nonexistent.ogg"))
|
||||
if err == nil {
|
||||
t.Fatal("expected error for missing file, got nil")
|
||||
}
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user