diff --git a/pkg/agent/loop_test.go b/pkg/agent/loop_test.go index a37873711..976d25c4b 100644 --- a/pkg/agent/loop_test.go +++ b/pkg/agent/loop_test.go @@ -1717,7 +1717,7 @@ func TestProcessMessage_PublishesReasoningContentToReasoningChannel(t *testing.T Agents: config.AgentsConfig{ Defaults: config.AgentDefaults{ Workspace: tmpDir, - Model: "test-model", + ModelName: "test-model", MaxTokens: 4096, MaxToolIterations: 10, }, diff --git a/pkg/channels/telegram/telegram.go b/pkg/channels/telegram/telegram.go index f62d6d008..d0011d21b 100644 --- a/pkg/channels/telegram/telegram.go +++ b/pkg/channels/telegram/telegram.go @@ -481,13 +481,26 @@ func (c *TelegramChannel) SendMedia(ctx context.Context, msg bus.OutboundMediaMe _, err = c.bot.SendDocument(ctx, docParams) } case "audio": - params := &telego.SendAudioParams{ - ChatID: tu.ID(chatID), - MessageThreadID: threadID, - Audio: telego.InputFile{File: file}, - Caption: part.Caption, + // Send OGG files with "voice" in the filename as Telegram voice + // bubbles (SendVoice) instead of audio attachments (SendAudio). 
+			fn := strings.ToLower(part.Filename)
+			if strings.Contains(fn, "voice") && (strings.HasSuffix(fn, ".ogg") || strings.HasSuffix(fn, ".oga")) {
+				vparams := &telego.SendVoiceParams{
+					ChatID:          tu.ID(chatID),
+					MessageThreadID: threadID,
+					Voice:           telego.InputFile{File: file},
+					Caption:         part.Caption,
+				}
+				_, err = c.bot.SendVoice(ctx, vparams)
+			} else {
+				params := &telego.SendAudioParams{
+					ChatID:          tu.ID(chatID),
+					MessageThreadID: threadID,
+					Audio:           telego.InputFile{File: file},
+					Caption:         part.Caption,
+				}
+				_, err = c.bot.SendAudio(ctx, params)
 			}
-			_, err = c.bot.SendAudio(ctx, params)
 		case "video":
 			params := &telego.SendVideoParams{
 				ChatID:          tu.ID(chatID),
diff --git a/pkg/config/config.go b/pkg/config/config.go
index 68cfdcb54..9f61e2188 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -928,8 +928,10 @@ type DevicesConfig struct {
 }
 
 type VoiceConfig struct {
-	ModelName         string `json:"model_name,omitempty" env:"PICOCLAW_VOICE_MODEL_NAME"`
-	EchoTranscription bool   `json:"echo_transcription"   env:"PICOCLAW_VOICE_ECHO_TRANSCRIPTION"`
+	ModelName         string `json:"model_name,omitempty"         env:"PICOCLAW_VOICE_MODEL_NAME"`
+	EchoTranscription bool   `json:"echo_transcription"           env:"PICOCLAW_VOICE_ECHO_TRANSCRIPTION"`
+	// ElevenLabsAPIKey, when non-blank, lets voice.DetectTranscriber pick the ElevenLabs transcriber.
+	ElevenLabsAPIKey  string `json:"elevenlabs_api_key,omitempty" env:"PICOCLAW_VOICE_ELEVENLABS_API_KEY"`
 }
 
 // ModelConfig represents a model-centric provider configuration.
diff --git a/pkg/voice/elevenlabs_transcriber.go b/pkg/voice/elevenlabs_transcriber.go
new file mode 100644
index 000000000..93db10f8d
--- /dev/null
+++ b/pkg/voice/elevenlabs_transcriber.go
@@ -0,0 +1,167 @@
+package voice
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"mime/multipart"
+	"net/http"
+	"os"
+	"path/filepath"
+	"time"
+
+	"github.com/sipeed/picoclaw/pkg/logger"
+	"github.com/sipeed/picoclaw/pkg/utils"
+)
+
+// ElevenLabsTranscriber uses the ElevenLabs Scribe API for speech-to-text.
+type ElevenLabsTranscriber struct {
+	apiKey     string
+	apiBase    string
+	httpClient *http.Client
+}
+
+// NewElevenLabsTranscriber returns a transcriber that sends apiKey in the
+// Xi-Api-Key header of requests to https://api.elevenlabs.io, using an HTTP
+// client with a 120-second timeout.
+func NewElevenLabsTranscriber(apiKey string) *ElevenLabsTranscriber {
+	logger.DebugCF("voice", "Creating ElevenLabs transcriber", map[string]any{"has_api_key": apiKey != ""})
+
+	return &ElevenLabsTranscriber{
+		apiKey:  apiKey,
+		apiBase: "https://api.elevenlabs.io",
+		httpClient: &http.Client{
+			Timeout: 120 * time.Second,
+		},
+	}
+}
+
+// Transcribe uploads the audio file at audioFilePath to the ElevenLabs
+// speech-to-text endpoint as a multipart form (model_id "scribe_v1") and
+// decodes the JSON reply into a TranscriptionResponse.
+//
+// It returns an error if the file cannot be opened or read, the request cannot
+// be built or sent, the API answers with a status other than 200, or the
+// response body cannot be decoded as JSON.
+func (t *ElevenLabsTranscriber) Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error) {
+	logger.InfoCF("voice", "Starting ElevenLabs transcription", map[string]any{"audio_file": audioFilePath})
+
+	audioFile, err := os.Open(audioFilePath)
+	if err != nil {
+		logger.ErrorCF("voice", "Failed to open audio file", map[string]any{"path": audioFilePath, "error": err})
+		return nil, fmt.Errorf("failed to open audio file: %w", err)
+	}
+	defer audioFile.Close()
+
+	fileInfo, err := audioFile.Stat()
+	if err != nil {
+		logger.ErrorCF("voice", "Failed to get file info", map[string]any{"path": audioFilePath, "error": err})
+		return nil, fmt.Errorf("failed to get file info: %w", err)
+	}
+
+	logger.DebugCF("voice", "Audio file details", map[string]any{
+		"size_bytes": fileInfo.Size(),
+		"file_name":  filepath.Base(audioFilePath),
+	})
+
+	// The whole multipart upload is assembled in memory before it is sent.
+	var requestBody bytes.Buffer
+	writer := multipart.NewWriter(&requestBody)
+
+	part, err := writer.CreateFormFile("file", filepath.Base(audioFilePath))
+	if err != nil {
+		logger.ErrorCF("voice", "Failed to create form file", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to create form file: %w", err)
+	}
+
+	if _, err = io.Copy(part, audioFile); err != nil {
+		logger.ErrorCF("voice", "Failed to copy file content", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to copy file content: %w", err)
+	}
+
+	if err = writer.WriteField("model_id", "scribe_v1"); err != nil {
+		return nil, fmt.Errorf("failed to write model_id field: %w", err)
+	}
+
+	// Close writes the trailing multipart boundary; it must happen before the
+	// buffer is handed to the request.
+	if err = writer.Close(); err != nil {
+		logger.ErrorCF("voice", "Failed to close multipart writer", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to close multipart writer: %w", err)
+	}
+
+	url := t.apiBase + "/v1/speech-to-text"
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, &requestBody)
+	if err != nil {
+		logger.ErrorCF("voice", "Failed to create request", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+
+	req.Header.Set("Content-Type", writer.FormDataContentType())
+	req.Header.Set("Xi-Api-Key", t.apiKey)
+
+	logger.DebugCF("voice", "Sending transcription request to ElevenLabs API", map[string]any{
+		"url":                url,
+		"request_size_bytes": requestBody.Len(),
+		"file_size_bytes":    fileInfo.Size(),
+	})
+
+	resp, err := t.httpClient.Do(req)
+	if err != nil {
+		logger.ErrorCF("voice", "Failed to send request", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to send request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		logger.ErrorCF("voice", "Failed to read response", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to read response: %w", err)
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		logger.ErrorCF("voice", "ElevenLabs API error", map[string]any{
+			"status_code": resp.StatusCode,
+			"response":    string(body),
+		})
+		return nil, fmt.Errorf("ElevenLabs API error (status %d): %s", resp.StatusCode, string(body))
+	}
+
+	logger.DebugCF("voice", "Received response from ElevenLabs API", map[string]any{
+		"status_code":         resp.StatusCode,
+		"response_size_bytes": len(body),
+	})
+
+	var result TranscriptionResponse
+	if err := json.Unmarshal(body, &result); err != nil {
+		logger.ErrorCF("voice", "Failed to unmarshal response", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to unmarshal response: %w", err)
+	}
+
+	// ElevenLabs reports the detected language as "language_code". When the
+	// decode above left Language empty, fill it from that field; a failure of
+	// this best-effort second decode is deliberately ignored.
+	if result.Language == "" {
+		var extra struct {
+			LanguageCode string `json:"language_code"`
+		}
+		if err := json.Unmarshal(body, &extra); err == nil {
+			result.Language = extra.LanguageCode
+		}
+	}
+
+	logger.InfoCF("voice", "ElevenLabs transcription completed successfully", map[string]any{
+		"text_length":           len(result.Text),
+		"language":              result.Language,
+		"transcription_preview": utils.Truncate(result.Text, 50),
+	})
+
+	return &result, nil
+}
+
+// Name returns the identifier of this transcriber, "elevenlabs".
+func (t *ElevenLabsTranscriber) Name() string {
+	return "elevenlabs"
+}
diff --git a/pkg/voice/elevenlabs_transcriber_test.go b/pkg/voice/elevenlabs_transcriber_test.go
new file mode 100644
index 000000000..78be8958a
--- /dev/null
+++ b/pkg/voice/elevenlabs_transcriber_test.go
@@ -0,0 +1,102 @@
+package voice
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// Ensure ElevenLabsTranscriber satisfies the Transcriber interface at compile time.
+var _ Transcriber = (*ElevenLabsTranscriber)(nil)
+
+func TestElevenLabsTranscriberName(t *testing.T) {
+	tr := NewElevenLabsTranscriber("sk_test")
+	if got := tr.Name(); got != "elevenlabs" {
+		t.Errorf("Name() = %q, want %q", got, "elevenlabs")
+	}
+}
+
+func TestElevenLabsTranscribe(t *testing.T) {
+	tmpDir := t.TempDir()
+	audioPath := filepath.Join(tmpDir, "clip.ogg")
+	if err := os.WriteFile(audioPath, []byte("fake-audio-data"), 0o644); err != nil {
+		t.Fatalf("failed to write fake audio file: %v", err)
+	}
+
+	t.Run("success", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.URL.Path != "/v1/speech-to-text" {
+				t.Errorf("unexpected path: %s", r.URL.Path)
+			}
+			if r.Header.Get("Xi-Api-Key") != "sk_test" {
+				t.Errorf("unexpected xi-api-key header: %s", r.Header.Get("Xi-Api-Key"))
+			}
+			w.Header().Set("Content-Type", "application/json")
+			_ = json.NewEncoder(w).Encode(TranscriptionResponse{
+				Text:     "hello from elevenlabs",
+				Language: "en",
+			})
+		}))
+		defer srv.Close()
+
+		tr := NewElevenLabsTranscriber("sk_test")
+		tr.apiBase = srv.URL
+
+		resp, err := tr.Transcribe(context.Background(), audioPath)
+		if err != nil {
+			t.Fatalf("Transcribe() error: %v", err)
+		}
+		if resp.Text != "hello from elevenlabs" {
+			t.Errorf("Text = %q, want %q", resp.Text, "hello from elevenlabs")
+		}
+		if resp.Language != "en" {
+			t.Errorf("Language = %q, want %q", resp.Language, "en")
+		}
+	})
+
+	t.Run("language_code field", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.Header().Set("Content-Type", "application/json")
+			_, _ = w.Write([]byte(`{"language_code":"en","text":"hello"}`))
+		}))
+		defer srv.Close()
+
+		tr := NewElevenLabsTranscriber("sk_test")
+		tr.apiBase = srv.URL
+
+		resp, err := tr.Transcribe(context.Background(), audioPath)
+		if err != nil {
+			t.Fatalf("Transcribe() error: %v", err)
+		}
+		if resp.Language != "en" {
+			t.Errorf("Language = %q, want %q", resp.Language, "en")
+		}
+	})
+
+	t.Run("api error", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			http.Error(w, `{"error":"invalid_api_key"}`, http.StatusUnauthorized)
+		}))
+		defer srv.Close()
+
+		tr := NewElevenLabsTranscriber("sk_bad")
+		tr.apiBase = srv.URL
+
+		_, err := tr.Transcribe(context.Background(), audioPath)
+		if err == nil {
+			t.Fatal("expected error for non-200 response, got nil")
+		}
+	})
+
+	t.Run("missing file", func(t *testing.T) {
+		tr := NewElevenLabsTranscriber("sk_test")
+		_, err := tr.Transcribe(context.Background(), filepath.Join(tmpDir, "nonexistent.ogg"))
+		if err == nil {
+			t.Fatal("expected error for missing file, got nil")
+		}
+	})
+}
diff --git a/pkg/voice/transcriber.go b/pkg/voice/transcriber.go
index a50fba8f8..f56fdeedd 100644
--- a/pkg/voice/transcriber.go
+++ b/pkg/voice/transcriber.go
@@ -54,6 +54,10 @@ func DetectTranscriber(cfg *config.Config) Transcriber {
 		}
 	}
 
+	// ElevenLabs voice config (supports Scribe STT).
+	if key := strings.TrimSpace(cfg.Voice.ElevenLabsAPIKey); key != "" {
+		return NewElevenLabsTranscriber(key)
+	}
 	// Fall back to any model-list entry that uses the groq/ protocol.
for _, mc := range cfg.ModelList { if strings.HasPrefix(mc.Model, "groq/") && mc.APIKey() != "" { diff --git a/pkg/voice/transcriber_test.go b/pkg/voice/transcriber_test.go index 20ba5388b..70a7fca8f 100644 --- a/pkg/voice/transcriber_test.go +++ b/pkg/voice/transcriber_test.go @@ -145,6 +145,48 @@ func TestDetectTranscriber(t *testing.T) { }), wantNil: true, }, + { + name: "elevenlabs voice config key", + cfg: &config.Config{ + Voice: config.VoiceConfig{ElevenLabsAPIKey: "sk_elevenlabs_test"}, + }, + wantName: "elevenlabs", + }, + { + name: "elevenlabs takes priority over groq model list", + cfg: (&config.Config{ + Voice: config.VoiceConfig{ElevenLabsAPIKey: "sk_elevenlabs_test"}, + ModelList: []*config.ModelConfig{ + {ModelName: "groq", Model: "groq/llama-3.3-70b"}, + }, + }).WithSecurity(&config.SecurityConfig{ + ModelList: map[string]config.ModelSecurityEntry{ + "groq": { + APIKeys: []string{"sk-groq-direct"}, + }, + }, + }), + wantName: "elevenlabs", + }, + { + name: "voice model name takes priority over elevenlabs", + cfg: (&config.Config{ + Voice: config.VoiceConfig{ + ModelName: "voice-gemini", + ElevenLabsAPIKey: "sk_elevenlabs_test", + }, + ModelList: []*config.ModelConfig{ + {ModelName: "voice-gemini", Model: "gemini/gemini-2.5-flash"}, + }, + }).WithSecurity(&config.SecurityConfig{ + ModelList: map[string]config.ModelSecurityEntry{ + "voice-gemini": { + APIKeys: []string{"sk-gemini-model"}, + }, + }, + }), + wantName: "audio-model", + }, } for _, tc := range tests {