feat: add ElevenLabs Scribe STT transcriber and Telegram SendVoice support (#1905)

* feat: add ElevenLabs Scribe STT transcriber and Telegram SendVoice support Add ElevenLabsTranscriber as an alternative speech-to-text provider using the ElevenLabs Scribe API (scribe_v1). This enables voice message transcription for users who already have an ElevenLabs API key, without requiring a separate Groq account. Changes: - Add ElevenLabsTranscriber implementing the Transcriber interface - Update DetectTranscriber to check providers.elevenlabs.api_key first, falling back to Groq for backward compatibility - Add ElevenLabs to ProvidersConfig - Add "voice" media type for OGG files with "voice" in filename - Add SendVoice support in Telegram channel for voice bubble messages - Add comprehensive tests for ElevenLabs transcriber Configuration: "providers": { "elevenlabs": { "api_key": "sk_your_key_here" } } Closes #1503 (partial) * fix: move voice-bubble detection into Telegram channel to avoid regression in other channels Address review feedback: keep inferMediaType returning "audio" for all OGG files. Voice-bubble detection (SendVoice vs SendAudio) is now done inside the Telegram channel based on filename, so other channels that map "audio" explicitly are unaffected. * fix: align VoiceConfig struct tags to pass golines formatter Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix(agent): use ModelName in loop test added by upstream Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-12 18:08:54 +00:00 · 2026-03-23 22:11:10 +01:00
parent f06173a5e0
commit dd9adf8a04
7 changed files with 293 additions and 9 deletions
@@ -1717,7 +1717,7 @@ func TestProcessMessage_PublishesReasoningContentToReasoningChannel(t *testing.T
 		Agents: config.AgentsConfig{
 			Defaults: config.AgentDefaults{
 				Workspace:         tmpDir,
-				Model:             "test-model",
+				ModelName:         "test-model",
 				MaxTokens:         4096,
 				MaxToolIterations: 10,
 			},
@@ -481,13 +481,26 @@ func (c *TelegramChannel) SendMedia(ctx context.Context, msg bus.OutboundMediaMe
 				_, err = c.bot.SendDocument(ctx, docParams)
 			}
 		case "audio":
-			params := &telego.SendAudioParams{
-				ChatID:          tu.ID(chatID),
-				MessageThreadID: threadID,
-				Audio:           telego.InputFile{File: file},
-				Caption:         part.Caption,
+			// Send OGG files with "voice" in the filename as Telegram voice
+			// bubbles (SendVoice) instead of audio attachments (SendAudio).
+			fn := strings.ToLower(part.Filename)
+			if strings.Contains(fn, "voice") && (strings.HasSuffix(fn, ".ogg") || strings.HasSuffix(fn, ".oga")) {
+				vparams := &telego.SendVoiceParams{
+					ChatID:          tu.ID(chatID),
+					MessageThreadID: threadID,
+					Voice:           telego.InputFile{File: file},
+					Caption:         part.Caption,
+				}
+				_, err = c.bot.SendVoice(ctx, vparams)
+			} else {
+				params := &telego.SendAudioParams{
+					ChatID:          tu.ID(chatID),
+					MessageThreadID: threadID,
+					Audio:           telego.InputFile{File: file},
+					Caption:         part.Caption,
+				}
+				_, err = c.bot.SendAudio(ctx, params)
 			}
-			_, err = c.bot.SendAudio(ctx, params)
 		case "video":
 			params := &telego.SendVideoParams{
 				ChatID:          tu.ID(chatID),
@@ -928,8 +928,9 @@ type DevicesConfig struct {
 }

 type VoiceConfig struct {
-	ModelName         string `json:"model_name,omitempty" env:"PICOCLAW_VOICE_MODEL_NAME"`
-	EchoTranscription bool   `json:"echo_transcription"   env:"PICOCLAW_VOICE_ECHO_TRANSCRIPTION"`
+	ModelName         string `json:"model_name,omitempty"         env:"PICOCLAW_VOICE_MODEL_NAME"`
+	EchoTranscription bool   `json:"echo_transcription"           env:"PICOCLAW_VOICE_ECHO_TRANSCRIPTION"`
+	ElevenLabsAPIKey  string `json:"elevenlabs_api_key,omitempty" env:"PICOCLAW_VOICE_ELEVENLABS_API_KEY"`
 }

 // ModelConfig represents a model-centric provider configuration.
@@ -0,0 +1,141 @@
+package voice
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"mime/multipart"
+	"net/http"
+	"os"
+	"path/filepath"
+	"time"
+
+	"github.com/sipeed/picoclaw/pkg/logger"
+	"github.com/sipeed/picoclaw/pkg/utils"
+)
+
+// ElevenLabsTranscriber is a speech-to-text client for the ElevenLabs Scribe API; construct it with NewElevenLabsTranscriber.
+type ElevenLabsTranscriber struct {
+	apiKey     string // sent as the Xi-Api-Key request header
+	apiBase    string // scheme and host; "/v1/speech-to-text" is appended to build the request URL
+	httpClient *http.Client // client used to send the transcription request
+}
+
+// NewElevenLabsTranscriber returns a transcriber that authenticates with
+// apiKey against https://api.elevenlabs.io. Its HTTP client uses a
+// 120-second timeout. Only whether a key was supplied is logged, never the
+// key itself.
+func NewElevenLabsTranscriber(apiKey string) *ElevenLabsTranscriber {
+	hasKey := apiKey != ""
+	logger.DebugCF("voice", "Creating ElevenLabs transcriber", map[string]any{"has_api_key": hasKey})
+
+	client := &http.Client{Timeout: 120 * time.Second}
+	tr := &ElevenLabsTranscriber{
+		apiKey:     apiKey,
+		apiBase:    "https://api.elevenlabs.io",
+		httpClient: client,
+	}
+	return tr
+}
+
+// Transcribe uploads the audio file at audioFilePath to the Scribe
+// speech-to-text endpoint and returns the decoded transcription.
+//
+// The file is sent as the "file" part of a multipart/form-data body together
+// with model_id "scribe_v1"; the whole body is buffered in memory before the
+// request is issued. The request is bound to ctx.
+//
+// An error is returned when the file cannot be opened or read, the request
+// cannot be built or sent, the response status is not 200 (the error text
+// then carries the status code and response body), or the response body is
+// not valid JSON for TranscriptionResponse.
+func (t *ElevenLabsTranscriber) Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error) {
+	logger.InfoCF("voice", "Starting ElevenLabs transcription", map[string]any{"audio_file": audioFilePath})
+
+	audioFile, err := os.Open(audioFilePath)
+	if err != nil {
+		logger.ErrorCF("voice", "Failed to open audio file", map[string]any{"path": audioFilePath, "error": err})
+		return nil, fmt.Errorf("failed to open audio file: %w", err)
+	}
+	defer audioFile.Close()
+
+	fileInfo, err := audioFile.Stat()
+	if err != nil {
+		logger.ErrorCF("voice", "Failed to get file info", map[string]any{"path": audioFilePath, "error": err})
+		return nil, fmt.Errorf("failed to get file info: %w", err)
+	}
+
+	logger.DebugCF("voice", "Audio file details", map[string]any{
+		"size_bytes": fileInfo.Size(),
+		"file_name":  filepath.Base(audioFilePath),
+	})
+
+	// Assemble the multipart body: the audio bytes plus the model selector.
+	var requestBody bytes.Buffer
+	writer := multipart.NewWriter(&requestBody)
+
+	part, err := writer.CreateFormFile("file", filepath.Base(audioFilePath))
+	if err != nil {
+		logger.ErrorCF("voice", "Failed to create form file", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to create form file: %w", err)
+	}
+
+	if _, err = io.Copy(part, audioFile); err != nil {
+		logger.ErrorCF("voice", "Failed to copy file content", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to copy file content: %w", err)
+	}
+
+	if err = writer.WriteField("model_id", "scribe_v1"); err != nil {
+		logger.ErrorCF("voice", "Failed to write model_id field", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to write model_id field: %w", err)
+	}
+
+	// Close writes the terminating boundary; it must happen before sending.
+	if err = writer.Close(); err != nil {
+		logger.ErrorCF("voice", "Failed to close multipart writer", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to close multipart writer: %w", err)
+	}
+
+	url := t.apiBase + "/v1/speech-to-text"
+	req, err := http.NewRequestWithContext(ctx, "POST", url, &requestBody)
+	if err != nil {
+		logger.ErrorCF("voice", "Failed to create request", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+
+	req.Header.Set("Content-Type", writer.FormDataContentType())
+	req.Header.Set("Xi-Api-Key", t.apiKey)
+
+	logger.DebugCF("voice", "Sending transcription request to ElevenLabs API", map[string]any{
+		"url":                url,
+		"request_size_bytes": requestBody.Len(),
+		"file_size_bytes":    fileInfo.Size(),
+	})
+
+	resp, err := t.httpClient.Do(req)
+	if err != nil {
+		logger.ErrorCF("voice", "Failed to send request", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to send request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	// The body is read before the status check so that it can be included in
+	// the error for non-200 responses.
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		logger.ErrorCF("voice", "Failed to read response", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to read response: %w", err)
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		logger.ErrorCF("voice", "ElevenLabs API error", map[string]any{
+			"status_code": resp.StatusCode,
+			"response":    string(body),
+		})
+		return nil, fmt.Errorf("ElevenLabs API error (status %d): %s", resp.StatusCode, string(body))
+	}
+
+	logger.DebugCF("voice", "Received response from ElevenLabs API", map[string]any{
+		"status_code":         resp.StatusCode,
+		"response_size_bytes": len(body),
+	})
+
+	var result TranscriptionResponse
+	if err := json.Unmarshal(body, &result); err != nil {
+		logger.ErrorCF("voice", "Failed to unmarshal response", map[string]any{"error": err})
+		return nil, fmt.Errorf("failed to unmarshal response: %w", err)
+	}
+
+	// Scribe reports the detected language under "language_code". If decoding
+	// into TranscriptionResponse left Language empty, pick it up from there.
+	// This is best-effort: a decode failure here leaves Language empty rather
+	// than failing a transcription whose text was already decoded.
+	if result.Language == "" {
+		var scribe struct {
+			LanguageCode string `json:"language_code"`
+		}
+		if err := json.Unmarshal(body, &scribe); err == nil {
+			result.Language = scribe.LanguageCode
+		}
+	}
+
+	logger.InfoCF("voice", "ElevenLabs transcription completed successfully", map[string]any{
+		"text_length":           len(result.Text),
+		"language":              result.Language,
+		"transcription_preview": utils.Truncate(result.Text, 50),
+	})
+
+	return &result, nil
+}
+
+func (t *ElevenLabsTranscriber) Name() string {
+	return "elevenlabs"
+}
@@ -0,0 +1,83 @@
+package voice
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// Compile-time assertion that *ElevenLabsTranscriber implements Transcriber; a missing or mistyped method fails the build.
+var _ Transcriber = (*ElevenLabsTranscriber)(nil)
+
+func TestElevenLabsTranscriberName(t *testing.T) {
+	tr := NewElevenLabsTranscriber("sk_test")
+	if got := tr.Name(); got != "elevenlabs" {
+		t.Errorf("Name() = %q, want %q", got, "elevenlabs")
+	}
+}
+
+func TestElevenLabsTranscribe(t *testing.T) {
+	tmpDir := t.TempDir()
+	audioPath := filepath.Join(tmpDir, "clip.ogg")
+	if err := os.WriteFile(audioPath, []byte("fake-audio-data"), 0o644); err != nil {
+		t.Fatalf("failed to write fake audio file: %v", err)
+	}
+
+	t.Run("success", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.URL.Path != "/v1/speech-to-text" {
+				t.Errorf("unexpected path: %s", r.URL.Path)
+			}
+			if r.Header.Get("Xi-Api-Key") != "sk_test" {
+				t.Errorf("unexpected xi-api-key header: %s", r.Header.Get("Xi-Api-Key"))
+			}
+			w.Header().Set("Content-Type", "application/json")
+			_ = json.NewEncoder(w).Encode(TranscriptionResponse{
+				Text:     "hello from elevenlabs",
+				Language: "en",
+			})
+		}))
+		defer srv.Close()
+
+		tr := NewElevenLabsTranscriber("sk_test")
+		tr.apiBase = srv.URL
+
+		resp, err := tr.Transcribe(context.Background(), audioPath)
+		if err != nil {
+			t.Fatalf("Transcribe() error: %v", err)
+		}
+		if resp.Text != "hello from elevenlabs" {
+			t.Errorf("Text = %q, want %q", resp.Text, "hello from elevenlabs")
+		}
+		if resp.Language != "en" {
+			t.Errorf("Language = %q, want %q", resp.Language, "en")
+		}
+	})
+
+	t.Run("api error", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			http.Error(w, `{"error":"invalid_api_key"}`, http.StatusUnauthorized)
+		}))
+		defer srv.Close()
+
+		tr := NewElevenLabsTranscriber("sk_bad")
+		tr.apiBase = srv.URL
+
+		_, err := tr.Transcribe(context.Background(), audioPath)
+		if err == nil {
+			t.Fatal("expected error for non-200 response, got nil")
+		}
+	})
+
+	t.Run("missing file", func(t *testing.T) {
+		tr := NewElevenLabsTranscriber("sk_test")
+		_, err := tr.Transcribe(context.Background(), filepath.Join(tmpDir, "nonexistent.ogg"))
+		if err == nil {
+			t.Fatal("expected error for missing file, got nil")
+		}
+	})
+}
@@ -54,6 +54,10 @@ func DetectTranscriber(cfg *config.Config) Transcriber {
 		}
 	}

+	// ElevenLabs voice config (supports Scribe STT).
+	if key := strings.TrimSpace(cfg.Voice.ElevenLabsAPIKey); key != "" {
+		return NewElevenLabsTranscriber(key)
+	}
 	// Fall back to any model-list entry that uses the groq/ protocol.
 	for _, mc := range cfg.ModelList {
 		if strings.HasPrefix(mc.Model, "groq/") && mc.APIKey() != "" {
@@ -145,6 +145,48 @@ func TestDetectTranscriber(t *testing.T) {
 			}),
 			wantNil: true,
 		},
+		{
+			name: "elevenlabs voice config key",
+			cfg: &config.Config{
+				Voice: config.VoiceConfig{ElevenLabsAPIKey: "sk_elevenlabs_test"},
+			},
+			wantName: "elevenlabs",
+		},
+		{
+			name: "elevenlabs takes priority over groq model list",
+			cfg: (&config.Config{
+				Voice: config.VoiceConfig{ElevenLabsAPIKey: "sk_elevenlabs_test"},
+				ModelList: []*config.ModelConfig{
+					{ModelName: "groq", Model: "groq/llama-3.3-70b"},
+				},
+			}).WithSecurity(&config.SecurityConfig{
+				ModelList: map[string]config.ModelSecurityEntry{
+					"groq": {
+						APIKeys: []string{"sk-groq-direct"},
+					},
+				},
+			}),
+			wantName: "elevenlabs",
+		},
+		{
+			name: "voice model name takes priority over elevenlabs",
+			cfg: (&config.Config{
+				Voice: config.VoiceConfig{
+					ModelName:        "voice-gemini",
+					ElevenLabsAPIKey: "sk_elevenlabs_test",
+				},
+				ModelList: []*config.ModelConfig{
+					{ModelName: "voice-gemini", Model: "gemini/gemini-2.5-flash"},
+				},
+			}).WithSecurity(&config.SecurityConfig{
+				ModelList: map[string]config.ModelSecurityEntry{
+					"voice-gemini": {
+						APIKeys: []string{"sk-gemini-model"},
+					},
+				},
+			}),
+			wantName: "audio-model",
+		},
 	}

 	for _, tc := range tests {