feat: add ElevenLabs Scribe STT transcriber and Telegram SendVoice support (#1905)

* feat: add ElevenLabs Scribe STT transcriber and Telegram SendVoice support

Add ElevenLabsTranscriber as an alternative speech-to-text provider using
the ElevenLabs Scribe API (scribe_v1). This enables voice message
transcription for users who already have an ElevenLabs API key, without
requiring a separate Groq account.

Changes:
- Add ElevenLabsTranscriber implementing the Transcriber interface
- Update DetectTranscriber to use voice.elevenlabs_api_key when it is set;
  a configured voice.model_name still takes priority, and Groq model-list
  entries remain the fallback for backward compatibility
- Add ElevenLabsAPIKey to VoiceConfig
- Add "voice" media type for OGG files with "voice" in filename
- Add SendVoice support in Telegram channel for voice bubble messages
- Add comprehensive tests for ElevenLabs transcriber

Configuration (or set the PICOCLAW_VOICE_ELEVENLABS_API_KEY environment variable):
  "voice": {
    "elevenlabs_api_key": "sk_your_key_here"
  }

Closes #1503 (partial)

* fix: move voice-bubble detection into Telegram channel to avoid regression in other channels

Address review feedback: keep inferMediaType returning "audio" for all
OGG files. Voice-bubble detection (SendVoice vs SendAudio) is now done
inside the Telegram channel based on filename, so other channels that
map "audio" explicitly are unaffected.

* fix: align VoiceConfig struct tags to pass golines formatter

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix(agent): use ModelName in loop test added by upstream

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Orkun Manap
2026-03-23 22:11:10 +01:00
committed by GitHub
parent f06173a5e0
commit dd9adf8a04
7 changed files with 293 additions and 9 deletions
+1 -1
View File
@@ -1717,7 +1717,7 @@ func TestProcessMessage_PublishesReasoningContentToReasoningChannel(t *testing.T
Agents: config.AgentsConfig{
Defaults: config.AgentDefaults{
Workspace: tmpDir,
Model: "test-model",
ModelName: "test-model",
MaxTokens: 4096,
MaxToolIterations: 10,
},
+19 -6
View File
@@ -481,13 +481,26 @@ func (c *TelegramChannel) SendMedia(ctx context.Context, msg bus.OutboundMediaMe
_, err = c.bot.SendDocument(ctx, docParams)
}
case "audio":
params := &telego.SendAudioParams{
ChatID: tu.ID(chatID),
MessageThreadID: threadID,
Audio: telego.InputFile{File: file},
Caption: part.Caption,
// Send OGG files with "voice" in the filename as Telegram voice
// bubbles (SendVoice) instead of audio attachments (SendAudio).
fn := strings.ToLower(part.Filename)
if strings.Contains(fn, "voice") && (strings.HasSuffix(fn, ".ogg") || strings.HasSuffix(fn, ".oga")) {
vparams := &telego.SendVoiceParams{
ChatID: tu.ID(chatID),
MessageThreadID: threadID,
Voice: telego.InputFile{File: file},
Caption: part.Caption,
}
_, err = c.bot.SendVoice(ctx, vparams)
} else {
params := &telego.SendAudioParams{
ChatID: tu.ID(chatID),
MessageThreadID: threadID,
Audio: telego.InputFile{File: file},
Caption: part.Caption,
}
_, err = c.bot.SendAudio(ctx, params)
}
_, err = c.bot.SendAudio(ctx, params)
case "video":
params := &telego.SendVideoParams{
ChatID: tu.ID(chatID),
+3 -2
View File
@@ -928,8 +928,9 @@ type DevicesConfig struct {
}
type VoiceConfig struct {
ModelName string `json:"model_name,omitempty" env:"PICOCLAW_VOICE_MODEL_NAME"`
EchoTranscription bool `json:"echo_transcription" env:"PICOCLAW_VOICE_ECHO_TRANSCRIPTION"`
ModelName string `json:"model_name,omitempty" env:"PICOCLAW_VOICE_MODEL_NAME"`
EchoTranscription bool `json:"echo_transcription" env:"PICOCLAW_VOICE_ECHO_TRANSCRIPTION"`
ElevenLabsAPIKey string `json:"elevenlabs_api_key,omitempty" env:"PICOCLAW_VOICE_ELEVENLABS_API_KEY"`
}
// ModelConfig represents a model-centric provider configuration.
+141
View File
@@ -0,0 +1,141 @@
package voice
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
"os"
"path/filepath"
"time"
"github.com/sipeed/picoclaw/pkg/logger"
"github.com/sipeed/picoclaw/pkg/utils"
)
// ElevenLabsTranscriber is a speech-to-text client for the ElevenLabs
// Scribe API.
type ElevenLabsTranscriber struct {
	// apiKey is the credential sent with each request; apiBase is the
	// scheme-and-host prefix the endpoint path is appended to.
	apiKey, apiBase string
	// httpClient is the client used to send the transcription request.
	httpClient *http.Client
}
// NewElevenLabsTranscriber builds a transcriber that authenticates with
// apiKey against https://api.elevenlabs.io, using an HTTP client with a
// 120-second timeout. Only whether a key was supplied is logged, never the
// key itself.
func NewElevenLabsTranscriber(apiKey string) *ElevenLabsTranscriber {
	hasKey := apiKey != ""
	logger.DebugCF("voice", "Creating ElevenLabs transcriber", map[string]any{"has_api_key": hasKey})

	tr := new(ElevenLabsTranscriber)
	tr.apiKey = apiKey
	tr.apiBase = "https://api.elevenlabs.io"
	tr.httpClient = &http.Client{Timeout: 120 * time.Second}
	return tr
}
// Transcribe uploads the audio file at audioFilePath to the ElevenLabs
// speech-to-text endpoint (<apiBase>/v1/speech-to-text) and returns the
// decoded transcription.
//
// The file is buffered in memory as a multipart/form-data body with two
// parts: "file" (the audio bytes, named after the file's base name) and
// "model_id" (always "scribe_v1"). The request is bound to ctx and carries
// the API key in the Xi-Api-Key header.
//
// An error is returned when the file cannot be opened, stat'ed or read, when
// the request cannot be built or sent, when the API answers with any status
// other than 200 (the response body is included in the error), or when the
// response body cannot be decoded as JSON.
func (t *ElevenLabsTranscriber) Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error) {
	logger.InfoCF("voice", "Starting ElevenLabs transcription", map[string]any{"audio_file": audioFilePath})

	audioFile, err := os.Open(audioFilePath)
	if err != nil {
		logger.ErrorCF("voice", "Failed to open audio file", map[string]any{"path": audioFilePath, "error": err})
		return nil, fmt.Errorf("failed to open audio file: %w", err)
	}
	defer audioFile.Close()

	fileInfo, err := audioFile.Stat()
	if err != nil {
		logger.ErrorCF("voice", "Failed to get file info", map[string]any{"path": audioFilePath, "error": err})
		return nil, fmt.Errorf("failed to get file info: %w", err)
	}

	logger.DebugCF("voice", "Audio file details", map[string]any{
		"size_bytes": fileInfo.Size(),
		"file_name":  filepath.Base(audioFilePath),
	})

	// Build the multipart body: the audio bytes first, then the model selector.
	var requestBody bytes.Buffer
	writer := multipart.NewWriter(&requestBody)

	part, err := writer.CreateFormFile("file", filepath.Base(audioFilePath))
	if err != nil {
		logger.ErrorCF("voice", "Failed to create form file", map[string]any{"error": err})
		return nil, fmt.Errorf("failed to create form file: %w", err)
	}

	if _, err = io.Copy(part, audioFile); err != nil {
		logger.ErrorCF("voice", "Failed to copy file content", map[string]any{"error": err})
		return nil, fmt.Errorf("failed to copy file content: %w", err)
	}

	if err = writer.WriteField("model_id", "scribe_v1"); err != nil {
		// Logged like every other failure path in this method.
		logger.ErrorCF("voice", "Failed to write model_id field", map[string]any{"error": err})
		return nil, fmt.Errorf("failed to write model_id field: %w", err)
	}

	// Close writes the terminating boundary; the body is incomplete without it.
	if err = writer.Close(); err != nil {
		logger.ErrorCF("voice", "Failed to close multipart writer", map[string]any{"error": err})
		return nil, fmt.Errorf("failed to close multipart writer: %w", err)
	}

	url := t.apiBase + "/v1/speech-to-text"
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, &requestBody)
	if err != nil {
		logger.ErrorCF("voice", "Failed to create request", map[string]any{"error": err})
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	req.Header.Set("Content-Type", writer.FormDataContentType())
	req.Header.Set("Xi-Api-Key", t.apiKey)

	logger.DebugCF("voice", "Sending transcription request to ElevenLabs API", map[string]any{
		"url":                url,
		"request_size_bytes": requestBody.Len(),
		"file_size_bytes":    fileInfo.Size(),
	})

	resp, err := t.httpClient.Do(req)
	if err != nil {
		logger.ErrorCF("voice", "Failed to send request", map[string]any{"error": err})
		return nil, fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		logger.ErrorCF("voice", "Failed to read response", map[string]any{"error": err})
		return nil, fmt.Errorf("failed to read response: %w", err)
	}

	if resp.StatusCode != http.StatusOK {
		logger.ErrorCF("voice", "ElevenLabs API error", map[string]any{
			"status_code": resp.StatusCode,
			"response":    string(body),
		})
		return nil, fmt.Errorf("ElevenLabs API error (status %d): %s", resp.StatusCode, string(body))
	}

	logger.DebugCF("voice", "Received response from ElevenLabs API", map[string]any{
		"status_code":         resp.StatusCode,
		"response_size_bytes": len(body),
	})

	var result TranscriptionResponse
	if err := json.Unmarshal(body, &result); err != nil {
		logger.ErrorCF("voice", "Failed to unmarshal response", map[string]any{"error": err})
		return nil, fmt.Errorf("failed to unmarshal response: %w", err)
	}

	// The ElevenLabs speech-to-text response carries the detected language
	// in "language_code". TranscriptionResponse is declared elsewhere and
	// its JSON tags are not visible here, so fall back to that field only
	// when the shared type did not pick up a language on its own.
	if result.Language == "" {
		var extra struct {
			LanguageCode string `json:"language_code"`
		}
		// Best effort: the body already decoded once above, so a failure
		// here only means language_code is absent or not a string.
		if err := json.Unmarshal(body, &extra); err == nil {
			result.Language = extra.LanguageCode
		}
	}

	logger.InfoCF("voice", "ElevenLabs transcription completed successfully", map[string]any{
		"text_length":           len(result.Text),
		"language":              result.Language,
		"transcription_preview": utils.Truncate(result.Text, 50),
	})

	return &result, nil
}
// Name returns the provider identifier for this transcriber, "elevenlabs".
func (t *ElevenLabsTranscriber) Name() string {
	const provider = "elevenlabs"
	return provider
}
+83
View File
@@ -0,0 +1,83 @@
package voice
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"testing"
)
// Compile-time assertion that *ElevenLabsTranscriber implements the
// Transcriber interface; the build fails on this line if it does not.
var _ Transcriber = (*ElevenLabsTranscriber)(nil)
// TestElevenLabsTranscriberName checks that a freshly constructed
// transcriber reports "elevenlabs" as its provider name.
func TestElevenLabsTranscriberName(t *testing.T) {
	const want = "elevenlabs"

	got := NewElevenLabsTranscriber("sk_test").Name()
	if got == want {
		return
	}
	t.Errorf("Name() = %q, want %q", got, want)
}
func TestElevenLabsTranscribe(t *testing.T) {
tmpDir := t.TempDir()
audioPath := filepath.Join(tmpDir, "clip.ogg")
if err := os.WriteFile(audioPath, []byte("fake-audio-data"), 0o644); err != nil {
t.Fatalf("failed to write fake audio file: %v", err)
}
t.Run("success", func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/v1/speech-to-text" {
t.Errorf("unexpected path: %s", r.URL.Path)
}
if r.Header.Get("Xi-Api-Key") != "sk_test" {
t.Errorf("unexpected xi-api-key header: %s", r.Header.Get("Xi-Api-Key"))
}
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(TranscriptionResponse{
Text: "hello from elevenlabs",
Language: "en",
})
}))
defer srv.Close()
tr := NewElevenLabsTranscriber("sk_test")
tr.apiBase = srv.URL
resp, err := tr.Transcribe(context.Background(), audioPath)
if err != nil {
t.Fatalf("Transcribe() error: %v", err)
}
if resp.Text != "hello from elevenlabs" {
t.Errorf("Text = %q, want %q", resp.Text, "hello from elevenlabs")
}
if resp.Language != "en" {
t.Errorf("Language = %q, want %q", resp.Language, "en")
}
})
t.Run("api error", func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
http.Error(w, `{"error":"invalid_api_key"}`, http.StatusUnauthorized)
}))
defer srv.Close()
tr := NewElevenLabsTranscriber("sk_bad")
tr.apiBase = srv.URL
_, err := tr.Transcribe(context.Background(), audioPath)
if err == nil {
t.Fatal("expected error for non-200 response, got nil")
}
})
t.Run("missing file", func(t *testing.T) {
tr := NewElevenLabsTranscriber("sk_test")
_, err := tr.Transcribe(context.Background(), filepath.Join(tmpDir, "nonexistent.ogg"))
if err == nil {
t.Fatal("expected error for missing file, got nil")
}
})
}
+4
View File
@@ -54,6 +54,10 @@ func DetectTranscriber(cfg *config.Config) Transcriber {
}
}
// ElevenLabs voice config (supports Scribe STT).
if key := strings.TrimSpace(cfg.Voice.ElevenLabsAPIKey); key != "" {
return NewElevenLabsTranscriber(key)
}
// Fall back to any model-list entry that uses the groq/ protocol.
for _, mc := range cfg.ModelList {
if strings.HasPrefix(mc.Model, "groq/") && mc.APIKey() != "" {
+42
View File
@@ -145,6 +145,48 @@ func TestDetectTranscriber(t *testing.T) {
}),
wantNil: true,
},
{
name: "elevenlabs voice config key",
cfg: &config.Config{
Voice: config.VoiceConfig{ElevenLabsAPIKey: "sk_elevenlabs_test"},
},
wantName: "elevenlabs",
},
{
name: "elevenlabs takes priority over groq model list",
cfg: (&config.Config{
Voice: config.VoiceConfig{ElevenLabsAPIKey: "sk_elevenlabs_test"},
ModelList: []*config.ModelConfig{
{ModelName: "groq", Model: "groq/llama-3.3-70b"},
},
}).WithSecurity(&config.SecurityConfig{
ModelList: map[string]config.ModelSecurityEntry{
"groq": {
APIKeys: []string{"sk-groq-direct"},
},
},
}),
wantName: "elevenlabs",
},
{
name: "voice model name takes priority over elevenlabs",
cfg: (&config.Config{
Voice: config.VoiceConfig{
ModelName: "voice-gemini",
ElevenLabsAPIKey: "sk_elevenlabs_test",
},
ModelList: []*config.ModelConfig{
{ModelName: "voice-gemini", Model: "gemini/gemini-2.5-flash"},
},
}).WithSecurity(&config.SecurityConfig{
ModelList: map[string]config.ModelSecurityEntry{
"voice-gemini": {
APIKeys: []string{"sk-gemini-model"},
},
},
}),
wantName: "audio-model",
},
}
for _, tc := range tests {