mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
feat: add ElevenLabs Scribe STT transcriber and Telegram SendVoice support (#1905)
* feat: add ElevenLabs Scribe STT transcriber and Telegram SendVoice support
Add ElevenLabsTranscriber as an alternative speech-to-text provider using
the ElevenLabs Scribe API (scribe_v1). This enables voice message
transcription for users who already have an ElevenLabs API key, without
requiring a separate Groq account.
Changes:
- Add ElevenLabsTranscriber implementing the Transcriber interface
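- Update DetectTranscriber to use voice.elevenlabs_api_key when it is set; a
configured voice model name still takes priority, and ElevenLabs is tried
before the Groq model-list fallback, which is kept for backward compatibility
- Add ElevenLabsAPIKey to VoiceConfig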
- Add "voice" media type for OGG files with "voice" in filename
- Add SendVoice support in Telegram channel for voice bubble messages
- Add comprehensive tests for ElevenLabs transcriber
Configuration:
"voice": {
"elevenlabs_api_key": "sk_your_key_here"
}
(or set the PICOCLAW_VOICE_ELEVENLABS_API_KEY environment variable)
Closes #1503 (partial)
* fix: move voice-bubble detection into Telegram channel to avoid regression in other channels
Address review feedback: keep inferMediaType returning "audio" for all
OGG files. Voice-bubble detection (SendVoice vs SendAudio) is now done
inside the Telegram channel based on filename, so other channels that
map "audio" explicitly are unaffected.
* fix: align VoiceConfig struct tags to pass golines formatter
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
* fix(agent): use ModelName in loop test added by upstream
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---------
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1717,7 +1717,7 @@ func TestProcessMessage_PublishesReasoningContentToReasoningChannel(t *testing.T
|
||||
Agents: config.AgentsConfig{
|
||||
Defaults: config.AgentDefaults{
|
||||
Workspace: tmpDir,
|
||||
Model: "test-model",
|
||||
ModelName: "test-model",
|
||||
MaxTokens: 4096,
|
||||
MaxToolIterations: 10,
|
||||
},
|
||||
|
||||
@@ -481,13 +481,26 @@ func (c *TelegramChannel) SendMedia(ctx context.Context, msg bus.OutboundMediaMe
|
||||
_, err = c.bot.SendDocument(ctx, docParams)
|
||||
}
|
||||
case "audio":
|
||||
params := &telego.SendAudioParams{
|
||||
ChatID: tu.ID(chatID),
|
||||
MessageThreadID: threadID,
|
||||
Audio: telego.InputFile{File: file},
|
||||
Caption: part.Caption,
|
||||
// Send OGG files with "voice" in the filename as Telegram voice
|
||||
// bubbles (SendVoice) instead of audio attachments (SendAudio).
|
||||
fn := strings.ToLower(part.Filename)
|
||||
if strings.Contains(fn, "voice") && (strings.HasSuffix(fn, ".ogg") || strings.HasSuffix(fn, ".oga")) {
|
||||
vparams := &telego.SendVoiceParams{
|
||||
ChatID: tu.ID(chatID),
|
||||
MessageThreadID: threadID,
|
||||
Voice: telego.InputFile{File: file},
|
||||
Caption: part.Caption,
|
||||
}
|
||||
_, err = c.bot.SendVoice(ctx, vparams)
|
||||
} else {
|
||||
params := &telego.SendAudioParams{
|
||||
ChatID: tu.ID(chatID),
|
||||
MessageThreadID: threadID,
|
||||
Audio: telego.InputFile{File: file},
|
||||
Caption: part.Caption,
|
||||
}
|
||||
_, err = c.bot.SendAudio(ctx, params)
|
||||
}
|
||||
_, err = c.bot.SendAudio(ctx, params)
|
||||
case "video":
|
||||
params := &telego.SendVideoParams{
|
||||
ChatID: tu.ID(chatID),
|
||||
|
||||
@@ -928,8 +928,9 @@ type DevicesConfig struct {
|
||||
}
|
||||
|
||||
// VoiceConfig holds the voice (speech-to-text) settings. Each field can be
// set from JSON or from the environment variable named in its env tag.
type VoiceConfig struct {
	// ModelName names a model-list entry to use for transcription. The
	// DetectTranscriber tests show it takes priority over ElevenLabsAPIKey.
	ModelName string `json:"model_name,omitempty" env:"PICOCLAW_VOICE_MODEL_NAME"`
	// EchoTranscription presumably controls whether the transcribed text is
	// echoed back to the user — TODO confirm against the consumer of this flag.
	EchoTranscription bool `json:"echo_transcription" env:"PICOCLAW_VOICE_ECHO_TRANSCRIPTION"`
	// ElevenLabsAPIKey, when non-blank, makes DetectTranscriber select the
	// ElevenLabs Scribe transcriber.
	ElevenLabsAPIKey string `json:"elevenlabs_api_key,omitempty" env:"PICOCLAW_VOICE_ELEVENLABS_API_KEY"`
}
|
||||
|
||||
// ModelConfig represents a model-centric provider configuration.
|
||||
|
||||
@@ -0,0 +1,141 @@
|
||||
package voice
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"github.com/sipeed/picoclaw/pkg/logger"
|
||||
"github.com/sipeed/picoclaw/pkg/utils"
|
||||
)
|
||||
|
||||
// ElevenLabsTranscriber is a speech-to-text client backed by the ElevenLabs
// Scribe API. Use NewElevenLabsTranscriber to obtain a ready-to-use value.
type ElevenLabsTranscriber struct {
	// apiKey is sent in the Xi-Api-Key header of every request.
	apiKey string
	// apiBase is the scheme-and-host prefix the endpoint path is appended to;
	// tests point it at a local httptest server.
	apiBase string
	// httpClient performs the upload request.
	httpClient *http.Client
}
|
||||
|
||||
func NewElevenLabsTranscriber(apiKey string) *ElevenLabsTranscriber {
|
||||
logger.DebugCF("voice", "Creating ElevenLabs transcriber", map[string]any{"has_api_key": apiKey != ""})
|
||||
|
||||
return &ElevenLabsTranscriber{
|
||||
apiKey: apiKey,
|
||||
apiBase: "https://api.elevenlabs.io",
|
||||
httpClient: &http.Client{
|
||||
Timeout: 120 * time.Second,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (t *ElevenLabsTranscriber) Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error) {
|
||||
logger.InfoCF("voice", "Starting ElevenLabs transcription", map[string]any{"audio_file": audioFilePath})
|
||||
|
||||
audioFile, err := os.Open(audioFilePath)
|
||||
if err != nil {
|
||||
logger.ErrorCF("voice", "Failed to open audio file", map[string]any{"path": audioFilePath, "error": err})
|
||||
return nil, fmt.Errorf("failed to open audio file: %w", err)
|
||||
}
|
||||
defer audioFile.Close()
|
||||
|
||||
fileInfo, err := audioFile.Stat()
|
||||
if err != nil {
|
||||
logger.ErrorCF("voice", "Failed to get file info", map[string]any{"path": audioFilePath, "error": err})
|
||||
return nil, fmt.Errorf("failed to get file info: %w", err)
|
||||
}
|
||||
|
||||
logger.DebugCF("voice", "Audio file details", map[string]any{
|
||||
"size_bytes": fileInfo.Size(),
|
||||
"file_name": filepath.Base(audioFilePath),
|
||||
})
|
||||
|
||||
var requestBody bytes.Buffer
|
||||
writer := multipart.NewWriter(&requestBody)
|
||||
|
||||
part, err := writer.CreateFormFile("file", filepath.Base(audioFilePath))
|
||||
if err != nil {
|
||||
logger.ErrorCF("voice", "Failed to create form file", map[string]any{"error": err})
|
||||
return nil, fmt.Errorf("failed to create form file: %w", err)
|
||||
}
|
||||
|
||||
if _, err = io.Copy(part, audioFile); err != nil {
|
||||
logger.ErrorCF("voice", "Failed to copy file content", map[string]any{"error": err})
|
||||
return nil, fmt.Errorf("failed to copy file content: %w", err)
|
||||
}
|
||||
|
||||
if err = writer.WriteField("model_id", "scribe_v1"); err != nil {
|
||||
return nil, fmt.Errorf("failed to write model_id field: %w", err)
|
||||
}
|
||||
|
||||
if err = writer.Close(); err != nil {
|
||||
logger.ErrorCF("voice", "Failed to close multipart writer", map[string]any{"error": err})
|
||||
return nil, fmt.Errorf("failed to close multipart writer: %w", err)
|
||||
}
|
||||
|
||||
url := t.apiBase + "/v1/speech-to-text"
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", url, &requestBody)
|
||||
if err != nil {
|
||||
logger.ErrorCF("voice", "Failed to create request", map[string]any{"error": err})
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", writer.FormDataContentType())
|
||||
req.Header.Set("Xi-Api-Key", t.apiKey)
|
||||
|
||||
logger.DebugCF("voice", "Sending transcription request to ElevenLabs API", map[string]any{
|
||||
"url": url,
|
||||
"request_size_bytes": requestBody.Len(),
|
||||
"file_size_bytes": fileInfo.Size(),
|
||||
})
|
||||
|
||||
resp, err := t.httpClient.Do(req)
|
||||
if err != nil {
|
||||
logger.ErrorCF("voice", "Failed to send request", map[string]any{"error": err})
|
||||
return nil, fmt.Errorf("failed to send request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
logger.ErrorCF("voice", "Failed to read response", map[string]any{"error": err})
|
||||
return nil, fmt.Errorf("failed to read response: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
logger.ErrorCF("voice", "ElevenLabs API error", map[string]any{
|
||||
"status_code": resp.StatusCode,
|
||||
"response": string(body),
|
||||
})
|
||||
return nil, fmt.Errorf("ElevenLabs API error (status %d): %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
logger.DebugCF("voice", "Received response from ElevenLabs API", map[string]any{
|
||||
"status_code": resp.StatusCode,
|
||||
"response_size_bytes": len(body),
|
||||
})
|
||||
|
||||
var result TranscriptionResponse
|
||||
if err := json.Unmarshal(body, &result); err != nil {
|
||||
logger.ErrorCF("voice", "Failed to unmarshal response", map[string]any{"error": err})
|
||||
return nil, fmt.Errorf("failed to unmarshal response: %w", err)
|
||||
}
|
||||
|
||||
logger.InfoCF("voice", "ElevenLabs transcription completed successfully", map[string]any{
|
||||
"text_length": len(result.Text),
|
||||
"language": result.Language,
|
||||
"transcription_preview": utils.Truncate(result.Text, 50),
|
||||
})
|
||||
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
func (t *ElevenLabsTranscriber) Name() string {
|
||||
return "elevenlabs"
|
||||
}
|
||||
@@ -0,0 +1,83 @@
|
||||
package voice
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// Ensure ElevenLabsTranscriber satisfies the Transcriber interface at compile time.
|
||||
var _ Transcriber = (*ElevenLabsTranscriber)(nil)
|
||||
|
||||
func TestElevenLabsTranscriberName(t *testing.T) {
|
||||
tr := NewElevenLabsTranscriber("sk_test")
|
||||
if got := tr.Name(); got != "elevenlabs" {
|
||||
t.Errorf("Name() = %q, want %q", got, "elevenlabs")
|
||||
}
|
||||
}
|
||||
|
||||
func TestElevenLabsTranscribe(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
audioPath := filepath.Join(tmpDir, "clip.ogg")
|
||||
if err := os.WriteFile(audioPath, []byte("fake-audio-data"), 0o644); err != nil {
|
||||
t.Fatalf("failed to write fake audio file: %v", err)
|
||||
}
|
||||
|
||||
t.Run("success", func(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/v1/speech-to-text" {
|
||||
t.Errorf("unexpected path: %s", r.URL.Path)
|
||||
}
|
||||
if r.Header.Get("Xi-Api-Key") != "sk_test" {
|
||||
t.Errorf("unexpected xi-api-key header: %s", r.Header.Get("Xi-Api-Key"))
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(TranscriptionResponse{
|
||||
Text: "hello from elevenlabs",
|
||||
Language: "en",
|
||||
})
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
tr := NewElevenLabsTranscriber("sk_test")
|
||||
tr.apiBase = srv.URL
|
||||
|
||||
resp, err := tr.Transcribe(context.Background(), audioPath)
|
||||
if err != nil {
|
||||
t.Fatalf("Transcribe() error: %v", err)
|
||||
}
|
||||
if resp.Text != "hello from elevenlabs" {
|
||||
t.Errorf("Text = %q, want %q", resp.Text, "hello from elevenlabs")
|
||||
}
|
||||
if resp.Language != "en" {
|
||||
t.Errorf("Language = %q, want %q", resp.Language, "en")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("api error", func(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, `{"error":"invalid_api_key"}`, http.StatusUnauthorized)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
tr := NewElevenLabsTranscriber("sk_bad")
|
||||
tr.apiBase = srv.URL
|
||||
|
||||
_, err := tr.Transcribe(context.Background(), audioPath)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for non-200 response, got nil")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("missing file", func(t *testing.T) {
|
||||
tr := NewElevenLabsTranscriber("sk_test")
|
||||
_, err := tr.Transcribe(context.Background(), filepath.Join(tmpDir, "nonexistent.ogg"))
|
||||
if err == nil {
|
||||
t.Fatal("expected error for missing file, got nil")
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -54,6 +54,10 @@ func DetectTranscriber(cfg *config.Config) Transcriber {
|
||||
}
|
||||
}
|
||||
|
||||
// ElevenLabs voice config (supports Scribe STT).
|
||||
if key := strings.TrimSpace(cfg.Voice.ElevenLabsAPIKey); key != "" {
|
||||
return NewElevenLabsTranscriber(key)
|
||||
}
|
||||
// Fall back to any model-list entry that uses the groq/ protocol.
|
||||
for _, mc := range cfg.ModelList {
|
||||
if strings.HasPrefix(mc.Model, "groq/") && mc.APIKey() != "" {
|
||||
|
||||
@@ -145,6 +145,48 @@ func TestDetectTranscriber(t *testing.T) {
|
||||
}),
|
||||
wantNil: true,
|
||||
},
|
||||
{
|
||||
name: "elevenlabs voice config key",
|
||||
cfg: &config.Config{
|
||||
Voice: config.VoiceConfig{ElevenLabsAPIKey: "sk_elevenlabs_test"},
|
||||
},
|
||||
wantName: "elevenlabs",
|
||||
},
|
||||
{
|
||||
name: "elevenlabs takes priority over groq model list",
|
||||
cfg: (&config.Config{
|
||||
Voice: config.VoiceConfig{ElevenLabsAPIKey: "sk_elevenlabs_test"},
|
||||
ModelList: []*config.ModelConfig{
|
||||
{ModelName: "groq", Model: "groq/llama-3.3-70b"},
|
||||
},
|
||||
}).WithSecurity(&config.SecurityConfig{
|
||||
ModelList: map[string]config.ModelSecurityEntry{
|
||||
"groq": {
|
||||
APIKeys: []string{"sk-groq-direct"},
|
||||
},
|
||||
},
|
||||
}),
|
||||
wantName: "elevenlabs",
|
||||
},
|
||||
{
|
||||
name: "voice model name takes priority over elevenlabs",
|
||||
cfg: (&config.Config{
|
||||
Voice: config.VoiceConfig{
|
||||
ModelName: "voice-gemini",
|
||||
ElevenLabsAPIKey: "sk_elevenlabs_test",
|
||||
},
|
||||
ModelList: []*config.ModelConfig{
|
||||
{ModelName: "voice-gemini", Model: "gemini/gemini-2.5-flash"},
|
||||
},
|
||||
}).WithSecurity(&config.SecurityConfig{
|
||||
ModelList: map[string]config.ModelSecurityEntry{
|
||||
"voice-gemini": {
|
||||
APIKeys: []string{"sk-gemini-model"},
|
||||
},
|
||||
},
|
||||
}),
|
||||
wantName: "audio-model",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
|
||||
Reference in New Issue
Block a user