mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
feat(provider,web,asr): enhance model management with explicit provider metadata (#2701)
* feat(provider,web): enhance model management with provider options
* fix(asr): enhance compatibility for ElevenLabs transcription model
* fix(provider,web): align provider availability predicates and add flow gating
* fix(web,asr): preserve legacy elevenlabs transcription configs
* fix(provider,web,asr): normalize elevenlabs configs and gate default chat models
* fix: tighten provider catalog and elevenlabs compatibility
This commit is contained in:
@@ -82,7 +82,8 @@ Notes:
|
||||
"model_list": [
|
||||
{
|
||||
"model_name": "elevenlabs-asr",
|
||||
"model": "elevenlabs/scribe_v1"
|
||||
"provider": "elevenlabs",
|
||||
"model": "scribe_v1"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -130,7 +131,7 @@ PicoClaw currently supports three main ASR routes:
|
||||
|
||||
| Route | Example models | Behavior |
|
||||
| --- | --- | --- |
|
||||
| ElevenLabs ASR | `elevenlabs/scribe_v1` | Uses the ElevenLabs transcription API. |
|
||||
| ElevenLabs ASR | `provider: elevenlabs`, `model: scribe_v1` | Uses the ElevenLabs transcription API. |
|
||||
| Whisper endpoint models | `openai/whisper-1`, `groq/whisper-large-v3` | Uses an OpenAI-compatible `/audio/transcriptions` endpoint. |
|
||||
| Audio-capable chat models **(Under construction)** | `openai/gpt-4o-audio-preview`, `gemini/gemini-2.5-flash` | Sends audio to a multimodal chat model and asks it to transcribe. |
|
||||
|
||||
@@ -142,7 +143,7 @@ If you are unsure which one to pick, choose Groq Whisper or ElevenLabs first.
|
||||
|
||||
1. **Preferred path**: resolve `voice.model_name` against `model_list`.
|
||||
2. If that resolved model is:
|
||||
- `elevenlabs/...`, PicoClaw uses the ElevenLabs transcriber.
|
||||
- an `elevenlabs` provider model, PicoClaw uses the ElevenLabs transcriber.
|
||||
- an OpenAI-compatible Whisper model, PicoClaw uses the Whisper transcriber.
|
||||
- an audio-capable chat model, PicoClaw uses `AudioModelTranscriber`.
|
||||
3. **Fallback path**: if `voice.model_name` is not set, PicoClaw performs a compatibility scan through `model_list` for legacy auto-detected ASR entries.
|
||||
|
||||
@@ -82,7 +82,8 @@ model_list:
|
||||
"model_list": [
|
||||
{
|
||||
"model_name": "elevenlabs-asr",
|
||||
"model": "elevenlabs/scribe_v1"
|
||||
"provider": "elevenlabs",
|
||||
"model": "scribe_v1"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -130,7 +131,7 @@ PicoClaw 目前主要支持三种 ASR 路径:
|
||||
|
||||
| 路径 | 示例模型 | 行为说明 |
|
||||
| --- | --- | --- |
|
||||
| ElevenLabs ASR | `elevenlabs/scribe_v1` | 使用 ElevenLabs 的语音转录接口。 |
|
||||
| ElevenLabs ASR | `provider: elevenlabs`,`model: scribe_v1` | 使用 ElevenLabs 的语音转录接口。 |
|
||||
| Whisper 接口模型 | `openai/whisper-1`、`groq/whisper-large-v3` | 使用 OpenAI 兼容的 `/audio/transcriptions` 接口。 |
|
||||
| 支持音频的聊天模型 **(重构中)** | `openai/gpt-4o-audio-preview`、`gemini/gemini-2.5-flash` | 把音频发给多模态聊天模型,并要求它返回转录结果。 |
|
||||
|
||||
@@ -142,7 +143,7 @@ PicoClaw 目前主要支持三种 ASR 路径:
|
||||
|
||||
1. **首选路径**:根据 `voice.model_name` 在 `model_list` 中找到对应模型。
|
||||
2. 如果找到的模型属于以下类型:
|
||||
- `elevenlabs/...`,则使用 ElevenLabs transcriber。
|
||||
- `provider=elevenlabs` 的模型,则使用 ElevenLabs transcriber。
|
||||
- OpenAI 兼容的 Whisper 模型,则使用 Whisper transcriber。
|
||||
- 支持音频输入的聊天模型,则使用 `AudioModelTranscriber`。
|
||||
3. **回退路径**:如果没有设置 `voice.model_name`,PicoClaw 会为了兼容旧配置,扫描 `model_list` 中可自动识别的 ASR 条目。
|
||||
|
||||
+21
-6
@@ -8,6 +8,12 @@ import (
|
||||
"github.com/sipeed/picoclaw/pkg/providers"
|
||||
)
|
||||
|
||||
// elevenLabsSupportedModelID is the single ElevenLabs model ID exposed by
// ElevenLabsSupportedModelID.
const elevenLabsSupportedModelID = "scribe_v1"
|
||||
|
||||
// ElevenLabsSupportedModelID returns the value of the
// elevenLabsSupportedModelID constant ("scribe_v1").
func ElevenLabsSupportedModelID() string {
|
||||
return elevenLabsSupportedModelID
|
||||
}
|
||||
|
||||
type Transcriber interface {
|
||||
Name() string
|
||||
Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error)
|
||||
@@ -72,14 +78,23 @@ func whisperModelID(modelCfg *config.ModelConfig) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
// isElevenLabsTranscriptionModel reports whether modelCfg should be handled by
// the ElevenLabs transcriber. It returns false for a nil config or one whose
// APIKey() is empty; otherwise it returns true only when the protocol returned
// by providers.ExtractProtocol is "elevenlabs".
func isElevenLabsTranscriptionModel(modelCfg *config.ModelConfig) bool {
|
||||
// A model without an API key is never selected, regardless of protocol.
if modelCfg == nil || modelCfg.APIKey() == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
// Only the protocol is needed here; the second return value is discarded.
protocol, _ := providers.ExtractProtocol(modelCfg)
|
||||
return protocol == "elevenlabs"
|
||||
}
|
||||
|
||||
func transcriberFromModelConfig(modelCfg *config.ModelConfig) Transcriber {
|
||||
if modelCfg == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
protocol, _ := providers.ExtractProtocol(modelCfg)
|
||||
if protocol == "elevenlabs" && modelCfg.APIKey() != "" {
|
||||
return NewElevenLabsTranscriber(modelCfg.APIKey(), modelCfg.APIBase)
|
||||
if isElevenLabsTranscriptionModel(modelCfg) {
|
||||
_, modelID := providers.ExtractProtocol(modelCfg)
|
||||
return NewElevenLabsTranscriber(modelCfg.APIKey(), modelCfg.APIBase, modelID)
|
||||
}
|
||||
if modelID := whisperModelID(modelCfg); modelID != "" {
|
||||
return NewWhisperTranscriber(modelCfg)
|
||||
@@ -95,9 +110,9 @@ func fallbackTranscriberFromModelConfig(modelCfg *config.ModelConfig) Transcribe
|
||||
return nil
|
||||
}
|
||||
|
||||
protocol, _ := providers.ExtractProtocol(modelCfg)
|
||||
if protocol == "elevenlabs" && modelCfg.APIKey() != "" {
|
||||
return NewElevenLabsTranscriber(modelCfg.APIKey(), modelCfg.APIBase)
|
||||
if isElevenLabsTranscriptionModel(modelCfg) {
|
||||
_, modelID := providers.ExtractProtocol(modelCfg)
|
||||
return NewElevenLabsTranscriber(modelCfg.APIKey(), modelCfg.APIBase, modelID)
|
||||
}
|
||||
if modelID := whisperModelID(modelCfg); modelID != "" {
|
||||
return NewWhisperTranscriber(modelCfg)
|
||||
|
||||
@@ -46,6 +46,21 @@ func TestDetectTranscriber(t *testing.T) {
|
||||
},
|
||||
wantName: "elevenlabs",
|
||||
},
|
||||
{
|
||||
name: "explicit elevenlabs provider selects elevenlabs transcriber",
|
||||
cfg: &config.Config{
|
||||
Voice: config.VoiceConfig{ModelName: "my-asr-model"},
|
||||
ModelList: []*config.ModelConfig{
|
||||
{
|
||||
ModelName: "my-asr-model",
|
||||
Provider: "elevenlabs",
|
||||
Model: "scribe_v1",
|
||||
APIKeys: config.SimpleSecureStrings("sk_elevenlabs_test"),
|
||||
},
|
||||
},
|
||||
},
|
||||
wantName: "elevenlabs",
|
||||
},
|
||||
{
|
||||
name: "voice model name alias selects whisper transcriber for groq",
|
||||
cfg: &config.Config{
|
||||
|
||||
@@ -20,19 +20,24 @@ import (
|
||||
// ElevenLabsTranscriber holds the settings used to send transcription requests
// to the ElevenLabs API. Instances are built by NewElevenLabsTranscriber.
type ElevenLabsTranscriber struct {
|
||||
// apiKey is the ElevenLabs API key supplied to the constructor.
apiKey string
|
||||
// apiBase is the API base URL; the constructor substitutes
// "https://api.elevenlabs.io" when an empty value is passed.
apiBase string
|
||||
// modelID is the value written to the "model_id" multipart form field by
// Transcribe.
modelID string
|
||||
// httpClient is the HTTP client stored by the constructor, which creates
// it with a 120-second timeout.
httpClient *http.Client
|
||||
}
|
||||
|
||||
func NewElevenLabsTranscriber(apiKey, apiBase string) *ElevenLabsTranscriber {
|
||||
func NewElevenLabsTranscriber(apiKey, apiBase, modelID string) *ElevenLabsTranscriber {
|
||||
logger.DebugCF("voice", "Creating ElevenLabs transcriber", map[string]any{"has_api_key": apiKey != ""})
|
||||
|
||||
if apiBase == "" {
|
||||
apiBase = "https://api.elevenlabs.io"
|
||||
}
|
||||
if modelID == "" || modelID != ElevenLabsSupportedModelID() {
|
||||
modelID = ElevenLabsSupportedModelID()
|
||||
}
|
||||
|
||||
return &ElevenLabsTranscriber{
|
||||
apiKey: apiKey,
|
||||
apiBase: apiBase,
|
||||
modelID: modelID,
|
||||
httpClient: &http.Client{
|
||||
Timeout: 120 * time.Second,
|
||||
},
|
||||
@@ -74,7 +79,7 @@ func (t *ElevenLabsTranscriber) Transcribe(ctx context.Context, audioFilePath st
|
||||
return nil, fmt.Errorf("failed to copy file content: %w", err)
|
||||
}
|
||||
|
||||
if err = writer.WriteField("model_id", "scribe_v1"); err != nil {
|
||||
if err = writer.WriteField("model_id", t.modelID); err != nil {
|
||||
return nil, fmt.Errorf("failed to write model_id field: %w", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -3,10 +3,14 @@ package asr
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"mime"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -14,7 +18,7 @@ import (
|
||||
var _ Transcriber = (*ElevenLabsTranscriber)(nil)
|
||||
|
||||
func TestElevenLabsTranscriberName(t *testing.T) {
|
||||
tr := NewElevenLabsTranscriber("sk_test", "")
|
||||
tr := NewElevenLabsTranscriber("sk_test", "", "scribe_v1")
|
||||
if got := tr.Name(); got != "elevenlabs" {
|
||||
t.Errorf("Name() = %q, want %q", got, "elevenlabs")
|
||||
}
|
||||
@@ -35,6 +39,35 @@ func TestElevenLabsTranscribe(t *testing.T) {
|
||||
if r.Header.Get("Xi-Api-Key") != "sk_test" {
|
||||
t.Errorf("unexpected xi-api-key header: %s", r.Header.Get("Xi-Api-Key"))
|
||||
}
|
||||
mediaType, params, err := mime.ParseMediaType(r.Header.Get("Content-Type"))
|
||||
if err != nil {
|
||||
t.Fatalf("ParseMediaType() error = %v", err)
|
||||
}
|
||||
if mediaType != "multipart/form-data" {
|
||||
t.Fatalf("content-type = %q, want multipart/form-data", mediaType)
|
||||
}
|
||||
reader := multipart.NewReader(r.Body, params["boundary"])
|
||||
var gotModelID string
|
||||
for {
|
||||
part, err := reader.NextPart()
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("NextPart() error = %v", err)
|
||||
}
|
||||
if part.FormName() != "model_id" {
|
||||
continue
|
||||
}
|
||||
body, err := io.ReadAll(part)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadAll(part) error = %v", err)
|
||||
}
|
||||
gotModelID = strings.TrimSpace(string(body))
|
||||
}
|
||||
if gotModelID != "scribe_v1" {
|
||||
t.Fatalf("model_id = %q, want %q", gotModelID, "scribe_v1")
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(TranscriptionResponse{
|
||||
Text: "hello from elevenlabs",
|
||||
@@ -43,7 +76,7 @@ func TestElevenLabsTranscribe(t *testing.T) {
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
tr := NewElevenLabsTranscriber("sk_test", "")
|
||||
tr := NewElevenLabsTranscriber("sk_test", "", "scribe_v1")
|
||||
tr.apiBase = srv.URL
|
||||
|
||||
resp, err := tr.Transcribe(context.Background(), audioPath)
|
||||
@@ -64,7 +97,7 @@ func TestElevenLabsTranscribe(t *testing.T) {
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
tr := NewElevenLabsTranscriber("sk_bad", "")
|
||||
tr := NewElevenLabsTranscriber("sk_bad", "", "scribe_v1")
|
||||
tr.apiBase = srv.URL
|
||||
|
||||
_, err := tr.Transcribe(context.Background(), audioPath)
|
||||
@@ -74,10 +107,54 @@ func TestElevenLabsTranscribe(t *testing.T) {
|
||||
})
|
||||
|
||||
t.Run("missing file", func(t *testing.T) {
|
||||
tr := NewElevenLabsTranscriber("sk_test", "")
|
||||
tr := NewElevenLabsTranscriber("sk_test", "", "scribe_v1")
|
||||
_, err := tr.Transcribe(context.Background(), filepath.Join(tmpDir, "nonexistent.ogg"))
|
||||
if err == nil {
|
||||
t.Fatal("expected error for missing file, got nil")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("unsupported model falls back to scribe_v1", func(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
mediaType, params, err := mime.ParseMediaType(r.Header.Get("Content-Type"))
|
||||
if err != nil {
|
||||
t.Fatalf("ParseMediaType() error = %v", err)
|
||||
}
|
||||
if mediaType != "multipart/form-data" {
|
||||
t.Fatalf("content-type = %q, want multipart/form-data", mediaType)
|
||||
}
|
||||
reader := multipart.NewReader(r.Body, params["boundary"])
|
||||
var gotModelID string
|
||||
for {
|
||||
part, err := reader.NextPart()
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("NextPart() error = %v", err)
|
||||
}
|
||||
if part.FormName() != "model_id" {
|
||||
continue
|
||||
}
|
||||
body, err := io.ReadAll(part)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadAll(part) error = %v", err)
|
||||
}
|
||||
gotModelID = strings.TrimSpace(string(body))
|
||||
}
|
||||
if gotModelID != "scribe_v1" {
|
||||
t.Fatalf("model_id = %q, want runtime fallback to %q", gotModelID, "scribe_v1")
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(TranscriptionResponse{Text: "ok"})
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
tr := NewElevenLabsTranscriber("sk_test", "", "unsupported-model")
|
||||
tr.apiBase = srv.URL
|
||||
|
||||
if _, err := tr.Transcribe(context.Background(), audioPath); err != nil {
|
||||
t.Fatalf("Transcribe() error: %v", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user