feat(provider,web,asr): enhance model management with explicit provider metadata (#2701)

* feat(provider,web): enhance model management with provider options

* fix(asr): enhance compatibility for ElevenLabs transcription model

* fix(provider,web): align provider availability predicates and add flow gating

* fix(web,asr): preserve legacy elevenlabs transcription configs

* fix(provider,web,asr): normalize elevenlabs configs and gate default chat models

* fix: tighten provider catalog and elevenlabs compatibility
This commit is contained in:
LC
2026-05-06 16:06:49 +08:00
committed by GitHub
parent 4d3070e849
commit 81a050555d
26 changed files with 2341 additions and 193 deletions
+4 -3
View File
@@ -82,7 +82,8 @@ Notes:
"model_list": [
{
"model_name": "elevenlabs-asr",
"model": "elevenlabs/scribe_v1"
"provider": "elevenlabs",
"model": "scribe_v1"
}
]
}
@@ -130,7 +131,7 @@ PicoClaw currently supports three main ASR routes:
| Route | Example models | Behavior |
| --- | --- | --- |
| ElevenLabs ASR | `elevenlabs/scribe_v1` | Uses the ElevenLabs transcription API. |
| ElevenLabs ASR | `provider: elevenlabs`, `model: scribe_v1` | Uses the ElevenLabs transcription API. |
| Whisper endpoint models | `openai/whisper-1`, `groq/whisper-large-v3` | Uses an OpenAI-compatible `/audio/transcriptions` endpoint. |
| Audio-capable chat models **(Under construction)** | `openai/gpt-4o-audio-preview`, `gemini/gemini-2.5-flash` | Sends audio to a multimodal chat model and asks it to transcribe. |
@@ -142,7 +143,7 @@ If you are unsure which one to pick, choose Groq Whisper or ElevenLabs first.
1. **Preferred path**: resolve `voice.model_name` against `model_list`.
2. If that resolved model is:
- `elevenlabs/...`, PicoClaw uses the ElevenLabs transcriber.
- an `elevenlabs` provider model, PicoClaw uses the ElevenLabs transcriber.
- an OpenAI-compatible Whisper model, PicoClaw uses the Whisper transcriber.
- an audio-capable chat model, PicoClaw uses `AudioModelTranscriber`.
3. **Fallback path**: if `voice.model_name` is not set, PicoClaw performs a compatibility scan through `model_list` for legacy auto-detected ASR entries.
+4 -3
View File
@@ -82,7 +82,8 @@ model_list:
"model_list": [
{
"model_name": "elevenlabs-asr",
"model": "elevenlabs/scribe_v1"
"provider": "elevenlabs",
"model": "scribe_v1"
}
]
}
@@ -130,7 +131,7 @@ PicoClaw 目前主要支持三种 ASR 路径:
| 路径 | 示例模型 | 行为说明 |
| --- | --- | --- |
| ElevenLabs ASR | `elevenlabs/scribe_v1` | 使用 ElevenLabs 的语音转录接口。 |
| ElevenLabs ASR | `provider: elevenlabs`、`model: scribe_v1` | 使用 ElevenLabs 的语音转录接口。 |
| Whisper 接口模型 | `openai/whisper-1`、`groq/whisper-large-v3` | 使用 OpenAI 兼容的 `/audio/transcriptions` 接口。 |
| 支持音频的聊天模型 **(重构中)** | `openai/gpt-4o-audio-preview`、`gemini/gemini-2.5-flash` | 把音频发给多模态聊天模型,并要求它返回转录结果。 |
@@ -142,7 +143,7 @@ PicoClaw 目前主要支持三种 ASR 路径:
1. **首选路径**:根据 `voice.model_name` 在 `model_list` 中找到对应模型。
2. 如果找到的模型属于以下类型:
- `elevenlabs/...`,则使用 ElevenLabs transcriber。
- `provider=elevenlabs` 的模型,则使用 ElevenLabs transcriber。
- OpenAI 兼容的 Whisper 模型,则使用 Whisper transcriber。
- 支持音频输入的聊天模型,则使用 `AudioModelTranscriber`。
3. **回退路径**:如果没有设置 `voice.model_name`,PicoClaw 会为了兼容旧配置,扫描 `model_list` 中可自动识别的 ASR 条目。
+21 -6
View File
@@ -8,6 +8,12 @@ import (
"github.com/sipeed/picoclaw/pkg/providers"
)
// elevenLabsSupportedModelID is the ElevenLabs transcription model ID that
// NewElevenLabsTranscriber falls back to when it is given an empty or a
// different model ID.
const elevenLabsSupportedModelID = "scribe_v1"
// ElevenLabsSupportedModelID returns the supported ElevenLabs transcription
// model ID ("scribe_v1").
func ElevenLabsSupportedModelID() string {
return elevenLabsSupportedModelID
}
type Transcriber interface {
Name() string
Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error)
@@ -72,14 +78,23 @@ func whisperModelID(modelCfg *config.ModelConfig) string {
return ""
}
// isElevenLabsTranscriptionModel reports whether modelCfg is non-nil, carries
// a non-empty API key, and resolves to the "elevenlabs" protocol according to
// providers.ExtractProtocol.
func isElevenLabsTranscriptionModel(modelCfg *config.ModelConfig) bool {
	// Only consult the provider metadata when a usable API key is configured.
	if modelCfg != nil && modelCfg.APIKey() != "" {
		resolvedProtocol, _ := providers.ExtractProtocol(modelCfg)
		return resolvedProtocol == "elevenlabs"
	}
	return false
}
func transcriberFromModelConfig(modelCfg *config.ModelConfig) Transcriber {
if modelCfg == nil {
return nil
}
protocol, _ := providers.ExtractProtocol(modelCfg)
if protocol == "elevenlabs" && modelCfg.APIKey() != "" {
return NewElevenLabsTranscriber(modelCfg.APIKey(), modelCfg.APIBase)
if isElevenLabsTranscriptionModel(modelCfg) {
_, modelID := providers.ExtractProtocol(modelCfg)
return NewElevenLabsTranscriber(modelCfg.APIKey(), modelCfg.APIBase, modelID)
}
if modelID := whisperModelID(modelCfg); modelID != "" {
return NewWhisperTranscriber(modelCfg)
@@ -95,9 +110,9 @@ func fallbackTranscriberFromModelConfig(modelCfg *config.ModelConfig) Transcribe
return nil
}
protocol, _ := providers.ExtractProtocol(modelCfg)
if protocol == "elevenlabs" && modelCfg.APIKey() != "" {
return NewElevenLabsTranscriber(modelCfg.APIKey(), modelCfg.APIBase)
if isElevenLabsTranscriptionModel(modelCfg) {
_, modelID := providers.ExtractProtocol(modelCfg)
return NewElevenLabsTranscriber(modelCfg.APIKey(), modelCfg.APIBase, modelID)
}
if modelID := whisperModelID(modelCfg); modelID != "" {
return NewWhisperTranscriber(modelCfg)
+15
View File
@@ -46,6 +46,21 @@ func TestDetectTranscriber(t *testing.T) {
},
wantName: "elevenlabs",
},
{
name: "explicit elevenlabs provider selects elevenlabs transcriber",
cfg: &config.Config{
Voice: config.VoiceConfig{ModelName: "my-asr-model"},
ModelList: []*config.ModelConfig{
{
ModelName: "my-asr-model",
Provider: "elevenlabs",
Model: "scribe_v1",
APIKeys: config.SimpleSecureStrings("sk_elevenlabs_test"),
},
},
},
wantName: "elevenlabs",
},
{
name: "voice model name alias selects whisper transcriber for groq",
cfg: &config.Config{
+7 -2
View File
@@ -20,19 +20,24 @@ import (
// ElevenLabsTranscriber is the Transcriber implementation that sends audio
// files to the ElevenLabs transcription API.
type ElevenLabsTranscriber struct {
// apiKey is the ElevenLabs API key passed to NewElevenLabsTranscriber.
apiKey string
// apiBase is the API base URL; NewElevenLabsTranscriber defaults it to
// "https://api.elevenlabs.io" when an empty value is supplied.
apiBase string
// modelID is the value written to the "model_id" form field of the
// transcription request.
modelID string
// httpClient is the HTTP client created by NewElevenLabsTranscriber with a
// 120-second timeout.
httpClient *http.Client
}
func NewElevenLabsTranscriber(apiKey, apiBase string) *ElevenLabsTranscriber {
func NewElevenLabsTranscriber(apiKey, apiBase, modelID string) *ElevenLabsTranscriber {
logger.DebugCF("voice", "Creating ElevenLabs transcriber", map[string]any{"has_api_key": apiKey != ""})
if apiBase == "" {
apiBase = "https://api.elevenlabs.io"
}
if modelID == "" || modelID != ElevenLabsSupportedModelID() {
modelID = ElevenLabsSupportedModelID()
}
return &ElevenLabsTranscriber{
apiKey: apiKey,
apiBase: apiBase,
modelID: modelID,
httpClient: &http.Client{
Timeout: 120 * time.Second,
},
@@ -74,7 +79,7 @@ func (t *ElevenLabsTranscriber) Transcribe(ctx context.Context, audioFilePath st
return nil, fmt.Errorf("failed to copy file content: %w", err)
}
if err = writer.WriteField("model_id", "scribe_v1"); err != nil {
if err = writer.WriteField("model_id", t.modelID); err != nil {
return nil, fmt.Errorf("failed to write model_id field: %w", err)
}
+81 -4
View File
@@ -3,10 +3,14 @@ package asr
import (
"context"
"encoding/json"
"io"
"mime"
"mime/multipart"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
)
@@ -14,7 +18,7 @@ import (
var _ Transcriber = (*ElevenLabsTranscriber)(nil)
func TestElevenLabsTranscriberName(t *testing.T) {
tr := NewElevenLabsTranscriber("sk_test", "")
tr := NewElevenLabsTranscriber("sk_test", "", "scribe_v1")
if got := tr.Name(); got != "elevenlabs" {
t.Errorf("Name() = %q, want %q", got, "elevenlabs")
}
@@ -35,6 +39,35 @@ func TestElevenLabsTranscribe(t *testing.T) {
if r.Header.Get("Xi-Api-Key") != "sk_test" {
t.Errorf("unexpected xi-api-key header: %s", r.Header.Get("Xi-Api-Key"))
}
mediaType, params, err := mime.ParseMediaType(r.Header.Get("Content-Type"))
if err != nil {
t.Fatalf("ParseMediaType() error = %v", err)
}
if mediaType != "multipart/form-data" {
t.Fatalf("content-type = %q, want multipart/form-data", mediaType)
}
reader := multipart.NewReader(r.Body, params["boundary"])
var gotModelID string
for {
part, err := reader.NextPart()
if err == io.EOF {
break
}
if err != nil {
t.Fatalf("NextPart() error = %v", err)
}
if part.FormName() != "model_id" {
continue
}
body, err := io.ReadAll(part)
if err != nil {
t.Fatalf("ReadAll(part) error = %v", err)
}
gotModelID = strings.TrimSpace(string(body))
}
if gotModelID != "scribe_v1" {
t.Fatalf("model_id = %q, want %q", gotModelID, "scribe_v1")
}
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(TranscriptionResponse{
Text: "hello from elevenlabs",
@@ -43,7 +76,7 @@ func TestElevenLabsTranscribe(t *testing.T) {
}))
defer srv.Close()
tr := NewElevenLabsTranscriber("sk_test", "")
tr := NewElevenLabsTranscriber("sk_test", "", "scribe_v1")
tr.apiBase = srv.URL
resp, err := tr.Transcribe(context.Background(), audioPath)
@@ -64,7 +97,7 @@ func TestElevenLabsTranscribe(t *testing.T) {
}))
defer srv.Close()
tr := NewElevenLabsTranscriber("sk_bad", "")
tr := NewElevenLabsTranscriber("sk_bad", "", "scribe_v1")
tr.apiBase = srv.URL
_, err := tr.Transcribe(context.Background(), audioPath)
@@ -74,10 +107,54 @@ func TestElevenLabsTranscribe(t *testing.T) {
})
t.Run("missing file", func(t *testing.T) {
tr := NewElevenLabsTranscriber("sk_test", "")
tr := NewElevenLabsTranscriber("sk_test", "", "scribe_v1")
_, err := tr.Transcribe(context.Background(), filepath.Join(tmpDir, "nonexistent.ogg"))
if err == nil {
t.Fatal("expected error for missing file, got nil")
}
})
t.Run("unsupported model falls back to scribe_v1", func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
mediaType, params, err := mime.ParseMediaType(r.Header.Get("Content-Type"))
if err != nil {
t.Fatalf("ParseMediaType() error = %v", err)
}
if mediaType != "multipart/form-data" {
t.Fatalf("content-type = %q, want multipart/form-data", mediaType)
}
reader := multipart.NewReader(r.Body, params["boundary"])
var gotModelID string
for {
part, err := reader.NextPart()
if err == io.EOF {
break
}
if err != nil {
t.Fatalf("NextPart() error = %v", err)
}
if part.FormName() != "model_id" {
continue
}
body, err := io.ReadAll(part)
if err != nil {
t.Fatalf("ReadAll(part) error = %v", err)
}
gotModelID = strings.TrimSpace(string(body))
}
if gotModelID != "scribe_v1" {
t.Fatalf("model_id = %q, want runtime fallback to %q", gotModelID, "scribe_v1")
}
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(TranscriptionResponse{Text: "ok"})
}))
defer srv.Close()
tr := NewElevenLabsTranscriber("sk_test", "", "unsupported-model")
tr.apiBase = srv.URL
if _, err := tr.Transcribe(context.Background(), audioPath); err != nil {
t.Fatalf("Transcribe() error: %v", err)
}
})
}