Files
picoclaw/pkg/audio/asr/asr.go
T
LC 548dc15acd refactor(models): unify provider metadata around backend catalog (#2896)
* feat(models): unify provider metadata around backend catalog

- Move shared provider metadata and alias normalization into backend-owned provider catalog
- Expose display, fetch, auth, and default model metadata through /api/models provider_options
- Replace frontend static provider registry with catalog-driven selection, validation, grouping, and fallback rendering
- Treat provider default api_base as placeholder and effective fetch/test base while keep submitted api_base separate from derived defaults
- Add model page retry handling, touched locale updates, and provider metadata assertions in backend tests

* fix(models): canonicalize backend provider aliases and common models

* fix(models): restore deepseek common model recommendations
2026-05-20 11:50:34 +08:00

147 lines
4.0 KiB
Go

package asr
import (
"context"
"strings"
"github.com/sipeed/picoclaw/pkg/config"
"github.com/sipeed/picoclaw/pkg/providers"
)
const elevenLabsSupportedModelID = "scribe_v1"
func ElevenLabsSupportedModelID() string {
return elevenLabsSupportedModelID
}
type Transcriber interface {
Name() string
Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error)
}
type TranscriptionResponse struct {
Text string `json:"text"`
Language string `json:"language,omitempty"`
Duration float64 `json:"duration,omitempty"`
}
func supportsAudioTranscription(modelCfg *config.ModelConfig) bool {
protocol, _ := providers.ExtractProtocol(modelCfg)
switch protocol {
case "openai", "azure",
"litellm", "openrouter", "groq", "zhipu", "gemini", "nvidia",
"ollama", "moonshot", "shengsuanyun", "deepseek", "cerebras",
"vivgrid", "volcengine", "vllm", "qwen-portal", "qwen-intl", "qwen-us",
"mistral", "avian", "minimax", "longcat", "modelscope", "novita",
"alibaba-coding", "zai":
// These protocols all go through the OpenAI-compatible or Azure provider path in
// providers.CreateProviderFromConfig, so they are the only ones that can supply
// the audio media payload shape expected by NewAudioModelTranscriber.
// TODO: Further restrict this by modelID, since not every model under these
// protocols supports audio transcription.
return true
default:
return false
}
}
func supportsWhisperTranscription(modelCfg *config.ModelConfig) bool {
protocol, _ := providers.ExtractProtocol(modelCfg)
switch protocol {
case "openai", "litellm", "openrouter", "groq", "zhipu", "gemini", "nvidia",
"ollama", "moonshot", "shengsuanyun", "deepseek", "cerebras",
"vivgrid", "volcengine", "vllm", "qwen-portal", "qwen-intl", "qwen-us",
"mistral", "avian", "minimax", "longcat", "modelscope", "novita",
"alibaba-coding", "zai", "mimo":
return true
default:
return false
}
}
func whisperModelID(modelCfg *config.ModelConfig) string {
if modelCfg == nil || modelCfg.APIKey() == "" {
return ""
}
if !supportsWhisperTranscription(modelCfg) {
return ""
}
_, modelID := providers.ExtractProtocol(modelCfg)
if strings.Contains(strings.ToLower(modelID), "whisper") {
return modelID
}
return ""
}
func isElevenLabsTranscriptionModel(modelCfg *config.ModelConfig) bool {
if modelCfg == nil || modelCfg.APIKey() == "" {
return false
}
protocol, _ := providers.ExtractProtocol(modelCfg)
return protocol == "elevenlabs"
}
func transcriberFromModelConfig(modelCfg *config.ModelConfig) Transcriber {
if modelCfg == nil {
return nil
}
if isElevenLabsTranscriptionModel(modelCfg) {
_, modelID := providers.ExtractProtocol(modelCfg)
return NewElevenLabsTranscriber(modelCfg.APIKey(), modelCfg.APIBase, modelID)
}
if modelID := whisperModelID(modelCfg); modelID != "" {
return NewWhisperTranscriber(modelCfg)
}
if supportsAudioTranscription(modelCfg) {
return NewAudioModelTranscriber(modelCfg)
}
return nil
}
func fallbackTranscriberFromModelConfig(modelCfg *config.ModelConfig) Transcriber {
if modelCfg == nil {
return nil
}
if isElevenLabsTranscriptionModel(modelCfg) {
_, modelID := providers.ExtractProtocol(modelCfg)
return NewElevenLabsTranscriber(modelCfg.APIKey(), modelCfg.APIBase, modelID)
}
if modelID := whisperModelID(modelCfg); modelID != "" {
return NewWhisperTranscriber(modelCfg)
}
return nil
}
// DetectTranscriber inspects cfg and returns the appropriate Transcriber, or
// nil if no supported transcription provider is configured.
func DetectTranscriber(cfg *config.Config) Transcriber {
if cfg == nil {
return nil
}
if modelName := strings.TrimSpace(cfg.Voice.ModelName); modelName != "" {
modelCfg, err := cfg.GetModelConfig(modelName)
if err == nil {
if tr := transcriberFromModelConfig(modelCfg); tr != nil {
return tr
}
}
}
// Fall back to compatibility scanning for legacy auto-detected ASR providers.
for _, mc := range cfg.ModelList {
if tr := fallbackTranscriberFromModelConfig(mc); tr != nil {
return tr
}
}
return nil
}