mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
77b0c43392
* refactor: support explicit model list providers * fix(web): preserve explicit model providers * fix(web): preserve legacy provider prefixes on model updates fix(models): normalize explicit provider-prefixed ids fix(api): preserve legacy model updates across providers fix(agent): preserve config identity for explicit provider refs * fix ci
132 lines
3.9 KiB
Go
132 lines
3.9 KiB
Go
package asr
|
|
|
|
import (
|
|
"context"
|
|
"strings"
|
|
|
|
"github.com/sipeed/picoclaw/pkg/config"
|
|
"github.com/sipeed/picoclaw/pkg/providers"
|
|
)
|
|
|
|
type Transcriber interface {
|
|
Name() string
|
|
Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error)
|
|
}
|
|
|
|
// TranscriptionResponse is the result returned by Transcriber.Transcribe.
//
// When encoded as JSON, Text is always emitted; Language and Duration are
// omitted when they hold their zero value.
type TranscriptionResponse struct {
	// Text is the transcribed text.
	Text string `json:"text"`
	// Language is the language reported for the audio, if any.
	Language string `json:"language,omitempty"`
	// Duration is the reported audio duration (presumably in seconds —
	// confirm against the backends that populate it).
	Duration float64 `json:"duration,omitempty"`
}
|
|
|
|
func supportsAudioTranscription(modelCfg *config.ModelConfig) bool {
|
|
protocol, _ := providers.ExtractProtocol(modelCfg)
|
|
|
|
switch protocol {
|
|
case "openai", "azure", "azure-openai",
|
|
"litellm", "openrouter", "groq", "zhipu", "gemini", "nvidia",
|
|
"ollama", "moonshot", "shengsuanyun", "deepseek", "cerebras",
|
|
"vivgrid", "volcengine", "vllm", "qwen", "qwen-portal", "qwen-intl", "qwen-international", "dashscope-intl",
|
|
"qwen-us", "dashscope-us", "mistral", "avian", "minimax", "longcat", "modelscope", "novita",
|
|
"coding-plan", "alibaba-coding", "qwen-coding", "zai":
|
|
// These protocols all go through the OpenAI-compatible or Azure provider path in
|
|
// providers.CreateProviderFromConfig, so they are the only ones that can supply
|
|
// the audio media payload shape expected by NewAudioModelTranscriber.
|
|
|
|
// TODO: Further restrict this by modelID, since not every model under these
|
|
// protocols supports audio transcription.
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
func supportsWhisperTranscription(modelCfg *config.ModelConfig) bool {
|
|
protocol, _ := providers.ExtractProtocol(modelCfg)
|
|
|
|
switch protocol {
|
|
case "openai", "litellm", "openrouter", "groq", "zhipu", "gemini", "nvidia",
|
|
"ollama", "moonshot", "shengsuanyun", "deepseek", "cerebras",
|
|
"vivgrid", "volcengine", "vllm", "qwen", "qwen-portal", "qwen-intl", "qwen-international", "dashscope-intl",
|
|
"qwen-us", "dashscope-us", "mistral", "avian", "minimax", "longcat", "modelscope", "novita",
|
|
"coding-plan", "alibaba-coding", "qwen-coding", "zai", "mimo":
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
func whisperModelID(modelCfg *config.ModelConfig) string {
|
|
if modelCfg == nil || modelCfg.APIKey() == "" {
|
|
return ""
|
|
}
|
|
|
|
if !supportsWhisperTranscription(modelCfg) {
|
|
return ""
|
|
}
|
|
|
|
_, modelID := providers.ExtractProtocol(modelCfg)
|
|
if strings.Contains(strings.ToLower(modelID), "whisper") {
|
|
return modelID
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func transcriberFromModelConfig(modelCfg *config.ModelConfig) Transcriber {
|
|
if modelCfg == nil {
|
|
return nil
|
|
}
|
|
|
|
protocol, _ := providers.ExtractProtocol(modelCfg)
|
|
if protocol == "elevenlabs" && modelCfg.APIKey() != "" {
|
|
return NewElevenLabsTranscriber(modelCfg.APIKey(), modelCfg.APIBase)
|
|
}
|
|
if modelID := whisperModelID(modelCfg); modelID != "" {
|
|
return NewWhisperTranscriber(modelCfg)
|
|
}
|
|
if supportsAudioTranscription(modelCfg) {
|
|
return NewAudioModelTranscriber(modelCfg)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func fallbackTranscriberFromModelConfig(modelCfg *config.ModelConfig) Transcriber {
|
|
if modelCfg == nil {
|
|
return nil
|
|
}
|
|
|
|
protocol, _ := providers.ExtractProtocol(modelCfg)
|
|
if protocol == "elevenlabs" && modelCfg.APIKey() != "" {
|
|
return NewElevenLabsTranscriber(modelCfg.APIKey(), modelCfg.APIBase)
|
|
}
|
|
if modelID := whisperModelID(modelCfg); modelID != "" {
|
|
return NewWhisperTranscriber(modelCfg)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// DetectTranscriber inspects cfg and returns the appropriate Transcriber, or
|
|
// nil if no supported transcription provider is configured.
|
|
func DetectTranscriber(cfg *config.Config) Transcriber {
|
|
if cfg == nil {
|
|
return nil
|
|
}
|
|
|
|
if modelName := strings.TrimSpace(cfg.Voice.ModelName); modelName != "" {
|
|
modelCfg, err := cfg.GetModelConfig(modelName)
|
|
if err == nil {
|
|
if tr := transcriberFromModelConfig(modelCfg); tr != nil {
|
|
return tr
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fall back to compatibility scanning for legacy auto-detected ASR providers.
|
|
for _, mc := range cfg.ModelList {
|
|
if tr := fallbackTranscriberFromModelConfig(mc); tr != nil {
|
|
return tr
|
|
}
|
|
}
|
|
return nil
|
|
}
|