Files
picoclaw/pkg/voice/audio_model_transcriber.go
T
RussellLuo 8ad4b9b497 feat(voice): add audio-model transcription support
- Add `AudioModelTranscriber` for model-based audio transcription via LLM providers
- Support selecting a transcription model with `voice.model_name` in config
- Keep Groq transcription as a fallback and move it into dedicated files with focused tests
- Serialize `data:audio/...` media as input_audio for OpenAI-compatible providers
- Improve transcription logging by rendering error fields as strings
- Add coverage for transcriber detection, audio-model behavior, provider audio serialization, and Groq transcription

Fixes #1890.
2026-03-22 20:07:22 +08:00

116 lines
3.0 KiB
Go

package voice
import (
"context"
"encoding/base64"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/sipeed/picoclaw/pkg/config"
"github.com/sipeed/picoclaw/pkg/logger"
"github.com/sipeed/picoclaw/pkg/providers"
"github.com/sipeed/picoclaw/pkg/utils"
)
// AudioModelTranscriber transcribes audio files by sending them to an LLM
// provider as chat media together with a text prompt.
type AudioModelTranscriber struct {
// provider is the LLM backend that receives the chat request.
provider providers.LLMProvider
// modelID is the model identifier passed to the provider on each request.
modelID string
// prompt is the user-message text sent alongside the audio.
prompt string
}
// defaultTranscriptionPrompt is the instruction text sent with the audio when
// a transcriber is created by NewAudioModelTranscriber.
const defaultTranscriptionPrompt = "Transcribe this audio."
// audioFormat maps the file extension of path to a short audio format name.
//
// A leading "file://" prefix is stripped before the extension is inspected,
// and the extension is matched case-insensitively. Supported extensions are
// .wav, .mp3, .aiff/.aif, .aac, .ogg and .flac; any other extension (or none)
// yields an error that quotes the original path.
func audioFormat(path string) (string, error) {
	trimmed := strings.TrimPrefix(path, "file://")
	ext := strings.ToLower(filepath.Ext(trimmed))

	var format string
	switch ext {
	case ".wav", ".mp3", ".aac", ".ogg", ".flac":
		// For these the format name is the extension without its dot.
		format = ext[1:]
	case ".aiff", ".aif":
		// Both spellings normalise to the same format name.
		format = "aiff"
	}

	if format == "" {
		return "", fmt.Errorf("unsupported audio format for %q", path)
	}
	return format, nil
}
// NewAudioModelTranscriber builds a transcriber backed by the LLM provider
// described by modelCfg, using defaultTranscriptionPrompt as its prompt.
//
// It returns nil when modelCfg is nil or when the provider cannot be created.
// In the latter case the failure is logged rather than returned, so callers
// must check the result for nil before using it.
func NewAudioModelTranscriber(modelCfg *config.ModelConfig) *AudioModelTranscriber {
	if modelCfg == nil {
		return nil
	}

	// Only whether a key is present is logged, never the key itself.
	debugFields := map[string]any{
		"has_api_key": modelCfg.APIKey != "",
		"api_base":    modelCfg.APIBase,
		"model":       modelCfg.Model,
	}
	logger.DebugCF("voice", "Creating audio model transcriber", debugFields)

	llm, resolvedModel, createErr := providers.CreateProviderFromConfig(modelCfg)
	if createErr != nil {
		logger.ErrorCF("voice", "Failed to create audio model provider", map[string]any{"error": createErr})
		return nil
	}

	transcriber := new(AudioModelTranscriber)
	transcriber.provider = llm
	transcriber.modelID = resolvedModel
	transcriber.prompt = defaultTranscriptionPrompt
	return transcriber
}
// Transcribe sends the audio file at audioFilePath to the configured model and
// returns the model's reply, whitespace-trimmed, as the transcription text.
//
// The audio format is derived from the file extension (see audioFormat) and
// validated before the file is read, so unsupported files are rejected without
// loading them into memory. The file content is base64-encoded into a
// "data:audio/<format>;base64,..." URL and attached as media to a single user
// message whose text is the transcriber's prompt.
//
// An error is returned when the transcriber is not initialised, the extension
// is unsupported, the file cannot be read, the provider call fails, or the
// provider returns no response.
func (t *AudioModelTranscriber) Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error) {
	// NewAudioModelTranscriber returns nil on failure; fail with an error here
	// instead of panicking on a nil receiver or a missing provider.
	if t == nil || t.provider == nil {
		return nil, fmt.Errorf("audio model transcriber is not initialized")
	}

	logger.InfoCF("voice", "Starting audio model transcription", map[string]any{
		"audio_file": audioFilePath,
		"model":      t.modelID,
	})

	// Check the extension first: it is cheap and avoids reading a file that
	// would be rejected anyway.
	format, err := audioFormat(audioFilePath)
	if err != nil {
		logger.ErrorCF("voice", "Failed to detect audio format", map[string]any{"path": audioFilePath, "error": err})
		return nil, err
	}

	audioBytes, err := os.ReadFile(audioFilePath)
	if err != nil {
		logger.ErrorCF("voice", "Failed to read audio file", map[string]any{"path": audioFilePath, "error": err})
		return nil, fmt.Errorf("failed to read audio file: %w", err)
	}

	audioURL := fmt.Sprintf("data:audio/%s;base64,%s", format, base64.StdEncoding.EncodeToString(audioBytes))

	// Temperature 0 is passed as a request option, presumably to keep the
	// transcription as deterministic as the provider allows.
	resp, err := t.provider.Chat(ctx, []providers.Message{
		{
			Role:    "user",
			Content: t.prompt,
			Media:   []string{audioURL},
		},
	}, nil, t.modelID, map[string]any{
		"temperature": 0,
	})
	if err != nil {
		logger.ErrorCF("voice", "Audio model transcription request failed", map[string]any{"error": err})
		return nil, fmt.Errorf("transcription request failed: %w", err)
	}
	// Guard against a provider that reports success without a response, which
	// would otherwise cause a nil dereference below.
	if resp == nil {
		logger.ErrorCF("voice", "Audio model transcription returned no response", map[string]any{"model": t.modelID})
		return nil, fmt.Errorf("transcription request returned no response")
	}

	text := strings.TrimSpace(resp.Content)
	logger.InfoCF("voice", "Audio model transcription completed successfully", map[string]any{
		"text_length":           len(text),
		"transcription_preview": utils.Truncate(text, 50),
	})

	return &TranscriptionResponse{Text: text}, nil
}
// Name returns the fixed identifier of this transcriber, "audio-model".
func (t *AudioModelTranscriber) Name() string {
	const name = "audio-model"
	return name
}