mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
96 lines
2.5 KiB
Go
96 lines
2.5 KiB
Go
package voice
|
|
|
|
import (
|
|
"context"
|
|
"encoding/base64"
|
|
"fmt"
|
|
"os"
|
|
"strings"
|
|
|
|
"github.com/sipeed/picoclaw/pkg/config"
|
|
"github.com/sipeed/picoclaw/pkg/logger"
|
|
"github.com/sipeed/picoclaw/pkg/providers"
|
|
"github.com/sipeed/picoclaw/pkg/utils"
|
|
)
|
|
|
|
type AudioModelTranscriber struct {
|
|
provider providers.LLMProvider
|
|
modelID string
|
|
prompt string
|
|
}
|
|
|
|
// defaultTranscriptionPrompt is the instruction sent with the audio when a
// transcriber is built by NewAudioModelTranscriber.
const defaultTranscriptionPrompt = "Transcribe this audio."
|
|
|
|
func NewAudioModelTranscriber(modelCfg *config.ModelConfig) *AudioModelTranscriber {
|
|
if modelCfg == nil {
|
|
return nil
|
|
}
|
|
|
|
logger.DebugCF("voice", "Creating audio model transcriber", map[string]any{
|
|
"has_api_key": modelCfg.APIKey() != "",
|
|
"api_base": modelCfg.APIBase,
|
|
"model": modelCfg.Model,
|
|
})
|
|
|
|
provider, modelID, err := providers.CreateProviderFromConfig(modelCfg)
|
|
if err != nil {
|
|
logger.ErrorCF("voice", "Failed to create audio model provider", map[string]any{"error": err})
|
|
return nil
|
|
}
|
|
|
|
return &AudioModelTranscriber{
|
|
provider: provider,
|
|
modelID: modelID,
|
|
prompt: defaultTranscriptionPrompt,
|
|
}
|
|
}
|
|
|
|
func (t *AudioModelTranscriber) Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error) {
|
|
logger.InfoCF("voice", "Starting audio model transcription", map[string]any{
|
|
"audio_file": audioFilePath,
|
|
"model": t.modelID,
|
|
})
|
|
|
|
audioBytes, err := os.ReadFile(audioFilePath)
|
|
if err != nil {
|
|
logger.ErrorCF("voice", "Failed to read audio file", map[string]any{"path": audioFilePath, "error": err})
|
|
return nil, fmt.Errorf("failed to read audio file: %w", err)
|
|
}
|
|
|
|
format, err := utils.AudioFormat(audioFilePath)
|
|
if err != nil {
|
|
logger.ErrorCF("voice", "Failed to detect audio format", map[string]any{"path": audioFilePath, "error": err})
|
|
return nil, err
|
|
}
|
|
|
|
resp, err := t.provider.Chat(ctx, []providers.Message{
|
|
{
|
|
Role: "user",
|
|
Content: t.prompt,
|
|
Media: []string{
|
|
fmt.Sprintf("data:audio/%s;base64,%s", format, base64.StdEncoding.EncodeToString(audioBytes)),
|
|
},
|
|
},
|
|
}, nil, t.modelID, map[string]any{
|
|
"temperature": 0,
|
|
})
|
|
if err != nil {
|
|
logger.ErrorCF("voice", "Audio model transcription request failed", map[string]any{"error": err})
|
|
return nil, fmt.Errorf("transcription request failed: %w", err)
|
|
}
|
|
|
|
text := strings.TrimSpace(resp.Content)
|
|
logger.InfoCF("voice", "Audio model transcription completed successfully", map[string]any{
|
|
"text_length": len(text),
|
|
"transcription_preview": utils.Truncate(text, 50),
|
|
})
|
|
|
|
return &TranscriptionResponse{Text: text}, nil
|
|
}
|
|
|
|
func (t *AudioModelTranscriber) Name() string {
|
|
return "audio-model"
|
|
}
|