// picoclaw/pkg/voice/audio_model_transcriber.go

package voice

import (
	"context"
	"encoding/base64"
	"fmt"
	"os"
	"path/filepath"
	"strings"

	"github.com/sipeed/picoclaw/pkg/config"
	"github.com/sipeed/picoclaw/pkg/logger"
	"github.com/sipeed/picoclaw/pkg/providers"
	"github.com/sipeed/picoclaw/pkg/utils"
)

// AudioModelTranscriber transcribes audio files by sending them to an LLM
// provider as media attached to a chat message.
type AudioModelTranscriber struct {
	// provider is the LLM backend whose Chat method receives the audio.
	provider providers.LLMProvider
	// modelID is the model identifier passed to the provider on every request.
	modelID  string
	// prompt is the text content of the user message that carries the audio.
	prompt   string
}

// defaultTranscriptionPrompt is the instruction text given to every
// transcriber built by NewAudioModelTranscriber.
const defaultTranscriptionPrompt = "Transcribe this audio."

// audioFormat maps the file extension of path to a short audio format name.
//
// A leading "file://" is dropped before the extension is extracted, and the
// extension is compared case-insensitively. Recognized formats are wav, mp3,
// aiff (for both ".aiff" and ".aif"), aac, ogg and flac. Any other extension,
// including a missing one, yields an error that quotes the original path.
func audioFormat(path string) (string, error) {
	ext := filepath.Ext(strings.TrimPrefix(path, "file://"))
	if ext != "" {
		// filepath.Ext always includes the leading dot, so ext[1:] is the bare suffix.
		switch name := strings.ToLower(ext[1:]); name {
		case "wav", "mp3", "aiff", "aac", "ogg", "flac":
			return name, nil
		case "aif":
			return "aiff", nil
		}
	}
	return "", fmt.Errorf("unsupported audio format for %q", path)
}

// NewAudioModelTranscriber builds a transcriber backed by the provider that
// providers.CreateProviderFromConfig derives from modelCfg.
//
// It returns nil when modelCfg is nil or when the provider cannot be created;
// in the latter case the failure is logged. The returned transcriber uses
// defaultTranscriptionPrompt as its prompt.
func NewAudioModelTranscriber(modelCfg *config.ModelConfig) *AudioModelTranscriber {
	if modelCfg == nil {
		return nil
	}

	// Only the presence of the API key is logged, never its value.
	debugFields := map[string]any{
		"has_api_key": modelCfg.APIKey != "",
		"api_base":    modelCfg.APIBase,
		"model":       modelCfg.Model,
	}
	logger.DebugCF("voice", "Creating audio model transcriber", debugFields)

	llm, resolvedModel, createErr := providers.CreateProviderFromConfig(modelCfg)
	if createErr != nil {
		logger.ErrorCF("voice", "Failed to create audio model provider", map[string]any{"error": createErr})
		return nil
	}

	transcriber := new(AudioModelTranscriber)
	transcriber.provider = llm
	transcriber.modelID = resolvedModel
	transcriber.prompt = defaultTranscriptionPrompt
	return transcriber
}

// Transcribe sends the audio file at audioFilePath to the configured model
// and returns the model's trimmed reply as the transcription text.
//
// The file is base64-encoded and attached to a single user message as a
// "data:audio/<format>;base64,..." URL alongside the transcriber's prompt.
// A leading "file://" on audioFilePath is stripped before the file is read,
// matching what audioFormat does when it inspects the extension.
//
// It returns an error when the transcriber is nil or has no provider, when
// the extension is not a supported audio format, when the file cannot be
// read, or when the provider call fails.
func (t *AudioModelTranscriber) Transcribe(ctx context.Context, audioFilePath string) (*TranscriptionResponse, error) {
	// NewAudioModelTranscriber returns nil on failure; fail with an error
	// instead of panicking if such a value is used anyway.
	if t == nil || t.provider == nil {
		return nil, fmt.Errorf("audio model transcriber is not initialized")
	}

	logger.InfoCF("voice", "Starting audio model transcription", map[string]any{
		"audio_file": audioFilePath,
		"model":      t.modelID,
	})

	// Validate the extension first so an unsupported file is never loaded
	// into memory.
	format, err := audioFormat(audioFilePath)
	if err != nil {
		logger.ErrorCF("voice", "Failed to detect audio format", map[string]any{"path": audioFilePath, "error": err})
		return nil, err
	}

	// audioFormat tolerates a "file://" prefix, so the read must too;
	// os.ReadFile only understands plain filesystem paths.
	localPath := strings.TrimPrefix(audioFilePath, "file://")
	audioBytes, err := os.ReadFile(localPath)
	if err != nil {
		logger.ErrorCF("voice", "Failed to read audio file", map[string]any{"path": audioFilePath, "error": err})
		return nil, fmt.Errorf("failed to read audio file: %w", err)
	}

	resp, err := t.provider.Chat(ctx, []providers.Message{
		{
			Role:    "user",
			Content: t.prompt,
			Media: []string{
				fmt.Sprintf("data:audio/%s;base64,%s", format, base64.StdEncoding.EncodeToString(audioBytes)),
			},
		},
	}, nil, t.modelID, map[string]any{
		"temperature": 0,
	})
	if err != nil {
		logger.ErrorCF("voice", "Audio model transcription request failed", map[string]any{"error": err})
		return nil, fmt.Errorf("transcription request failed: %w", err)
	}

	text := strings.TrimSpace(resp.Content)
	logger.InfoCF("voice", "Audio model transcription completed successfully", map[string]any{
		"text_length":           len(text),
		"transcription_preview": utils.Truncate(text, 50),
	})

	return &TranscriptionResponse{Text: text}, nil
}

// Name returns the fixed identifier "audio-model" for this transcriber.
func (t *AudioModelTranscriber) Name() string {
	return "audio-model"
}