refactor(channels): remove channel-side voice transcription (Phase 12)

Remove SetTranscriber and inline transcription logic from 4 channels
(Telegram, Discord, Slack, OneBot) and the gateway wiring. Voice/audio
files are still downloaded and stored in MediaStore with simple text
annotations ([voice], [audio: filename], [file: name]). The pkg/voice
package is preserved for future Agent-level transcription middleware.
This commit is contained in:
Hoshina
2026-02-23 03:47:12 +08:00
parent e10b1e1fd4
commit e00745489d
5 changed files with 17 additions and 157 deletions
+3 -41
View File
@@ -6,7 +6,6 @@ import (
"os"
"os/signal"
"path/filepath"
"strings"
"time"
"github.com/sipeed/picoclaw/cmd/picoclaw/internal"
@@ -14,14 +13,14 @@ import (
"github.com/sipeed/picoclaw/pkg/bus"
"github.com/sipeed/picoclaw/pkg/channels"
_ "github.com/sipeed/picoclaw/pkg/channels/dingtalk"
dch "github.com/sipeed/picoclaw/pkg/channels/discord"
_ "github.com/sipeed/picoclaw/pkg/channels/discord"
_ "github.com/sipeed/picoclaw/pkg/channels/feishu"
_ "github.com/sipeed/picoclaw/pkg/channels/line"
_ "github.com/sipeed/picoclaw/pkg/channels/maixcam"
_ "github.com/sipeed/picoclaw/pkg/channels/onebot"
_ "github.com/sipeed/picoclaw/pkg/channels/qq"
slackch "github.com/sipeed/picoclaw/pkg/channels/slack"
tgramch "github.com/sipeed/picoclaw/pkg/channels/telegram"
_ "github.com/sipeed/picoclaw/pkg/channels/slack"
_ "github.com/sipeed/picoclaw/pkg/channels/telegram"
_ "github.com/sipeed/picoclaw/pkg/channels/wecom"
_ "github.com/sipeed/picoclaw/pkg/channels/whatsapp"
"github.com/sipeed/picoclaw/pkg/config"
@@ -34,7 +33,6 @@ import (
"github.com/sipeed/picoclaw/pkg/providers"
"github.com/sipeed/picoclaw/pkg/state"
"github.com/sipeed/picoclaw/pkg/tools"
"github.com/sipeed/picoclaw/pkg/voice"
)
func gatewayCmd(debug bool) error {
@@ -127,42 +125,6 @@ func gatewayCmd(debug bool) error {
agentLoop.SetChannelManager(channelManager)
agentLoop.SetMediaStore(mediaStore)
var transcriber *voice.GroqTranscriber
groqAPIKey := cfg.Providers.Groq.APIKey
if groqAPIKey == "" {
for _, mc := range cfg.ModelList {
if strings.HasPrefix(mc.Model, "groq/") && mc.APIKey != "" {
groqAPIKey = mc.APIKey
break
}
}
}
if groqAPIKey != "" {
transcriber = voice.NewGroqTranscriber(groqAPIKey)
logger.InfoC("voice", "Groq voice transcription enabled")
}
if transcriber != nil {
if telegramChannel, ok := channelManager.GetChannel("telegram"); ok {
if tc, ok := telegramChannel.(*tgramch.TelegramChannel); ok {
tc.SetTranscriber(transcriber)
logger.InfoC("voice", "Groq transcription attached to Telegram channel")
}
}
if discordChannel, ok := channelManager.GetChannel("discord"); ok {
if dc, ok := discordChannel.(*dch.DiscordChannel); ok {
dc.SetTranscriber(transcriber)
logger.InfoC("voice", "Groq transcription attached to Discord channel")
}
}
if slackChannel, ok := channelManager.GetChannel("slack"); ok {
if sc, ok := slackChannel.(*slackch.SlackChannel); ok {
sc.SetTranscriber(transcriber)
logger.InfoC("voice", "Groq transcription attached to Slack channel")
}
}
}
enabledChannels := channelManager.GetEnabledChannels()
if len(enabledChannels) > 0 {
fmt.Printf("✓ Channels enabled: %s\n", enabledChannels)
+10 -39
View File
@@ -16,24 +16,21 @@ import (
"github.com/sipeed/picoclaw/pkg/logger"
"github.com/sipeed/picoclaw/pkg/media"
"github.com/sipeed/picoclaw/pkg/utils"
"github.com/sipeed/picoclaw/pkg/voice"
)
const (
transcriptionTimeout = 30 * time.Second
sendTimeout = 10 * time.Second
sendTimeout = 10 * time.Second
)
type DiscordChannel struct {
*channels.BaseChannel
session *discordgo.Session
config config.DiscordConfig
transcriber *voice.GroqTranscriber
ctx context.Context
cancel context.CancelFunc
typingMu sync.Mutex
typingStop map[string]chan struct{} // chatID → stop signal
botUserID string // stored for mention checking
session *discordgo.Session
config config.DiscordConfig
ctx context.Context
cancel context.CancelFunc
typingMu sync.Mutex
typingStop map[string]chan struct{} // chatID → stop signal
botUserID string // stored for mention checking
}
func NewDiscordChannel(cfg config.DiscordConfig, bus *bus.MessageBus) (*DiscordChannel, error) {
@@ -48,16 +45,11 @@ func NewDiscordChannel(cfg config.DiscordConfig, bus *bus.MessageBus) (*DiscordC
BaseChannel: base,
session: session,
config: cfg,
transcriber: nil,
ctx: context.Background(),
typingStop: make(map[string]chan struct{}),
}, nil
}
func (c *DiscordChannel) SetTranscriber(transcriber *voice.GroqTranscriber) {
c.transcriber = transcriber
}
func (c *DiscordChannel) Start(ctx context.Context) error {
logger.InfoC("discord", "Starting Discord bot")
@@ -265,7 +257,7 @@ func (c *DiscordChannel) handleMessage(s *discordgo.Session, m *discordgo.Messag
return
}
// Check allowlist first to avoid downloading attachments and transcribing for rejected users
// Check allowlist first to avoid downloading attachments for rejected users
if !c.IsAllowed(m.Author.ID) {
logger.DebugCF("discord", "Message rejected by allowlist", map[string]any{
"user_id": m.Author.ID,
@@ -323,29 +315,8 @@ func (c *DiscordChannel) handleMessage(s *discordgo.Session, m *discordgo.Messag
if isAudio {
localPath := c.downloadAttachment(attachment.URL, attachment.Filename)
if localPath != "" {
transcribedText := ""
if c.transcriber != nil && c.transcriber.IsAvailable() {
ctx, cancel := context.WithTimeout(c.ctx, transcriptionTimeout)
result, err := c.transcriber.Transcribe(ctx, localPath)
cancel() // Release context resources immediately to avoid leaks in for loop
if err != nil {
logger.ErrorCF("discord", "Voice transcription failed", map[string]any{
"error": err.Error(),
})
transcribedText = fmt.Sprintf("[audio: %s (transcription failed)]", attachment.Filename)
} else {
transcribedText = fmt.Sprintf("[audio transcription: %s]", result.Text)
logger.DebugCF("discord", "Audio transcribed successfully", map[string]any{
"text": result.Text,
})
}
} else {
transcribedText = fmt.Sprintf("[audio: %s]", attachment.Filename)
}
mediaPaths = append(mediaPaths, storeMedia(localPath, attachment.Filename))
content = appendContent(content, transcribedText)
content = appendContent(content, fmt.Sprintf("[audio: %s]", attachment.Filename))
} else {
logger.WarnCF("discord", "Failed to download audio attachment", map[string]any{
"url": attachment.URL,
+2 -25
View File
@@ -18,7 +18,6 @@ import (
"github.com/sipeed/picoclaw/pkg/logger"
"github.com/sipeed/picoclaw/pkg/media"
"github.com/sipeed/picoclaw/pkg/utils"
"github.com/sipeed/picoclaw/pkg/voice"
)
type OneBotChannel struct {
@@ -36,7 +35,6 @@ type OneBotChannel struct {
selfID int64
pending map[string]chan json.RawMessage
pendingMu sync.Mutex
transcriber *voice.GroqTranscriber
lastMessageID sync.Map
pendingEmojiMsg sync.Map
}
@@ -112,10 +110,6 @@ func NewOneBotChannel(cfg config.OneBotConfig, messageBus *bus.MessageBus) (*One
}, nil
}
func (c *OneBotChannel) SetTranscriber(transcriber *voice.GroqTranscriber) {
c.transcriber = transcriber
}
func (c *OneBotChannel) setMsgEmojiLike(messageID string, emojiID int, set bool) {
go func() {
_, err := c.sendAPIRequest("set_msg_emoji_like", map[string]any{
@@ -794,25 +788,8 @@ func (c *OneBotChannel) parseMessageSegments(
LoggerPrefix: "onebot",
})
if localPath != "" {
if c.transcriber != nil && c.transcriber.IsAvailable() {
tctx, tcancel := context.WithTimeout(c.ctx, 30*time.Second)
result, err := c.transcriber.Transcribe(tctx, localPath)
tcancel()
if err != nil {
logger.WarnCF("onebot", "Voice transcription failed", map[string]any{
"error": err.Error(),
})
textParts = append(textParts, "[voice (transcription failed)]")
mediaRefs = append(mediaRefs, storeFile(localPath, "voice.amr"))
} else {
textParts = append(textParts, fmt.Sprintf("[voice transcription: %s]", result.Text))
// Still store the file so it can be released later
storeFile(localPath, "voice.amr")
}
} else {
textParts = append(textParts, "[voice]")
mediaRefs = append(mediaRefs, storeFile(localPath, "voice.amr"))
}
textParts = append(textParts, "[voice]")
mediaRefs = append(mediaRefs, storeFile(localPath, "voice.amr"))
}
}
}
+1 -22
View File
@@ -5,7 +5,6 @@ import (
"fmt"
"strings"
"sync"
"time"
"github.com/slack-go/slack"
"github.com/slack-go/slack/slackevents"
@@ -17,7 +16,6 @@ import (
"github.com/sipeed/picoclaw/pkg/logger"
"github.com/sipeed/picoclaw/pkg/media"
"github.com/sipeed/picoclaw/pkg/utils"
"github.com/sipeed/picoclaw/pkg/voice"
)
type SlackChannel struct {
@@ -27,7 +25,6 @@ type SlackChannel struct {
socketClient *socketmode.Client
botUserID string
teamID string
transcriber *voice.GroqTranscriber
ctx context.Context
cancel context.CancelFunc
pendingAcks sync.Map
@@ -60,10 +57,6 @@ func NewSlackChannel(cfg config.SlackConfig, messageBus *bus.MessageBus) (*Slack
}, nil
}
func (c *SlackChannel) SetTranscriber(transcriber *voice.GroqTranscriber) {
c.transcriber = transcriber
}
func (c *SlackChannel) Start(ctx context.Context) error {
logger.InfoC("slack", "Starting Slack channel (Socket Mode)")
@@ -311,21 +304,7 @@ func (c *SlackChannel) handleMessageEvent(ev *slackevents.MessageEvent) {
continue
}
mediaPaths = append(mediaPaths, storeMedia(localPath, file.Name))
if utils.IsAudioFile(file.Name, file.Mimetype) && c.transcriber != nil && c.transcriber.IsAvailable() {
ctx, cancel := context.WithTimeout(c.ctx, 30*time.Second)
defer cancel()
result, err := c.transcriber.Transcribe(ctx, localPath)
if err != nil {
logger.ErrorCF("slack", "Voice transcription failed", map[string]any{"error": err.Error()})
content += fmt.Sprintf("\n[audio: %s (transcription failed)]", file.Name)
} else {
content += fmt.Sprintf("\n[voice transcription: %s]", result.Text)
}
} else {
content += fmt.Sprintf("\n[file: %s]", file.Name)
}
content += fmt.Sprintf("\n[file: %s]", file.Name)
}
}
+1 -30
View File
@@ -22,7 +22,6 @@ import (
"github.com/sipeed/picoclaw/pkg/logger"
"github.com/sipeed/picoclaw/pkg/media"
"github.com/sipeed/picoclaw/pkg/utils"
"github.com/sipeed/picoclaw/pkg/voice"
)
type TelegramChannel struct {
@@ -32,7 +31,6 @@ type TelegramChannel struct {
commands TelegramCommander
config *config.Config
chatIDs map[string]int64
transcriber *voice.GroqTranscriber
ctx context.Context
cancel context.CancelFunc
placeholders sync.Map // chatID -> messageID
@@ -91,16 +89,11 @@ func NewTelegramChannel(cfg *config.Config, bus *bus.MessageBus) (*TelegramChann
bot: bot,
config: cfg,
chatIDs: make(map[string]int64),
transcriber: nil,
placeholders: sync.Map{},
stopThinking: sync.Map{},
}, nil
}
func (c *TelegramChannel) SetTranscriber(transcriber *voice.GroqTranscriber) {
c.transcriber = transcriber
}
func (c *TelegramChannel) Start(ctx context.Context) error {
logger.InfoC("telegram", "Starting Telegram bot (polling mode)...")
@@ -391,32 +384,10 @@ func (c *TelegramChannel) handleMessage(ctx context.Context, message *telego.Mes
if voicePath != "" {
mediaPaths = append(mediaPaths, storeMedia(voicePath, "voice.ogg"))
transcribedText := ""
if c.transcriber != nil && c.transcriber.IsAvailable() {
transcriberCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
result, err := c.transcriber.Transcribe(transcriberCtx, voicePath)
if err != nil {
logger.ErrorCF("telegram", "Voice transcription failed", map[string]any{
"error": err.Error(),
"path": voicePath,
})
transcribedText = "[voice (transcription failed)]"
} else {
transcribedText = fmt.Sprintf("[voice transcription: %s]", result.Text)
logger.InfoCF("telegram", "Voice transcribed successfully", map[string]any{
"text": result.Text,
})
}
} else {
transcribedText = "[voice]"
}
if content != "" {
content += "\n"
}
content += transcribedText
content += "[voice]"
}
}