Files
picoclaw/pkg/agent/agent_media.go
T
Guoguo cb1e1a3595 fix(feishu): fix image download with API fallback and post image support (#2708)
* fix(feishu): fix image download with API fallback and post image support

- Add Image.Get API fallback when MessageResource.Get fails (different
  permission scope: im:resource vs im:message:readonly)
- Extract and download images from post (rich text) messages
- Extract images from interactive card messages
- Deduplicate post image keys across locales
- Add comprehensive tests for new helpers

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* feat(media): add image path tags alongside base64 for LLM file access

Images are still base64-encoded into msg.Media for multimodal LLMs,
but now also get [image:path] tags injected into message content so
the LLM knows the local file path for save/forward operations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* refactor(media): only auto-inject images for tool results, not user messages

Channel-received images (role=user) now get path tags only, letting
the LLM decide whether to view via load_image or just operate on
the file. Tool result images (role=tool, e.g. load_image) are
base64-encoded into a synthetic user message appended after the tool
message, since many LLM APIs don't support image_url in tool messages.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(media): preserve tool-message ordering for multi-tool-call scenarios

Move synthetic user message (carrying base64 tool images) to after the
entire contiguous tool-message block instead of immediately after each
tool message. This preserves the assistant→tool→tool ordering required
by OpenAI-compatible APIs.

Also fix load_image to use generic [image: photo] placeholder so
injectPathTags can properly replace it with the actual path.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(test): update load_image test for [image: photo] placeholder

The test was checking ForLLM for the media:// ref, but load_image now
emits the generic [image: photo] placeholder instead.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(media): match all channel image placeholders in injectPathTags

Different channels emit different placeholder formats — Telegram/Feishu
use [image: photo], WeCom/WeChat/Line use bare [image], QQ/Discord use
[image: <filename>]. The previous string-match code only handled
[image: photo], so for the other channels the path tag was appended as
a duplicate, producing content like "[image] [image:/path]".

Switch to per-type regex that matches all generic placeholder shapes
while leaving path tags ([image:/path]) untouched. Also fixes the same
issue for [audio], [video], [file] tags. Added test coverage for the
various placeholder shapes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(media): skip path tag append for JSON content (Feishu cards/posts)

When content is structured JSON (interactive cards, post messages),
injectPathTags now skips the fallback append — only placeholder
replacement is attempted. This prevents corrupting JSON payloads
like {"schema":"2.0",...} with appended [image:/path] tags.

Adds looksLikeJSON() helper and three test cases covering JSON
objects, arrays, and an end-to-end resolveMediaRefs scenario with
Feishu card content.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* fix(media): prepend path tags for JSON content, narrow looksLikeJSON

Two fixes from code review:

1. looksLikeJSON now only checks for '{' prefix (not '['), avoiding
   false positives on regular text like "[update] see attached".

2. For JSON content (Feishu cards/posts), path tags are prepended
   before the JSON instead of being silently dropped. This ensures
   the LLM can discover attached images via the path tag while the
   JSON payload stays valid for downstream parsing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-30 11:08:00 +08:00

289 lines
8.2 KiB
Go

// PicoClaw - Ultra-lightweight personal AI agent
// Inspired by and based on nanobot: https://github.com/HKUDS/nanobot
// License: MIT
//
// Copyright (c) 2026 PicoClaw contributors
package agent
import (
"bytes"
"encoding/base64"
"io"
"os"
"regexp"
"strings"
"github.com/h2non/filetype"
"github.com/sipeed/picoclaw/pkg/logger"
"github.com/sipeed/picoclaw/pkg/media"
"github.com/sipeed/picoclaw/pkg/providers"
)
// Placeholder patterns, one per media type. Each matches the generic
// placeholders that channels emit — the bare form ([image]) and the labelled
// form ([image: photo], [image: filename.jpg]) — but NOT path tags such as
// [image:/path/to/file]: the optional ":..." part requires at least one
// whitespace character directly after the colon, which path tags do not have.
var (
	// imagePlaceholderRegex matches [image] and [image: <label>].
	imagePlaceholderRegex = regexp.MustCompile(`\[image(:\s+[^\]]*)?\]`)
	// audioPlaceholderRegex matches [audio] and [audio: <label>].
	audioPlaceholderRegex = regexp.MustCompile(`\[audio(:\s+[^\]]*)?\]`)
	// videoPlaceholderRegex matches [video] and [video: <label>].
	videoPlaceholderRegex = regexp.MustCompile(`\[video(:\s+[^\]]*)?\]`)
	// filePlaceholderRegex matches [file] and [file: <label>].
	filePlaceholderRegex = regexp.MustCompile(`\[file(:\s+[^\]]*)?\]`)
)
// resolveMediaRefs rewrites the media:// references carried by messages into
// forms an LLM provider can consume and returns the rewritten conversation.
//
// Every media:// ref that resolves to a stat-able local file contributes a path
// tag ([image:/path], [audio:/path], [video:/path] or [file:/path]) to the
// message content, whatever the message role. Image refs on tool messages are
// additionally base64-encoded (subject to maxSize) and delivered in a synthetic
// user message placed after the contiguous run of tool messages, so the tool
// results stay directly behind the assistant message that requested them.
// Refs without the media:// prefix stay in Media unchanged; refs that cannot be
// resolved or stat'ed are logged and dropped.
//
// A nil store returns messages as-is. Otherwise a new slice is returned and the
// input messages are not modified.
func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxSize int) []providers.Message {
	if store == nil {
		return messages
	}

	out := make([]providers.Message, 0, len(messages))
	var toolImages []string

	// flushToolImages emits the images collected from tool results, if any,
	// as a single synthetic user message and resets the collection.
	flushToolImages := func() {
		if len(toolImages) == 0 {
			return
		}
		out = append(out, providers.Message{
			Role:    "user",
			Content: "[Loaded image from tool result above]",
			Media:   toolImages,
		})
		toolImages = nil
	}

	for _, m := range messages {
		// The first non-tool message ends the current tool block.
		if m.Role != "tool" {
			flushToolImages()
		}
		if len(m.Media) == 0 {
			out = append(out, m)
			continue
		}

		fromTool := m.Role == "tool"
		kept := make([]string, 0, len(m.Media))
		var tags []string
		for _, ref := range m.Media {
			if !strings.HasPrefix(ref, "media://") {
				kept = append(kept, ref)
				continue
			}

			localPath, meta, resolveErr := store.ResolveWithMeta(ref)
			if resolveErr != nil {
				logger.WarnCF("agent", "Failed to resolve media ref", map[string]any{
					"ref":   ref,
					"error": resolveErr.Error(),
				})
				continue
			}

			info, statErr := os.Stat(localPath)
			if statErr != nil {
				logger.WarnCF("agent", "Failed to stat media file", map[string]any{
					"path":  localPath,
					"error": statErr.Error(),
				})
				continue
			}

			mime := detectMIME(localPath, meta)
			tags = append(tags, buildPathTag(mime, localPath))

			// Only images produced by tools are inlined as base64.
			if !fromTool || !strings.HasPrefix(mime, "image/") {
				continue
			}
			if dataURL := encodeImageToDataURL(localPath, mime, info, maxSize); dataURL != "" {
				toolImages = append(toolImages, dataURL)
			}
		}

		// m is already a per-iteration copy; edit a second copy so the
		// intent (input untouched) stays obvious.
		rewritten := m
		rewritten.Media = kept
		if len(tags) > 0 {
			rewritten.Content = injectPathTags(rewritten.Content, tags)
		}
		out = append(out, rewritten)
	}

	// A conversation that ends inside a tool block still gets its images.
	flushToolImages()
	return out
}
// encodeImageToDataURL reads the file at localPath and returns it as a
// "data:<mime>;base64,<payload>" URL.
//
// info supplies the file size, which is used both for the maxSize check and to
// pre-size the output buffer. An empty string is returned, after logging a
// warning, when the file is larger than maxSize bytes, cannot be opened, or
// cannot be read to the end.
func encodeImageToDataURL(localPath, mime string, info os.FileInfo, maxSize int) string {
	size := info.Size()
	if size > int64(maxSize) {
		logger.WarnCF("agent", "Media file too large, skipping", map[string]any{
			"path":     localPath,
			"size":     size,
			"max_size": maxSize,
		})
		return ""
	}

	src, openErr := os.Open(localPath)
	if openErr != nil {
		logger.WarnCF("agent", "Failed to open media file", map[string]any{
			"path":  localPath,
			"error": openErr.Error(),
		})
		return ""
	}
	defer src.Close()

	header := "data:" + mime + ";base64,"
	var out bytes.Buffer
	// Reserve room for the header plus the full encoded payload up front.
	out.Grow(len(header) + base64.StdEncoding.EncodedLen(int(size)))
	out.WriteString(header)

	enc := base64.NewEncoder(base64.StdEncoding, &out)
	if _, copyErr := io.Copy(enc, src); copyErr != nil {
		logger.WarnCF("agent", "Failed to encode media file", map[string]any{
			"path":  localPath,
			"error": copyErr.Error(),
		})
		return ""
	}
	// Close flushes any partially written base64 block into out.
	enc.Close()
	return out.String()
}
// buildArtifactTags returns one path tag per ref that store can resolve, in
// the order of refs. Refs that fail to resolve are skipped without logging.
// The result is nil when store is nil or refs is empty.
func buildArtifactTags(store media.MediaStore, refs []string) []string {
	if len(refs) == 0 || store == nil {
		return nil
	}

	out := make([]string, 0, len(refs))
	for i := range refs {
		path, meta, err := store.ResolveWithMeta(refs[i])
		if err == nil {
			out = append(out, buildPathTag(detectMIME(path, meta), path))
		}
	}
	return out
}
// buildProviderAttachments returns one providers.Attachment per ref, in the
// order of refs. Ref is always set; Filename, ContentType and Type (via
// inferMediaType) are filled in only when store can resolve the ref, otherwise
// they keep their zero values. The result is nil when store is nil or refs is
// empty.
func buildProviderAttachments(store media.MediaStore, refs []string) []providers.Attachment {
	if len(refs) == 0 || store == nil {
		return nil
	}

	out := make([]providers.Attachment, len(refs))
	for i, ref := range refs {
		out[i].Ref = ref

		_, meta, err := store.ResolveWithMeta(ref)
		if err != nil {
			// Unresolvable refs are still reported, just without metadata.
			continue
		}
		out[i].Filename = meta.Filename
		out[i].ContentType = meta.ContentType
		out[i].Type = inferMediaType(meta.Filename, meta.ContentType)
	}
	return out
}
// detectMIME returns the MIME type for the file at localPath.
// A non-empty meta.ContentType is returned as-is; otherwise the type is taken
// from filetype.MatchFile on the file. The result is the empty string when
// that lookup returns an error or reports filetype.Unknown.
func detectMIME(localPath string, meta media.MediaMeta) string {
	if declared := meta.ContentType; declared != "" {
		return declared
	}
	if kind, err := filetype.MatchFile(localPath); err == nil && kind != filetype.Unknown {
		return kind.MIME.Value
	}
	return ""
}
// buildPathTag wraps localPath in a tag whose kind comes from the top-level
// MIME type: "image/…" gives [image:<path>], "audio/…" gives [audio:<path>],
// "video/…" gives [video:<path>]. Anything else, including an empty mime,
// gives [file:<path>].
func buildPathTag(mime, localPath string) string {
	kind := "file"
	// Only a mime of the form "<major>/<rest>" can select a specific kind.
	if major, _, hasSlash := strings.Cut(mime, "/"); hasSlash {
		switch major {
		case "image", "audio", "video":
			kind = major
		}
	}
	return "[" + kind + ":" + localPath + "]"
}
// injectPathTags merges path-bearing media tags such as [image:/path] into
// content and returns the result.
//
// Each tag first tries to replace the earliest remaining generic placeholder
// of its own media type — [image], [image: photo], [image: filename.jpg] and
// the audio, video and file equivalents. Existing path tags (no whitespace
// after the colon) are not placeholders and are left untouched.
//
// A tag that finds no placeholder is added as follows:
//   - content that looks like a JSON object (see looksLikeJSON): the tag goes
//     on its own line in front of the payload, so the JSON text itself is not
//     modified. Several such tags keep the order in which they were given.
//   - any other content: the tag is appended, separated by a single space
//     (or becomes the whole content when content is empty).
func injectPathTags(content string, tags []string) string {
	// Decided once, on the original content: tags placed in front of the
	// payload must not change how later tags are handled.
	isStructured := looksLikeJSON(content)

	// leading collects the tags that go in front of a JSON payload. They are
	// prepended together after the loop so they keep their input order;
	// prepending one at a time would reverse them.
	var leading []string

	for _, tag := range tags {
		var pattern *regexp.Regexp
		switch {
		case strings.HasPrefix(tag, "[image:"):
			pattern = imagePlaceholderRegex
		case strings.HasPrefix(tag, "[audio:"):
			pattern = audioPlaceholderRegex
		case strings.HasPrefix(tag, "[video:"):
			pattern = videoPlaceholderRegex
		case strings.HasPrefix(tag, "[file:"):
			pattern = filePlaceholderRegex
		}

		if pattern != nil {
			if loc := pattern.FindStringIndex(content); loc != nil {
				content = content[:loc[0]] + tag + content[loc[1]:]
				continue
			}
		}

		switch {
		case isStructured:
			leading = append(leading, tag)
		case content == "":
			content = tag
		default:
			content += " " + tag
		}
	}

	if len(leading) > 0 {
		content = strings.Join(leading, "\n") + "\n" + content
	}
	return content
}
// looksLikeJSON reports whether s, with surrounding whitespace ignored, is at
// least two bytes long and starts with '{'. It is a cheap prefix check for
// JSON objects; s is not parsed, and arrays ('[') deliberately do not count.
func looksLikeJSON(s string) bool {
	trimmed := strings.TrimSpace(s)
	if len(trimmed) < 2 {
		return false
	}
	return strings.HasPrefix(trimmed, "{")
}