From cb1e1a3595248faefd4275c528ec4d67d125c80c Mon Sep 17 00:00:00 2001 From: Guoguo <16666742+imguoguo@users.noreply.github.com> Date: Thu, 30 Apr 2026 11:08:00 +0800 Subject: [PATCH] fix(feishu): fix image download with API fallback and post image support (#2708) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(feishu): fix image download with API fallback and post image support - Add Image.Get API fallback when MessageResource.Get fails (different permission scope: im:resource vs im:message:readonly) - Extract and download images from post (rich text) messages - Extract images from interactive card messages - Deduplicate post image keys across locales - Add comprehensive tests for new helpers Co-Authored-By: Claude Opus 4.6 * feat(media): add image path tags alongside base64 for LLM file access Images are still base64-encoded into msg.Media for multimodal LLMs, but now also get [image:path] tags injected into message content so the LLM knows the local file path for save/forward operations. Co-Authored-By: Claude Opus 4.6 * refactor(media): only auto-inject images for tool results, not user messages Channel-received images (role=user) now get path tags only, letting the LLM decide whether to view via load_image or just operate on the file. Tool result images (role=tool, e.g. load_image) are base64-encoded into a synthetic user message appended after the tool message, since many LLM APIs don't support image_url in tool messages. Co-Authored-By: Claude Opus 4.6 * fix(media): preserve tool-message ordering for multi-tool-call scenarios Move synthetic user message (carrying base64 tool images) to after the entire contiguous tool-message block instead of immediately after each tool message. This preserves the assistant→tool→tool ordering required by OpenAI-compatible APIs. Also fix load_image to use generic [image: photo] placeholder so injectPathTags can properly replace it with the actual path. Co-Authored-By: Claude Opus 4.6 * fix(test): update load_image test for [image: photo] placeholder The test was checking ForLLM for the media:// ref, but load_image now emits the generic [image: photo] placeholder instead. Co-Authored-By: Claude Opus 4.6 * fix(media): match all channel image placeholders in injectPathTags Different channels emit different placeholder formats — Telegram/Feishu use [image: photo], WeCom/WeChat/Line use bare [image], QQ/Discord use [image: ]. The previous string-match code only handled [image: photo], so for the other channels the path tag was appended as a duplicate, producing content like "[image] [image:/path]". Switch to per-type regex that matches all generic placeholder shapes while leaving path tags ([image:/path]) untouched. Also fixes the same issue for [audio], [video], [file] tags. Added test coverage for the various placeholder shapes. Co-Authored-By: Claude Opus 4.6 * fix(media): skip path tag append for JSON content (Feishu cards/posts) When content is structured JSON (interactive cards, post messages), injectPathTags now skips the fallback append — only placeholder replacement is attempted. This prevents corrupting JSON payloads like {"schema":"2.0",...} with appended [image:/path] tags. Adds looksLikeJSON() helper and three test cases covering JSON objects, arrays, and an end-to-end resolveMediaRefs scenario with Feishu card content. Co-Authored-By: Claude Opus 4.7 (1M context) * fix(media): prepend path tags for JSON content, narrow looksLikeJSON Two fixes from code review: 1. looksLikeJSON now only checks for '{' prefix (not '['), avoiding false positives on regular text like "[update] see attached". 2. For JSON content (Feishu cards/posts), path tags are prepended before the JSON instead of being silently dropped. This ensures the LLM can discover attached images via the path tag while the JSON payload stays valid for downstream parsing. Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.6 --- pkg/agent/agent_media.go | 197 +++++++++++++------- pkg/agent/agent_test.go | 248 +++++++++++++++++++++++--- pkg/agent/steering_test.go | 6 +- pkg/channels/feishu/common.go | 56 ++++++ pkg/channels/feishu/common_test.go | 94 ++++++++++ pkg/channels/feishu/feishu_64.go | 124 ++++++++++--- pkg/channels/feishu/feishu_64_test.go | 7 + pkg/tools/fs/load_image.go | 8 +- pkg/tools/fs/load_image_test.go | 7 +- 9 files changed, 625 insertions(+), 122 deletions(-) diff --git a/pkg/agent/agent_media.go b/pkg/agent/agent_media.go index a773d2ebb..c02c7392c 100644 --- a/pkg/agent/agent_media.go +++ b/pkg/agent/agent_media.go @@ -11,6 +11,7 @@ import ( "encoding/base64" "io" "os" + "regexp" "strings" "github.com/h2non/filetype" @@ -20,24 +21,59 @@ import ( "github.com/sipeed/picoclaw/pkg/providers" ) +// genericPlaceholderRegex matches generic media placeholders emitted by various +// channels: [image], [image: photo], [image: filename.jpg] — but NOT path tags +// like [image:/path/to/file] (path tags have no space after the colon). +var ( + imagePlaceholderRegex = regexp.MustCompile(`\[image(:\s+[^\]]*)?\]`) + audioPlaceholderRegex = regexp.MustCompile(`\[audio(:\s+[^\]]*)?\]`) + videoPlaceholderRegex = regexp.MustCompile(`\[video(:\s+[^\]]*)?\]`) + filePlaceholderRegex = regexp.MustCompile(`\[file(:\s+[^\]]*)?\]`) +) + // resolveMediaRefs resolves media:// refs in messages. -// Images are base64-encoded into the Media array for multimodal LLMs. -// Non-image files (documents, audio, video) have their local path injected -// into Content so the agent can access them via file tools like read_file. +// For user messages: images get path tags only ([image:/path]) so the LLM +// can decide whether to view them via load_image or operate on the file. +// For tool messages: images are base64-encoded and appended as a synthetic +// user message only after the contiguous tool-message block ends, so we don't +// break the tool-results-must-immediately-follow-assistant constraint that +// LLM APIs enforce. +// Non-image files always get path tags regardless of role. // Returns a new slice; original messages are not mutated. func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxSize int) []providers.Message { if store == nil { return messages } - result := make([]providers.Message, len(messages)) - copy(result, messages) + result := make([]providers.Message, 0, len(messages)) + var pendingToolImages []string + + for idx, m := range messages { + // When leaving a tool-message block, flush any accumulated images + // as a synthetic user message. + if m.Role != "tool" && len(pendingToolImages) > 0 { + result = append(result, providers.Message{ + Role: "user", + Content: "[Loaded image from tool result above]", + Media: pendingToolImages, + }) + pendingToolImages = nil + } - for i, m := range result { if len(m.Media) == 0 { + result = append(result, m) + if idx == len(messages)-1 && len(pendingToolImages) > 0 { + result = append(result, providers.Message{ + Role: "user", + Content: "[Loaded image from tool result above]", + Media: pendingToolImages, + }) + pendingToolImages = nil + } continue } + msg := m resolved := make([]string, 0, len(m.Media)) var pathTags []string @@ -66,27 +102,77 @@ func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxS } mime := detectMIME(localPath, meta) + pathTags = append(pathTags, buildPathTag(mime, localPath)) - if strings.HasPrefix(mime, "image/") { + if m.Role == "tool" && strings.HasPrefix(mime, "image/") { dataURL := encodeImageToDataURL(localPath, mime, info, maxSize) if dataURL != "" { - resolved = append(resolved, dataURL) + pendingToolImages = append(pendingToolImages, dataURL) } - continue } - - pathTags = append(pathTags, buildPathTag(mime, localPath)) } - result[i].Media = resolved + msg.Media = resolved if len(pathTags) > 0 { - result[i].Content = injectPathTags(result[i].Content, pathTags) + msg.Content = injectPathTags(msg.Content, pathTags) + } + result = append(result, msg) + + // If this is the last message and we have pending images, flush them. + if idx == len(messages)-1 && len(pendingToolImages) > 0 { + result = append(result, providers.Message{ + Role: "user", + Content: "[Loaded image from tool result above]", + Media: pendingToolImages, + }) + pendingToolImages = nil } } return result } +// encodeImageToDataURL base64-encodes an image file into a data URL. +// Returns empty string if the file exceeds maxSize or encoding fails. +func encodeImageToDataURL(localPath, mime string, info os.FileInfo, maxSize int) string { + if info.Size() > int64(maxSize) { + logger.WarnCF("agent", "Media file too large, skipping", map[string]any{ + "path": localPath, + "size": info.Size(), + "max_size": maxSize, + }) + return "" + } + + f, err := os.Open(localPath) + if err != nil { + logger.WarnCF("agent", "Failed to open media file", map[string]any{ + "path": localPath, + "error": err.Error(), + }) + return "" + } + defer f.Close() + + prefix := "data:" + mime + ";base64," + encodedLen := base64.StdEncoding.EncodedLen(int(info.Size())) + var buf bytes.Buffer + buf.Grow(len(prefix) + encodedLen) + buf.WriteString(prefix) + + encoder := base64.NewEncoder(base64.StdEncoding, &buf) + if _, err := io.Copy(encoder, f); err != nil { + logger.WarnCF("agent", "Failed to encode media file", map[string]any{ + "path": localPath, + "error": err.Error(), + }) + return "" + } + encoder.Close() + + return buf.String() +} + func buildArtifactTags(store media.MediaStore, refs []string) []string { if store == nil || len(refs) == 0 { return nil @@ -137,51 +223,12 @@ func detectMIME(localPath string, meta media.MediaMeta) string { return kind.MIME.Value } -// encodeImageToDataURL base64-encodes an image file into a data URL. -// Returns empty string if the file exceeds maxSize or encoding fails. -func encodeImageToDataURL(localPath, mime string, info os.FileInfo, maxSize int) string { - if info.Size() > int64(maxSize) { - logger.WarnCF("agent", "Media file too large, skipping", map[string]any{ - "path": localPath, - "size": info.Size(), - "max_size": maxSize, - }) - return "" - } - - f, err := os.Open(localPath) - if err != nil { - logger.WarnCF("agent", "Failed to open media file", map[string]any{ - "path": localPath, - "error": err.Error(), - }) - return "" - } - defer f.Close() - - prefix := "data:" + mime + ";base64," - encodedLen := base64.StdEncoding.EncodedLen(int(info.Size())) - var buf bytes.Buffer - buf.Grow(len(prefix) + encodedLen) - buf.WriteString(prefix) - - encoder := base64.NewEncoder(base64.StdEncoding, &buf) - if _, err := io.Copy(encoder, f); err != nil { - logger.WarnCF("agent", "Failed to encode media file", map[string]any{ - "path": localPath, - "error": err.Error(), - }) - return "" - } - encoder.Close() - - return buf.String() -} - // buildPathTag creates a structured tag exposing the local file path. -// Tag type is derived from MIME: [audio:/path], [video:/path], or [file:/path]. +// Tag type is derived from MIME: [image:/path], [audio:/path], [video:/path], or [file:/path]. func buildPathTag(mime, localPath string) string { switch { + case strings.HasPrefix(mime, "image/"): + return "[image:" + localPath + "]" case strings.HasPrefix(mime, "audio/"): return "[audio:" + localPath + "]" case strings.HasPrefix(mime, "video/"): @@ -192,22 +239,41 @@ func buildPathTag(mime, localPath string) string { } // injectPathTags replaces generic media tags in content with path-bearing versions, -// or appends if no matching generic tag is found. +// or appends if no matching generic tag is found. Channels emit a few different +// placeholder formats — [image], [image: photo], [image: filename.jpg] — so we +// match all of them via regex while leaving path tags ([image:/path]) untouched. +// +// When content is structured data (e.g., JSON from Feishu interactive cards or +// post messages), tags are only injected via placeholder replacement — never +// appended — to avoid corrupting the payload. func injectPathTags(content string, tags []string) string { + isStructured := looksLikeJSON(content) for _, tag := range tags { - var generic string + var pattern *regexp.Regexp switch { + case strings.HasPrefix(tag, "[image:"): + pattern = imagePlaceholderRegex case strings.HasPrefix(tag, "[audio:"): - generic = "[audio]" + pattern = audioPlaceholderRegex case strings.HasPrefix(tag, "[video:"): - generic = "[video]" + pattern = videoPlaceholderRegex case strings.HasPrefix(tag, "[file:"): - generic = "[file]" + pattern = filePlaceholderRegex } - if generic != "" && strings.Contains(content, generic) { - content = strings.Replace(content, generic, tag, 1) - } else if content == "" { + if pattern != nil { + if loc := pattern.FindStringIndex(content); loc != nil { + content = content[:loc[0]] + tag + content[loc[1]:] + continue + } + } + + if isStructured { + content = tag + "\n" + content + continue + } + + if content == "" { content = tag } else { content += " " + tag @@ -215,3 +281,8 @@ func injectPathTags(content string, tags []string) string { } return content } + +func looksLikeJSON(s string) bool { + s = strings.TrimSpace(s) + return len(s) > 1 && s[0] == '{' +} diff --git a/pkg/agent/agent_test.go b/pkg/agent/agent_test.go index 4047ab74d..f326e2acb 100644 --- a/pkg/agent/agent_test.go +++ b/pkg/agent/agent_test.go @@ -1781,17 +1781,22 @@ func (m *artifactThenSendProvider) Chat( if messages[i].Role != "tool" { continue } - start := strings.Index(messages[i].Content, "[file:") - if start < 0 { - continue + for _, prefix := range []string{"[image:", "[file:", "[audio:", "[video:"} { + start := strings.Index(messages[i].Content, prefix) + if start < 0 { + continue + } + rest := messages[i].Content[start+len(prefix):] + end := strings.Index(rest, "]") + if end < 0 { + continue + } + artifactPath = rest[:end] + break } - rest := messages[i].Content[start+len("[file:"):] - end := strings.Index(rest, "]") - if end < 0 { - continue + if artifactPath != "" { + break } - artifactPath = rest[:end] - break } if artifactPath == "" { return nil, fmt.Errorf("provider did not receive artifact path in tool result") @@ -4656,7 +4661,7 @@ func TestRun_PicoToolFeedbackSuppressesDuplicateInterimAssistantContent(t *testi } } -func TestResolveMediaRefs_ResolvesToBase64(t *testing.T) { +func TestResolveMediaRefs_ImageInjectsPathTag(t *testing.T) { store := media.NewFileMediaStore() dir := t.TempDir() @@ -4684,15 +4689,110 @@ func TestResolveMediaRefs_ResolvesToBase64(t *testing.T) { } result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) - if len(result[0].Media) != 1 { - t.Fatalf("expected 1 resolved media, got %d", len(result[0].Media)) + if len(result[0].Media) != 0 { + t.Fatalf("expected 0 media (images use path tags), got %d", len(result[0].Media)) } - if !strings.HasPrefix(result[0].Media[0], "data:image/png;base64,") { - t.Fatalf("expected data:image/png;base64, prefix, got %q", result[0].Media[0][:40]) + localPath, _, _ := store.ResolveWithMeta(ref) + expectedContent := "describe this [image:" + localPath + "]" + if result[0].Content != expectedContent { + t.Fatalf("expected content %q, got %q", expectedContent, result[0].Content) } } -func TestResolveMediaRefs_SkipsOversizedFile(t *testing.T) { +func TestResolveMediaRefs_ToolRoleImageAppendedAsUserMessage(t *testing.T) { + store := media.NewFileMediaStore() + dir := t.TempDir() + + pngPath := filepath.Join(dir, "tool-result.png") + pngHeader := []byte{ + 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature + 0x00, 0x00, 0x00, 0x0D, // IHDR length + 0x49, 0x48, 0x44, 0x52, // "IHDR" + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, // 1x1 RGB + 0x00, 0x00, 0x00, // no interlace + 0x90, 0x77, 0x53, 0xDE, // CRC + } + if err := os.WriteFile(pngPath, pngHeader, 0o644); err != nil { + t.Fatal(err) + } + ref, _ := store.Store(pngPath, media.MediaMeta{}, "test") + + messages := []providers.Message{ + {Role: "tool", Content: "Image loaded", Media: []string{ref}}, + } + result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) + + // Tool message should have path tag but no base64 + if len(result[0].Media) != 0 { + t.Fatalf("expected 0 media in tool message, got %d", len(result[0].Media)) + } + localPath, _, _ := store.ResolveWithMeta(ref) + if !strings.Contains(result[0].Content, "[image:"+localPath+"]") { + t.Fatalf("expected image path tag in tool content, got %q", result[0].Content) + } + + // A synthetic user message with base64 should follow + if len(result) != 2 { + t.Fatalf("expected 2 messages (tool + synthetic user), got %d", len(result)) + } + if result[1].Role != "user" { + t.Fatalf("expected synthetic message role=user, got %q", result[1].Role) + } + if len(result[1].Media) != 1 { + t.Fatalf("expected 1 base64 media in synthetic user message, got %d", len(result[1].Media)) + } + if !strings.HasPrefix(result[1].Media[0], "data:image/png;base64,") { + t.Fatalf("expected data:image/png;base64, prefix, got %q", result[1].Media[0][:40]) + } +} + +func TestResolveMediaRefs_MultiToolCallPreservesOrdering(t *testing.T) { + store := media.NewFileMediaStore() + dir := t.TempDir() + + // Create image for tool #1 + pngPath := filepath.Join(dir, "loaded.png") + pngHeader := []byte{ + 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature + 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, // IHDR + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, + 0x00, 0x00, 0x00, 0x90, 0x77, 0x53, 0xDE, + } + os.WriteFile(pngPath, pngHeader, 0o644) + imgRef, _ := store.Store(pngPath, media.MediaMeta{}, "test") + + // Simulate: assistant called load_image + read_file, two tool results follow + messages := []providers.Message{ + {Role: "assistant", Content: "Let me load the image and read the file."}, + {Role: "tool", Content: "Image loaded [image: photo]", Media: []string{imgRef}}, + {Role: "tool", Content: "file contents here"}, + } + result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) + + // assistant, tool#1, tool#2 must remain contiguous — no user in between + if result[0].Role != "assistant" { + t.Fatalf("result[0] expected assistant, got %q", result[0].Role) + } + if result[1].Role != "tool" { + t.Fatalf("result[1] expected tool, got %q", result[1].Role) + } + if result[2].Role != "tool" { + t.Fatalf("result[2] expected tool, got %q", result[2].Role) + } + + // Synthetic user message should come AFTER the tool block + if len(result) != 4 { + t.Fatalf("expected 4 messages (assistant + 2 tool + synthetic user), got %d", len(result)) + } + if result[3].Role != "user" { + t.Fatalf("result[3] expected user, got %q", result[3].Role) + } + if len(result[3].Media) != 1 || !strings.HasPrefix(result[3].Media[0], "data:image/png;base64,") { + t.Fatal("expected synthetic user message to contain base64 image") + } +} + +func TestResolveMediaRefs_OversizedImageSkipsBase64KeepsPathTag(t *testing.T) { store := media.NewFileMediaStore() dir := t.TempDir() @@ -4714,6 +4814,11 @@ func TestResolveMediaRefs_SkipsOversizedFile(t *testing.T) { if len(result[0].Media) != 0 { t.Fatalf("expected 0 media (oversized), got %d", len(result[0].Media)) } + localPath, _, _ := store.ResolveWithMeta(ref) + expected := "hi [image:" + localPath + "]" + if result[0].Content != expected { + t.Fatalf("expected content %q, got %q", expected, result[0].Content) + } } func TestResolveMediaRefs_UnknownTypeInjectsPath(t *testing.T) { @@ -4791,11 +4896,13 @@ func TestResolveMediaRefs_UsesMetaContentType(t *testing.T) { } result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) - if len(result[0].Media) != 1 { - t.Fatalf("expected 1 media, got %d", len(result[0].Media)) + if len(result[0].Media) != 0 { + t.Fatalf("expected 0 media (images use path tags), got %d", len(result[0].Media)) } - if !strings.HasPrefix(result[0].Media[0], "data:image/jpeg;base64,") { - t.Fatalf("expected jpeg prefix, got %q", result[0].Media[0][:30]) + localPath, _, _ := store.ResolveWithMeta(ref) + expectedContent := "hi [image:" + localPath + "]" + if result[0].Content != expectedContent { + t.Fatalf("expected content %q, got %q", expectedContent, result[0].Content) } } @@ -4885,6 +4992,98 @@ func TestResolveMediaRefs_NoGenericTagAppendsPath(t *testing.T) { } } +func TestInjectPathTags_HandlesVariousChannelPlaceholders(t *testing.T) { + cases := []struct { + name string + content string + tag string + want string + }{ + // Telegram / Feishu format + {"image_photo", "[image: photo]", "[image:/tmp/p.png]", "[image:/tmp/p.png]"}, + // WeCom / WeChat / Line format + {"bare_image", "[image]", "[image:/tmp/p.png]", "[image:/tmp/p.png]"}, + // QQ / Discord format with filename + {"image_filename", "[image: pic.jpg]", "[image:/tmp/p.png]", "[image:/tmp/p.png]"}, + {"audio_with_filename", "[audio: voice.m4a]", "[audio:/tmp/a.m4a]", "[audio:/tmp/a.m4a]"}, + {"bare_audio", "[audio]", "[audio:/tmp/a.m4a]", "[audio:/tmp/a.m4a]"}, + {"bare_video", "[video]", "[video:/tmp/v.mp4]", "[video:/tmp/v.mp4]"}, + {"bare_file", "[file]", "[file:/tmp/f.pdf]", "[file:/tmp/f.pdf]"}, + // Mixed surrounding text + { + "with_text", + "hello [image] world", + "[image:/tmp/p.png]", + "hello [image:/tmp/p.png] world", + }, + // No placeholder — append + {"no_placeholder", "hello world", "[image:/tmp/p.png]", "hello world [image:/tmp/p.png]"}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := injectPathTags(tc.content, []string{tc.tag}) + if got != tc.want { + t.Errorf("expected %q, got %q", tc.want, got) + } + }) + } +} + +func TestInjectPathTags_DoesNotReplacePathTag(t *testing.T) { + // If content already contains a path tag, we must not touch it. + content := "see [image:/already/placed.png] thanks" + got := injectPathTags(content, []string{"[image:/new/path.png]"}) + want := "see [image:/already/placed.png] thanks [image:/new/path.png]" + if got != want { + t.Fatalf("expected %q, got %q", want, got) + } +} + +func TestInjectPathTags_PrependsForJSONContent(t *testing.T) { + jsonContent := `{"schema":"2.0","body":{"elements":[{"tag":"img","img_key":"img_123"}]}}` + got := injectPathTags(jsonContent, []string{"[image:/tmp/photo.png]"}) + want := "[image:/tmp/photo.png]\n" + jsonContent + if got != want { + t.Fatalf("expected tag prepended to JSON, got %q", got) + } +} + +func TestInjectPathTags_BracketTextNotTreatedAsJSON(t *testing.T) { + content := "[update] see attached report" + got := injectPathTags(content, []string{"[file:/tmp/report.pdf]"}) + want := "[update] see attached report [file:/tmp/report.pdf]" + if got != want { + t.Fatalf("expected tag appended to bracket text, got %q", got) + } +} + +func TestResolveMediaRefs_JSONContentPrependsPathTag(t *testing.T) { + store := media.NewFileMediaStore() + dir := t.TempDir() + + pngPath := filepath.Join(dir, "card_img.png") + pngHeader := []byte{ + 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, + 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, + 0x00, 0x00, 0x00, 0x90, 0x77, 0x53, 0xDE, + } + os.WriteFile(pngPath, pngHeader, 0o644) + ref, _ := store.Store(pngPath, media.MediaMeta{ContentType: "image/png"}, "test") + + jsonContent := `{"schema":"2.0","body":{"elements":[{"tag":"img","img_key":"img_123"}]}}` + messages := []providers.Message{ + {Role: "user", Content: jsonContent, Media: []string{ref}}, + } + result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) + + want := "[image:" + pngPath + "]\n" + jsonContent + if result[0].Content != want { + t.Fatalf("expected path tag prepended to JSON content, got %q", result[0].Content) + } +} + func TestResolveMediaRefs_EmptyContentGetsPathTag(t *testing.T) { store := media.NewFileMediaStore() dir := t.TempDir() @@ -4928,13 +5127,12 @@ func TestResolveMediaRefs_MixedImageAndFile(t *testing.T) { } result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) - if len(result[0].Media) != 1 { - t.Fatalf("expected 1 media (image only), got %d", len(result[0].Media)) + if len(result[0].Media) != 0 { + t.Fatalf("expected 0 media (all types use path tags), got %d", len(result[0].Media)) } - if !strings.HasPrefix(result[0].Media[0], "data:image/png;base64,") { - t.Fatal("expected image to be base64 encoded") - } - expectedContent := "check these [file:" + pdfPath + "]" + imgLocalPath, _, _ := store.ResolveWithMeta(imgRef) + pdfLocalPath, _, _ := store.ResolveWithMeta(fileRef) + expectedContent := "check these [file:" + pdfLocalPath + "] [image:" + imgLocalPath + "]" if result[0].Content != expectedContent { t.Fatalf("expected content %q, got %q", expectedContent, result[0].Content) } diff --git a/pkg/agent/steering_test.go b/pkg/agent/steering_test.go index bba988672..1ff699976 100644 --- a/pkg/agent/steering_test.go +++ b/pkg/agent/steering_test.go @@ -1051,16 +1051,16 @@ func TestAgentLoop_Continue_PreservesSteeringMedia(t *testing.T) { foundResolvedMedia := false for _, msg := range msgs { - if msg.Role != "user" || msg.Content != "describe this image" || len(msg.Media) != 1 { + if msg.Role != "user" || !strings.Contains(msg.Content, "describe this image") { continue } - if strings.HasPrefix(msg.Media[0], "data:image/png;base64,") { + if strings.Contains(msg.Content, "[image:") { foundResolvedMedia = true break } } if !foundResolvedMedia { - t.Fatal("expected continue path to inject steering media into the provider request") + t.Fatal("expected continue path to inject image path tag into the provider request") } defaultAgent := al.registry.GetDefaultAgent() diff --git a/pkg/channels/feishu/common.go b/pkg/channels/feishu/common.go index 81238460a..95579df09 100644 --- a/pkg/channels/feishu/common.go +++ b/pkg/channels/feishu/common.go @@ -64,6 +64,62 @@ func extractJSONStringField(content, field string) string { // Format: {"image_key": "img_xxx"} func extractImageKey(content string) string { return extractJSONStringField(content, "image_key") } +// extractPostImageKeys extracts all image_key values from a Feishu post (rich text) +// message. Post messages have nested arrays of elements where images appear as +// {"tag":"img","image_key":"img_xxx"}. +func extractPostImageKeys(rawContent string) []string { + if rawContent == "" { + return nil + } + + var post map[string]json.RawMessage + if err := json.Unmarshal([]byte(rawContent), &post); err != nil { + return nil + } + + var keys []string + seen := make(map[string]struct{}) + + collectFromRows := func(contentRaw json.RawMessage) { + var rows [][]map[string]any + if err := json.Unmarshal(contentRaw, &rows); err != nil { + return + } + for _, row := range rows { + for _, elem := range row { + if tag, _ := elem["tag"].(string); tag == "img" { + if ik, _ := elem["image_key"].(string); ik != "" { + if _, dup := seen[ik]; !dup { + seen[ik] = struct{}{} + keys = append(keys, ik) + } + } + } + } + } + } + + // Flat format: {"title":"...", "content":[[...]]} + if contentRaw, ok := post["content"]; ok { + collectFromRows(contentRaw) + } + + // Localized format: {"zh_cn": {"title":"...", "content":[[...]]}, ...} + for _, raw := range post { + var locale map[string]json.RawMessage + if err := json.Unmarshal(raw, &locale); err != nil { + continue + } + contentRaw, ok := locale["content"] + if !ok { + continue + } + collectFromRows(contentRaw) + } + + return keys +} + // extractFileKey extracts the file_key from a Feishu file/audio message content JSON. // Format: {"file_key": "file_xxx", "file_name": "...", ...} func extractFileKey(content string) string { return extractJSONStringField(content, "file_key") } diff --git a/pkg/channels/feishu/common_test.go b/pkg/channels/feishu/common_test.go index ff4af0148..dcf7861a2 100644 --- a/pkg/channels/feishu/common_test.go +++ b/pkg/channels/feishu/common_test.go @@ -291,6 +291,100 @@ func TestStripMentionPlaceholders(t *testing.T) { } } +func TestExtractPostImageKeys(t *testing.T) { + tests := []struct { + name string + content string + want []string + }{ + { + name: "empty content", + content: "", + want: nil, + }, + { + name: "invalid JSON", + content: "not json", + want: nil, + }, + { + name: "post with no images", + content: `{"zh_cn":{"title":"Title","content":[[{"tag":"text","text":"hello"}]]}}`, + want: nil, + }, + { + name: "post with one image", + content: `{"zh_cn":{"title":"","content":[[{"tag":"img","image_key":"img_v3_001"}]]}}`, + want: []string{"img_v3_001"}, + }, + { + name: "post with multiple images", + content: `{"zh_cn":{"title":"","content":[[{"tag":"text","text":"see"},{"tag":"img","image_key":"img_001"}],[{"tag":"img","image_key":"img_002"}]]}}`, + want: []string{"img_001", "img_002"}, + }, + { + name: "post with text and image mixed in row", + content: `{"zh_cn":{"title":"","content":[[{"tag":"text","text":"hi"},{"tag":"img","image_key":"img_mix"}]]}}`, + want: []string{"img_mix"}, + }, + { + name: "en_us locale", + content: `{"en_us":{"title":"","content":[[{"tag":"img","image_key":"img_en"}]]}}`, + want: []string{"img_en"}, + }, + { + name: "multiple locales with distinct images", + content: `{"zh_cn":{"title":"","content":[[{"tag":"img","image_key":"img_zh"}]]},"en_us":{"title":"","content":[[{"tag":"img","image_key":"img_en"}]]}}`, + want: []string{"img_zh", "img_en"}, + }, + { + name: "duplicate image_key across locales is deduplicated", + content: `{"zh_cn":{"title":"","content":[[{"tag":"img","image_key":"img_same"}]]},"en_us":{"title":"","content":[[{"tag":"img","image_key":"img_same"}]]}}`, + want: []string{"img_same"}, + }, + { + name: "image with empty image_key", + content: `{"zh_cn":{"title":"","content":[[{"tag":"img","image_key":""}]]}}`, + want: nil, + }, + { + name: "flat format without locale wrapper", + content: `{"title":"","content":[[{"tag":"img","image_key":"img_v3_flat","width":1826,"height":338}],[{"tag":"text","text":" check this image","style":[]}]]}`, + want: []string{"img_v3_flat"}, + }, + { + name: "flat format multiple images", + content: `{"title":"","content":[[{"tag":"img","image_key":"img_flat_1"}],[{"tag":"img","image_key":"img_flat_2"},{"tag":"text","text":"desc"}]]}`, + want: []string{"img_flat_1", "img_flat_2"}, + }, + { + name: "flat format no images", + content: `{"title":"Test","content":[[{"tag":"text","text":"just text"}]]}`, + want: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := extractPostImageKeys(tt.content) + if len(got) != len(tt.want) { + t.Errorf("extractPostImageKeys() = %v, want %v", got, tt.want) + return + } + // Use set comparison to avoid map iteration order dependency + gotSet := make(map[string]bool, len(got)) + for _, v := range got { + gotSet[v] = true + } + for _, v := range tt.want { + if !gotSet[v] { + t.Errorf("extractPostImageKeys() missing expected key %q; got %v", v, got) + } + } + }) + } +} + func TestExtractCardImageKeys(t *testing.T) { tests := []struct { name string diff --git a/pkg/channels/feishu/feishu_64.go b/pkg/channels/feishu/feishu_64.go index 8f3ae39d9..d09c021c7 100644 --- a/pkg/channels/feishu/feishu_64.go +++ b/pkg/channels/feishu/feishu_64.go @@ -803,6 +803,14 @@ func (c *FeishuChannel) downloadInboundMedia( refs = append(refs, ref) } + case larkim.MsgTypePost: + for _, imageKey := range extractPostImageKeys(rawContent) { + ref := c.downloadResource(ctx, messageID, imageKey, "image", ".jpg", store, scope) + if ref != "" { + refs = append(refs, ref) + } + } + case larkim.MsgTypeInteractive: // Extract and download images embedded in interactive cards feishuKeys, _ := extractCardImageKeys(rawContent) @@ -842,12 +850,41 @@ func (c *FeishuChannel) downloadInboundMedia( // downloadResource downloads a message resource (image/file) from Feishu, // writes it to the project media directory, and stores the reference in MediaStore. // fallbackExt (e.g. ".jpg") is appended when the resolved filename has no extension. +// +// For image resources, if the primary MessageResource.Get API fails (which +// requires im:message or im:message:readonly scope), a fallback to the +// Image.Get API (which requires im:resource scope) is attempted. This ensures +// image downloads succeed regardless of which permission the user has granted. func (c *FeishuChannel) downloadResource( ctx context.Context, messageID, fileKey, resourceType, fallbackExt string, store media.MediaStore, scope string, ) string { + file, filename := c.fetchResourceData(ctx, messageID, fileKey, resourceType) + if file == nil { + return "" + } + if closer, ok := file.(io.Closer); ok { + defer closer.Close() + } + + if filename == "" { + filename = fileKey + } + if filepath.Ext(filename) == "" && fallbackExt != "" { + filename += fallbackExt + } + + return c.storeResourceFile(ctx, messageID, fileKey, filename, file, store, scope) +} + +// fetchResourceData tries to download a resource from Feishu, first via +// MessageResource.Get, then falling back to Image.Get for image resources. +func (c *FeishuChannel) fetchResourceData( + ctx context.Context, + messageID, fileKey, resourceType string, +) (io.Reader, string) { req := larkim.NewGetMessageResourceReqBuilder(). MessageId(messageID). FileKey(fileKey). @@ -855,41 +892,80 @@ func (c *FeishuChannel) downloadResource( Build() resp, err := c.client.Im.V1.MessageResource.Get(ctx, req) + if err == nil && resp.Success() && resp.File != nil { + return resp.File, resp.FileName + } + if err != nil { - logger.ErrorCF("feishu", "Failed to download resource", map[string]any{ + logger.WarnCF("feishu", "MessageResource.Get failed", map[string]any{ "message_id": messageID, "file_key": fileKey, "error": err.Error(), }) - return "" + } else if !resp.Success() { + c.invalidateTokenOnAuthError(resp.Code) + logger.WarnCF("feishu", "MessageResource.Get api error", map[string]any{ + "message_id": messageID, + "file_key": fileKey, + "code": resp.Code, + "msg": resp.Msg, + }) + } else { + logger.WarnCF("feishu", "MessageResource.Get returned empty file body", map[string]any{ + "message_id": messageID, + "file_key": fileKey, + }) + } + + if resourceType != "image" { + return nil, "" + } + + return c.fetchImageDirect(ctx, fileKey) +} + +// fetchImageDirect downloads an image using the Image.Get API +// (/open-apis/im/v1/images/:image_key), which requires the im:resource scope. +func (c *FeishuChannel) fetchImageDirect(ctx context.Context, imageKey string) (io.Reader, string) { + req := larkim.NewGetImageReqBuilder(). + ImageKey(imageKey). + Build() + + resp, err := c.client.Im.V1.Image.Get(ctx, req) + if err != nil { + logger.ErrorCF("feishu", "Image.Get fallback failed", map[string]any{ + "image_key": imageKey, + "error": err.Error(), + }) + return nil, "" } if !resp.Success() { c.invalidateTokenOnAuthError(resp.Code) - logger.ErrorCF("feishu", "Resource download api error", map[string]any{ - "code": resp.Code, - "msg": resp.Msg, + logger.ErrorCF("feishu", "Image.Get fallback api error", map[string]any{ + "image_key": imageKey, + "code": resp.Code, + "msg": resp.Msg, }) - return "" + return nil, "" } - if resp.File == nil { - return "" - } - // Safely close the underlying reader if it implements io.Closer (e.g. HTTP response body). - if closer, ok := resp.File.(io.Closer); ok { - defer closer.Close() + return nil, "" } - filename := resp.FileName - if filename == "" { - filename = fileKey - } - // If filename still has no extension, append the fallback (like Telegram's ext parameter). - if filepath.Ext(filename) == "" && fallbackExt != "" { - filename += fallbackExt - } + logger.DebugCF("feishu", "Image downloaded via Image.Get fallback", map[string]any{ + "image_key": imageKey, + }) + return resp.File, resp.FileName +} - // Write to the shared picoclaw_media directory using a unique name to avoid collisions. +// storeResourceFile writes downloaded resource data to disk and registers it in the MediaStore. +func (c *FeishuChannel) storeResourceFile( + ctx context.Context, + messageID, fileKey, filename string, + file io.Reader, + store media.MediaStore, + scope string, +) string { mediaDir := media.TempDir() if mkdirErr := os.MkdirAll(mediaDir, 0o700); mkdirErr != nil { logger.ErrorCF("feishu", "Failed to create media directory", map[string]any{ @@ -908,7 +984,7 @@ func (c *FeishuChannel) downloadResource( return "" } - if _, copyErr := io.Copy(out, resp.File); copyErr != nil { + if _, copyErr := io.Copy(out, file); copyErr != nil { out.Close() os.Remove(localPath) logger.ErrorCF("feishu", "Failed to write resource to file", map[string]any{ @@ -943,8 +1019,8 @@ func appendMediaTags(content, messageType string, mediaRefs []string) string { return content } - // Don't append tags to JSON content (interactive cards) - would produce invalid JSON - if messageType == larkim.MsgTypeInteractive { + // Don't append tags to JSON content - would produce invalid JSON + if messageType == larkim.MsgTypeInteractive || messageType == larkim.MsgTypePost { return content } diff --git a/pkg/channels/feishu/feishu_64_test.go b/pkg/channels/feishu/feishu_64_test.go index 48fdf0f74..d256325ad 100644 --- a/pkg/channels/feishu/feishu_64_test.go +++ b/pkg/channels/feishu/feishu_64_test.go @@ -180,6 +180,13 @@ func TestAppendMediaTags(t *testing.T) { mediaRefs: []string{"ref1"}, want: `{"schema":"2.0","body":{"elements":[{"tag":"img","img_key":"img_123"}]}}`, }, + { + name: "post message with images returns content unchanged", + content: `{"zh_cn":{"title":"","content":[[{"tag":"img","image_key":"img_001"}]]}}`, + messageType: "post", + mediaRefs: []string{"ref1"}, + want: `{"zh_cn":{"title":"","content":[[{"tag":"img","image_key":"img_001"}]]}}`, + }, } for _, tt := range tests { diff --git a/pkg/tools/fs/load_image.go b/pkg/tools/fs/load_image.go index 6f612faea..0a67fa120 100644 --- a/pkg/tools/fs/load_image.go +++ b/pkg/tools/fs/load_image.go @@ -147,10 +147,10 @@ func (t *LoadImageTool) Execute(ctx context.Context, args map[string]any) *ToolR return ErrorResult(fmt.Sprintf("failed to register image in media store: %v", err)) } - // Build the tool result text. The media:// ref will be picked up by - // resolveMediaRefs in loop_media.go and converted to a base64 data URL - // before the next LLM call, exactly like channel-received images. - msg := fmt.Sprintf("Image loaded: %s\n[image: %s]", filename, ref) + // Build the tool result text. The media:// ref in Media will be picked + // up by resolveMediaRefs in agent_media.go and base64-encoded for tool + // result messages (role="tool"), so the LLM can see the image content. + msg := fmt.Sprintf("Image loaded: %s\n[image: photo]", filename) return &ToolResult{ ForLLM: msg, diff --git a/pkg/tools/fs/load_image_test.go b/pkg/tools/fs/load_image_test.go index 72f163d81..d33db73be 100644 --- a/pkg/tools/fs/load_image_test.go +++ b/pkg/tools/fs/load_image_test.go @@ -135,9 +135,10 @@ func TestLoadImage_SuccessPath(t *testing.T) { t.Errorf("expected ForLLM to contain '[image:' marker, got: %s", result.ForLLM) } - // 4. ForLLM should also contain the media:// ref - if !strings.Contains(result.ForLLM, result.Media[0]) { - t.Errorf("expected ForLLM to contain media ref %q, got: %s", result.Media[0], result.ForLLM) + // 4. ForLLM should contain the generic [image: photo] placeholder + // (resolveMediaRefs will replace it with the actual path later) + if !strings.Contains(result.ForLLM, "[image: photo]") { + t.Errorf("expected ForLLM to contain '[image: photo]' placeholder, got: %s", result.ForLLM) } // 5. Verify the ref is resolvable in the store