diff --git a/pkg/agent/agent_media.go b/pkg/agent/agent_media.go index a773d2ebb..c02c7392c 100644 --- a/pkg/agent/agent_media.go +++ b/pkg/agent/agent_media.go @@ -11,6 +11,7 @@ import ( "encoding/base64" "io" "os" + "regexp" "strings" "github.com/h2non/filetype" @@ -20,24 +21,59 @@ import ( "github.com/sipeed/picoclaw/pkg/providers" ) +// genericPlaceholderRegex matches generic media placeholders emitted by various +// channels: [image], [image: photo], [image: filename.jpg] — but NOT path tags +// like [image:/path/to/file] (path tags have no space after the colon). +var ( + imagePlaceholderRegex = regexp.MustCompile(`\[image(:\s+[^\]]*)?\]`) + audioPlaceholderRegex = regexp.MustCompile(`\[audio(:\s+[^\]]*)?\]`) + videoPlaceholderRegex = regexp.MustCompile(`\[video(:\s+[^\]]*)?\]`) + filePlaceholderRegex = regexp.MustCompile(`\[file(:\s+[^\]]*)?\]`) +) + // resolveMediaRefs resolves media:// refs in messages. -// Images are base64-encoded into the Media array for multimodal LLMs. -// Non-image files (documents, audio, video) have their local path injected -// into Content so the agent can access them via file tools like read_file. +// For user messages: images get path tags only ([image:/path]) so the LLM +// can decide whether to view them via load_image or operate on the file. +// For tool messages: images are base64-encoded and appended as a synthetic +// user message only after the contiguous tool-message block ends, so we don't +// break the tool-results-must-immediately-follow-assistant constraint that +// LLM APIs enforce. +// Non-image files always get path tags regardless of role. // Returns a new slice; original messages are not mutated. func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxSize int) []providers.Message { if store == nil { return messages } - result := make([]providers.Message, len(messages)) - copy(result, messages) + result := make([]providers.Message, 0, len(messages)) + var pendingToolImages []string + + for idx, m := range messages { + // When leaving a tool-message block, flush any accumulated images + // as a synthetic user message. + if m.Role != "tool" && len(pendingToolImages) > 0 { + result = append(result, providers.Message{ + Role: "user", + Content: "[Loaded image from tool result above]", + Media: pendingToolImages, + }) + pendingToolImages = nil + } - for i, m := range result { if len(m.Media) == 0 { + result = append(result, m) + if idx == len(messages)-1 && len(pendingToolImages) > 0 { + result = append(result, providers.Message{ + Role: "user", + Content: "[Loaded image from tool result above]", + Media: pendingToolImages, + }) + pendingToolImages = nil + } continue } + msg := m resolved := make([]string, 0, len(m.Media)) var pathTags []string @@ -66,27 +102,77 @@ func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxS } mime := detectMIME(localPath, meta) + pathTags = append(pathTags, buildPathTag(mime, localPath)) - if strings.HasPrefix(mime, "image/") { + if m.Role == "tool" && strings.HasPrefix(mime, "image/") { dataURL := encodeImageToDataURL(localPath, mime, info, maxSize) if dataURL != "" { - resolved = append(resolved, dataURL) + pendingToolImages = append(pendingToolImages, dataURL) } - continue } - - pathTags = append(pathTags, buildPathTag(mime, localPath)) } - result[i].Media = resolved + msg.Media = resolved if len(pathTags) > 0 { - result[i].Content = injectPathTags(result[i].Content, pathTags) + msg.Content = injectPathTags(msg.Content, pathTags) + } + result = append(result, msg) + + // If this is the last message and we have pending images, flush them. + if idx == len(messages)-1 && len(pendingToolImages) > 0 { + result = append(result, providers.Message{ + Role: "user", + Content: "[Loaded image from tool result above]", + Media: pendingToolImages, + }) + pendingToolImages = nil } } return result } +// encodeImageToDataURL base64-encodes an image file into a data URL. +// Returns empty string if the file exceeds maxSize or encoding fails. +func encodeImageToDataURL(localPath, mime string, info os.FileInfo, maxSize int) string { + if info.Size() > int64(maxSize) { + logger.WarnCF("agent", "Media file too large, skipping", map[string]any{ + "path": localPath, + "size": info.Size(), + "max_size": maxSize, + }) + return "" + } + + f, err := os.Open(localPath) + if err != nil { + logger.WarnCF("agent", "Failed to open media file", map[string]any{ + "path": localPath, + "error": err.Error(), + }) + return "" + } + defer f.Close() + + prefix := "data:" + mime + ";base64," + encodedLen := base64.StdEncoding.EncodedLen(int(info.Size())) + var buf bytes.Buffer + buf.Grow(len(prefix) + encodedLen) + buf.WriteString(prefix) + + encoder := base64.NewEncoder(base64.StdEncoding, &buf) + if _, err := io.Copy(encoder, f); err != nil { + logger.WarnCF("agent", "Failed to encode media file", map[string]any{ + "path": localPath, + "error": err.Error(), + }) + return "" + } + encoder.Close() + + return buf.String() +} + func buildArtifactTags(store media.MediaStore, refs []string) []string { if store == nil || len(refs) == 0 { return nil @@ -137,51 +223,12 @@ func detectMIME(localPath string, meta media.MediaMeta) string { return kind.MIME.Value } -// encodeImageToDataURL base64-encodes an image file into a data URL. -// Returns empty string if the file exceeds maxSize or encoding fails. -func encodeImageToDataURL(localPath, mime string, info os.FileInfo, maxSize int) string { - if info.Size() > int64(maxSize) { - logger.WarnCF("agent", "Media file too large, skipping", map[string]any{ - "path": localPath, - "size": info.Size(), - "max_size": maxSize, - }) - return "" - } - - f, err := os.Open(localPath) - if err != nil { - logger.WarnCF("agent", "Failed to open media file", map[string]any{ - "path": localPath, - "error": err.Error(), - }) - return "" - } - defer f.Close() - - prefix := "data:" + mime + ";base64," - encodedLen := base64.StdEncoding.EncodedLen(int(info.Size())) - var buf bytes.Buffer - buf.Grow(len(prefix) + encodedLen) - buf.WriteString(prefix) - - encoder := base64.NewEncoder(base64.StdEncoding, &buf) - if _, err := io.Copy(encoder, f); err != nil { - logger.WarnCF("agent", "Failed to encode media file", map[string]any{ - "path": localPath, - "error": err.Error(), - }) - return "" - } - encoder.Close() - - return buf.String() -} - // buildPathTag creates a structured tag exposing the local file path. -// Tag type is derived from MIME: [audio:/path], [video:/path], or [file:/path]. +// Tag type is derived from MIME: [image:/path], [audio:/path], [video:/path], or [file:/path]. func buildPathTag(mime, localPath string) string { switch { + case strings.HasPrefix(mime, "image/"): + return "[image:" + localPath + "]" case strings.HasPrefix(mime, "audio/"): return "[audio:" + localPath + "]" case strings.HasPrefix(mime, "video/"): @@ -192,22 +239,41 @@ func buildPathTag(mime, localPath string) string { } // injectPathTags replaces generic media tags in content with path-bearing versions, -// or appends if no matching generic tag is found. +// or appends if no matching generic tag is found. Channels emit a few different +// placeholder formats — [image], [image: photo], [image: filename.jpg] — so we +// match all of them via regex while leaving path tags ([image:/path]) untouched. +// +// When content is structured data (e.g., JSON from Feishu interactive cards or +// post messages), tags are only injected via placeholder replacement — never +// appended — to avoid corrupting the payload. func injectPathTags(content string, tags []string) string { + isStructured := looksLikeJSON(content) for _, tag := range tags { - var generic string + var pattern *regexp.Regexp switch { + case strings.HasPrefix(tag, "[image:"): + pattern = imagePlaceholderRegex case strings.HasPrefix(tag, "[audio:"): - generic = "[audio]" + pattern = audioPlaceholderRegex case strings.HasPrefix(tag, "[video:"): - generic = "[video]" + pattern = videoPlaceholderRegex case strings.HasPrefix(tag, "[file:"): - generic = "[file]" + pattern = filePlaceholderRegex } - if generic != "" && strings.Contains(content, generic) { - content = strings.Replace(content, generic, tag, 1) - } else if content == "" { + if pattern != nil { + if loc := pattern.FindStringIndex(content); loc != nil { + content = content[:loc[0]] + tag + content[loc[1]:] + continue + } + } + + if isStructured { + content = tag + "\n" + content + continue + } + + if content == "" { content = tag } else { content += " " + tag @@ -215,3 +281,8 @@ func injectPathTags(content string, tags []string) string { } return content } + +func looksLikeJSON(s string) bool { + s = strings.TrimSpace(s) + return len(s) > 1 && s[0] == '{' +} diff --git a/pkg/agent/agent_test.go b/pkg/agent/agent_test.go index 4047ab74d..f326e2acb 100644 --- a/pkg/agent/agent_test.go +++ b/pkg/agent/agent_test.go @@ -1781,17 +1781,22 @@ func (m *artifactThenSendProvider) Chat( if messages[i].Role != "tool" { continue } - start := strings.Index(messages[i].Content, "[file:") - if start < 0 { - continue + for _, prefix := range []string{"[image:", "[file:", "[audio:", "[video:"} { + start := strings.Index(messages[i].Content, prefix) + if start < 0 { + continue + } + rest := messages[i].Content[start+len(prefix):] + end := strings.Index(rest, "]") + if end < 0 { + continue + } + artifactPath = rest[:end] + break } - rest := messages[i].Content[start+len("[file:"):] - end := strings.Index(rest, "]") - if end < 0 { - continue + if artifactPath != "" { + break } - artifactPath = rest[:end] - break } if artifactPath == "" { return nil, fmt.Errorf("provider did not receive artifact path in tool result") @@ -4656,7 +4661,7 @@ func TestRun_PicoToolFeedbackSuppressesDuplicateInterimAssistantContent(t *testi } } -func TestResolveMediaRefs_ResolvesToBase64(t *testing.T) { +func TestResolveMediaRefs_ImageInjectsPathTag(t *testing.T) { store := media.NewFileMediaStore() dir := t.TempDir() @@ -4684,15 +4689,110 @@ func TestResolveMediaRefs_ResolvesToBase64(t *testing.T) { } result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) - if len(result[0].Media) != 1 { - t.Fatalf("expected 1 resolved media, got %d", len(result[0].Media)) + if len(result[0].Media) != 0 { + t.Fatalf("expected 0 media (images use path tags), got %d", len(result[0].Media)) } - if !strings.HasPrefix(result[0].Media[0], "data:image/png;base64,") { - t.Fatalf("expected data:image/png;base64, prefix, got %q", result[0].Media[0][:40]) + localPath, _, _ := store.ResolveWithMeta(ref) + expectedContent := "describe this [image:" + localPath + "]" + if result[0].Content != expectedContent { + t.Fatalf("expected content %q, got %q", expectedContent, result[0].Content) } } -func TestResolveMediaRefs_SkipsOversizedFile(t *testing.T) { +func TestResolveMediaRefs_ToolRoleImageAppendedAsUserMessage(t *testing.T) { + store := media.NewFileMediaStore() + dir := t.TempDir() + + pngPath := filepath.Join(dir, "tool-result.png") + pngHeader := []byte{ + 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature + 0x00, 0x00, 0x00, 0x0D, // IHDR length + 0x49, 0x48, 0x44, 0x52, // "IHDR" + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, // 1x1 RGB + 0x00, 0x00, 0x00, // no interlace + 0x90, 0x77, 0x53, 0xDE, // CRC + } + if err := os.WriteFile(pngPath, pngHeader, 0o644); err != nil { + t.Fatal(err) + } + ref, _ := store.Store(pngPath, media.MediaMeta{}, "test") + + messages := []providers.Message{ + {Role: "tool", Content: "Image loaded", Media: []string{ref}}, + } + result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) + + // Tool message should have path tag but no base64 + if len(result[0].Media) != 0 { + t.Fatalf("expected 0 media in tool message, got %d", len(result[0].Media)) + } + localPath, _, _ := store.ResolveWithMeta(ref) + if !strings.Contains(result[0].Content, "[image:"+localPath+"]") { + t.Fatalf("expected image path tag in tool content, got %q", result[0].Content) + } + + // A synthetic user message with base64 should follow + if len(result) != 2 { + t.Fatalf("expected 2 messages (tool + synthetic user), got %d", len(result)) + } + if result[1].Role != "user" { + t.Fatalf("expected synthetic message role=user, got %q", result[1].Role) + } + if len(result[1].Media) != 1 { + t.Fatalf("expected 1 base64 media in synthetic user message, got %d", len(result[1].Media)) + } + if !strings.HasPrefix(result[1].Media[0], "data:image/png;base64,") { + t.Fatalf("expected data:image/png;base64, prefix, got %q", result[1].Media[0][:40]) + } +} + +func TestResolveMediaRefs_MultiToolCallPreservesOrdering(t *testing.T) { + store := media.NewFileMediaStore() + dir := t.TempDir() + + // Create image for tool #1 + pngPath := filepath.Join(dir, "loaded.png") + pngHeader := []byte{ + 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature + 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, // IHDR + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, + 0x00, 0x00, 0x00, 0x90, 0x77, 0x53, 0xDE, + } + os.WriteFile(pngPath, pngHeader, 0o644) + imgRef, _ := store.Store(pngPath, media.MediaMeta{}, "test") + + // Simulate: assistant called load_image + read_file, two tool results follow + messages := []providers.Message{ + {Role: "assistant", Content: "Let me load the image and read the file."}, + {Role: "tool", Content: "Image loaded [image: photo]", Media: []string{imgRef}}, + {Role: "tool", Content: "file contents here"}, + } + result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) + + // assistant, tool#1, tool#2 must remain contiguous — no user in between + if result[0].Role != "assistant" { + t.Fatalf("result[0] expected assistant, got %q", result[0].Role) + } + if result[1].Role != "tool" { + t.Fatalf("result[1] expected tool, got %q", result[1].Role) + } + if result[2].Role != "tool" { + t.Fatalf("result[2] expected tool, got %q", result[2].Role) + } + + // Synthetic user message should come AFTER the tool block + if len(result) != 4 { + t.Fatalf("expected 4 messages (assistant + 2 tool + synthetic user), got %d", len(result)) + } + if result[3].Role != "user" { + t.Fatalf("result[3] expected user, got %q", result[3].Role) + } + if len(result[3].Media) != 1 || !strings.HasPrefix(result[3].Media[0], "data:image/png;base64,") { + t.Fatal("expected synthetic user message to contain base64 image") + } +} + +func TestResolveMediaRefs_OversizedImageSkipsBase64KeepsPathTag(t *testing.T) { store := media.NewFileMediaStore() dir := t.TempDir() @@ -4714,6 +4814,11 @@ func TestResolveMediaRefs_SkipsOversizedFile(t *testing.T) { if len(result[0].Media) != 0 { t.Fatalf("expected 0 media (oversized), got %d", len(result[0].Media)) } + localPath, _, _ := store.ResolveWithMeta(ref) + expected := "hi [image:" + localPath + "]" + if result[0].Content != expected { + t.Fatalf("expected content %q, got %q", expected, result[0].Content) + } } func TestResolveMediaRefs_UnknownTypeInjectsPath(t *testing.T) { @@ -4791,11 +4896,13 @@ func TestResolveMediaRefs_UsesMetaContentType(t *testing.T) { } result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) - if len(result[0].Media) != 1 { - t.Fatalf("expected 1 media, got %d", len(result[0].Media)) + if len(result[0].Media) != 0 { + t.Fatalf("expected 0 media (images use path tags), got %d", len(result[0].Media)) } - if !strings.HasPrefix(result[0].Media[0], "data:image/jpeg;base64,") { - t.Fatalf("expected jpeg prefix, got %q", result[0].Media[0][:30]) + localPath, _, _ := store.ResolveWithMeta(ref) + expectedContent := "hi [image:" + localPath + "]" + if result[0].Content != expectedContent { + t.Fatalf("expected content %q, got %q", expectedContent, result[0].Content) } } @@ -4885,6 +4992,98 @@ func TestResolveMediaRefs_NoGenericTagAppendsPath(t *testing.T) { } } +func TestInjectPathTags_HandlesVariousChannelPlaceholders(t *testing.T) { + cases := []struct { + name string + content string + tag string + want string + }{ + // Telegram / Feishu format + {"image_photo", "[image: photo]", "[image:/tmp/p.png]", "[image:/tmp/p.png]"}, + // WeCom / WeChat / Line format + {"bare_image", "[image]", "[image:/tmp/p.png]", "[image:/tmp/p.png]"}, + // QQ / Discord format with filename + {"image_filename", "[image: pic.jpg]", "[image:/tmp/p.png]", "[image:/tmp/p.png]"}, + {"audio_with_filename", "[audio: voice.m4a]", "[audio:/tmp/a.m4a]", "[audio:/tmp/a.m4a]"}, + {"bare_audio", "[audio]", "[audio:/tmp/a.m4a]", "[audio:/tmp/a.m4a]"}, + {"bare_video", "[video]", "[video:/tmp/v.mp4]", "[video:/tmp/v.mp4]"}, + {"bare_file", "[file]", "[file:/tmp/f.pdf]", "[file:/tmp/f.pdf]"}, + // Mixed surrounding text + { + "with_text", + "hello [image] world", + "[image:/tmp/p.png]", + "hello [image:/tmp/p.png] world", + }, + // No placeholder — append + {"no_placeholder", "hello world", "[image:/tmp/p.png]", "hello world [image:/tmp/p.png]"}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := injectPathTags(tc.content, []string{tc.tag}) + if got != tc.want { + t.Errorf("expected %q, got %q", tc.want, got) + } + }) + } +} + +func TestInjectPathTags_DoesNotReplacePathTag(t *testing.T) { + // If content already contains a path tag, we must not touch it. + content := "see [image:/already/placed.png] thanks" + got := injectPathTags(content, []string{"[image:/new/path.png]"}) + want := "see [image:/already/placed.png] thanks [image:/new/path.png]" + if got != want { + t.Fatalf("expected %q, got %q", want, got) + } +} + +func TestInjectPathTags_PrependsForJSONContent(t *testing.T) { + jsonContent := `{"schema":"2.0","body":{"elements":[{"tag":"img","img_key":"img_123"}]}}` + got := injectPathTags(jsonContent, []string{"[image:/tmp/photo.png]"}) + want := "[image:/tmp/photo.png]\n" + jsonContent + if got != want { + t.Fatalf("expected tag prepended to JSON, got %q", got) + } +} + +func TestInjectPathTags_BracketTextNotTreatedAsJSON(t *testing.T) { + content := "[update] see attached report" + got := injectPathTags(content, []string{"[file:/tmp/report.pdf]"}) + want := "[update] see attached report [file:/tmp/report.pdf]" + if got != want { + t.Fatalf("expected tag appended to bracket text, got %q", got) + } +} + +func TestResolveMediaRefs_JSONContentPrependsPathTag(t *testing.T) { + store := media.NewFileMediaStore() + dir := t.TempDir() + + pngPath := filepath.Join(dir, "card_img.png") + pngHeader := []byte{ + 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, + 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, + 0x00, 0x00, 0x00, 0x90, 0x77, 0x53, 0xDE, + } + os.WriteFile(pngPath, pngHeader, 0o644) + ref, _ := store.Store(pngPath, media.MediaMeta{ContentType: "image/png"}, "test") + + jsonContent := `{"schema":"2.0","body":{"elements":[{"tag":"img","img_key":"img_123"}]}}` + messages := []providers.Message{ + {Role: "user", Content: jsonContent, Media: []string{ref}}, + } + result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) + + want := "[image:" + pngPath + "]\n" + jsonContent + if result[0].Content != want { + t.Fatalf("expected path tag prepended to JSON content, got %q", result[0].Content) + } +} + func TestResolveMediaRefs_EmptyContentGetsPathTag(t *testing.T) { store := media.NewFileMediaStore() dir := t.TempDir() @@ -4928,13 +5127,12 @@ func TestResolveMediaRefs_MixedImageAndFile(t *testing.T) { } result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) - if len(result[0].Media) != 1 { - t.Fatalf("expected 1 media (image only), got %d", len(result[0].Media)) + if len(result[0].Media) != 0 { + t.Fatalf("expected 0 media (all types use path tags), got %d", len(result[0].Media)) } - if !strings.HasPrefix(result[0].Media[0], "data:image/png;base64,") { - t.Fatal("expected image to be base64 encoded") - } - expectedContent := "check these [file:" + pdfPath + "]" + imgLocalPath, _, _ := store.ResolveWithMeta(imgRef) + pdfLocalPath, _, _ := store.ResolveWithMeta(fileRef) + expectedContent := "check these [file:" + pdfLocalPath + "] [image:" + imgLocalPath + "]" if result[0].Content != expectedContent { t.Fatalf("expected content %q, got %q", expectedContent, result[0].Content) } diff --git a/pkg/agent/steering_test.go b/pkg/agent/steering_test.go index bba988672..1ff699976 100644 --- a/pkg/agent/steering_test.go +++ b/pkg/agent/steering_test.go @@ -1051,16 +1051,16 @@ func TestAgentLoop_Continue_PreservesSteeringMedia(t *testing.T) { foundResolvedMedia := false for _, msg := range msgs { - if msg.Role != "user" || msg.Content != "describe this image" || len(msg.Media) != 1 { + if msg.Role != "user" || !strings.Contains(msg.Content, "describe this image") { continue } - if strings.HasPrefix(msg.Media[0], "data:image/png;base64,") { + if strings.Contains(msg.Content, "[image:") { foundResolvedMedia = true break } } if !foundResolvedMedia { - t.Fatal("expected continue path to inject steering media into the provider request") + t.Fatal("expected continue path to inject image path tag into the provider request") } defaultAgent := al.registry.GetDefaultAgent() diff --git a/pkg/channels/feishu/common.go b/pkg/channels/feishu/common.go index 81238460a..95579df09 100644 --- a/pkg/channels/feishu/common.go +++ b/pkg/channels/feishu/common.go @@ -64,6 +64,62 @@ func extractJSONStringField(content, field string) string { // Format: {"image_key": "img_xxx"} func extractImageKey(content string) string { return extractJSONStringField(content, "image_key") } +// extractPostImageKeys extracts all image_key values from a Feishu post (rich text) +// message. Post messages have nested arrays of elements where images appear as +// {"tag":"img","image_key":"img_xxx"}. +func extractPostImageKeys(rawContent string) []string { + if rawContent == "" { + return nil + } + + var post map[string]json.RawMessage + if err := json.Unmarshal([]byte(rawContent), &post); err != nil { + return nil + } + + var keys []string + seen := make(map[string]struct{}) + + collectFromRows := func(contentRaw json.RawMessage) { + var rows [][]map[string]any + if err := json.Unmarshal(contentRaw, &rows); err != nil { + return + } + for _, row := range rows { + for _, elem := range row { + if tag, _ := elem["tag"].(string); tag == "img" { + if ik, _ := elem["image_key"].(string); ik != "" { + if _, dup := seen[ik]; !dup { + seen[ik] = struct{}{} + keys = append(keys, ik) + } + } + } + } + } + } + + // Flat format: {"title":"...", "content":[[...]]} + if contentRaw, ok := post["content"]; ok { + collectFromRows(contentRaw) + } + + // Localized format: {"zh_cn": {"title":"...", "content":[[...]]}, ...} + for _, raw := range post { + var locale map[string]json.RawMessage + if err := json.Unmarshal(raw, &locale); err != nil { + continue + } + contentRaw, ok := locale["content"] + if !ok { + continue + } + collectFromRows(contentRaw) + } + + return keys +} + // extractFileKey extracts the file_key from a Feishu file/audio message content JSON. // Format: {"file_key": "file_xxx", "file_name": "...", ...} func extractFileKey(content string) string { return extractJSONStringField(content, "file_key") } diff --git a/pkg/channels/feishu/common_test.go b/pkg/channels/feishu/common_test.go index ff4af0148..dcf7861a2 100644 --- a/pkg/channels/feishu/common_test.go +++ b/pkg/channels/feishu/common_test.go @@ -291,6 +291,100 @@ func TestStripMentionPlaceholders(t *testing.T) { } } +func TestExtractPostImageKeys(t *testing.T) { + tests := []struct { + name string + content string + want []string + }{ + { + name: "empty content", + content: "", + want: nil, + }, + { + name: "invalid JSON", + content: "not json", + want: nil, + }, + { + name: "post with no images", + content: `{"zh_cn":{"title":"Title","content":[[{"tag":"text","text":"hello"}]]}}`, + want: nil, + }, + { + name: "post with one image", + content: `{"zh_cn":{"title":"","content":[[{"tag":"img","image_key":"img_v3_001"}]]}}`, + want: []string{"img_v3_001"}, + }, + { + name: "post with multiple images", + content: `{"zh_cn":{"title":"","content":[[{"tag":"text","text":"see"},{"tag":"img","image_key":"img_001"}],[{"tag":"img","image_key":"img_002"}]]}}`, + want: []string{"img_001", "img_002"}, + }, + { + name: "post with text and image mixed in row", + content: `{"zh_cn":{"title":"","content":[[{"tag":"text","text":"hi"},{"tag":"img","image_key":"img_mix"}]]}}`, + want: []string{"img_mix"}, + }, + { + name: "en_us locale", + content: `{"en_us":{"title":"","content":[[{"tag":"img","image_key":"img_en"}]]}}`, + want: []string{"img_en"}, + }, + { + name: "multiple locales with distinct images", + content: `{"zh_cn":{"title":"","content":[[{"tag":"img","image_key":"img_zh"}]]},"en_us":{"title":"","content":[[{"tag":"img","image_key":"img_en"}]]}}`, + want: []string{"img_zh", "img_en"}, + }, + { + name: "duplicate image_key across locales is deduplicated", + content: `{"zh_cn":{"title":"","content":[[{"tag":"img","image_key":"img_same"}]]},"en_us":{"title":"","content":[[{"tag":"img","image_key":"img_same"}]]}}`, + want: []string{"img_same"}, + }, + { + name: "image with empty image_key", + content: `{"zh_cn":{"title":"","content":[[{"tag":"img","image_key":""}]]}}`, + want: nil, + }, + { + name: "flat format without locale wrapper", + content: `{"title":"","content":[[{"tag":"img","image_key":"img_v3_flat","width":1826,"height":338}],[{"tag":"text","text":" check this image","style":[]}]]}`, + want: []string{"img_v3_flat"}, + }, + { + name: "flat format multiple images", + content: `{"title":"","content":[[{"tag":"img","image_key":"img_flat_1"}],[{"tag":"img","image_key":"img_flat_2"},{"tag":"text","text":"desc"}]]}`, + want: []string{"img_flat_1", "img_flat_2"}, + }, + { + name: "flat format no images", + content: `{"title":"Test","content":[[{"tag":"text","text":"just text"}]]}`, + want: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := extractPostImageKeys(tt.content) + if len(got) != len(tt.want) { + t.Errorf("extractPostImageKeys() = %v, want %v", got, tt.want) + return + } + // Use set comparison to avoid map iteration order dependency + gotSet := make(map[string]bool, len(got)) + for _, v := range got { + gotSet[v] = true + } + for _, v := range tt.want { + if !gotSet[v] { + t.Errorf("extractPostImageKeys() missing expected key %q; got %v", v, got) + } + } + }) + } +} + func TestExtractCardImageKeys(t *testing.T) { tests := []struct { name string diff --git a/pkg/channels/feishu/feishu_64.go b/pkg/channels/feishu/feishu_64.go index 8f3ae39d9..d09c021c7 100644 --- a/pkg/channels/feishu/feishu_64.go +++ b/pkg/channels/feishu/feishu_64.go @@ -803,6 +803,14 @@ func (c *FeishuChannel) downloadInboundMedia( refs = append(refs, ref) } + case larkim.MsgTypePost: + for _, imageKey := range extractPostImageKeys(rawContent) { + ref := c.downloadResource(ctx, messageID, imageKey, "image", ".jpg", store, scope) + if ref != "" { + refs = append(refs, ref) + } + } + case larkim.MsgTypeInteractive: // Extract and download images embedded in interactive cards feishuKeys, _ := extractCardImageKeys(rawContent) @@ -842,12 +850,41 @@ func (c *FeishuChannel) downloadInboundMedia( // downloadResource downloads a message resource (image/file) from Feishu, // writes it to the project media directory, and stores the reference in MediaStore. // fallbackExt (e.g. ".jpg") is appended when the resolved filename has no extension. +// +// For image resources, if the primary MessageResource.Get API fails (which +// requires im:message or im:message:readonly scope), a fallback to the +// Image.Get API (which requires im:resource scope) is attempted. This ensures +// image downloads succeed regardless of which permission the user has granted. func (c *FeishuChannel) downloadResource( ctx context.Context, messageID, fileKey, resourceType, fallbackExt string, store media.MediaStore, scope string, ) string { + file, filename := c.fetchResourceData(ctx, messageID, fileKey, resourceType) + if file == nil { + return "" + } + if closer, ok := file.(io.Closer); ok { + defer closer.Close() + } + + if filename == "" { + filename = fileKey + } + if filepath.Ext(filename) == "" && fallbackExt != "" { + filename += fallbackExt + } + + return c.storeResourceFile(ctx, messageID, fileKey, filename, file, store, scope) +} + +// fetchResourceData tries to download a resource from Feishu, first via +// MessageResource.Get, then falling back to Image.Get for image resources. +func (c *FeishuChannel) fetchResourceData( + ctx context.Context, + messageID, fileKey, resourceType string, +) (io.Reader, string) { req := larkim.NewGetMessageResourceReqBuilder(). MessageId(messageID). FileKey(fileKey). @@ -855,41 +892,80 @@ func (c *FeishuChannel) downloadResource( Build() resp, err := c.client.Im.V1.MessageResource.Get(ctx, req) + if err == nil && resp.Success() && resp.File != nil { + return resp.File, resp.FileName + } + if err != nil { - logger.ErrorCF("feishu", "Failed to download resource", map[string]any{ + logger.WarnCF("feishu", "MessageResource.Get failed", map[string]any{ "message_id": messageID, "file_key": fileKey, "error": err.Error(), }) - return "" + } else if !resp.Success() { + c.invalidateTokenOnAuthError(resp.Code) + logger.WarnCF("feishu", "MessageResource.Get api error", map[string]any{ + "message_id": messageID, + "file_key": fileKey, + "code": resp.Code, + "msg": resp.Msg, + }) + } else { + logger.WarnCF("feishu", "MessageResource.Get returned empty file body", map[string]any{ + "message_id": messageID, + "file_key": fileKey, + }) + } + + if resourceType != "image" { + return nil, "" + } + + return c.fetchImageDirect(ctx, fileKey) +} + +// fetchImageDirect downloads an image using the Image.Get API +// (/open-apis/im/v1/images/:image_key), which requires the im:resource scope. +func (c *FeishuChannel) fetchImageDirect(ctx context.Context, imageKey string) (io.Reader, string) { + req := larkim.NewGetImageReqBuilder(). + ImageKey(imageKey). + Build() + + resp, err := c.client.Im.V1.Image.Get(ctx, req) + if err != nil { + logger.ErrorCF("feishu", "Image.Get fallback failed", map[string]any{ + "image_key": imageKey, + "error": err.Error(), + }) + return nil, "" } if !resp.Success() { c.invalidateTokenOnAuthError(resp.Code) - logger.ErrorCF("feishu", "Resource download api error", map[string]any{ - "code": resp.Code, - "msg": resp.Msg, + logger.ErrorCF("feishu", "Image.Get fallback api error", map[string]any{ + "image_key": imageKey, + "code": resp.Code, + "msg": resp.Msg, }) - return "" + return nil, "" } - if resp.File == nil { - return "" - } - // Safely close the underlying reader if it implements io.Closer (e.g. HTTP response body). - if closer, ok := resp.File.(io.Closer); ok { - defer closer.Close() + return nil, "" } - filename := resp.FileName - if filename == "" { - filename = fileKey - } - // If filename still has no extension, append the fallback (like Telegram's ext parameter). - if filepath.Ext(filename) == "" && fallbackExt != "" { - filename += fallbackExt - } + logger.DebugCF("feishu", "Image downloaded via Image.Get fallback", map[string]any{ + "image_key": imageKey, + }) + return resp.File, resp.FileName +} - // Write to the shared picoclaw_media directory using a unique name to avoid collisions. +// storeResourceFile writes downloaded resource data to disk and registers it in the MediaStore. +func (c *FeishuChannel) storeResourceFile( + ctx context.Context, + messageID, fileKey, filename string, + file io.Reader, + store media.MediaStore, + scope string, +) string { mediaDir := media.TempDir() if mkdirErr := os.MkdirAll(mediaDir, 0o700); mkdirErr != nil { logger.ErrorCF("feishu", "Failed to create media directory", map[string]any{ @@ -908,7 +984,7 @@ func (c *FeishuChannel) downloadResource( return "" } - if _, copyErr := io.Copy(out, resp.File); copyErr != nil { + if _, copyErr := io.Copy(out, file); copyErr != nil { out.Close() os.Remove(localPath) logger.ErrorCF("feishu", "Failed to write resource to file", map[string]any{ @@ -943,8 +1019,8 @@ func appendMediaTags(content, messageType string, mediaRefs []string) string { return content } - // Don't append tags to JSON content (interactive cards) - would produce invalid JSON - if messageType == larkim.MsgTypeInteractive { + // Don't append tags to JSON content - would produce invalid JSON + if messageType == larkim.MsgTypeInteractive || messageType == larkim.MsgTypePost { return content } diff --git a/pkg/channels/feishu/feishu_64_test.go b/pkg/channels/feishu/feishu_64_test.go index 48fdf0f74..d256325ad 100644 --- a/pkg/channels/feishu/feishu_64_test.go +++ b/pkg/channels/feishu/feishu_64_test.go @@ -180,6 +180,13 @@ func TestAppendMediaTags(t *testing.T) { mediaRefs: []string{"ref1"}, want: `{"schema":"2.0","body":{"elements":[{"tag":"img","img_key":"img_123"}]}}`, }, + { + name: "post message with images returns content unchanged", + content: `{"zh_cn":{"title":"","content":[[{"tag":"img","image_key":"img_001"}]]}}`, + messageType: "post", + mediaRefs: []string{"ref1"}, + want: `{"zh_cn":{"title":"","content":[[{"tag":"img","image_key":"img_001"}]]}}`, + }, } for _, tt := range tests { diff --git a/pkg/tools/fs/load_image.go b/pkg/tools/fs/load_image.go index 6f612faea..0a67fa120 100644 --- a/pkg/tools/fs/load_image.go +++ b/pkg/tools/fs/load_image.go @@ -147,10 +147,10 @@ func (t *LoadImageTool) Execute(ctx context.Context, args map[string]any) *ToolR return ErrorResult(fmt.Sprintf("failed to register image in media store: %v", err)) } - // Build the tool result text. The media:// ref will be picked up by - // resolveMediaRefs in loop_media.go and converted to a base64 data URL - // before the next LLM call, exactly like channel-received images. - msg := fmt.Sprintf("Image loaded: %s\n[image: %s]", filename, ref) + // Build the tool result text. The media:// ref in Media will be picked + // up by resolveMediaRefs in agent_media.go and base64-encoded for tool + // result messages (role="tool"), so the LLM can see the image content. + msg := fmt.Sprintf("Image loaded: %s\n[image: photo]", filename) return &ToolResult{ ForLLM: msg, diff --git a/pkg/tools/fs/load_image_test.go b/pkg/tools/fs/load_image_test.go index 72f163d81..d33db73be 100644 --- a/pkg/tools/fs/load_image_test.go +++ b/pkg/tools/fs/load_image_test.go @@ -135,9 +135,10 @@ func TestLoadImage_SuccessPath(t *testing.T) { t.Errorf("expected ForLLM to contain '[image:' marker, got: %s", result.ForLLM) } - // 4. ForLLM should also contain the media:// ref - if !strings.Contains(result.ForLLM, result.Media[0]) { - t.Errorf("expected ForLLM to contain media ref %q, got: %s", result.Media[0], result.ForLLM) + // 4. ForLLM should contain the generic [image: photo] placeholder + // (resolveMediaRefs will replace it with the actual path later) + if !strings.Contains(result.ForLLM, "[image: photo]") { + t.Errorf("expected ForLLM to contain '[image: photo]' placeholder, got: %s", result.ForLLM) } // 5. Verify the ref is resolvable in the store