diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index dfa339dee..f20a56b9c 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -881,7 +881,7 @@ func (al *AgentLoop) runAgentLoop( opts.ChatID, ) - // Resolve media:// refs to base64 data URLs (streaming) + // Resolve media:// refs: images→base64 data URLs, non-images→local paths in content cfg := al.GetConfig() maxMediaSize := cfg.Agents.Defaults.GetMaxMediaSize() messages = resolveMediaRefs(messages, al.mediaStore, maxMediaSize) diff --git a/pkg/agent/loop_media.go b/pkg/agent/loop_media.go index 82547a008..1380f0214 100644 --- a/pkg/agent/loop_media.go +++ b/pkg/agent/loop_media.go @@ -20,9 +20,10 @@ import ( "github.com/sipeed/picoclaw/pkg/providers" ) -// resolveMediaRefs replaces media:// refs in message Media fields with base64 data URLs. -// Uses streaming base64 encoding (file handle → encoder → buffer) to avoid holding -// both raw bytes and encoded string in memory simultaneously. +// resolveMediaRefs resolves media:// refs in messages. +// Images are base64-encoded into the Media array for multimodal LLMs. +// Non-image files (documents, audio, video) have their local path injected +// into Content so the agent can access them via file tools like read_file. // Returns a new slice; original messages are not mutated. 
func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxSize int) []providers.Message { if store == nil { @@ -38,6 +39,8 @@ func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxS } resolved := make([]string, 0, len(m.Media)) + var pathTags []string + for _, ref := range m.Media { if !strings.HasPrefix(ref, "media://") { resolved = append(resolved, ref) @@ -61,62 +64,117 @@ func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxS }) continue } - if info.Size() > int64(maxSize) { - logger.WarnCF("agent", "Media file too large, skipping", map[string]any{ - "path": localPath, - "size": info.Size(), - "max_size": maxSize, - }) - continue - } - // Determine MIME type: prefer metadata, fallback to magic-bytes detection - mime := meta.ContentType - if mime == "" { - kind, ftErr := filetype.MatchFile(localPath) - if ftErr != nil || kind == filetype.Unknown { - logger.WarnCF("agent", "Unknown media type, skipping", map[string]any{ - "path": localPath, - }) - continue + mime := detectMIME(localPath, meta) + + if strings.HasPrefix(mime, "image/") { + dataURL := encodeImageToDataURL(localPath, mime, info, maxSize) + if dataURL != "" { + resolved = append(resolved, dataURL) } - mime = kind.MIME.Value - } - - // Streaming base64: open file → base64 encoder → buffer - // Peak memory: ~1.33x file size (buffer only, no raw bytes copy) - f, err := os.Open(localPath) - if err != nil { - logger.WarnCF("agent", "Failed to open media file", map[string]any{ - "path": localPath, - "error": err.Error(), - }) continue } - prefix := "data:" + mime + ";base64," - encodedLen := base64.StdEncoding.EncodedLen(int(info.Size())) - var buf bytes.Buffer - buf.Grow(len(prefix) + encodedLen) - buf.WriteString(prefix) - - encoder := base64.NewEncoder(base64.StdEncoding, &buf) - if _, err := io.Copy(encoder, f); err != nil { - f.Close() - logger.WarnCF("agent", "Failed to encode media file", map[string]any{ - "path": localPath, - 
"error": err.Error(), - }) - continue - } - encoder.Close() - f.Close() - - resolved = append(resolved, buf.String()) + pathTags = append(pathTags, buildPathTag(mime, localPath)) } result[i].Media = resolved + if len(pathTags) > 0 { + result[i].Content = injectPathTags(result[i].Content, pathTags) + } } return result } + +// detectMIME determines the MIME type from metadata or magic-bytes detection. +// Returns empty string if detection fails. +func detectMIME(localPath string, meta media.MediaMeta) string { + if meta.ContentType != "" { + return meta.ContentType + } + kind, err := filetype.MatchFile(localPath) + if err != nil || kind == filetype.Unknown { + return "" + } + return kind.MIME.Value +} + +// encodeImageToDataURL base64-encodes an image file into a data URL. +// Returns empty string if the file exceeds maxSize or encoding fails. +func encodeImageToDataURL(localPath, mime string, info os.FileInfo, maxSize int) string { + if info.Size() > int64(maxSize) { + logger.WarnCF("agent", "Media file too large, skipping", map[string]any{ + "path": localPath, + "size": info.Size(), + "max_size": maxSize, + }) + return "" + } + + f, err := os.Open(localPath) + if err != nil { + logger.WarnCF("agent", "Failed to open media file", map[string]any{ + "path": localPath, + "error": err.Error(), + }) + return "" + } + defer f.Close() + + prefix := "data:" + mime + ";base64," + encodedLen := base64.StdEncoding.EncodedLen(int(info.Size())) + var buf bytes.Buffer + buf.Grow(len(prefix) + encodedLen) + buf.WriteString(prefix) + + encoder := base64.NewEncoder(base64.StdEncoding, &buf) + if _, err := io.Copy(encoder, f); err != nil { + logger.WarnCF("agent", "Failed to encode media file", map[string]any{ + "path": localPath, + "error": err.Error(), + }) + return "" + } + encoder.Close() + + return buf.String() +} + +// buildPathTag creates a structured tag exposing the local file path. +// Tag type is derived from MIME: [audio:/path], [video:/path], or [file:/path]. 
// buildPathTag creates a structured tag exposing the local file path.
// The tag type is derived from the MIME type: "[audio:/path]" for audio/*,
// "[video:/path]" for video/*, and "[file:/path]" for everything else
// (including an empty or unknown MIME type). The MIME prefix is matched
// case-insensitively because media type names are case-insensitive.
func buildPathTag(mime, localPath string) string {
	kind := strings.ToLower(strings.TrimSpace(mime))
	switch {
	case strings.HasPrefix(kind, "audio/"):
		return "[audio:" + localPath + "]"
	case strings.HasPrefix(kind, "video/"):
		return "[video:" + localPath + "]"
	default:
		return "[file:" + localPath + "]"
	}
}

// injectPathTags merges path-bearing tags (as produced by buildPathTag) into
// content.
//
// For each tag, the matching generic placeholder ("[audio]", "[video]" or
// "[file]") is replaced: the n-th tag of a kind takes the n-th placeholder of
// that kind. Tags left without a placeholder are appended to the end in their
// original order, separated by a single space (or become the whole content
// when it is empty).
//
// Placeholders are located in the original content only, in a single pass, so
// text inside an already injected path (e.g. a file named "a[file].ogg") is
// never mistaken for a placeholder.
func injectPathTags(content string, tags []string) string {
	if len(tags) == 0 {
		return content
	}

	kinds := [...]struct{ prefix, generic string }{
		{"[audio:", "[audio]"},
		{"[video:", "[video]"},
		{"[file:", "[file]"},
	}

	// queues[k] holds, in order, the indexes of tags of kind k that are still
	// waiting for a placeholder.
	var queues [len(kinds)][]int
	for i, tag := range tags {
		for k, kind := range kinds {
			if strings.HasPrefix(tag, kind.prefix) {
				queues[k] = append(queues[k], i)
				break
			}
		}
	}

	used := make([]bool, len(tags))
	var b strings.Builder
	b.Grow(len(content))

	rest := content
	for {
		open := strings.IndexByte(rest, '[')
		if open < 0 {
			b.WriteString(rest)
			break
		}
		b.WriteString(rest[:open])
		rest = rest[open:]

		// Default: the '[' is ordinary text and is copied through unchanged.
		replacement, consumed := rest[:1], 1
		for k, kind := range kinds {
			if len(queues[k]) == 0 || !strings.HasPrefix(rest, kind.generic) {
				continue
			}
			idx := queues[k][0]
			queues[k] = queues[k][1:]
			used[idx] = true
			replacement, consumed = tags[idx], len(kind.generic)
			break
		}
		b.WriteString(replacement)
		rest = rest[consumed:]
	}

	// Tags that found no placeholder go to the end, in their original order.
	for i, tag := range tags {
		if used[i] {
			continue
		}
		if b.Len() > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(tag)
	}
	return b.String()
}
result[0].Media[0][:30]) } } + +func TestResolveMediaRefs_PDFInjectsFilePath(t *testing.T) { + store := media.NewFileMediaStore() + dir := t.TempDir() + + pdfPath := filepath.Join(dir, "report.pdf") + // PDF magic bytes + os.WriteFile(pdfPath, []byte("%PDF-1.4 test content"), 0o644) + ref, _ := store.Store(pdfPath, media.MediaMeta{ContentType: "application/pdf"}, "test") + + messages := []providers.Message{ + {Role: "user", Content: "report.pdf [file]", Media: []string{ref}}, + } + result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) + + if len(result[0].Media) != 0 { + t.Fatalf("expected 0 media (non-image), got %d", len(result[0].Media)) + } + expected := "report.pdf [file:" + pdfPath + "]" + if result[0].Content != expected { + t.Fatalf("expected content %q, got %q", expected, result[0].Content) + } +} + +func TestResolveMediaRefs_AudioInjectsAudioPath(t *testing.T) { + store := media.NewFileMediaStore() + dir := t.TempDir() + + oggPath := filepath.Join(dir, "voice.ogg") + os.WriteFile(oggPath, []byte("fake audio"), 0o644) + ref, _ := store.Store(oggPath, media.MediaMeta{ContentType: "audio/ogg"}, "test") + + messages := []providers.Message{ + {Role: "user", Content: "voice.ogg [audio]", Media: []string{ref}}, + } + result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) + + if len(result[0].Media) != 0 { + t.Fatalf("expected 0 media, got %d", len(result[0].Media)) + } + expected := "voice.ogg [audio:" + oggPath + "]" + if result[0].Content != expected { + t.Fatalf("expected content %q, got %q", expected, result[0].Content) + } +} + +func TestResolveMediaRefs_VideoInjectsVideoPath(t *testing.T) { + store := media.NewFileMediaStore() + dir := t.TempDir() + + mp4Path := filepath.Join(dir, "clip.mp4") + os.WriteFile(mp4Path, []byte("fake video"), 0o644) + ref, _ := store.Store(mp4Path, media.MediaMeta{ContentType: "video/mp4"}, "test") + + messages := []providers.Message{ + {Role: "user", Content: "clip.mp4 [video]", Media: 
[]string{ref}}, + } + result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) + + if len(result[0].Media) != 0 { + t.Fatalf("expected 0 media, got %d", len(result[0].Media)) + } + expected := "clip.mp4 [video:" + mp4Path + "]" + if result[0].Content != expected { + t.Fatalf("expected content %q, got %q", expected, result[0].Content) + } +} + +func TestResolveMediaRefs_NoGenericTagAppendsPath(t *testing.T) { + store := media.NewFileMediaStore() + dir := t.TempDir() + + csvPath := filepath.Join(dir, "data.csv") + os.WriteFile(csvPath, []byte("a,b,c"), 0o644) + ref, _ := store.Store(csvPath, media.MediaMeta{ContentType: "text/csv"}, "test") + + messages := []providers.Message{ + {Role: "user", Content: "here is my data", Media: []string{ref}}, + } + result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) + + expected := "here is my data [file:" + csvPath + "]" + if result[0].Content != expected { + t.Fatalf("expected content %q, got %q", expected, result[0].Content) + } +} + +func TestResolveMediaRefs_EmptyContentGetsPathTag(t *testing.T) { + store := media.NewFileMediaStore() + dir := t.TempDir() + + docPath := filepath.Join(dir, "doc.docx") + os.WriteFile(docPath, []byte("fake docx"), 0o644) + docxMIME := "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ref, _ := store.Store(docPath, media.MediaMeta{ContentType: docxMIME}, "test") + + messages := []providers.Message{ + {Role: "user", Content: "", Media: []string{ref}}, + } + result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) + + expected := "[file:" + docPath + "]" + if result[0].Content != expected { + t.Fatalf("expected content %q, got %q", expected, result[0].Content) + } +} + +func TestResolveMediaRefs_MixedImageAndFile(t *testing.T) { + store := media.NewFileMediaStore() + dir := t.TempDir() + + pngPath := filepath.Join(dir, "photo.png") + pngHeader := []byte{ + 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, + 0x00, 0x00, 0x00, 
0x0D, 0x49, 0x48, 0x44, 0x52, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, + 0x00, 0x00, 0x00, 0x90, 0x77, 0x53, 0xDE, + } + os.WriteFile(pngPath, pngHeader, 0o644) + imgRef, _ := store.Store(pngPath, media.MediaMeta{}, "test") + + pdfPath := filepath.Join(dir, "report.pdf") + os.WriteFile(pdfPath, []byte("%PDF-1.4 test"), 0o644) + fileRef, _ := store.Store(pdfPath, media.MediaMeta{ContentType: "application/pdf"}, "test") + + messages := []providers.Message{ + {Role: "user", Content: "check these [file]", Media: []string{imgRef, fileRef}}, + } + result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize) + + if len(result[0].Media) != 1 { + t.Fatalf("expected 1 media (image only), got %d", len(result[0].Media)) + } + if !strings.HasPrefix(result[0].Media[0], "data:image/png;base64,") { + t.Fatal("expected image to be base64 encoded") + } + expectedContent := "check these [file:" + pdfPath + "]" + if result[0].Content != expectedContent { + t.Fatalf("expected content %q, got %q", expectedContent, result[0].Content) + } +}