feat: expose local file paths for non-image media to enable agent file tools (#1516)

* feat: expose local file paths for non-image media to enable agent file tools

* fix: Golang Lint error
This commit is contained in:
dataCenter430
2026-03-14 05:09:11 +01:00
committed by GitHub
parent 555af137b4
commit 0c5d7500e8
3 changed files with 256 additions and 53 deletions
+1 -1
View File
@@ -881,7 +881,7 @@ func (al *AgentLoop) runAgentLoop(
opts.ChatID,
)
// Resolve media:// refs to base64 data URLs (streaming)
// Resolve media:// refs: images→base64 data URLs, non-images→local paths in content
cfg := al.GetConfig()
maxMediaSize := cfg.Agents.Defaults.GetMaxMediaSize()
messages = resolveMediaRefs(messages, al.mediaStore, maxMediaSize)
+108 -50
View File
@@ -20,9 +20,10 @@ import (
"github.com/sipeed/picoclaw/pkg/providers"
)
// resolveMediaRefs replaces media:// refs in message Media fields with base64 data URLs.
// Uses streaming base64 encoding (file handle → encoder → buffer) to avoid holding
// both raw bytes and encoded string in memory simultaneously.
// resolveMediaRefs resolves media:// refs in messages.
// Images are base64-encoded into the Media array for multimodal LLMs.
// Non-image files (documents, audio, video) have their local path injected
// into Content so the agent can access them via file tools like read_file.
// Returns a new slice; original messages are not mutated.
func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxSize int) []providers.Message {
if store == nil {
@@ -38,6 +39,8 @@ func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxS
}
resolved := make([]string, 0, len(m.Media))
var pathTags []string
for _, ref := range m.Media {
if !strings.HasPrefix(ref, "media://") {
resolved = append(resolved, ref)
@@ -61,62 +64,117 @@ func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxS
})
continue
}
if info.Size() > int64(maxSize) {
logger.WarnCF("agent", "Media file too large, skipping", map[string]any{
"path": localPath,
"size": info.Size(),
"max_size": maxSize,
})
continue
}
// Determine MIME type: prefer metadata, fallback to magic-bytes detection
mime := meta.ContentType
if mime == "" {
kind, ftErr := filetype.MatchFile(localPath)
if ftErr != nil || kind == filetype.Unknown {
logger.WarnCF("agent", "Unknown media type, skipping", map[string]any{
"path": localPath,
})
continue
mime := detectMIME(localPath, meta)
if strings.HasPrefix(mime, "image/") {
dataURL := encodeImageToDataURL(localPath, mime, info, maxSize)
if dataURL != "" {
resolved = append(resolved, dataURL)
}
mime = kind.MIME.Value
}
// Streaming base64: open file → base64 encoder → buffer
// Peak memory: ~1.33x file size (buffer only, no raw bytes copy)
f, err := os.Open(localPath)
if err != nil {
logger.WarnCF("agent", "Failed to open media file", map[string]any{
"path": localPath,
"error": err.Error(),
})
continue
}
prefix := "data:" + mime + ";base64,"
encodedLen := base64.StdEncoding.EncodedLen(int(info.Size()))
var buf bytes.Buffer
buf.Grow(len(prefix) + encodedLen)
buf.WriteString(prefix)
encoder := base64.NewEncoder(base64.StdEncoding, &buf)
if _, err := io.Copy(encoder, f); err != nil {
f.Close()
logger.WarnCF("agent", "Failed to encode media file", map[string]any{
"path": localPath,
"error": err.Error(),
})
continue
}
encoder.Close()
f.Close()
resolved = append(resolved, buf.String())
pathTags = append(pathTags, buildPathTag(mime, localPath))
}
result[i].Media = resolved
if len(pathTags) > 0 {
result[i].Content = injectPathTags(result[i].Content, pathTags)
}
}
return result
}
// detectMIME reports the MIME type to use for the file at localPath.
// A non-empty meta.ContentType wins and is returned unchanged; otherwise the
// type is sniffed from the file via filetype.MatchFile.
// The result is "" when sniffing errors out or yields filetype.Unknown.
func detectMIME(localPath string, meta media.MediaMeta) string {
	if declared := meta.ContentType; declared != "" {
		return declared
	}

	sniffed, matchErr := filetype.MatchFile(localPath)
	if matchErr == nil && sniffed != filetype.Unknown {
		return sniffed.MIME.Value
	}
	return ""
}
// encodeImageToDataURL streams the file at localPath through a base64 encoder
// and returns it as a "data:<mime>;base64,<payload>" URL.
//
// info supplies the file size, which is checked against maxSize (bytes) and
// used to pre-size the output buffer. The function returns "" and logs a
// warning when the file is larger than maxSize, cannot be opened, or cannot
// be fully encoded.
func encodeImageToDataURL(localPath, mime string, info os.FileInfo, maxSize int) string {
	if info.Size() > int64(maxSize) {
		logger.WarnCF("agent", "Media file too large, skipping", map[string]any{
			"path":     localPath,
			"size":     info.Size(),
			"max_size": maxSize,
		})
		return ""
	}

	f, err := os.Open(localPath)
	if err != nil {
		logger.WarnCF("agent", "Failed to open media file", map[string]any{
			"path":  localPath,
			"error": err.Error(),
		})
		return ""
	}
	defer f.Close()

	prefix := "data:" + mime + ";base64,"
	encodedLen := base64.StdEncoding.EncodedLen(int(info.Size()))
	var buf bytes.Buffer
	buf.Grow(len(prefix) + encodedLen)
	buf.WriteString(prefix)

	encoder := base64.NewEncoder(base64.StdEncoding, &buf)
	_, err = io.Copy(encoder, f)
	if err == nil {
		// Close flushes the trailing partial block (and padding); without a
		// successful Close the payload may be truncated, so its error counts
		// as an encoding failure rather than being dropped.
		err = encoder.Close()
	}
	if err != nil {
		logger.WarnCF("agent", "Failed to encode media file", map[string]any{
			"path":  localPath,
			"error": err.Error(),
		})
		return ""
	}
	return buf.String()
}
// buildPathTag wraps localPath in a bracketed tag of the form "[kind:path]".
// The kind is "audio" for audio/* MIME types, "video" for video/* MIME types,
// and "file" for everything else (including an empty MIME string).
func buildPathTag(mime, localPath string) string {
	kind := "file"
	if strings.HasPrefix(mime, "audio/") {
		kind = "audio"
	} else if strings.HasPrefix(mime, "video/") {
		kind = "video"
	}
	return "[" + kind + ":" + localPath + "]"
}
// injectPathTags substitutes path-bearing tags ("[file:/p]", "[audio:/p]",
// "[video:/p]") for the matching generic placeholders ("[file]", "[audio]",
// "[video]") in content and returns the result.
//
// Tags are processed in order. Each tag claims the first not-yet-claimed
// placeholder of its kind; a tag with no placeholder left (or with an
// unrecognized prefix) is appended to the end, separated by a single space,
// or becomes the whole content when the result is otherwise empty.
//
// Placeholders are located in the ORIGINAL content only. Text introduced by an
// earlier tag is never rescanned, so a path that happens to contain a literal
// "[file]", "[audio]" or "[video]" cannot be corrupted by a later tag.
func injectPathTags(content string, tags []string) string {
	type splice struct {
		start, end int // byte range of the placeholder in the original content
		tag        string
	}

	var (
		splices  []splice // kept ordered by start
		trailing []string // tags without a placeholder, in input order
	)
	// Per-placeholder search offset, so each occurrence is claimed at most once.
	nextFrom := make(map[string]int, 3)

	for _, tag := range tags {
		var generic string
		switch {
		case strings.HasPrefix(tag, "[audio:"):
			generic = "[audio]"
		case strings.HasPrefix(tag, "[video:"):
			generic = "[video]"
		case strings.HasPrefix(tag, "[file:"):
			generic = "[file]"
		}

		at := -1
		if generic != "" {
			from := nextFrom[generic]
			if rel := strings.Index(content[from:], generic); rel >= 0 {
				at = from + rel
				nextFrom[generic] = at + len(generic)
			}
		}
		if at < 0 {
			trailing = append(trailing, tag)
			continue
		}

		// Insert in start order; tags of different kinds may claim
		// placeholders in any positional order.
		pos := len(splices)
		for pos > 0 && splices[pos-1].start > at {
			pos--
		}
		splices = append(splices, splice{})
		copy(splices[pos+1:], splices[pos:])
		splices[pos] = splice{start: at, end: at + len(generic), tag: tag}
	}

	var out strings.Builder
	prev := 0
	for _, s := range splices {
		out.WriteString(content[prev:s.start])
		out.WriteString(s.tag)
		prev = s.end
	}
	out.WriteString(content[prev:])

	for _, tag := range trailing {
		if out.Len() > 0 {
			out.WriteByte(' ')
		}
		out.WriteString(tag)
	}
	return out.String()
}
+147 -2
View File
@@ -1095,7 +1095,7 @@ func TestResolveMediaRefs_SkipsOversizedFile(t *testing.T) {
}
}
func TestResolveMediaRefs_SkipsUnknownType(t *testing.T) {
func TestResolveMediaRefs_UnknownTypeInjectsPath(t *testing.T) {
store := media.NewFileMediaStore()
dir := t.TempDir()
@@ -1111,7 +1111,11 @@ func TestResolveMediaRefs_SkipsUnknownType(t *testing.T) {
result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)
if len(result[0].Media) != 0 {
t.Fatalf("expected 0 media (unknown type), got %d", len(result[0].Media))
t.Fatalf("expected 0 media entries, got %d", len(result[0].Media))
}
expected := "hi [file:" + txtPath + "]"
if result[0].Content != expected {
t.Fatalf("expected content %q, got %q", expected, result[0].Content)
}
}
@@ -1173,3 +1177,144 @@ func TestResolveMediaRefs_UsesMetaContentType(t *testing.T) {
t.Fatalf("expected jpeg prefix, got %q", result[0].Media[0][:30])
}
}
// TestResolveMediaRefs_PDFInjectsFilePath checks that a PDF ref is removed
// from Media and that the "[file]" placeholder in Content is replaced by a
// "[file:<path>]" tag carrying the local path.
func TestResolveMediaRefs_PDFInjectsFilePath(t *testing.T) {
	store := media.NewFileMediaStore()
	dir := t.TempDir()

	pdfPath := filepath.Join(dir, "report.pdf")
	// PDF magic bytes
	if err := os.WriteFile(pdfPath, []byte("%PDF-1.4 test content"), 0o644); err != nil {
		t.Fatalf("failed to write fixture %q: %v", pdfPath, err)
	}
	ref, err := store.Store(pdfPath, media.MediaMeta{ContentType: "application/pdf"}, "test")
	if err != nil {
		t.Fatalf("failed to store fixture %q: %v", pdfPath, err)
	}

	messages := []providers.Message{
		{Role: "user", Content: "report.pdf [file]", Media: []string{ref}},
	}
	result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)

	if len(result[0].Media) != 0 {
		t.Fatalf("expected 0 media (non-image), got %d", len(result[0].Media))
	}
	expected := "report.pdf [file:" + pdfPath + "]"
	if result[0].Content != expected {
		t.Fatalf("expected content %q, got %q", expected, result[0].Content)
	}
}
// TestResolveMediaRefs_AudioInjectsAudioPath checks that a ref stored with an
// audio/* content type is removed from Media and that the "[audio]"
// placeholder in Content becomes "[audio:<path>]".
func TestResolveMediaRefs_AudioInjectsAudioPath(t *testing.T) {
	store := media.NewFileMediaStore()
	dir := t.TempDir()

	oggPath := filepath.Join(dir, "voice.ogg")
	if err := os.WriteFile(oggPath, []byte("fake audio"), 0o644); err != nil {
		t.Fatalf("failed to write fixture %q: %v", oggPath, err)
	}
	ref, err := store.Store(oggPath, media.MediaMeta{ContentType: "audio/ogg"}, "test")
	if err != nil {
		t.Fatalf("failed to store fixture %q: %v", oggPath, err)
	}

	messages := []providers.Message{
		{Role: "user", Content: "voice.ogg [audio]", Media: []string{ref}},
	}
	result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)

	if len(result[0].Media) != 0 {
		t.Fatalf("expected 0 media, got %d", len(result[0].Media))
	}
	expected := "voice.ogg [audio:" + oggPath + "]"
	if result[0].Content != expected {
		t.Fatalf("expected content %q, got %q", expected, result[0].Content)
	}
}
// TestResolveMediaRefs_VideoInjectsVideoPath checks that a ref stored with a
// video/* content type is removed from Media and that the "[video]"
// placeholder in Content becomes "[video:<path>]".
func TestResolveMediaRefs_VideoInjectsVideoPath(t *testing.T) {
	store := media.NewFileMediaStore()
	dir := t.TempDir()

	mp4Path := filepath.Join(dir, "clip.mp4")
	if err := os.WriteFile(mp4Path, []byte("fake video"), 0o644); err != nil {
		t.Fatalf("failed to write fixture %q: %v", mp4Path, err)
	}
	ref, err := store.Store(mp4Path, media.MediaMeta{ContentType: "video/mp4"}, "test")
	if err != nil {
		t.Fatalf("failed to store fixture %q: %v", mp4Path, err)
	}

	messages := []providers.Message{
		{Role: "user", Content: "clip.mp4 [video]", Media: []string{ref}},
	}
	result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)

	if len(result[0].Media) != 0 {
		t.Fatalf("expected 0 media, got %d", len(result[0].Media))
	}
	expected := "clip.mp4 [video:" + mp4Path + "]"
	if result[0].Content != expected {
		t.Fatalf("expected content %q, got %q", expected, result[0].Content)
	}
}
// TestResolveMediaRefs_NoGenericTagAppendsPath checks that when Content holds
// no generic placeholder, the "[file:<path>]" tag is appended after a space.
func TestResolveMediaRefs_NoGenericTagAppendsPath(t *testing.T) {
	store := media.NewFileMediaStore()
	dir := t.TempDir()

	csvPath := filepath.Join(dir, "data.csv")
	if err := os.WriteFile(csvPath, []byte("a,b,c"), 0o644); err != nil {
		t.Fatalf("failed to write fixture %q: %v", csvPath, err)
	}
	ref, err := store.Store(csvPath, media.MediaMeta{ContentType: "text/csv"}, "test")
	if err != nil {
		t.Fatalf("failed to store fixture %q: %v", csvPath, err)
	}

	messages := []providers.Message{
		{Role: "user", Content: "here is my data", Media: []string{ref}},
	}
	result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)

	expected := "here is my data [file:" + csvPath + "]"
	if result[0].Content != expected {
		t.Fatalf("expected content %q, got %q", expected, result[0].Content)
	}
}
// TestResolveMediaRefs_EmptyContentGetsPathTag checks that a message with
// empty Content ends up with exactly the "[file:<path>]" tag, with no leading
// separator.
func TestResolveMediaRefs_EmptyContentGetsPathTag(t *testing.T) {
	store := media.NewFileMediaStore()
	dir := t.TempDir()

	docPath := filepath.Join(dir, "doc.docx")
	if err := os.WriteFile(docPath, []byte("fake docx"), 0o644); err != nil {
		t.Fatalf("failed to write fixture %q: %v", docPath, err)
	}
	docxMIME := "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
	ref, err := store.Store(docPath, media.MediaMeta{ContentType: docxMIME}, "test")
	if err != nil {
		t.Fatalf("failed to store fixture %q: %v", docPath, err)
	}

	messages := []providers.Message{
		{Role: "user", Content: "", Media: []string{ref}},
	}
	result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)

	expected := "[file:" + docPath + "]"
	if result[0].Content != expected {
		t.Fatalf("expected content %q, got %q", expected, result[0].Content)
	}
}
// TestResolveMediaRefs_MixedImageAndFile checks a message carrying both an
// image and a PDF: the image must remain in Media as a base64 data URL, while
// the PDF is dropped from Media and surfaces as a "[file:<path>]" tag in
// Content.
func TestResolveMediaRefs_MixedImageAndFile(t *testing.T) {
	store := media.NewFileMediaStore()
	dir := t.TempDir()

	pngPath := filepath.Join(dir, "photo.png")
	// PNG signature followed by an IHDR chunk; stored without a ContentType so
	// the MIME type has to be detected from the file itself.
	pngHeader := []byte{
		0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A,
		0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52,
		0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02,
		0x00, 0x00, 0x00, 0x90, 0x77, 0x53, 0xDE,
	}
	if err := os.WriteFile(pngPath, pngHeader, 0o644); err != nil {
		t.Fatalf("failed to write fixture %q: %v", pngPath, err)
	}
	imgRef, err := store.Store(pngPath, media.MediaMeta{}, "test")
	if err != nil {
		t.Fatalf("failed to store fixture %q: %v", pngPath, err)
	}

	pdfPath := filepath.Join(dir, "report.pdf")
	if err := os.WriteFile(pdfPath, []byte("%PDF-1.4 test"), 0o644); err != nil {
		t.Fatalf("failed to write fixture %q: %v", pdfPath, err)
	}
	fileRef, err := store.Store(pdfPath, media.MediaMeta{ContentType: "application/pdf"}, "test")
	if err != nil {
		t.Fatalf("failed to store fixture %q: %v", pdfPath, err)
	}

	messages := []providers.Message{
		{Role: "user", Content: "check these [file]", Media: []string{imgRef, fileRef}},
	}
	result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)

	if len(result[0].Media) != 1 {
		t.Fatalf("expected 1 media (image only), got %d", len(result[0].Media))
	}
	if !strings.HasPrefix(result[0].Media[0], "data:image/png;base64,") {
		t.Fatal("expected image to be base64 encoded")
	}
	expectedContent := "check these [file:" + pdfPath + "]"
	if result[0].Content != expectedContent {
		t.Fatalf("expected content %q, got %q", expectedContent, result[0].Content)
	}
}