mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
feat: expose local file paths for non-image media to enable agent file tools (#1516)
* feat: expose local file paths for non-image media to enable agent file tools * fix: Golang Lint error
This commit is contained in:
+1
-1
@@ -881,7 +881,7 @@ func (al *AgentLoop) runAgentLoop(
|
||||
opts.ChatID,
|
||||
)
|
||||
|
||||
// Resolve media:// refs to base64 data URLs (streaming)
|
||||
// Resolve media:// refs: images→base64 data URLs, non-images→local paths in content
|
||||
cfg := al.GetConfig()
|
||||
maxMediaSize := cfg.Agents.Defaults.GetMaxMediaSize()
|
||||
messages = resolveMediaRefs(messages, al.mediaStore, maxMediaSize)
|
||||
|
||||
+108
-50
@@ -20,9 +20,10 @@ import (
|
||||
"github.com/sipeed/picoclaw/pkg/providers"
|
||||
)
|
||||
|
||||
// resolveMediaRefs replaces media:// refs in message Media fields with base64 data URLs.
|
||||
// Uses streaming base64 encoding (file handle → encoder → buffer) to avoid holding
|
||||
// both raw bytes and encoded string in memory simultaneously.
|
||||
// resolveMediaRefs resolves media:// refs in messages.
|
||||
// Images are base64-encoded into the Media array for multimodal LLMs.
|
||||
// Non-image files (documents, audio, video) have their local path injected
|
||||
// into Content so the agent can access them via file tools like read_file.
|
||||
// Returns a new slice; original messages are not mutated.
|
||||
func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxSize int) []providers.Message {
|
||||
if store == nil {
|
||||
@@ -38,6 +39,8 @@ func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxS
|
||||
}
|
||||
|
||||
resolved := make([]string, 0, len(m.Media))
|
||||
var pathTags []string
|
||||
|
||||
for _, ref := range m.Media {
|
||||
if !strings.HasPrefix(ref, "media://") {
|
||||
resolved = append(resolved, ref)
|
||||
@@ -61,62 +64,117 @@ func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxS
|
||||
})
|
||||
continue
|
||||
}
|
||||
if info.Size() > int64(maxSize) {
|
||||
logger.WarnCF("agent", "Media file too large, skipping", map[string]any{
|
||||
"path": localPath,
|
||||
"size": info.Size(),
|
||||
"max_size": maxSize,
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
// Determine MIME type: prefer metadata, fallback to magic-bytes detection
|
||||
mime := meta.ContentType
|
||||
if mime == "" {
|
||||
kind, ftErr := filetype.MatchFile(localPath)
|
||||
if ftErr != nil || kind == filetype.Unknown {
|
||||
logger.WarnCF("agent", "Unknown media type, skipping", map[string]any{
|
||||
"path": localPath,
|
||||
})
|
||||
continue
|
||||
mime := detectMIME(localPath, meta)
|
||||
|
||||
if strings.HasPrefix(mime, "image/") {
|
||||
dataURL := encodeImageToDataURL(localPath, mime, info, maxSize)
|
||||
if dataURL != "" {
|
||||
resolved = append(resolved, dataURL)
|
||||
}
|
||||
mime = kind.MIME.Value
|
||||
}
|
||||
|
||||
// Streaming base64: open file → base64 encoder → buffer
|
||||
// Peak memory: ~1.33x file size (buffer only, no raw bytes copy)
|
||||
f, err := os.Open(localPath)
|
||||
if err != nil {
|
||||
logger.WarnCF("agent", "Failed to open media file", map[string]any{
|
||||
"path": localPath,
|
||||
"error": err.Error(),
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
prefix := "data:" + mime + ";base64,"
|
||||
encodedLen := base64.StdEncoding.EncodedLen(int(info.Size()))
|
||||
var buf bytes.Buffer
|
||||
buf.Grow(len(prefix) + encodedLen)
|
||||
buf.WriteString(prefix)
|
||||
|
||||
encoder := base64.NewEncoder(base64.StdEncoding, &buf)
|
||||
if _, err := io.Copy(encoder, f); err != nil {
|
||||
f.Close()
|
||||
logger.WarnCF("agent", "Failed to encode media file", map[string]any{
|
||||
"path": localPath,
|
||||
"error": err.Error(),
|
||||
})
|
||||
continue
|
||||
}
|
||||
encoder.Close()
|
||||
f.Close()
|
||||
|
||||
resolved = append(resolved, buf.String())
|
||||
pathTags = append(pathTags, buildPathTag(mime, localPath))
|
||||
}
|
||||
|
||||
result[i].Media = resolved
|
||||
if len(pathTags) > 0 {
|
||||
result[i].Content = injectPathTags(result[i].Content, pathTags)
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// detectMIME determines the MIME type from metadata or magic-bytes detection.
|
||||
// Returns empty string if detection fails.
|
||||
func detectMIME(localPath string, meta media.MediaMeta) string {
|
||||
if meta.ContentType != "" {
|
||||
return meta.ContentType
|
||||
}
|
||||
kind, err := filetype.MatchFile(localPath)
|
||||
if err != nil || kind == filetype.Unknown {
|
||||
return ""
|
||||
}
|
||||
return kind.MIME.Value
|
||||
}
|
||||
|
||||
// encodeImageToDataURL base64-encodes an image file into a data URL.
|
||||
// Returns empty string if the file exceeds maxSize or encoding fails.
|
||||
func encodeImageToDataURL(localPath, mime string, info os.FileInfo, maxSize int) string {
|
||||
if info.Size() > int64(maxSize) {
|
||||
logger.WarnCF("agent", "Media file too large, skipping", map[string]any{
|
||||
"path": localPath,
|
||||
"size": info.Size(),
|
||||
"max_size": maxSize,
|
||||
})
|
||||
return ""
|
||||
}
|
||||
|
||||
f, err := os.Open(localPath)
|
||||
if err != nil {
|
||||
logger.WarnCF("agent", "Failed to open media file", map[string]any{
|
||||
"path": localPath,
|
||||
"error": err.Error(),
|
||||
})
|
||||
return ""
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
prefix := "data:" + mime + ";base64,"
|
||||
encodedLen := base64.StdEncoding.EncodedLen(int(info.Size()))
|
||||
var buf bytes.Buffer
|
||||
buf.Grow(len(prefix) + encodedLen)
|
||||
buf.WriteString(prefix)
|
||||
|
||||
encoder := base64.NewEncoder(base64.StdEncoding, &buf)
|
||||
if _, err := io.Copy(encoder, f); err != nil {
|
||||
logger.WarnCF("agent", "Failed to encode media file", map[string]any{
|
||||
"path": localPath,
|
||||
"error": err.Error(),
|
||||
})
|
||||
return ""
|
||||
}
|
||||
encoder.Close()
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// buildPathTag wraps localPath in a bracketed tag whose label comes from the
// top-level MIME type: "[audio:<path>]" for audio/*, "[video:<path>]" for
// video/*, and "[file:<path>]" for everything else (including an empty or
// slash-less mime).
func buildPathTag(mime, localPath string) string {
	label := "file"
	if major, _, hasSlash := strings.Cut(mime, "/"); hasSlash && (major == "audio" || major == "video") {
		label = major
	}
	return "[" + label + ":" + localPath + "]"
}
|
||||
|
||||
// injectPathTags merges path-bearing tags into content.
//
// For each tag, in order: if the tag starts with "[audio:", "[video:" or
// "[file:" and content still holds the matching generic placeholder
// ("[audio]", "[video]", "[file]"), the first such placeholder is replaced by
// the tag. Otherwise the tag becomes the whole content when content is empty,
// or is appended after a single space.
func injectPathTags(content string, tags []string) string {
	for _, tag := range tags {
		placeholder := ""
		if kind, _, hasColon := strings.Cut(tag, ":"); hasColon {
			switch kind {
			case "[audio", "[video", "[file":
				placeholder = kind + "]"
			}
		}

		if placeholder != "" {
			if head, tail, found := strings.Cut(content, placeholder); found {
				content = head + tag + tail
				continue
			}
		}

		if content == "" {
			content = tag
			continue
		}
		content += " " + tag
	}
	return content
}
|
||||
|
||||
+147
-2
@@ -1095,7 +1095,7 @@ func TestResolveMediaRefs_SkipsOversizedFile(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveMediaRefs_SkipsUnknownType(t *testing.T) {
|
||||
func TestResolveMediaRefs_UnknownTypeInjectsPath(t *testing.T) {
|
||||
store := media.NewFileMediaStore()
|
||||
dir := t.TempDir()
|
||||
|
||||
@@ -1111,7 +1111,11 @@ func TestResolveMediaRefs_SkipsUnknownType(t *testing.T) {
|
||||
result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)
|
||||
|
||||
if len(result[0].Media) != 0 {
|
||||
t.Fatalf("expected 0 media (unknown type), got %d", len(result[0].Media))
|
||||
t.Fatalf("expected 0 media entries, got %d", len(result[0].Media))
|
||||
}
|
||||
expected := "hi [file:" + txtPath + "]"
|
||||
if result[0].Content != expected {
|
||||
t.Fatalf("expected content %q, got %q", expected, result[0].Content)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1173,3 +1177,144 @@ func TestResolveMediaRefs_UsesMetaContentType(t *testing.T) {
|
||||
t.Fatalf("expected jpeg prefix, got %q", result[0].Media[0][:30])
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveMediaRefs_PDFInjectsFilePath(t *testing.T) {
|
||||
store := media.NewFileMediaStore()
|
||||
dir := t.TempDir()
|
||||
|
||||
pdfPath := filepath.Join(dir, "report.pdf")
|
||||
// PDF magic bytes
|
||||
os.WriteFile(pdfPath, []byte("%PDF-1.4 test content"), 0o644)
|
||||
ref, _ := store.Store(pdfPath, media.MediaMeta{ContentType: "application/pdf"}, "test")
|
||||
|
||||
messages := []providers.Message{
|
||||
{Role: "user", Content: "report.pdf [file]", Media: []string{ref}},
|
||||
}
|
||||
result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)
|
||||
|
||||
if len(result[0].Media) != 0 {
|
||||
t.Fatalf("expected 0 media (non-image), got %d", len(result[0].Media))
|
||||
}
|
||||
expected := "report.pdf [file:" + pdfPath + "]"
|
||||
if result[0].Content != expected {
|
||||
t.Fatalf("expected content %q, got %q", expected, result[0].Content)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveMediaRefs_AudioInjectsAudioPath(t *testing.T) {
|
||||
store := media.NewFileMediaStore()
|
||||
dir := t.TempDir()
|
||||
|
||||
oggPath := filepath.Join(dir, "voice.ogg")
|
||||
os.WriteFile(oggPath, []byte("fake audio"), 0o644)
|
||||
ref, _ := store.Store(oggPath, media.MediaMeta{ContentType: "audio/ogg"}, "test")
|
||||
|
||||
messages := []providers.Message{
|
||||
{Role: "user", Content: "voice.ogg [audio]", Media: []string{ref}},
|
||||
}
|
||||
result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)
|
||||
|
||||
if len(result[0].Media) != 0 {
|
||||
t.Fatalf("expected 0 media, got %d", len(result[0].Media))
|
||||
}
|
||||
expected := "voice.ogg [audio:" + oggPath + "]"
|
||||
if result[0].Content != expected {
|
||||
t.Fatalf("expected content %q, got %q", expected, result[0].Content)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveMediaRefs_VideoInjectsVideoPath(t *testing.T) {
|
||||
store := media.NewFileMediaStore()
|
||||
dir := t.TempDir()
|
||||
|
||||
mp4Path := filepath.Join(dir, "clip.mp4")
|
||||
os.WriteFile(mp4Path, []byte("fake video"), 0o644)
|
||||
ref, _ := store.Store(mp4Path, media.MediaMeta{ContentType: "video/mp4"}, "test")
|
||||
|
||||
messages := []providers.Message{
|
||||
{Role: "user", Content: "clip.mp4 [video]", Media: []string{ref}},
|
||||
}
|
||||
result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)
|
||||
|
||||
if len(result[0].Media) != 0 {
|
||||
t.Fatalf("expected 0 media, got %d", len(result[0].Media))
|
||||
}
|
||||
expected := "clip.mp4 [video:" + mp4Path + "]"
|
||||
if result[0].Content != expected {
|
||||
t.Fatalf("expected content %q, got %q", expected, result[0].Content)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveMediaRefs_NoGenericTagAppendsPath(t *testing.T) {
|
||||
store := media.NewFileMediaStore()
|
||||
dir := t.TempDir()
|
||||
|
||||
csvPath := filepath.Join(dir, "data.csv")
|
||||
os.WriteFile(csvPath, []byte("a,b,c"), 0o644)
|
||||
ref, _ := store.Store(csvPath, media.MediaMeta{ContentType: "text/csv"}, "test")
|
||||
|
||||
messages := []providers.Message{
|
||||
{Role: "user", Content: "here is my data", Media: []string{ref}},
|
||||
}
|
||||
result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)
|
||||
|
||||
expected := "here is my data [file:" + csvPath + "]"
|
||||
if result[0].Content != expected {
|
||||
t.Fatalf("expected content %q, got %q", expected, result[0].Content)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveMediaRefs_EmptyContentGetsPathTag(t *testing.T) {
|
||||
store := media.NewFileMediaStore()
|
||||
dir := t.TempDir()
|
||||
|
||||
docPath := filepath.Join(dir, "doc.docx")
|
||||
os.WriteFile(docPath, []byte("fake docx"), 0o644)
|
||||
docxMIME := "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
ref, _ := store.Store(docPath, media.MediaMeta{ContentType: docxMIME}, "test")
|
||||
|
||||
messages := []providers.Message{
|
||||
{Role: "user", Content: "", Media: []string{ref}},
|
||||
}
|
||||
result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)
|
||||
|
||||
expected := "[file:" + docPath + "]"
|
||||
if result[0].Content != expected {
|
||||
t.Fatalf("expected content %q, got %q", expected, result[0].Content)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveMediaRefs_MixedImageAndFile(t *testing.T) {
|
||||
store := media.NewFileMediaStore()
|
||||
dir := t.TempDir()
|
||||
|
||||
pngPath := filepath.Join(dir, "photo.png")
|
||||
pngHeader := []byte{
|
||||
0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A,
|
||||
0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52,
|
||||
0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02,
|
||||
0x00, 0x00, 0x00, 0x90, 0x77, 0x53, 0xDE,
|
||||
}
|
||||
os.WriteFile(pngPath, pngHeader, 0o644)
|
||||
imgRef, _ := store.Store(pngPath, media.MediaMeta{}, "test")
|
||||
|
||||
pdfPath := filepath.Join(dir, "report.pdf")
|
||||
os.WriteFile(pdfPath, []byte("%PDF-1.4 test"), 0o644)
|
||||
fileRef, _ := store.Store(pdfPath, media.MediaMeta{ContentType: "application/pdf"}, "test")
|
||||
|
||||
messages := []providers.Message{
|
||||
{Role: "user", Content: "check these [file]", Media: []string{imgRef, fileRef}},
|
||||
}
|
||||
result := resolveMediaRefs(messages, store, config.DefaultMaxMediaSize)
|
||||
|
||||
if len(result[0].Media) != 1 {
|
||||
t.Fatalf("expected 1 media (image only), got %d", len(result[0].Media))
|
||||
}
|
||||
if !strings.HasPrefix(result[0].Media[0], "data:image/png;base64,") {
|
||||
t.Fatal("expected image to be base64 encoded")
|
||||
}
|
||||
expectedContent := "check these [file:" + pdfPath + "]"
|
||||
if result[0].Content != expectedContent {
|
||||
t.Fatalf("expected content %q, got %q", expectedContent, result[0].Content)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user