From 3d54a77c407404e2043fa3af568fef5321a21ad8 Mon Sep 17 00:00:00 2001 From: Zachary Guerrero Date: Fri, 20 Feb 2026 14:32:58 -0800 Subject: [PATCH 1/5] feat: add Media field to Message struct and implement serializeMessages for vision API support - Add Media []string field to Message struct for image/media URLs - Implement serializeMessages() to format messages with image_url content parts - Enables OpenAI-compatible vision APIs to receive image attachments --- pkg/providers/openai_compat/provider.go | 43 ++++++++++++++++++++++++- pkg/providers/protocoltypes/types.go | 1 + 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/pkg/providers/openai_compat/provider.go b/pkg/providers/openai_compat/provider.go index 74e612046..726a34dee 100644 --- a/pkg/providers/openai_compat/provider.go +++ b/pkg/providers/openai_compat/provider.go @@ -116,7 +116,7 @@ func (p *Provider) Chat( requestBody := map[string]any{ "model": model, - "messages": stripSystemParts(messages), + "messages": serializeMessages(messages), } if len(tools) > 0 { @@ -195,6 +195,47 @@ func (p *Provider) Chat( return parseResponse(body) } +func serializeMessages(messages []Message) []map[string]interface{} { + result := make([]map[string]interface{}, 0, len(messages)) + for _, m := range messages { + if len(m.Media) == 0 { + msg := map[string]interface{}{ + "role": m.Role, + "content": m.Content, + } + if m.ToolCallID != "" { + msg["tool_call_id"] = m.ToolCallID + } + if len(m.ToolCalls) > 0 { + msg["tool_calls"] = m.ToolCalls + } + result = append(result, msg) + continue + } + + parts := make([]map[string]interface{}, 0, 1+len(m.Media)) + if m.Content != "" { + parts = append(parts, map[string]interface{}{ + "type": "text", + "text": m.Content, + }) + } + for _, mediaURL := range m.Media { + parts = append(parts, map[string]interface{}{ + "type": "image_url", + "image_url": map[string]interface{}{ + "url": mediaURL, + }, + }) + } + result = append(result, map[string]interface{}{ + "role": m.Role, + "content": parts, + }) + } + return result +} + func parseResponse(body []byte) (*LLMResponse, error) { var apiResponse struct { Choices []struct { diff --git a/pkg/providers/protocoltypes/types.go b/pkg/providers/protocoltypes/types.go index 99f13334e..efac1e10b 100644 --- a/pkg/providers/protocoltypes/types.go +++ b/pkg/providers/protocoltypes/types.go @@ -65,6 +65,7 @@ type ContentBlock struct { type Message struct { Role string `json:"role"` Content string `json:"content"` + Media []string `json:"media,omitempty"` // URLs of images or other media attachments ReasoningContent string `json:"reasoning_content,omitempty"` SystemParts []ContentBlock `json:"system_parts,omitempty"` // structured system blocks for cache-aware adapters ToolCalls []ToolCall `json:"tool_calls,omitempty"` From 6997edc82e1cd555187f982701d03ebc51e26bba Mon Sep 17 00:00:00 2001 From: shikihane Date: Sun, 1 Mar 2026 19:19:31 +0800 Subject: [PATCH 2/5] feat(agent): wire Media through agent pipeline (cherry-pick PR #555) Add Media field to processOptions, pass msg.Media from inbound messages through to BuildMessages and serializeMessages so vision-capable LLMs receive image_url content parts. Based on work by @as3k in sipeed/picoclaw#555. Co-Authored-By: Claude Opus 4.6 --- pkg/agent/context.go | 3 ++- pkg/agent/loop.go | 14 ++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pkg/agent/context.go b/pkg/agent/context.go index 6fccbaf53..8868d6bf4 100644 --- a/pkg/agent/context.go +++ b/pkg/agent/context.go @@ -465,10 +465,11 @@ func (cb *ContextBuilder) BuildMessages( messages = append(messages, history...) // Add current user message - if strings.TrimSpace(currentMessage) != "" { + if strings.TrimSpace(currentMessage) != "" || len(media) > 0 { messages = append(messages, providers.Message{ Role: "user", Content: currentMessage, + Media: media, }) } diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index 00b0f096a..52a72d0f1 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -46,11 +46,12 @@ type AgentLoop struct { // processOptions configures how a message is processed type processOptions struct { - SessionKey string // Session identifier for history/context - Channel string // Target channel for tool execution - ChatID string // Target chat ID for tool execution - UserMessage string // User message content (may include prefix) - DefaultResponse string // Response when LLM returns empty + SessionKey string // Session identifier for history/context + Channel string // Target channel for tool execution + ChatID string // Target chat ID for tool execution + UserMessage string // User message content (may include prefix) + Media []string // Media URLs attached to the user message + DefaultResponse string // Response when LLM returns empty EnableSummary bool // Whether to trigger summarization SendResponse bool // Whether to send response via bus NoHistory bool // If true, don't load session history (for heartbeat) @@ -417,6 +418,7 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage) Channel: msg.Channel, ChatID: msg.ChatID, UserMessage: msg.Content, + Media: msg.Media, DefaultResponse: defaultResponse, EnableSummary: true, SendResponse: false, @@ -509,7 +511,7 @@ func (al *AgentLoop) runAgentLoop(ctx context.Context, agent *AgentInstance, opt history, summary, opts.UserMessage, - nil, + opts.Media, opts.Channel, opts.ChatID, ) From a4e5c391bd67357a52936e1e3081b97a01edffb8 Mon Sep 17 00:00:00 2001 From: shikihane Date: Mon, 2 Mar 2026 17:38:08 +0800 Subject: [PATCH 3/5] fix(openai_compat): preserve reasoning_content in serializeMessages The serializeMessages() function was not preserving the reasoning_content field when serializing messages for vision API calls. This caused the TestProviderChat_PreservesReasoningContentInHistory test to fail. This fix ensures reasoning_content is included in both text-only messages and vision messages with media attachments. Co-authored-by: Zachary Guerrero --- pkg/providers/openai_compat/provider.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pkg/providers/openai_compat/provider.go b/pkg/providers/openai_compat/provider.go index 726a34dee..f6c5a3664 100644 --- a/pkg/providers/openai_compat/provider.go +++ b/pkg/providers/openai_compat/provider.go @@ -209,6 +209,9 @@ func serializeMessages(messages []Message) []map[string]interface{} { if len(m.ToolCalls) > 0 { msg["tool_calls"] = m.ToolCalls } + if m.ReasoningContent != "" { + msg["reasoning_content"] = m.ReasoningContent + } result = append(result, msg) continue } @@ -228,10 +231,14 @@ func serializeMessages(messages []Message) []map[string]interface{} { }, }) } - result = append(result, map[string]interface{}{ + msg := map[string]interface{}{ "role": m.Role, "content": parts, - }) + } + if m.ReasoningContent != "" { + msg["reasoning_content"] = m.ReasoningContent + } + result = append(result, msg) } return result } From 18b36af9342b648b0dd1e2ba6d9f56866af76744 Mon Sep 17 00:00:00 2001 From: shikihane Date: Mon, 2 Mar 2026 18:08:32 +0800 Subject: [PATCH 4/5] feat(agent): add resolveMediaRefs to convert media:// refs to base64 data URLs Without this function, media:// refs stored by MediaStore are passed directly to the LLM API, which rejects them as invalid URLs. resolveMediaRefs() runs after BuildMessages() and before the LLM call, converting each media:// ref to a data:image/...;base64,... URL that vision-capable models can process. Also adds mimeFromExtension() helper for MIME type inference from file extensions when ContentType metadata is not available. --- pkg/agent/loop.go | 76 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index 52a72d0f1..36a3ad508 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -8,9 +8,11 @@ package agent import ( "context" + "encoding/base64" "encoding/json" "errors" "fmt" + "os" "path/filepath" "strings" "sync" @@ -515,6 +517,7 @@ func (al *AgentLoop) runAgentLoop(ctx context.Context, agent *AgentInstance, opt opts.Channel, opts.ChatID, ) + messages = resolveMediaRefs(messages, al.mediaStore) // 3. Save user message to session agent.Sessions.AddMessage(opts.SessionKey, "user", opts.UserMessage) @@ -1352,3 +1355,76 @@ func extractParentPeer(msg bus.InboundMessage) *routing.RoutePeer { } return &routing.RoutePeer{Kind: parentKind, ID: parentID} } + +// resolveMediaRefs replaces media:// refs in message Media fields with base64 data URLs. +// Returns a new slice with resolved URLs; original messages are not mutated. +func resolveMediaRefs(messages []providers.Message, store media.MediaStore) []providers.Message { + if store == nil { + return messages + } + + result := make([]providers.Message, len(messages)) + copy(result, messages) + + for i, m := range result { + if len(m.Media) == 0 { + continue + } + + resolved := make([]string, 0, len(m.Media)) + for _, ref := range m.Media { + if !strings.HasPrefix(ref, "media://") { + resolved = append(resolved, ref) + continue + } + + localPath, meta, err := store.ResolveWithMeta(ref) + if err != nil { + logger.WarnCF("agent", "Failed to resolve media ref", map[string]any{ + "ref": ref, + "error": err.Error(), + }) + continue + } + + data, err := os.ReadFile(localPath) + if err != nil { + logger.WarnCF("agent", "Failed to read media file", map[string]any{ + "path": localPath, + "error": err.Error(), + }) + continue + } + + mime := meta.ContentType + if mime == "" { + mime = mimeFromExtension(filepath.Ext(localPath)) + } + + dataURL := "data:" + mime + ";base64," + base64.StdEncoding.EncodeToString(data) + resolved = append(resolved, dataURL) + } + + result[i].Media = resolved + } + + return result +} + +// mimeFromExtension returns a MIME type for common image extensions. +func mimeFromExtension(ext string) string { + switch strings.ToLower(ext) { + case ".jpg", ".jpeg": + return "image/jpeg" + case ".png": + return "image/png" + case ".gif": + return "image/gif" + case ".webp": + return "image/webp" + case ".bmp": + return "image/bmp" + default: + return "image/jpeg" + } +} From 8ebeefc59f6f43954b9f4fd57b7359176aa9f715 Mon Sep 17 00:00:00 2001 From: shikihane Date: Tue, 3 Mar 2026 11:13:22 +0800 Subject: [PATCH 5/5] fix(agent,openai_compat): address review feedback on vision pipeline - serializeMessages: preserve ToolCallID/ToolCalls when Media is present - resolveMediaRefs: add 20MB file size limit to prevent OOM - mimeFromExtension: return empty string for unknown extensions - Add 11 unit tests for serializeMessages, resolveMediaRefs, mimeFromExtension Co-Authored-By: Claude Opus 4.6 --- pkg/agent/loop.go | 31 ++++- pkg/agent/loop_test.go | 123 +++++++++++++++++++ pkg/providers/openai_compat/provider.go | 6 + pkg/providers/openai_compat/provider_test.go | 70 +++++++++++ 4 files changed, 229 insertions(+), 1 deletion(-) diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index 36a3ad508..11d4fa7db 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -1356,6 +1356,10 @@ func extractParentPeer(msg bus.InboundMessage) *routing.RoutePeer { return &routing.RoutePeer{Kind: parentKind, ID: parentID} } +// maxMediaFileSize is the maximum file size (20 MB) for media resolution. +// Files larger than this are skipped to prevent OOM under concurrent load. +const maxMediaFileSize = 20 * 1024 * 1024 + // resolveMediaRefs replaces media:// refs in message Media fields with base64 data URLs. // Returns a new slice with resolved URLs; original messages are not mutated. func resolveMediaRefs(messages []providers.Message, store media.MediaStore) []providers.Message { @@ -1387,6 +1391,23 @@ func resolveMediaRefs(messages []providers.Message, store media.MediaStore) []pr continue } + info, err := os.Stat(localPath) + if err != nil { + logger.WarnCF("agent", "Failed to stat media file", map[string]any{ + "path": localPath, + "error": err.Error(), + }) + continue + } + if info.Size() > maxMediaFileSize { + logger.WarnCF("agent", "Media file too large, skipping", map[string]any{ + "path": localPath, + "size": info.Size(), + "max_size": maxMediaFileSize, + }) + continue + } + data, err := os.ReadFile(localPath) if err != nil { logger.WarnCF("agent", "Failed to read media file", map[string]any{ @@ -1400,6 +1421,13 @@ func resolveMediaRefs(messages []providers.Message, store media.MediaStore) []pr if mime == "" { mime = mimeFromExtension(filepath.Ext(localPath)) } + if mime == "" { + logger.WarnCF("agent", "Unknown media type, skipping", map[string]any{ + "path": localPath, + "ext": filepath.Ext(localPath), + }) + continue + } dataURL := "data:" + mime + ";base64," + base64.StdEncoding.EncodeToString(data) resolved = append(resolved, dataURL) @@ -1412,6 +1440,7 @@ func resolveMediaRefs(messages []providers.Message, store media.MediaStore) []pr } // mimeFromExtension returns a MIME type for common image extensions. +// Returns empty string for unrecognized extensions. func mimeFromExtension(ext string) string { switch strings.ToLower(ext) { case ".jpg", ".jpeg": @@ -1425,6 +1454,6 @@ func mimeFromExtension(ext string) string { case ".bmp": return "image/bmp" default: - return "image/jpeg" + return "" } } diff --git a/pkg/agent/loop_test.go b/pkg/agent/loop_test.go index 1034b06e8..c4f139630 100644 --- a/pkg/agent/loop_test.go +++ b/pkg/agent/loop_test.go @@ -6,12 +6,14 @@ import ( "os" "path/filepath" "slices" + "strings" "testing" "time" "github.com/sipeed/picoclaw/pkg/bus" "github.com/sipeed/picoclaw/pkg/channels" "github.com/sipeed/picoclaw/pkg/config" + "github.com/sipeed/picoclaw/pkg/media" "github.com/sipeed/picoclaw/pkg/providers" "github.com/sipeed/picoclaw/pkg/tools" ) @@ -840,3 +842,124 @@ func TestHandleReasoning(t *testing.T) { } }) } + +func TestMimeFromExtension(t *testing.T) { + tests := []struct { + ext string + want string + }{ + {".jpg", "image/jpeg"}, + {".JPEG", "image/jpeg"}, + {".png", "image/png"}, + {".gif", "image/gif"}, + {".webp", "image/webp"}, + {".bmp", "image/bmp"}, + {".txt", ""}, + {".pdf", ""}, + {"", ""}, + } + for _, tt := range tests { + if got := mimeFromExtension(tt.ext); got != tt.want { + t.Errorf("mimeFromExtension(%q) = %q, want %q", tt.ext, got, tt.want) + } + } +} + +func TestResolveMediaRefs_NilStore(t *testing.T) { + msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{"media://abc"}}} + result := resolveMediaRefs(msgs, nil) + if result[0].Media[0] != "media://abc" { + t.Error("nil store should return messages unchanged") + } +} + +func TestResolveMediaRefs_NonMediaRef(t *testing.T) { + msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{"https://example.com/img.png"}}} + result := resolveMediaRefs(msgs, media.NewFileMediaStore()) + if result[0].Media[0] != "https://example.com/img.png" { + t.Error("non-media:// refs should be passed through unchanged") + } +} + +func TestResolveMediaRefs_ResolvesToBase64(t *testing.T) { + store := media.NewFileMediaStore() + + imgPath := filepath.Join(t.TempDir(), "test.png") + if err := os.WriteFile(imgPath, []byte("fake-png-data"), 0o644); err != nil { + t.Fatal(err) + } + + ref, err := store.Store(imgPath, media.MediaMeta{ContentType: "image/png"}, "test") + if err != nil { + t.Fatal(err) + } + + msgs := []providers.Message{{Role: "user", Content: "describe", Media: []string{ref}}} + result := resolveMediaRefs(msgs, store) + + if len(result[0].Media) != 1 { + t.Fatalf("expected 1 resolved media, got %d", len(result[0].Media)) + } + if !strings.HasPrefix(result[0].Media[0], "data:image/png;base64,") { + t.Errorf("expected data URL, got %s", result[0].Media[0][:40]) + } +} + +func TestResolveMediaRefs_SkipsOversizedFile(t *testing.T) { + store := media.NewFileMediaStore() + + bigPath := filepath.Join(t.TempDir(), "big.jpg") + if err := os.WriteFile(bigPath, make([]byte, maxMediaFileSize+1), 0o644); err != nil { + t.Fatal(err) + } + + ref, err := store.Store(bigPath, media.MediaMeta{ContentType: "image/jpeg"}, "test") + if err != nil { + t.Fatal(err) + } + + msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{ref}}} + result := resolveMediaRefs(msgs, store) + + if len(result[0].Media) != 0 { + t.Error("oversized file should be skipped") + } +} + +func TestResolveMediaRefs_SkipsUnknownExtension(t *testing.T) { + store := media.NewFileMediaStore() + + txtPath := filepath.Join(t.TempDir(), "readme.txt") + if err := os.WriteFile(txtPath, []byte("hello"), 0o644); err != nil { + t.Fatal(err) + } + + ref, err := store.Store(txtPath, media.MediaMeta{}, "test") + if err != nil { + t.Fatal(err) + } + + msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{ref}}} + result := resolveMediaRefs(msgs, store) + + if len(result[0].Media) != 0 { + t.Error("unknown extension with no ContentType should be skipped") + } +} + +func TestResolveMediaRefs_DoesNotMutateOriginal(t *testing.T) { + store := media.NewFileMediaStore() + + imgPath := filepath.Join(t.TempDir(), "test.jpg") + if err := os.WriteFile(imgPath, []byte("data"), 0o644); err != nil { + t.Fatal(err) + } + + ref, _ := store.Store(imgPath, media.MediaMeta{ContentType: "image/jpeg"}, "test") + original := []providers.Message{{Role: "user", Content: "hi", Media: []string{ref}}} + resolveMediaRefs(original, store) + + if !strings.HasPrefix(original[0].Media[0], "media://") { + t.Error("original message should not be mutated") + } +} diff --git a/pkg/providers/openai_compat/provider.go b/pkg/providers/openai_compat/provider.go index f6c5a3664..770881b49 100644 --- a/pkg/providers/openai_compat/provider.go +++ b/pkg/providers/openai_compat/provider.go @@ -235,6 +235,12 @@ func serializeMessages(messages []Message) []map[string]interface{} { "role": m.Role, "content": parts, } + if m.ToolCallID != "" { + msg["tool_call_id"] = m.ToolCallID + } + if len(m.ToolCalls) > 0 { + msg["tool_calls"] = m.ToolCalls + } if m.ReasoningContent != "" { msg["reasoning_content"] = m.ReasoningContent } diff --git a/pkg/providers/openai_compat/provider_test.go b/pkg/providers/openai_compat/provider_test.go index d9e6ba871..da3e48cf0 100644 --- a/pkg/providers/openai_compat/provider_test.go +++ b/pkg/providers/openai_compat/provider_test.go @@ -411,3 +411,73 @@ func TestProvider_FunctionalOptionRequestTimeoutNonPositive(t *testing.T) { t.Fatalf("http timeout = %v, want %v", p.httpClient.Timeout, defaultRequestTimeout) } } + +func TestSerializeMessages_PlainText(t *testing.T) { + msgs := []Message{ + {Role: "user", Content: "hello"}, + {Role: "assistant", Content: "hi"}, + } + result := serializeMessages(msgs) + if len(result) != 2 { + t.Fatalf("expected 2 messages, got %d", len(result)) + } + if result[0]["content"] != "hello" { + t.Errorf("expected plain string content, got %v", result[0]["content"]) + } +} + +func TestSerializeMessages_WithMedia(t *testing.T) { + msgs := []Message{ + {Role: "user", Content: "describe this", Media: []string{"data:image/png;base64,abc123"}}, + } + result := serializeMessages(msgs) + if len(result) != 1 { + t.Fatalf("expected 1 message, got %d", len(result)) + } + parts, ok := result[0]["content"].([]map[string]interface{}) + if !ok { + t.Fatalf("expected content to be []map, got %T", result[0]["content"]) + } + if len(parts) != 2 { + t.Fatalf("expected 2 parts (text + image), got %d", len(parts)) + } + if parts[0]["type"] != "text" { + t.Errorf("expected first part type=text, got %v", parts[0]["type"]) + } + if parts[1]["type"] != "image_url" { + t.Errorf("expected second part type=image_url, got %v", parts[1]["type"]) + } +} + +func TestSerializeMessages_WithMediaPreservesToolFields(t *testing.T) { + msgs := []Message{ + { + Role: "assistant", + Content: "result", + Media: []string{"data:image/png;base64,abc"}, + ToolCallID: "call_123", + ToolCalls: []ToolCall{{ID: "tc_1", Type: "function", Function: &FunctionCall{Name: "test", Arguments: "{}"}}}, + ReasoningContent: "thinking...", + }, + } + result := serializeMessages(msgs) + if result[0]["tool_call_id"] != "call_123" { + t.Errorf("expected tool_call_id=call_123, got %v", result[0]["tool_call_id"]) + } + if result[0]["tool_calls"] == nil { + t.Error("expected tool_calls to be present") + } + if result[0]["reasoning_content"] != "thinking..." { + t.Errorf("expected reasoning_content, got %v", result[0]["reasoning_content"]) + } +} + +func TestSerializeMessages_EmptyMediaUsesPlainFormat(t *testing.T) { + msgs := []Message{ + {Role: "user", Content: "hello", Media: []string{}}, + } + result := serializeMessages(msgs) + if _, ok := result[0]["content"].(string); !ok { + t.Errorf("empty Media should use plain string format, got %T", result[0]["content"]) + } +}