mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
Revert "feat(agent): add vision/image support to agent pipeline"
This commit is contained in:
@@ -465,11 +465,10 @@ func (cb *ContextBuilder) BuildMessages(
|
||||
messages = append(messages, history...)
|
||||
|
||||
// Add current user message
|
||||
if strings.TrimSpace(currentMessage) != "" || len(media) > 0 {
|
||||
if strings.TrimSpace(currentMessage) != "" {
|
||||
messages = append(messages, providers.Message{
|
||||
Role: "user",
|
||||
Content: currentMessage,
|
||||
Media: media,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
+6
-113
@@ -8,11 +8,9 @@ package agent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -49,12 +47,11 @@ type AgentLoop struct {
|
||||
|
||||
// processOptions configures how a message is processed
|
||||
type processOptions struct {
|
||||
SessionKey string // Session identifier for history/context
|
||||
Channel string // Target channel for tool execution
|
||||
ChatID string // Target chat ID for tool execution
|
||||
UserMessage string // User message content (may include prefix)
|
||||
Media []string // Media URLs attached to the user message
|
||||
DefaultResponse string // Response when LLM returns empty
|
||||
SessionKey string // Session identifier for history/context
|
||||
Channel string // Target channel for tool execution
|
||||
ChatID string // Target chat ID for tool execution
|
||||
UserMessage string // User message content (may include prefix)
|
||||
DefaultResponse string // Response when LLM returns empty
|
||||
EnableSummary bool // Whether to trigger summarization
|
||||
SendResponse bool // Whether to send response via bus
|
||||
NoHistory bool // If true, don't load session history (for heartbeat)
|
||||
@@ -499,7 +496,6 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
|
||||
Channel: msg.Channel,
|
||||
ChatID: msg.ChatID,
|
||||
UserMessage: msg.Content,
|
||||
Media: msg.Media,
|
||||
DefaultResponse: defaultResponse,
|
||||
EnableSummary: true,
|
||||
SendResponse: false,
|
||||
@@ -606,11 +602,10 @@ func (al *AgentLoop) runAgentLoop(
|
||||
history,
|
||||
summary,
|
||||
opts.UserMessage,
|
||||
opts.Media,
|
||||
nil,
|
||||
opts.Channel,
|
||||
opts.ChatID,
|
||||
)
|
||||
messages = resolveMediaRefs(messages, al.mediaStore)
|
||||
|
||||
// 3. Save user message to session
|
||||
agent.Sessions.AddMessage(opts.SessionKey, "user", opts.UserMessage)
|
||||
@@ -1481,105 +1476,3 @@ func extractParentPeer(msg bus.InboundMessage) *routing.RoutePeer {
|
||||
}
|
||||
return &routing.RoutePeer{Kind: parentKind, ID: parentID}
|
||||
}
|
||||
|
||||
// maxMediaFileSize is the maximum file size (20 MB) for media resolution.
|
||||
// Files larger than this are skipped to prevent OOM under concurrent load.
|
||||
const maxMediaFileSize = 20 * 1024 * 1024
|
||||
|
||||
// resolveMediaRefs replaces media:// refs in message Media fields with base64 data URLs.
|
||||
// Returns a new slice with resolved URLs; original messages are not mutated.
|
||||
func resolveMediaRefs(messages []providers.Message, store media.MediaStore) []providers.Message {
|
||||
if store == nil {
|
||||
return messages
|
||||
}
|
||||
|
||||
result := make([]providers.Message, len(messages))
|
||||
copy(result, messages)
|
||||
|
||||
for i, m := range result {
|
||||
if len(m.Media) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
resolved := make([]string, 0, len(m.Media))
|
||||
for _, ref := range m.Media {
|
||||
if !strings.HasPrefix(ref, "media://") {
|
||||
resolved = append(resolved, ref)
|
||||
continue
|
||||
}
|
||||
|
||||
localPath, meta, err := store.ResolveWithMeta(ref)
|
||||
if err != nil {
|
||||
logger.WarnCF("agent", "Failed to resolve media ref", map[string]any{
|
||||
"ref": ref,
|
||||
"error": err.Error(),
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
info, err := os.Stat(localPath)
|
||||
if err != nil {
|
||||
logger.WarnCF("agent", "Failed to stat media file", map[string]any{
|
||||
"path": localPath,
|
||||
"error": err.Error(),
|
||||
})
|
||||
continue
|
||||
}
|
||||
if info.Size() > maxMediaFileSize {
|
||||
logger.WarnCF("agent", "Media file too large, skipping", map[string]any{
|
||||
"path": localPath,
|
||||
"size": info.Size(),
|
||||
"max_size": maxMediaFileSize,
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(localPath)
|
||||
if err != nil {
|
||||
logger.WarnCF("agent", "Failed to read media file", map[string]any{
|
||||
"path": localPath,
|
||||
"error": err.Error(),
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
mime := meta.ContentType
|
||||
if mime == "" {
|
||||
mime = mimeFromExtension(filepath.Ext(localPath))
|
||||
}
|
||||
if mime == "" {
|
||||
logger.WarnCF("agent", "Unknown media type, skipping", map[string]any{
|
||||
"path": localPath,
|
||||
"ext": filepath.Ext(localPath),
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
dataURL := "data:" + mime + ";base64," + base64.StdEncoding.EncodeToString(data)
|
||||
resolved = append(resolved, dataURL)
|
||||
}
|
||||
|
||||
result[i].Media = resolved
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// mimeFromExtension returns a MIME type for common image extensions.
|
||||
// Returns empty string for unrecognized extensions.
|
||||
func mimeFromExtension(ext string) string {
|
||||
switch strings.ToLower(ext) {
|
||||
case ".jpg", ".jpeg":
|
||||
return "image/jpeg"
|
||||
case ".png":
|
||||
return "image/png"
|
||||
case ".gif":
|
||||
return "image/gif"
|
||||
case ".webp":
|
||||
return "image/webp"
|
||||
case ".bmp":
|
||||
return "image/bmp"
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,14 +6,12 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/sipeed/picoclaw/pkg/bus"
|
||||
"github.com/sipeed/picoclaw/pkg/channels"
|
||||
"github.com/sipeed/picoclaw/pkg/config"
|
||||
"github.com/sipeed/picoclaw/pkg/media"
|
||||
"github.com/sipeed/picoclaw/pkg/providers"
|
||||
"github.com/sipeed/picoclaw/pkg/tools"
|
||||
)
|
||||
@@ -810,124 +808,3 @@ func TestHandleReasoning(t *testing.T) {
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestMimeFromExtension(t *testing.T) {
|
||||
tests := []struct {
|
||||
ext string
|
||||
want string
|
||||
}{
|
||||
{".jpg", "image/jpeg"},
|
||||
{".JPEG", "image/jpeg"},
|
||||
{".png", "image/png"},
|
||||
{".gif", "image/gif"},
|
||||
{".webp", "image/webp"},
|
||||
{".bmp", "image/bmp"},
|
||||
{".txt", ""},
|
||||
{".pdf", ""},
|
||||
{"", ""},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
if got := mimeFromExtension(tt.ext); got != tt.want {
|
||||
t.Errorf("mimeFromExtension(%q) = %q, want %q", tt.ext, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveMediaRefs_NilStore(t *testing.T) {
|
||||
msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{"media://abc"}}}
|
||||
result := resolveMediaRefs(msgs, nil)
|
||||
if result[0].Media[0] != "media://abc" {
|
||||
t.Error("nil store should return messages unchanged")
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveMediaRefs_NonMediaRef(t *testing.T) {
|
||||
msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{"https://example.com/img.png"}}}
|
||||
result := resolveMediaRefs(msgs, media.NewFileMediaStore())
|
||||
if result[0].Media[0] != "https://example.com/img.png" {
|
||||
t.Error("non-media:// refs should be passed through unchanged")
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveMediaRefs_ResolvesToBase64(t *testing.T) {
|
||||
store := media.NewFileMediaStore()
|
||||
|
||||
imgPath := filepath.Join(t.TempDir(), "test.png")
|
||||
if err := os.WriteFile(imgPath, []byte("fake-png-data"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
ref, err := store.Store(imgPath, media.MediaMeta{ContentType: "image/png"}, "test")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
msgs := []providers.Message{{Role: "user", Content: "describe", Media: []string{ref}}}
|
||||
result := resolveMediaRefs(msgs, store)
|
||||
|
||||
if len(result[0].Media) != 1 {
|
||||
t.Fatalf("expected 1 resolved media, got %d", len(result[0].Media))
|
||||
}
|
||||
if !strings.HasPrefix(result[0].Media[0], "data:image/png;base64,") {
|
||||
t.Errorf("expected data URL, got %s", result[0].Media[0][:40])
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveMediaRefs_SkipsOversizedFile(t *testing.T) {
|
||||
store := media.NewFileMediaStore()
|
||||
|
||||
bigPath := filepath.Join(t.TempDir(), "big.jpg")
|
||||
if err := os.WriteFile(bigPath, make([]byte, maxMediaFileSize+1), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
ref, err := store.Store(bigPath, media.MediaMeta{ContentType: "image/jpeg"}, "test")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{ref}}}
|
||||
result := resolveMediaRefs(msgs, store)
|
||||
|
||||
if len(result[0].Media) != 0 {
|
||||
t.Error("oversized file should be skipped")
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveMediaRefs_SkipsUnknownExtension(t *testing.T) {
|
||||
store := media.NewFileMediaStore()
|
||||
|
||||
txtPath := filepath.Join(t.TempDir(), "readme.txt")
|
||||
if err := os.WriteFile(txtPath, []byte("hello"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
ref, err := store.Store(txtPath, media.MediaMeta{}, "test")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
msgs := []providers.Message{{Role: "user", Content: "hi", Media: []string{ref}}}
|
||||
result := resolveMediaRefs(msgs, store)
|
||||
|
||||
if len(result[0].Media) != 0 {
|
||||
t.Error("unknown extension with no ContentType should be skipped")
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveMediaRefs_DoesNotMutateOriginal(t *testing.T) {
|
||||
store := media.NewFileMediaStore()
|
||||
|
||||
imgPath := filepath.Join(t.TempDir(), "test.jpg")
|
||||
if err := os.WriteFile(imgPath, []byte("data"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
ref, _ := store.Store(imgPath, media.MediaMeta{ContentType: "image/jpeg"}, "test")
|
||||
original := []providers.Message{{Role: "user", Content: "hi", Media: []string{ref}}}
|
||||
resolveMediaRefs(original, store)
|
||||
|
||||
if !strings.HasPrefix(original[0].Media[0], "media://") {
|
||||
t.Error("original message should not be mutated")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -116,7 +116,7 @@ func (p *Provider) Chat(
|
||||
|
||||
requestBody := map[string]any{
|
||||
"model": model,
|
||||
"messages": serializeMessages(messages),
|
||||
"messages": stripSystemParts(messages),
|
||||
}
|
||||
|
||||
if len(tools) > 0 {
|
||||
@@ -195,60 +195,6 @@ func (p *Provider) Chat(
|
||||
return parseResponse(body)
|
||||
}
|
||||
|
||||
func serializeMessages(messages []Message) []map[string]interface{} {
|
||||
result := make([]map[string]interface{}, 0, len(messages))
|
||||
for _, m := range messages {
|
||||
if len(m.Media) == 0 {
|
||||
msg := map[string]interface{}{
|
||||
"role": m.Role,
|
||||
"content": m.Content,
|
||||
}
|
||||
if m.ToolCallID != "" {
|
||||
msg["tool_call_id"] = m.ToolCallID
|
||||
}
|
||||
if len(m.ToolCalls) > 0 {
|
||||
msg["tool_calls"] = m.ToolCalls
|
||||
}
|
||||
if m.ReasoningContent != "" {
|
||||
msg["reasoning_content"] = m.ReasoningContent
|
||||
}
|
||||
result = append(result, msg)
|
||||
continue
|
||||
}
|
||||
|
||||
parts := make([]map[string]interface{}, 0, 1+len(m.Media))
|
||||
if m.Content != "" {
|
||||
parts = append(parts, map[string]interface{}{
|
||||
"type": "text",
|
||||
"text": m.Content,
|
||||
})
|
||||
}
|
||||
for _, mediaURL := range m.Media {
|
||||
parts = append(parts, map[string]interface{}{
|
||||
"type": "image_url",
|
||||
"image_url": map[string]interface{}{
|
||||
"url": mediaURL,
|
||||
},
|
||||
})
|
||||
}
|
||||
msg := map[string]interface{}{
|
||||
"role": m.Role,
|
||||
"content": parts,
|
||||
}
|
||||
if m.ToolCallID != "" {
|
||||
msg["tool_call_id"] = m.ToolCallID
|
||||
}
|
||||
if len(m.ToolCalls) > 0 {
|
||||
msg["tool_calls"] = m.ToolCalls
|
||||
}
|
||||
if m.ReasoningContent != "" {
|
||||
msg["reasoning_content"] = m.ReasoningContent
|
||||
}
|
||||
result = append(result, msg)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func parseResponse(body []byte) (*LLMResponse, error) {
|
||||
var apiResponse struct {
|
||||
Choices []struct {
|
||||
|
||||
@@ -416,73 +416,3 @@ func TestProvider_FunctionalOptionRequestTimeoutNonPositive(t *testing.T) {
|
||||
t.Fatalf("http timeout = %v, want %v", p.httpClient.Timeout, defaultRequestTimeout)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSerializeMessages_PlainText(t *testing.T) {
|
||||
msgs := []Message{
|
||||
{Role: "user", Content: "hello"},
|
||||
{Role: "assistant", Content: "hi"},
|
||||
}
|
||||
result := serializeMessages(msgs)
|
||||
if len(result) != 2 {
|
||||
t.Fatalf("expected 2 messages, got %d", len(result))
|
||||
}
|
||||
if result[0]["content"] != "hello" {
|
||||
t.Errorf("expected plain string content, got %v", result[0]["content"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestSerializeMessages_WithMedia(t *testing.T) {
|
||||
msgs := []Message{
|
||||
{Role: "user", Content: "describe this", Media: []string{"data:image/png;base64,abc123"}},
|
||||
}
|
||||
result := serializeMessages(msgs)
|
||||
if len(result) != 1 {
|
||||
t.Fatalf("expected 1 message, got %d", len(result))
|
||||
}
|
||||
parts, ok := result[0]["content"].([]map[string]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("expected content to be []map, got %T", result[0]["content"])
|
||||
}
|
||||
if len(parts) != 2 {
|
||||
t.Fatalf("expected 2 parts (text + image), got %d", len(parts))
|
||||
}
|
||||
if parts[0]["type"] != "text" {
|
||||
t.Errorf("expected first part type=text, got %v", parts[0]["type"])
|
||||
}
|
||||
if parts[1]["type"] != "image_url" {
|
||||
t.Errorf("expected second part type=image_url, got %v", parts[1]["type"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestSerializeMessages_WithMediaPreservesToolFields(t *testing.T) {
|
||||
msgs := []Message{
|
||||
{
|
||||
Role: "assistant",
|
||||
Content: "result",
|
||||
Media: []string{"data:image/png;base64,abc"},
|
||||
ToolCallID: "call_123",
|
||||
ToolCalls: []ToolCall{{ID: "tc_1", Type: "function", Function: &FunctionCall{Name: "test", Arguments: "{}"}}},
|
||||
ReasoningContent: "thinking...",
|
||||
},
|
||||
}
|
||||
result := serializeMessages(msgs)
|
||||
if result[0]["tool_call_id"] != "call_123" {
|
||||
t.Errorf("expected tool_call_id=call_123, got %v", result[0]["tool_call_id"])
|
||||
}
|
||||
if result[0]["tool_calls"] == nil {
|
||||
t.Error("expected tool_calls to be present")
|
||||
}
|
||||
if result[0]["reasoning_content"] != "thinking..." {
|
||||
t.Errorf("expected reasoning_content, got %v", result[0]["reasoning_content"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestSerializeMessages_EmptyMediaUsesPlainFormat(t *testing.T) {
|
||||
msgs := []Message{
|
||||
{Role: "user", Content: "hello", Media: []string{}},
|
||||
}
|
||||
result := serializeMessages(msgs)
|
||||
if _, ok := result[0]["content"].(string); !ok {
|
||||
t.Errorf("empty Media should use plain string format, got %T", result[0]["content"])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,7 +65,6 @@ type ContentBlock struct {
|
||||
type Message struct {
|
||||
Role string `json:"role"`
|
||||
Content string `json:"content"`
|
||||
Media []string `json:"media,omitempty"` // URLs of images or other media attachments
|
||||
ReasoningContent string `json:"reasoning_content,omitempty"`
|
||||
SystemParts []ContentBlock `json:"system_parts,omitempty"` // structured system blocks for cache-aware adapters
|
||||
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
|
||||
|
||||
Reference in New Issue
Block a user