refined code

2026-06-12 18:08:54 +00:00 · 2026-03-09 09:32:21 +01:00
parent c69095457f
commit ff54128ab4
5 changed files with 117 additions and 147 deletions
@@ -70,7 +70,8 @@ func NewAgentInstance(
 	toolsRegistry := tools.NewToolRegistry()

 	if cfg.Tools.IsToolEnabled("read_file") {
-		toolsRegistry.Register(tools.NewReadFileTool(workspace, readRestrict, allowReadPaths))
+		maxReadFileSize := cfg.Tools.ReadFile.MaxReadFileSize
+		toolsRegistry.Register(tools.NewReadFileTool(workspace, readRestrict, maxReadFileSize, allowReadPaths))
 	}
 	if cfg.Tools.IsToolEnabled("write_file") {
 		toolsRegistry.Register(tools.NewWriteFileTool(workspace, restrict, allowWritePaths))
@@ -660,6 +660,11 @@ type MediaCleanupConfig struct {
 	Interval   int `                                    env:"PICOCLAW_MEDIA_CLEANUP_INTERVAL" json:"interval_minutes"`
 }

+type ReadFileToolConfig struct {
+	Enabled         bool `json:"enabled"`
+	MaxReadFileSize int  `json:"max_read_file_size"`
+}
+
 type ToolsConfig struct {
 	AllowReadPaths  []string           `json:"allow_read_paths"  env:"PICOCLAW_TOOLS_ALLOW_READ_PATHS"`
 	AllowWritePaths []string           `json:"allow_write_paths" env:"PICOCLAW_TOOLS_ALLOW_WRITE_PATHS"`
@@ -676,7 +681,7 @@ type ToolsConfig struct {
 	InstallSkill    ToolConfig         `json:"install_skill"                                            envPrefix:"PICOCLAW_TOOLS_INSTALL_SKILL_"`
 	ListDir         ToolConfig         `json:"list_dir"                                                 envPrefix:"PICOCLAW_TOOLS_LIST_DIR_"`
 	Message         ToolConfig         `json:"message"                                                  envPrefix:"PICOCLAW_TOOLS_MESSAGE_"`
-	ReadFile        ToolConfig         `json:"read_file"                                                envPrefix:"PICOCLAW_TOOLS_READ_FILE_"`
+	ReadFile        ReadFileToolConfig `json:"read_file"                                                envPrefix:"PICOCLAW_TOOLS_READ_FILE_"`
 	SendFile        ToolConfig         `json:"send_file"                                                envPrefix:"PICOCLAW_TOOLS_SEND_FILE_"`
 	Spawn           ToolConfig         `json:"spawn"                                                    envPrefix:"PICOCLAW_TOOLS_SPAWN_"`
 	SPI             ToolConfig         `json:"spi"                                                      envPrefix:"PICOCLAW_TOOLS_SPI_"`
@@ -458,8 +458,9 @@ func DefaultConfig() *Config {
 			Message: ToolConfig{
 				Enabled: true,
 			},
-			ReadFile: ToolConfig{
-				Enabled: true,
+			ReadFile: ReadFileToolConfig{
+				Enabled:         true,
+				MaxReadFileSize: 64 * 1024, // 64KB
 			},
 			Spawn: ToolConfig{
 				Enabled: true,
@@ -1,12 +1,12 @@
 package tools

 import (
-	"bytes"
 	"context"
+	"errors"
 	"fmt"
 	"io"
 	"io/fs"
-	"net/http"
+	"math"
 	"os"
 	"path/filepath"
 	"regexp"
@@ -15,9 +15,10 @@ import (
 	"time"

 	"github.com/sipeed/picoclaw/pkg/fileutil"
+	"github.com/sipeed/picoclaw/pkg/logger"
 )

-const MaxReadFileSize = 128 * 1024 // 64KB limit to avoid context overflow
+const MaxReadFileSize = 64 * 1024 // 64KB limit to avoid context overflow

 // validatePath ensures the given path is within the workspace if restrict is true.
 func validatePath(path, workspace string, restrict bool) (string, error) {
@@ -91,15 +92,30 @@ func isWithinWorkspace(candidate, workspace string) bool {
 }

 type ReadFileTool struct {
-	fs fileSystem
+	fs      fileSystem
+	maxSize int64
 }

-func NewReadFileTool(workspace string, restrict bool, allowPaths ...[]*regexp.Regexp) *ReadFileTool {
+func NewReadFileTool(
+	workspace string,
+	restrict bool,
+	maxReadFileSize int,
+	allowPaths ...[]*regexp.Regexp,
+) *ReadFileTool {
 	var patterns []*regexp.Regexp
 	if len(allowPaths) > 0 {
 		patterns = allowPaths[0]
 	}
-	return &ReadFileTool{fs: buildFs(workspace, restrict, patterns)}
+
+	maxSize := int64(maxReadFileSize)
+	if maxSize <= 0 {
+		maxSize = MaxReadFileSize
+	}
+
+	return &ReadFileTool{
+		fs:      buildFs(workspace, restrict, patterns),
+		maxSize: maxSize,
+	}
 }

 func (t *ReadFileTool) Name() string {
@@ -107,9 +123,7 @@ func (t *ReadFileTool) Name() string {
 }

 func (t *ReadFileTool) Description() string {
-	return "Read the contents of a file. Supports pagination via `offset` and `length` " +
-		"for files larger than the per-call limit. If the response header indicates the " +
-		"file is TRUNCATED, use the provided offset in your next call to continue reading."
+	return "Read the contents of a file. Supports pagination via `offset` and `length`."
 }

 func (t *ReadFileTool) Parameters() map[string]any {
@@ -122,15 +136,13 @@ func (t *ReadFileTool) Parameters() map[string]any {
 			},
 			"offset": map[string]any{
 				"type":        "integer",
-				"description": "Byte offset to start reading from (default: 0).",
+				"description": "Byte offset to start reading from.",
 				"default":     0,
 			},
 			"length": map[string]any{
-				"type": "integer",
-				"description": fmt.Sprintf(
-					"Maximum number of bytes to read (default / max: %d).", MaxReadFileSize,
-				),
-				"default": MaxReadFileSize,
+				"type":        "integer",
+				"description": "Maximum number of bytes to read.",
+				"default":     t.maxSize,
 			},
 		},
 		"required": []string{"path"},
@@ -153,15 +165,15 @@ func (t *ReadFileTool) Execute(ctx context.Context, args map[string]any) *ToolRe
 	}

 	// length (optional, capped at MaxReadFileSize)
-	length, err := getInt64Arg(args, "length", MaxReadFileSize)
+	length, err := getInt64Arg(args, "length", t.maxSize)
 	if err != nil {
 		return ErrorResult(err.Error())
 	}
 	if length <= 0 {
 		return ErrorResult("length must be > 0")
 	}
-	if length > MaxReadFileSize {
-		length = MaxReadFileSize
+	if length > t.maxSize {
+		length = t.maxSize
 	}

 	file, err := t.fs.Open(path)
@@ -174,66 +186,104 @@ func (t *ReadFileTool) Execute(ctx context.Context, args map[string]any) *ToolRe
 	totalSize := int64(-1) // -1 means unknown
 	if info, err := file.Stat(); err == nil {
 		totalSize = info.Size()
-	} else {
-		return ErrorResult(fmt.Sprintf("failed to get file info: %v", err))
 	}

-	// seek to offset
+	// sniff the first 512 bytes to detect binary content before loading
+	// it into the LLM context. Seeking back to 0 afterwards restores state.
+	sniff := make([]byte, 512)
+	sniffN, _ := file.Read(sniff)
+
+	// Reset read position to beginning before applying the caller's offset.
+	if seeker, ok := file.(io.Seeker); ok {
+		if _, err := seeker.Seek(0, io.SeekStart); err != nil {
+			return ErrorResult(fmt.Sprintf("failed to reset file position after sniff: %v", err))
+		}
+	} else {
+		// Non-seekable: we consumed sniffN bytes above; account for them when
+		// discarding to reach the requested offset below.
+		// If offset < sniffN the data we already read covers it, which we
+		// cannot replay on a non-seekable stream — return a clear error.
+		if offset < int64(sniffN) && offset > 0 {
+			return ErrorResult(
+				"non-seekable file: cannot seek to an offset within the first 512 bytes after binary detection",
+			)
+		}
+	}
+
+	// Seek to the requested offset.
 	if seeker, ok := file.(io.Seeker); ok {
 		if _, err := seeker.Seek(offset, io.SeekStart); err != nil {
 			return ErrorResult(fmt.Sprintf("failed to seek to offset %d: %v", offset, err))
 		}
 	} else if offset > 0 {
 		// Fallback for non-seekable streams: discard leading bytes.
-		if _, err := io.CopyN(io.Discard, file, offset); err != nil {
-			return ErrorResult(fmt.Sprintf("failed to advance to offset %d: %v", offset, err))
+		// sniffN bytes were already consumed above, so subtract them.
+		remaining := offset - int64(sniffN)
+		if remaining > 0 {
+			if _, err := io.CopyN(io.Discard, file, remaining); err != nil {
+				return ErrorResult(fmt.Sprintf("failed to advance to offset %d: %v", offset, err))
+			}
 		}
 	}

-	// read up to `length` bytes
-	data, err := io.ReadAll(io.LimitReader(file, length))
-	if err != nil {
+	// read length+1 bytes to reliably detect whether more content exists
+	// without relying on totalSize (which may be -1 for non-seekable streams).
+	// This avoids the false-positive TRUNCATED message on the last page.
+	probe := make([]byte, length+1)
+	n, err := io.ReadFull(file, probe)
+	// FIX: io.ReadFull returns io.ErrUnexpectedEOF for partial reads (0 < n < len),
+	// and io.EOF only when n == 0. Both are normal terminal conditions — only
+	// other errors are genuine failures.
+	if err != nil && err != io.EOF && !errors.Is(err, io.ErrUnexpectedEOF) {
 		return ErrorResult(fmt.Sprintf("failed to read file content: %v", err))
 	}

-	if len(data) == 0 && offset > 0 {
-		return NewToolResult("[END OF FILE — no content at this offset]")
-	}
+	// hasMore is true only when we actually got the extra probe byte.
+	hasMore := int64(n) > length
+	data := probe[:min(int64(n), length)]

-	// build metadata header
-	readEnd := offset + int64(len(data))
-	hasMore := int64(len(data)) == length && (totalSize < 0 || readEnd < totalSize)
-
-	// Calculates the reading range avoiding negative numbers if the file is empty
-	var readRange string
 	if len(data) == 0 {
-		readRange = "0 bytes"
-	} else {
-		readRange = fmt.Sprintf("bytes %d–%d", offset, readEnd-1)
+		return NewToolResult("[END OF FILE - no content at this offset]")
 	}

+	// Build metadata header.
+	// use filepath.Base(path) instead of the raw path to avoid leaking
+	// internal filesystem structure into the LLM context.
+	readEnd := offset + int64(len(data))
+	// use ASCII hyphen-minus instead of en-dash (U+2013) to keep the
+	// header parseable by downstream tools and log processors.
+	readRange := fmt.Sprintf("bytes %d-%d", offset, readEnd-1)
+
+	displayPath := filepath.Base(path)
 	var header string
 	if totalSize >= 0 {
 		header = fmt.Sprintf(
 			"[file: %s | total: %d bytes | read: %s]",
-			path, totalSize, readRange,
+			displayPath, totalSize, readRange,
 		)
 	} else {
 		header = fmt.Sprintf(
 			"[file: %s | read: %s | total size unknown]",
-			path, readRange,
+			displayPath, readRange,
 		)
 	}

 	if hasMore {
 		header += fmt.Sprintf(
-			"\n[TRUNCATED — file has more content. Call read_file again with offset=%d to continue.]",
+			"\n[TRUNCATED - file has more content. Call read_file again with offset=%d to continue.]",
 			readEnd,
 		)
 	} else {
-		header += "\n[END OF FILE — no further content.]"
+		header += "\n[END OF FILE - no further content.]"
 	}

+	logger.DebugCF("tool", "ReadFileTool execution completed successfully",
+		map[string]any{
+			"path":       path,
+			"bytes_read": len(data),
+			"has_more":   hasMore,
+		})
+
 	return NewToolResult(header + "\n\n" + string(data))
 }

@@ -247,6 +297,12 @@ func getInt64Arg(args map[string]any, key string, defaultVal int64) (int64, erro

 	switch v := raw.(type) {
 	case float64:
+		if v != math.Trunc(v) {
+			return 0, fmt.Errorf("%s must be an integer, got float %v", key, v)
+		}
+		if v > math.MaxInt64 || v < math.MinInt64 {
+			return 0, fmt.Errorf("%s value %v overflows int64", key, v)
+		}
 		return int64(v), nil
 	case int:
 		return int64(v), nil
@@ -636,33 +692,3 @@ func getSafeRelPath(workspace, path string) (string, error) {

 	return rel, nil
 }
-
-// isBinaryFile uses common heuristics to determine if the content is a binary file.
-func isBinaryFile(content []byte) bool {
-	if len(content) == 0 {
-		return false
-	}
-
-	// Sample the first 512 bytes (or less if the file is smaller)
-	limit := len(content)
-	if limit > 512 {
-		limit = 512
-	}
-	sample := content[:limit]
-
-	// Check for NUL bytes in the sample (standard binary detection)
-	if bytes.IndexByte(sample, 0) != -1 {
-		return true
-	}
-
-	// Use standard library content type detection to catch specific formats like PDF
-	contentType := http.DetectContentType(sample)
-	if contentType == "application/pdf" ||
-		strings.HasPrefix(contentType, "image/") ||
-		strings.HasPrefix(contentType, "video/") ||
-		strings.HasPrefix(contentType, "audio/") {
-		return true
-	}
-
-	return false
-}
@@ -18,7 +18,7 @@ func TestFilesystemTool_ReadFile_Success(t *testing.T) {
 	testFile := filepath.Join(tmpDir, "test.txt")
 	os.WriteFile(testFile, []byte("test content"), 0o644)

-	tool := NewReadFileTool("", false)
+	tool := NewReadFileTool("", false, MaxReadFileSize)
 	ctx := context.Background()
 	args := map[string]any{
 		"path": testFile,
@@ -45,7 +45,7 @@ func TestFilesystemTool_ReadFile_Success(t *testing.T) {

 // TestFilesystemTool_ReadFile_NotFound verifies error handling for missing file
 func TestFilesystemTool_ReadFile_NotFound(t *testing.T) {
-	tool := NewReadFileTool("", false)
+	tool := NewReadFileTool("", false, MaxReadFileSize)
 	ctx := context.Background()
 	args := map[string]any{
 		"path": "/nonexistent_file_12345.txt",
@@ -271,7 +271,7 @@ func TestFilesystemTool_ReadFile_RejectsSymlinkEscape(t *testing.T) {
 		t.Skipf("symlink not supported in this environment: %v", err)
 	}

-	tool := NewReadFileTool(workspace, true)
+	tool := NewReadFileTool(workspace, true, MaxReadFileSize)
 	result := tool.Execute(context.Background(), map[string]any{
 		"path": link,
 	})
@@ -289,7 +289,7 @@ func TestFilesystemTool_ReadFile_RejectsSymlinkEscape(t *testing.T) {
 }

 func TestFilesystemTool_EmptyWorkspace_AccessDenied(t *testing.T) {
-	tool := NewReadFileTool("", true) // restrict=true but workspace=""
+	tool := NewReadFileTool("", true, MaxReadFileSize) // restrict=true but workspace=""

 	// Try to read a sensitive file (simulated by a temp file outside workspace)
 	tmpDir := t.TempDir()
@@ -499,7 +499,7 @@ func TestWhitelistFs_AllowsMatchingPaths(t *testing.T) {
 	// Pattern allows access to the outsideDir.
 	patterns := []*regexp.Regexp{regexp.MustCompile(`^` + regexp.QuoteMeta(outsideDir))}

-	tool := NewReadFileTool(workspace, true, patterns)
+	tool := NewReadFileTool(workspace, true, MaxReadFileSize, patterns)

 	// Read from whitelisted path should succeed.
 	result := tool.Execute(context.Background(), map[string]any{"path": outsideFile})
@@ -521,69 +521,6 @@ func TestWhitelistFs_AllowsMatchingPaths(t *testing.T) {
 	}
 }

-func TestIsBinaryFile(t *testing.T) {
-	tests := []struct {
-		name     string
-		content  []byte
-		expected bool
-	}{
-		{
-			name:     "empty content",
-			content:  []byte(""),
-			expected: false,
-		},
-		{
-			name:     "plain text",
-			content:  []byte("This is a normal text file with punctuation and 12345 numbers."),
-			expected: false,
-		},
-		{
-			name:     "contains null byte",
-			content:  []byte("plain text\x00followed by a null byte"),
-			expected: true,
-		},
-		{
-			name:     "pdf header",
-			content:  []byte("%PDF-1.4\n%\xE2\xE3\xCF\xD3\n1 0 obj\n<</Type/Catalog/Pages 2 0 R>>"),
-			expected: true,
-		},
-		{
-			name:     "png magic bytes",
-			content:  []byte("\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x01\x00"),
-			expected: true,
-		},
-		{
-			name:     "jpeg magic bytes",
-			content:  []byte("\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H"),
-			expected: true,
-		},
-		{
-			name:     "html text (not binary)",
-			content:  []byte("<!DOCTYPE html><html><body><h1>Ciao</h1></body></html>"),
-			expected: false,
-		},
-		{
-			name:     "json text (not binary)",
-			content:  []byte(`{"key": "value", "number": 42}`),
-			expected: false,
-		},
-		{
-			name:     "markdown text (not binary)",
-			content:  []byte("# Markdown Title\n\nThis is a **bold text** and a [link](https://example.com)."),
-			expected: false,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			result := isBinaryFile(tt.content)
-			if result != tt.expected {
-				t.Errorf("isBinaryFile() for %q returned %v, expected %v", tt.name, result, tt.expected)
-			}
-		})
-	}
-}
-
 // TestReadFileTool_ChunkedReading verifies the pagination logic of the tool
 // by reading a file in multiple chunks using 'offset' and 'length'.
 func TestReadFileTool_ChunkedReading(t *testing.T) {
@@ -597,7 +534,7 @@ func TestReadFileTool_ChunkedReading(t *testing.T) {
 		t.Fatalf("Failed to write test file: %v", err)
 	}

-	tool := NewReadFileTool(tmpDir, false)
+	tool := NewReadFileTool(tmpDir, false, MaxReadFileSize)
 	ctx := context.Background()

 	// --- Step 1: Read the first chunk (10 bytes) ---
@@ -686,7 +623,7 @@ func TestReadFileTool_OffsetBeyondEOF(t *testing.T) {
 		t.Fatalf("Failed to write test file: %v", err)
 	}

-	tool := NewReadFileTool(tmpDir, false)
+	tool := NewReadFileTool(tmpDir, false, MaxReadFileSize)
 	ctx := context.Background()

 	args := map[string]any{
@@ -702,7 +639,7 @@ func TestReadFileTool_OffsetBeyondEOF(t *testing.T) {
 	}

 	// Must return EXACTLY the string provided in the code
-	expectedMsg := "[END OF FILE — no content at this offset]"
+	expectedMsg := "[END OF FILE - no content at this offset]"
 	if result.ForLLM != expectedMsg {
 		t.Errorf("The message %q was expected, obtained: %q", expectedMsg, result.ForLLM)
 	}