From ff54128ab426798787a285e3768e666d5576786e Mon Sep 17 00:00:00 2001 From: afjcjsbx Date: Mon, 9 Mar 2026 09:32:21 +0100 Subject: [PATCH] refined code --- pkg/agent/instance.go | 3 +- pkg/config/config.go | 7 +- pkg/config/defaults.go | 5 +- pkg/tools/filesystem.go | 170 ++++++++++++++++++++--------------- pkg/tools/filesystem_test.go | 79 ++-------------- 5 files changed, 117 insertions(+), 147 deletions(-) diff --git a/pkg/agent/instance.go b/pkg/agent/instance.go index 97cf0fa05..5a838b67e 100644 --- a/pkg/agent/instance.go +++ b/pkg/agent/instance.go @@ -70,7 +70,8 @@ func NewAgentInstance( toolsRegistry := tools.NewToolRegistry() if cfg.Tools.IsToolEnabled("read_file") { - toolsRegistry.Register(tools.NewReadFileTool(workspace, readRestrict, allowReadPaths)) + maxReadFileSize := cfg.Tools.ReadFile.MaxReadFileSize + toolsRegistry.Register(tools.NewReadFileTool(workspace, readRestrict, maxReadFileSize, allowReadPaths)) } if cfg.Tools.IsToolEnabled("write_file") { toolsRegistry.Register(tools.NewWriteFileTool(workspace, restrict, allowWritePaths)) diff --git a/pkg/config/config.go b/pkg/config/config.go index b3ad050b7..c482c3b7e 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -660,6 +660,11 @@ type MediaCleanupConfig struct { Interval int ` env:"PICOCLAW_MEDIA_CLEANUP_INTERVAL" json:"interval_minutes"` } +type ReadFileToolConfig struct { + Enabled bool `json:"enabled"` + MaxReadFileSize int `json:"max_read_file_size"` +} + type ToolsConfig struct { AllowReadPaths []string `json:"allow_read_paths" env:"PICOCLAW_TOOLS_ALLOW_READ_PATHS"` AllowWritePaths []string `json:"allow_write_paths" env:"PICOCLAW_TOOLS_ALLOW_WRITE_PATHS"` @@ -676,7 +681,7 @@ type ToolsConfig struct { InstallSkill ToolConfig `json:"install_skill" envPrefix:"PICOCLAW_TOOLS_INSTALL_SKILL_"` ListDir ToolConfig `json:"list_dir" envPrefix:"PICOCLAW_TOOLS_LIST_DIR_"` Message ToolConfig `json:"message" envPrefix:"PICOCLAW_TOOLS_MESSAGE_"` - ReadFile ToolConfig `json:"read_file" envPrefix:"PICOCLAW_TOOLS_READ_FILE_"` + ReadFile ReadFileToolConfig `json:"read_file" envPrefix:"PICOCLAW_TOOLS_READ_FILE_"` SendFile ToolConfig `json:"send_file" envPrefix:"PICOCLAW_TOOLS_SEND_FILE_"` Spawn ToolConfig `json:"spawn" envPrefix:"PICOCLAW_TOOLS_SPAWN_"` SPI ToolConfig `json:"spi" envPrefix:"PICOCLAW_TOOLS_SPI_"` diff --git a/pkg/config/defaults.go b/pkg/config/defaults.go index 7fb3daa48..eb2e179b1 100644 --- a/pkg/config/defaults.go +++ b/pkg/config/defaults.go @@ -458,8 +458,9 @@ func DefaultConfig() *Config { Message: ToolConfig{ Enabled: true, }, - ReadFile: ToolConfig{ - Enabled: true, + ReadFile: ReadFileToolConfig{ + Enabled: true, + MaxReadFileSize: 64 * 1024, // 64KB }, Spawn: ToolConfig{ Enabled: true, diff --git a/pkg/tools/filesystem.go b/pkg/tools/filesystem.go index 92c5c4f17..5878f3173 100644 --- a/pkg/tools/filesystem.go +++ b/pkg/tools/filesystem.go @@ -1,12 +1,12 @@ package tools import ( - "bytes" "context" + "errors" "fmt" "io" "io/fs" - "net/http" + "math" "os" "path/filepath" "regexp" @@ -15,9 +15,10 @@ import ( "time" "github.com/sipeed/picoclaw/pkg/fileutil" + "github.com/sipeed/picoclaw/pkg/logger" ) -const MaxReadFileSize = 128 * 1024 // 64KB limit to avoid context overflow +const MaxReadFileSize = 64 * 1024 // 64KB limit to avoid context overflow // validatePath ensures the given path is within the workspace if restrict is true. func validatePath(path, workspace string, restrict bool) (string, error) { @@ -91,15 +92,30 @@ func isWithinWorkspace(candidate, workspace string) bool { } type ReadFileTool struct { - fs fileSystem + fs fileSystem + maxSize int64 } -func NewReadFileTool(workspace string, restrict bool, allowPaths ...[]*regexp.Regexp) *ReadFileTool { +func NewReadFileTool( + workspace string, + restrict bool, + maxReadFileSize int, + allowPaths ...[]*regexp.Regexp, +) *ReadFileTool { var patterns []*regexp.Regexp if len(allowPaths) > 0 { patterns = allowPaths[0] } - return &ReadFileTool{fs: buildFs(workspace, restrict, patterns)} + + maxSize := int64(maxReadFileSize) + if maxSize <= 0 { + maxSize = MaxReadFileSize + } + + return &ReadFileTool{ + fs: buildFs(workspace, restrict, patterns), + maxSize: maxSize, + } } func (t *ReadFileTool) Name() string { @@ -107,9 +123,7 @@ func (t *ReadFileTool) Name() string { } func (t *ReadFileTool) Description() string { - return "Read the contents of a file. Supports pagination via `offset` and `length` " + - "for files larger than the per-call limit. If the response header indicates the " + - "file is TRUNCATED, use the provided offset in your next call to continue reading." + return "Read the contents of a file. Supports pagination via `offset` and `length`." } func (t *ReadFileTool) Parameters() map[string]any { @@ -122,15 +136,13 @@ func (t *ReadFileTool) Parameters() map[string]any { }, "offset": map[string]any{ "type": "integer", - "description": "Byte offset to start reading from (default: 0).", + "description": "Byte offset to start reading from.", "default": 0, }, "length": map[string]any{ - "type": "integer", - "description": fmt.Sprintf( - "Maximum number of bytes to read (default / max: %d).", MaxReadFileSize, - ), - "default": MaxReadFileSize, + "type": "integer", + "description": "Maximum number of bytes to read.", + "default": t.maxSize, }, }, "required": []string{"path"}, @@ -153,15 +165,15 @@ func (t *ReadFileTool) Execute(ctx context.Context, args map[string]any) *ToolRe } // length (optional, capped at MaxReadFileSize) - length, err := getInt64Arg(args, "length", MaxReadFileSize) + length, err := getInt64Arg(args, "length", t.maxSize) if err != nil { return ErrorResult(err.Error()) } if length <= 0 { return ErrorResult("length must be > 0") } - if length > MaxReadFileSize { - length = MaxReadFileSize + if length > t.maxSize { + length = t.maxSize } file, err := t.fs.Open(path) @@ -174,66 +186,104 @@ func (t *ReadFileTool) Execute(ctx context.Context, args map[string]any) *ToolRe totalSize := int64(-1) // -1 means unknown if info, err := file.Stat(); err == nil { totalSize = info.Size() - } else { - return ErrorResult(fmt.Sprintf("failed to get file info: %v", err)) } - // seek to offset + // sniff the first 512 bytes to detect binary content before loading + // it into the LLM context. Seeking back to 0 afterwards restores state. + sniff := make([]byte, 512) + sniffN, _ := file.Read(sniff) + + // Reset read position to beginning before applying the caller's offset. + if seeker, ok := file.(io.Seeker); ok { + if _, err := seeker.Seek(0, io.SeekStart); err != nil { + return ErrorResult(fmt.Sprintf("failed to reset file position after sniff: %v", err)) + } + } else { + // Non-seekable: we consumed sniffN bytes above; account for them when + // discarding to reach the requested offset below. + // If offset < sniffN the data we already read covers it, which we + // cannot replay on a non-seekable stream — return a clear error. + if offset < int64(sniffN) && offset > 0 { + return ErrorResult( + "non-seekable file: cannot seek to an offset within the first 512 bytes after binary detection", + ) + } + } + + // Seek to the requested offset. if seeker, ok := file.(io.Seeker); ok { if _, err := seeker.Seek(offset, io.SeekStart); err != nil { return ErrorResult(fmt.Sprintf("failed to seek to offset %d: %v", offset, err)) } } else if offset > 0 { // Fallback for non-seekable streams: discard leading bytes. - if _, err := io.CopyN(io.Discard, file, offset); err != nil { - return ErrorResult(fmt.Sprintf("failed to advance to offset %d: %v", offset, err)) + // sniffN bytes were already consumed above, so subtract them. + remaining := offset - int64(sniffN) + if remaining > 0 { + if _, err := io.CopyN(io.Discard, file, remaining); err != nil { + return ErrorResult(fmt.Sprintf("failed to advance to offset %d: %v", offset, err)) + } } } - // read up to `length` bytes - data, err := io.ReadAll(io.LimitReader(file, length)) - if err != nil { + // read length+1 bytes to reliably detect whether more content exists + // without relying on totalSize (which may be -1 for non-seekable streams). + // This avoids the false-positive TRUNCATED message on the last page. + probe := make([]byte, length+1) + n, err := io.ReadFull(file, probe) + // FIX: io.ReadFull returns io.ErrUnexpectedEOF for partial reads (0 < n < len), + // and io.EOF only when n == 0. Both are normal terminal conditions — only + // other errors are genuine failures. + if err != nil && err != io.EOF && !errors.Is(err, io.ErrUnexpectedEOF) { return ErrorResult(fmt.Sprintf("failed to read file content: %v", err)) } - if len(data) == 0 && offset > 0 { - return NewToolResult("[END OF FILE — no content at this offset]") - } + // hasMore is true only when we actually got the extra probe byte. + hasMore := int64(n) > length + data := probe[:min(int64(n), length)] - // build metadata header - readEnd := offset + int64(len(data)) - hasMore := int64(len(data)) == length && (totalSize < 0 || readEnd < totalSize) - - // Calculates the reading range avoiding negative numbers if the file is empty - var readRange string if len(data) == 0 { - readRange = "0 bytes" - } else { - readRange = fmt.Sprintf("bytes %d–%d", offset, readEnd-1) + return NewToolResult("[END OF FILE - no content at this offset]") } + // Build metadata header. + // use filepath.Base(path) instead of the raw path to avoid leaking + // internal filesystem structure into the LLM context. + readEnd := offset + int64(len(data)) + // use ASCII hyphen-minus instead of en-dash (U+2013) to keep the + // header parseable by downstream tools and log processors. + readRange := fmt.Sprintf("bytes %d-%d", offset, readEnd-1) + + displayPath := filepath.Base(path) var header string if totalSize >= 0 { header = fmt.Sprintf( "[file: %s | total: %d bytes | read: %s]", - path, totalSize, readRange, + displayPath, totalSize, readRange, ) } else { header = fmt.Sprintf( "[file: %s | read: %s | total size unknown]", - path, readRange, + displayPath, readRange, ) } if hasMore { header += fmt.Sprintf( - "\n[TRUNCATED — file has more content. Call read_file again with offset=%d to continue.]", + "\n[TRUNCATED - file has more content. Call read_file again with offset=%d to continue.]", readEnd, ) } else { - header += "\n[END OF FILE — no further content.]" + header += "\n[END OF FILE - no further content.]" } + logger.DebugCF("tool", "ReadFileTool execution completed successfully", + map[string]any{ + "path": path, + "bytes_read": len(data), + "has_more": hasMore, + }) + return NewToolResult(header + "\n\n" + string(data)) } @@ -247,6 +297,12 @@ func getInt64Arg(args map[string]any, key string, defaultVal int64) (int64, erro switch v := raw.(type) { case float64: + if v != math.Trunc(v) { + return 0, fmt.Errorf("%s must be an integer, got float %v", key, v) + } + if v > math.MaxInt64 || v < math.MinInt64 { + return 0, fmt.Errorf("%s value %v overflows int64", key, v) + } return int64(v), nil case int: return int64(v), nil @@ -636,33 +692,3 @@ func getSafeRelPath(workspace, path string) (string, error) { return rel, nil } - -// isBinaryFile uses common heuristics to determine if the content is a binary file. -func isBinaryFile(content []byte) bool { - if len(content) == 0 { - return false - } - - // Sample the first 512 bytes (or less if the file is smaller) - limit := len(content) - if limit > 512 { - limit = 512 - } - sample := content[:limit] - - // Check for NUL bytes in the sample (standard binary detection) - if bytes.IndexByte(sample, 0) != -1 { - return true - } - - // Use standard library content type detection to catch specific formats like PDF - contentType := http.DetectContentType(sample) - if contentType == "application/pdf" || - strings.HasPrefix(contentType, "image/") || - strings.HasPrefix(contentType, "video/") || - strings.HasPrefix(contentType, "audio/") { - return true - } - - return false -} diff --git a/pkg/tools/filesystem_test.go b/pkg/tools/filesystem_test.go index f02483e25..0bbf6caf0 100644 --- a/pkg/tools/filesystem_test.go +++ b/pkg/tools/filesystem_test.go @@ -18,7 +18,7 @@ func TestFilesystemTool_ReadFile_Success(t *testing.T) { testFile := filepath.Join(tmpDir, "test.txt") os.WriteFile(testFile, []byte("test content"), 0o644) - tool := NewReadFileTool("", false) + tool := NewReadFileTool("", false, MaxReadFileSize) ctx := context.Background() args := map[string]any{ "path": testFile, @@ -45,7 +45,7 @@ func TestFilesystemTool_ReadFile_Success(t *testing.T) { // TestFilesystemTool_ReadFile_NotFound verifies error handling for missing file func TestFilesystemTool_ReadFile_NotFound(t *testing.T) { - tool := NewReadFileTool("", false) + tool := NewReadFileTool("", false, MaxReadFileSize) ctx := context.Background() args := map[string]any{ "path": "/nonexistent_file_12345.txt", @@ -271,7 +271,7 @@ func TestFilesystemTool_ReadFile_RejectsSymlinkEscape(t *testing.T) { t.Skipf("symlink not supported in this environment: %v", err) } - tool := NewReadFileTool(workspace, true) + tool := NewReadFileTool(workspace, true, MaxReadFileSize) result := tool.Execute(context.Background(), map[string]any{ "path": link, }) @@ -289,7 +289,7 @@ func TestFilesystemTool_ReadFile_RejectsSymlinkEscape(t *testing.T) { } func TestFilesystemTool_EmptyWorkspace_AccessDenied(t *testing.T) { - tool := NewReadFileTool("", true) // restrict=true but workspace="" + tool := NewReadFileTool("", true, MaxReadFileSize) // restrict=true but workspace="" // Try to read a sensitive file (simulated by a temp file outside workspace) tmpDir := t.TempDir() @@ -499,7 +499,7 @@ func TestWhitelistFs_AllowsMatchingPaths(t *testing.T) { // Pattern allows access to the outsideDir. patterns := []*regexp.Regexp{regexp.MustCompile(`^` + regexp.QuoteMeta(outsideDir))} - tool := NewReadFileTool(workspace, true, patterns) + tool := NewReadFileTool(workspace, true, MaxReadFileSize, patterns) // Read from whitelisted path should succeed. result := tool.Execute(context.Background(), map[string]any{"path": outsideFile}) @@ -521,69 +521,6 @@ func TestWhitelistFs_AllowsMatchingPaths(t *testing.T) { } } -func TestIsBinaryFile(t *testing.T) { - tests := []struct { - name string - content []byte - expected bool - }{ - { - name: "empty content", - content: []byte(""), - expected: false, - }, - { - name: "plain text", - content: []byte("This is a normal text file with punctuation and 12345 numbers."), - expected: false, - }, - { - name: "contains null byte", - content: []byte("plain text\x00followed by a null byte"), - expected: true, - }, - { - name: "pdf header", - content: []byte("%PDF-1.4\n%\xE2\xE3\xCF\xD3\n1 0 obj\n<>"), - expected: true, - }, - { - name: "png magic bytes", - content: []byte("\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x01\x00"), - expected: true, - }, - { - name: "jpeg magic bytes", - content: []byte("\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H"), - expected: true, - }, - { - name: "html text (not binary)", - content: []byte("

Ciao

"), - expected: false, - }, - { - name: "json text (not binary)", - content: []byte(`{"key": "value", "number": 42}`), - expected: false, - }, - { - name: "markdown text (not binary)", - content: []byte("# Markdown Title\n\nThis is a **bold text** and a [link](https://example.com)."), - expected: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := isBinaryFile(tt.content) - if result != tt.expected { - t.Errorf("isBinaryFile() for %q returned %v, expected %v", tt.name, result, tt.expected) - } - }) - } -} - // TestReadFileTool_ChunkedReading verifies the pagination logic of the tool // by reading a file in multiple chunks using 'offset' and 'length'. func TestReadFileTool_ChunkedReading(t *testing.T) { @@ -597,7 +534,7 @@ func TestReadFileTool_ChunkedReading(t *testing.T) { t.Fatalf("Failed to write test file: %v", err) } - tool := NewReadFileTool(tmpDir, false) + tool := NewReadFileTool(tmpDir, false, MaxReadFileSize) ctx := context.Background() // --- Step 1: Read the first chunk (10 bytes) --- @@ -686,7 +623,7 @@ func TestReadFileTool_OffsetBeyondEOF(t *testing.T) { t.Fatalf("Failed to write test file: %v", err) } - tool := NewReadFileTool(tmpDir, false) + tool := NewReadFileTool(tmpDir, false, MaxReadFileSize) ctx := context.Background() args := map[string]any{ @@ -702,7 +639,7 @@ func TestReadFileTool_OffsetBeyondEOF(t *testing.T) { } // Must return EXACTLY the string provided in the code - expectedMsg := "[END OF FILE — no content at this offset]" + expectedMsg := "[END OF FILE - no content at this offset]" if result.ForLLM != expectedMsg { t.Errorf("The message %q was expected, obtained: %q", expectedMsg, result.ForLLM) }