diff --git a/pkg/tools/web.go b/pkg/tools/web.go index 6a6d40ecf..1f5c58ea5 100644 --- a/pkg/tools/web.go +++ b/pkg/tools/web.go @@ -492,8 +492,10 @@ func (t *WebFetchTool) extractText(htmlContent string) string { result = strings.TrimSpace(result) - re = regexp.MustCompile(`\s+`) - result = re.ReplaceAllLiteralString(result, " ") + re = regexp.MustCompile(`[^\S\n]+`) + result = re.ReplaceAllString(result, " ") + re = regexp.MustCompile(`\n{3,}`) + result = re.ReplaceAllString(result, "\n\n") lines := strings.Split(result, "\n") var cleanLines []string diff --git a/pkg/tools/web_test.go b/pkg/tools/web_test.go index a526ea34a..7e6d62213 100644 --- a/pkg/tools/web_test.go +++ b/pkg/tools/web_test.go @@ -234,6 +234,80 @@ func TestWebTool_WebFetch_HTMLExtraction(t *testing.T) { } } +// TestWebFetchTool_extractText verifies text extraction preserves newlines +func TestWebFetchTool_extractText(t *testing.T) { + tool := &WebFetchTool{} + + tests := []struct { + name string + input string + wantFunc func(t *testing.T, got string) + }{ + { + name: "preserves newlines between block elements", + input: "

Title

\n

Paragraph 1

\n

Paragraph 2

", + wantFunc: func(t *testing.T, got string) { + lines := strings.Split(got, "\n") + if len(lines) < 2 { + t.Errorf("Expected multiple lines, got %d: %q", len(lines), got) + } + if !strings.Contains(got, "Title") || !strings.Contains(got, "Paragraph 1") || !strings.Contains(got, "Paragraph 2") { + t.Errorf("Missing expected text: %q", got) + } + }, + }, + { + name: "removes script and style tags", + input: "

Keep this

", + wantFunc: func(t *testing.T, got string) { + if strings.Contains(got, "alert") || strings.Contains(got, "body{}") { + t.Errorf("Expected script/style content removed, got: %q", got) + } + if !strings.Contains(got, "Keep this") { + t.Errorf("Expected 'Keep this' to remain, got: %q", got) + } + }, + }, + { + name: "collapses excessive blank lines", + input: "

A

\n\n\n\n\n

B

", + wantFunc: func(t *testing.T, got string) { + if strings.Contains(got, "\n\n\n") { + t.Errorf("Expected excessive blank lines collapsed, got: %q", got) + } + }, + }, + { + name: "collapses horizontal whitespace", + input: "

hello world

", + wantFunc: func(t *testing.T, got string) { + if strings.Contains(got, " ") { + t.Errorf("Expected spaces collapsed, got: %q", got) + } + if !strings.Contains(got, "hello world") { + t.Errorf("Expected 'hello world', got: %q", got) + } + }, + }, + { + name: "empty input", + input: "", + wantFunc: func(t *testing.T, got string) { + if got != "" { + t.Errorf("Expected empty string, got: %q", got) + } + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := tool.extractText(tt.input) + tt.wantFunc(t, got) + }) + } +} + // TestWebTool_WebFetch_MissingDomain verifies error handling for URL without domain func TestWebTool_WebFetch_MissingDomain(t *testing.T) { tool := NewWebFetchTool(50000)