Files
picoclaw/pkg/tools/web.go
T
Truong Vinh Tran e4daab8b09 Merge upstream/main into feat/searxng
Resolve merge conflicts to keep both SearXNG and GLM Search
providers. Updated search priority order to:
Perplexity > Brave > SearXNG > Tavily > DuckDuckGo > GLM Search

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 21:42:03 +01:00

888 lines
24 KiB
Go

package tools
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"time"
)
const (
userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
// HTTP client timeouts for web tool providers.
searchTimeout = 10 * time.Second // Brave, Tavily, DuckDuckGo
perplexityTimeout = 30 * time.Second // Perplexity (LLM-based, slower)
fetchTimeout = 60 * time.Second // WebFetchTool
defaultMaxChars = 50000
maxRedirects = 5
)
// Pre-compiled regexes for HTML text extraction
var (
reScript = regexp.MustCompile(`<script[\s\S]*?</script>`)
reStyle = regexp.MustCompile(`<style[\s\S]*?</style>`)
reTags = regexp.MustCompile(`<[^>]+>`)
reWhitespace = regexp.MustCompile(`[^\S\n]+`)
reBlankLines = regexp.MustCompile(`\n{3,}`)
// DuckDuckGo result extraction
reDDGLink = regexp.MustCompile(`<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)</a>`)
reDDGSnippet = regexp.MustCompile(`<a class="result__snippet[^"]*".*?>([\s\S]*?)</a>`)
)
// createHTTPClient creates an HTTP client with optional proxy support
func createHTTPClient(proxyURL string, timeout time.Duration) (*http.Client, error) {
client := &http.Client{
Timeout: timeout,
Transport: &http.Transport{
MaxIdleConns: 10,
IdleConnTimeout: 30 * time.Second,
DisableCompression: false,
TLSHandshakeTimeout: 15 * time.Second,
},
}
if proxyURL != "" {
proxy, err := url.Parse(proxyURL)
if err != nil {
return nil, fmt.Errorf("invalid proxy URL: %w", err)
}
scheme := strings.ToLower(proxy.Scheme)
switch scheme {
case "http", "https", "socks5", "socks5h":
default:
return nil, fmt.Errorf(
"unsupported proxy scheme %q (supported: http, https, socks5, socks5h)",
proxy.Scheme,
)
}
if proxy.Host == "" {
return nil, fmt.Errorf("invalid proxy URL: missing host")
}
client.Transport.(*http.Transport).Proxy = http.ProxyURL(proxy)
} else {
client.Transport.(*http.Transport).Proxy = http.ProxyFromEnvironment
}
return client, nil
}
type SearchProvider interface {
Search(ctx context.Context, query string, count int) (string, error)
}
type BraveSearchProvider struct {
apiKey string
proxy string
client *http.Client
}
func (p *BraveSearchProvider) Search(ctx context.Context, query string, count int) (string, error) {
searchURL := fmt.Sprintf("https://api.search.brave.com/res/v1/web/search?q=%s&count=%d",
url.QueryEscape(query), count)
req, err := http.NewRequestWithContext(ctx, "GET", searchURL, nil)
if err != nil {
return "", fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("Accept", "application/json")
req.Header.Set("X-Subscription-Token", p.apiKey)
resp, err := p.client.Do(req)
if err != nil {
return "", fmt.Errorf("request failed: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("failed to read response: %w", err)
}
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("brave api error (status %d): %s", resp.StatusCode, string(body))
}
var searchResp struct {
Web struct {
Results []struct {
Title string `json:"title"`
URL string `json:"url"`
Description string `json:"description"`
} `json:"results"`
} `json:"web"`
}
if err := json.Unmarshal(body, &searchResp); err != nil {
// Log error body for debugging
fmt.Printf("Brave API Error Body: %s\n", string(body))
return "", fmt.Errorf("failed to parse response: %w", err)
}
results := searchResp.Web.Results
if len(results) == 0 {
return fmt.Sprintf("No results for: %s", query), nil
}
var lines []string
lines = append(lines, fmt.Sprintf("Results for: %s", query))
for i, item := range results {
if i >= count {
break
}
lines = append(lines, fmt.Sprintf("%d. %s\n %s", i+1, item.Title, item.URL))
if item.Description != "" {
lines = append(lines, fmt.Sprintf(" %s", item.Description))
}
}
return strings.Join(lines, "\n"), nil
}
type TavilySearchProvider struct {
apiKey string
baseURL string
proxy string
client *http.Client
}
func (p *TavilySearchProvider) Search(ctx context.Context, query string, count int) (string, error) {
searchURL := p.baseURL
if searchURL == "" {
searchURL = "https://api.tavily.com/search"
}
payload := map[string]any{
"api_key": p.apiKey,
"query": query,
"search_depth": "advanced",
"include_answer": false,
"include_images": false,
"include_raw_content": false,
"max_results": count,
}
bodyBytes, err := json.Marshal(payload)
if err != nil {
return "", fmt.Errorf("failed to marshal payload: %w", err)
}
req, err := http.NewRequestWithContext(ctx, "POST", searchURL, bytes.NewBuffer(bodyBytes))
if err != nil {
return "", fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("User-Agent", userAgent)
resp, err := p.client.Do(req)
if err != nil {
return "", fmt.Errorf("request failed: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("failed to read response: %w", err)
}
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("tavily api error (status %d): %s", resp.StatusCode, string(body))
}
var searchResp struct {
Results []struct {
Title string `json:"title"`
URL string `json:"url"`
Content string `json:"content"`
} `json:"results"`
}
if err := json.Unmarshal(body, &searchResp); err != nil {
return "", fmt.Errorf("failed to parse response: %w", err)
}
results := searchResp.Results
if len(results) == 0 {
return fmt.Sprintf("No results for: %s", query), nil
}
var lines []string
lines = append(lines, fmt.Sprintf("Results for: %s (via Tavily)", query))
for i, item := range results {
if i >= count {
break
}
lines = append(lines, fmt.Sprintf("%d. %s\n %s", i+1, item.Title, item.URL))
if item.Content != "" {
lines = append(lines, fmt.Sprintf(" %s", item.Content))
}
}
return strings.Join(lines, "\n"), nil
}
type DuckDuckGoSearchProvider struct {
proxy string
client *http.Client
}
func (p *DuckDuckGoSearchProvider) Search(ctx context.Context, query string, count int) (string, error) {
searchURL := fmt.Sprintf("https://html.duckduckgo.com/html/?q=%s", url.QueryEscape(query))
req, err := http.NewRequestWithContext(ctx, "GET", searchURL, nil)
if err != nil {
return "", fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("User-Agent", userAgent)
resp, err := p.client.Do(req)
if err != nil {
return "", fmt.Errorf("request failed: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("failed to read response: %w", err)
}
return p.extractResults(string(body), count, query)
}
func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query string) (string, error) {
// Simple regex based extraction for DDG HTML
// Strategy: Find all result containers or key anchors directly
// Try finding the result links directly first, as they are the most critical
// Pattern: <a class="result__a" href="...">Title</a>
// The previous regex was a bit strict. Let's make it more flexible for attributes order/content
matches := reDDGLink.FindAllStringSubmatch(html, count+5)
if len(matches) == 0 {
return fmt.Sprintf("No results found or extraction failed. Query: %s", query), nil
}
var lines []string
lines = append(lines, fmt.Sprintf("Results for: %s (via DuckDuckGo)", query))
// Pre-compile snippet regex to run inside the loop
// We'll search for snippets relative to the link position or just globally if needed
// But simple global search for snippets might mismatch order.
// Since we only have the raw HTML string, let's just extract snippets globally and assume order matches (risky but simple for regex)
// Or better: Let's assume the snippet follows the link in the HTML
// A better regex approach: iterate through text and find matches in order
// But for now, let's grab all snippets too
snippetMatches := reDDGSnippet.FindAllStringSubmatch(html, count+5)
maxItems := min(len(matches), count)
for i := range maxItems {
urlStr := matches[i][1]
title := stripTags(matches[i][2])
title = strings.TrimSpace(title)
// URL decoding if needed
if strings.Contains(urlStr, "uddg=") {
if u, err := url.QueryUnescape(urlStr); err == nil {
_, after, ok := strings.Cut(u, "uddg=")
if ok {
urlStr = after
}
}
}
lines = append(lines, fmt.Sprintf("%d. %s\n %s", i+1, title, urlStr))
// Attempt to attach snippet if available and index aligns
if i < len(snippetMatches) {
snippet := stripTags(snippetMatches[i][1])
snippet = strings.TrimSpace(snippet)
if snippet != "" {
lines = append(lines, fmt.Sprintf(" %s", snippet))
}
}
}
return strings.Join(lines, "\n"), nil
}
func stripTags(content string) string {
return reTags.ReplaceAllString(content, "")
}
type PerplexitySearchProvider struct {
apiKey string
proxy string
client *http.Client
}
func (p *PerplexitySearchProvider) Search(ctx context.Context, query string, count int) (string, error) {
searchURL := "https://api.perplexity.ai/chat/completions"
payload := map[string]any{
"model": "sonar",
"messages": []map[string]string{
{
"role": "system",
"content": "You are a search assistant. Provide concise search results with titles, URLs, and brief descriptions in the following format:\n1. Title\n URL\n Description\n\nDo not add extra commentary.",
},
{
"role": "user",
"content": fmt.Sprintf("Search for: %s. Provide up to %d relevant results.", query, count),
},
},
"max_tokens": 1000,
}
payloadBytes, err := json.Marshal(payload)
if err != nil {
return "", fmt.Errorf("failed to marshal request: %w", err)
}
req, err := http.NewRequestWithContext(ctx, "POST", searchURL, strings.NewReader(string(payloadBytes)))
if err != nil {
return "", fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+p.apiKey)
req.Header.Set("User-Agent", userAgent)
resp, err := p.client.Do(req)
if err != nil {
return "", fmt.Errorf("request failed: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("failed to read response: %w", err)
}
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("Perplexity API error: %s", string(body))
}
var searchResp struct {
Choices []struct {
Message struct {
Content string `json:"content"`
} `json:"message"`
} `json:"choices"`
}
if err := json.Unmarshal(body, &searchResp); err != nil {
return "", fmt.Errorf("failed to parse response: %w", err)
}
if len(searchResp.Choices) == 0 {
return fmt.Sprintf("No results for: %s", query), nil
}
return fmt.Sprintf("Results for: %s (via Perplexity)\n%s", query, searchResp.Choices[0].Message.Content), nil
}
type SearXNGSearchProvider struct {
baseURL string
}
func (p *SearXNGSearchProvider) Search(ctx context.Context, query string, count int) (string, error) {
searchURL := fmt.Sprintf("%s/search?q=%s&format=json&categories=general",
strings.TrimSuffix(p.baseURL, "/"),
url.QueryEscape(query))
req, err := http.NewRequestWithContext(ctx, "GET", searchURL, nil)
if err != nil {
return "", fmt.Errorf("failed to create request: %w", err)
}
client := &http.Client{Timeout: 10 * time.Second}
resp, err := client.Do(req)
if err != nil {
return "", fmt.Errorf("request failed: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("SearXNG returned status %d", resp.StatusCode)
}
var result struct {
Results []struct {
Title string `json:"title"`
URL string `json:"url"`
Content string `json:"content"`
Engine string `json:"engine"`
Score float64 `json:"score"`
} `json:"results"`
}
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
return "", fmt.Errorf("failed to parse response: %w", err)
}
if len(result.Results) == 0 {
return fmt.Sprintf("No results for: %s", query), nil
}
// Limit results to requested count
if len(result.Results) > count {
result.Results = result.Results[:count]
}
// Format results in standard PicoClaw format
var b strings.Builder
b.WriteString(fmt.Sprintf("Results for: %s (via SearXNG)\n", query))
for i, r := range result.Results {
b.WriteString(fmt.Sprintf("%d. %s\n", i+1, r.Title))
b.WriteString(fmt.Sprintf(" %s\n", r.URL))
if r.Content != "" {
b.WriteString(fmt.Sprintf(" %s\n", r.Content))
}
}
return b.String(), nil
}
type GLMSearchProvider struct {
apiKey string
baseURL string
searchEngine string
proxy string
client *http.Client
}
func (p *GLMSearchProvider) Search(ctx context.Context, query string, count int) (string, error) {
searchURL := p.baseURL
if searchURL == "" {
searchURL = "https://open.bigmodel.cn/api/paas/v4/web_search"
}
payload := map[string]any{
"search_query": query,
"search_engine": p.searchEngine,
"search_intent": false,
"count": count,
"content_size": "medium",
}
bodyBytes, err := json.Marshal(payload)
if err != nil {
return "", fmt.Errorf("failed to marshal payload: %w", err)
}
req, err := http.NewRequestWithContext(ctx, "POST", searchURL, bytes.NewReader(bodyBytes))
if err != nil {
return "", fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+p.apiKey)
resp, err := p.client.Do(req)
if err != nil {
return "", fmt.Errorf("request failed: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
if err != nil {
return "", fmt.Errorf("failed to read response: %w", err)
}
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("GLM Search API error (status %d): %s", resp.StatusCode, string(body))
}
var searchResp struct {
SearchResult []struct {
Title string `json:"title"`
Content string `json:"content"`
Link string `json:"link"`
} `json:"search_result"`
}
if err := json.Unmarshal(body, &searchResp); err != nil {
return "", fmt.Errorf("failed to parse response: %w", err)
}
results := searchResp.SearchResult
if len(results) == 0 {
return fmt.Sprintf("No results for: %s", query), nil
}
var lines []string
lines = append(lines, fmt.Sprintf("Results for: %s (via GLM Search)", query))
for i, item := range results {
if i >= count {
break
}
lines = append(lines, fmt.Sprintf("%d. %s\n %s", i+1, item.Title, item.Link))
if item.Content != "" {
lines = append(lines, fmt.Sprintf(" %s", item.Content))
}
}
return strings.Join(lines, "\n"), nil
}
type WebSearchTool struct {
provider SearchProvider
maxResults int
}
type WebSearchToolOptions struct {
BraveAPIKey string
BraveMaxResults int
BraveEnabled bool
TavilyAPIKey string
TavilyBaseURL string
TavilyMaxResults int
TavilyEnabled bool
DuckDuckGoMaxResults int
DuckDuckGoEnabled bool
PerplexityAPIKey string
PerplexityMaxResults int
PerplexityEnabled bool
SearXNGBaseURL string
SearXNGMaxResults int
SearXNGEnabled bool
GLMSearchAPIKey string
GLMSearchBaseURL string
GLMSearchEngine string
GLMSearchMaxResults int
GLMSearchEnabled bool
Proxy string
}
func NewWebSearchTool(opts WebSearchToolOptions) (*WebSearchTool, error) {
var provider SearchProvider
maxResults := 5
// Priority: Perplexity > Brave > SearXNG > Tavily > DuckDuckGo > GLM Search
if opts.PerplexityEnabled && opts.PerplexityAPIKey != "" {
client, err := createHTTPClient(opts.Proxy, perplexityTimeout)
if err != nil {
return nil, fmt.Errorf("failed to create HTTP client for Perplexity: %w", err)
}
provider = &PerplexitySearchProvider{apiKey: opts.PerplexityAPIKey, proxy: opts.Proxy, client: client}
if opts.PerplexityMaxResults > 0 {
maxResults = opts.PerplexityMaxResults
}
} else if opts.BraveEnabled && opts.BraveAPIKey != "" {
client, err := createHTTPClient(opts.Proxy, searchTimeout)
if err != nil {
return nil, fmt.Errorf("failed to create HTTP client for Brave: %w", err)
}
provider = &BraveSearchProvider{apiKey: opts.BraveAPIKey, proxy: opts.Proxy, client: client}
if opts.BraveMaxResults > 0 {
maxResults = opts.BraveMaxResults
}
} else if opts.SearXNGEnabled && opts.SearXNGBaseURL != "" {
provider = &SearXNGSearchProvider{baseURL: opts.SearXNGBaseURL}
if opts.SearXNGMaxResults > 0 {
maxResults = opts.SearXNGMaxResults
}
} else if opts.TavilyEnabled && opts.TavilyAPIKey != "" {
client, err := createHTTPClient(opts.Proxy, searchTimeout)
if err != nil {
return nil, fmt.Errorf("failed to create HTTP client for Tavily: %w", err)
}
provider = &TavilySearchProvider{
apiKey: opts.TavilyAPIKey,
baseURL: opts.TavilyBaseURL,
proxy: opts.Proxy,
client: client,
}
if opts.TavilyMaxResults > 0 {
maxResults = opts.TavilyMaxResults
}
} else if opts.DuckDuckGoEnabled {
client, err := createHTTPClient(opts.Proxy, searchTimeout)
if err != nil {
return nil, fmt.Errorf("failed to create HTTP client for DuckDuckGo: %w", err)
}
provider = &DuckDuckGoSearchProvider{proxy: opts.Proxy, client: client}
if opts.DuckDuckGoMaxResults > 0 {
maxResults = opts.DuckDuckGoMaxResults
}
} else if opts.GLMSearchEnabled && opts.GLMSearchAPIKey != "" {
client, err := createHTTPClient(opts.Proxy, searchTimeout)
if err != nil {
return nil, fmt.Errorf("failed to create HTTP client for GLM Search: %w", err)
}
searchEngine := opts.GLMSearchEngine
if searchEngine == "" {
searchEngine = "search_std"
}
provider = &GLMSearchProvider{
apiKey: opts.GLMSearchAPIKey,
baseURL: opts.GLMSearchBaseURL,
searchEngine: searchEngine,
proxy: opts.Proxy,
client: client,
}
if opts.GLMSearchMaxResults > 0 {
maxResults = opts.GLMSearchMaxResults
}
} else {
return nil, nil
}
return &WebSearchTool{
provider: provider,
maxResults: maxResults,
}, nil
}
func (t *WebSearchTool) Name() string {
return "web_search"
}
func (t *WebSearchTool) Description() string {
return "Search the web for current information. Returns titles, URLs, and snippets from search results."
}
func (t *WebSearchTool) Parameters() map[string]any {
return map[string]any{
"type": "object",
"properties": map[string]any{
"query": map[string]any{
"type": "string",
"description": "Search query",
},
"count": map[string]any{
"type": "integer",
"description": "Number of results (1-10)",
"minimum": 1.0,
"maximum": 10.0,
},
},
"required": []string{"query"},
}
}
func (t *WebSearchTool) Execute(ctx context.Context, args map[string]any) *ToolResult {
query, ok := args["query"].(string)
if !ok {
return ErrorResult("query is required")
}
count := t.maxResults
if c, ok := args["count"].(float64); ok {
if int(c) > 0 && int(c) <= 10 {
count = int(c)
}
}
result, err := t.provider.Search(ctx, query, count)
if err != nil {
return ErrorResult(fmt.Sprintf("search failed: %v", err))
}
return &ToolResult{
ForLLM: result,
ForUser: result,
}
}
type WebFetchTool struct {
maxChars int
proxy string
client *http.Client
fetchLimitBytes int64
}
func NewWebFetchTool(maxChars int, fetchLimitBytes int64) (*WebFetchTool, error) {
// createHTTPClient cannot fail with an empty proxy string.
return NewWebFetchToolWithProxy(maxChars, "", fetchLimitBytes)
}
func NewWebFetchToolWithProxy(maxChars int, proxy string, fetchLimitBytes int64) (*WebFetchTool, error) {
if maxChars <= 0 {
maxChars = defaultMaxChars
}
client, err := createHTTPClient(proxy, fetchTimeout)
if err != nil {
return nil, fmt.Errorf("failed to create HTTP client for web fetch: %w", err)
}
client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
if len(via) >= maxRedirects {
return fmt.Errorf("stopped after %d redirects", maxRedirects)
}
return nil
}
if fetchLimitBytes <= 0 {
fetchLimitBytes = 10 * 1024 * 1024 // Security Fallback
}
return &WebFetchTool{
maxChars: maxChars,
proxy: proxy,
client: client,
fetchLimitBytes: fetchLimitBytes,
}, nil
}
func (t *WebFetchTool) Name() string {
return "web_fetch"
}
func (t *WebFetchTool) Description() string {
return "Fetch a URL and extract readable content (HTML to text). Use this to get weather info, news, articles, or any web content."
}
func (t *WebFetchTool) Parameters() map[string]any {
return map[string]any{
"type": "object",
"properties": map[string]any{
"url": map[string]any{
"type": "string",
"description": "URL to fetch",
},
"maxChars": map[string]any{
"type": "integer",
"description": "Maximum characters to extract",
"minimum": 100.0,
},
},
"required": []string{"url"},
}
}
func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolResult {
urlStr, ok := args["url"].(string)
if !ok {
return ErrorResult("url is required")
}
parsedURL, err := url.Parse(urlStr)
if err != nil {
return ErrorResult(fmt.Sprintf("invalid URL: %v", err))
}
if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
return ErrorResult("only http/https URLs are allowed")
}
if parsedURL.Host == "" {
return ErrorResult("missing domain in URL")
}
maxChars := t.maxChars
if mc, ok := args["maxChars"].(float64); ok {
if int(mc) > 100 {
maxChars = int(mc)
}
}
req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
if err != nil {
return ErrorResult(fmt.Sprintf("failed to create request: %v", err))
}
req.Header.Set("User-Agent", userAgent)
resp, err := t.client.Do(req)
if err != nil {
return ErrorResult(fmt.Sprintf("request failed: %v", err))
}
resp.Body = http.MaxBytesReader(nil, resp.Body, t.fetchLimitBytes)
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
var maxBytesErr *http.MaxBytesError
if errors.As(err, &maxBytesErr) {
return ErrorResult(fmt.Sprintf("failed to read response: size exceeded %d bytes limit", t.fetchLimitBytes))
}
return ErrorResult(fmt.Sprintf("failed to read response: %v", err))
}
contentType := resp.Header.Get("Content-Type")
var text, extractor string
if strings.Contains(contentType, "application/json") {
var jsonData any
if err := json.Unmarshal(body, &jsonData); err == nil {
formatted, _ := json.MarshalIndent(jsonData, "", " ")
text = string(formatted)
extractor = "json"
} else {
text = string(body)
extractor = "raw"
}
} else if strings.Contains(contentType, "text/html") || len(body) > 0 &&
(strings.HasPrefix(string(body), "<!DOCTYPE") || strings.HasPrefix(strings.ToLower(string(body)), "<html")) {
text = t.extractText(string(body))
extractor = "text"
} else {
text = string(body)
extractor = "raw"
}
truncated := len(text) > maxChars
if truncated {
text = text[:maxChars]
}
result := map[string]any{
"url": urlStr,
"status": resp.StatusCode,
"extractor": extractor,
"truncated": truncated,
"length": len(text),
"text": text,
}
resultJSON, _ := json.MarshalIndent(result, "", " ")
return &ToolResult{
ForLLM: string(resultJSON),
ForUser: fmt.Sprintf(
"Fetched %d bytes from %s (extractor: %s, truncated: %v)",
len(text),
urlStr,
extractor,
truncated,
),
}
}
func (t *WebFetchTool) extractText(htmlContent string) string {
result := reScript.ReplaceAllLiteralString(htmlContent, "")
result = reStyle.ReplaceAllLiteralString(result, "")
result = reTags.ReplaceAllLiteralString(result, "")
result = strings.TrimSpace(result)
result = reWhitespace.ReplaceAllString(result, " ")
result = reBlankLines.ReplaceAllString(result, "\n\n")
lines := strings.Split(result, "\n")
var cleanLines []string
for _, line := range lines {
line = strings.TrimSpace(line)
if line != "" {
cleanLines = append(cleanLines, line)
}
}
return strings.Join(cleanLines, "\n")
}