mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
44a52c0cf6
* fix(tools): close resp.Body on retry cancel and cache http.Client instances Fix resp.Body leak in DoRequestWithRetry where req.Body (request) was incorrectly closed instead of resp.Body (response) on context cancel. Cache http.Client on web search/fetch provider structs and channel adapters (WeCom, LINE) to avoid per-call allocation overhead. * fix(channels): preserve original http client timeouts for LINE and WeCom Split LINE single 60s client into infoClient (10s) for bot info lookups and apiClient (30s) for messaging API calls. Lower WeCom cached client base timeout from 60s to 30s (matching uploadMedia), and ensure it is always >= the configured ReplyTimeout so the per-request context deadline remains the effective limit. * refactor(tools): extract timeout consts and deduplicate WebFetchTool constructors Address PR review feedback from xiaket: - Define searchTimeout, perplexityTimeout, fetchTimeout, defaultMaxChars, and maxRedirects as package-level consts instead of magic numbers. - Remove misleading "No proxy" comment in NewWebFetchTool. - Deduplicate NewWebFetchTool by delegating to NewWebFetchToolWithProxy. * test(utils): add context cancellation test for DoRequestWithRetry Verify that resp.Body is properly closed when the context is canceled during retry sleep, covering the C8 resp.Body leak fix. * fix(utils): close resp in test to satisfy bodyclose linter * fix(utils): eliminate flakiness in context cancellation retry test Synchronize cancellation using an onRoundTrip callback from the transport wrapper instead of a timing-based context timeout. This ensures the first client.Do completes before cancel fires, so cancellation always hits during sleepWithCtx.
696 lines
19 KiB
Go
696 lines
19 KiB
Go
package tools
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Shared settings for the web search and web fetch tools.
const (
	// userAgent is the browser-like User-Agent header value set on the
	// requests in this file that send one.
	userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

	// HTTP client timeouts, one per class of provider.
	searchTimeout     = 10 * time.Second // Brave, Tavily, DuckDuckGo
	perplexityTimeout = 30 * time.Second // Perplexity (LLM-based, slower)
	fetchTimeout      = 60 * time.Second // WebFetchTool

	// defaultMaxChars replaces a non-positive maxChars passed to the
	// WebFetchTool constructors.
	defaultMaxChars = 50000

	// maxRedirects bounds how many redirects the WebFetchTool client follows.
	maxRedirects = 5
)
|
|
|
|
// Pre-compiled regexes shared by the HTML text extraction and DuckDuckGo
// result parsing code. They are compiled once at package initialisation.
var (
	// reScript and reStyle remove whole <script>/<style> elements, including
	// their contents. HTML tag names are case-insensitive, so the patterns
	// carry the (?i) flag; without it upper-case <SCRIPT>/<STYLE> blocks
	// would leak their source into the extracted text.
	reScript = regexp.MustCompile(`(?i)<script[\s\S]*?</script>`)
	reStyle  = regexp.MustCompile(`(?i)<style[\s\S]*?</style>`)

	// reTags matches any single remaining tag.
	reTags = regexp.MustCompile(`<[^>]+>`)
	// reWhitespace matches runs of whitespace that contain no newline.
	reWhitespace = regexp.MustCompile(`[^\S\n]+`)
	// reBlankLines matches three or more consecutive newlines.
	reBlankLines = regexp.MustCompile(`\n{3,}`)

	// DuckDuckGo result extraction.
	// reDDGLink captures the href (group 1) and inner HTML (group 2) of a
	// result title anchor; reDDGSnippet captures the inner HTML of a result
	// snippet anchor (group 1).
	reDDGLink    = regexp.MustCompile(`<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)</a>`)
	reDDGSnippet = regexp.MustCompile(`<a class="result__snippet[^"]*".*?>([\s\S]*?)</a>`)
)
|
|
|
|
// createHTTPClient builds an *http.Client with the given overall timeout.
//
// When proxyURL is empty the client takes its proxy settings from the
// environment (http.ProxyFromEnvironment). Otherwise proxyURL must parse,
// use one of the schemes http, https, socks5 or socks5h (compared
// case-insensitively) and name a host; any violation yields an error and a
// nil client.
func createHTTPClient(proxyURL string, timeout time.Duration) (*http.Client, error) {
	transport := &http.Transport{
		MaxIdleConns:        10,
		IdleConnTimeout:     30 * time.Second,
		DisableCompression:  false,
		TLSHandshakeTimeout: 15 * time.Second,
		// Default; replaced below when an explicit proxy is configured.
		Proxy: http.ProxyFromEnvironment,
	}

	if proxyURL != "" {
		parsed, err := url.Parse(proxyURL)
		if err != nil {
			return nil, fmt.Errorf("invalid proxy URL: %w", err)
		}

		scheme := strings.ToLower(parsed.Scheme)
		supported := scheme == "http" || scheme == "https" || scheme == "socks5" || scheme == "socks5h"
		if !supported {
			return nil, fmt.Errorf(
				"unsupported proxy scheme %q (supported: http, https, socks5, socks5h)",
				parsed.Scheme,
			)
		}
		if parsed.Host == "" {
			return nil, fmt.Errorf("invalid proxy URL: missing host")
		}

		transport.Proxy = http.ProxyURL(parsed)
	}

	return &http.Client{Timeout: timeout, Transport: transport}, nil
}
|
|
|
|
// SearchProvider is the common contract of the web search backends in this
// file (Brave, Tavily, DuckDuckGo, Perplexity).
type SearchProvider interface {
	// Search looks up query and returns the results rendered as plain text.
	// count is the maximum number of results the caller wants.
	Search(ctx context.Context, query string, count int) (string, error)
}
|
|
|
|
// BraveSearchProvider is a SearchProvider backed by the Brave Search web API.
type BraveSearchProvider struct {
	apiKey string       // sent as the X-Subscription-Token header
	proxy  string       // proxy URL recorded at construction; Search does not read it
	client *http.Client // cached client reused for every request
}

// Search queries the Brave web search endpoint and renders at most count
// results as a numbered plain-text list headed by "Results for: <query>".
//
// An error is returned when the request cannot be created or sent, when the
// response body cannot be read, when the API answers with a status other
// than 200, or when the body is not the expected JSON. A successful response
// with no results produces a "No results for: <query>" message and a nil
// error.
func (p *BraveSearchProvider) Search(ctx context.Context, query string, count int) (string, error) {
	searchURL := fmt.Sprintf("https://api.search.brave.com/res/v1/web/search?q=%s&count=%d",
		url.QueryEscape(query), count)

	req, err := http.NewRequestWithContext(ctx, "GET", searchURL, nil)
	if err != nil {
		return "", fmt.Errorf("failed to create request: %w", err)
	}

	req.Header.Set("Accept", "application/json")
	req.Header.Set("X-Subscription-Token", p.apiKey)

	resp, err := p.client.Do(req)
	if err != nil {
		return "", fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("failed to read response: %w", err)
	}

	// Error responses (bad key, rate limit, ...) are usually valid JSON too,
	// so without this check they would be decoded into an empty result set
	// and misreported as "No results".
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("brave api error (status %d): %s", resp.StatusCode, string(body))
	}

	var searchResp struct {
		Web struct {
			Results []struct {
				Title       string `json:"title"`
				URL         string `json:"url"`
				Description string `json:"description"`
			} `json:"results"`
		} `json:"web"`
	}

	// The failure is reported through the returned error only; library code
	// should not write response bodies to stdout.
	if err := json.Unmarshal(body, &searchResp); err != nil {
		return "", fmt.Errorf("failed to parse response: %w", err)
	}

	results := searchResp.Web.Results
	if len(results) == 0 {
		return fmt.Sprintf("No results for: %s", query), nil
	}

	var lines []string
	lines = append(lines, fmt.Sprintf("Results for: %s", query))
	for i, item := range results {
		// The API may return more entries than requested.
		if i >= count {
			break
		}
		lines = append(lines, fmt.Sprintf("%d. %s\n %s", i+1, item.Title, item.URL))
		if item.Description != "" {
			lines = append(lines, fmt.Sprintf(" %s", item.Description))
		}
	}

	return strings.Join(lines, "\n"), nil
}
|
|
|
|
// TavilySearchProvider is a SearchProvider backed by the Tavily search API.
type TavilySearchProvider struct {
	// apiKey is sent in the request payload as "api_key".
	apiKey string
	// baseURL overrides the default Tavily endpoint when non-empty.
	baseURL string
	// proxy is the proxy URL recorded at construction.
	proxy string
	// client is the cached HTTP client used for every request.
	client *http.Client
}
|
|
|
|
func (p *TavilySearchProvider) Search(ctx context.Context, query string, count int) (string, error) {
|
|
searchURL := p.baseURL
|
|
if searchURL == "" {
|
|
searchURL = "https://api.tavily.com/search"
|
|
}
|
|
|
|
payload := map[string]any{
|
|
"api_key": p.apiKey,
|
|
"query": query,
|
|
"search_depth": "advanced",
|
|
"include_answer": false,
|
|
"include_images": false,
|
|
"include_raw_content": false,
|
|
"max_results": count,
|
|
}
|
|
|
|
bodyBytes, err := json.Marshal(payload)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to marshal payload: %w", err)
|
|
}
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "POST", searchURL, bytes.NewBuffer(bodyBytes))
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
|
|
req.Header.Set("Content-Type", "application/json")
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
resp, err := p.client.Do(req)
|
|
if err != nil {
|
|
return "", fmt.Errorf("request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to read response: %w", err)
|
|
}
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return "", fmt.Errorf("tavily api error (status %d): %s", resp.StatusCode, string(body))
|
|
}
|
|
|
|
var searchResp struct {
|
|
Results []struct {
|
|
Title string `json:"title"`
|
|
URL string `json:"url"`
|
|
Content string `json:"content"`
|
|
} `json:"results"`
|
|
}
|
|
|
|
if err := json.Unmarshal(body, &searchResp); err != nil {
|
|
return "", fmt.Errorf("failed to parse response: %w", err)
|
|
}
|
|
|
|
results := searchResp.Results
|
|
if len(results) == 0 {
|
|
return fmt.Sprintf("No results for: %s", query), nil
|
|
}
|
|
|
|
var lines []string
|
|
lines = append(lines, fmt.Sprintf("Results for: %s (via Tavily)", query))
|
|
for i, item := range results {
|
|
if i >= count {
|
|
break
|
|
}
|
|
lines = append(lines, fmt.Sprintf("%d. %s\n %s", i+1, item.Title, item.URL))
|
|
if item.Content != "" {
|
|
lines = append(lines, fmt.Sprintf(" %s", item.Content))
|
|
}
|
|
}
|
|
|
|
return strings.Join(lines, "\n"), nil
|
|
}
|
|
|
|
// DuckDuckGoSearchProvider is a SearchProvider that scrapes the DuckDuckGo
// HTML results page; it needs no API key.
type DuckDuckGoSearchProvider struct {
	// proxy is the proxy URL recorded at construction.
	proxy string
	// client is the cached HTTP client used for every request.
	client *http.Client
}
|
|
|
|
func (p *DuckDuckGoSearchProvider) Search(ctx context.Context, query string, count int) (string, error) {
|
|
searchURL := fmt.Sprintf("https://html.duckduckgo.com/html/?q=%s", url.QueryEscape(query))
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "GET", searchURL, nil)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
resp, err := p.client.Do(req)
|
|
if err != nil {
|
|
return "", fmt.Errorf("request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to read response: %w", err)
|
|
}
|
|
|
|
return p.extractResults(string(body), count, query)
|
|
}
|
|
|
|
func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query string) (string, error) {
|
|
// Simple regex based extraction for DDG HTML
|
|
// Strategy: Find all result containers or key anchors directly
|
|
|
|
// Try finding the result links directly first, as they are the most critical
|
|
// Pattern: <a class="result__a" href="...">Title</a>
|
|
// The previous regex was a bit strict. Let's make it more flexible for attributes order/content
|
|
matches := reDDGLink.FindAllStringSubmatch(html, count+5)
|
|
|
|
if len(matches) == 0 {
|
|
return fmt.Sprintf("No results found or extraction failed. Query: %s", query), nil
|
|
}
|
|
|
|
var lines []string
|
|
lines = append(lines, fmt.Sprintf("Results for: %s (via DuckDuckGo)", query))
|
|
|
|
// Pre-compile snippet regex to run inside the loop
|
|
// We'll search for snippets relative to the link position or just globally if needed
|
|
// But simple global search for snippets might mismatch order.
|
|
// Since we only have the raw HTML string, let's just extract snippets globally and assume order matches (risky but simple for regex)
|
|
// Or better: Let's assume the snippet follows the link in the HTML
|
|
|
|
// A better regex approach: iterate through text and find matches in order
|
|
// But for now, let's grab all snippets too
|
|
snippetMatches := reDDGSnippet.FindAllStringSubmatch(html, count+5)
|
|
|
|
maxItems := min(len(matches), count)
|
|
|
|
for i := 0; i < maxItems; i++ {
|
|
urlStr := matches[i][1]
|
|
title := stripTags(matches[i][2])
|
|
title = strings.TrimSpace(title)
|
|
|
|
// URL decoding if needed
|
|
if strings.Contains(urlStr, "uddg=") {
|
|
if u, err := url.QueryUnescape(urlStr); err == nil {
|
|
idx := strings.Index(u, "uddg=")
|
|
if idx != -1 {
|
|
urlStr = u[idx+5:]
|
|
}
|
|
}
|
|
}
|
|
|
|
lines = append(lines, fmt.Sprintf("%d. %s\n %s", i+1, title, urlStr))
|
|
|
|
// Attempt to attach snippet if available and index aligns
|
|
if i < len(snippetMatches) {
|
|
snippet := stripTags(snippetMatches[i][1])
|
|
snippet = strings.TrimSpace(snippet)
|
|
if snippet != "" {
|
|
lines = append(lines, fmt.Sprintf(" %s", snippet))
|
|
}
|
|
}
|
|
}
|
|
|
|
return strings.Join(lines, "\n"), nil
|
|
}
|
|
|
|
func stripTags(content string) string {
|
|
return reTags.ReplaceAllString(content, "")
|
|
}
|
|
|
|
// PerplexitySearchProvider is a SearchProvider backed by the Perplexity chat
// completions API.
type PerplexitySearchProvider struct {
	// apiKey is sent as a Bearer token in the Authorization header.
	apiKey string
	// proxy is the proxy URL recorded at construction.
	proxy string
	// client is the cached HTTP client used for every request.
	client *http.Client
}
|
|
|
|
func (p *PerplexitySearchProvider) Search(ctx context.Context, query string, count int) (string, error) {
|
|
searchURL := "https://api.perplexity.ai/chat/completions"
|
|
|
|
payload := map[string]any{
|
|
"model": "sonar",
|
|
"messages": []map[string]string{
|
|
{
|
|
"role": "system",
|
|
"content": "You are a search assistant. Provide concise search results with titles, URLs, and brief descriptions in the following format:\n1. Title\n URL\n Description\n\nDo not add extra commentary.",
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": fmt.Sprintf("Search for: %s. Provide up to %d relevant results.", query, count),
|
|
},
|
|
},
|
|
"max_tokens": 1000,
|
|
}
|
|
|
|
payloadBytes, err := json.Marshal(payload)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to marshal request: %w", err)
|
|
}
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "POST", searchURL, strings.NewReader(string(payloadBytes)))
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
|
|
req.Header.Set("Content-Type", "application/json")
|
|
req.Header.Set("Authorization", "Bearer "+p.apiKey)
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
resp, err := p.client.Do(req)
|
|
if err != nil {
|
|
return "", fmt.Errorf("request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to read response: %w", err)
|
|
}
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return "", fmt.Errorf("Perplexity API error: %s", string(body))
|
|
}
|
|
|
|
var searchResp struct {
|
|
Choices []struct {
|
|
Message struct {
|
|
Content string `json:"content"`
|
|
} `json:"message"`
|
|
} `json:"choices"`
|
|
}
|
|
|
|
if err := json.Unmarshal(body, &searchResp); err != nil {
|
|
return "", fmt.Errorf("failed to parse response: %w", err)
|
|
}
|
|
|
|
if len(searchResp.Choices) == 0 {
|
|
return fmt.Sprintf("No results for: %s", query), nil
|
|
}
|
|
|
|
return fmt.Sprintf("Results for: %s (via Perplexity)\n%s", query, searchResp.Choices[0].Message.Content), nil
|
|
}
|
|
|
|
type WebSearchTool struct {
|
|
provider SearchProvider
|
|
maxResults int
|
|
}
|
|
|
|
// WebSearchToolOptions configures NewWebSearchTool. Each provider has an
// enable flag and a max-results setting; Brave, Tavily and Perplexity also
// take an API key.
type WebSearchToolOptions struct {
	// Brave Search.
	BraveAPIKey     string
	BraveMaxResults int
	BraveEnabled    bool

	// Tavily. TavilyBaseURL optionally overrides the default endpoint.
	TavilyAPIKey     string
	TavilyBaseURL    string
	TavilyMaxResults int
	TavilyEnabled    bool

	// DuckDuckGo (no API key).
	DuckDuckGoMaxResults int
	DuckDuckGoEnabled    bool

	// Perplexity.
	PerplexityAPIKey     string
	PerplexityMaxResults int
	PerplexityEnabled    bool

	// Proxy is the proxy URL handed to the selected provider's HTTP client;
	// empty means no explicit proxy.
	Proxy string
}
|
|
|
|
func NewWebSearchTool(opts WebSearchToolOptions) (*WebSearchTool, error) {
|
|
var provider SearchProvider
|
|
maxResults := 5
|
|
|
|
// Priority: Perplexity > Brave > Tavily > DuckDuckGo
|
|
if opts.PerplexityEnabled && opts.PerplexityAPIKey != "" {
|
|
client, err := createHTTPClient(opts.Proxy, perplexityTimeout)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create HTTP client for Perplexity: %w", err)
|
|
}
|
|
provider = &PerplexitySearchProvider{apiKey: opts.PerplexityAPIKey, proxy: opts.Proxy, client: client}
|
|
if opts.PerplexityMaxResults > 0 {
|
|
maxResults = opts.PerplexityMaxResults
|
|
}
|
|
} else if opts.BraveEnabled && opts.BraveAPIKey != "" {
|
|
client, err := createHTTPClient(opts.Proxy, searchTimeout)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create HTTP client for Brave: %w", err)
|
|
}
|
|
provider = &BraveSearchProvider{apiKey: opts.BraveAPIKey, proxy: opts.Proxy, client: client}
|
|
if opts.BraveMaxResults > 0 {
|
|
maxResults = opts.BraveMaxResults
|
|
}
|
|
} else if opts.TavilyEnabled && opts.TavilyAPIKey != "" {
|
|
client, err := createHTTPClient(opts.Proxy, searchTimeout)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create HTTP client for Tavily: %w", err)
|
|
}
|
|
provider = &TavilySearchProvider{
|
|
apiKey: opts.TavilyAPIKey,
|
|
baseURL: opts.TavilyBaseURL,
|
|
proxy: opts.Proxy,
|
|
client: client,
|
|
}
|
|
if opts.TavilyMaxResults > 0 {
|
|
maxResults = opts.TavilyMaxResults
|
|
}
|
|
} else if opts.DuckDuckGoEnabled {
|
|
client, err := createHTTPClient(opts.Proxy, searchTimeout)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create HTTP client for DuckDuckGo: %w", err)
|
|
}
|
|
provider = &DuckDuckGoSearchProvider{proxy: opts.Proxy, client: client}
|
|
if opts.DuckDuckGoMaxResults > 0 {
|
|
maxResults = opts.DuckDuckGoMaxResults
|
|
}
|
|
} else {
|
|
return nil, nil
|
|
}
|
|
|
|
return &WebSearchTool{
|
|
provider: provider,
|
|
maxResults: maxResults,
|
|
}, nil
|
|
}
|
|
|
|
func (t *WebSearchTool) Name() string {
|
|
return "web_search"
|
|
}
|
|
|
|
func (t *WebSearchTool) Description() string {
|
|
return "Search the web for current information. Returns titles, URLs, and snippets from search results."
|
|
}
|
|
|
|
func (t *WebSearchTool) Parameters() map[string]any {
|
|
return map[string]any{
|
|
"type": "object",
|
|
"properties": map[string]any{
|
|
"query": map[string]any{
|
|
"type": "string",
|
|
"description": "Search query",
|
|
},
|
|
"count": map[string]any{
|
|
"type": "integer",
|
|
"description": "Number of results (1-10)",
|
|
"minimum": 1.0,
|
|
"maximum": 10.0,
|
|
},
|
|
},
|
|
"required": []string{"query"},
|
|
}
|
|
}
|
|
|
|
func (t *WebSearchTool) Execute(ctx context.Context, args map[string]any) *ToolResult {
|
|
query, ok := args["query"].(string)
|
|
if !ok {
|
|
return ErrorResult("query is required")
|
|
}
|
|
|
|
count := t.maxResults
|
|
if c, ok := args["count"].(float64); ok {
|
|
if int(c) > 0 && int(c) <= 10 {
|
|
count = int(c)
|
|
}
|
|
}
|
|
|
|
result, err := t.provider.Search(ctx, query, count)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("search failed: %v", err))
|
|
}
|
|
|
|
return &ToolResult{
|
|
ForLLM: result,
|
|
ForUser: result,
|
|
}
|
|
}
|
|
|
|
// WebFetchTool is the "web_fetch" tool: it downloads a URL and extracts
// readable text from the response.
type WebFetchTool struct {
	// maxChars is the default limit on the extracted text length.
	maxChars int
	// proxy is the proxy URL recorded at construction.
	proxy string
	// client is the cached HTTP client used for every fetch.
	client *http.Client
}
|
|
|
|
func NewWebFetchTool(maxChars int) *WebFetchTool {
|
|
// createHTTPClient cannot fail with an empty proxy string.
|
|
tool, _ := NewWebFetchToolWithProxy(maxChars, "")
|
|
return tool
|
|
}
|
|
|
|
func NewWebFetchToolWithProxy(maxChars int, proxy string) (*WebFetchTool, error) {
|
|
if maxChars <= 0 {
|
|
maxChars = defaultMaxChars
|
|
}
|
|
client, err := createHTTPClient(proxy, fetchTimeout)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create HTTP client for web fetch: %w", err)
|
|
}
|
|
client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
|
|
if len(via) >= maxRedirects {
|
|
return fmt.Errorf("stopped after %d redirects", maxRedirects)
|
|
}
|
|
return nil
|
|
}
|
|
return &WebFetchTool{
|
|
maxChars: maxChars,
|
|
proxy: proxy,
|
|
client: client,
|
|
}, nil
|
|
}
|
|
|
|
func (t *WebFetchTool) Name() string {
|
|
return "web_fetch"
|
|
}
|
|
|
|
func (t *WebFetchTool) Description() string {
|
|
return "Fetch a URL and extract readable content (HTML to text). Use this to get weather info, news, articles, or any web content."
|
|
}
|
|
|
|
func (t *WebFetchTool) Parameters() map[string]any {
|
|
return map[string]any{
|
|
"type": "object",
|
|
"properties": map[string]any{
|
|
"url": map[string]any{
|
|
"type": "string",
|
|
"description": "URL to fetch",
|
|
},
|
|
"maxChars": map[string]any{
|
|
"type": "integer",
|
|
"description": "Maximum characters to extract",
|
|
"minimum": 100.0,
|
|
},
|
|
},
|
|
"required": []string{"url"},
|
|
}
|
|
}
|
|
|
|
func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolResult {
|
|
urlStr, ok := args["url"].(string)
|
|
if !ok {
|
|
return ErrorResult("url is required")
|
|
}
|
|
|
|
parsedURL, err := url.Parse(urlStr)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("invalid URL: %v", err))
|
|
}
|
|
|
|
if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
|
|
return ErrorResult("only http/https URLs are allowed")
|
|
}
|
|
|
|
if parsedURL.Host == "" {
|
|
return ErrorResult("missing domain in URL")
|
|
}
|
|
|
|
maxChars := t.maxChars
|
|
if mc, ok := args["maxChars"].(float64); ok {
|
|
if int(mc) > 100 {
|
|
maxChars = int(mc)
|
|
}
|
|
}
|
|
|
|
req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("failed to create request: %v", err))
|
|
}
|
|
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
resp, err := t.client.Do(req)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("request failed: %v", err))
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return ErrorResult(fmt.Sprintf("failed to read response: %v", err))
|
|
}
|
|
|
|
contentType := resp.Header.Get("Content-Type")
|
|
|
|
var text, extractor string
|
|
|
|
if strings.Contains(contentType, "application/json") {
|
|
var jsonData any
|
|
if err := json.Unmarshal(body, &jsonData); err == nil {
|
|
formatted, _ := json.MarshalIndent(jsonData, "", " ")
|
|
text = string(formatted)
|
|
extractor = "json"
|
|
} else {
|
|
text = string(body)
|
|
extractor = "raw"
|
|
}
|
|
} else if strings.Contains(contentType, "text/html") || len(body) > 0 &&
|
|
(strings.HasPrefix(string(body), "<!DOCTYPE") || strings.HasPrefix(strings.ToLower(string(body)), "<html")) {
|
|
text = t.extractText(string(body))
|
|
extractor = "text"
|
|
} else {
|
|
text = string(body)
|
|
extractor = "raw"
|
|
}
|
|
|
|
truncated := len(text) > maxChars
|
|
if truncated {
|
|
text = text[:maxChars]
|
|
}
|
|
|
|
result := map[string]any{
|
|
"url": urlStr,
|
|
"status": resp.StatusCode,
|
|
"extractor": extractor,
|
|
"truncated": truncated,
|
|
"length": len(text),
|
|
"text": text,
|
|
}
|
|
|
|
resultJSON, _ := json.MarshalIndent(result, "", " ")
|
|
|
|
return &ToolResult{
|
|
ForLLM: fmt.Sprintf(
|
|
"Fetched %d bytes from %s (extractor: %s, truncated: %v)",
|
|
len(text),
|
|
urlStr,
|
|
extractor,
|
|
truncated,
|
|
),
|
|
ForUser: string(resultJSON),
|
|
}
|
|
}
|
|
|
|
func (t *WebFetchTool) extractText(htmlContent string) string {
|
|
result := reScript.ReplaceAllLiteralString(htmlContent, "")
|
|
result = reStyle.ReplaceAllLiteralString(result, "")
|
|
result = reTags.ReplaceAllLiteralString(result, "")
|
|
|
|
result = strings.TrimSpace(result)
|
|
|
|
result = reWhitespace.ReplaceAllString(result, " ")
|
|
result = reBlankLines.ReplaceAllString(result, "\n\n")
|
|
|
|
lines := strings.Split(result, "\n")
|
|
var cleanLines []string
|
|
for _, line := range lines {
|
|
line = strings.TrimSpace(line)
|
|
if line != "" {
|
|
cleanLines = append(cleanLines, line)
|
|
}
|
|
}
|
|
|
|
return strings.Join(cleanLines, "\n")
|
|
}
|