mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
feat(tool): markdown format in output web_fetch tool
This commit is contained in:
+55
-16
@@ -7,6 +7,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
@@ -28,6 +29,7 @@ const (
|
||||
|
||||
defaultMaxChars = 50000
|
||||
maxRedirects = 5
|
||||
format = "plaintext"
|
||||
)
|
||||
|
||||
// Pre-compiled regexes for HTML text extraction
|
||||
@@ -776,19 +778,20 @@ type WebFetchTool struct {
|
||||
maxChars int
|
||||
proxy string
|
||||
client *http.Client
|
||||
format string
|
||||
fetchLimitBytes int64
|
||||
}
|
||||
|
||||
func NewWebFetchTool(maxChars int, fetchLimitBytes int64) (*WebFetchTool, error) {
|
||||
func NewWebFetchTool(maxChars int, format string, fetchLimitBytes int64) (*WebFetchTool, error) {
|
||||
// createHTTPClient cannot fail with an empty proxy string.
|
||||
return NewWebFetchToolWithProxy(maxChars, "", fetchLimitBytes)
|
||||
return NewWebFetchToolWithProxy(maxChars, "", format, fetchLimitBytes)
|
||||
}
|
||||
|
||||
// allowPrivateWebFetchHosts controls whether loopback/private hosts are allowed.
|
||||
// This is false in normal runtime to reduce SSRF exposure, and tests can override it temporarily.
|
||||
var allowPrivateWebFetchHosts atomic.Bool
|
||||
|
||||
func NewWebFetchToolWithProxy(maxChars int, proxy string, fetchLimitBytes int64) (*WebFetchTool, error) {
|
||||
func NewWebFetchToolWithProxy(maxChars int, proxy string, format string, fetchLimitBytes int64) (*WebFetchTool, error) {
|
||||
if maxChars <= 0 {
|
||||
maxChars = defaultMaxChars
|
||||
}
|
||||
@@ -819,6 +822,7 @@ func NewWebFetchToolWithProxy(maxChars int, proxy string, fetchLimitBytes int64)
|
||||
maxChars: maxChars,
|
||||
proxy: proxy,
|
||||
client: client,
|
||||
format: format,
|
||||
fetchLimitBytes: fetchLimitBytes,
|
||||
}, nil
|
||||
}
|
||||
@@ -906,26 +910,50 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe
|
||||
return ErrorResult(fmt.Sprintf("failed to read response: %v", err))
|
||||
}
|
||||
|
||||
bodyStr := string(body)
|
||||
contentType := resp.Header.Get("Content-Type")
|
||||
|
||||
mediaType, _, _ := mime.ParseMediaType(contentType)
|
||||
|
||||
var text, extractor string
|
||||
|
||||
if strings.Contains(contentType, "application/json") {
|
||||
switch {
|
||||
case mediaType == "application/json":
|
||||
var jsonData any
|
||||
if err := json.Unmarshal(body, &jsonData); err == nil {
|
||||
formatted, _ := json.MarshalIndent(jsonData, "", " ")
|
||||
text = string(formatted)
|
||||
extractor = "json"
|
||||
} else {
|
||||
text = string(body)
|
||||
if err := json.Unmarshal(body, &jsonData); err != nil {
|
||||
text = bodyStr
|
||||
extractor = "raw"
|
||||
break
|
||||
}
|
||||
} else if strings.Contains(contentType, "text/html") || len(body) > 0 &&
|
||||
(strings.HasPrefix(string(body), "<!DOCTYPE") || strings.HasPrefix(strings.ToLower(string(body)), "<html")) {
|
||||
text = t.extractText(string(body))
|
||||
extractor = "text"
|
||||
} else {
|
||||
text = string(body)
|
||||
|
||||
formatted, err := json.MarshalIndent(jsonData, "", " ")
|
||||
if err != nil {
|
||||
text = bodyStr
|
||||
extractor = "raw"
|
||||
break
|
||||
}
|
||||
|
||||
text = string(formatted)
|
||||
extractor = "json"
|
||||
|
||||
case mediaType == "text/html" || looksLikeHTML(bodyStr):
|
||||
switch strings.ToLower(t.format) {
|
||||
|
||||
case "markdown":
|
||||
var err error
|
||||
text, err = utils.HtmlToMarkdown(bodyStr)
|
||||
if err != nil {
|
||||
return ErrorResult(fmt.Sprintf("failed to HTML to markdown: %v", err))
|
||||
}
|
||||
extractor = "markdown"
|
||||
|
||||
default:
|
||||
text = t.extractText(bodyStr)
|
||||
extractor = "text"
|
||||
}
|
||||
|
||||
default:
|
||||
text = bodyStr
|
||||
extractor = "raw"
|
||||
}
|
||||
|
||||
@@ -957,6 +985,17 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe
|
||||
}
|
||||
}
|
||||
|
||||
func looksLikeHTML(body string) bool {
|
||||
if body == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
lower := strings.ToLower(body)
|
||||
|
||||
return strings.HasPrefix(body, "<!doctype") ||
|
||||
strings.HasPrefix(lower, "<html")
|
||||
}
|
||||
|
||||
func (t *WebFetchTool) extractText(htmlContent string) string {
|
||||
result := reScript.ReplaceAllLiteralString(htmlContent, "")
|
||||
result = reStyle.ReplaceAllLiteralString(result, "")
|
||||
|
||||
Reference in New Issue
Block a user