Article title
+This is the body of the article.
+diff --git a/README.fr.md b/README.fr.md index 49a02fb77..ac6bdcbd6 100644 --- a/README.fr.md +++ b/README.fr.md @@ -251,6 +251,9 @@ picoclaw onboard }, "tools": { "web": { + "enabled": true, + "fetch_limit_bytes": 10485760, + "format": "plaintext", "brave": { "enabled": false, "api_key": "VOTRE_CLE_API_BRAVE", diff --git a/README.ja.md b/README.ja.md index c0d27de4f..61b35a91b 100644 --- a/README.ja.md +++ b/README.ja.md @@ -216,6 +216,9 @@ picoclaw onboard }, "tools": { "web": { + "enabled": true, + "fetch_limit_bytes": 10485760, + "format": "plaintext", "search": { "api_key": "YOUR_BRAVE_API_KEY", "max_results": 5 diff --git a/README.md b/README.md index 159ac706f..39c8d14b0 100644 --- a/README.md +++ b/README.md @@ -270,6 +270,9 @@ picoclaw onboard ], "tools": { "web": { + "enabled": true, + "fetch_limit_bytes": 10485760, + "format": "plaintext", "brave": { "enabled": false, "api_key": "YOUR_BRAVE_API_KEY", diff --git a/README.pt-br.md b/README.pt-br.md index 56946139b..0b0620b16 100644 --- a/README.pt-br.md +++ b/README.pt-br.md @@ -245,6 +245,9 @@ picoclaw onboard }, "tools": { "web": { + "enabled": true, + "fetch_limit_bytes": 10485760, + "format": "plaintext", "brave": { "enabled": false, "api_key": "YOUR_BRAVE_API_KEY", diff --git a/README.zh.md b/README.zh.md index 9877ef9f4..4d15060a5 100644 --- a/README.zh.md +++ b/README.zh.md @@ -255,6 +255,9 @@ picoclaw onboard ], "tools": { "web": { + "enabled": true, + "fetch_limit_bytes": 10485760, + "format": "plaintext", "brave": { "enabled": false, "api_key": "YOUR_BRAVE_API_KEY", diff --git a/config/config.example.json b/config/config.example.json index 1c11cd42a..f08989c4d 100644 --- a/config/config.example.json +++ b/config/config.example.json @@ -313,6 +313,8 @@ "allow_write_paths": null, "web": { "enabled": true, + "fetch_limit_bytes": 10485760, + "format": "plaintext", "brave": { "enabled": false, "api_key": "YOUR_BRAVE_API_KEY", @@ -350,8 +352,7 @@ "base_url": 
"https://open.bigmodel.cn/api/paas/v4/web_search", "search_engine": "search_std", "max_results": 5 - }, - "fetch_limit_bytes": 10485760 + } }, "cron": { "enabled": true, diff --git a/docs/tools_configuration.md b/docs/tools_configuration.md index 8c8eb31f0..ae3252e7c 100644 --- a/docs/tools_configuration.md +++ b/docs/tools_configuration.md @@ -30,6 +30,15 @@ PicoClaw's tools configuration is located in the `tools` field of `config.json`. Web tools are used for web search and fetching. +### Web Fetcher +General settings for fetching and processing webpage content. + +| Config | Type | Default | Description | +|---------------------|--------|---------------|-----------------------------------------------------------------------------------------------| +| `enabled` | bool | true | Enable the webpage fetching capability. | +| `fetch_limit_bytes` | int | 10485760 | Maximum size of the webpage payload to fetch, in bytes (default is 10MB). | +| `format` | string | "plaintext" | Output format of the fetched content. Options: `plaintext` or `markdown` (recommended). 
| + ### Brave | Config | Type | Default | Description | diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index f20a56b9c..5700a67b4 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -157,7 +157,11 @@ func registerSharedTools( } } if cfg.Tools.IsToolEnabled("web_fetch") { - fetchTool, err := tools.NewWebFetchToolWithProxy(50000, cfg.Tools.Web.Proxy, cfg.Tools.Web.FetchLimitBytes) + fetchTool, err := tools.NewWebFetchToolWithProxy( + 50000, + cfg.Tools.Web.Proxy, + cfg.Tools.Web.Format, + cfg.Tools.Web.FetchLimitBytes) if err != nil { logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()}) } else { diff --git a/pkg/config/config.go b/pkg/config/config.go index 190341224..9f6253cdc 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -694,6 +694,7 @@ type WebToolsConfig struct { // For authenticated proxies, prefer HTTP_PROXY/HTTPS_PROXY env vars instead of embedding credentials in config. Proxy string `json:"proxy,omitempty" env:"PICOCLAW_TOOLS_WEB_PROXY"` FetchLimitBytes int64 `json:"fetch_limit_bytes,omitempty" env:"PICOCLAW_TOOLS_WEB_FETCH_LIMIT_BYTES"` + Format string `json:"format,omitempty" env:"PICOCLAW_TOOLS_WEB_FORMAT"` } type CronToolsConfig struct { diff --git a/pkg/config/defaults.go b/pkg/config/defaults.go index dc534d852..d0e528e12 100644 --- a/pkg/config/defaults.go +++ b/pkg/config/defaults.go @@ -412,6 +412,7 @@ func DefaultConfig() *Config { }, Proxy: "", FetchLimitBytes: 10 * 1024 * 1024, // 10MB by default + Format: "plaintext", Brave: BraveConfig{ Enabled: false, APIKey: "", diff --git a/pkg/tools/web.go b/pkg/tools/web.go index e5036d3a8..64df27780 100644 --- a/pkg/tools/web.go +++ b/pkg/tools/web.go @@ -7,6 +7,7 @@ import ( "errors" "fmt" "io" + "mime" "net" "net/http" "net/url" @@ -28,6 +29,7 @@ const ( defaultMaxChars = 50000 maxRedirects = 5 + format = "plaintext" ) // Pre-compiled regexes for HTML text extraction @@ -776,19 +778,20 @@ type WebFetchTool struct { maxChars int 
proxy string client *http.Client + format string fetchLimitBytes int64 } -func NewWebFetchTool(maxChars int, fetchLimitBytes int64) (*WebFetchTool, error) { +func NewWebFetchTool(maxChars int, format string, fetchLimitBytes int64) (*WebFetchTool, error) { // createHTTPClient cannot fail with an empty proxy string. - return NewWebFetchToolWithProxy(maxChars, "", fetchLimitBytes) + return NewWebFetchToolWithProxy(maxChars, "", format, fetchLimitBytes) } // allowPrivateWebFetchHosts controls whether loopback/private hosts are allowed. // This is false in normal runtime to reduce SSRF exposure, and tests can override it temporarily. var allowPrivateWebFetchHosts atomic.Bool -func NewWebFetchToolWithProxy(maxChars int, proxy string, fetchLimitBytes int64) (*WebFetchTool, error) { +func NewWebFetchToolWithProxy(maxChars int, proxy string, format string, fetchLimitBytes int64) (*WebFetchTool, error) { if maxChars <= 0 { maxChars = defaultMaxChars } @@ -819,6 +822,7 @@ func NewWebFetchToolWithProxy(maxChars int, proxy string, fetchLimitBytes int64) maxChars: maxChars, proxy: proxy, client: client, + format: format, fetchLimitBytes: fetchLimitBytes, }, nil } @@ -906,26 +910,50 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe return ErrorResult(fmt.Sprintf("failed to read response: %v", err)) } + bodyStr := string(body) contentType := resp.Header.Get("Content-Type") + mediaType, _, _ := mime.ParseMediaType(contentType) + var text, extractor string - if strings.Contains(contentType, "application/json") { + switch { + case mediaType == "application/json": var jsonData any - if err := json.Unmarshal(body, &jsonData); err == nil { - formatted, _ := json.MarshalIndent(jsonData, "", " ") - text = string(formatted) - extractor = "json" - } else { - text = string(body) + if err := json.Unmarshal(body, &jsonData); err != nil { + text = bodyStr extractor = "raw" + break } - } else if strings.Contains(contentType, "text/html") || len(body) > 0 && - 
(strings.HasPrefix(string(body), "]*>\)\]\(<[^>]*>\)`) + reEmptyHeader = regexp.MustCompile(`(?m)^#{1,6}\s*$`) + reLeadingLineSpace = regexp.MustCompile(`(?m)^([ \t])([^ \t\n])`) +) + +var skipTags = map[string]bool{ + "script": true, "style": true, "head": true, + "noscript": true, "template": true, + "nav": true, "footer": true, "aside": true, "header": true, "form": true, "dialog": true, +} + +func isSafeHref(href string) bool { + lower := strings.ToLower(strings.TrimSpace(href)) + if strings.HasPrefix(lower, "javascript:") || strings.HasPrefix(lower, "vbscript:") || + strings.HasPrefix(lower, "data:") { + return false + } + u, err := url.Parse(strings.TrimSpace(href)) + if err != nil { + return false + } + scheme := strings.ToLower(u.Scheme) + return scheme == "" || scheme == "http" || scheme == "https" || scheme == "mailto" +} + +func isSafeImageSrc(src string) bool { + lower := strings.ToLower(strings.TrimSpace(src)) + if strings.HasPrefix(lower, "data:image/") { + return true + } + return isSafeHref(src) +} + +func escapeMdAlt(s string) string { + s = strings.ReplaceAll(s, `\`, `\\`) + s = strings.ReplaceAll(s, `[`, `\[`) + s = strings.ReplaceAll(s, `]`, `\]`) + return s +} + +func getAttr(n *html.Node, key string) string { + for _, a := range n.Attr { + if a.Key == key { + return a.Val + } + } + return "" +} + +func normalizeAttr(val string) string { + val = strings.ReplaceAll(val, "\n", "") + val = strings.ReplaceAll(val, "\r", "") + val = strings.ReplaceAll(val, "\t", "") + return strings.TrimSpace(val) +} + +func isUnlikelyNode(n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + classId := strings.ToLower(getAttr(n, "class") + " " + getAttr(n, "id")) + if classId == " " { + return false + } + if strings.Contains(classId, "article") || strings.Contains(classId, "main") || + strings.Contains(classId, "content") { + return false + } + unlikelyKeywords := []string{ + "menu", + "nav", + "footer", + "sidebar", + "cookie", + "banner", + 
"sponsor", + "advert", + "popup", + "modal", + "newsletter", + "share", + "social", + } + for _, keyword := range unlikelyKeywords { + if strings.Contains(classId, keyword) { + return true + } + } + return false +} + +type converter struct { + stack []*bytes.Buffer + linkHrefs []string + linkStates []bool + emphStack []string // Tracks "**", "*", "~~" for buffered emphasis + olCounters []int + inPre bool + listDepth int +} + +func newConverter() *converter { + return &converter{ + stack: []*bytes.Buffer{{}}, + } +} + +func (c *converter) write(s string) { + c.stack[len(c.stack)-1].WriteString(s) +} + +func (c *converter) pushBuf() { + c.stack = append(c.stack, &bytes.Buffer{}) +} + +func (c *converter) popBuf() string { + top := c.stack[len(c.stack)-1] + c.stack = c.stack[:len(c.stack)-1] + return top.String() +} + +func (c *converter) walk(n *html.Node) { + if n.Type == html.ElementNode { + if skipTags[n.Data] { + return + } + if isUnlikelyNode(n) { + return + } + } + + if n.Type == html.TextNode { + text := n.Data + if !c.inPre { + text = strings.ReplaceAll(text, "\n", " ") + text = reSpaces.ReplaceAllString(text, " ") + } + if text != "" { + c.write(text) + } + return + } + + if n.Type != html.ElementNode { + for ch := n.FirstChild; ch != nil; ch = ch.NextSibling { + c.walk(ch) + } + return + } + + // Opening Tags + switch n.Data { + + // Buffer emphasis content so we can TrimSpace the inner text, + // avoiding the regex-across-boundaries bug. 
+ case "b", "strong": + c.emphStack = append(c.emphStack, "**") + c.pushBuf() + case "i", "em": + c.emphStack = append(c.emphStack, "*") + c.pushBuf() + case "del", "s": + c.emphStack = append(c.emphStack, "~~") + c.pushBuf() + + case "a": + href := normalizeAttr(getAttr(n, "href")) + if href != "" && !isSafeHref(href) { + href = "#" + } + hasHref := href != "" + c.linkStates = append(c.linkStates, hasHref) + if hasHref { + c.linkHrefs = append(c.linkHrefs, href) + c.pushBuf() + } + + case "h1": + c.write("\n\n# ") + case "h2": + c.write("\n\n## ") + case "h3": + c.write("\n\n### ") + case "h4": + c.write("\n\n#### ") + case "h5": + c.write("\n\n##### ") + case "h6": + c.write("\n\n###### ") + + case "p": + c.write("\n\n") + case "br": + c.write("\n") + case "hr": + c.write("\n\n---\n\n") + + case "ol": + c.olCounters = append(c.olCounters, 1) + // Only write leading newline for top-level list. + if c.listDepth == 0 { + c.write("\n") + } + c.listDepth++ + case "ul": + if c.listDepth == 0 { + c.write("\n") + } + c.listDepth++ + case "li": + c.write("\n") + if c.listDepth > 1 { + c.write(strings.Repeat(" ", c.listDepth-1)) + } + if n.Parent != nil && n.Parent.Data == "ol" && len(c.olCounters) > 0 { + idx := c.olCounters[len(c.olCounters)-1] + c.write(strconv.Itoa(idx) + ". 
") + c.olCounters[len(c.olCounters)-1]++ + } else { + c.write("- ") + } + + case "pre": + c.inPre = true + c.write("\n\n```\n") + case "code": + if !c.inPre { + c.write("`") + } + + case "blockquote": + c.pushBuf() + for ch := n.FirstChild; ch != nil; ch = ch.NextSibling { + c.walk(ch) + } + inner := strings.TrimSpace(c.popBuf()) + lines := strings.Split(inner, "\n") + var quoted []string + for _, l := range lines { + if strings.TrimSpace(l) == "" { + quoted = append(quoted, ">") + } else { + quoted = append(quoted, "> "+l) + } + } + var deduped []string + for i, line := range quoted { + if line == ">" && i > 0 && deduped[len(deduped)-1] == ">" { + continue + } + deduped = append(deduped, line) + } + c.write("\n\n" + strings.Join(deduped, "\n") + "\n\n") + return + + case "img": + src := normalizeAttr(getAttr(n, "src")) + if src == "" { + src = normalizeAttr(getAttr(n, "data-src")) + } + if src == "" { + return + } + alt := escapeMdAlt(normalizeAttr(getAttr(n, "alt"))) + if isSafeImageSrc(src) { + c.write("") + } + return + } + + // Traverse Children + for ch := n.FirstChild; ch != nil; ch = ch.NextSibling { + c.walk(ch) + } + + // Closing Tags + switch n.Data { + + // Pop buffer, trim, wrap with the correct marker. 
+ case "b", "strong", "i", "em", "del", "s": + if len(c.emphStack) == 0 { + break + } + marker := c.emphStack[len(c.emphStack)-1] + c.emphStack = c.emphStack[:len(c.emphStack)-1] + inner := strings.TrimSpace(c.popBuf()) + if inner != "" { + c.write(marker + inner + marker) + } + + case "a": + if len(c.linkStates) == 0 { + break + } + hasHref := c.linkStates[len(c.linkStates)-1] + c.linkStates = c.linkStates[:len(c.linkStates)-1] + if !hasHref { + break + } + href := c.linkHrefs[len(c.linkHrefs)-1] + c.linkHrefs = c.linkHrefs[:len(c.linkHrefs)-1] + inner := strings.TrimSpace(c.popBuf()) + if strings.Contains(inner, "\n") { + lines := strings.Split(inner, "\n") + linked := false + for i, l := range lines { + cleanLine := strings.TrimSpace(l) + if cleanLine != "" && !strings.HasPrefix(cleanLine, "![") && !linked { + lines[i] = "[" + cleanLine + "](" + href + ")" + linked = true + } + } + c.write(strings.Join(lines, "\n")) + } else { + c.write("[" + inner + "](" + href + ")") + } + + case "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "p", + "div", + "section", + "article", + "header", + "footer", + "aside", + "nav", + "figure": + c.write("\n") + + case "ol": + c.listDepth-- + if len(c.olCounters) > 0 { + c.olCounters = c.olCounters[:len(c.olCounters)-1] + } + if c.listDepth == 0 { + c.write("\n") + } + case "ul": + c.listDepth-- + if c.listDepth == 0 { + c.write("\n") + } + + case "pre": + c.inPre = false + c.write("\n```\n\n") + case "code": + if !c.inPre { + c.write("`") + } + } +} + +func HtmlToMarkdown(htmlStr string) (string, error) { + doc, err := html.Parse(strings.NewReader(htmlStr)) + if err != nil { + return "", err + } + + c := newConverter() + c.walk(doc) + + res := c.stack[0].String() + + // Post-processing + res = reImageOnlyLink.ReplaceAllString(res, "") + res = reEmptyListItem.ReplaceAllString(res, "") + res = reEmptyHeader.ReplaceAllString(res, "") + + lines := strings.Split(res, "\n") + var cleanLines []string + for _, line := range lines { + line = 
strings.TrimRight(line, " \t") + cleanTest := strings.TrimSpace(line) + if cleanTest == "[](>)" || cleanTest == "[](#)" || cleanTest == "-" { + cleanLines = append(cleanLines, "") + continue + } + cleanLines = append(cleanLines, line) + } + res = strings.Join(cleanLines, "\n") + + res = strings.TrimSpace(res) + res = reNewlines.ReplaceAllString(res, "\n\n") + + // Strip a single leading space from lines that are NOT list indentation. + // "(?m)^([ \t])([^ \t\n])" matches exactly one space/tab at line start followed + // by a non-whitespace char, so " - nested" (4 spaces) is left untouched. + res = reLeadingLineSpace.ReplaceAllString(res, "$2") + + return res, nil +} diff --git a/pkg/utils/markdown_test.go b/pkg/utils/markdown_test.go new file mode 100644 index 000000000..72277fb91 --- /dev/null +++ b/pkg/utils/markdown_test.go @@ -0,0 +1,245 @@ +package utils + +import ( + "testing" + + "github.com/sipeed/picoclaw/pkg/logger" +) + +func TestHtmlToMarkdown(t *testing.T) { + // Define our test cases + tests := []struct { + name string + input string + expected string + }{ + { + name: "Removes scripts and styles", + input: `
Clean text
`, + expected: "Clean text", + }, + { + name: "Extracts links correctly", + input: `Visit my website for info.`, + expected: "Visit my [website](https://example.com) for info.", + }, + { + name: "Converts headers (H1, H2, H3)", + input: `First paragraph
Second paragraph with
a line break.
`,
+ // Correct Markdown syntax for images
+ expected: "",
+ },
+ {
+ name: "Image support without alt-text",
+ input: `
`,
+ // If alt is missing, square brackets remain empty
+ expected: "",
+ },
+ {
+ name: "XSS Bypass on Links (Obfuscated HTML entities)",
+ // The Go HTML parser resolves entities, so this becomes "javascript:alert(1)"
+ input: `Click here`,
+ // isSafeHref rejects the javascript: scheme, so the converter replaces the href with "#"
+ expected: "[Click here](#)",
+ },
+ {
+ name: "Empty link or used as anchor",
+ input: ``,
+ // With no text or href, it shouldn't print anything (not even empty brackets)
+ expected: "",
+ },
+ {
+ name: "Link without href but with text (Textual anchor)",
+ input: `Back to top`,
+ // Should extract only plain text, without generating a broken Markdown link like [Back to top](#) or [Back to top]()
+ expected: "Back to top",
+ },
+ {
+ name: "Badly spaced bold and italics (Edge Case)",
+ input: ` Text `,
+ // Markdown does not treat `** Text **` as bold because the markers are padded with spaces; the converter must trim the inner text and emit `**Text**`
+ expected: "**Text**",
+ },
+ {
+ name: "Complex Test - Real Article",
+ input: `
+ This is an introductory text with a link.
+func main() {\n fmt.Println(\"hello\")\n}",
+ expected: "```\nfunc main() {\n fmt.Println(\"hello\")\n}\n```",
+ },
+ {
+ name: "Inline code",
+ input: `Use the command go test ./... to run the tests.
`, + expected: "> An important quote.", + }, + { + name: "Multiline blockquote", + input: `An important quote.
`, + expected: "> First line of the quote.\n>\n> Second line of the quote.", + }, + { + name: "Strikethrough text (del/s)", + input: `This text isFirst line of the quote.
Second line of the quote.
Above the line
Below the line
`, + expected: "Above the line\n\n---\n\nBelow the line", + }, + { + name: "Bold nested in link", + input: `Linked bold text`, + expected: "[**Linked bold text**](https://example.com)", + }, + { + name: "data-src Image (lazy loading)", + input: `Deeply nested text
Important: read the critical instructions carefully.
`, + expected: "**Important:** read the ***critical instructions*** *carefully*.", + }, + { + name: "Article with nav and aside sections (noise to filter)", + input: ` + +This is the body of the article.
+
`,
+ // The image-link without text must not generate broken markup
+ expected: "[](https://example.com)",
+ },
+ {
+ name: "Empty content or only whitespace",
+ input: `
`,
+ expected: "",
+ },
+ {
+ name: "Image with javascript: src blocked",
+ input: `