From d5c2bc538a60dbaaccc5a644757575caa644677c Mon Sep 17 00:00:00 2001 From: afjcjsbx Date: Sun, 15 Mar 2026 22:12:03 +0100 Subject: [PATCH] feat(tool): markdown format in output web_fetch tool --- README.fr.md | 3 + README.ja.md | 3 + README.md | 3 + README.pt-br.md | 3 + README.zh.md | 3 + config/config.example.json | 5 +- docs/tools_configuration.md | 9 + pkg/agent/loop.go | 6 +- pkg/config/config.go | 1 + pkg/config/defaults.go | 1 + pkg/tools/web.go | 71 +++++-- pkg/tools/web_test.go | 42 ++-- pkg/utils/markdown.go | 413 ++++++++++++++++++++++++++++++++++++ pkg/utils/markdown_test.go | 245 +++++++++++++++++++++ 14 files changed, 769 insertions(+), 39 deletions(-) create mode 100644 pkg/utils/markdown.go create mode 100644 pkg/utils/markdown_test.go diff --git a/README.fr.md b/README.fr.md index 49a02fb77..ac6bdcbd6 100644 --- a/README.fr.md +++ b/README.fr.md @@ -251,6 +251,9 @@ picoclaw onboard }, "tools": { "web": { + "enabled": true, + "fetch_limit_bytes": 10485760, + "format": "plaintext", "brave": { "enabled": false, "api_key": "VOTRE_CLE_API_BRAVE", diff --git a/README.ja.md b/README.ja.md index c0d27de4f..61b35a91b 100644 --- a/README.ja.md +++ b/README.ja.md @@ -216,6 +216,9 @@ picoclaw onboard }, "tools": { "web": { + "enabled": true, + "fetch_limit_bytes": 10485760, + "format": "plaintext", "search": { "api_key": "YOUR_BRAVE_API_KEY", "max_results": 5 diff --git a/README.md b/README.md index 159ac706f..39c8d14b0 100644 --- a/README.md +++ b/README.md @@ -270,6 +270,9 @@ picoclaw onboard ], "tools": { "web": { + "enabled": true, + "fetch_limit_bytes": 10485760, + "format": "plaintext", "brave": { "enabled": false, "api_key": "YOUR_BRAVE_API_KEY", diff --git a/README.pt-br.md b/README.pt-br.md index 56946139b..0b0620b16 100644 --- a/README.pt-br.md +++ b/README.pt-br.md @@ -245,6 +245,9 @@ picoclaw onboard }, "tools": { "web": { + "enabled": true, + "fetch_limit_bytes": 10485760, + "format": "plaintext", "brave": { "enabled": false, "api_key": "YOUR_BRAVE_API_KEY", diff --git a/README.zh.md b/README.zh.md index 9877ef9f4..4d15060a5 100644 --- a/README.zh.md +++ b/README.zh.md @@ -255,6 +255,9 @@ picoclaw onboard ], "tools": { "web": { + "enabled": true, + "fetch_limit_bytes": 10485760, + "format": "plaintext", "brave": { "enabled": false, "api_key": "YOUR_BRAVE_API_KEY", diff --git a/config/config.example.json b/config/config.example.json index 1c11cd42a..f08989c4d 100644 --- a/config/config.example.json +++ b/config/config.example.json @@ -313,6 +313,8 @@ "allow_write_paths": null, "web": { "enabled": true, + "fetch_limit_bytes": 10485760, + "format": "plaintext", "brave": { "enabled": false, "api_key": "YOUR_BRAVE_API_KEY", @@ -350,8 +352,7 @@ "base_url": "https://open.bigmodel.cn/api/paas/v4/web_search", "search_engine": "search_std", "max_results": 5 - }, - "fetch_limit_bytes": 10485760 + } }, "cron": { "enabled": true, diff --git a/docs/tools_configuration.md b/docs/tools_configuration.md index 8c8eb31f0..ae3252e7c 100644 --- a/docs/tools_configuration.md +++ b/docs/tools_configuration.md @@ -30,6 +30,15 @@ PicoClaw's tools configuration is located in the `tools` field of `config.json`. Web tools are used for web search and fetching. +### Web Fetcher +General settings for fetching and processing webpage content. + +| Config | Type | Default | Description | +|---------------------|--------|---------------|-----------------------------------------------------------------------------------------------| +| `enabled` | bool | true | Enable the webpage fetching capability. | +| `fetch_limit_bytes` | int | 10485760 | Maximum size of the webpage payload to fetch, in bytes (default is 10MB). | +| `format` | string | "plaintext" | Output format of the fetched content. Options: `plaintext` or `markdown` (recommended). | + ### Brave | Config | Type | Default | Description | diff --git a/pkg/agent/loop.go b/pkg/agent/loop.go index f20a56b9c..5700a67b4 100644 --- a/pkg/agent/loop.go +++ b/pkg/agent/loop.go @@ -157,7 +157,11 @@ func registerSharedTools( } } if cfg.Tools.IsToolEnabled("web_fetch") { - fetchTool, err := tools.NewWebFetchToolWithProxy(50000, cfg.Tools.Web.Proxy, cfg.Tools.Web.FetchLimitBytes) + fetchTool, err := tools.NewWebFetchToolWithProxy( + 50000, + cfg.Tools.Web.Proxy, + cfg.Tools.Web.Format, + cfg.Tools.Web.FetchLimitBytes) if err != nil { logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()}) } else { diff --git a/pkg/config/config.go b/pkg/config/config.go index 190341224..9f6253cdc 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -694,6 +694,7 @@ type WebToolsConfig struct { // For authenticated proxies, prefer HTTP_PROXY/HTTPS_PROXY env vars instead of embedding credentials in config. Proxy string `json:"proxy,omitempty" env:"PICOCLAW_TOOLS_WEB_PROXY"` FetchLimitBytes int64 `json:"fetch_limit_bytes,omitempty" env:"PICOCLAW_TOOLS_WEB_FETCH_LIMIT_BYTES"` + Format string `json:"format,omitempty" env:"PICOCLAW_TOOLS_WEB_FORMAT"` } type CronToolsConfig struct { diff --git a/pkg/config/defaults.go b/pkg/config/defaults.go index dc534d852..d0e528e12 100644 --- a/pkg/config/defaults.go +++ b/pkg/config/defaults.go @@ -412,6 +412,7 @@ func DefaultConfig() *Config { }, Proxy: "", FetchLimitBytes: 10 * 1024 * 1024, // 10MB by default + Format: "plaintext", Brave: BraveConfig{ Enabled: false, APIKey: "", diff --git a/pkg/tools/web.go b/pkg/tools/web.go index e5036d3a8..64df27780 100644 --- a/pkg/tools/web.go +++ b/pkg/tools/web.go @@ -7,6 +7,7 @@ import ( "errors" "fmt" "io" + "mime" "net" "net/http" "net/url" @@ -28,6 +29,7 @@ const ( defaultMaxChars = 50000 maxRedirects = 5 + format = "plaintext" ) // Pre-compiled regexes for HTML text extraction @@ -776,19 +778,20 @@ type WebFetchTool struct { maxChars int proxy string client *http.Client + format string fetchLimitBytes int64 } -func NewWebFetchTool(maxChars int, fetchLimitBytes int64) (*WebFetchTool, error) { +func NewWebFetchTool(maxChars int, format string, fetchLimitBytes int64) (*WebFetchTool, error) { // createHTTPClient cannot fail with an empty proxy string. - return NewWebFetchToolWithProxy(maxChars, "", fetchLimitBytes) + return NewWebFetchToolWithProxy(maxChars, "", format, fetchLimitBytes) } // allowPrivateWebFetchHosts controls whether loopback/private hosts are allowed. // This is false in normal runtime to reduce SSRF exposure, and tests can override it temporarily. var allowPrivateWebFetchHosts atomic.Bool -func NewWebFetchToolWithProxy(maxChars int, proxy string, fetchLimitBytes int64) (*WebFetchTool, error) { +func NewWebFetchToolWithProxy(maxChars int, proxy string, format string, fetchLimitBytes int64) (*WebFetchTool, error) { if maxChars <= 0 { maxChars = defaultMaxChars } @@ -819,6 +822,7 @@ func NewWebFetchToolWithProxy(maxChars int, proxy string, fetchLimitBytes int64) maxChars: maxChars, proxy: proxy, client: client, + format: format, fetchLimitBytes: fetchLimitBytes, }, nil } @@ -906,26 +910,50 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe return ErrorResult(fmt.Sprintf("failed to read response: %v", err)) } + bodyStr := string(body) contentType := resp.Header.Get("Content-Type") + mediaType, _, _ := mime.ParseMediaType(contentType) + var text, extractor string - if strings.Contains(contentType, "application/json") { + switch { + case mediaType == "application/json": var jsonData any - if err := json.Unmarshal(body, &jsonData); err == nil { - formatted, _ := json.MarshalIndent(jsonData, "", " ") - text = string(formatted) - extractor = "json" - } else { - text = string(body) + if err := json.Unmarshal(body, &jsonData); err != nil { + text = bodyStr extractor = "raw" + break } - } else if strings.Contains(contentType, "text/html") || len(body) > 0 && - (strings.HasPrefix(string(body), "]*>\)\]\(<[^>]*>\)`) + reEmptyHeader = regexp.MustCompile(`(?m)^#{1,6}\s*$`) + reLeadingLineSpace = regexp.MustCompile(`(?m)^([ \t])([^ \t\n])`) +) + +var skipTags = map[string]bool{ + "script": true, "style": true, "head": true, + "noscript": true, "template": true, + "nav": true, "footer": true, "aside": true, "header": true, "form": true, "dialog": true, +} + +func isSafeHref(href string) bool { + lower := strings.ToLower(strings.TrimSpace(href)) + if strings.HasPrefix(lower, "javascript:") || strings.HasPrefix(lower, "vbscript:") || + strings.HasPrefix(lower, "data:") { + return false + } + u, err := url.Parse(strings.TrimSpace(href)) + if err != nil { + return false + } + scheme := strings.ToLower(u.Scheme) + return scheme == "" || scheme == "http" || scheme == "https" || scheme == "mailto" +} + +func isSafeImageSrc(src string) bool { + lower := strings.ToLower(strings.TrimSpace(src)) + if strings.HasPrefix(lower, "data:image/") { + return true + } + return isSafeHref(src) +} + +func escapeMdAlt(s string) string { + s = strings.ReplaceAll(s, `\`, `\\`) + s = strings.ReplaceAll(s, `[`, `\[`) + s = strings.ReplaceAll(s, `]`, `\]`) + return s +} + +func getAttr(n *html.Node, key string) string { + for _, a := range n.Attr { + if a.Key == key { + return a.Val + } + } + return "" +} + +func normalizeAttr(val string) string { + val = strings.ReplaceAll(val, "\n", "") + val = strings.ReplaceAll(val, "\r", "") + val = strings.ReplaceAll(val, "\t", "") + return strings.TrimSpace(val) +} + +func isUnlikelyNode(n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + classId := strings.ToLower(getAttr(n, "class") + " " + getAttr(n, "id")) + if classId == " " { + return false + } + if strings.Contains(classId, "article") || strings.Contains(classId, "main") || + strings.Contains(classId, "content") { + return false + } + unlikelyKeywords := []string{ + "menu", + "nav", + "footer", + "sidebar", + "cookie", + "banner", + "sponsor", + "advert", + "popup", + "modal", + "newsletter", + "share", + "social", + } + for _, keyword := range unlikelyKeywords { + if strings.Contains(classId, keyword) { + return true + } + } + return false +} + +type converter struct { + stack []*bytes.Buffer + linkHrefs []string + linkStates []bool + emphStack []string // Tracks "**", "*", "~~" for buffered emphasis + olCounters []int + inPre bool + listDepth int +} + +func newConverter() *converter { + return &converter{ + stack: []*bytes.Buffer{{}}, + } +} + +func (c *converter) write(s string) { + c.stack[len(c.stack)-1].WriteString(s) +} + +func (c *converter) pushBuf() { + c.stack = append(c.stack, &bytes.Buffer{}) +} + +func (c *converter) popBuf() string { + top := c.stack[len(c.stack)-1] + c.stack = c.stack[:len(c.stack)-1] + return top.String() +} + +func (c *converter) walk(n *html.Node) { + if n.Type == html.ElementNode { + if skipTags[n.Data] { + return + } + if isUnlikelyNode(n) { + return + } + } + + if n.Type == html.TextNode { + text := n.Data + if !c.inPre { + text = strings.ReplaceAll(text, "\n", " ") + text = reSpaces.ReplaceAllString(text, " ") + } + if text != "" { + c.write(text) + } + return + } + + if n.Type != html.ElementNode { + for ch := n.FirstChild; ch != nil; ch = ch.NextSibling { + c.walk(ch) + } + return + } + + // Opening Tags + switch n.Data { + + // Buffer emphasis content so we can TrimSpace the inner text, + // avoiding the regex-across-boundaries bug. + case "b", "strong": + c.emphStack = append(c.emphStack, "**") + c.pushBuf() + case "i", "em": + c.emphStack = append(c.emphStack, "*") + c.pushBuf() + case "del", "s": + c.emphStack = append(c.emphStack, "~~") + c.pushBuf() + + case "a": + href := normalizeAttr(getAttr(n, "href")) + if href != "" && !isSafeHref(href) { + href = "#" + } + hasHref := href != "" + c.linkStates = append(c.linkStates, hasHref) + if hasHref { + c.linkHrefs = append(c.linkHrefs, href) + c.pushBuf() + } + + case "h1": + c.write("\n\n# ") + case "h2": + c.write("\n\n## ") + case "h3": + c.write("\n\n### ") + case "h4": + c.write("\n\n#### ") + case "h5": + c.write("\n\n##### ") + case "h6": + c.write("\n\n###### ") + + case "p": + c.write("\n\n") + case "br": + c.write("\n") + case "hr": + c.write("\n\n---\n\n") + + case "ol": + c.olCounters = append(c.olCounters, 1) + // Only write leading newline for top-level list. + if c.listDepth == 0 { + c.write("\n") + } + c.listDepth++ + case "ul": + if c.listDepth == 0 { + c.write("\n") + } + c.listDepth++ + case "li": + c.write("\n") + if c.listDepth > 1 { + c.write(strings.Repeat(" ", c.listDepth-1)) + } + if n.Parent != nil && n.Parent.Data == "ol" && len(c.olCounters) > 0 { + idx := c.olCounters[len(c.olCounters)-1] + c.write(strconv.Itoa(idx) + ". ") + c.olCounters[len(c.olCounters)-1]++ + } else { + c.write("- ") + } + + case "pre": + c.inPre = true + c.write("\n\n```\n") + case "code": + if !c.inPre { + c.write("`") + } + + case "blockquote": + c.pushBuf() + for ch := n.FirstChild; ch != nil; ch = ch.NextSibling { + c.walk(ch) + } + inner := strings.TrimSpace(c.popBuf()) + lines := strings.Split(inner, "\n") + var quoted []string + for _, l := range lines { + if strings.TrimSpace(l) == "" { + quoted = append(quoted, ">") + } else { + quoted = append(quoted, "> "+l) + } + } + var deduped []string + for i, line := range quoted { + if line == ">" && i > 0 && deduped[len(deduped)-1] == ">" { + continue + } + deduped = append(deduped, line) + } + c.write("\n\n" + strings.Join(deduped, "\n") + "\n\n") + return + + case "img": + src := normalizeAttr(getAttr(n, "src")) + if src == "" { + src = normalizeAttr(getAttr(n, "data-src")) + } + if src == "" { + return + } + alt := escapeMdAlt(normalizeAttr(getAttr(n, "alt"))) + if isSafeImageSrc(src) { + c.write("![" + alt + "](" + src + ")") + } + return + } + + // Traverse Children + for ch := n.FirstChild; ch != nil; ch = ch.NextSibling { + c.walk(ch) + } + + // Closing Tags + switch n.Data { + + // Pop buffer, trim, wrap with the correct marker. + case "b", "strong", "i", "em", "del", "s": + if len(c.emphStack) == 0 { + break + } + marker := c.emphStack[len(c.emphStack)-1] + c.emphStack = c.emphStack[:len(c.emphStack)-1] + inner := strings.TrimSpace(c.popBuf()) + if inner != "" { + c.write(marker + inner + marker) + } + + case "a": + if len(c.linkStates) == 0 { + break + } + hasHref := c.linkStates[len(c.linkStates)-1] + c.linkStates = c.linkStates[:len(c.linkStates)-1] + if !hasHref { + break + } + href := c.linkHrefs[len(c.linkHrefs)-1] + c.linkHrefs = c.linkHrefs[:len(c.linkHrefs)-1] + inner := strings.TrimSpace(c.popBuf()) + if strings.Contains(inner, "\n") { + lines := strings.Split(inner, "\n") + linked := false + for i, l := range lines { + cleanLine := strings.TrimSpace(l) + if cleanLine != "" && !strings.HasPrefix(cleanLine, "![") && !linked { + lines[i] = "[" + cleanLine + "](" + href + ")" + linked = true + } + } + c.write(strings.Join(lines, "\n")) + } else { + c.write("[" + inner + "](" + href + ")") + } + + case "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "p", + "div", + "section", + "article", + "header", + "footer", + "aside", + "nav", + "figure": + c.write("\n") + + case "ol": + c.listDepth-- + if len(c.olCounters) > 0 { + c.olCounters = c.olCounters[:len(c.olCounters)-1] + } + if c.listDepth == 0 { + c.write("\n") + } + case "ul": + c.listDepth-- + if c.listDepth == 0 { + c.write("\n") + } + + case "pre": + c.inPre = false + c.write("\n```\n\n") + case "code": + if !c.inPre { + c.write("`") + } + } +} + +func HtmlToMarkdown(htmlStr string) (string, error) { + doc, err := html.Parse(strings.NewReader(htmlStr)) + if err != nil { + return "", err + } + + c := newConverter() + c.walk(doc) + + res := c.stack[0].String() + + // Post-processing + res = reImageOnlyLink.ReplaceAllString(res, "") + res = reEmptyListItem.ReplaceAllString(res, "") + res = reEmptyHeader.ReplaceAllString(res, "") + + lines := strings.Split(res, "\n") + var cleanLines []string + for _, line := range lines { + line = strings.TrimRight(line, " \t") + cleanTest := strings.TrimSpace(line) + if cleanTest == "[]()" || cleanTest == "[](#)" || cleanTest == "-" { + cleanLines = append(cleanLines, "") + continue + } + cleanLines = append(cleanLines, line) + } + res = strings.Join(cleanLines, "\n") + + res = strings.TrimSpace(res) + res = reNewlines.ReplaceAllString(res, "\n\n") + + // Strip a single leading space from lines that are NOT list indentation. + // "(?m)^([ \t])([^ \t\n])" matches exactly one space/tab at line start followed + // by a non-whitespace char, so " - nested" (4 spaces) is left untouched. + res = reLeadingLineSpace.ReplaceAllString(res, "$2") + + return res, nil +} diff --git a/pkg/utils/markdown_test.go b/pkg/utils/markdown_test.go new file mode 100644 index 000000000..72277fb91 --- /dev/null +++ b/pkg/utils/markdown_test.go @@ -0,0 +1,245 @@ +package utils + +import ( + "testing" + + "github.com/sipeed/picoclaw/pkg/logger" +) + +func TestHtmlToMarkdown(t *testing.T) { + // Define our test cases + tests := []struct { + name string + input string + expected string + }{ + { + name: "Removes scripts and styles", + input: `

Clean text

`, + expected: "Clean text", + }, + { + name: "Extracts links correctly", + input: `Visit my website for info.`, + expected: "Visit my [website](https://example.com) for info.", + }, + { + name: "Converts headers (H1, H2, H3)", + input: `

Main Title

Subtitle

Section

`, + expected: "# Main Title\n\n## Subtitle\n\n### Section", + }, + { + name: "Handles bold and italics", + input: `Text bold and strong, then italic and em.`, + expected: "Text **bold** and **strong**, then *italic* and *em*.", + }, + { + name: "Converts lists", + input: ``, + expected: "- First element\n- Second element", + }, + { + name: "Handles paragraphs and line breaks (
)", + input: `

First paragraph

Second paragraph with
a line break.

`, + expected: "First paragraph\n\nSecond paragraph with\na line break.", + }, + { + name: "Decodes HTML entities", + input: `Math: 5 > 3 & 2 < 4. A "quote".`, + expected: "Math: 5 > 3 & 2 < 4. A \"quote\".", + }, + { + name: "Cleans up residual HTML tags", + input: `
Text inside div and span
`, + expected: "Text inside div and span", + }, + { + name: "Removes multiple spaces and excessive empty lines", + input: `This text has too many spaces.



And too many newlines.`, + expected: "This text has too many spaces.\n\nAnd too many newlines.", + }, + { + name: "Nested lists with indentation", + input: "", + // Expect the sub-element to have 4 spaces of indentation + expected: "- One\n - Two", + }, + { + name: "Image support", + input: `alternative text`, + // Correct Markdown syntax for images + expected: "![alternative text](image.jpg)", + }, + { + name: "Image support without alt-text", + input: ``, + // If alt is missing, square brackets remain empty + expected: "![](image.jpg)", + }, + { + name: "XSS Bypass on Links (Obfuscated HTML entities)", + // The Go HTML parser resolves entities, so this becomes "javascript:alert(1)" + input: `Click here`, + // Our isSafeHref (if updated with net/url) should neutralize it to "#" + expected: "[Click here](#)", + }, + { + name: "Empty link or used as anchor", + input: ``, + // With no text or href, it shouldn't print anything (not even empty brackets) + expected: "", + }, + { + name: "Link without href but with text (Textual anchor)", + input: `Back to top`, + // Should extract only plain text, without generating a broken Markdown link like [Back to top](#) or [Back to top]() + expected: "Back to top", + }, + { + name: "Badly spaced bold and italics (Edge Case)", + input: ` Text `, + // In Markdown `** Text **` is often not formatted correctly. The ideal is `**Text**` + expected: "**Text**", + }, + { + name: "Complex Test - Real Article", + input: ` +

Article Title

+

This is an introductory text with a link.

+

Subtitle

+ + + `, + // Note: The indentation of the real HTML test will generate spaces that + // regex will clean up. + expected: "# Article Title\n\nThis is an **introductory text** with a [link](http://link.com).\n\n## Subtitle\n\n- Point one\n- Point two", + }, + { + name: "Ordered list (OL)", + input: `
  1. First
  2. Second
  3. Third
`, + expected: "1. First\n2. Second\n3. Third", + }, + { + name: "Ordered list nested in unordered list", + input: ``, + expected: "- Fruits\n 1. Apples\n 2. Pears\n- Vegetables", + }, + { + name: "Code block (pre/code)", + input: "
func main() {\n    fmt.Println(\"hello\")\n}
", + expected: "```\nfunc main() {\n fmt.Println(\"hello\")\n}\n```", + }, + { + name: "Inline code", + input: `

Use the command go test ./... to run the tests.

`, + expected: "Use the command `go test ./...` to run the tests.", + }, + { + name: "Simple blockquote", + input: `

An important quote.

`, + expected: "> An important quote.", + }, + { + name: "Multiline blockquote", + input: `

First line of the quote.

Second line of the quote.

`, + expected: "> First line of the quote.\n>\n> Second line of the quote.", + }, + { + name: "Strikethrough text (del/s)", + input: `This text is deleted and this is crossed out.`, + expected: "This text is ~~deleted~~ and this is ~~crossed out~~.", + }, + { + name: "Horizontal separator (HR)", + input: `

Above the line


Below the line

`, + expected: "Above the line\n\n---\n\nBelow the line", + }, + { + name: "Bold nested in link", + input: `Linked bold text`, + expected: "[**Linked bold text**](https://example.com)", + }, + { + name: "data-src Image (lazy loading)", + input: `Lazy image`, + expected: "![Lazy image](lazy.jpg)", + }, + { + name: "Image with javascript: src blocked", + input: `XSS`, + // src is not safe, so the image is not emitted + expected: "", + }, + { + name: "Link with data: href blocked", + input: `Click`, + expected: "[Click](#)", + }, + { + name: "Deeply nested divs", + input: `

Deeply nested text

`, + expected: "Deeply nested text", + }, + { + name: "Non-consecutive headers (H1, H3, H5)", + input: `

Title

Subsection

Sub-subsection
`, + expected: "# Title\n\n### Subsection\n\n##### Sub-subsection", + }, + { + name: "Paragraph with mixed multiple emphasis", + input: `

Important: read the critical instructions carefully.

`, + expected: "**Important:** read the ***critical instructions*** *carefully*.", + }, + { + name: "Article with nav and aside sections (noise to filter)", + input: ` + +
+

Article title

+

This is the body of the article.

+
+ + `, + expected: "## Article title\n\nThis is the body of the article.", + }, + { + name: "Text with mixed special HTML entities", + input: `Copyright © 2024 — All rights reserved ®`, + expected: "Copyright © 2024 — All rights reserved ®", + }, + { + name: "Mailto link", + input: `Write to us at info@example.com`, + expected: "Write to us at [info@example.com](mailto:info@example.com)", + }, + { + name: "Image inside a link (clickable figure)", + input: `Photo`, + // The image-link without text must not generate broken markup + expected: "[![Photo](photo.jpg)](https://example.com)", + }, + { + name: "Empty content or only whitespace", + input: `

`, + expected: "", + }, + } + + // Iterate over all test cases + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := HtmlToMarkdown(tt.input) + if err != nil { + logger.ErrorCF("tool", "Failed to parse html to markdown: %s", map[string]any{"error": err.Error()}) + } + + if got != tt.expected { + t.Errorf("\nTest case failed: %s\nInput: %q\nGot: %q\nExpected: %q", + tt.name, tt.input, got, tt.expected) + } + }) + } +}