Merge pull request #1622 from afjcjsbx/feat/markdown-output-format-web-fetch

feat(tool): markdown format in web_fetch tool output
2026-06-12 18:08:54 +00:00 · 2026-03-18 09:15:13 +08:00
parent 513537d230 9835e821d7
commit cefa140bd2
14 changed files with 794 additions and 43 deletions
@@ -253,6 +253,9 @@ picoclaw onboard
  },
  "tools": {
    "web": {
+      "enabled": true,
+      "fetch_limit_bytes": 10485760,
+      "format": "plaintext",
      "brave": {
        "enabled": false,
        "api_key": "VOTRE_CLE_API_BRAVE",
@@ -218,6 +218,9 @@ picoclaw onboard
  },
  "tools": {
    "web": {
+      "enabled": true,
+      "fetch_limit_bytes": 10485760,
+      "format": "plaintext",
      "search": {
        "api_key": "YOUR_BRAVE_API_KEY",
        "max_results": 5
@@ -272,6 +272,9 @@ picoclaw onboard
  ],
  "tools": {
    "web": {
+      "enabled": true,
+      "fetch_limit_bytes": 10485760,
+      "format": "plaintext",
      "brave": {
        "enabled": false,
        "api_key": "YOUR_BRAVE_API_KEY",
@@ -247,6 +247,9 @@ picoclaw onboard
  },
  "tools": {
    "web": {
+      "enabled": true,
+      "fetch_limit_bytes": 10485760,
+      "format": "plaintext",
      "brave": {
        "enabled": false,
        "api_key": "YOUR_BRAVE_API_KEY",
@@ -257,6 +257,9 @@ picoclaw onboard
  ],
  "tools": {
    "web": {
+      "enabled": true,
+      "fetch_limit_bytes": 10485760,
+      "format": "plaintext",
      "brave": {
        "enabled": false,
        "api_key": "YOUR_BRAVE_API_KEY",
@@ -313,6 +313,8 @@
    "allow_write_paths": null,
    "web": {
      "enabled": true,
+      "fetch_limit_bytes": 10485760,
+      "format": "plaintext",
      "brave": {
        "enabled": false,
        "api_key": "YOUR_BRAVE_API_KEY",
@@ -30,6 +30,15 @@ PicoClaw's tools configuration is located in the `tools` field of `config.json`.

 Web tools are used for web search and fetching.

+### Web Fetcher
+General settings for fetching and processing webpage content.
+
+| Config              | Type   | Default       | Description                                                                                   |
+|---------------------|--------|---------------|-----------------------------------------------------------------------------------------------|
+| `enabled`           | bool   | true          | Enable the webpage fetching capability.                                                       |
+| `fetch_limit_bytes` | int    | 10485760      | Maximum size of the webpage payload to fetch, in bytes (default is 10MB).                     |
+| `format`            | string | "plaintext"   | Output format of the fetched content. Options: `plaintext` or `markdown` (recommended).       |
+
 ### Brave

 | Config        | Type   | Default | Description               |
@@ -161,12 +161,12 @@ func registerSharedTools(
 			}
 		}
 		if cfg.Tools.IsToolEnabled("web_fetch") {
-			fetchTool, err := tools.NewWebFetchToolWithConfig(
+			fetchTool, err := tools.NewWebFetchToolWithProxy(
 				50000,
 				cfg.Tools.Web.Proxy,
+				cfg.Tools.Web.Format,
 				cfg.Tools.Web.FetchLimitBytes,
-				cfg.Tools.Web.PrivateHostWhitelist,
-			)
+				cfg.Tools.Web.PrivateHostWhitelist)
 			if err != nil {
 				logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
 			} else {
@@ -697,6 +697,7 @@ type WebToolsConfig struct {
 	// For authenticated proxies, prefer HTTP_PROXY/HTTPS_PROXY env vars instead of embedding credentials in config.
 	Proxy                string              `json:"proxy,omitempty"                  env:"PICOCLAW_TOOLS_WEB_PROXY"`
 	FetchLimitBytes      int64               `json:"fetch_limit_bytes,omitempty"      env:"PICOCLAW_TOOLS_WEB_FETCH_LIMIT_BYTES"`
+	Format               string              `json:"format,omitempty"                 env:"PICOCLAW_TOOLS_WEB_FORMAT"`
 	PrivateHostWhitelist FlexibleStringSlice `json:"private_host_whitelist,omitempty" env:"PICOCLAW_TOOLS_WEB_PRIVATE_HOST_WHITELIST"`
 }

@@ -413,6 +413,7 @@ func DefaultConfig() *Config {
 				},
 				Proxy:           "",
 				FetchLimitBytes: 10 * 1024 * 1024, // 10MB by default
+				Format:          "plaintext",
 				Brave: BraveConfig{
 					Enabled:    false,
 					APIKey:     "",
@@ -7,6 +7,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"mime"
 	"net"
 	"net/http"
 	"net/url"
@@ -15,6 +16,7 @@ import (
 	"sync/atomic"
 	"time"

+	"github.com/sipeed/picoclaw/pkg/logger"
 	"github.com/sipeed/picoclaw/pkg/utils"
 )

@@ -776,6 +778,7 @@ type WebFetchTool struct {
 	maxChars        int
 	proxy           string
 	client          *http.Client
+	format          string
 	fetchLimitBytes int64
 	whitelist       *privateHostWhitelist
 }
@@ -785,22 +788,29 @@ type privateHostWhitelist struct {
 	cidrs []*net.IPNet
 }

-func NewWebFetchTool(maxChars int, fetchLimitBytes int64) (*WebFetchTool, error) {
+func NewWebFetchTool(maxChars int, format string, fetchLimitBytes int64) (*WebFetchTool, error) {
 	// createHTTPClient cannot fail with an empty proxy string.
-	return NewWebFetchToolWithConfig(maxChars, "", fetchLimitBytes, nil)
+	return NewWebFetchToolWithConfig(maxChars, "", format, fetchLimitBytes, nil)
 }

 // allowPrivateWebFetchHosts controls whether loopback/private hosts are allowed.
 // This is false in normal runtime to reduce SSRF exposure, and tests can override it temporarily.
 var allowPrivateWebFetchHosts atomic.Bool

-func NewWebFetchToolWithProxy(maxChars int, proxy string, fetchLimitBytes int64) (*WebFetchTool, error) {
-	return NewWebFetchToolWithConfig(maxChars, proxy, fetchLimitBytes, nil)
+func NewWebFetchToolWithProxy(
+	maxChars int,
+	proxy string,
+	format string,
+	fetchLimitBytes int64,
+	privateHostWhitelist []string,
+) (*WebFetchTool, error) {
+	return NewWebFetchToolWithConfig(maxChars, proxy, format, fetchLimitBytes, privateHostWhitelist)
 }

 func NewWebFetchToolWithConfig(
 	maxChars int,
 	proxy string,
+	format string,
 	fetchLimitBytes int64,
 	privateHostWhitelist []string,
 ) (*WebFetchTool, error) {
@@ -838,6 +848,7 @@ func NewWebFetchToolWithConfig(
 		maxChars:        maxChars,
 		proxy:           proxy,
 		client:          client,
+		format:          format,
 		fetchLimitBytes: fetchLimitBytes,
 		whitelist:       whitelist,
 	}, nil
@@ -926,26 +937,68 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe
 		return ErrorResult(fmt.Sprintf("failed to read response: %v", err))
 	}

+	bodyStr := string(body)
 	contentType := resp.Header.Get("Content-Type")

+	mediaType, params, err := mime.ParseMediaType(contentType)
+	if err != nil {
+		// The most common error here is "mime: no media type" if the header is empty.
+		logger.WarnCF("tool", "Failed to parse Content-Type", map[string]any{
+			"raw_header": contentType,
+			"error":      err.Error(),
+		})
+
+		// security fallback
+		mediaType = "application/octet-stream"
+	}
+
+	charset, hasCharset := params["charset"]
+	if hasCharset {
+		// If the charset is not utf-8, we might have to convert the bodyStr
+		// before passing it to the HTML/Markdown parser
+		if strings.ToLower(charset) != "utf-8" {
+			logger.WarnCF("tool", "Note: the content is not in UTF-8", map[string]any{"charset": charset})
+		}
+	}
+
 	var text, extractor string

-	if strings.Contains(contentType, "application/json") {
+	switch {
+	case mediaType == "application/json":
 		var jsonData any
-		if err := json.Unmarshal(body, &jsonData); err == nil {
-			formatted, _ := json.MarshalIndent(jsonData, "", "  ")
-			text = string(formatted)
-			extractor = "json"
-		} else {
-			text = string(body)
+		if err := json.Unmarshal(body, &jsonData); err != nil {
+			text = bodyStr
 			extractor = "raw"
+			break
 		}
-	} else if strings.Contains(contentType, "text/html") || len(body) > 0 &&
-		(strings.HasPrefix(string(body), "<!DOCTYPE") || strings.HasPrefix(strings.ToLower(string(body)), "<html")) {
-		text = t.extractText(string(body))
-		extractor = "text"
-	} else {
-		text = string(body)
+
+		formatted, err := json.MarshalIndent(jsonData, "", "  ")
+		if err != nil {
+			text = bodyStr
+			extractor = "raw"
+			break
+		}
+
+		text = string(formatted)
+		extractor = "json"
+
+	case mediaType == "text/html" || looksLikeHTML(bodyStr):
+		switch strings.ToLower(t.format) {
+		case "markdown":
+			var err error
+			text, err = utils.HtmlToMarkdown(bodyStr)
+			if err != nil {
+				return ErrorResult(fmt.Sprintf("failed to HTML to markdown: %v", err))
+			}
+			extractor = "markdown"
+
+		default:
+			text = t.extractText(bodyStr)
+			extractor = "text"
+		}
+
+	default:
+		text = bodyStr
 		extractor = "raw"
 	}

@@ -977,6 +1030,17 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe
 	}
 }

+// looksLikeHTML reports whether body begins with an HTML doctype declaration
+// or an <html> tag. The comparison is case-insensitive; an empty body is
+// never treated as HTML.
+func looksLikeHTML(body string) bool {
+	// Only the leading bytes matter, so lowercase a short prefix instead of
+	// copying the whole (possibly multi-megabyte) payload.
+	head := body
+	if len(head) > len("<!doctype") {
+		head = head[:len("<!doctype")]
+	}
+	head = strings.ToLower(head)
+
+	// Both checks must run on the lowercased prefix: pages normally start
+	// with "<!DOCTYPE html>", which a comparison of the lowercase literal
+	// against the raw body would miss.
+	return strings.HasPrefix(head, "<!doctype") ||
+		strings.HasPrefix(head, "<html")
+}
+
 func (t *WebFetchTool) extractText(htmlContent string) string {
 	result := reScript.ReplaceAllLiteralString(htmlContent, "")
 	result = reStyle.ReplaceAllLiteralString(result, "")
@@ -15,7 +15,10 @@ import (
 	"github.com/sipeed/picoclaw/pkg/logger"
 )

-const testFetchLimit = int64(10 * 1024 * 1024)
+const (
+	testFetchLimit = int64(10 * 1024 * 1024)
+	format         = "plaintext"
+)

 // TestWebTool_WebFetch_Success verifies successful URL fetching
 func TestWebTool_WebFetch_Success(t *testing.T) {
@@ -28,7 +31,7 @@ func TestWebTool_WebFetch_Success(t *testing.T) {
 	}))
 	defer server.Close()

-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		t.Fatalf("Failed to create web fetch tool: %v", err)
 	}
@@ -70,7 +73,7 @@ func TestWebTool_WebFetch_JSON(t *testing.T) {
 	}))
 	defer server.Close()

-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
 	}
@@ -95,7 +98,7 @@ func TestWebTool_WebFetch_JSON(t *testing.T) {

 // TestWebTool_WebFetch_InvalidURL verifies error handling for invalid URL
 func TestWebTool_WebFetch_InvalidURL(t *testing.T) {
-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
 	}
@@ -120,7 +123,7 @@ func TestWebTool_WebFetch_InvalidURL(t *testing.T) {

 // TestWebTool_WebFetch_UnsupportedScheme verifies error handling for non-http URLs
 func TestWebTool_WebFetch_UnsupportedScheme(t *testing.T) {
-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
 	}
@@ -145,7 +148,7 @@ func TestWebTool_WebFetch_UnsupportedScheme(t *testing.T) {

 // TestWebTool_WebFetch_MissingURL verifies error handling for missing URL
 func TestWebTool_WebFetch_MissingURL(t *testing.T) {
-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
 	}
@@ -179,7 +182,7 @@ func TestWebTool_WebFetch_Truncation(t *testing.T) {
 	}))
 	defer server.Close()

-	tool, err := NewWebFetchTool(1000, testFetchLimit) // Limit to 1000 chars
+	tool, err := NewWebFetchTool(1000, format, testFetchLimit) // Limit to 1000 chars
 	if err != nil {
 		logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
 	}
@@ -229,7 +232,7 @@ func TestWebFetchTool_PayloadTooLarge(t *testing.T) {
 	defer ts.Close()

 	// Initialize the tool
-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
 	}
@@ -312,7 +315,7 @@ func TestWebTool_WebFetch_HTMLExtraction(t *testing.T) {
 	}))
 	defer server.Close()

-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
 	}
@@ -448,7 +451,7 @@ func singleHostCIDR(t *testing.T, host string) string {
 }

 func TestWebTool_WebFetch_PrivateHostBlocked(t *testing.T) {
-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		t.Fatalf("Failed to create web fetch tool: %v", err)
 	}
@@ -474,7 +477,7 @@ func TestWebTool_WebFetch_PrivateHostAllowedByExactWhitelist(t *testing.T) {
 	defer server.Close()

 	host, _ := serverHostAndPort(t, server.URL)
-	tool, err := NewWebFetchToolWithConfig(50000, "", testFetchLimit, []string{host})
+	tool, err := NewWebFetchToolWithConfig(50000, "", format, testFetchLimit, []string{host})
 	if err != nil {
 		t.Fatalf("Failed to create web fetch tool: %v", err)
 	}
@@ -499,7 +502,7 @@ func TestWebTool_WebFetch_PrivateHostAllowedByCIDRWhitelist(t *testing.T) {
 	defer server.Close()

 	host, _ := serverHostAndPort(t, server.URL)
-	tool, err := NewWebFetchToolWithConfig(50000, "", testFetchLimit, []string{singleHostCIDR(t, host)})
+	tool, err := NewWebFetchToolWithConfig(50000, "", format, testFetchLimit, []string{singleHostCIDR(t, host)})
 	if err != nil {
 		t.Fatalf("Failed to create web fetch tool: %v", err)
 	}
@@ -525,7 +528,7 @@ func TestWebTool_WebFetch_PrivateHostAllowedForTests(t *testing.T) {
 	}))
 	defer server.Close()

-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		t.Fatalf("Failed to create web fetch tool: %v", err)
 	}
@@ -540,7 +543,7 @@ func TestWebTool_WebFetch_PrivateHostAllowedForTests(t *testing.T) {

 // TestWebFetch_BlocksIPv4MappedIPv6Loopback verifies ::ffff:127.0.0.1 is blocked
 func TestWebFetch_BlocksIPv4MappedIPv6Loopback(t *testing.T) {
-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		t.Fatalf("Failed to create web fetch tool: %v", err)
 	}
@@ -555,7 +558,7 @@ func TestWebFetch_BlocksIPv4MappedIPv6Loopback(t *testing.T) {

 // TestWebFetch_BlocksMetadataIP verifies 169.254.169.254 is blocked
 func TestWebFetch_BlocksMetadataIP(t *testing.T) {
-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		t.Fatalf("Failed to create web fetch tool: %v", err)
 	}
@@ -570,7 +573,7 @@ func TestWebFetch_BlocksMetadataIP(t *testing.T) {

 // TestWebFetch_BlocksIPv6UniqueLocal verifies fc00::/7 addresses are blocked
 func TestWebFetch_BlocksIPv6UniqueLocal(t *testing.T) {
-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		t.Fatalf("Failed to create web fetch tool: %v", err)
 	}
@@ -585,7 +588,7 @@ func TestWebFetch_BlocksIPv6UniqueLocal(t *testing.T) {

 // TestWebFetch_Blocks6to4WithPrivateEmbed verifies 6to4 with private embedded IPv4 is blocked
 func TestWebFetch_Blocks6to4WithPrivateEmbed(t *testing.T) {
-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		t.Fatalf("Failed to create web fetch tool: %v", err)
 	}
@@ -601,7 +604,7 @@ func TestWebFetch_Blocks6to4WithPrivateEmbed(t *testing.T) {

 // TestWebFetch_Allows6to4WithPublicEmbed verifies 6to4 with public embedded IPv4 is NOT blocked
 func TestWebFetch_Allows6to4WithPublicEmbed(t *testing.T) {
-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		t.Fatalf("Failed to create web fetch tool: %v", err)
 	}
@@ -631,7 +634,7 @@ func TestWebFetch_RedirectToPrivateBlocked(t *testing.T) {
 	allowPrivateWebFetchHosts.Store(false)
 	defer allowPrivateWebFetchHosts.Store(true)

-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		t.Fatalf("Failed to create web fetch tool: %v", err)
 	}
@@ -752,7 +755,7 @@ func TestIsPrivateOrRestrictedIP_Table(t *testing.T) {

 // TestWebTool_WebFetch_MissingDomain verifies error handling for URL without domain
 func TestWebTool_WebFetch_MissingDomain(t *testing.T) {
-	tool, err := NewWebFetchTool(50000, testFetchLimit)
+	tool, err := NewWebFetchTool(50000, format, testFetchLimit)
 	if err != nil {
 		logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
 	}
@@ -776,7 +779,7 @@ func TestWebTool_WebFetch_MissingDomain(t *testing.T) {
 }

 func TestNewWebFetchToolWithProxy(t *testing.T) {
-	tool, err := NewWebFetchToolWithProxy(1024, "http://127.0.0.1:7890", testFetchLimit)
+	tool, err := NewWebFetchToolWithProxy(1024, "http://127.0.0.1:7890", format, testFetchLimit, nil)
 	if err != nil {
 		logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
 	} else if tool.maxChars != 1024 {
@@ -787,7 +790,7 @@ func TestNewWebFetchToolWithProxy(t *testing.T) {
 		t.Fatalf("proxy = %q, want %q", tool.proxy, "http://127.0.0.1:7890")
 	}

-	tool, err = NewWebFetchToolWithProxy(0, "http://127.0.0.1:7890", testFetchLimit)
+	tool, err = NewWebFetchToolWithProxy(0, "http://127.0.0.1:7890", format, testFetchLimit, nil)
 	if err != nil {
 		logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
 	}
@@ -798,7 +801,7 @@ func TestNewWebFetchToolWithProxy(t *testing.T) {
 }

 func TestNewWebFetchToolWithConfig_InvalidPrivateHostWhitelist(t *testing.T) {
-	_, err := NewWebFetchToolWithConfig(1024, "", testFetchLimit, []string{"not-an-ip-or-cidr"})
+	_, err := NewWebFetchToolWithConfig(1024, "", format, testFetchLimit, []string{"not-an-ip-or-cidr"})
 	if err == nil {
 		t.Fatal("expected invalid whitelist entry to fail")
 	}
@@ -0,0 +1,411 @@
+package utils
+
+import (
+	"bytes"
+	"net/url"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+// Regular expressions used while converting HTML to Markdown.
+var (
+	// reSpaces matches a run of spaces and/or tabs.
+	reSpaces           = regexp.MustCompile(`[ \t]+`)
+	// reNewlines matches three or more consecutive newlines.
+	reNewlines         = regexp.MustCompile(`\n{3,}`)
+	// reEmptyListItem matches a line holding only a "-" or "*" bullet marker.
+	reEmptyListItem    = regexp.MustCompile(`(?m)^[-*]\s*$`)
+	// reImageOnlyLink matches a link whose only content is an image with empty
+	// alt text, where both destinations are written in angle brackets.
+	reImageOnlyLink    = regexp.MustCompile(`\[!\[\]\(<[^>]*>\)\]\(<[^>]*>\)`)
+	// reEmptyHeader matches a line holding only one to six "#" characters.
+	reEmptyHeader      = regexp.MustCompile(`(?m)^#{1,6}\s*$`)
+	// reLeadingLineSpace matches exactly one leading space or tab that is
+	// directly followed by a non-whitespace character.
+	reLeadingLineSpace = regexp.MustCompile(`(?m)^([ \t])([^ \t\n])`)
+)
+
+// skipTags lists the element names that walk drops from the output together
+// with their entire subtree.
+var skipTags = map[string]bool{
+	"script": true, "style": true, "head": true,
+	"noscript": true, "template": true,
+	"nav": true, "footer": true, "aside": true, "header": true, "form": true, "dialog": true,
+}
+
+// isSafeHref reports whether href may be emitted as a link destination.
+// After trimming surrounding whitespace it rejects values that start with
+// "javascript:", "vbscript:" or "data:" (case-insensitively) and values that
+// url.Parse cannot parse. Otherwise it accepts only an empty scheme (relative
+// reference), http, https and mailto.
+func isSafeHref(href string) bool {
+	trimmed := strings.TrimSpace(href)
+	lowered := strings.ToLower(trimmed)
+	for _, blocked := range []string{"javascript:", "vbscript:", "data:"} {
+		if strings.HasPrefix(lowered, blocked) {
+			return false
+		}
+	}
+
+	parsed, err := url.Parse(trimmed)
+	if err != nil {
+		return false
+	}
+
+	switch strings.ToLower(parsed.Scheme) {
+	case "", "http", "https", "mailto":
+		return true
+	default:
+		return false
+	}
+}
+
+// isSafeImageSrc reports whether src may be emitted as an image source.
+// Inline "data:image/" URIs are accepted (matched case-insensitively after
+// trimming); every other value is subject to the same rules as isSafeHref.
+func isSafeImageSrc(src string) bool {
+	normalized := strings.ToLower(strings.TrimSpace(src))
+	return strings.HasPrefix(normalized, "data:image/") || isSafeHref(src)
+}
+
+// escapeMdAlt returns s with every backslash, '[' and ']' preceded by a
+// backslash.
+func escapeMdAlt(s string) string {
+	// A single-pass replacer gives the same result as escaping backslashes
+	// first and brackets afterwards: inserted backslashes are never rescanned.
+	return strings.NewReplacer(`\`, `\\`, `[`, `\[`, `]`, `\]`).Replace(s)
+}
+
+// getAttr returns the value of the first attribute of n whose key equals key,
+// or the empty string when n has no such attribute.
+func getAttr(n *html.Node, key string) string {
+	for i := range n.Attr {
+		if attr := &n.Attr[i]; attr.Key == key {
+			return attr.Val
+		}
+	}
+	return ""
+}
+
+// normalizeAttr removes every newline, carriage return and tab from val and
+// then trims surrounding whitespace.
+func normalizeAttr(val string) string {
+	stripped := strings.NewReplacer("\n", "", "\r", "", "\t", "").Replace(val)
+	return strings.TrimSpace(stripped)
+}
+
+// isUnlikelyNode reports whether element n looks like page boilerplate, judged
+// by its class and id attributes. Non-element nodes and elements with neither
+// attribute are never flagged. A class/id mentioning "article", "main" or
+// "content" is always kept; otherwise the node is flagged when the combined
+// class/id contains one of the boilerplate keywords below.
+func isUnlikelyNode(n *html.Node) bool {
+	if n.Type != html.ElementNode {
+		return false
+	}
+
+	classID := strings.ToLower(getAttr(n, "class") + " " + getAttr(n, "id"))
+	if classID == " " {
+		// Both attributes are empty: nothing to judge the element by.
+		return false
+	}
+
+	for _, keep := range []string{"article", "main", "content"} {
+		if strings.Contains(classID, keep) {
+			return false
+		}
+	}
+
+	for _, drop := range []string{
+		"menu",
+		"nav",
+		"footer",
+		"sidebar",
+		"cookie",
+		"banner",
+		"sponsor",
+		"advert",
+		"popup",
+		"modal",
+		"newsletter",
+		"share",
+		"social",
+	} {
+		if strings.Contains(classID, drop) {
+			return true
+		}
+	}
+	return false
+}
+
+// converter holds the state walk carries while rendering an HTML tree as
+// Markdown.
+type converter struct {
+	// stack holds the output buffers; write appends to the last one, and
+	// pushBuf/popBuf add and remove scratch buffers on top of it.
+	stack      []*bytes.Buffer
+	// linkHrefs holds the destination of each open <a> that has an href.
+	linkHrefs  []string
+	// linkStates holds one entry per open <a>: true when it had an href and
+	// therefore pushed a buffer.
+	linkStates []bool
+	emphStack  []string // Tracks "**", "*", "~~" for buffered emphasis
+	// olCounters holds the next item number for each open <ol>.
+	olCounters []int
+	// inPre is true while walking the children of a <pre> element.
+	inPre      bool
+	// listDepth counts the currently open <ol>/<ul> elements.
+	listDepth  int
+}
+
+// newConverter returns a converter whose buffer stack holds a single empty
+// root buffer.
+func newConverter() *converter {
+	root := new(bytes.Buffer)
+	return &converter{stack: []*bytes.Buffer{root}}
+}
+
+// write appends s to the buffer on top of the stack.
+func (c *converter) write(s string) {
+	top := c.stack[len(c.stack)-1]
+	top.WriteString(s)
+}
+
+// pushBuf puts a fresh empty buffer on top of the stack, so that subsequent
+// writes are captured separately until popBuf is called.
+func (c *converter) pushBuf() {
+	c.stack = append(c.stack, new(bytes.Buffer))
+}
+
+// popBuf removes the buffer on top of the stack and returns its contents.
+func (c *converter) popBuf() string {
+	last := len(c.stack) - 1
+	buf := c.stack[last]
+	c.stack = c.stack[:last]
+	return buf.String()
+}
+
+// walk renders node n and its descendants as Markdown into the buffer on top
+// of c.stack.
+//
+// Elements listed in skipTags or flagged by isUnlikelyNode are dropped with
+// their whole subtree. Text nodes are written directly. Every other element
+// writes its opening marker, walks its children, then writes its closing
+// marker; <blockquote> and <img> are handled completely in the opening switch
+// and return early.
+func (c *converter) walk(n *html.Node) {
+	if n.Type == html.ElementNode {
+		if skipTags[n.Data] {
+			return
+		}
+		if isUnlikelyNode(n) {
+			return
+		}
+	}
+
+	if n.Type == html.TextNode {
+		text := n.Data
+		// Outside <pre>, fold newlines and runs of spaces/tabs into one space.
+		if !c.inPre {
+			text = strings.ReplaceAll(text, "\n", " ")
+			text = reSpaces.ReplaceAllString(text, " ")
+		}
+		if text != "" {
+			c.write(text)
+		}
+		return
+	}
+
+	// Remaining non-element nodes produce no output of their own; just descend.
+	if n.Type != html.ElementNode {
+		for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {
+			c.walk(ch)
+		}
+		return
+	}
+
+	// Opening Tags
+	switch n.Data {
+	// Buffer emphasis content so we can TrimSpace the inner text,
+	// avoiding the regex-across-boundaries bug.
+	case "b", "strong":
+		c.emphStack = append(c.emphStack, "**")
+		c.pushBuf()
+	case "i", "em":
+		c.emphStack = append(c.emphStack, "*")
+		c.pushBuf()
+	case "del", "s":
+		c.emphStack = append(c.emphStack, "~~")
+		c.pushBuf()
+
+	case "a":
+		href := normalizeAttr(getAttr(n, "href"))
+		// An unsafe destination is replaced with "#"; the link text is kept.
+		if href != "" && !isSafeHref(href) {
+			href = "#"
+		}
+		hasHref := href != ""
+		// Only anchors with an href get a buffer; linkStates records that so
+		// the closing branch knows whether there is one to pop.
+		c.linkStates = append(c.linkStates, hasHref)
+		if hasHref {
+			c.linkHrefs = append(c.linkHrefs, href)
+			c.pushBuf()
+		}
+
+	case "h1":
+		c.write("\n\n# ")
+	case "h2":
+		c.write("\n\n## ")
+	case "h3":
+		c.write("\n\n### ")
+	case "h4":
+		c.write("\n\n#### ")
+	case "h5":
+		c.write("\n\n##### ")
+	case "h6":
+		c.write("\n\n###### ")
+
+	case "p":
+		c.write("\n\n")
+	case "br":
+		c.write("\n")
+	case "hr":
+		c.write("\n\n---\n\n")
+
+	case "ol":
+		c.olCounters = append(c.olCounters, 1)
+		// Only write leading newline for top-level list.
+		if c.listDepth == 0 {
+			c.write("\n")
+		}
+		c.listDepth++
+	case "ul":
+		if c.listDepth == 0 {
+			c.write("\n")
+		}
+		c.listDepth++
+	case "li":
+		c.write("\n")
+		// Nested items are indented four spaces per level below the first.
+		if c.listDepth > 1 {
+			c.write(strings.Repeat("    ", c.listDepth-1))
+		}
+		// Items whose direct parent is <ol> are numbered; all others get "- ".
+		if n.Parent != nil && n.Parent.Data == "ol" && len(c.olCounters) > 0 {
+			idx := c.olCounters[len(c.olCounters)-1]
+			c.write(strconv.Itoa(idx) + ". ")
+			c.olCounters[len(c.olCounters)-1]++
+		} else {
+			c.write("- ")
+		}
+
+	case "pre":
+		c.inPre = true
+		c.write("\n\n```\n")
+	case "code":
+		// Inside <pre> the fence already marks the code; no inline backticks.
+		if !c.inPre {
+			c.write("`")
+		}
+
+	case "blockquote":
+		// Handled entirely here: render the children into a scratch buffer,
+		// prefix every line with ">", collapse consecutive empty quote lines,
+		// and return without reaching the generic child traversal.
+		c.pushBuf()
+		for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {
+			c.walk(ch)
+		}
+		inner := strings.TrimSpace(c.popBuf())
+		lines := strings.Split(inner, "\n")
+		var quoted []string
+		for _, l := range lines {
+			if strings.TrimSpace(l) == "" {
+				quoted = append(quoted, ">")
+			} else {
+				quoted = append(quoted, "> "+l)
+			}
+		}
+		var deduped []string
+		for i, line := range quoted {
+			if line == ">" && i > 0 && deduped[len(deduped)-1] == ">" {
+				continue
+			}
+			deduped = append(deduped, line)
+		}
+		c.write("\n\n" + strings.Join(deduped, "\n") + "\n\n")
+		return
+
+	case "img":
+		// Fall back to data-src when src is empty; images without any source
+		// or with an unsafe source are omitted.
+		src := normalizeAttr(getAttr(n, "src"))
+		if src == "" {
+			src = normalizeAttr(getAttr(n, "data-src"))
+		}
+		if src == "" {
+			return
+		}
+		alt := escapeMdAlt(normalizeAttr(getAttr(n, "alt")))
+		if isSafeImageSrc(src) {
+			c.write("![" + alt + "](" + src + ")")
+		}
+		return
+	}
+
+	// Traverse Children
+	for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {
+		c.walk(ch)
+	}
+
+	// Closing Tags
+	switch n.Data {
+	// Pop buffer, trim, wrap with the correct marker.
+	case "b", "strong", "i", "em", "del", "s":
+		if len(c.emphStack) == 0 {
+			break
+		}
+		marker := c.emphStack[len(c.emphStack)-1]
+		c.emphStack = c.emphStack[:len(c.emphStack)-1]
+		inner := strings.TrimSpace(c.popBuf())
+		// Emphasis around empty content is dropped entirely.
+		if inner != "" {
+			c.write(marker + inner + marker)
+		}
+
+	case "a":
+		if len(c.linkStates) == 0 {
+			break
+		}
+		hasHref := c.linkStates[len(c.linkStates)-1]
+		c.linkStates = c.linkStates[:len(c.linkStates)-1]
+		// Anchors without an href never pushed a buffer; their content was
+		// already written as plain text.
+		if !hasHref {
+			break
+		}
+		href := c.linkHrefs[len(c.linkHrefs)-1]
+		c.linkHrefs = c.linkHrefs[:len(c.linkHrefs)-1]
+		inner := strings.TrimSpace(c.popBuf())
+		if strings.Contains(inner, "\n") {
+			// Multi-line content: only the first non-empty line that does not
+			// start with an image becomes the link; other lines are kept as is.
+			lines := strings.Split(inner, "\n")
+			linked := false
+			for i, l := range lines {
+				cleanLine := strings.TrimSpace(l)
+				if cleanLine != "" && !strings.HasPrefix(cleanLine, "![") && !linked {
+					lines[i] = "[" + cleanLine + "](" + href + ")"
+					linked = true
+				}
+			}
+			c.write(strings.Join(lines, "\n"))
+		} else {
+			c.write("[" + inner + "](" + href + ")")
+		}
+
+	// NOTE(review): "header", "footer", "aside" and "nav" are also in skipTags,
+	// so walk returns before reaching this switch for those elements.
+	case "h1",
+		"h2",
+		"h3",
+		"h4",
+		"h5",
+		"h6",
+		"p",
+		"div",
+		"section",
+		"article",
+		"header",
+		"footer",
+		"aside",
+		"nav",
+		"figure":
+		c.write("\n")
+
+	case "ol":
+		c.listDepth--
+		if len(c.olCounters) > 0 {
+			c.olCounters = c.olCounters[:len(c.olCounters)-1]
+		}
+		if c.listDepth == 0 {
+			c.write("\n")
+		}
+	case "ul":
+		c.listDepth--
+		if c.listDepth == 0 {
+			c.write("\n")
+		}
+
+	case "pre":
+		c.inPre = false
+		c.write("\n```\n\n")
+	case "code":
+		if !c.inPre {
+			c.write("`")
+		}
+	}
+}
+
+func HtmlToMarkdown(htmlStr string) (string, error) {
+	doc, err := html.Parse(strings.NewReader(htmlStr))
+	if err != nil {
+		return "", err
+	}
+
+	c := newConverter()
+	c.walk(doc)
+
+	res := c.stack[0].String()
+
+	// Post-processing
+	res = reImageOnlyLink.ReplaceAllString(res, "")
+	res = reEmptyListItem.ReplaceAllString(res, "")
+	res = reEmptyHeader.ReplaceAllString(res, "")
+
+	lines := strings.Split(res, "\n")
+	var cleanLines []string
+	for _, line := range lines {
+		line = strings.TrimRight(line, " \t")
+		cleanTest := strings.TrimSpace(line)
+		if cleanTest == "[](</>)" || cleanTest == "[](#)" || cleanTest == "-" {
+			cleanLines = append(cleanLines, "")
+			continue
+		}
+		cleanLines = append(cleanLines, line)
+	}
+	res = strings.Join(cleanLines, "\n")
+
+	res = strings.TrimSpace(res)
+	res = reNewlines.ReplaceAllString(res, "\n\n")
+
+	// Strip a single leading space from lines that are NOT list indentation.
+	// "(?m)^([ \t])([^ \t\n])" matches exactly one space/tab at line start followed
+	// by a non-whitespace char, so "    - nested" (4 spaces) is left untouched.
+	res = reLeadingLineSpace.ReplaceAllString(res, "$2")
+
+	return res, nil
+}
@@ -0,0 +1,245 @@
+package utils
+
+import (
+	"testing"
+
+	"github.com/sipeed/picoclaw/pkg/logger"
+)
+
+// TestHtmlToMarkdown runs HtmlToMarkdown over a table of HTML snippets and
+// compares the produced Markdown with the exact expected string. Each case is
+// executed as its own subtest; a conversion error fails the subtest immediately.
+func TestHtmlToMarkdown(t *testing.T) {
+	// Define our test cases
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "Removes scripts and styles",
+			input:    `<script>alert("hello");</script><style>body { color: red; }</style><p>Clean text</p>`,
+			expected: "Clean text",
+		},
+		{
+			name:     "Extracts links correctly",
+			input:    `Visit my <a href="https://example.com">website</a> for info.`,
+			expected: "Visit my [website](https://example.com) for info.",
+		},
+		{
+			name:     "Converts headers (H1, H2, H3)",
+			input:    `<h1>Main Title</h1><h2>Subtitle</h2><h3>Section</h3>`,
+			expected: "# Main Title\n\n## Subtitle\n\n### Section",
+		},
+		{
+			name:     "Handles bold and italics",
+			input:    `Text <b>bold</b> and <strong>strong</strong>, then <i>italic</i> and <em>em</em>.`,
+			expected: "Text **bold** and **strong**, then *italic* and *em*.",
+		},
+		{
+			name:     "Converts lists",
+			input:    `<ul><li>First element</li><li>Second element</li></ul>`,
+			expected: "- First element\n- Second element",
+		},
+		{
+			name:     "Handles paragraphs and line breaks (<br>)",
+			input:    `<p>First paragraph</p><p>Second paragraph with<br>a line break.</p>`,
+			expected: "First paragraph\n\nSecond paragraph with\na line break.",
+		},
+		{
+			name:     "Decodes HTML entities",
+			input:    `Math: 5 &gt; 3 &amp; 2 &lt; 4. A &quot;quote&quot;.`,
+			expected: "Math: 5 > 3 & 2 < 4. A \"quote\".",
+		},
+		{
+			name:     "Cleans up residual HTML tags",
+			input:    `<div><span>Text inside div and span</span></div>`,
+			expected: "Text inside div and span",
+		},
+		{
+			name:     "Removes multiple spaces and excessive empty lines",
+			input:    `This   text    has too many spaces. <br><br><br><br> And too many newlines.`,
+			expected: "This text has too many spaces.\n\nAnd too many newlines.",
+		},
+		{
+			name:  "Nested lists with indentation",
+			input: "<ul><li>One<ul><li>Two</li></ul></li></ul>",
+			// Expect the sub-element to have 4 spaces of indentation
+			expected: "- One\n    - Two",
+		},
+		{
+			name:  "Image support",
+			input: `<img src="image.jpg" alt="alternative text">`,
+			// Correct Markdown syntax for images
+			expected: "![alternative text](image.jpg)",
+		},
+		{
+			name:  "Image support without alt-text",
+			input: `<img src="image.jpg">`,
+			// If alt is missing, square brackets remain empty
+			expected: "![](image.jpg)",
+		},
+		{
+			name: "XSS Bypass on Links (Obfuscated HTML entities)",
+			// The HTML parser decodes &#x09; to a tab character, so the href is
+			// "jav<TAB>ascript:alert(1)" rather than a literal "javascript:" URL.
+			input: `<a href="jav&#x09;ascript:alert(1)">Click here</a>`,
+			// The obfuscated scheme must still be treated as unsafe and replaced by "#"
+			expected: "[Click here](#)",
+		},
+		{
+			name:  "Empty link or used as anchor",
+			input: `<a name="top"></a>`,
+			// With no text or href, it shouldn't print anything (not even empty brackets)
+			expected: "",
+		},
+		{
+			name:  "Link without href but with text (Textual anchor)",
+			input: `<a id="top">Back to top</a>`,
+			// Should extract only plain text, without generating a broken Markdown link like [Back to top](#) or [Back to top]()
+			expected: "Back to top",
+		},
+		{
+			name:  "Badly spaced bold and italics (Edge Case)",
+			input: `<b> Text </b>`,
+			// In Markdown `** Text **` is often not formatted correctly. The ideal is `**Text**`
+			expected: "**Text**",
+		},
+		{
+			name: "Complex Test - Real Article",
+			input: `
+             <h1>Article Title</h1>
+             <p>This is an <strong>introductory text</strong> with a <a href="http://link.com">link</a>.</p>
+             <h2>Subtitle</h2>
+             <ul>
+                <li>Point one</li>
+                <li>Point two</li>
+             </ul>
+             <script>console.log("do not show me")</script>
+          `,
+			// Note: The indentation of the real HTML test will generate spaces that
+			// regex will clean up.
+			expected: "# Article Title\n\nThis is an **introductory text** with a [link](http://link.com).\n\n## Subtitle\n\n- Point one\n- Point two",
+		},
+		{
+			name:     "Ordered list (OL)",
+			input:    `<ol><li>First</li><li>Second</li><li>Third</li></ol>`,
+			expected: "1. First\n2. Second\n3. Third",
+		},
+		{
+			name:     "Ordered list nested in unordered list",
+			input:    `<ul><li>Fruits<ol><li>Apples</li><li>Pears</li></ol></li><li>Vegetables</li></ul>`,
+			expected: "- Fruits\n    1. Apples\n    2. Pears\n- Vegetables",
+		},
+		{
+			name:     "Code block (pre/code)",
+			input:    "<pre><code>func main() {\n    fmt.Println(\"hello\")\n}</code></pre>",
+			expected: "```\nfunc main() {\n    fmt.Println(\"hello\")\n}\n```",
+		},
+		{
+			name:     "Inline code",
+			input:    `<p>Use the command <code>go test ./...</code> to run the tests.</p>`,
+			expected: "Use the command `go test ./...` to run the tests.",
+		},
+		{
+			name:     "Simple blockquote",
+			input:    `<blockquote><p>An important quote.</p></blockquote>`,
+			expected: "> An important quote.",
+		},
+		{
+			name:     "Multiline blockquote",
+			input:    `<blockquote><p>First line of the quote.</p><p>Second line of the quote.</p></blockquote>`,
+			expected: "> First line of the quote.\n>\n> Second line of the quote.",
+		},
+		{
+			name:     "Strikethrough text (del/s)",
+			input:    `This text is <del>deleted</del> and this is <s>crossed out</s>.`,
+			expected: "This text is ~~deleted~~ and this is ~~crossed out~~.",
+		},
+		{
+			name:     "Horizontal separator (HR)",
+			input:    `<p>Above the line</p><hr><p>Below the line</p>`,
+			expected: "Above the line\n\n---\n\nBelow the line",
+		},
+		{
+			name:     "Bold nested in link",
+			input:    `<a href="https://example.com"><strong>Linked bold text</strong></a>`,
+			expected: "[**Linked bold text**](https://example.com)",
+		},
+		{
+			name:     "data-src Image (lazy loading)",
+			input:    `<img data-src="lazy.jpg" alt="Lazy image">`,
+			expected: "![Lazy image](lazy.jpg)",
+		},
+		{
+			name:  "Image with javascript: src blocked",
+			input: `<img src="javascript:alert(1)" alt="XSS">`,
+			// src is not safe, so the image is not emitted
+			expected: "",
+		},
+		{
+			name:     "Link with data: href blocked",
+			input:    `<a href="data:text/html,<script>alert(1)</script>">Click</a>`,
+			expected: "[Click](#)",
+		},
+		{
+			name:     "Deeply nested divs",
+			input:    `<div><div><div><div><p>Deeply nested text</p></div></div></div></div>`,
+			expected: "Deeply nested text",
+		},
+		{
+			name:     "Non-consecutive headers (H1, H3, H5)",
+			input:    `<h1>Title</h1><h3>Subsection</h3><h5>Sub-subsection</h5>`,
+			expected: "# Title\n\n### Subsection\n\n##### Sub-subsection",
+		},
+		{
+			name:     "Paragraph with mixed multiple emphasis",
+			input:    `<p><strong>Important:</strong> read the <strong><em>critical instructions</em></strong> <em>carefully</em>.</p>`,
+			expected: "**Important:** read the ***critical instructions*** *carefully*.",
+		},
+		{
+			name: "Article with nav and aside sections (noise to filter)",
+			input: `
+        <nav><a href="/home">Home</a><a href="/about-us">About us</a></nav>
+        <article>
+            <h2>Article title</h2>
+            <p>This is the body of the article.</p>
+        </article>
+        <aside><p>Advertisement</p></aside>
+       `,
+			expected: "## Article title\n\nThis is the body of the article.",
+		},
+		{
+			name:     "Text with mixed special HTML entities",
+			input:    `Copyright &copy; 2024 &mdash; All rights reserved &reg;`,
+			expected: "Copyright © 2024 — All rights reserved ®",
+		},
+		{
+			name:     "Mailto link",
+			input:    `Write to us at <a href="mailto:info@example.com">info@example.com</a>`,
+			expected: "Write to us at [info@example.com](mailto:info@example.com)",
+		},
+		{
+			name:  "Image inside a link (clickable figure)",
+			input: `<a href="https://example.com"><img src="photo.jpg" alt="Photo"></a>`,
+			// The image-link without text must not generate broken markup
+			expected: "[![Photo](photo.jpg)](https://example.com)",
+		},
+		{
+			name:     "Empty content or only whitespace",
+			input:    `   <p>  </p>  <div>   </div>  `,
+			expected: "",
+		},
+	}
+
+	// Iterate over all test cases
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := HtmlToMarkdown(tt.input)
+			if err != nil {
+				logger.ErrorCF("tool", "Failed to parse html to markdown: %s", map[string]any{"error": err.Error()})
+				// A conversion error makes the comparison below meaningless (and
+				// could even pass when the expected output is empty), so fail the
+				// subtest here instead of only logging.
+				t.Fatalf("HtmlToMarkdown(%q) returned unexpected error: %v", tt.input, err)
+			}
+
+			if got != tt.expected {
+				t.Errorf("\nTest case failed: %s\nInput:    %q\nGot:      %q\nExpected: %q",
+					tt.name, tt.input, got, tt.expected)
+			}
+		})
+	}
+}