Merge pull request #1622 from afjcjsbx/feat/markdown-output-format-web-fetch

feat(tool): markdown format in web_fetch tool output
This commit is contained in:
Meng Zhuo
2026-03-18 09:15:13 +08:00
committed by GitHub
14 changed files with 794 additions and 43 deletions
+3
View File
@@ -253,6 +253,9 @@ picoclaw onboard
},
"tools": {
"web": {
"enabled": true,
"fetch_limit_bytes": 10485760,
"format": "plaintext",
"brave": {
"enabled": false,
"api_key": "VOTRE_CLE_API_BRAVE",
+3
View File
@@ -218,6 +218,9 @@ picoclaw onboard
},
"tools": {
"web": {
"enabled": true,
"fetch_limit_bytes": 10485760,
"format": "plaintext",
"search": {
"api_key": "YOUR_BRAVE_API_KEY",
"max_results": 5
+3
View File
@@ -272,6 +272,9 @@ picoclaw onboard
],
"tools": {
"web": {
"enabled": true,
"fetch_limit_bytes": 10485760,
"format": "plaintext",
"brave": {
"enabled": false,
"api_key": "YOUR_BRAVE_API_KEY",
+3
View File
@@ -247,6 +247,9 @@ picoclaw onboard
},
"tools": {
"web": {
"enabled": true,
"fetch_limit_bytes": 10485760,
"format": "plaintext",
"brave": {
"enabled": false,
"api_key": "YOUR_BRAVE_API_KEY",
+3
View File
@@ -257,6 +257,9 @@ picoclaw onboard
],
"tools": {
"web": {
"enabled": true,
"fetch_limit_bytes": 10485760,
"format": "plaintext",
"brave": {
"enabled": false,
"api_key": "YOUR_BRAVE_API_KEY",
+2
View File
@@ -313,6 +313,8 @@
"allow_write_paths": null,
"web": {
"enabled": true,
"fetch_limit_bytes": 10485760,
"format": "plaintext",
"brave": {
"enabled": false,
"api_key": "YOUR_BRAVE_API_KEY",
+9
View File
@@ -30,6 +30,15 @@ PicoClaw's tools configuration is located in the `tools` field of `config.json`.
Web tools are used for web search and fetching.
### Web Fetcher
General settings for fetching and processing webpage content.
| Config | Type | Default | Description |
|---------------------|--------|---------------|-----------------------------------------------------------------------------------------------|
| `enabled` | bool | true | Enable the webpage fetching capability. |
| `fetch_limit_bytes` | int | 10485760 | Maximum size of the webpage payload to fetch, in bytes (default is 10MB). |
| `format` | string | "plaintext" | Output format of the fetched content. Options: `plaintext` or `markdown` (recommended). |
### Brave
| Config | Type | Default | Description |
+3 -3
View File
@@ -161,12 +161,12 @@ func registerSharedTools(
}
}
if cfg.Tools.IsToolEnabled("web_fetch") {
fetchTool, err := tools.NewWebFetchToolWithConfig(
fetchTool, err := tools.NewWebFetchToolWithProxy(
50000,
cfg.Tools.Web.Proxy,
cfg.Tools.Web.Format,
cfg.Tools.Web.FetchLimitBytes,
cfg.Tools.Web.PrivateHostWhitelist,
)
cfg.Tools.Web.PrivateHostWhitelist)
if err != nil {
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
} else {
+1
View File
@@ -697,6 +697,7 @@ type WebToolsConfig struct {
// For authenticated proxies, prefer HTTP_PROXY/HTTPS_PROXY env vars instead of embedding credentials in config.
Proxy string `json:"proxy,omitempty" env:"PICOCLAW_TOOLS_WEB_PROXY"`
FetchLimitBytes int64 `json:"fetch_limit_bytes,omitempty" env:"PICOCLAW_TOOLS_WEB_FETCH_LIMIT_BYTES"`
Format string `json:"format,omitempty" env:"PICOCLAW_TOOLS_WEB_FORMAT"`
PrivateHostWhitelist FlexibleStringSlice `json:"private_host_whitelist,omitempty" env:"PICOCLAW_TOOLS_WEB_PRIVATE_HOST_WHITELIST"`
}
+1
View File
@@ -413,6 +413,7 @@ func DefaultConfig() *Config {
},
Proxy: "",
FetchLimitBytes: 10 * 1024 * 1024, // 10MB by default
Format: "plaintext",
Brave: BraveConfig{
Enabled: false,
APIKey: "",
+81 -17
View File
@@ -7,6 +7,7 @@ import (
"errors"
"fmt"
"io"
"mime"
"net"
"net/http"
"net/url"
@@ -15,6 +16,7 @@ import (
"sync/atomic"
"time"
"github.com/sipeed/picoclaw/pkg/logger"
"github.com/sipeed/picoclaw/pkg/utils"
)
@@ -776,6 +778,7 @@ type WebFetchTool struct {
maxChars int
proxy string
client *http.Client
format string
fetchLimitBytes int64
whitelist *privateHostWhitelist
}
@@ -785,22 +788,29 @@ type privateHostWhitelist struct {
cidrs []*net.IPNet
}
func NewWebFetchTool(maxChars int, fetchLimitBytes int64) (*WebFetchTool, error) {
func NewWebFetchTool(maxChars int, format string, fetchLimitBytes int64) (*WebFetchTool, error) {
// createHTTPClient cannot fail with an empty proxy string.
return NewWebFetchToolWithConfig(maxChars, "", fetchLimitBytes, nil)
return NewWebFetchToolWithConfig(maxChars, "", format, fetchLimitBytes, nil)
}
// allowPrivateWebFetchHosts controls whether loopback/private hosts are allowed.
// This is false in normal runtime to reduce SSRF exposure, and tests can override it temporarily.
var allowPrivateWebFetchHosts atomic.Bool
func NewWebFetchToolWithProxy(maxChars int, proxy string, fetchLimitBytes int64) (*WebFetchTool, error) {
return NewWebFetchToolWithConfig(maxChars, proxy, fetchLimitBytes, nil)
func NewWebFetchToolWithProxy(
maxChars int,
proxy string,
format string,
fetchLimitBytes int64,
privateHostWhitelist []string,
) (*WebFetchTool, error) {
return NewWebFetchToolWithConfig(maxChars, proxy, format, fetchLimitBytes, privateHostWhitelist)
}
func NewWebFetchToolWithConfig(
maxChars int,
proxy string,
format string,
fetchLimitBytes int64,
privateHostWhitelist []string,
) (*WebFetchTool, error) {
@@ -838,6 +848,7 @@ func NewWebFetchToolWithConfig(
maxChars: maxChars,
proxy: proxy,
client: client,
format: format,
fetchLimitBytes: fetchLimitBytes,
whitelist: whitelist,
}, nil
@@ -926,26 +937,68 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe
return ErrorResult(fmt.Sprintf("failed to read response: %v", err))
}
bodyStr := string(body)
contentType := resp.Header.Get("Content-Type")
mediaType, params, err := mime.ParseMediaType(contentType)
if err != nil {
// The most common error here is "mime: no media type" if the header is empty.
logger.WarnCF("tool", "Failed to parse Content-Type", map[string]any{
"raw_header": contentType,
"error": err.Error(),
})
// security fallback
mediaType = "application/octet-stream"
}
charset, hasCharset := params["charset"]
if hasCharset {
// If the charset is not utf-8, we might have to convert the bodyStr
// before passing it to the HTML/Markdown parser
if strings.ToLower(charset) != "utf-8" {
logger.WarnCF("tool", "Note: the content is not in UTF-8", map[string]any{"charset": charset})
}
}
var text, extractor string
if strings.Contains(contentType, "application/json") {
switch {
case mediaType == "application/json":
var jsonData any
if err := json.Unmarshal(body, &jsonData); err == nil {
formatted, _ := json.MarshalIndent(jsonData, "", " ")
text = string(formatted)
extractor = "json"
} else {
text = string(body)
if err := json.Unmarshal(body, &jsonData); err != nil {
text = bodyStr
extractor = "raw"
break
}
} else if strings.Contains(contentType, "text/html") || len(body) > 0 &&
(strings.HasPrefix(string(body), "<!DOCTYPE") || strings.HasPrefix(strings.ToLower(string(body)), "<html")) {
text = t.extractText(string(body))
extractor = "text"
} else {
text = string(body)
formatted, err := json.MarshalIndent(jsonData, "", " ")
if err != nil {
text = bodyStr
extractor = "raw"
break
}
text = string(formatted)
extractor = "json"
case mediaType == "text/html" || looksLikeHTML(bodyStr):
switch strings.ToLower(t.format) {
case "markdown":
var err error
text, err = utils.HtmlToMarkdown(bodyStr)
if err != nil {
return ErrorResult(fmt.Sprintf("failed to HTML to markdown: %v", err))
}
extractor = "markdown"
default:
text = t.extractText(bodyStr)
extractor = "text"
}
default:
text = bodyStr
extractor = "raw"
}
@@ -977,6 +1030,17 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe
}
}
// looksLikeHTML reports whether body appears to be an HTML document, judged
// only by how it starts. It is used as a fallback when the Content-Type header
// does not identify the payload as text/html.
//
// A body is considered HTML when, after skipping a leading byte-order mark and
// ASCII whitespace, it begins with "<!doctype" or "<html" in any letter case.
// An empty body is never HTML.
func looksLikeHTML(body string) bool {
	// TrimLeft returns a sub-slice, so no copy of the (possibly multi-megabyte)
	// payload is made; only the first few bytes are ever inspected.
	head := strings.TrimLeft(body, "\ufeff \t\r\n\f")
	for _, prefix := range []string{"<!doctype", "<html"} {
		// Compare case-insensitively: the canonical spelling is "<!DOCTYPE",
		// which a case-sensitive check against the lowercase literal misses.
		if len(head) >= len(prefix) && strings.EqualFold(head[:len(prefix)], prefix) {
			return true
		}
	}
	return false
}
func (t *WebFetchTool) extractText(htmlContent string) string {
result := reScript.ReplaceAllLiteralString(htmlContent, "")
result = reStyle.ReplaceAllLiteralString(result, "")
+26 -23
View File
@@ -15,7 +15,10 @@ import (
"github.com/sipeed/picoclaw/pkg/logger"
)
const testFetchLimit = int64(10 * 1024 * 1024)
const (
testFetchLimit = int64(10 * 1024 * 1024)
format = "plaintext"
)
// TestWebTool_WebFetch_Success verifies successful URL fetching
func TestWebTool_WebFetch_Success(t *testing.T) {
@@ -28,7 +31,7 @@ func TestWebTool_WebFetch_Success(t *testing.T) {
}))
defer server.Close()
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
t.Fatalf("Failed to create web fetch tool: %v", err)
}
@@ -70,7 +73,7 @@ func TestWebTool_WebFetch_JSON(t *testing.T) {
}))
defer server.Close()
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
}
@@ -95,7 +98,7 @@ func TestWebTool_WebFetch_JSON(t *testing.T) {
// TestWebTool_WebFetch_InvalidURL verifies error handling for invalid URL
func TestWebTool_WebFetch_InvalidURL(t *testing.T) {
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
}
@@ -120,7 +123,7 @@ func TestWebTool_WebFetch_InvalidURL(t *testing.T) {
// TestWebTool_WebFetch_UnsupportedScheme verifies error handling for non-http URLs
func TestWebTool_WebFetch_UnsupportedScheme(t *testing.T) {
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
}
@@ -145,7 +148,7 @@ func TestWebTool_WebFetch_UnsupportedScheme(t *testing.T) {
// TestWebTool_WebFetch_MissingURL verifies error handling for missing URL
func TestWebTool_WebFetch_MissingURL(t *testing.T) {
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
}
@@ -179,7 +182,7 @@ func TestWebTool_WebFetch_Truncation(t *testing.T) {
}))
defer server.Close()
tool, err := NewWebFetchTool(1000, testFetchLimit) // Limit to 1000 chars
tool, err := NewWebFetchTool(1000, format, testFetchLimit) // Limit to 1000 chars
if err != nil {
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
}
@@ -229,7 +232,7 @@ func TestWebFetchTool_PayloadTooLarge(t *testing.T) {
defer ts.Close()
// Initialize the tool
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
}
@@ -312,7 +315,7 @@ func TestWebTool_WebFetch_HTMLExtraction(t *testing.T) {
}))
defer server.Close()
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
}
@@ -448,7 +451,7 @@ func singleHostCIDR(t *testing.T, host string) string {
}
func TestWebTool_WebFetch_PrivateHostBlocked(t *testing.T) {
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
t.Fatalf("Failed to create web fetch tool: %v", err)
}
@@ -474,7 +477,7 @@ func TestWebTool_WebFetch_PrivateHostAllowedByExactWhitelist(t *testing.T) {
defer server.Close()
host, _ := serverHostAndPort(t, server.URL)
tool, err := NewWebFetchToolWithConfig(50000, "", testFetchLimit, []string{host})
tool, err := NewWebFetchToolWithConfig(50000, "", format, testFetchLimit, []string{host})
if err != nil {
t.Fatalf("Failed to create web fetch tool: %v", err)
}
@@ -499,7 +502,7 @@ func TestWebTool_WebFetch_PrivateHostAllowedByCIDRWhitelist(t *testing.T) {
defer server.Close()
host, _ := serverHostAndPort(t, server.URL)
tool, err := NewWebFetchToolWithConfig(50000, "", testFetchLimit, []string{singleHostCIDR(t, host)})
tool, err := NewWebFetchToolWithConfig(50000, "", format, testFetchLimit, []string{singleHostCIDR(t, host)})
if err != nil {
t.Fatalf("Failed to create web fetch tool: %v", err)
}
@@ -525,7 +528,7 @@ func TestWebTool_WebFetch_PrivateHostAllowedForTests(t *testing.T) {
}))
defer server.Close()
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
t.Fatalf("Failed to create web fetch tool: %v", err)
}
@@ -540,7 +543,7 @@ func TestWebTool_WebFetch_PrivateHostAllowedForTests(t *testing.T) {
// TestWebFetch_BlocksIPv4MappedIPv6Loopback verifies ::ffff:127.0.0.1 is blocked
func TestWebFetch_BlocksIPv4MappedIPv6Loopback(t *testing.T) {
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
t.Fatalf("Failed to create web fetch tool: %v", err)
}
@@ -555,7 +558,7 @@ func TestWebFetch_BlocksIPv4MappedIPv6Loopback(t *testing.T) {
// TestWebFetch_BlocksMetadataIP verifies 169.254.169.254 is blocked
func TestWebFetch_BlocksMetadataIP(t *testing.T) {
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
t.Fatalf("Failed to create web fetch tool: %v", err)
}
@@ -570,7 +573,7 @@ func TestWebFetch_BlocksMetadataIP(t *testing.T) {
// TestWebFetch_BlocksIPv6UniqueLocal verifies fc00::/7 addresses are blocked
func TestWebFetch_BlocksIPv6UniqueLocal(t *testing.T) {
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
t.Fatalf("Failed to create web fetch tool: %v", err)
}
@@ -585,7 +588,7 @@ func TestWebFetch_BlocksIPv6UniqueLocal(t *testing.T) {
// TestWebFetch_Blocks6to4WithPrivateEmbed verifies 6to4 with private embedded IPv4 is blocked
func TestWebFetch_Blocks6to4WithPrivateEmbed(t *testing.T) {
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
t.Fatalf("Failed to create web fetch tool: %v", err)
}
@@ -601,7 +604,7 @@ func TestWebFetch_Blocks6to4WithPrivateEmbed(t *testing.T) {
// TestWebFetch_Allows6to4WithPublicEmbed verifies 6to4 with public embedded IPv4 is NOT blocked
func TestWebFetch_Allows6to4WithPublicEmbed(t *testing.T) {
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
t.Fatalf("Failed to create web fetch tool: %v", err)
}
@@ -631,7 +634,7 @@ func TestWebFetch_RedirectToPrivateBlocked(t *testing.T) {
allowPrivateWebFetchHosts.Store(false)
defer allowPrivateWebFetchHosts.Store(true)
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
t.Fatalf("Failed to create web fetch tool: %v", err)
}
@@ -752,7 +755,7 @@ func TestIsPrivateOrRestrictedIP_Table(t *testing.T) {
// TestWebTool_WebFetch_MissingDomain verifies error handling for URL without domain
func TestWebTool_WebFetch_MissingDomain(t *testing.T) {
tool, err := NewWebFetchTool(50000, testFetchLimit)
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
if err != nil {
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
}
@@ -776,7 +779,7 @@ func TestWebTool_WebFetch_MissingDomain(t *testing.T) {
}
func TestNewWebFetchToolWithProxy(t *testing.T) {
tool, err := NewWebFetchToolWithProxy(1024, "http://127.0.0.1:7890", testFetchLimit)
tool, err := NewWebFetchToolWithProxy(1024, "http://127.0.0.1:7890", format, testFetchLimit, nil)
if err != nil {
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
} else if tool.maxChars != 1024 {
@@ -787,7 +790,7 @@ func TestNewWebFetchToolWithProxy(t *testing.T) {
t.Fatalf("proxy = %q, want %q", tool.proxy, "http://127.0.0.1:7890")
}
tool, err = NewWebFetchToolWithProxy(0, "http://127.0.0.1:7890", testFetchLimit)
tool, err = NewWebFetchToolWithProxy(0, "http://127.0.0.1:7890", format, testFetchLimit, nil)
if err != nil {
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
}
@@ -798,7 +801,7 @@ func TestNewWebFetchToolWithProxy(t *testing.T) {
}
func TestNewWebFetchToolWithConfig_InvalidPrivateHostWhitelist(t *testing.T) {
_, err := NewWebFetchToolWithConfig(1024, "", testFetchLimit, []string{"not-an-ip-or-cidr"})
_, err := NewWebFetchToolWithConfig(1024, "", format, testFetchLimit, []string{"not-an-ip-or-cidr"})
if err == nil {
t.Fatal("expected invalid whitelist entry to fail")
}
+411
View File
@@ -0,0 +1,411 @@
package utils
import (
"bytes"
"net/url"
"regexp"
"strconv"
"strings"
"golang.org/x/net/html"
)
// Precompiled patterns used by the HTML-to-Markdown converter. They are built
// once at package initialization so no pattern is compiled per conversion.
var (
	// reSpaces matches a run of one or more spaces or tabs.
	reSpaces = regexp.MustCompile(`[ \t]+`)
	// reNewlines matches three or more consecutive newlines.
	reNewlines = regexp.MustCompile(`\n{3,}`)
	// reEmptyListItem matches a line that consists of a "-" or "*" bullet
	// followed only by whitespace.
	reEmptyListItem = regexp.MustCompile(`(?m)^[-*]\s*$`)
	// reImageOnlyLink matches a link whose only content is an image with empty
	// alt text, where both targets are wrapped in angle brackets:
	// [![](<src>)](<href>).
	// NOTE(review): the converter code visible in this file writes targets
	// without angle brackets, so this may only match when a URL itself starts
	// with "<" — confirm whether the pattern is still needed.
	reImageOnlyLink = regexp.MustCompile(`\[!\[\]\(<[^>]*>\)\]\(<[^>]*>\)`)
	// reEmptyHeader matches a line made of one to six "#" characters followed
	// only by whitespace.
	reEmptyHeader = regexp.MustCompile(`(?m)^#{1,6}\s*$`)
	// reLeadingLineSpace matches exactly one space or tab at the start of a
	// line when the next character is not a space, tab, or newline. Group 2
	// captures that next character so a replacement can keep it.
	reLeadingLineSpace = regexp.MustCompile(`(?m)^([ \t])([^ \t\n])`)
)
// skipTags is the set of element names that the converter drops together with
// everything nested inside them. Lookups for any other tag yield false.
var skipTags = map[string]bool{
	// Non-rendered or executable content.
	"head":     true,
	"noscript": true,
	"script":   true,
	"style":    true,
	"template": true,
	// Page chrome and interactive widgets.
	"aside":  true,
	"dialog": true,
	"footer": true,
	"form":   true,
	"header": true,
	"nav":    true,
}
// isSafeHref reports whether href is acceptable as a link target.
//
// The value is rejected when, after trimming surrounding whitespace, it starts
// with "javascript:", "vbscript:" or "data:" (case-insensitively), or when it
// cannot be parsed as a URL. Otherwise it is accepted only if it has no scheme
// (relative references, fragments) or its scheme is http, https or mailto.
func isSafeHref(href string) bool {
	trimmed := strings.TrimSpace(href)
	lowered := strings.ToLower(trimmed)
	for _, banned := range []string{"javascript:", "vbscript:", "data:"} {
		if strings.HasPrefix(lowered, banned) {
			return false
		}
	}

	parsed, err := url.Parse(trimmed)
	if err != nil {
		return false
	}

	switch strings.ToLower(parsed.Scheme) {
	case "", "http", "https", "mailto":
		return true
	default:
		return false
	}
}
// isSafeImageSrc reports whether src is acceptable as an image source.
//
// Inline images ("data:image/..." URIs, matched case-insensitively after
// trimming surrounding whitespace) are always accepted; every other value is
// judged by the same rules as a link target via isSafeHref.
func isSafeImageSrc(src string) bool {
	normalized := strings.ToLower(strings.TrimSpace(src))
	return strings.HasPrefix(normalized, "data:image/") || isSafeHref(src)
}
// escapeMdAlt escapes image alt text for use between "![" and "]" by prefixing
// every backslash, "[" and "]" with a backslash. All other bytes are copied
// unchanged.
func escapeMdAlt(s string) string {
	var out strings.Builder
	out.Grow(len(s))
	// Working byte-wise is safe: the three escaped characters are ASCII and
	// never occur inside a multi-byte UTF-8 sequence.
	for i := 0; i < len(s); i++ {
		switch ch := s[i]; ch {
		case '\\', '[', ']':
			out.WriteByte('\\')
			out.WriteByte(ch)
		default:
			out.WriteByte(ch)
		}
	}
	return out.String()
}
// getAttr returns the value of the first attribute of n whose key equals key.
// It returns the empty string when n has no such attribute, so a missing
// attribute and an attribute with an empty value are indistinguishable.
func getAttr(n *html.Node, key string) string {
	for i := range n.Attr {
		if attr := &n.Attr[i]; attr.Key == key {
			return attr.Val
		}
	}
	return ""
}
// normalizeAttr cleans an attribute value before it is inspected or emitted:
// every newline, carriage return and tab is removed (wherever it occurs), and
// the result is then trimmed of surrounding whitespace.
func normalizeAttr(val string) string {
	stripper := strings.NewReplacer("\n", "", "\r", "", "\t", "")
	return strings.TrimSpace(stripper.Replace(val))
}
// isUnlikelyNode reports whether element n looks like page chrome (menus,
// banners, share widgets, ...) rather than content, judged only by keywords in
// its class and id attributes.
//
// It returns false for non-element nodes, for the <html> and <body> elements,
// for elements with neither class nor id, and for elements whose class/id
// mentions "article", "main" or "content". Otherwise it returns true when the
// lowercased class/id text contains any of the noise keywords below.
func isUnlikelyNode(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	// The root and body wrap the whole document. Sites commonly put state
	// classes such as "modal-open" or "has-sidebar" on them, and treating
	// those as noise would discard the entire page.
	if n.Data == "html" || n.Data == "body" {
		return false
	}
	classID := strings.ToLower(getAttr(n, "class") + " " + getAttr(n, "id"))
	// Only the joining space is left when both attributes are empty or absent.
	if classID == " " {
		return false
	}
	// Positive hints win over any noise keyword (e.g. "main-menu-content").
	if strings.Contains(classID, "article") || strings.Contains(classID, "main") ||
		strings.Contains(classID, "content") {
		return false
	}
	unlikelyKeywords := []string{
		"menu",
		"nav",
		"footer",
		"sidebar",
		"cookie",
		"banner",
		"sponsor",
		"advert",
		"popup",
		"modal",
		"newsletter",
		"share",
		"social",
	}
	for _, keyword := range unlikelyKeywords {
		if strings.Contains(classID, keyword) {
			return true
		}
	}
	return false
}
// converter carries the state of one HTML-to-Markdown conversion.
type converter struct {
	// stack holds the output buffers. Output always goes to the last buffer;
	// the first one collects the final document, and further buffers are
	// pushed while the content of an element is captured separately.
	stack []*bytes.Buffer
	// linkHrefs holds the targets of the currently open links that have one.
	linkHrefs []string
	// linkStates records, per currently open <a>, whether it had a target.
	linkStates []bool
	emphStack  []string // Tracks "**", "*", "~~" for buffered emphasis
	// olCounters holds the next item number for each open ordered list.
	olCounters []int
	// inPre is true while inside a <pre> element.
	inPre bool
	// listDepth counts the currently open <ul>/<ol> elements.
	listDepth int
}

// newConverter returns a converter whose stack holds a single empty buffer.
func newConverter() *converter {
	c := new(converter)
	c.stack = append(c.stack, new(bytes.Buffer))
	return c
}

// write appends s to the buffer on top of the stack.
func (c *converter) write(s string) {
	top := c.stack[len(c.stack)-1]
	top.WriteString(s)
}

// pushBuf starts capturing output into a fresh buffer on top of the stack.
func (c *converter) pushBuf() {
	c.stack = append(c.stack, new(bytes.Buffer))
}

// popBuf removes the top buffer from the stack and returns what was written to
// it. It must be paired with an earlier pushBuf.
func (c *converter) popBuf() string {
	last := len(c.stack) - 1
	captured := c.stack[last].String()
	c.stack = c.stack[:last]
	return captured
}
// walk converts the subtree rooted at n to Markdown and appends the result to
// the buffer currently on top of c.stack.
//
// Elements listed in skipTags or flagged by isUnlikelyNode are dropped together
// with their children. Text nodes are written out directly; other non-element
// nodes only have their children visited. For elements, the first switch emits
// the opening markup (or starts buffering), then the children are visited, and
// the second switch emits the closing markup. <blockquote> and <img> are
// handled completely in the first switch and return early.
func (c *converter) walk(n *html.Node) {
	if n.Type == html.ElementNode {
		if skipTags[n.Data] {
			return
		}
		if isUnlikelyNode(n) {
			return
		}
	}
	if n.Type == html.TextNode {
		text := n.Data
		// Outside <pre>, newlines become spaces and runs of spaces/tabs are
		// collapsed to one space; inside <pre> the text is kept verbatim.
		if !c.inPre {
			text = strings.ReplaceAll(text, "\n", " ")
			text = reSpaces.ReplaceAllString(text, " ")
		}
		if text != "" {
			c.write(text)
		}
		return
	}
	// Any remaining non-element node produces no output of its own; only its
	// children are visited.
	if n.Type != html.ElementNode {
		for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {
			c.walk(ch)
		}
		return
	}
	// Opening Tags
	switch n.Data {
	// Buffer emphasis content so we can TrimSpace the inner text,
	// avoiding the regex-across-boundaries bug.
	case "b", "strong":
		c.emphStack = append(c.emphStack, "**")
		c.pushBuf()
	case "i", "em":
		c.emphStack = append(c.emphStack, "*")
		c.pushBuf()
	case "del", "s":
		c.emphStack = append(c.emphStack, "~~")
		c.pushBuf()
	case "a":
		href := normalizeAttr(getAttr(n, "href"))
		// An unsafe target is neutralized to "#" rather than dropping the link.
		if href != "" && !isSafeHref(href) {
			href = "#"
		}
		hasHref := href != ""
		// Every <a> records whether it had a target, so the closing branch
		// knows whether a href and a buffer were pushed for it.
		c.linkStates = append(c.linkStates, hasHref)
		if hasHref {
			c.linkHrefs = append(c.linkHrefs, href)
			c.pushBuf()
		}
	case "h1":
		c.write("\n\n# ")
	case "h2":
		c.write("\n\n## ")
	case "h3":
		c.write("\n\n### ")
	case "h4":
		c.write("\n\n#### ")
	case "h5":
		c.write("\n\n##### ")
	case "h6":
		c.write("\n\n###### ")
	case "p":
		c.write("\n\n")
	case "br":
		c.write("\n")
	case "hr":
		c.write("\n\n---\n\n")
	case "ol":
		// Numbering of each ordered list starts at 1.
		c.olCounters = append(c.olCounters, 1)
		// Only write leading newline for top-level list.
		if c.listDepth == 0 {
			c.write("\n")
		}
		c.listDepth++
	case "ul":
		if c.listDepth == 0 {
			c.write("\n")
		}
		c.listDepth++
	case "li":
		c.write("\n")
		// Nested items get one copy of the indent string per extra list level.
		if c.listDepth > 1 {
			c.write(strings.Repeat(" ", c.listDepth-1))
		}
		// Items directly inside an <ol> are numbered from that list's counter;
		// every other item gets a "-" bullet.
		if n.Parent != nil && n.Parent.Data == "ol" && len(c.olCounters) > 0 {
			idx := c.olCounters[len(c.olCounters)-1]
			c.write(strconv.Itoa(idx) + ". ")
			c.olCounters[len(c.olCounters)-1]++
		} else {
			c.write("- ")
		}
	case "pre":
		c.inPre = true
		c.write("\n\n```\n")
	case "code":
		// Inside <pre> the fence already marks the code, so no backtick is added.
		if !c.inPre {
			c.write("`")
		}
	case "blockquote":
		// Render the children into a private buffer, then prefix every line
		// of the result with ">".
		c.pushBuf()
		for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {
			c.walk(ch)
		}
		inner := strings.TrimSpace(c.popBuf())
		lines := strings.Split(inner, "\n")
		var quoted []string
		for _, l := range lines {
			if strings.TrimSpace(l) == "" {
				quoted = append(quoted, ">")
			} else {
				quoted = append(quoted, "> "+l)
			}
		}
		// Collapse consecutive empty quote lines into a single ">".
		var deduped []string
		for i, line := range quoted {
			if line == ">" && i > 0 && deduped[len(deduped)-1] == ">" {
				continue
			}
			deduped = append(deduped, line)
		}
		c.write("\n\n" + strings.Join(deduped, "\n") + "\n\n")
		// Children were already visited above.
		return
	case "img":
		src := normalizeAttr(getAttr(n, "src"))
		// Fall back to data-src when src is missing or empty.
		if src == "" {
			src = normalizeAttr(getAttr(n, "data-src"))
		}
		if src == "" {
			return
		}
		alt := escapeMdAlt(normalizeAttr(getAttr(n, "alt")))
		// An image with an unsafe source is dropped entirely.
		if isSafeImageSrc(src) {
			c.write("![" + alt + "](" + src + ")")
		}
		return
	}
	// Traverse Children
	for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {
		c.walk(ch)
	}
	// Closing Tags
	switch n.Data {
	// Pop buffer, trim, wrap with the correct marker.
	case "b", "strong", "i", "em", "del", "s":
		if len(c.emphStack) == 0 {
			break
		}
		marker := c.emphStack[len(c.emphStack)-1]
		c.emphStack = c.emphStack[:len(c.emphStack)-1]
		inner := strings.TrimSpace(c.popBuf())
		// Emphasis around nothing but whitespace produces no output.
		if inner != "" {
			c.write(marker + inner + marker)
		}
	case "a":
		if len(c.linkStates) == 0 {
			break
		}
		hasHref := c.linkStates[len(c.linkStates)-1]
		c.linkStates = c.linkStates[:len(c.linkStates)-1]
		// Without a target nothing was buffered: the link text has already
		// been written as plain text.
		if !hasHref {
			break
		}
		href := c.linkHrefs[len(c.linkHrefs)-1]
		c.linkHrefs = c.linkHrefs[:len(c.linkHrefs)-1]
		inner := strings.TrimSpace(c.popBuf())
		if strings.Contains(inner, "\n") {
			// Multi-line link content: only the first non-empty line that does
			// not start with an image becomes the link; other lines are
			// written back unchanged.
			lines := strings.Split(inner, "\n")
			linked := false
			for i, l := range lines {
				cleanLine := strings.TrimSpace(l)
				if cleanLine != "" && !strings.HasPrefix(cleanLine, "![") && !linked {
					lines[i] = "[" + cleanLine + "](" + href + ")"
					linked = true
				}
			}
			c.write(strings.Join(lines, "\n"))
		} else {
			c.write("[" + inner + "](" + href + ")")
		}
	// Block-level elements end with a newline. Names that are also in
	// skipTags never get here because of the early return at the top.
	case "h1",
		"h2",
		"h3",
		"h4",
		"h5",
		"h6",
		"p",
		"div",
		"section",
		"article",
		"header",
		"footer",
		"aside",
		"nav",
		"figure":
		c.write("\n")
	case "ol":
		c.listDepth--
		if len(c.olCounters) > 0 {
			c.olCounters = c.olCounters[:len(c.olCounters)-1]
		}
		// Only a top-level list is followed by a newline.
		if c.listDepth == 0 {
			c.write("\n")
		}
	case "ul":
		c.listDepth--
		if c.listDepth == 0 {
			c.write("\n")
		}
	case "pre":
		c.inPre = false
		c.write("\n```\n\n")
	case "code":
		if !c.inPre {
			c.write("`")
		}
	}
}
// HtmlToMarkdown parses htmlStr as an HTML document and renders it as
// Markdown. It returns an error only when the HTML parser reports one.
//
// After the tree has been converted, the text is cleaned up: empty list items
// and headers are removed, trailing whitespace is trimmed from every line,
// lines holding only an empty link or a lone "-" are blanked, runs of three or
// more newlines are reduced to two, and a single leading space or tab is
// removed from lines that have exactly one.
//
// NOTE(review): the clean-up runs over the whole output, so it also applies to
// the contents of fenced code blocks.
func HtmlToMarkdown(htmlStr string) (string, error) {
	root, err := html.Parse(strings.NewReader(htmlStr))
	if err != nil {
		return "", err
	}

	conv := newConverter()
	conv.walk(root)
	md := conv.stack[0].String()

	// Post-processing
	for _, re := range []*regexp.Regexp{reImageOnlyLink, reEmptyListItem, reEmptyHeader} {
		md = re.ReplaceAllString(md, "")
	}

	rows := strings.Split(md, "\n")
	for i, row := range rows {
		row = strings.TrimRight(row, " \t")
		switch strings.TrimSpace(row) {
		case "[](</>)", "[](#)", "-":
			// Leftover markup with no content: keep the line break only.
			rows[i] = ""
		default:
			rows[i] = row
		}
	}
	md = strings.TrimSpace(strings.Join(rows, "\n"))
	md = reNewlines.ReplaceAllString(md, "\n\n")

	// Strip a single leading space from lines that are NOT list indentation.
	// "(?m)^([ \t])([^ \t\n])" matches exactly one space/tab at line start followed
	// by a non-whitespace char, so lines starting with two or more spaces are
	// left untouched.
	return reLeadingLineSpace.ReplaceAllString(md, "$2"), nil
}
+245
View File
@@ -0,0 +1,245 @@
package utils
import (
"testing"
"github.com/sipeed/picoclaw/pkg/logger"
)
// TestHtmlToMarkdown runs HtmlToMarkdown over a table of HTML snippets and
// compares each result with the exact Markdown expected for it. Every case runs
// as its own subtest; a conversion error fails the subtest immediately instead
// of being reported later as an output mismatch.
func TestHtmlToMarkdown(t *testing.T) {
	// Define our test cases
	tests := []struct {
		name     string
		input    string
		expected string
	}{
		{
			name:     "Removes scripts and styles",
			input:    `<script>alert("hello");</script><style>body { color: red; }</style><p>Clean text</p>`,
			expected: "Clean text",
		},
		{
			name:     "Extracts links correctly",
			input:    `Visit my <a href="https://example.com">website</a> for info.`,
			expected: "Visit my [website](https://example.com) for info.",
		},
		{
			name:     "Converts headers (H1, H2, H3)",
			input:    `<h1>Main Title</h1><h2>Subtitle</h2><h3>Section</h3>`,
			expected: "# Main Title\n\n## Subtitle\n\n### Section",
		},
		{
			name:     "Handles bold and italics",
			input:    `Text <b>bold</b> and <strong>strong</strong>, then <i>italic</i> and <em>em</em>.`,
			expected: "Text **bold** and **strong**, then *italic* and *em*.",
		},
		{
			name:     "Converts lists",
			input:    `<ul><li>First element</li><li>Second element</li></ul>`,
			expected: "- First element\n- Second element",
		},
		{
			name:     "Handles paragraphs and line breaks (<br>)",
			input:    `<p>First paragraph</p><p>Second paragraph with<br>a line break.</p>`,
			expected: "First paragraph\n\nSecond paragraph with\na line break.",
		},
		{
			name:     "Decodes HTML entities",
			input:    `Math: 5 &gt; 3 &amp; 2 &lt; 4. A &quot;quote&quot;.`,
			expected: "Math: 5 > 3 & 2 < 4. A \"quote\".",
		},
		{
			name:     "Cleans up residual HTML tags",
			input:    `<div><span>Text inside div and span</span></div>`,
			expected: "Text inside div and span",
		},
		{
			name:     "Removes multiple spaces and excessive empty lines",
			input:    `This text has too many spaces. <br><br><br><br> And too many newlines.`,
			expected: "This text has too many spaces.\n\nAnd too many newlines.",
		},
		{
			name:  "Nested lists with indentation",
			input: "<ul><li>One<ul><li>Two</li></ul></li></ul>",
			// Expect the sub-element to have 4 spaces of indentation
			expected: "- One\n - Two",
		},
		{
			name:  "Image support",
			input: `<img src="image.jpg" alt="alternative text">`,
			// Correct Markdown syntax for images
			expected: "![alternative text](image.jpg)",
		},
		{
			name:  "Image support without alt-text",
			input: `<img src="image.jpg">`,
			// If alt is missing, square brackets remain empty
			expected: "![](image.jpg)",
		},
		{
			name: "XSS Bypass on Links (Obfuscated HTML entities)",
			// The Go HTML parser resolves entities, so this becomes "javascript:alert(1)"
			input: `<a href="jav&#x09;ascript:alert(1)">Click here</a>`,
			// Our isSafeHref (if updated with net/url) should neutralize it to "#"
			expected: "[Click here](#)",
		},
		{
			name:  "Empty link or used as anchor",
			input: `<a name="top"></a>`,
			// With no text or href, it shouldn't print anything (not even empty brackets)
			expected: "",
		},
		{
			name:  "Link without href but with text (Textual anchor)",
			input: `<a id="top">Back to top</a>`,
			// Should extract only plain text, without generating a broken Markdown link like [Back to top](#) or [Back to top]()
			expected: "Back to top",
		},
		{
			name:  "Badly spaced bold and italics (Edge Case)",
			input: `<b> Text </b>`,
			// In Markdown `** Text **` is often not formatted correctly. The ideal is `**Text**`
			expected: "**Text**",
		},
		{
			name: "Complex Test - Real Article",
			input: `
<h1>Article Title</h1>
<p>This is an <strong>introductory text</strong> with a <a href="http://link.com">link</a>.</p>
<h2>Subtitle</h2>
<ul>
<li>Point one</li>
<li>Point two</li>
</ul>
<script>console.log("do not show me")</script>
`,
			// Note: The indentation of the real HTML test will generate spaces that
			// regex will clean up.
			expected: "# Article Title\n\nThis is an **introductory text** with a [link](http://link.com).\n\n## Subtitle\n\n- Point one\n- Point two",
		},
		{
			name:     "Ordered list (OL)",
			input:    `<ol><li>First</li><li>Second</li><li>Third</li></ol>`,
			expected: "1. First\n2. Second\n3. Third",
		},
		{
			name:     "Ordered list nested in unordered list",
			input:    `<ul><li>Fruits<ol><li>Apples</li><li>Pears</li></ol></li><li>Vegetables</li></ul>`,
			expected: "- Fruits\n 1. Apples\n 2. Pears\n- Vegetables",
		},
		{
			name:     "Code block (pre/code)",
			input:    "<pre><code>func main() {\n fmt.Println(\"hello\")\n}</code></pre>",
			expected: "```\nfunc main() {\n fmt.Println(\"hello\")\n}\n```",
		},
		{
			name:     "Inline code",
			input:    `<p>Use the command <code>go test ./...</code> to run the tests.</p>`,
			expected: "Use the command `go test ./...` to run the tests.",
		},
		{
			name:     "Simple blockquote",
			input:    `<blockquote><p>An important quote.</p></blockquote>`,
			expected: "> An important quote.",
		},
		{
			name:     "Multiline blockquote",
			input:    `<blockquote><p>First line of the quote.</p><p>Second line of the quote.</p></blockquote>`,
			expected: "> First line of the quote.\n>\n> Second line of the quote.",
		},
		{
			name:     "Strikethrough text (del/s)",
			input:    `This text is <del>deleted</del> and this is <s>crossed out</s>.`,
			expected: "This text is ~~deleted~~ and this is ~~crossed out~~.",
		},
		{
			name:     "Horizontal separator (HR)",
			input:    `<p>Above the line</p><hr><p>Below the line</p>`,
			expected: "Above the line\n\n---\n\nBelow the line",
		},
		{
			name:     "Bold nested in link",
			input:    `<a href="https://example.com"><strong>Linked bold text</strong></a>`,
			expected: "[**Linked bold text**](https://example.com)",
		},
		{
			name:     "data-src Image (lazy loading)",
			input:    `<img data-src="lazy.jpg" alt="Lazy image">`,
			expected: "![Lazy image](lazy.jpg)",
		},
		{
			name:  "Image with javascript: src blocked",
			input: `<img src="javascript:alert(1)" alt="XSS">`,
			// src is not safe, so the image is not emitted
			expected: "",
		},
		{
			name:     "Link with data: href blocked",
			input:    `<a href="data:text/html,<script>alert(1)</script>">Click</a>`,
			expected: "[Click](#)",
		},
		{
			name:     "Deeply nested divs",
			input:    `<div><div><div><div><p>Deeply nested text</p></div></div></div></div>`,
			expected: "Deeply nested text",
		},
		{
			name:     "Non-consecutive headers (H1, H3, H5)",
			input:    `<h1>Title</h1><h3>Subsection</h3><h5>Sub-subsection</h5>`,
			expected: "# Title\n\n### Subsection\n\n##### Sub-subsection",
		},
		{
			name:     "Paragraph with mixed multiple emphasis",
			input:    `<p><strong>Important:</strong> read the <strong><em>critical instructions</em></strong> <em>carefully</em>.</p>`,
			expected: "**Important:** read the ***critical instructions*** *carefully*.",
		},
		{
			name: "Article with nav and aside sections (noise to filter)",
			input: `
<nav><a href="/home">Home</a><a href="/about-us">About us</a></nav>
<article>
<h2>Article title</h2>
<p>This is the body of the article.</p>
</article>
<aside><p>Advertisement</p></aside>
`,
			expected: "## Article title\n\nThis is the body of the article.",
		},
		{
			name:     "Text with mixed special HTML entities",
			input:    `Copyright &copy; 2024 &mdash; All rights reserved &reg;`,
			expected: "Copyright © 2024 — All rights reserved ®",
		},
		{
			name:     "Mailto link",
			input:    `Write to us at <a href="mailto:info@example.com">info@example.com</a>`,
			expected: "Write to us at [info@example.com](mailto:info@example.com)",
		},
		{
			name:  "Image inside a link (clickable figure)",
			input: `<a href="https://example.com"><img src="photo.jpg" alt="Photo"></a>`,
			// The image-link without text must not generate broken markup
			expected: "[![Photo](photo.jpg)](https://example.com)",
		},
		{
			name:     "Empty content or only whitespace",
			input:    ` <p> </p> <div> </div> `,
			expected: "",
		},
	}
	// Iterate over all test cases
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := HtmlToMarkdown(tt.input)
			if err != nil {
				logger.ErrorCF("tool", "Failed to parse html to markdown", map[string]any{"error": err.Error()})
				// A conversion error must fail the case; comparing the empty
				// result would hide the real cause (or pass when "" is expected).
				t.Fatalf("HtmlToMarkdown(%q) returned an unexpected error: %v", tt.input, err)
			}
			if got != tt.expected {
				t.Errorf("\nTest case failed: %s\nInput: %q\nGot: %q\nExpected: %q",
					tt.name, tt.input, got, tt.expected)
			}
		})
	}
}