mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
feat(tool): markdown format in output web_fetch tool
This commit is contained in:
@@ -251,6 +251,9 @@ picoclaw onboard
|
||||
},
|
||||
"tools": {
|
||||
"web": {
|
||||
"enabled": true,
|
||||
"fetch_limit_bytes": 10485760,
|
||||
"format": "plaintext",
|
||||
"brave": {
|
||||
"enabled": false,
|
||||
"api_key": "VOTRE_CLE_API_BRAVE",
|
||||
|
||||
@@ -216,6 +216,9 @@ picoclaw onboard
|
||||
},
|
||||
"tools": {
|
||||
"web": {
|
||||
"enabled": true,
|
||||
"fetch_limit_bytes": 10485760,
|
||||
"format": "plaintext",
|
||||
"search": {
|
||||
"api_key": "YOUR_BRAVE_API_KEY",
|
||||
"max_results": 5
|
||||
|
||||
@@ -270,6 +270,9 @@ picoclaw onboard
|
||||
],
|
||||
"tools": {
|
||||
"web": {
|
||||
"enabled": true,
|
||||
"fetch_limit_bytes": 10485760,
|
||||
"format": "plaintext",
|
||||
"brave": {
|
||||
"enabled": false,
|
||||
"api_key": "YOUR_BRAVE_API_KEY",
|
||||
|
||||
@@ -245,6 +245,9 @@ picoclaw onboard
|
||||
},
|
||||
"tools": {
|
||||
"web": {
|
||||
"enabled": true,
|
||||
"fetch_limit_bytes": 10485760,
|
||||
"format": "plaintext",
|
||||
"brave": {
|
||||
"enabled": false,
|
||||
"api_key": "YOUR_BRAVE_API_KEY",
|
||||
|
||||
@@ -255,6 +255,9 @@ picoclaw onboard
|
||||
],
|
||||
"tools": {
|
||||
"web": {
|
||||
"enabled": true,
|
||||
"fetch_limit_bytes": 10485760,
|
||||
"format": "plaintext",
|
||||
"brave": {
|
||||
"enabled": false,
|
||||
"api_key": "YOUR_BRAVE_API_KEY",
|
||||
|
||||
@@ -313,6 +313,8 @@
|
||||
"allow_write_paths": null,
|
||||
"web": {
|
||||
"enabled": true,
|
||||
"fetch_limit_bytes": 10485760,
|
||||
"format": "plaintext",
|
||||
"brave": {
|
||||
"enabled": false,
|
||||
"api_key": "YOUR_BRAVE_API_KEY",
|
||||
@@ -350,8 +352,7 @@
|
||||
"base_url": "https://open.bigmodel.cn/api/paas/v4/web_search",
|
||||
"search_engine": "search_std",
|
||||
"max_results": 5
|
||||
},
|
||||
"fetch_limit_bytes": 10485760
|
||||
}
|
||||
},
|
||||
"cron": {
|
||||
"enabled": true,
|
||||
|
||||
@@ -30,6 +30,15 @@ PicoClaw's tools configuration is located in the `tools` field of `config.json`.
|
||||
|
||||
Web tools are used for web search and fetching.
|
||||
|
||||
### Web Fetcher
|
||||
General settings for fetching and processing webpage content.
|
||||
|
||||
| Config | Type | Default | Description |
|
||||
|---------------------|--------|---------------|-----------------------------------------------------------------------------------------------|
|
||||
| `enabled` | bool | true | Enable the webpage fetching capability. |
|
||||
| `fetch_limit_bytes` | int | 10485760 | Maximum size of the webpage payload to fetch, in bytes (default is 10MB). |
|
||||
| `format` | string | "plaintext" | Output format of the fetched content. Options: `plaintext` or `markdown` (recommended). |
|
||||
|
||||
### Brave
|
||||
|
||||
| Config | Type | Default | Description |
|
||||
|
||||
+5
-1
@@ -157,7 +157,11 @@ func registerSharedTools(
|
||||
}
|
||||
}
|
||||
if cfg.Tools.IsToolEnabled("web_fetch") {
|
||||
fetchTool, err := tools.NewWebFetchToolWithProxy(50000, cfg.Tools.Web.Proxy, cfg.Tools.Web.FetchLimitBytes)
|
||||
fetchTool, err := tools.NewWebFetchToolWithProxy(
|
||||
50000,
|
||||
cfg.Tools.Web.Proxy,
|
||||
cfg.Tools.Web.Format,
|
||||
cfg.Tools.Web.FetchLimitBytes)
|
||||
if err != nil {
|
||||
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
|
||||
} else {
|
||||
|
||||
@@ -694,6 +694,7 @@ type WebToolsConfig struct {
|
||||
// For authenticated proxies, prefer HTTP_PROXY/HTTPS_PROXY env vars instead of embedding credentials in config.
|
||||
Proxy string `json:"proxy,omitempty" env:"PICOCLAW_TOOLS_WEB_PROXY"`
|
||||
FetchLimitBytes int64 `json:"fetch_limit_bytes,omitempty" env:"PICOCLAW_TOOLS_WEB_FETCH_LIMIT_BYTES"`
|
||||
Format string `json:"format,omitempty" env:"PICOCLAW_TOOLS_WEB_FORMAT"`
|
||||
}
|
||||
|
||||
type CronToolsConfig struct {
|
||||
|
||||
@@ -412,6 +412,7 @@ func DefaultConfig() *Config {
|
||||
},
|
||||
Proxy: "",
|
||||
FetchLimitBytes: 10 * 1024 * 1024, // 10MB by default
|
||||
Format: "plaintext",
|
||||
Brave: BraveConfig{
|
||||
Enabled: false,
|
||||
APIKey: "",
|
||||
|
||||
+55
-16
@@ -7,6 +7,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
@@ -28,6 +29,7 @@ const (
|
||||
|
||||
defaultMaxChars = 50000
|
||||
maxRedirects = 5
|
||||
format = "plaintext"
|
||||
)
|
||||
|
||||
// Pre-compiled regexes for HTML text extraction
|
||||
@@ -776,19 +778,20 @@ type WebFetchTool struct {
|
||||
maxChars int
|
||||
proxy string
|
||||
client *http.Client
|
||||
format string
|
||||
fetchLimitBytes int64
|
||||
}
|
||||
|
||||
func NewWebFetchTool(maxChars int, fetchLimitBytes int64) (*WebFetchTool, error) {
|
||||
func NewWebFetchTool(maxChars int, format string, fetchLimitBytes int64) (*WebFetchTool, error) {
|
||||
// createHTTPClient cannot fail with an empty proxy string.
|
||||
return NewWebFetchToolWithProxy(maxChars, "", fetchLimitBytes)
|
||||
return NewWebFetchToolWithProxy(maxChars, "", format, fetchLimitBytes)
|
||||
}
|
||||
|
||||
// allowPrivateWebFetchHosts controls whether loopback/private hosts are allowed.
|
||||
// This is false in normal runtime to reduce SSRF exposure, and tests can override it temporarily.
|
||||
var allowPrivateWebFetchHosts atomic.Bool
|
||||
|
||||
func NewWebFetchToolWithProxy(maxChars int, proxy string, fetchLimitBytes int64) (*WebFetchTool, error) {
|
||||
func NewWebFetchToolWithProxy(maxChars int, proxy string, format string, fetchLimitBytes int64) (*WebFetchTool, error) {
|
||||
if maxChars <= 0 {
|
||||
maxChars = defaultMaxChars
|
||||
}
|
||||
@@ -819,6 +822,7 @@ func NewWebFetchToolWithProxy(maxChars int, proxy string, fetchLimitBytes int64)
|
||||
maxChars: maxChars,
|
||||
proxy: proxy,
|
||||
client: client,
|
||||
format: format,
|
||||
fetchLimitBytes: fetchLimitBytes,
|
||||
}, nil
|
||||
}
|
||||
@@ -906,26 +910,50 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe
|
||||
return ErrorResult(fmt.Sprintf("failed to read response: %v", err))
|
||||
}
|
||||
|
||||
bodyStr := string(body)
|
||||
contentType := resp.Header.Get("Content-Type")
|
||||
|
||||
mediaType, _, _ := mime.ParseMediaType(contentType)
|
||||
|
||||
var text, extractor string
|
||||
|
||||
if strings.Contains(contentType, "application/json") {
|
||||
switch {
|
||||
case mediaType == "application/json":
|
||||
var jsonData any
|
||||
if err := json.Unmarshal(body, &jsonData); err == nil {
|
||||
formatted, _ := json.MarshalIndent(jsonData, "", " ")
|
||||
text = string(formatted)
|
||||
extractor = "json"
|
||||
} else {
|
||||
text = string(body)
|
||||
if err := json.Unmarshal(body, &jsonData); err != nil {
|
||||
text = bodyStr
|
||||
extractor = "raw"
|
||||
break
|
||||
}
|
||||
} else if strings.Contains(contentType, "text/html") || len(body) > 0 &&
|
||||
(strings.HasPrefix(string(body), "<!DOCTYPE") || strings.HasPrefix(strings.ToLower(string(body)), "<html")) {
|
||||
text = t.extractText(string(body))
|
||||
extractor = "text"
|
||||
} else {
|
||||
text = string(body)
|
||||
|
||||
formatted, err := json.MarshalIndent(jsonData, "", " ")
|
||||
if err != nil {
|
||||
text = bodyStr
|
||||
extractor = "raw"
|
||||
break
|
||||
}
|
||||
|
||||
text = string(formatted)
|
||||
extractor = "json"
|
||||
|
||||
case mediaType == "text/html" || looksLikeHTML(bodyStr):
|
||||
switch strings.ToLower(t.format) {
|
||||
|
||||
case "markdown":
|
||||
var err error
|
||||
text, err = utils.HtmlToMarkdown(bodyStr)
|
||||
if err != nil {
|
||||
return ErrorResult(fmt.Sprintf("failed to HTML to markdown: %v", err))
|
||||
}
|
||||
extractor = "markdown"
|
||||
|
||||
default:
|
||||
text = t.extractText(bodyStr)
|
||||
extractor = "text"
|
||||
}
|
||||
|
||||
default:
|
||||
text = bodyStr
|
||||
extractor = "raw"
|
||||
}
|
||||
|
||||
@@ -957,6 +985,17 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe
|
||||
}
|
||||
}
|
||||
|
||||
// looksLikeHTML reports whether body starts like an HTML document, i.e. with
// a "<!doctype" declaration or an "<html" tag, compared case-insensitively.
// An empty body is never considered HTML.
func looksLikeHTML(body string) bool {
	// Only the leading bytes decide the answer, so lowercase a short prefix
	// instead of the whole (potentially multi-megabyte) payload.
	const sniffLen = len("<!doctype")

	head := body
	if len(head) > sniffLen {
		head = head[:sniffLen]
	}
	head = strings.ToLower(head)

	// Both checks run against the lowercased prefix so that the common
	// upper-case "<!DOCTYPE html>" form is recognized as well.
	return strings.HasPrefix(head, "<!doctype") || strings.HasPrefix(head, "<html")
}
|
||||
|
||||
func (t *WebFetchTool) extractText(htmlContent string) string {
|
||||
result := reScript.ReplaceAllLiteralString(htmlContent, "")
|
||||
result = reStyle.ReplaceAllLiteralString(result, "")
|
||||
|
||||
+22
-20
@@ -14,7 +14,9 @@ import (
|
||||
"github.com/sipeed/picoclaw/pkg/logger"
|
||||
)
|
||||
|
||||
const testFetchLimit = int64(10 * 1024 * 1024)
|
||||
const (
|
||||
testFetchLimit = int64(10 * 1024 * 1024)
|
||||
)
|
||||
|
||||
// TestWebTool_WebFetch_Success verifies successful URL fetching
|
||||
func TestWebTool_WebFetch_Success(t *testing.T) {
|
||||
@@ -27,7 +29,7 @@ func TestWebTool_WebFetch_Success(t *testing.T) {
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create web fetch tool: %v", err)
|
||||
}
|
||||
@@ -69,7 +71,7 @@ func TestWebTool_WebFetch_JSON(t *testing.T) {
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
|
||||
}
|
||||
@@ -94,7 +96,7 @@ func TestWebTool_WebFetch_JSON(t *testing.T) {
|
||||
|
||||
// TestWebTool_WebFetch_InvalidURL verifies error handling for invalid URL
|
||||
func TestWebTool_WebFetch_InvalidURL(t *testing.T) {
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
|
||||
}
|
||||
@@ -119,7 +121,7 @@ func TestWebTool_WebFetch_InvalidURL(t *testing.T) {
|
||||
|
||||
// TestWebTool_WebFetch_UnsupportedScheme verifies error handling for non-http URLs
|
||||
func TestWebTool_WebFetch_UnsupportedScheme(t *testing.T) {
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
|
||||
}
|
||||
@@ -144,7 +146,7 @@ func TestWebTool_WebFetch_UnsupportedScheme(t *testing.T) {
|
||||
|
||||
// TestWebTool_WebFetch_MissingURL verifies error handling for missing URL
|
||||
func TestWebTool_WebFetch_MissingURL(t *testing.T) {
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
|
||||
}
|
||||
@@ -178,7 +180,7 @@ func TestWebTool_WebFetch_Truncation(t *testing.T) {
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
tool, err := NewWebFetchTool(1000, testFetchLimit) // Limit to 1000 chars
|
||||
tool, err := NewWebFetchTool(1000, format, testFetchLimit) // Limit to 1000 chars
|
||||
if err != nil {
|
||||
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
|
||||
}
|
||||
@@ -228,7 +230,7 @@ func TestWebFetchTool_PayloadTooLarge(t *testing.T) {
|
||||
defer ts.Close()
|
||||
|
||||
// Initialize the tool
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
|
||||
}
|
||||
@@ -311,7 +313,7 @@ func TestWebTool_WebFetch_HTMLExtraction(t *testing.T) {
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
|
||||
}
|
||||
@@ -424,7 +426,7 @@ func withPrivateWebFetchHostsAllowed(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestWebTool_WebFetch_PrivateHostBlocked(t *testing.T) {
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create web fetch tool: %v", err)
|
||||
}
|
||||
@@ -451,7 +453,7 @@ func TestWebTool_WebFetch_PrivateHostAllowedForTests(t *testing.T) {
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create web fetch tool: %v", err)
|
||||
}
|
||||
@@ -466,7 +468,7 @@ func TestWebTool_WebFetch_PrivateHostAllowedForTests(t *testing.T) {
|
||||
|
||||
// TestWebFetch_BlocksIPv4MappedIPv6Loopback verifies ::ffff:127.0.0.1 is blocked
|
||||
func TestWebFetch_BlocksIPv4MappedIPv6Loopback(t *testing.T) {
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create web fetch tool: %v", err)
|
||||
}
|
||||
@@ -481,7 +483,7 @@ func TestWebFetch_BlocksIPv4MappedIPv6Loopback(t *testing.T) {
|
||||
|
||||
// TestWebFetch_BlocksMetadataIP verifies 169.254.169.254 is blocked
|
||||
func TestWebFetch_BlocksMetadataIP(t *testing.T) {
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create web fetch tool: %v", err)
|
||||
}
|
||||
@@ -496,7 +498,7 @@ func TestWebFetch_BlocksMetadataIP(t *testing.T) {
|
||||
|
||||
// TestWebFetch_BlocksIPv6UniqueLocal verifies fc00::/7 addresses are blocked
|
||||
func TestWebFetch_BlocksIPv6UniqueLocal(t *testing.T) {
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create web fetch tool: %v", err)
|
||||
}
|
||||
@@ -511,7 +513,7 @@ func TestWebFetch_BlocksIPv6UniqueLocal(t *testing.T) {
|
||||
|
||||
// TestWebFetch_Blocks6to4WithPrivateEmbed verifies 6to4 with private embedded IPv4 is blocked
|
||||
func TestWebFetch_Blocks6to4WithPrivateEmbed(t *testing.T) {
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create web fetch tool: %v", err)
|
||||
}
|
||||
@@ -527,7 +529,7 @@ func TestWebFetch_Blocks6to4WithPrivateEmbed(t *testing.T) {
|
||||
|
||||
// TestWebFetch_Allows6to4WithPublicEmbed verifies 6to4 with public embedded IPv4 is NOT blocked
|
||||
func TestWebFetch_Allows6to4WithPublicEmbed(t *testing.T) {
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create web fetch tool: %v", err)
|
||||
}
|
||||
@@ -557,7 +559,7 @@ func TestWebFetch_RedirectToPrivateBlocked(t *testing.T) {
|
||||
allowPrivateWebFetchHosts.Store(false)
|
||||
defer allowPrivateWebFetchHosts.Store(true)
|
||||
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create web fetch tool: %v", err)
|
||||
}
|
||||
@@ -615,7 +617,7 @@ func TestIsPrivateOrRestrictedIP_Table(t *testing.T) {
|
||||
|
||||
// TestWebTool_WebFetch_MissingDomain verifies error handling for URL without domain
|
||||
func TestWebTool_WebFetch_MissingDomain(t *testing.T) {
|
||||
tool, err := NewWebFetchTool(50000, testFetchLimit)
|
||||
tool, err := NewWebFetchTool(50000, format, testFetchLimit)
|
||||
if err != nil {
|
||||
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
|
||||
}
|
||||
@@ -639,7 +641,7 @@ func TestWebTool_WebFetch_MissingDomain(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestNewWebFetchToolWithProxy(t *testing.T) {
|
||||
tool, err := NewWebFetchToolWithProxy(1024, "http://127.0.0.1:7890", testFetchLimit)
|
||||
tool, err := NewWebFetchToolWithProxy(1024, "http://127.0.0.1:7890", format, testFetchLimit)
|
||||
if err != nil {
|
||||
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
|
||||
} else if tool.maxChars != 1024 {
|
||||
@@ -650,7 +652,7 @@ func TestNewWebFetchToolWithProxy(t *testing.T) {
|
||||
t.Fatalf("proxy = %q, want %q", tool.proxy, "http://127.0.0.1:7890")
|
||||
}
|
||||
|
||||
tool, err = NewWebFetchToolWithProxy(0, "http://127.0.0.1:7890", testFetchLimit)
|
||||
tool, err = NewWebFetchToolWithProxy(0, "http://127.0.0.1:7890", format, testFetchLimit)
|
||||
if err != nil {
|
||||
logger.ErrorCF("agent", "Failed to create web fetch tool", map[string]any{"error": err.Error()})
|
||||
}
|
||||
|
||||
@@ -0,0 +1,413 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// reSpaces matches a run of one or more spaces and/or tabs.
var reSpaces = regexp.MustCompile(`[ \t]+`)

// reNewlines matches three or more consecutive newlines.
var reNewlines = regexp.MustCompile(`\n{3,}`)

// reEmptyListItem matches a line holding only a "-" or "*" bullet.
var reEmptyListItem = regexp.MustCompile(`(?m)^[-*]\s*$`)

// reImageOnlyLink matches a link whose only content is an image with empty
// alt text, where both destinations are written in angle brackets.
var reImageOnlyLink = regexp.MustCompile(`\[!\[\]\(<[^>]*>\)\]\(<[^>]*>\)`)

// reEmptyHeader matches a line holding only one to six "#" characters.
var reEmptyHeader = regexp.MustCompile(`(?m)^#{1,6}\s*$`)

// reLeadingLineSpace matches exactly one space or tab at the start of a line
// when it is immediately followed by a non-whitespace character; the second
// group captures that character.
var reLeadingLineSpace = regexp.MustCompile(`(?m)^([ \t])([^ \t\n])`)
|
||||
|
||||
// skipTags is the set of element names whose entire subtree is dropped
// during conversion.
var skipTags = map[string]bool{
	// Non-visible or executable content.
	"script":   true,
	"style":    true,
	"head":     true,
	"noscript": true,
	"template": true,

	// Page chrome and interactive containers.
	"nav":    true,
	"footer": true,
	"aside":  true,
	"header": true,
	"form":   true,
	"dialog": true,
}
|
||||
|
||||
// isSafeHref reports whether href may be emitted as a link destination.
// Destinations starting with "javascript:", "vbscript:" or "data:" are
// rejected, as is anything net/url cannot parse. Otherwise the URL must be
// scheme-less (relative) or use http, https or mailto.
func isSafeHref(href string) bool {
	trimmed := strings.TrimSpace(href)

	folded := strings.ToLower(trimmed)
	for _, blocked := range [...]string{"javascript:", "vbscript:", "data:"} {
		if strings.HasPrefix(folded, blocked) {
			return false
		}
	}

	parsed, err := url.Parse(trimmed)
	if err != nil {
		return false
	}

	switch strings.ToLower(parsed.Scheme) {
	case "", "http", "https", "mailto":
		return true
	default:
		return false
	}
}
|
||||
|
||||
func isSafeImageSrc(src string) bool {
|
||||
lower := strings.ToLower(strings.TrimSpace(src))
|
||||
if strings.HasPrefix(lower, "data:image/") {
|
||||
return true
|
||||
}
|
||||
return isSafeHref(src)
|
||||
}
|
||||
|
||||
// escapeMdAlt backslash-escapes the characters `\`, `[` and `]` in s so the
// result can be placed between the square brackets of a Markdown image.
func escapeMdAlt(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	// The three escaped characters are ASCII, so a byte-wise scan is safe
	// for multi-byte UTF-8 input.
	for i := 0; i < len(s); i++ {
		ch := s[i]
		if ch == '\\' || ch == '[' || ch == ']' {
			out.WriteByte('\\')
		}
		out.WriteByte(ch)
	}
	return out.String()
}
|
||||
|
||||
func getAttr(n *html.Node, key string) string {
|
||||
for _, a := range n.Attr {
|
||||
if a.Key == key {
|
||||
return a.Val
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// normalizeAttr removes every newline, carriage return and tab from val and
// then trims surrounding whitespace from what is left.
func normalizeAttr(val string) string {
	stripper := strings.NewReplacer("\n", "", "\r", "", "\t", "")
	return strings.TrimSpace(stripper.Replace(val))
}
|
||||
|
||||
func isUnlikelyNode(n *html.Node) bool {
|
||||
if n.Type != html.ElementNode {
|
||||
return false
|
||||
}
|
||||
classId := strings.ToLower(getAttr(n, "class") + " " + getAttr(n, "id"))
|
||||
if classId == " " {
|
||||
return false
|
||||
}
|
||||
if strings.Contains(classId, "article") || strings.Contains(classId, "main") ||
|
||||
strings.Contains(classId, "content") {
|
||||
return false
|
||||
}
|
||||
unlikelyKeywords := []string{
|
||||
"menu",
|
||||
"nav",
|
||||
"footer",
|
||||
"sidebar",
|
||||
"cookie",
|
||||
"banner",
|
||||
"sponsor",
|
||||
"advert",
|
||||
"popup",
|
||||
"modal",
|
||||
"newsletter",
|
||||
"share",
|
||||
"social",
|
||||
}
|
||||
for _, keyword := range unlikelyKeywords {
|
||||
if strings.Contains(classId, keyword) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// converter carries the state of a single HTML-to-Markdown tree walk.
type converter struct {
	// stack holds the output buffers; text is always appended to the last one.
	stack []*bytes.Buffer
	// linkHrefs holds the destination of each open <a> that has an href.
	linkHrefs []string
	// linkStates records, for each open <a>, whether it has an href.
	linkStates []bool
	// emphStack tracks "**", "*", "~~" for buffered emphasis.
	emphStack []string
	// olCounters holds the next item number of each open <ol>.
	olCounters []int
	// inPre is true while the walk is inside a <pre> element.
	inPre bool
	// listDepth is the number of currently open <ol>/<ul> elements.
	listDepth int
}

// newConverter returns a converter whose stack holds one empty root buffer.
func newConverter() *converter {
	root := new(bytes.Buffer)
	return &converter{stack: []*bytes.Buffer{root}}
}

// write appends s to the buffer on top of the stack.
func (c *converter) write(s string) {
	top := c.stack[len(c.stack)-1]
	top.WriteString(s)
}

// pushBuf puts a fresh empty buffer on top of the stack, so subsequent
// writes are captured separately until the matching popBuf.
func (c *converter) pushBuf() {
	c.stack = append(c.stack, new(bytes.Buffer))
}

// popBuf removes the top buffer from the stack and returns its contents.
func (c *converter) popBuf() string {
	last := len(c.stack) - 1
	buf := c.stack[last]
	c.stack = c.stack[:last]
	return buf.String()
}
|
||||
|
||||
func (c *converter) walk(n *html.Node) {
|
||||
if n.Type == html.ElementNode {
|
||||
if skipTags[n.Data] {
|
||||
return
|
||||
}
|
||||
if isUnlikelyNode(n) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if n.Type == html.TextNode {
|
||||
text := n.Data
|
||||
if !c.inPre {
|
||||
text = strings.ReplaceAll(text, "\n", " ")
|
||||
text = reSpaces.ReplaceAllString(text, " ")
|
||||
}
|
||||
if text != "" {
|
||||
c.write(text)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if n.Type != html.ElementNode {
|
||||
for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {
|
||||
c.walk(ch)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Opening Tags
|
||||
switch n.Data {
|
||||
|
||||
// Buffer emphasis content so we can TrimSpace the inner text,
|
||||
// avoiding the regex-across-boundaries bug.
|
||||
case "b", "strong":
|
||||
c.emphStack = append(c.emphStack, "**")
|
||||
c.pushBuf()
|
||||
case "i", "em":
|
||||
c.emphStack = append(c.emphStack, "*")
|
||||
c.pushBuf()
|
||||
case "del", "s":
|
||||
c.emphStack = append(c.emphStack, "~~")
|
||||
c.pushBuf()
|
||||
|
||||
case "a":
|
||||
href := normalizeAttr(getAttr(n, "href"))
|
||||
if href != "" && !isSafeHref(href) {
|
||||
href = "#"
|
||||
}
|
||||
hasHref := href != ""
|
||||
c.linkStates = append(c.linkStates, hasHref)
|
||||
if hasHref {
|
||||
c.linkHrefs = append(c.linkHrefs, href)
|
||||
c.pushBuf()
|
||||
}
|
||||
|
||||
case "h1":
|
||||
c.write("\n\n# ")
|
||||
case "h2":
|
||||
c.write("\n\n## ")
|
||||
case "h3":
|
||||
c.write("\n\n### ")
|
||||
case "h4":
|
||||
c.write("\n\n#### ")
|
||||
case "h5":
|
||||
c.write("\n\n##### ")
|
||||
case "h6":
|
||||
c.write("\n\n###### ")
|
||||
|
||||
case "p":
|
||||
c.write("\n\n")
|
||||
case "br":
|
||||
c.write("\n")
|
||||
case "hr":
|
||||
c.write("\n\n---\n\n")
|
||||
|
||||
case "ol":
|
||||
c.olCounters = append(c.olCounters, 1)
|
||||
// Only write leading newline for top-level list.
|
||||
if c.listDepth == 0 {
|
||||
c.write("\n")
|
||||
}
|
||||
c.listDepth++
|
||||
case "ul":
|
||||
if c.listDepth == 0 {
|
||||
c.write("\n")
|
||||
}
|
||||
c.listDepth++
|
||||
case "li":
|
||||
c.write("\n")
|
||||
if c.listDepth > 1 {
|
||||
c.write(strings.Repeat(" ", c.listDepth-1))
|
||||
}
|
||||
if n.Parent != nil && n.Parent.Data == "ol" && len(c.olCounters) > 0 {
|
||||
idx := c.olCounters[len(c.olCounters)-1]
|
||||
c.write(strconv.Itoa(idx) + ". ")
|
||||
c.olCounters[len(c.olCounters)-1]++
|
||||
} else {
|
||||
c.write("- ")
|
||||
}
|
||||
|
||||
case "pre":
|
||||
c.inPre = true
|
||||
c.write("\n\n```\n")
|
||||
case "code":
|
||||
if !c.inPre {
|
||||
c.write("`")
|
||||
}
|
||||
|
||||
case "blockquote":
|
||||
c.pushBuf()
|
||||
for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {
|
||||
c.walk(ch)
|
||||
}
|
||||
inner := strings.TrimSpace(c.popBuf())
|
||||
lines := strings.Split(inner, "\n")
|
||||
var quoted []string
|
||||
for _, l := range lines {
|
||||
if strings.TrimSpace(l) == "" {
|
||||
quoted = append(quoted, ">")
|
||||
} else {
|
||||
quoted = append(quoted, "> "+l)
|
||||
}
|
||||
}
|
||||
var deduped []string
|
||||
for i, line := range quoted {
|
||||
if line == ">" && i > 0 && deduped[len(deduped)-1] == ">" {
|
||||
continue
|
||||
}
|
||||
deduped = append(deduped, line)
|
||||
}
|
||||
c.write("\n\n" + strings.Join(deduped, "\n") + "\n\n")
|
||||
return
|
||||
|
||||
case "img":
|
||||
src := normalizeAttr(getAttr(n, "src"))
|
||||
if src == "" {
|
||||
src = normalizeAttr(getAttr(n, "data-src"))
|
||||
}
|
||||
if src == "" {
|
||||
return
|
||||
}
|
||||
alt := escapeMdAlt(normalizeAttr(getAttr(n, "alt")))
|
||||
if isSafeImageSrc(src) {
|
||||
c.write("")
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Traverse Children
|
||||
for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {
|
||||
c.walk(ch)
|
||||
}
|
||||
|
||||
// Closing Tags
|
||||
switch n.Data {
|
||||
|
||||
// Pop buffer, trim, wrap with the correct marker.
|
||||
case "b", "strong", "i", "em", "del", "s":
|
||||
if len(c.emphStack) == 0 {
|
||||
break
|
||||
}
|
||||
marker := c.emphStack[len(c.emphStack)-1]
|
||||
c.emphStack = c.emphStack[:len(c.emphStack)-1]
|
||||
inner := strings.TrimSpace(c.popBuf())
|
||||
if inner != "" {
|
||||
c.write(marker + inner + marker)
|
||||
}
|
||||
|
||||
case "a":
|
||||
if len(c.linkStates) == 0 {
|
||||
break
|
||||
}
|
||||
hasHref := c.linkStates[len(c.linkStates)-1]
|
||||
c.linkStates = c.linkStates[:len(c.linkStates)-1]
|
||||
if !hasHref {
|
||||
break
|
||||
}
|
||||
href := c.linkHrefs[len(c.linkHrefs)-1]
|
||||
c.linkHrefs = c.linkHrefs[:len(c.linkHrefs)-1]
|
||||
inner := strings.TrimSpace(c.popBuf())
|
||||
if strings.Contains(inner, "\n") {
|
||||
lines := strings.Split(inner, "\n")
|
||||
linked := false
|
||||
for i, l := range lines {
|
||||
cleanLine := strings.TrimSpace(l)
|
||||
if cleanLine != "" && !strings.HasPrefix(cleanLine, "![") && !linked {
|
||||
lines[i] = "[" + cleanLine + "](" + href + ")"
|
||||
linked = true
|
||||
}
|
||||
}
|
||||
c.write(strings.Join(lines, "\n"))
|
||||
} else {
|
||||
c.write("[" + inner + "](" + href + ")")
|
||||
}
|
||||
|
||||
case "h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"p",
|
||||
"div",
|
||||
"section",
|
||||
"article",
|
||||
"header",
|
||||
"footer",
|
||||
"aside",
|
||||
"nav",
|
||||
"figure":
|
||||
c.write("\n")
|
||||
|
||||
case "ol":
|
||||
c.listDepth--
|
||||
if len(c.olCounters) > 0 {
|
||||
c.olCounters = c.olCounters[:len(c.olCounters)-1]
|
||||
}
|
||||
if c.listDepth == 0 {
|
||||
c.write("\n")
|
||||
}
|
||||
case "ul":
|
||||
c.listDepth--
|
||||
if c.listDepth == 0 {
|
||||
c.write("\n")
|
||||
}
|
||||
|
||||
case "pre":
|
||||
c.inPre = false
|
||||
c.write("\n```\n\n")
|
||||
case "code":
|
||||
if !c.inPre {
|
||||
c.write("`")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func HtmlToMarkdown(htmlStr string) (string, error) {
|
||||
doc, err := html.Parse(strings.NewReader(htmlStr))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
c := newConverter()
|
||||
c.walk(doc)
|
||||
|
||||
res := c.stack[0].String()
|
||||
|
||||
// Post-processing
|
||||
res = reImageOnlyLink.ReplaceAllString(res, "")
|
||||
res = reEmptyListItem.ReplaceAllString(res, "")
|
||||
res = reEmptyHeader.ReplaceAllString(res, "")
|
||||
|
||||
lines := strings.Split(res, "\n")
|
||||
var cleanLines []string
|
||||
for _, line := range lines {
|
||||
line = strings.TrimRight(line, " \t")
|
||||
cleanTest := strings.TrimSpace(line)
|
||||
if cleanTest == "[](</>)" || cleanTest == "[](#)" || cleanTest == "-" {
|
||||
cleanLines = append(cleanLines, "")
|
||||
continue
|
||||
}
|
||||
cleanLines = append(cleanLines, line)
|
||||
}
|
||||
res = strings.Join(cleanLines, "\n")
|
||||
|
||||
res = strings.TrimSpace(res)
|
||||
res = reNewlines.ReplaceAllString(res, "\n\n")
|
||||
|
||||
// Strip a single leading space from lines that are NOT list indentation.
|
||||
// "(?m)^([ \t])([^ \t\n])" matches exactly one space/tab at line start followed
|
||||
// by a non-whitespace char, so " - nested" (4 spaces) is left untouched.
|
||||
res = reLeadingLineSpace.ReplaceAllString(res, "$2")
|
||||
|
||||
return res, nil
|
||||
}
|
||||
@@ -0,0 +1,245 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/sipeed/picoclaw/pkg/logger"
|
||||
)
|
||||
|
||||
func TestHtmlToMarkdown(t *testing.T) {
|
||||
// Define our test cases
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "Removes scripts and styles",
|
||||
input: `<script>alert("hello");</script><style>body { color: red; }</style><p>Clean text</p>`,
|
||||
expected: "Clean text",
|
||||
},
|
||||
{
|
||||
name: "Extracts links correctly",
|
||||
input: `Visit my <a href="https://example.com">website</a> for info.`,
|
||||
expected: "Visit my [website](https://example.com) for info.",
|
||||
},
|
||||
{
|
||||
name: "Converts headers (H1, H2, H3)",
|
||||
input: `<h1>Main Title</h1><h2>Subtitle</h2><h3>Section</h3>`,
|
||||
expected: "# Main Title\n\n## Subtitle\n\n### Section",
|
||||
},
|
||||
{
|
||||
name: "Handles bold and italics",
|
||||
input: `Text <b>bold</b> and <strong>strong</strong>, then <i>italic</i> and <em>em</em>.`,
|
||||
expected: "Text **bold** and **strong**, then *italic* and *em*.",
|
||||
},
|
||||
{
|
||||
name: "Converts lists",
|
||||
input: `<ul><li>First element</li><li>Second element</li></ul>`,
|
||||
expected: "- First element\n- Second element",
|
||||
},
|
||||
{
|
||||
name: "Handles paragraphs and line breaks (<br>)",
|
||||
input: `<p>First paragraph</p><p>Second paragraph with<br>a line break.</p>`,
|
||||
expected: "First paragraph\n\nSecond paragraph with\na line break.",
|
||||
},
|
||||
{
|
||||
name: "Decodes HTML entities",
|
||||
input: `Math: 5 > 3 & 2 < 4. A "quote".`,
|
||||
expected: "Math: 5 > 3 & 2 < 4. A \"quote\".",
|
||||
},
|
||||
{
|
||||
name: "Cleans up residual HTML tags",
|
||||
input: `<div><span>Text inside div and span</span></div>`,
|
||||
expected: "Text inside div and span",
|
||||
},
|
||||
{
|
||||
name: "Removes multiple spaces and excessive empty lines",
|
||||
input: `This text has too many spaces. <br><br><br><br> And too many newlines.`,
|
||||
expected: "This text has too many spaces.\n\nAnd too many newlines.",
|
||||
},
|
||||
{
|
||||
name: "Nested lists with indentation",
|
||||
input: "<ul><li>One<ul><li>Two</li></ul></li></ul>",
|
||||
// Expect the sub-element to have 4 spaces of indentation
|
||||
expected: "- One\n - Two",
|
||||
},
|
||||
{
|
||||
name: "Image support",
|
||||
input: `<img src="image.jpg" alt="alternative text">`,
|
||||
// Correct Markdown syntax for images
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "Image support without alt-text",
|
||||
input: `<img src="image.jpg">`,
|
||||
// If alt is missing, square brackets remain empty
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "XSS Bypass on Links (Obfuscated HTML entities)",
|
||||
// The Go HTML parser resolves entities, so this becomes "javascript:alert(1)"
|
||||
input: `<a href="jav	ascript:alert(1)">Click here</a>`,
|
||||
// Our isSafeHref (if updated with net/url) should neutralize it to "#"
|
||||
expected: "[Click here](#)",
|
||||
},
|
||||
{
|
||||
name: "Empty link or used as anchor",
|
||||
input: `<a name="top"></a>`,
|
||||
// With no text or href, it shouldn't print anything (not even empty brackets)
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "Link without href but with text (Textual anchor)",
|
||||
input: `<a id="top">Back to top</a>`,
|
||||
// Should extract only plain text, without generating a broken Markdown link like [Back to top](#) or [Back to top]()
|
||||
expected: "Back to top",
|
||||
},
|
||||
{
|
||||
name: "Badly spaced bold and italics (Edge Case)",
|
||||
input: `<b> Text </b>`,
|
||||
// In Markdown `** Text **` is often not formatted correctly. The ideal is `**Text**`
|
||||
expected: "**Text**",
|
||||
},
|
||||
{
|
||||
name: "Complex Test - Real Article",
|
||||
input: `
|
||||
<h1>Article Title</h1>
|
||||
<p>This is an <strong>introductory text</strong> with a <a href="http://link.com">link</a>.</p>
|
||||
<h2>Subtitle</h2>
|
||||
<ul>
|
||||
<li>Point one</li>
|
||||
<li>Point two</li>
|
||||
</ul>
|
||||
<script>console.log("do not show me")</script>
|
||||
`,
|
||||
// Note: The indentation of the real HTML test will generate spaces that
|
||||
// regex will clean up.
|
||||
expected: "# Article Title\n\nThis is an **introductory text** with a [link](http://link.com).\n\n## Subtitle\n\n- Point one\n- Point two",
|
||||
},
|
||||
{
|
||||
name: "Ordered list (OL)",
|
||||
input: `<ol><li>First</li><li>Second</li><li>Third</li></ol>`,
|
||||
expected: "1. First\n2. Second\n3. Third",
|
||||
},
|
||||
{
|
||||
name: "Ordered list nested in unordered list",
|
||||
input: `<ul><li>Fruits<ol><li>Apples</li><li>Pears</li></ol></li><li>Vegetables</li></ul>`,
|
||||
expected: "- Fruits\n 1. Apples\n 2. Pears\n- Vegetables",
|
||||
},
|
||||
{
|
||||
name: "Code block (pre/code)",
|
||||
input: "<pre><code>func main() {\n fmt.Println(\"hello\")\n}</code></pre>",
|
||||
expected: "```\nfunc main() {\n fmt.Println(\"hello\")\n}\n```",
|
||||
},
|
||||
{
|
||||
name: "Inline code",
|
||||
input: `<p>Use the command <code>go test ./...</code> to run the tests.</p>`,
|
||||
expected: "Use the command `go test ./...` to run the tests.",
|
||||
},
|
||||
{
|
||||
name: "Simple blockquote",
|
||||
input: `<blockquote><p>An important quote.</p></blockquote>`,
|
||||
expected: "> An important quote.",
|
||||
},
|
||||
{
|
||||
name: "Multiline blockquote",
|
||||
input: `<blockquote><p>First line of the quote.</p><p>Second line of the quote.</p></blockquote>`,
|
||||
expected: "> First line of the quote.\n>\n> Second line of the quote.",
|
||||
},
|
||||
{
|
||||
name: "Strikethrough text (del/s)",
|
||||
input: `This text is <del>deleted</del> and this is <s>crossed out</s>.`,
|
||||
expected: "This text is ~~deleted~~ and this is ~~crossed out~~.",
|
||||
},
|
||||
{
|
||||
name: "Horizontal separator (HR)",
|
||||
input: `<p>Above the line</p><hr><p>Below the line</p>`,
|
||||
expected: "Above the line\n\n---\n\nBelow the line",
|
||||
},
|
||||
{
|
||||
name: "Bold nested in link",
|
||||
input: `<a href="https://example.com"><strong>Linked bold text</strong></a>`,
|
||||
expected: "[**Linked bold text**](https://example.com)",
|
||||
},
|
||||
{
|
||||
name: "data-src Image (lazy loading)",
|
||||
input: `<img data-src="lazy.jpg" alt="Lazy image">`,
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "Image with javascript: src blocked",
|
||||
input: `<img src="javascript:alert(1)" alt="XSS">`,
|
||||
// src is not safe, so the image is not emitted
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "Link with data: href blocked",
|
||||
input: `<a href="data:text/html,<script>alert(1)</script>">Click</a>`,
|
||||
expected: "[Click](#)",
|
||||
},
|
||||
{
|
||||
name: "Deeply nested divs",
|
||||
input: `<div><div><div><div><p>Deeply nested text</p></div></div></div></div>`,
|
||||
expected: "Deeply nested text",
|
||||
},
|
||||
{
|
||||
name: "Non-consecutive headers (H1, H3, H5)",
|
||||
input: `<h1>Title</h1><h3>Subsection</h3><h5>Sub-subsection</h5>`,
|
||||
expected: "# Title\n\n### Subsection\n\n##### Sub-subsection",
|
||||
},
|
||||
{
|
||||
name: "Paragraph with mixed multiple emphasis",
|
||||
input: `<p><strong>Important:</strong> read the <strong><em>critical instructions</em></strong> <em>carefully</em>.</p>`,
|
||||
expected: "**Important:** read the ***critical instructions*** *carefully*.",
|
||||
},
|
||||
{
|
||||
name: "Article with nav and aside sections (noise to filter)",
|
||||
input: `
|
||||
<nav><a href="/home">Home</a><a href="/about-us">About us</a></nav>
|
||||
<article>
|
||||
<h2>Article title</h2>
|
||||
<p>This is the body of the article.</p>
|
||||
</article>
|
||||
<aside><p>Advertisement</p></aside>
|
||||
`,
|
||||
expected: "## Article title\n\nThis is the body of the article.",
|
||||
},
|
||||
{
|
||||
name: "Text with mixed special HTML entities",
|
||||
input: `Copyright © 2024 — All rights reserved ®`,
|
||||
expected: "Copyright © 2024 — All rights reserved ®",
|
||||
},
|
||||
{
|
||||
name: "Mailto link",
|
||||
input: `Write to us at <a href="mailto:info@example.com">info@example.com</a>`,
|
||||
expected: "Write to us at [info@example.com](mailto:info@example.com)",
|
||||
},
|
||||
{
|
||||
name: "Image inside a link (clickable figure)",
|
||||
input: `<a href="https://example.com"><img src="photo.jpg" alt="Photo"></a>`,
|
||||
// The image-link without text must not generate broken markup
|
||||
expected: "[](https://example.com)",
|
||||
},
|
||||
{
|
||||
name: "Empty content or only whitespace",
|
||||
input: ` <p> </p> <div> </div> `,
|
||||
expected: "",
|
||||
},
|
||||
}
|
||||
|
||||
// Iterate over all test cases
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got, err := HtmlToMarkdown(tt.input)
|
||||
if err != nil {
|
||||
logger.ErrorCF("tool", "Failed to parse html to markdown: %s", map[string]any{"error": err.Error()})
|
||||
}
|
||||
|
||||
if got != tt.expected {
|
||||
t.Errorf("\nTest case failed: %s\nInput: %q\nGot: %q\nExpected: %q",
|
||||
tt.name, tt.input, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user