// Mirror of https://github.com/sipeed/picoclaw.git (synced 2026-06-12 18:08:54 +00:00).
// 414 lines, 8.7 KiB, Go.

package utils
|
|
|
|
import (
|
|
"bytes"
|
|
"net/url"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// Precompiled patterns used while converting HTML to Markdown and while
// cleaning up the generated Markdown.
var (
	// reSpaces matches a run of one or more spaces or tabs.
	reSpaces = regexp.MustCompile(`[ \t]+`)
	// reNewlines matches three or more consecutive newlines.
	reNewlines = regexp.MustCompile(`\n{3,}`)
	// reEmptyListItem matches a line that holds only a "-" or "*" bullet.
	reEmptyListItem = regexp.MustCompile(`(?m)^[-*]\s*$`)
	// reImageOnlyLink matches a link whose whole text is an image with an
	// empty alt text, where both destinations are written in <...> form.
	reImageOnlyLink = regexp.MustCompile(`\[!\[\]\(<[^>]*>\)\]\(<[^>]*>\)`)
	// reEmptyHeader matches a line that holds only one to six '#' characters.
	reEmptyHeader = regexp.MustCompile(`(?m)^#{1,6}\s*$`)
	// reLeadingLineSpace matches exactly one space or tab at the start of a
	// line followed by a non-blank character; group 2 is that character.
	reLeadingLineSpace = regexp.MustCompile(`(?m)^([ \t])([^ \t\n])`)
)
|
|
|
|
// skipTags is the set of element names whose entire subtree is dropped by
// the converter: non-rendered elements (script, style, head, noscript,
// template) and page chrome (nav, footer, aside, header, form, dialog).
var skipTags = map[string]bool{
	"script": true, "style": true, "head": true,
	"noscript": true, "template": true,
	"nav": true, "footer": true, "aside": true, "header": true, "form": true, "dialog": true,
}
|
|
|
|
// isSafeHref reports whether href may be emitted as a link destination.
//
// Surrounding whitespace is ignored. Values starting with "javascript:",
// "vbscript:" or "data:" (case-insensitive) are rejected, as is anything
// net/url cannot parse. Of the remaining values only those with no scheme
// (relative references) or with an http, https or mailto scheme are accepted.
func isSafeHref(href string) bool {
	trimmed := strings.TrimSpace(href)
	lower := strings.ToLower(trimmed)
	for _, prefix := range [...]string{"javascript:", "vbscript:", "data:"} {
		if strings.HasPrefix(lower, prefix) {
			return false
		}
	}

	u, err := url.Parse(trimmed)
	if err != nil {
		return false
	}

	switch strings.ToLower(u.Scheme) {
	case "", "http", "https", "mailto":
		return true
	}
	return false
}
|
|
|
|
func isSafeImageSrc(src string) bool {
|
|
lower := strings.ToLower(strings.TrimSpace(src))
|
|
if strings.HasPrefix(lower, "data:image/") {
|
|
return true
|
|
}
|
|
return isSafeHref(src)
|
|
}
|
|
|
|
// mdAltEscaper backslash-escapes the characters that would end or corrupt a
// Markdown image alt text. It works in a single pass, so the backslashes it
// inserts are never escaped again.
var mdAltEscaper = strings.NewReplacer(
	`\`, `\\`,
	`[`, `\[`,
	`]`, `\]`,
)

// escapeMdAlt returns s with every backslash and square bracket prefixed by
// a backslash, making it safe to place between "![" and "]".
func escapeMdAlt(s string) string {
	return mdAltEscaper.Replace(s)
}
|
|
|
|
func getAttr(n *html.Node, key string) string {
|
|
for _, a := range n.Attr {
|
|
if a.Key == key {
|
|
return a.Val
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// normalizeAttr removes every newline, carriage return and tab from val and
// then trims leading and trailing whitespace. Interior spaces are kept.
func normalizeAttr(val string) string {
	stripped := strings.Map(func(r rune) rune {
		switch r {
		case '\n', '\r', '\t':
			return -1 // a negative value drops the rune
		}
		return r
	}, val)
	return strings.TrimSpace(stripped)
}
|
|
|
|
func isUnlikelyNode(n *html.Node) bool {
|
|
if n.Type != html.ElementNode {
|
|
return false
|
|
}
|
|
classId := strings.ToLower(getAttr(n, "class") + " " + getAttr(n, "id"))
|
|
if classId == " " {
|
|
return false
|
|
}
|
|
if strings.Contains(classId, "article") || strings.Contains(classId, "main") ||
|
|
strings.Contains(classId, "content") {
|
|
return false
|
|
}
|
|
unlikelyKeywords := []string{
|
|
"menu",
|
|
"nav",
|
|
"footer",
|
|
"sidebar",
|
|
"cookie",
|
|
"banner",
|
|
"sponsor",
|
|
"advert",
|
|
"popup",
|
|
"modal",
|
|
"newsletter",
|
|
"share",
|
|
"social",
|
|
}
|
|
for _, keyword := range unlikelyKeywords {
|
|
if strings.Contains(classId, keyword) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// converter holds the mutable state of a single HTML-to-Markdown conversion.
type converter struct {
	stack      []*bytes.Buffer // output buffers; index 0 is the root, the last one receives writes
	linkHrefs  []string        // hrefs of the open <a> elements that have one
	linkStates []bool          // one entry per open <a>: whether it has an href
	emphStack  []string        // Tracks "**", "*", "~~" for buffered emphasis
	olCounters []int           // next item number of each open <ol>
	inPre      bool            // true while inside <pre>
	listDepth  int             // number of open <ol>/<ul> elements
}

// newConverter returns a converter whose buffer stack holds only the empty
// root buffer.
func newConverter() *converter {
	return &converter{
		stack: []*bytes.Buffer{new(bytes.Buffer)},
	}
}

// write appends s to the buffer on top of the stack.
func (c *converter) write(s string) {
	c.stack[len(c.stack)-1].WriteString(s)
}

// pushBuf starts a new empty buffer; subsequent writes go to it until the
// matching popBuf.
func (c *converter) pushBuf() {
	c.stack = append(c.stack, new(bytes.Buffer))
}

// popBuf removes the buffer on top of the stack and returns its content.
// The root buffer is never removed: when no nested buffer is open, popBuf
// returns "" and leaves the stack untouched, so an unbalanced pop cannot
// make later writes index an empty stack.
func (c *converter) popBuf() string {
	last := len(c.stack) - 1
	if last <= 0 {
		return ""
	}
	top := c.stack[last]
	c.stack[last] = nil // release the buffer once it is off the stack
	c.stack = c.stack[:last]
	return top.String()
}
|
|
|
|
func (c *converter) walk(n *html.Node) {
|
|
if n.Type == html.ElementNode {
|
|
if skipTags[n.Data] {
|
|
return
|
|
}
|
|
if isUnlikelyNode(n) {
|
|
return
|
|
}
|
|
}
|
|
|
|
if n.Type == html.TextNode {
|
|
text := n.Data
|
|
if !c.inPre {
|
|
text = strings.ReplaceAll(text, "\n", " ")
|
|
text = reSpaces.ReplaceAllString(text, " ")
|
|
}
|
|
if text != "" {
|
|
c.write(text)
|
|
}
|
|
return
|
|
}
|
|
|
|
if n.Type != html.ElementNode {
|
|
for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {
|
|
c.walk(ch)
|
|
}
|
|
return
|
|
}
|
|
|
|
// Opening Tags
|
|
switch n.Data {
|
|
|
|
// Buffer emphasis content so we can TrimSpace the inner text,
|
|
// avoiding the regex-across-boundaries bug.
|
|
case "b", "strong":
|
|
c.emphStack = append(c.emphStack, "**")
|
|
c.pushBuf()
|
|
case "i", "em":
|
|
c.emphStack = append(c.emphStack, "*")
|
|
c.pushBuf()
|
|
case "del", "s":
|
|
c.emphStack = append(c.emphStack, "~~")
|
|
c.pushBuf()
|
|
|
|
case "a":
|
|
href := normalizeAttr(getAttr(n, "href"))
|
|
if href != "" && !isSafeHref(href) {
|
|
href = "#"
|
|
}
|
|
hasHref := href != ""
|
|
c.linkStates = append(c.linkStates, hasHref)
|
|
if hasHref {
|
|
c.linkHrefs = append(c.linkHrefs, href)
|
|
c.pushBuf()
|
|
}
|
|
|
|
case "h1":
|
|
c.write("\n\n# ")
|
|
case "h2":
|
|
c.write("\n\n## ")
|
|
case "h3":
|
|
c.write("\n\n### ")
|
|
case "h4":
|
|
c.write("\n\n#### ")
|
|
case "h5":
|
|
c.write("\n\n##### ")
|
|
case "h6":
|
|
c.write("\n\n###### ")
|
|
|
|
case "p":
|
|
c.write("\n\n")
|
|
case "br":
|
|
c.write("\n")
|
|
case "hr":
|
|
c.write("\n\n---\n\n")
|
|
|
|
case "ol":
|
|
c.olCounters = append(c.olCounters, 1)
|
|
// Only write leading newline for top-level list.
|
|
if c.listDepth == 0 {
|
|
c.write("\n")
|
|
}
|
|
c.listDepth++
|
|
case "ul":
|
|
if c.listDepth == 0 {
|
|
c.write("\n")
|
|
}
|
|
c.listDepth++
|
|
case "li":
|
|
c.write("\n")
|
|
if c.listDepth > 1 {
|
|
c.write(strings.Repeat(" ", c.listDepth-1))
|
|
}
|
|
if n.Parent != nil && n.Parent.Data == "ol" && len(c.olCounters) > 0 {
|
|
idx := c.olCounters[len(c.olCounters)-1]
|
|
c.write(strconv.Itoa(idx) + ". ")
|
|
c.olCounters[len(c.olCounters)-1]++
|
|
} else {
|
|
c.write("- ")
|
|
}
|
|
|
|
case "pre":
|
|
c.inPre = true
|
|
c.write("\n\n```\n")
|
|
case "code":
|
|
if !c.inPre {
|
|
c.write("`")
|
|
}
|
|
|
|
case "blockquote":
|
|
c.pushBuf()
|
|
for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {
|
|
c.walk(ch)
|
|
}
|
|
inner := strings.TrimSpace(c.popBuf())
|
|
lines := strings.Split(inner, "\n")
|
|
var quoted []string
|
|
for _, l := range lines {
|
|
if strings.TrimSpace(l) == "" {
|
|
quoted = append(quoted, ">")
|
|
} else {
|
|
quoted = append(quoted, "> "+l)
|
|
}
|
|
}
|
|
var deduped []string
|
|
for i, line := range quoted {
|
|
if line == ">" && i > 0 && deduped[len(deduped)-1] == ">" {
|
|
continue
|
|
}
|
|
deduped = append(deduped, line)
|
|
}
|
|
c.write("\n\n" + strings.Join(deduped, "\n") + "\n\n")
|
|
return
|
|
|
|
case "img":
|
|
src := normalizeAttr(getAttr(n, "src"))
|
|
if src == "" {
|
|
src = normalizeAttr(getAttr(n, "data-src"))
|
|
}
|
|
if src == "" {
|
|
return
|
|
}
|
|
alt := escapeMdAlt(normalizeAttr(getAttr(n, "alt")))
|
|
if isSafeImageSrc(src) {
|
|
c.write("")
|
|
}
|
|
return
|
|
}
|
|
|
|
// Traverse Children
|
|
for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {
|
|
c.walk(ch)
|
|
}
|
|
|
|
// Closing Tags
|
|
switch n.Data {
|
|
|
|
// Pop buffer, trim, wrap with the correct marker.
|
|
case "b", "strong", "i", "em", "del", "s":
|
|
if len(c.emphStack) == 0 {
|
|
break
|
|
}
|
|
marker := c.emphStack[len(c.emphStack)-1]
|
|
c.emphStack = c.emphStack[:len(c.emphStack)-1]
|
|
inner := strings.TrimSpace(c.popBuf())
|
|
if inner != "" {
|
|
c.write(marker + inner + marker)
|
|
}
|
|
|
|
case "a":
|
|
if len(c.linkStates) == 0 {
|
|
break
|
|
}
|
|
hasHref := c.linkStates[len(c.linkStates)-1]
|
|
c.linkStates = c.linkStates[:len(c.linkStates)-1]
|
|
if !hasHref {
|
|
break
|
|
}
|
|
href := c.linkHrefs[len(c.linkHrefs)-1]
|
|
c.linkHrefs = c.linkHrefs[:len(c.linkHrefs)-1]
|
|
inner := strings.TrimSpace(c.popBuf())
|
|
if strings.Contains(inner, "\n") {
|
|
lines := strings.Split(inner, "\n")
|
|
linked := false
|
|
for i, l := range lines {
|
|
cleanLine := strings.TrimSpace(l)
|
|
if cleanLine != "" && !strings.HasPrefix(cleanLine, "![") && !linked {
|
|
lines[i] = "[" + cleanLine + "](" + href + ")"
|
|
linked = true
|
|
}
|
|
}
|
|
c.write(strings.Join(lines, "\n"))
|
|
} else {
|
|
c.write("[" + inner + "](" + href + ")")
|
|
}
|
|
|
|
case "h1",
|
|
"h2",
|
|
"h3",
|
|
"h4",
|
|
"h5",
|
|
"h6",
|
|
"p",
|
|
"div",
|
|
"section",
|
|
"article",
|
|
"header",
|
|
"footer",
|
|
"aside",
|
|
"nav",
|
|
"figure":
|
|
c.write("\n")
|
|
|
|
case "ol":
|
|
c.listDepth--
|
|
if len(c.olCounters) > 0 {
|
|
c.olCounters = c.olCounters[:len(c.olCounters)-1]
|
|
}
|
|
if c.listDepth == 0 {
|
|
c.write("\n")
|
|
}
|
|
case "ul":
|
|
c.listDepth--
|
|
if c.listDepth == 0 {
|
|
c.write("\n")
|
|
}
|
|
|
|
case "pre":
|
|
c.inPre = false
|
|
c.write("\n```\n\n")
|
|
case "code":
|
|
if !c.inPre {
|
|
c.write("`")
|
|
}
|
|
}
|
|
}
|
|
|
|
func HtmlToMarkdown(htmlStr string) (string, error) {
|
|
doc, err := html.Parse(strings.NewReader(htmlStr))
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
c := newConverter()
|
|
c.walk(doc)
|
|
|
|
res := c.stack[0].String()
|
|
|
|
// Post-processing
|
|
res = reImageOnlyLink.ReplaceAllString(res, "")
|
|
res = reEmptyListItem.ReplaceAllString(res, "")
|
|
res = reEmptyHeader.ReplaceAllString(res, "")
|
|
|
|
lines := strings.Split(res, "\n")
|
|
var cleanLines []string
|
|
for _, line := range lines {
|
|
line = strings.TrimRight(line, " \t")
|
|
cleanTest := strings.TrimSpace(line)
|
|
if cleanTest == "[](</>)" || cleanTest == "[](#)" || cleanTest == "-" {
|
|
cleanLines = append(cleanLines, "")
|
|
continue
|
|
}
|
|
cleanLines = append(cleanLines, line)
|
|
}
|
|
res = strings.Join(cleanLines, "\n")
|
|
|
|
res = strings.TrimSpace(res)
|
|
res = reNewlines.ReplaceAllString(res, "\n\n")
|
|
|
|
// Strip a single leading space from lines that are NOT list indentation.
|
|
// "(?m)^([ \t])([^ \t\n])" matches exactly one space/tab at line start followed
|
|
// by a non-whitespace char, so " - nested" (4 spaces) is left untouched.
|
|
res = reLeadingLineSpace.ReplaceAllString(res, "$2")
|
|
|
|
return res, nil
|
|
}
|