mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
12f4029610
* feat: telegram use parse mode ModeMarkdownV2 instead of ModeHTML * handle expandable block quotation starts, add test for all md2 formats * fix: linter issue * feat: added flag use_markdown_v2, corrected config, updated documentation * move parseChatID to parser_markdown_to_html * fix: tests and linter issues * fix: case with ~ * test: fixed Test_markdownToTelegramMarkdownV2 * fix: regex block-quote line > * fix: linter issues * fix: send chunk param mismatched, in edit msg use HTML parse mode too * fix: remove from .gitignore redundant comment
198 lines
5.9 KiB
Go
198 lines
5.9 KiB
Go
package telegram
|
|
|
|
import (
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// mdV2SpecialChars is the set of runes that escapeMarkdownV2 prefixes with a
// backslash when they appear in plain text destined for Telegram's
// MarkdownV2 parse mode.
var mdV2SpecialChars = map[rune]bool{
	// Inline formatting and link delimiters.
	'_': true,
	'*': true,
	'~': true,
	'`': true,
	'|': true,
	'[': true,
	']': true,
	'(': true,
	')': true,

	// Other punctuation escaped in plain text.
	// NOTE(review): '<' does not appear in Telegram's documented list of
	// reserved characters; escaping it anyway should be harmless — confirm.
	'>': true,
	'<': true,
	'#': true,
	'+': true,
	'-': true,
	'=': true,
	'{': true,
	'}': true,
	'.': true,
	'!': true,

	// The escape character itself.
	'\\': true,
}
|
|
|
|
// entityPattern describes one Telegram MarkdownV2 entity type: the regex that
// locates a span of that type and the delimiters that surround its content.
type entityPattern struct {
	// re matches one complete span, delimiters included.
	re *regexp.Regexp

	// open and close are the literal delimiters at the start and end of a
	// matched span. close is empty for entities that have no closing
	// delimiter of their own in this table (quotes, links, custom emoji).
	open, close string
}

// allEntityPatterns lists every recognized entity in priority order.
//
// processText asks each regex for its first occurrence in the text and keeps
// the leftmost one; when several patterns match at the same offset the
// longest match wins, and when the lengths are equal too the entry listed
// first here wins. Longer / more specific delimiters are therefore listed
// before the shorter ones they contain.
var allEntityPatterns = []entityPattern{
	// Fenced code block, optionally with a language tag on the opening line.
	{re: regexp.MustCompile("(?s)```(?:[\\w]*\\n)?[\\s\\S]*?```"), open: "```", close: "```"},
	// Inline code. The character class must exclude the backslash so that an
	// escaped backtick (\`) is consumed by the `\\.` alternative instead of
	// terminating the span early. Because this pattern lives in an
	// interpreted string, every regex backslash has to be doubled.
	{re: regexp.MustCompile("`(?:[^`\\\\\\n]|\\\\.)*`"), open: "`", close: "`"},
	// Expandable block-quote opener **>… (runs to the end of the line).
	{re: regexp.MustCompile(`(?m)\*\*>(?:[^\n]*)`), open: "**>", close: ""},
	// Block-quote line >… (must start at the beginning of a line).
	{re: regexp.MustCompile(`(?m)^>(?:[^\n]*)`), open: ">", close: ""},
	// Custom emoji / timestamp ![…](…) — listed before the plain link.
	{re: regexp.MustCompile(`!\[[^\]]*\]\([^)]*\)`), open: "!", close: ""},
	// Inline URL / user mention […](…).
	{re: regexp.MustCompile(`\[[^\]]*\]\([^)]*\)`), open: "[", close: ""},
	// Spoiler ||…||.
	{re: regexp.MustCompile(`\|\|(?:[^|\\\n]|\\.)*\|\|`), open: "||", close: "||"},
	// Underline __…__ — listed before italic, which shares the '_' delimiter.
	{re: regexp.MustCompile(`__(?:[^_\\\n]|\\.)*__`), open: "__", close: "__"},
	// Bold *…*.
	{re: regexp.MustCompile(`\*(?:[^*\\\n]|\\.)*\*`), open: "*", close: "*"},
	// Italic _…_.
	{re: regexp.MustCompile(`_(?:[^_\\\n]|\\.)*_`), open: "_", close: "_"},
	// Strikethrough ~…~.
	{re: regexp.MustCompile(`~(?:[^~\\\n]|\\.)*~`), open: "~", close: "~"},
}
|
|
|
|
// verbatimEntities is keyed by an entity's opening delimiter (the `open`
// field of its pattern). A span whose delimiter is listed here is copied to
// the output exactly as matched: nothing inside it is escaped or inspected
// for nested entities.
var verbatimEntities = map[string]bool{
	"```": true, // fenced code block
	"`":   true, // inline code
	"**>": true, // expandable block-quote opener
	">":   true, // block-quote line
	"!":   true, // custom emoji / timestamp
	"[":   true, // inline URL / user mention
}
|
|
|
|
// markdownToTelegramMarkdownV2 converts a Markdown string into a string safe
|
|
// for sending with Telegram's MarkdownV2 parse mode.
|
|
//
|
|
// Rules:
|
|
// - Markdown headings (# … ######) are converted to *bold*.
|
|
// - **bold** Markdown syntax is converted to *bold*.
|
|
// - Recognized Telegram MarkdownV2 entity spans are preserved; their inner
|
|
// content is processed recursively so that nested valid entities are kept
|
|
// intact while stray special characters are escaped.
|
|
// - All plain-text segments have their MarkdownV2 special characters escaped.
|
|
//
|
|
// Reference: https://core.telegram.org/bots/api#formatting-options
|
|
func markdownToTelegramMarkdownV2(text string) string {
|
|
// 1. Convert Markdown headings → *escaped heading text*
|
|
text = reHeading.ReplaceAllStringFunc(text, func(match string) string {
|
|
sub := reHeading.FindStringSubmatch(match)
|
|
if len(sub) < 2 {
|
|
return match
|
|
}
|
|
// The heading content is fresh plain text — escape everything
|
|
// including * so the resulting *…* bold span stays valid.
|
|
return "*" + escapeMarkdownV2(sub[1]) + "*"
|
|
})
|
|
|
|
// 2. Convert **bold** → *bold*
|
|
text = reBoldStar.ReplaceAllString(text, "*$1*")
|
|
|
|
// 3. Recursively escape the full string.
|
|
return processText(text)
|
|
}
|
|
|
|
// processText walks `text`, finds the leftmost / longest matching entity,
|
|
// escapes the gap before it, processes the entity (recursing into its inner
|
|
// content when appropriate), then continues with the remainder.
|
|
func processText(text string) string {
|
|
if text == "" {
|
|
return ""
|
|
}
|
|
|
|
// Find the leftmost match among all entity patterns.
|
|
bestStart := -1
|
|
bestEnd := -1
|
|
var bestPat *entityPattern
|
|
|
|
for i := range allEntityPatterns {
|
|
p := &allEntityPatterns[i]
|
|
loc := p.re.FindStringIndex(text)
|
|
if loc == nil {
|
|
continue
|
|
}
|
|
if bestStart == -1 || loc[0] < bestStart ||
|
|
(loc[0] == bestStart && (loc[1]-loc[0]) > (bestEnd-bestStart)) {
|
|
bestStart = loc[0]
|
|
bestEnd = loc[1]
|
|
bestPat = p
|
|
}
|
|
}
|
|
|
|
if bestPat == nil {
|
|
// No entity found — escape everything.
|
|
return escapeMarkdownV2(text)
|
|
}
|
|
|
|
var b strings.Builder
|
|
|
|
// Plain text before the entity.
|
|
if bestStart > 0 {
|
|
b.WriteString(escapeMarkdownV2(text[:bestStart]))
|
|
}
|
|
|
|
// The matched entity span.
|
|
matched := text[bestStart:bestEnd]
|
|
|
|
if verbatimEntities[bestPat.open] {
|
|
// Code blocks, URLs, quotes: pass through completely untouched.
|
|
b.WriteString(matched)
|
|
} else {
|
|
// Inline formatting (bold, italic, underline, strikethrough, spoiler):
|
|
// keep the delimiters and recursively process the inner content so that
|
|
// nested entities survive but stray specials get escaped.
|
|
openLen := len(bestPat.open)
|
|
closeLen := len(bestPat.close)
|
|
inner := matched[openLen : len(matched)-closeLen]
|
|
|
|
b.WriteString(bestPat.open)
|
|
b.WriteString(processText(inner))
|
|
b.WriteString(bestPat.close)
|
|
}
|
|
|
|
// Continue with the remainder of the string.
|
|
b.WriteString(processText(text[bestEnd:]))
|
|
|
|
return b.String()
|
|
}
|
|
|
|
// escapeMarkdownV2 escapes every MarkdownV2 special character in a plain-text
|
|
// segment (i.e. a segment that is not part of any recognized entity).
|
|
// Already-escaped sequences (backslash + char) are forwarded verbatim to avoid
|
|
// double-escaping.
|
|
func escapeMarkdownV2(s string) string {
|
|
var b strings.Builder
|
|
b.Grow(len(s) + 8)
|
|
runes := []rune(s)
|
|
for i := 0; i < len(runes); i++ {
|
|
ch := runes[i]
|
|
// Forward an existing escape sequence verbatim.
|
|
if ch == '\\' && i+1 < len(runes) {
|
|
b.WriteRune(ch)
|
|
b.WriteRune(runes[i+1])
|
|
i++
|
|
continue
|
|
}
|
|
if mdV2SpecialChars[ch] {
|
|
b.WriteByte('\\')
|
|
}
|
|
b.WriteRune(ch)
|
|
}
|
|
return b.String()
|
|
}
|