diff --git a/pkg/channels/telegram.go b/pkg/channels/telegram.go index 524494849..6592d9bc0 100644 --- a/pkg/channels/telegram.go +++ b/pkg/channels/telegram.go @@ -23,6 +23,19 @@ import ( "github.com/sipeed/picoclaw/pkg/voice" ) +var ( + reHeading = regexp.MustCompile(`^#{1,6}\s+(.+)$`) + reBlockquote = regexp.MustCompile(`^>\s*(.*)$`) + reLink = regexp.MustCompile(`\[([^\]]+)\]\(([^)]+)\)`) + reBoldStar = regexp.MustCompile(`\*\*(.+?)\*\*`) + reBoldUnder = regexp.MustCompile(`__(.+?)__`) + reItalic = regexp.MustCompile(`_([^_]+)_`) + reStrike = regexp.MustCompile(`~~(.+?)~~`) + reListItem = regexp.MustCompile(`^[-*]\s+`) + reCodeBlock = regexp.MustCompile("```[\\w]*\\n?([\\s\\S]*?)```") + reInlineCode = regexp.MustCompile("`([^`]+)`") +) + type TelegramChannel struct { *BaseChannel bot *telego.Bot @@ -431,19 +444,18 @@ func markdownToTelegramHTML(text string) string { inlineCodes := extractInlineCodes(text) text = inlineCodes.text - text = regexp.MustCompile(`^#{1,6}\s+(.+)$`).ReplaceAllString(text, "$1") + text = reHeading.ReplaceAllString(text, "$1") - text = regexp.MustCompile(`^>\s*(.*)$`).ReplaceAllString(text, "$1") + text = reBlockquote.ReplaceAllString(text, "$1") text = escapeHTML(text) - text = regexp.MustCompile(`\[([^\]]+)\]\(([^)]+)\)`).ReplaceAllString(text, `$1`) + text = reLink.ReplaceAllString(text, `$1`) - text = regexp.MustCompile(`\*\*(.+?)\*\*`).ReplaceAllString(text, "$1") + text = reBoldStar.ReplaceAllString(text, "$1") - text = regexp.MustCompile(`__(.+?)__`).ReplaceAllString(text, "$1") + text = reBoldUnder.ReplaceAllString(text, "$1") - reItalic := regexp.MustCompile(`_([^_]+)_`) text = reItalic.ReplaceAllStringFunc(text, func(s string) string { match := reItalic.FindStringSubmatch(s) if len(match) < 2 { @@ -452,9 +464,9 @@ func markdownToTelegramHTML(text string) string { return "" + match[1] + "" }) - text = regexp.MustCompile(`~~(.+?)~~`).ReplaceAllString(text, "$1") + text = reStrike.ReplaceAllString(text, "$1") - text = regexp.MustCompile(`^[-*]\s+`).ReplaceAllString(text, "• ") + text = reListItem.ReplaceAllString(text, "• ") for i, code := range inlineCodes.codes { escaped := escapeHTML(code) @@ -479,8 +491,7 @@ type codeBlockMatch struct { } func extractCodeBlocks(text string) codeBlockMatch { - re := regexp.MustCompile("```[\\w]*\\n?([\\s\\S]*?)```") - matches := re.FindAllStringSubmatch(text, -1) + matches := reCodeBlock.FindAllStringSubmatch(text, -1) codes := make([]string, 0, len(matches)) for _, match := range matches { @@ -488,7 +499,7 @@ func extractCodeBlocks(text string) codeBlockMatch { } i := 0 - text = re.ReplaceAllStringFunc(text, func(m string) string { + text = reCodeBlock.ReplaceAllStringFunc(text, func(m string) string { placeholder := fmt.Sprintf("\x00CB%d\x00", i) i++ return placeholder @@ -503,8 +514,7 @@ type inlineCodeMatch struct { } func extractInlineCodes(text string) inlineCodeMatch { - re := regexp.MustCompile("`([^`]+)`") - matches := re.FindAllStringSubmatch(text, -1) + matches := reInlineCode.FindAllStringSubmatch(text, -1) codes := make([]string, 0, len(matches)) for _, match := range matches { @@ -512,7 +522,7 @@ func extractInlineCodes(text string) inlineCodeMatch { } i := 0 - text = re.ReplaceAllStringFunc(text, func(m string) string { + text = reInlineCode.ReplaceAllStringFunc(text, func(m string) string { placeholder := fmt.Sprintf("\x00IC%d\x00", i) i++ return placeholder diff --git a/pkg/skills/loader.go b/pkg/skills/loader.go index 5749d8983..67d3e70e0 100644 --- a/pkg/skills/loader.go +++ b/pkg/skills/loader.go @@ -13,7 +13,11 @@ import ( "github.com/sipeed/picoclaw/pkg/logger" ) -var namePattern = regexp.MustCompile(`^[a-zA-Z0-9]+(-[a-zA-Z0-9]+)*$`) +var ( + namePattern = regexp.MustCompile(`^[a-zA-Z0-9]+(-[a-zA-Z0-9]+)*$`) + reFrontmatter = regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---`) + reStripFrontmatter = regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---(?:\r\n|\n|\r)*`) +) const ( MaxNameLength = 64 @@ -257,10 +261,7 @@ func (sl *SkillsLoader) parseSimpleYAML(content string) map[string]string { func (sl *SkillsLoader) extractFrontmatter(content string) string { // Support \n (Unix), \r\n (Windows), and \r (classic Mac) line endings for frontmatter blocks - // (?s) enables DOTALL so . matches newlines; - // ^--- at start, then ... --- at start of line, honoring all three line ending types - re := regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---`) - match := re.FindStringSubmatch(content) + match := reFrontmatter.FindStringSubmatch(content) if len(match) > 1 { return match[1] } @@ -268,12 +269,7 @@ func (sl *SkillsLoader) extractFrontmatter(content string) string { } func (sl *SkillsLoader) stripFrontmatter(content string) string { - // Support \n (Unix), \r\n (Windows), and \r (classic Mac) line endings for frontmatter blocks - // (?s) enables DOTALL so . matches newlines; - // ^--- at start, then ... --- at start of line, honoring all three line ending types - // Match zero or more trailing line endings after closing --- (handles both with and without blank lines) - re := regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---(?:\r\n|\n|\r)*`) - return re.ReplaceAllString(content, "") + return reStripFrontmatter.ReplaceAllString(content, "") } func escapeXML(s string) string { diff --git a/pkg/tools/web.go b/pkg/tools/web.go index 44df28215..8ba2a723a 100644 --- a/pkg/tools/web.go +++ b/pkg/tools/web.go @@ -17,6 +17,19 @@ const ( userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ) +// Pre-compiled regexes for HTML text extraction +var ( + reScript = regexp.MustCompile(``) + reStyle = regexp.MustCompile(``) + reTags = regexp.MustCompile(`<[^>]+>`) + reWhitespace = regexp.MustCompile(`[^\S\n]+`) + reBlankLines = regexp.MustCompile(`\n{3,}`) + + // DuckDuckGo result extraction + reDDGLink = regexp.MustCompile(`]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)`) + reDDGSnippet = regexp.MustCompile(`([\s\S]*?)`) +) + // createHTTPClient creates an HTTP client with optional proxy support func createHTTPClient(proxyURL string, timeout time.Duration) (*http.Client, error) { client := &http.Client{ @@ -251,8 +264,7 @@ func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query // Try finding the result links directly first, as they are the most critical // Pattern: Title // The previous regex was a bit strict. Let's make it more flexible for attributes order/content - reLink := regexp.MustCompile(`]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)`) - matches := reLink.FindAllStringSubmatch(html, count+5) + matches := reDDGLink.FindAllStringSubmatch(html, count+5) if len(matches) == 0 { return fmt.Sprintf("No results found or extraction failed. Query: %s", query), nil @@ -269,8 +281,7 @@ func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query // A better regex approach: iterate through text and find matches in order // But for now, let's grab all snippets too - reSnippet := regexp.MustCompile(`([\s\S]*?)`) - snippetMatches := reSnippet.FindAllStringSubmatch(html, count+5) + snippetMatches := reDDGSnippet.FindAllStringSubmatch(html, count+5) maxItems := min(len(matches), count) @@ -305,8 +316,7 @@ func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query } func stripTags(content string) string { - re := regexp.MustCompile(`<[^>]+>`) - return re.ReplaceAllString(content, "") + return reTags.ReplaceAllString(content, "") } type PerplexitySearchProvider struct { @@ -654,19 +664,14 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe } func (t *WebFetchTool) extractText(htmlContent string) string { - re := regexp.MustCompile(``) - result := re.ReplaceAllLiteralString(htmlContent, "") - re = regexp.MustCompile(``) - result = re.ReplaceAllLiteralString(result, "") - re = regexp.MustCompile(`<[^>]+>`) - result = re.ReplaceAllLiteralString(result, "") + result := reScript.ReplaceAllLiteralString(htmlContent, "") + result = reStyle.ReplaceAllLiteralString(result, "") + result = reTags.ReplaceAllLiteralString(result, "") result = strings.TrimSpace(result) - re = regexp.MustCompile(`[^\S\n]+`) - result = re.ReplaceAllString(result, " ") - re = regexp.MustCompile(`\n{3,}`) - result = re.ReplaceAllString(result, "\n\n") + result = reWhitespace.ReplaceAllString(result, " ") + result = reBlankLines.ReplaceAllString(result, "\n\n") lines := strings.Split(result, "\n") var cleanLines []string