Perf/precompile regex (#687)

* perf: pre-compile regexes at package level

Move regexp.MustCompile calls from inside methods to package-level
variables in web.go (7 regexes) and loader.go (2 regexes).
This avoids repeated compilation on every invocation.

Amp-Thread-ID: https://ampcode.com/threads/T-019c79c3-ea1c-7471-b09d-be90ba0e1ca0
Co-authored-by: Amp <amp@ampcode.com>

* perf: pre-compile regexes at package level

* retain the helpful comment

---------

Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
mattn
2026-02-26 18:44:03 +09:00
committed by GitHub
parent cb3191c8c1
commit 8a1fb03974
3 changed files with 52 additions and 41 deletions
+24 -14
View File
@@ -23,6 +23,19 @@ import (
"github.com/sipeed/picoclaw/pkg/voice"
)
var (
reHeading = regexp.MustCompile(`^#{1,6}\s+(.+)$`)
reBlockquote = regexp.MustCompile(`^>\s*(.*)$`)
reLink = regexp.MustCompile(`\[([^\]]+)\]\(([^)]+)\)`)
reBoldStar = regexp.MustCompile(`\*\*(.+?)\*\*`)
reBoldUnder = regexp.MustCompile(`__(.+?)__`)
reItalic = regexp.MustCompile(`_([^_]+)_`)
reStrike = regexp.MustCompile(`~~(.+?)~~`)
reListItem = regexp.MustCompile(`^[-*]\s+`)
reCodeBlock = regexp.MustCompile("```[\\w]*\\n?([\\s\\S]*?)```")
reInlineCode = regexp.MustCompile("`([^`]+)`")
)
type TelegramChannel struct {
*BaseChannel
bot *telego.Bot
@@ -431,19 +444,18 @@ func markdownToTelegramHTML(text string) string {
inlineCodes := extractInlineCodes(text)
text = inlineCodes.text
text = regexp.MustCompile(`^#{1,6}\s+(.+)$`).ReplaceAllString(text, "$1")
text = reHeading.ReplaceAllString(text, "$1")
text = regexp.MustCompile(`^>\s*(.*)$`).ReplaceAllString(text, "$1")
text = reBlockquote.ReplaceAllString(text, "$1")
text = escapeHTML(text)
text = regexp.MustCompile(`\[([^\]]+)\]\(([^)]+)\)`).ReplaceAllString(text, `<a href="$2">$1</a>`)
text = reLink.ReplaceAllString(text, `<a href="$2">$1</a>`)
text = regexp.MustCompile(`\*\*(.+?)\*\*`).ReplaceAllString(text, "<b>$1</b>")
text = reBoldStar.ReplaceAllString(text, "<b>$1</b>")
text = regexp.MustCompile(`__(.+?)__`).ReplaceAllString(text, "<b>$1</b>")
text = reBoldUnder.ReplaceAllString(text, "<b>$1</b>")
reItalic := regexp.MustCompile(`_([^_]+)_`)
text = reItalic.ReplaceAllStringFunc(text, func(s string) string {
match := reItalic.FindStringSubmatch(s)
if len(match) < 2 {
@@ -452,9 +464,9 @@ func markdownToTelegramHTML(text string) string {
return "<i>" + match[1] + "</i>"
})
text = regexp.MustCompile(`~~(.+?)~~`).ReplaceAllString(text, "<s>$1</s>")
text = reStrike.ReplaceAllString(text, "<s>$1</s>")
text = regexp.MustCompile(`^[-*]\s+`).ReplaceAllString(text, "• ")
text = reListItem.ReplaceAllString(text, "• ")
for i, code := range inlineCodes.codes {
escaped := escapeHTML(code)
@@ -479,8 +491,7 @@ type codeBlockMatch struct {
}
func extractCodeBlocks(text string) codeBlockMatch {
re := regexp.MustCompile("```[\\w]*\\n?([\\s\\S]*?)```")
matches := re.FindAllStringSubmatch(text, -1)
matches := reCodeBlock.FindAllStringSubmatch(text, -1)
codes := make([]string, 0, len(matches))
for _, match := range matches {
@@ -488,7 +499,7 @@ func extractCodeBlocks(text string) codeBlockMatch {
}
i := 0
text = re.ReplaceAllStringFunc(text, func(m string) string {
text = reCodeBlock.ReplaceAllStringFunc(text, func(m string) string {
placeholder := fmt.Sprintf("\x00CB%d\x00", i)
i++
return placeholder
@@ -503,8 +514,7 @@ type inlineCodeMatch struct {
}
func extractInlineCodes(text string) inlineCodeMatch {
re := regexp.MustCompile("`([^`]+)`")
matches := re.FindAllStringSubmatch(text, -1)
matches := reInlineCode.FindAllStringSubmatch(text, -1)
codes := make([]string, 0, len(matches))
for _, match := range matches {
@@ -512,7 +522,7 @@ func extractInlineCodes(text string) inlineCodeMatch {
}
i := 0
text = re.ReplaceAllStringFunc(text, func(m string) string {
text = reInlineCode.ReplaceAllStringFunc(text, func(m string) string {
placeholder := fmt.Sprintf("\x00IC%d\x00", i)
i++
return placeholder
+7 -11
View File
@@ -13,7 +13,11 @@ import (
"github.com/sipeed/picoclaw/pkg/logger"
)
var namePattern = regexp.MustCompile(`^[a-zA-Z0-9]+(-[a-zA-Z0-9]+)*$`)
var (
namePattern = regexp.MustCompile(`^[a-zA-Z0-9]+(-[a-zA-Z0-9]+)*$`)
reFrontmatter = regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---`)
reStripFrontmatter = regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---(?:\r\n|\n|\r)*`)
)
const (
MaxNameLength = 64
@@ -257,10 +261,7 @@ func (sl *SkillsLoader) parseSimpleYAML(content string) map[string]string {
func (sl *SkillsLoader) extractFrontmatter(content string) string {
// Support \n (Unix), \r\n (Windows), and \r (classic Mac) line endings for frontmatter blocks
// (?s) enables DOTALL so . matches newlines;
// ^--- at start, then ... --- at start of line, honoring all three line ending types
re := regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---`)
match := re.FindStringSubmatch(content)
match := reFrontmatter.FindStringSubmatch(content)
if len(match) > 1 {
return match[1]
}
@@ -268,12 +269,7 @@ func (sl *SkillsLoader) extractFrontmatter(content string) string {
}
func (sl *SkillsLoader) stripFrontmatter(content string) string {
// Support \n (Unix), \r\n (Windows), and \r (classic Mac) line endings for frontmatter blocks
// (?s) enables DOTALL so . matches newlines;
// ^--- at start, then ... --- at start of line, honoring all three line ending types
// Match zero or more trailing line endings after closing --- (handles both with and without blank lines)
re := regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---(?:\r\n|\n|\r)*`)
return re.ReplaceAllString(content, "")
return reStripFrontmatter.ReplaceAllString(content, "")
}
func escapeXML(s string) string {
+21 -16
View File
@@ -17,6 +17,19 @@ const (
userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
// Pre-compiled regexes for HTML text extraction
var (
reScript = regexp.MustCompile(`<script[\s\S]*?</script>`)
reStyle = regexp.MustCompile(`<style[\s\S]*?</style>`)
reTags = regexp.MustCompile(`<[^>]+>`)
reWhitespace = regexp.MustCompile(`[^\S\n]+`)
reBlankLines = regexp.MustCompile(`\n{3,}`)
// DuckDuckGo result extraction
reDDGLink = regexp.MustCompile(`<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)</a>`)
reDDGSnippet = regexp.MustCompile(`<a class="result__snippet[^"]*".*?>([\s\S]*?)</a>`)
)
// createHTTPClient creates an HTTP client with optional proxy support
func createHTTPClient(proxyURL string, timeout time.Duration) (*http.Client, error) {
client := &http.Client{
@@ -251,8 +264,7 @@ func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query
// Try finding the result links directly first, as they are the most critical
// Pattern: <a class="result__a" href="...">Title</a>
// The previous regex was a bit strict. Let's make it more flexible for attributes order/content
reLink := regexp.MustCompile(`<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)</a>`)
matches := reLink.FindAllStringSubmatch(html, count+5)
matches := reDDGLink.FindAllStringSubmatch(html, count+5)
if len(matches) == 0 {
return fmt.Sprintf("No results found or extraction failed. Query: %s", query), nil
@@ -269,8 +281,7 @@ func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query
// A better regex approach: iterate through text and find matches in order
// But for now, let's grab all snippets too
reSnippet := regexp.MustCompile(`<a class="result__snippet[^"]*".*?>([\s\S]*?)</a>`)
snippetMatches := reSnippet.FindAllStringSubmatch(html, count+5)
snippetMatches := reDDGSnippet.FindAllStringSubmatch(html, count+5)
maxItems := min(len(matches), count)
@@ -305,8 +316,7 @@ func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query
}
func stripTags(content string) string {
re := regexp.MustCompile(`<[^>]+>`)
return re.ReplaceAllString(content, "")
return reTags.ReplaceAllString(content, "")
}
type PerplexitySearchProvider struct {
@@ -654,19 +664,14 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe
}
func (t *WebFetchTool) extractText(htmlContent string) string {
re := regexp.MustCompile(`<script[\s\S]*?</script>`)
result := re.ReplaceAllLiteralString(htmlContent, "")
re = regexp.MustCompile(`<style[\s\S]*?</style>`)
result = re.ReplaceAllLiteralString(result, "")
re = regexp.MustCompile(`<[^>]+>`)
result = re.ReplaceAllLiteralString(result, "")
result := reScript.ReplaceAllLiteralString(htmlContent, "")
result = reStyle.ReplaceAllLiteralString(result, "")
result = reTags.ReplaceAllLiteralString(result, "")
result = strings.TrimSpace(result)
re = regexp.MustCompile(`[^\S\n]+`)
result = re.ReplaceAllString(result, " ")
re = regexp.MustCompile(`\n{3,}`)
result = re.ReplaceAllString(result, "\n\n")
result = reWhitespace.ReplaceAllString(result, " ")
result = reBlankLines.ReplaceAllString(result, "\n\n")
lines := strings.Split(result, "\n")
var cleanLines []string