mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
Perf/precompile regex (#687)
* perf: pre-compile regexes at package level

  Move regexp.MustCompile calls from inside methods to package-level variables in web.go (7 regexes) and loader.go (2 regexes). This avoids repeated compilation on every invocation.

  Amp-Thread-ID: https://ampcode.com/threads/T-019c79c3-ea1c-7471-b09d-be90ba0e1ca0
  Co-authored-by: Amp <amp@ampcode.com>

* perf: pre-compile regexes at package level

* retain the helpful comment

---------

Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
+24
-14
@@ -23,6 +23,19 @@ import (
|
||||
"github.com/sipeed/picoclaw/pkg/voice"
|
||||
)
|
||||
|
||||
var (
|
||||
reHeading = regexp.MustCompile(`^#{1,6}\s+(.+)$`)
|
||||
reBlockquote = regexp.MustCompile(`^>\s*(.*)$`)
|
||||
reLink = regexp.MustCompile(`\[([^\]]+)\]\(([^)]+)\)`)
|
||||
reBoldStar = regexp.MustCompile(`\*\*(.+?)\*\*`)
|
||||
reBoldUnder = regexp.MustCompile(`__(.+?)__`)
|
||||
reItalic = regexp.MustCompile(`_([^_]+)_`)
|
||||
reStrike = regexp.MustCompile(`~~(.+?)~~`)
|
||||
reListItem = regexp.MustCompile(`^[-*]\s+`)
|
||||
reCodeBlock = regexp.MustCompile("```[\\w]*\\n?([\\s\\S]*?)```")
|
||||
reInlineCode = regexp.MustCompile("`([^`]+)`")
|
||||
)
|
||||
|
||||
type TelegramChannel struct {
|
||||
*BaseChannel
|
||||
bot *telego.Bot
|
||||
@@ -431,19 +444,18 @@ func markdownToTelegramHTML(text string) string {
|
||||
inlineCodes := extractInlineCodes(text)
|
||||
text = inlineCodes.text
|
||||
|
||||
text = regexp.MustCompile(`^#{1,6}\s+(.+)$`).ReplaceAllString(text, "$1")
|
||||
text = reHeading.ReplaceAllString(text, "$1")
|
||||
|
||||
text = regexp.MustCompile(`^>\s*(.*)$`).ReplaceAllString(text, "$1")
|
||||
text = reBlockquote.ReplaceAllString(text, "$1")
|
||||
|
||||
text = escapeHTML(text)
|
||||
|
||||
text = regexp.MustCompile(`\[([^\]]+)\]\(([^)]+)\)`).ReplaceAllString(text, `<a href="$2">$1</a>`)
|
||||
text = reLink.ReplaceAllString(text, `<a href="$2">$1</a>`)
|
||||
|
||||
text = regexp.MustCompile(`\*\*(.+?)\*\*`).ReplaceAllString(text, "<b>$1</b>")
|
||||
text = reBoldStar.ReplaceAllString(text, "<b>$1</b>")
|
||||
|
||||
text = regexp.MustCompile(`__(.+?)__`).ReplaceAllString(text, "<b>$1</b>")
|
||||
text = reBoldUnder.ReplaceAllString(text, "<b>$1</b>")
|
||||
|
||||
reItalic := regexp.MustCompile(`_([^_]+)_`)
|
||||
text = reItalic.ReplaceAllStringFunc(text, func(s string) string {
|
||||
match := reItalic.FindStringSubmatch(s)
|
||||
if len(match) < 2 {
|
||||
@@ -452,9 +464,9 @@ func markdownToTelegramHTML(text string) string {
|
||||
return "<i>" + match[1] + "</i>"
|
||||
})
|
||||
|
||||
text = regexp.MustCompile(`~~(.+?)~~`).ReplaceAllString(text, "<s>$1</s>")
|
||||
text = reStrike.ReplaceAllString(text, "<s>$1</s>")
|
||||
|
||||
text = regexp.MustCompile(`^[-*]\s+`).ReplaceAllString(text, "• ")
|
||||
text = reListItem.ReplaceAllString(text, "• ")
|
||||
|
||||
for i, code := range inlineCodes.codes {
|
||||
escaped := escapeHTML(code)
|
||||
@@ -479,8 +491,7 @@ type codeBlockMatch struct {
|
||||
}
|
||||
|
||||
func extractCodeBlocks(text string) codeBlockMatch {
|
||||
re := regexp.MustCompile("```[\\w]*\\n?([\\s\\S]*?)```")
|
||||
matches := re.FindAllStringSubmatch(text, -1)
|
||||
matches := reCodeBlock.FindAllStringSubmatch(text, -1)
|
||||
|
||||
codes := make([]string, 0, len(matches))
|
||||
for _, match := range matches {
|
||||
@@ -488,7 +499,7 @@ func extractCodeBlocks(text string) codeBlockMatch {
|
||||
}
|
||||
|
||||
i := 0
|
||||
text = re.ReplaceAllStringFunc(text, func(m string) string {
|
||||
text = reCodeBlock.ReplaceAllStringFunc(text, func(m string) string {
|
||||
placeholder := fmt.Sprintf("\x00CB%d\x00", i)
|
||||
i++
|
||||
return placeholder
|
||||
@@ -503,8 +514,7 @@ type inlineCodeMatch struct {
|
||||
}
|
||||
|
||||
func extractInlineCodes(text string) inlineCodeMatch {
|
||||
re := regexp.MustCompile("`([^`]+)`")
|
||||
matches := re.FindAllStringSubmatch(text, -1)
|
||||
matches := reInlineCode.FindAllStringSubmatch(text, -1)
|
||||
|
||||
codes := make([]string, 0, len(matches))
|
||||
for _, match := range matches {
|
||||
@@ -512,7 +522,7 @@ func extractInlineCodes(text string) inlineCodeMatch {
|
||||
}
|
||||
|
||||
i := 0
|
||||
text = re.ReplaceAllStringFunc(text, func(m string) string {
|
||||
text = reInlineCode.ReplaceAllStringFunc(text, func(m string) string {
|
||||
placeholder := fmt.Sprintf("\x00IC%d\x00", i)
|
||||
i++
|
||||
return placeholder
|
||||
|
||||
+7
-11
@@ -13,7 +13,11 @@ import (
|
||||
"github.com/sipeed/picoclaw/pkg/logger"
|
||||
)
|
||||
|
||||
var namePattern = regexp.MustCompile(`^[a-zA-Z0-9]+(-[a-zA-Z0-9]+)*$`)
|
||||
var (
|
||||
namePattern = regexp.MustCompile(`^[a-zA-Z0-9]+(-[a-zA-Z0-9]+)*$`)
|
||||
reFrontmatter = regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---`)
|
||||
reStripFrontmatter = regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---(?:\r\n|\n|\r)*`)
|
||||
)
|
||||
|
||||
const (
|
||||
MaxNameLength = 64
|
||||
@@ -257,10 +261,7 @@ func (sl *SkillsLoader) parseSimpleYAML(content string) map[string]string {
|
||||
|
||||
func (sl *SkillsLoader) extractFrontmatter(content string) string {
|
||||
// Support \n (Unix), \r\n (Windows), and \r (classic Mac) line endings for frontmatter blocks
|
||||
// (?s) enables DOTALL so . matches newlines;
|
||||
// ^--- at start, then ... --- at start of line, honoring all three line ending types
|
||||
re := regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---`)
|
||||
match := re.FindStringSubmatch(content)
|
||||
match := reFrontmatter.FindStringSubmatch(content)
|
||||
if len(match) > 1 {
|
||||
return match[1]
|
||||
}
|
||||
@@ -268,12 +269,7 @@ func (sl *SkillsLoader) extractFrontmatter(content string) string {
|
||||
}
|
||||
|
||||
func (sl *SkillsLoader) stripFrontmatter(content string) string {
|
||||
// Support \n (Unix), \r\n (Windows), and \r (classic Mac) line endings for frontmatter blocks
|
||||
// (?s) enables DOTALL so . matches newlines;
|
||||
// ^--- at start, then ... --- at start of line, honoring all three line ending types
|
||||
// Match zero or more trailing line endings after closing --- (handles both with and without blank lines)
|
||||
re := regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---(?:\r\n|\n|\r)*`)
|
||||
return re.ReplaceAllString(content, "")
|
||||
return reStripFrontmatter.ReplaceAllString(content, "")
|
||||
}
|
||||
|
||||
func escapeXML(s string) string {
|
||||
|
||||
+21
-16
@@ -17,6 +17,19 @@ const (
|
||||
userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
// Pre-compiled regexes for HTML text extraction
|
||||
var (
|
||||
reScript = regexp.MustCompile(`<script[\s\S]*?</script>`)
|
||||
reStyle = regexp.MustCompile(`<style[\s\S]*?</style>`)
|
||||
reTags = regexp.MustCompile(`<[^>]+>`)
|
||||
reWhitespace = regexp.MustCompile(`[^\S\n]+`)
|
||||
reBlankLines = regexp.MustCompile(`\n{3,}`)
|
||||
|
||||
// DuckDuckGo result extraction
|
||||
reDDGLink = regexp.MustCompile(`<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)</a>`)
|
||||
reDDGSnippet = regexp.MustCompile(`<a class="result__snippet[^"]*".*?>([\s\S]*?)</a>`)
|
||||
)
|
||||
|
||||
// createHTTPClient creates an HTTP client with optional proxy support
|
||||
func createHTTPClient(proxyURL string, timeout time.Duration) (*http.Client, error) {
|
||||
client := &http.Client{
|
||||
@@ -251,8 +264,7 @@ func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query
|
||||
// Try finding the result links directly first, as they are the most critical
|
||||
// Pattern: <a class="result__a" href="...">Title</a>
|
||||
// The previous regex was a bit strict. Let's make it more flexible for attributes order/content
|
||||
reLink := regexp.MustCompile(`<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)</a>`)
|
||||
matches := reLink.FindAllStringSubmatch(html, count+5)
|
||||
matches := reDDGLink.FindAllStringSubmatch(html, count+5)
|
||||
|
||||
if len(matches) == 0 {
|
||||
return fmt.Sprintf("No results found or extraction failed. Query: %s", query), nil
|
||||
@@ -269,8 +281,7 @@ func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query
|
||||
|
||||
// A better regex approach: iterate through text and find matches in order
|
||||
// But for now, let's grab all snippets too
|
||||
reSnippet := regexp.MustCompile(`<a class="result__snippet[^"]*".*?>([\s\S]*?)</a>`)
|
||||
snippetMatches := reSnippet.FindAllStringSubmatch(html, count+5)
|
||||
snippetMatches := reDDGSnippet.FindAllStringSubmatch(html, count+5)
|
||||
|
||||
maxItems := min(len(matches), count)
|
||||
|
||||
@@ -305,8 +316,7 @@ func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query
|
||||
}
|
||||
|
||||
func stripTags(content string) string {
|
||||
re := regexp.MustCompile(`<[^>]+>`)
|
||||
return re.ReplaceAllString(content, "")
|
||||
return reTags.ReplaceAllString(content, "")
|
||||
}
|
||||
|
||||
type PerplexitySearchProvider struct {
|
||||
@@ -654,19 +664,14 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe
|
||||
}
|
||||
|
||||
func (t *WebFetchTool) extractText(htmlContent string) string {
|
||||
re := regexp.MustCompile(`<script[\s\S]*?</script>`)
|
||||
result := re.ReplaceAllLiteralString(htmlContent, "")
|
||||
re = regexp.MustCompile(`<style[\s\S]*?</style>`)
|
||||
result = re.ReplaceAllLiteralString(result, "")
|
||||
re = regexp.MustCompile(`<[^>]+>`)
|
||||
result = re.ReplaceAllLiteralString(result, "")
|
||||
result := reScript.ReplaceAllLiteralString(htmlContent, "")
|
||||
result = reStyle.ReplaceAllLiteralString(result, "")
|
||||
result = reTags.ReplaceAllLiteralString(result, "")
|
||||
|
||||
result = strings.TrimSpace(result)
|
||||
|
||||
re = regexp.MustCompile(`[^\S\n]+`)
|
||||
result = re.ReplaceAllString(result, " ")
|
||||
re = regexp.MustCompile(`\n{3,}`)
|
||||
result = re.ReplaceAllString(result, "\n\n")
|
||||
result = reWhitespace.ReplaceAllString(result, " ")
|
||||
result = reBlankLines.ReplaceAllString(result, "\n\n")
|
||||
|
||||
lines := strings.Split(result, "\n")
|
||||
var cleanLines []string
|
||||
|
||||
Reference in New Issue
Block a user