Perf/precompile regex (#687)

* perf: pre-compile regexes at package level

Move regexp.MustCompile calls from inside methods to package-level
variables in web.go (7 regexes) and loader.go (2 regexes).
This avoids repeated compilation on every invocation.

Amp-Thread-ID: https://ampcode.com/threads/T-019c79c3-ea1c-7471-b09d-be90ba0e1ca0
Co-authored-by: Amp <amp@ampcode.com>

* perf: pre-compile regexes at package level

* retain the helpful comment

---------

Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
mattn
2026-02-26 18:44:03 +09:00
committed by GitHub
parent cb3191c8c1
commit 8a1fb03974
3 changed files with 52 additions and 41 deletions
+21 -16
View File
@@ -17,6 +17,19 @@ const (
userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
// Pre-compiled regexes for HTML text extraction.
// They are compiled once at package level so the functions below do not
// recompile a pattern on every invocation.
var (
// reScript matches from "<script" through the nearest "</script>"; [\s\S] lets
// the non-greedy body span newlines. No (?i) flag, so matching is case-sensitive.
reScript = regexp.MustCompile(`<script[\s\S]*?</script>`)
// reStyle is the same shape as reScript, for "<style" ... "</style>".
reStyle = regexp.MustCompile(`<style[\s\S]*?</style>`)
// reTags matches "<", one or more characters that are not ">", then ">".
// Used by both stripTags and extractText to drop markup.
reTags = regexp.MustCompile(`<[^>]+>`)
// reWhitespace matches runs of whitespace that contain no newline;
// extractText replaces each run with a single space.
reWhitespace = regexp.MustCompile(`[^\S\n]+`)
// reBlankLines matches three or more consecutive newlines;
// extractText replaces each run with "\n\n".
reBlankLines = regexp.MustCompile(`\n{3,}`)
// DuckDuckGo result extraction
// reDDGLink matches an <a> tag whose class attribute contains "result__a" and
// which has an href attribute later in the same tag (class must come before
// href). Group 1 is the href value, group 2 the content up to the nearest "</a>".
reDDGLink = regexp.MustCompile(`<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)</a>`)
// reDDGSnippet matches a tag that begins exactly with `<a class="result__snippet`.
// Group 1 is the content up to the nearest "</a>". The `.*?` before ">" uses
// plain "." and therefore does not cross a newline under Go's default flags.
reDDGSnippet = regexp.MustCompile(`<a class="result__snippet[^"]*".*?>([\s\S]*?)</a>`)
)
// createHTTPClient creates an HTTP client with optional proxy support
func createHTTPClient(proxyURL string, timeout time.Duration) (*http.Client, error) {
client := &http.Client{
@@ -251,8 +264,7 @@ func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query
// Try finding the result links directly first, as they are the most critical
// Pattern: <a class="result__a" href="...">Title</a>
// The previous regex was a bit strict. This one tolerates extra attributes and additional class names (the class attribute must still appear before href)
reLink := regexp.MustCompile(`<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)</a>`)
matches := reLink.FindAllStringSubmatch(html, count+5)
matches := reDDGLink.FindAllStringSubmatch(html, count+5)
if len(matches) == 0 {
return fmt.Sprintf("No results found or extraction failed. Query: %s", query), nil
@@ -269,8 +281,7 @@ func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query
// A better regex approach: iterate through text and find matches in order
// But for now, let's grab all snippets too
reSnippet := regexp.MustCompile(`<a class="result__snippet[^"]*".*?>([\s\S]*?)</a>`)
snippetMatches := reSnippet.FindAllStringSubmatch(html, count+5)
snippetMatches := reDDGSnippet.FindAllStringSubmatch(html, count+5)
maxItems := min(len(matches), count)
@@ -305,8 +316,7 @@ func (p *DuckDuckGoSearchProvider) extractResults(html string, count int, query
}
// stripTags returns content with every substring matching `<[^>]+>` removed.
// NOTE(review): this span is a rendered diff hunk without +/- markers. The
// first two body lines appear to be the removed per-call compilation and the
// final return its replacement using the package-level reTags; confirm against
// the raw diff.
func stripTags(content string) string {
re := regexp.MustCompile(`<[^>]+>`)
return re.ReplaceAllString(content, "")
return reTags.ReplaceAllString(content, "")
}
type PerplexitySearchProvider struct {
@@ -654,19 +664,14 @@ func (t *WebFetchTool) Execute(ctx context.Context, args map[string]any) *ToolRe
}
func (t *WebFetchTool) extractText(htmlContent string) string {
re := regexp.MustCompile(`<script[\s\S]*?</script>`)
result := re.ReplaceAllLiteralString(htmlContent, "")
re = regexp.MustCompile(`<style[\s\S]*?</style>`)
result = re.ReplaceAllLiteralString(result, "")
re = regexp.MustCompile(`<[^>]+>`)
result = re.ReplaceAllLiteralString(result, "")
result := reScript.ReplaceAllLiteralString(htmlContent, "")
result = reStyle.ReplaceAllLiteralString(result, "")
result = reTags.ReplaceAllLiteralString(result, "")
result = strings.TrimSpace(result)
re = regexp.MustCompile(`[^\S\n]+`)
result = re.ReplaceAllString(result, " ")
re = regexp.MustCompile(`\n{3,}`)
result = re.ReplaceAllString(result, "\n\n")
result = reWhitespace.ReplaceAllString(result, " ")
result = reBlankLines.ReplaceAllString(result, "\n\n")
lines := strings.Split(result, "\n")
var cleanLines []string