// picoclaw/pkg/utils/bm25.go

// Package utils provides shared, reusable algorithms.
// This file implements a generic BM25 search engine.
//
// Usage:
//
//	type MyDoc struct { ID string; Body string }
//
//	corpus := []MyDoc{...}
//	engine := utils.NewBM25Engine(corpus, func(d MyDoc) string {
//	    return d.ID + " " + d.Body
//	})
//	results := engine.Search("my query", 5)
package utils

import (
	"math"
	"sort"
	"strings"
)

// ── Tuning defaults ───────────────────────────────────────────────────────────

const (
	// DefaultBM25K1 is the default term-frequency saturation factor k1
	// (typical range 1.2–2.0). Higher values give more weight to repeated
	// terms. NewBM25Engine uses it unless WithK1 is supplied.
	DefaultBM25K1 = 1.2

	// DefaultBM25B is the default document-length normalization factor b
	// (0 = no normalization, 1 = full normalization). NewBM25Engine uses it
	// unless WithB is supplied.
	DefaultBM25B = 0.75
)

// BM25Engine is a query-time BM25 search engine over a generic corpus.
// T is the document type; the caller supplies a textFunc that extracts the
// searchable text from each document.
//
// The engine is stateless between queries: no caching, no invalidation logic.
// All indexing work is performed inside Search() on every call, so in-place
// changes to the elements of the corpus slice are picked up by the next
// Search. The engine keeps the slice it was constructed with; elements the
// caller appends to its own slice afterwards are not guaranteed to be visible
// here, so build a new engine when the corpus grows or shrinks.
type BM25Engine[T any] struct {
	// corpus is the caller's document slice (referenced, not copied).
	corpus   []T
	// textFunc returns the searchable text of one document.
	textFunc func(T) string
	// k1 is the term-frequency saturation factor.
	k1       float64
	// b is the document-length normalization factor.
	b        float64
}

// BM25Option is a functional option to configure a BM25Engine.
// NewBM25Engine applies options in the order given, so when two options set
// the same parameter the later one wins.
type BM25Option func(*bm25Config)

// bm25Config holds the tunable BM25 parameters while options are applied.
// It is not generic, which lets BM25Option stay independent of the document
// type T.
type bm25Config struct {
	// k1 is the term-frequency saturation factor.
	k1 float64
	// b is the document-length normalization factor.
	b  float64
}

// WithK1 returns an option that replaces the term-frequency saturation
// constant k1 (default 1.2) with the given value.
func WithK1(k1 float64) BM25Option {
	return func(cfg *bm25Config) {
		cfg.k1 = k1
	}
}

// WithB returns an option that replaces the document-length normalization
// factor b (default 0.75) with the given value.
func WithB(b float64) BM25Option {
	return func(cfg *bm25Config) {
		cfg.b = b
	}
}

// NewBM25Engine builds a BM25Engine over corpus.
//
//   - corpus   : the documents to search, of any type T.
//   - textFunc : extracts the searchable text of one document.
//   - opts     : optional tuning (WithK1, WithB), applied in the order given
//     on top of DefaultBM25K1 and DefaultBM25B.
//
// The engine keeps a reference to the corpus slice rather than a copy, so
// callers must not mutate it while a Search() call is running.
func NewBM25Engine[T any](corpus []T, textFunc func(T) string, opts ...BM25Option) *BM25Engine[T] {
	settings := bm25Config{
		k1: DefaultBM25K1,
		b:  DefaultBM25B,
	}
	for i := range opts {
		opts[i](&settings)
	}

	engine := new(BM25Engine[T])
	engine.corpus = corpus
	engine.textFunc = textFunc
	engine.k1, engine.b = settings.k1, settings.b
	return engine
}

// BM25Result is a single ranked result from a Search call.
type BM25Result[T any] struct {
	// Document is the matching corpus element, copied by value from the corpus.
	Document T
	// Score is the document's accumulated BM25 score for the query; Search
	// returns results ordered from highest to lowest Score.
	Score    float32
}

// Search ranks the corpus against query and returns at most topK results,
// ordered by descending score. Documents with equal scores are ordered by
// their position in the corpus, and the walk over candidates is done in
// corpus order, so the result is deterministic for a given corpus and query.
//
// It returns an empty slice (not nil) when topK <= 0, when the query contains
// no tokens, when the corpus is empty, or when no document contains any query
// term. topK may be arbitrarily large ("return everything"): working memory is
// bounded by the number of matching documents, not by topK.
//
// Every document is tokenized on each call, but term frequencies and posting
// lists are kept only for the distinct query terms, since no other term can
// contribute to a score. Document indices are held as int32, so the corpus
// is expected to have at most math.MaxInt32 entries.
//
// Complexity: O(N×L) for tokenizing + O(|Q|×avgPostingLen) for scoring,
// where N = corpus size, L = average document length, Q = distinct query
// terms. Top-k extraction scans the N score slots and feeds the candidates
// through a fixed-size min-heap: O(N + candidates × log k).
func (e *BM25Engine[T]) Search(query string, topK int) []BM25Result[T] {
	if topK <= 0 {
		return []BM25Result[T]{}
	}

	// Deduplicate query terms to avoid double-weighting a repeated term.
	terms := bm25Dedupe(bm25Tokenize(query))
	if len(terms) == 0 {
		return []BM25Result[T]{}
	}

	n := len(e.corpus)
	if n == 0 {
		return []BM25Result[T]{}
	}

	// Step 1: one pass over the corpus collecting document lengths and, for
	// query terms only, posting lists that carry the in-document frequency.
	type posting struct {
		docID int32
		freq  uint32
	}

	slot := make(map[string]int, len(terms)) // query term → index into postings/counts
	for i, t := range terms {
		slot[t] = i
	}

	postings := make([][]posting, len(terms))
	counts := make([]uint32, len(terms)) // per-document scratch, zeroed after each document
	docLens := make([]int, n)
	totalLen := 0

	for i, doc := range e.corpus {
		tokens := bm25Tokenize(e.textFunc(doc))
		docLens[i] = len(tokens)
		totalLen += len(tokens)

		for _, tok := range tokens {
			if j, ok := slot[tok]; ok {
				counts[j]++
			}
		}
		// Documents are visited in index order, so each posting list ends up
		// sorted by docID and holds each document at most once.
		for j, c := range counts {
			if c == 0 {
				continue
			}
			postings[j] = append(postings[j], posting{docID: int32(i), freq: c})
			counts[j] = 0
		}
	}

	// Step 2: accumulate scores term by term.
	// A non-empty posting list implies at least one token in the corpus, so
	// avgDocLen is strictly positive wherever it is used as a divisor below.
	avgDocLen := float64(totalLen) / float64(n)

	scores := make([]float32, n)
	matched := make([]bool, n) // tracked separately: a matched document's score need not be non-zero
	candidates := 0

	for _, plist := range postings {
		if len(plist) == 0 {
			continue // term not in vocabulary → zero contribution
		}

		// IDF (Robertson smoothing): log( (N - df(t) + 0.5) / (df(t) + 0.5) + 1 )
		df := float64(len(plist))
		termIDF := float32(math.Log((float64(n)-df+0.5)/(df+0.5) + 1))

		for _, p := range plist {
			// lenNorm = k1 * (1 - b + b * |doc| / avgDocLen)
			lenNorm := float32(e.k1 * (1 - e.b + e.b*float64(docLens[p.docID])/avgDocLen))
			freq := float32(p.freq)
			// TF_norm = freq * (k1+1) / (freq + lenNorm)
			tfNorm := freq * float32(e.k1+1) / (freq + lenNorm)
			scores[p.docID] += termIDF * tfNorm

			if !matched[p.docID] {
				matched[p.docID] = true
				candidates++
			}
		}
	}

	if candidates == 0 {
		return []BM25Result[T]{}
	}

	// Step 3: top-K via fixed-size min-heap. The heap is sized by the number
	// of candidates when that is smaller than topK, so an oversized topK can
	// neither panic in make nor reserve memory that will never be used.
	limit := topK
	if limit > candidates {
		limit = candidates
	}
	heap := make([]bm25ScoredDoc, 0, limit)

	// Walk documents in corpus order (not map order) to keep selection
	// reproducible from one call to the next.
	for id, hit := range matched {
		if !hit {
			continue
		}
		sc := scores[id]
		switch {
		case len(heap) < limit:
			heap = append(heap, bm25ScoredDoc{docID: int32(id), score: sc})
			if len(heap) == limit {
				bm25MinHeapify(heap)
			}
		case sc > heap[0].score:
			heap[0] = bm25ScoredDoc{docID: int32(id), score: sc}
			bm25SiftDown(heap, 0)
		}
	}

	// sort.Slice is not stable, so ties are broken explicitly by docID.
	sort.Slice(heap, func(i, j int) bool {
		if heap[i].score != heap[j].score {
			return heap[i].score > heap[j].score
		}
		return heap[i].docID < heap[j].docID
	})

	out := make([]BM25Result[T], len(heap))
	for i, h := range heap {
		out[i] = BM25Result[T]{
			Document: e.corpus[h.docID],
			Score:    h.score,
		}
	}
	return out
}

// bm25Tokenize lowercases s, splits it on whitespace, and trims punctuation
// from both ends of every piece. Pieces that consist solely of that
// punctuation are dropped; punctuation inside a token is preserved.
func bm25Tokenize(s string) []string {
	const edgePunct = ".,;:!?\"'()/\\-_"

	fields := strings.Fields(strings.ToLower(s))
	// Compact the kept tokens toward the front of the same slice; the write
	// position never overtakes the read position, so no extra slice is needed.
	kept := 0
	for _, field := range fields {
		if trimmed := strings.Trim(field, edgePunct); trimmed != "" {
			fields[kept] = trimmed
			kept++
		}
	}
	return fields[:kept]
}

// bm25Dedupe copies tokens into a fresh slice, keeping only the first
// occurrence of each token and preserving the order of those occurrences.
// The input slice is left untouched.
func bm25Dedupe(tokens []string) []string {
	unique := make([]string, 0, len(tokens))
	visited := make(map[string]struct{}, len(tokens))
	for _, tok := range tokens {
		if _, dup := visited[tok]; dup {
			continue
		}
		visited[tok] = struct{}{}
		unique = append(unique, tok)
	}
	return unique
}

// bm25ScoredDoc pairs a document with its accumulated score. It is the
// element type of the min-heap used for top-K selection.
type bm25ScoredDoc struct {
	// docID is the document's index in the engine's corpus slice.
	docID int32
	// score is the BM25 score; the heap helpers order elements by this field.
	score float32
}

// bm25MinHeapify rearranges h in place into a min-heap keyed on score, using
// Floyd's bottom-up construction: O(k) for k elements.
func bm25MinHeapify(h []bm25ScoredDoc) {
	// Leaves are already valid one-element heaps, so begin at the last node
	// that has a child and work back to the root.
	lastParent := len(h)/2 - 1
	for node := lastParent; node >= 0; node-- {
		bm25SiftDown(h, node)
	}
}

// bm25SiftDown pushes the element at index i down the tree until neither
// child has a strictly smaller score, restoring the min-heap property for
// the subtree rooted at i: O(log k).
func bm25SiftDown(h []bm25ScoredDoc, i int) {
	size := len(h)
	// A node without a left child is a leaf, and a leaf has nowhere to go.
	for left := 2*i + 1; left < size; left = 2*i + 1 {
		lowest := i
		if h[left].score < h[lowest].score {
			lowest = left
		}
		if right := left + 1; right < size && h[right].score < h[lowest].score {
			lowest = right
		}
		if lowest == i {
			return
		}
		h[i], h[lowest] = h[lowest], h[i]
		i = lowest
	}
}