// picoclaw/pkg/utils/bm25.go

// Package utils provides shared, reusable algorithms.
// This file implements a generic BM25 search engine.
//
// Usage:
//
//	type MyDoc struct { ID string; Body string }
//
//	corpus := []MyDoc{...}
//	engine := utils.NewBM25Engine(corpus, func(d MyDoc) string {
//	    return d.ID + " " + d.Body
//	})
//	results := engine.Search("my query", 5)
package utils

import (
	"math"
	"sort"
	"strings"
)

// ── Tuning defaults ───────────────────────────────────────────────────────────

const (
	// DefaultBM25K1 is the term-frequency saturation factor (typical range 1.2–2.0).
	// Higher values give more weight to repeated terms. NewBM25Engine uses this
	// value unless it is overridden with WithK1.
	DefaultBM25K1 = 1.2

	// DefaultBM25B is the document-length normalization factor (0 = none, 1 = full).
	// NewBM25Engine uses this value unless it is overridden with WithB.
	DefaultBM25B = 0.75
)

// BM25Engine is a BM25 search engine over a generic corpus.
// T is the document type; the caller supplies a TextFunc that extracts the
// searchable text from each document.
//
// The engine precomputes its index once at construction time and reuses it for
// subsequent searches. If the corpus content changes, construct a new engine.
type BM25Engine[T any] struct {
	corpus   []T            // documents as passed to NewBM25Engine; results are read from it by index
	textFunc func(T) string // extracts the searchable text of a document
	k1       float64        // term-frequency saturation factor
	b        float64        // document-length normalization factor
	index    *bm25Index     // precomputed statistics; nil when the corpus is empty
}

// BM25Option is a functional option to configure a BM25Engine.
// NewBM25Engine applies options in the order given, so a later option
// overrides an earlier one that sets the same parameter.
type BM25Option func(*bm25Config)

// bm25Config collects the tunable parameters that BM25Option values modify
// before NewBM25Engine copies them into the engine.
type bm25Config struct {
	k1 float64 // term-frequency saturation factor
	b  float64 // document-length normalization factor
}

// bm25Index holds the per-corpus data precomputed by buildBM25Index.
// Document IDs are positions in the corpus slice.
type bm25Index struct {
	entries    []bm25DocEntry     // per-document term counts, indexed by document ID
	idf        map[string]float32 // inverse document frequency of every indexed term
	docLenNorm []float32          // per-document k1*(1-b+b*len/avgLen), added to the term frequency in the TF denominator
	posting    map[string][]int32 // term → IDs of the documents containing it, in ascending order
}

// bm25DocEntry stores the indexed statistics of a single document.
type bm25DocEntry struct {
	tf map[string]uint32 // token → number of occurrences in the document
}

// WithK1 returns an option that replaces the term-frequency saturation
// constant (DefaultBM25K1, 1.2, when the option is not supplied).
func WithK1(k1 float64) BM25Option {
	return func(cfg *bm25Config) {
		cfg.k1 = k1
	}
}

// WithB returns an option that replaces the document-length normalization
// factor (DefaultBM25B, 0.75, when the option is not supplied).
func WithB(b float64) BM25Option {
	return func(cfg *bm25Config) {
		cfg.b = b
	}
}

// NewBM25Engine builds a ready-to-query BM25Engine over corpus.
//
//   - corpus   : the documents to index; any element type T is accepted.
//   - textFunc : maps a document to the text that should be searchable.
//   - opts     : optional tuning (WithK1, WithB), applied in the order given
//     on top of DefaultBM25K1 and DefaultBM25B.
//
// The index is computed here, once, by calling textFunc on every document.
// The corpus slice itself is kept by reference rather than copied, so callers
// must not mutate it concurrently with Search().
func NewBM25Engine[T any](corpus []T, textFunc func(T) string, opts ...BM25Option) *BM25Engine[T] {
	settings := bm25Config{
		k1: DefaultBM25K1,
		b:  DefaultBM25B,
	}
	for i := range opts {
		opts[i](&settings)
	}

	return &BM25Engine[T]{
		corpus:   corpus,
		textFunc: textFunc,
		k1:       settings.k1,
		b:        settings.b,
		index:    buildBM25Index(corpus, textFunc, settings.k1, settings.b),
	}
}

// BM25Result is a single ranked result from a Search call.
type BM25Result[T any] struct {
	Document T       // the matching corpus element
	Score    float32 // BM25 score summed over the distinct query terms; higher ranks first
}

// Search ranks the corpus against query and returns at most topK results,
// best first.
//
// The query is tokenized with bm25Tokenize and repeated query terms are
// counted once. Only documents that contain at least one query term are
// candidates. Results are ordered by descending score; documents with equal
// scores are ordered by ascending position in the corpus, so the same query
// always yields the same output.
//
// Search returns an empty slice (not nil) when topK <= 0, when the query has
// no tokens, when the corpus is empty, or when nothing matches. topK may be
// larger than the number of matches (for example math.MaxInt to request every
// match): working memory is bounded by the number of candidates, not by topK.
//
// Complexity: O(|Q|×avgPostingLen + candidates × log k) per search after the
// one-time indexing work performed by NewBM25Engine.
func (e *BM25Engine[T]) Search(query string, topK int) []BM25Result[T] {
	if topK <= 0 || len(e.corpus) == 0 || e.index == nil {
		return []BM25Result[T]{}
	}

	queryTerms := bm25Tokenize(query)
	if len(queryTerms) == 0 {
		return []BM25Result[T]{}
	}

	// Accumulate scores by walking the posting list of each distinct query
	// term, so only documents containing at least one term are touched.
	// Deduplicating avoids double-weighting a term repeated in the query.
	k1Plus1 := float32(e.k1 + 1)
	scores := make(map[int32]float32)
	for _, term := range bm25Dedupe(queryTerms) {
		termIDF, ok := e.index.idf[term]
		if !ok {
			continue // term not in vocabulary → zero contribution
		}
		for _, docID := range e.index.posting[term] {
			freq := float32(e.index.entries[docID].tf[term])
			// TF_norm = freq * (k1+1) / (freq + docLenNorm)
			tfNorm := freq * k1Plus1 / (freq + e.index.docLenNorm[docID])
			scores[docID] += termIDF * tfNorm
		}
	}
	if len(scores) == 0 {
		return []BM25Result[T]{}
	}

	top := bm25SelectTop(scores, topK)
	out := make([]BM25Result[T], len(top))
	for i, hit := range top {
		out[i] = BM25Result[T]{
			Document: e.corpus[hit.docID],
			Score:    hit.score,
		}
	}
	return out
}

// bm25SelectTop returns the k best entries of scores, best first, using the
// order defined by bm25RanksBefore. k is clamped to len(scores), so an
// arbitrarily large k never allocates more than one slot per candidate.
//
// Because bm25RanksBefore is a strict total order over distinct document IDs,
// the result does not depend on the (random) map iteration order.
func bm25SelectTop(scores map[int32]float32, k int) []bm25ScoredDoc {
	if k > len(scores) {
		k = len(scores)
	}
	if k <= 0 {
		return nil
	}

	// kept becomes a binary heap whose root is the worst-ranked entry retained
	// so far; a candidate only enters by beating that root.
	kept := make([]bm25ScoredDoc, 0, k)
	for docID, score := range scores {
		cand := bm25ScoredDoc{docID: docID, score: score}
		if len(kept) < k {
			kept = append(kept, cand)
			// Heap order is only needed if further candidates will compete
			// for a slot; when everything fits, the final sort is enough.
			if len(kept) == k && k < len(scores) {
				for i := k/2 - 1; i >= 0; i-- {
					bm25SiftWorstDown(kept, i)
				}
			}
			continue
		}
		if bm25RanksBefore(cand, kept[0]) {
			kept[0] = cand
			bm25SiftWorstDown(kept, 0)
		}
	}

	sort.Slice(kept, func(i, j int) bool { return bm25RanksBefore(kept[i], kept[j]) })
	return kept
}

// bm25RanksBefore reports whether a must appear ahead of b in search results:
// the higher score wins, and equal scores fall back to the lower document ID.
func bm25RanksBefore(a, b bm25ScoredDoc) bool {
	if a.score != b.score {
		return a.score > b.score
	}
	return a.docID < b.docID
}

// bm25SiftWorstDown restores the worst-at-root heap property of h below node
// i, where "worse" means ranked later by bm25RanksBefore: O(log len(h)).
func bm25SiftWorstDown(h []bm25ScoredDoc, i int) {
	n := len(h)
	for {
		worst := i
		if l := 2*i + 1; l < n && bm25RanksBefore(h[worst], h[l]) {
			worst = l
		}
		if r := 2*i + 2; r < n && bm25RanksBefore(h[worst], h[r]) {
			worst = r
		}
		if worst == i {
			return
		}
		h[i], h[worst] = h[worst], h[i]
		i = worst
	}
}

// buildBM25Index tokenizes every document once and precomputes everything
// Search needs: per-document term counts, per-term IDF, the per-document
// length-normalization term k1*(1-b+b*len/avgLen), and posting lists mapping
// each term to the ascending IDs of the documents that contain it.
//
// It returns nil for an empty corpus. When no document yields any token the
// average length is treated as 1 to avoid dividing by zero.
func buildBM25Index[T any](corpus []T, textFunc func(T) string, k1, b float64) *bm25Index {
	docCount := len(corpus)
	if docCount == 0 {
		return nil
	}

	var (
		entries  = make([]bm25DocEntry, docCount)
		docLens  = make([]int, docCount)
		docFreq  = make(map[string]int, 64)
		tokenSum int
	)

	// Pass 1: per-document term counts, document lengths and document
	// frequencies (each term counted once per document).
	for docID := range corpus {
		tokens := bm25Tokenize(textFunc(corpus[docID]))
		docLens[docID] = len(tokens)
		tokenSum += len(tokens)

		counts := make(map[string]uint32, len(tokens))
		for _, tok := range tokens {
			counts[tok]++
		}
		for term := range counts {
			docFreq[term]++
		}
		entries[docID].tf = counts
	}

	avgLen := float64(tokenSum) / float64(docCount)
	if avgLen == 0 {
		avgLen = 1
	}

	// IDF with the "+1 inside the log" form, which keeps every value positive.
	idf := make(map[string]float32, len(docFreq))
	for term, containing := range docFreq {
		idf[term] = float32(math.Log(
			(float64(docCount)-float64(containing)+0.5)/(float64(containing)+0.5) + 1,
		))
	}

	// Pass 2: length normalization and posting lists. Documents are visited in
	// ID order, so every posting list ends up sorted ascending.
	norms := make([]float32, docCount)
	posting := make(map[string][]int32, len(docFreq))
	for docID := range entries {
		norms[docID] = float32(k1 * (1 - b + b*float64(docLens[docID])/avgLen))
		for term := range entries[docID].tf {
			posting[term] = append(posting[term], int32(docID))
		}
	}

	return &bm25Index{
		entries:    entries,
		idf:        idf,
		docLenNorm: norms,
		posting:    posting,
	}
}

// bm25Tokenize lowercases s, splits it on whitespace and trims a fixed set of
// punctuation characters from both ends of every piece. Pieces that consist
// only of such punctuation are dropped; punctuation inside a token is kept.
func bm25Tokenize(s string) []string {
	const edgePunct = ".,;:!?\"'()/\\-_"

	fields := strings.Fields(strings.ToLower(s))
	// Compact the surviving tokens to the front of fields so no second slice
	// has to be allocated.
	kept := 0
	for _, field := range fields {
		token := strings.Trim(field, edgePunct)
		if token == "" {
			continue
		}
		fields[kept] = token
		kept++
	}
	return fields[:kept]
}

// bm25Dedupe copies tokens into a fresh slice, keeping only the first
// occurrence of each distinct token and leaving the relative order of the
// survivors unchanged. The input slice is not modified.
func bm25Dedupe(tokens []string) []string {
	unique := make([]string, 0, len(tokens))
	visited := make(map[string]struct{}, len(tokens))
	for i := range tokens {
		tok := tokens[i]
		if _, dup := visited[tok]; dup {
			continue
		}
		visited[tok] = struct{}{}
		unique = append(unique, tok)
	}
	return unique
}

// bm25ScoredDoc pairs a document ID with its accumulated score while Search
// selects the top results.
type bm25ScoredDoc struct {
	docID int32   // position of the document in the corpus slice
	score float32 // accumulated BM25 score for the current query
}

// bm25MinHeapify rearranges h in place into a min-heap keyed on score, so the
// lowest-scoring element ends up at h[0]. It sifts down every internal node,
// last parent first (Floyd's bottom-up construction): O(k).
func bm25MinHeapify(h []bm25ScoredDoc) {
	// Indices at or beyond len(h)/2 are leaves and already trivially heaps.
	node := len(h) / 2
	for node > 0 {
		node--
		bm25SiftDown(h, node)
	}
}

// bm25SiftDown moves the element at index i down the score-keyed min-heap h
// until neither of its children has a strictly smaller score, swapping with
// the smallest child at each level (the left child wins a tie between the
// two children): O(log k).
func bm25SiftDown(h []bm25ScoredDoc, i int) {
	size := len(h)
	for parent := i; ; {
		// Look at the left child, then the right one, remembering whichever
		// node of the three currently holds the lowest score.
		next := parent
		for child := 2*parent + 1; child <= 2*parent+2 && child < size; child++ {
			if h[child].score < h[next].score {
				next = child
			}
		}
		if next == parent {
			return
		}
		h[parent], h[next] = h[next], h[parent]
		parent = next
	}
}