feat(routing): add language-agnostic model complexity scorer

Add three new source files (plus tests) to pkg/routing/: features.go — ExtractFeatures(msg, history) → Features Computes five structural dimensions with zero keyword matching: - TokenEstimate: rune_count/3 (CJK-safe token proxy) - CodeBlockCount: ``` pairs in the message - RecentToolCalls: tool call count in the last 6 history entries - ConversationDepth: total messages in session - HasAttachments: data URIs or media file extensions classifier.go — Classifier interface + RuleClassifier RuleClassifier uses a weighted sum that is capped at 1.0: code block → +0.40 (triggers heavy model alone at 0.35 threshold) token > 200 → +0.35 (triggers heavy model alone) tool calls > 3 → +0.25 tool calls 1-3 → +0.10 token 51-200 → +0.15 conversation depth > 10 → +0.10 attachment → 1.00 (hard gate, always heavy) router.go — Router wraps config + Classifier Router.SelectModel(msg, history, primaryModel) returns either the configured light_model or the primary model depending on whether the complexity score clears the threshold. Threshold defaults to 0.35 when zero/negative to prevent misconfiguration. router_test.go — 34 tests covering all branches and edge cases
2026-06-12 18:08:54 +00:00 · 2026-03-02 22:42:20 +08:00
parent c5a21b269f
commit 1943c3e660
4 changed files with 661 additions and 0 deletions
@@ -0,0 +1,80 @@
+package routing
+
+// Classifier evaluates a feature set and returns a complexity score in [0, 1].
+// A higher score indicates a more complex task that benefits from a heavy model.
+// The score is compared against the configured threshold: score >= threshold selects
+// the primary (heavy) model; score < threshold selects the light model.
+//
+// Classifier is an interface so that future implementations (ML-based, embedding-based,
+// or any other approach) can be swapped in without changing routing infrastructure.
+type Classifier interface {
+	Score(f Features) float64 // complexity of f; implementations are expected to stay within [0, 1]
+}
+
+// RuleClassifier is the v1 implementation; it is stateless (an empty struct).
+// It uses a weighted sum of structural signals with no external dependencies,
+// no API calls, and sub-microsecond latency. The raw sum is capped at 1.0 so
+// that the returned score always falls within the [0, 1] contract.
+//
+// Individual weights (multiple signals can fire simultaneously):
+//
+//	token > 200 (≈600 chars): 0.35  — very long prompts are almost always complex
+//	token 51-200:             0.15  — medium length; may or may not be complex
+//	code block present:       0.40  — coding tasks need the heavy model
+//	tool calls > 3 (recent):  0.25  — dense tool usage signals an agentic workflow
+//	tool calls 1-3 (recent):  0.10  — some tool activity
+//	conversation depth > 10:  0.10  — long sessions carry implicit complexity
+//	attachments present:      1.00  — hard gate; multi-modal always needs heavy model
+//
+// Default threshold is 0.35, so:
+//   - Pure greetings / trivial Q&A:                 0.00 → light  ✓
+//   - Medium prose message (51–200 tokens):          0.15 → light  ✓
+//   - Message with code block:                       0.40 → heavy  ✓
+//   - Long message (>200 tokens):                    0.35 → heavy  ✓
+//   - 1–3 recent tool calls + medium message:        0.25 → light  (acceptable)
+//   - Any message with an image/audio attachment:    1.00 → heavy  ✓
+type RuleClassifier struct{}
+
+// Score computes the complexity score for the given feature set.
+// The returned value is in [0, 1]. Attachments short-circuit to 1.0.
+func (c *RuleClassifier) Score(f Features) float64 {
+	// Hard gate: multi-modal inputs always require the heavy model.
+	if f.HasAttachments {
+		return 1.0
+	}
+
+	var score float64
+
+	// Token estimate — primary verbosity signal
+	switch {
+	case f.TokenEstimate > 200:
+		score += 0.35
+	case f.TokenEstimate > 50:
+		score += 0.15
+	}
+
+	// Fenced code blocks — strongest indicator of a coding/technical task
+	if f.CodeBlockCount > 0 {
+		score += 0.40
+	}
+
+	// Recent tool call density — indicates an ongoing agentic workflow
+	switch {
+	case f.RecentToolCalls > 3:
+		score += 0.25
+	case f.RecentToolCalls > 0:
+		score += 0.10
+	}
+
+	// Conversation depth — accumulated context implies compound task
+	if f.ConversationDepth > 10 {
+		score += 0.10
+	}
+
+	// Cap at 1.0 to honour the [0, 1] contract even when multiple signals fire
+	// simultaneously (e.g., long message + code block + tool chain = 1.10 raw).
+	if score > 1.0 {
+		score = 1.0
+	}
+	return score
+}