feat(routing): add language-agnostic model complexity scorer

Add three new files to pkg/routing/:

features.go — ExtractFeatures(msg, history) → Features
  Computes five structural dimensions with zero keyword matching:
  - TokenEstimate: rune_count/3 (CJK-safe token proxy)
  - CodeBlockCount: ``` pairs in the message
  - RecentToolCalls: tool call count in the last 6 history entries
  - ConversationDepth: total messages in session
  - HasAttachments: data URIs or media file extensions

classifier.go — Classifier interface + RuleClassifier
  RuleClassifier uses a weighted sum that is capped at 1.0:
    code block      → +0.40  (triggers heavy model alone at 0.35 threshold)
    token > 200     → +0.35  (triggers heavy model alone)
    tool calls > 3  → +0.25
    token 50-200    → +0.15
    conversation depth > 10 → +0.10
    attachment      → 1.00 (hard gate, always heavy)

router.go — Router wraps config + Classifier
  Router.SelectModel(msg, history, primaryModel) returns either the
  configured light_model or the primary model depending on whether
  the complexity score clears the threshold. Threshold defaults to
  0.35 when zero/negative to prevent misconfiguration.

router_test.go — 34 tests covering all branches and edge cases
This commit is contained in:
xiaoen
2026-03-02 22:42:20 +08:00
parent c5a21b269f
commit 1943c3e660
4 changed files with 661 additions and 0 deletions
+80
View File
@@ -0,0 +1,80 @@
package routing
// Classifier evaluates a feature set and returns a complexity score in [0, 1].
// A higher score indicates a more complex task that benefits from a heavy model.
// The score is compared against the configured threshold: score >= threshold selects
// the primary (heavy) model; score < threshold selects the light model.
//
// Classifier is an interface so that future implementations (ML-based, embedding-based,
// or any other approach) can be swapped in without changing routing infrastructure.
type Classifier interface {
Score(f Features) float64
}
// RuleClassifier is the v1 implementation.
// It uses a weighted sum of structural signals with no external dependencies,
// no API calls, and sub-microsecond latency. The raw sum is capped at 1.0 so
// that the returned score always falls within the [0, 1] contract.
//
// Individual weights (multiple signals can fire simultaneously):
//
// token > 200 (≈600 chars): 0.35 — very long prompts are almost always complex
// token 50-200: 0.15 — medium length; may or may not be complex
// code block present: 0.40 — coding tasks need the heavy model
// tool calls > 3 (recent): 0.25 — dense tool usage signals an agentic workflow
// tool calls 1-3 (recent): 0.10 — some tool activity
// conversation depth > 10: 0.10 — long sessions carry implicit complexity
// attachments present: 1.00 — hard gate; multi-modal always needs heavy model
//
// Default threshold is 0.35, so:
// - Pure greetings / trivial Q&A: 0.00 → light ✓
// - Medium prose message (50200 tokens): 0.15 → light ✓
// - Message with code block: 0.40 → heavy ✓
// - Long message (>200 tokens): 0.35 → heavy ✓
// - Active tool session + medium message: 0.25 → light (acceptable)
// - Any message with an image/audio attachment: 1.00 → heavy ✓
type RuleClassifier struct{}
// Score computes the complexity score for the given feature set.
// The returned value is in [0, 1]. Attachments short-circuit to 1.0.
func (c *RuleClassifier) Score(f Features) float64 {
// Hard gate: multi-modal inputs always require the heavy model.
if f.HasAttachments {
return 1.0
}
var score float64
// Token estimate — primary verbosity signal
switch {
case f.TokenEstimate > 200:
score += 0.35
case f.TokenEstimate > 50:
score += 0.15
}
// Fenced code blocks — strongest indicator of a coding/technical task
if f.CodeBlockCount > 0 {
score += 0.40
}
// Recent tool call density — indicates an ongoing agentic workflow
switch {
case f.RecentToolCalls > 3:
score += 0.25
case f.RecentToolCalls > 0:
score += 0.10
}
// Conversation depth — accumulated context implies compound task
if f.ConversationDepth > 10 {
score += 0.10
}
// Cap at 1.0 to honour the [0, 1] contract even when multiple signals fire
// simultaneously (e.g., long message + code block + tool chain = 1.10 raw).
if score > 1.0 {
score = 1.0
}
return score
}