fix(agent): correct media token arithmetic and tool call double-counting

Two estimation bugs fixed:

1. Media tokens were added to the chars accumulator before the chars*2/5
   conversion, resulting in 256*2/5=102 tokens per item instead of 256.
   Fix: add media tokens directly to the final token count, bypassing
   the character-based heuristic.

2. estimateMessageTokens counted both tc.Name and tc.Function.Name for
   tool calls, but providers only send one of them on the wire
   (OpenAI-compat uses function.name, Anthropic uses tc.Name), so the
   name was counted twice. Fix: count tc.Function.Name when Function is
   present, and fall back to tc.Name only otherwise.

Also fix i18n hint text: "auto-detect" was misleading — the backend
uses a 4x max_tokens heuristic, not actual model detection.
This commit is contained in:
xiaoen
2026-03-13 16:02:04 +08:00
parent 639739cb85
commit 8034ee7be1
4 changed files with 25 additions and 11 deletions
+16 -9
View File
@@ -95,10 +95,14 @@ func estimateMessageTokens(msg providers.Message) int {
}
for _, tc := range msg.ToolCalls {
// Count tool call metadata: ID and type; the function name is counted once below.
chars += len(tc.ID) + len(tc.Type) + len(tc.Name)
chars += len(tc.ID) + len(tc.Type)
if tc.Function != nil {
// Count function name + arguments (the wire format for most providers).
// tc.Name mirrors tc.Function.Name — count only once to avoid double-counting.
chars += len(tc.Function.Name) + len(tc.Function.Arguments)
} else {
// Fallback: some provider formats use top-level Name without Function.
chars += len(tc.Name)
}
}
@@ -106,17 +110,20 @@ func estimateMessageTokens(msg providers.Message) int {
chars += len(msg.ToolCallID)
}
// Media items (images, files) are serialized by provider adapters into
// multipart or image_url payloads. Use a fixed per-item estimate since
// actual token cost depends on resolution and provider tokenization.
const mediaTokensPerItem = 256
chars += len(msg.Media) * mediaTokensPerItem
// Per-message overhead for role label, JSON structure, separators.
const messageOverhead = 12
chars += messageOverhead
return chars * 2 / 5
tokens := chars * 2 / 5
// Media items (images, files) are serialized by provider adapters into
// multipart or image_url payloads. Add a fixed per-item token estimate
// directly (not through the chars heuristic) since actual cost depends
// on resolution and provider-specific image tokenization.
const mediaTokensPerItem = 256
tokens += len(msg.Media) * mediaTokensPerItem
return tokens
}
// estimateToolDefsTokens estimates the total token cost of tool definitions
+7
View File
@@ -503,6 +503,13 @@ func TestEstimateMessageTokens_MediaItems(t *testing.T) {
t.Errorf("message with Media (%d tokens) should exceed plain message (%d tokens)",
mediaTokens, plainTokens)
}
// Each media item should add exactly 256 tokens (not run through chars*2/5).
expectedDelta := 256 * 2
actualDelta := mediaTokens - plainTokens
if actualDelta != expectedDelta {
t.Errorf("2 media items should add %d tokens, got delta %d", expectedDelta, actualDelta)
}
}
// --- estimateToolDefsTokens tests ---
+1 -1
View File
@@ -397,7 +397,7 @@
"max_tokens": "Max Tokens",
"max_tokens_hint": "Upper token limit per model response.",
"context_window": "Context Window",
"context_window_hint": "Model input context capacity in tokens. Leave empty to auto-detect (default: 4x max tokens).",
"context_window_hint": "Model input context capacity in tokens. Leave empty to use the default (4x max tokens).",
"max_tool_iterations": "Max Tool Iterations",
"max_tool_iterations_hint": "Maximum tool-call loops in a single task.",
"summarize_threshold": "Summarize Message Threshold",
+1 -1
View File
@@ -397,7 +397,7 @@
"max_tokens": "最大 Token 数",
"max_tokens_hint": "单次模型响应允许的最大 Token 数。",
"context_window": "上下文窗口",
"context_window_hint": "模型输入上下文容量(Token 数)。留空则自动推算(默认为最大 Token 数的 4 倍)。",
"context_window_hint": "模型输入上下文容量(Token 数)。留空使用默认值(最大 Token 数的 4 倍)。",
"max_tool_iterations": "最大工具迭代次数",
"max_tool_iterations_hint": "单个任务中允许的工具调用循环上限。",
"summarize_threshold": "触发摘要的消息阈值",