diff --git a/pkg/agent/context_budget.go b/pkg/agent/context_budget.go index 05e27e18a..0b7f443e6 100644 --- a/pkg/agent/context_budget.go +++ b/pkg/agent/context_budget.go @@ -95,10 +95,14 @@ func estimateMessageTokens(msg providers.Message) int { } for _, tc := range msg.ToolCalls { - // Count tool call metadata: ID, type, function name - chars += len(tc.ID) + len(tc.Type) + len(tc.Name) + chars += len(tc.ID) + len(tc.Type) if tc.Function != nil { + // Count function name + arguments (the wire format for most providers). + // tc.Name mirrors tc.Function.Name — count only once to avoid double-counting. chars += len(tc.Function.Name) + len(tc.Function.Arguments) + } else { + // Fallback: some provider formats use top-level Name without Function. + chars += len(tc.Name) } } @@ -106,17 +110,20 @@ func estimateMessageTokens(msg providers.Message) int { chars += len(msg.ToolCallID) } - // Media items (images, files) are serialized by provider adapters into - // multipart or image_url payloads. Use a fixed per-item estimate since - // actual token cost depends on resolution and provider tokenization. - const mediaTokensPerItem = 256 - chars += len(msg.Media) * mediaTokensPerItem - // Per-message overhead for role label, JSON structure, separators. const messageOverhead = 12 chars += messageOverhead - return chars * 2 / 5 + tokens := chars * 2 / 5 + + // Media items (images, files) are serialized by provider adapters into + // multipart or image_url payloads. Add a fixed per-item token estimate + // directly (not through the chars heuristic) since actual cost depends + // on resolution and provider-specific image tokenization. + const mediaTokensPerItem = 256 + tokens += len(msg.Media) * mediaTokensPerItem + + return tokens } // estimateToolDefsTokens estimates the total token cost of tool definitions diff --git a/pkg/agent/context_budget_test.go b/pkg/agent/context_budget_test.go index 15198d03b..175e04885 100644 --- a/pkg/agent/context_budget_test.go +++ b/pkg/agent/context_budget_test.go @@ -503,6 +503,13 @@ func TestEstimateMessageTokens_MediaItems(t *testing.T) { t.Errorf("message with Media (%d tokens) should exceed plain message (%d tokens)", mediaTokens, plainTokens) } + + // Each media item should add exactly 256 tokens (not run through chars*2/5). + expectedDelta := 256 * 2 + actualDelta := mediaTokens - plainTokens + if actualDelta != expectedDelta { + t.Errorf("2 media items should add %d tokens, got delta %d", expectedDelta, actualDelta) + } } // --- estimateToolDefsTokens tests --- diff --git a/web/frontend/src/i18n/locales/en.json b/web/frontend/src/i18n/locales/en.json index 116ee4441..09852e0c7 100644 --- a/web/frontend/src/i18n/locales/en.json +++ b/web/frontend/src/i18n/locales/en.json @@ -397,7 +397,7 @@ "max_tokens": "Max Tokens", "max_tokens_hint": "Upper token limit per model response.", "context_window": "Context Window", - "context_window_hint": "Model input context capacity in tokens. Leave empty to auto-detect (default: 4x max tokens).", + "context_window_hint": "Model input context capacity in tokens. Leave empty to use the default (4x max tokens).", "max_tool_iterations": "Max Tool Iterations", "max_tool_iterations_hint": "Maximum tool-call loops in a single task.", "summarize_threshold": "Summarize Message Threshold", diff --git a/web/frontend/src/i18n/locales/zh.json b/web/frontend/src/i18n/locales/zh.json index e68c46085..c92ea0032 100644 --- a/web/frontend/src/i18n/locales/zh.json +++ b/web/frontend/src/i18n/locales/zh.json @@ -397,7 +397,7 @@ "max_tokens": "最大 Token 数", "max_tokens_hint": "单次模型响应允许的最大 Token 数。", "context_window": "上下文窗口", - "context_window_hint": "模型输入上下文容量(Token 数)。留空则自动推算(默认为最大 Token 数的 4 倍)。", + "context_window_hint": "模型输入上下文容量(Token 数)。留空使用默认值(最大 Token 数的 4 倍)。", "max_tool_iterations": "最大工具迭代次数", "max_tool_iterations_hint": "单个任务中允许的工具调用循环上限。", "summarize_threshold": "触发摘要的消息阈值",