fix(agent): include ReasoningContent and Media in token estimation

estimateMessageTokens now counts ReasoningContent (extended thinking / chain-of-thought), which can be substantial and is persisted in session history. Media items get a fixed per-item overhead (256 tokens) since actual cost depends on provider-specific image tokenization.
2026-06-12 18:08:54 +00:00 · 2026-03-13 15:14:00 +08:00
parent 9c65d78b07
commit d5fdd5ebd2
2 changed files with 48 additions and 2 deletions
@@ -63,11 +63,17 @@ func findSafeBoundary(history []providers.Message, targetIndex int) int {
 }

 // estimateMessageTokens estimates the token count for a single message,
-// including Content, ToolCalls arguments, and ToolCallID metadata.
-// Uses a heuristic of 2.5 characters per token.
+// including Content, ReasoningContent, ToolCalls arguments, ToolCallID
+// metadata, and Media items. Uses a heuristic of 2.5 characters per token.
 func estimateMessageTokens(msg providers.Message) int {
 	chars := utf8.RuneCountInString(msg.Content)

+	// ReasoningContent (extended thinking / chain-of-thought) can be
+	// substantial and is stored in session history via AddFullMessage.
+	if msg.ReasoningContent != "" {
+		chars += utf8.RuneCountInString(msg.ReasoningContent)
+	}
+
 	for _, tc := range msg.ToolCalls {
 		// Count tool call metadata: ID, type, function name
 		chars += len(tc.ID) + len(tc.Type) + len(tc.Name)
@@ -80,6 +86,12 @@ func estimateMessageTokens(msg providers.Message) int {
 		chars += len(msg.ToolCallID)
 	}

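+	// Media items (images, files) are serialized by provider adapters into
+	// multipart or image_url payloads. Use a fixed per-item estimate since
+	// actual token cost depends on resolution and provider tokenization.
+	// NOTE(review): the constant below is denominated in tokens but is added
+	// to chars, a character count. If the final step applies the 2.5
+	// chars-per-token heuristic, each item yields ~102 tokens, not 256 — confirm.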
+	const mediaTokensPerItem = 256
+	chars += len(msg.Media) * mediaTokensPerItem
+
 	// Per-message overhead for role label, JSON structure, separators.
 	const messageOverhead = 12
 	chars += messageOverhead
@@ -389,6 +389,40 @@ func TestEstimateMessageTokens_LargeArguments(t *testing.T) {
 	}
 }

+func TestEstimateMessageTokens_ReasoningContent(t *testing.T) { // adding ReasoningContent must raise the estimate
+	plain := msgAssistant("result") // baseline; presumably an assistant message with only Content set (helper not shown)
+	withReasoning := providers.Message{
+		Role:             "assistant",
+		Content:          "result",
+		ReasoningContent: strings.Repeat("thinking step ", 200), // 14 chars x 200 = 2800 chars of reasoning
+	}
+
+	plainTokens := estimateMessageTokens(plain)
+	reasoningTokens := estimateMessageTokens(withReasoning)
+
+	if reasoningTokens <= plainTokens { // checks ordering only; the exact token delta is not pinned
+		t.Errorf("message with ReasoningContent (%d tokens) should exceed plain message (%d tokens)",
+			reasoningTokens, plainTokens)
+	}
+}
+
+func TestEstimateMessageTokens_MediaItems(t *testing.T) { // attaching Media items must raise the estimate
+	plain := msgUser("describe this") // baseline; presumably a user message with only Content set (helper not shown)
+	withMedia := providers.Message{
+		Role:    "user",
+		Content: "describe this",
+		Media:   []string{"media://img1.png", "media://img2.png"}, // two items; the estimator uses only the slice length
+	}
+
+	plainTokens := estimateMessageTokens(plain)
+	mediaTokens := estimateMessageTokens(withMedia)
+
+	if mediaTokens <= plainTokens { // checks ordering only; the per-item cost is not pinned
+		t.Errorf("message with Media (%d tokens) should exceed plain message (%d tokens)",
+			mediaTokens, plainTokens)
+	}
+}
+
 // --- estimateToolDefsTokens tests ---

 func TestEstimateToolDefsTokens(t *testing.T) {