feat: add extended thinking support for Anthropic models (#1076)

* feat: add extended thinking support for Anthropic models

  Support configurable thinking levels (off/low/medium/high/xhigh/adaptive) via `agents.defaults.thinking_level` config field.

  - "adaptive": uses Anthropic's adaptive thinking API (Claude 4.6+)
  - "low/medium/high/xhigh": uses budget_tokens (all thinking-capable models)
  - "off": disables thinking (default)

  API constraints handled:

  - Temperature cleared when thinking is enabled
  - budget_tokens clamped to max_tokens-1
  - Thinking response blocks parsed into Reasoning field

  Relates to #645, #966

* fix: address PR review feedback for thinking support

  - Add ThinkingCapable interface for provider capability detection
  - Warn when thinking_level is set but provider doesn't support it
  - Warn when temperature is cleared due to thinking enabled
  - Adjust budget values per Anthropic best practices (medium=16K, xhigh=64K)
  - Add budget clamp warning and 80% threshold warning
  - Add parseResponse thinking block tests
  - Add thinking_level field to config.example.json

* refactor: move ThinkingLevel from AgentDefaults to ModelConfig

  Thinking is a model-level capability, not a global agent property. Per-model config avoids silent ignoring on non-Anthropic providers and eliminates spurious warning logs in multi-provider setups.

  Addresses PR #1076 review feedback from @yinwm.
2026-06-12 18:08:54 +00:00 · 2026-03-05 09:51:18 +08:00
parent 325af2163b
commit 204038ec60
9 changed files with 401 additions and 17 deletions
@@ -22,7 +22,8 @@
      "model_name": "claude-sonnet-4.6",
      "model": "anthropic/claude-sonnet-4.6",
      "api_key": "sk-ant-your-key",
-      "api_base": "https://api.anthropic.com/v1"
+      "api_base": "https://api.anthropic.com/v1",
+      "thinking_level": "high"
    },
    {
      "model_name": "gemini",
@@ -26,6 +26,7 @@ type AgentInstance struct {
 	MaxIterations             int
 	MaxTokens                 int
 	Temperature               float64
+	ThinkingLevel             ThinkingLevel
 	ContextWindow             int
 	SummarizeMessageThreshold int
 	SummarizeTokenPercent     int
@@ -103,6 +104,12 @@ func NewAgentInstance(
 		temperature = *defaults.Temperature
 	}

+	var thinkingLevelStr string
+	if mc, err := cfg.GetModelConfig(model); err == nil {
+		thinkingLevelStr = mc.ThinkingLevel
+	}
+	thinkingLevel := parseThinkingLevel(thinkingLevelStr)
+
 	summarizeMessageThreshold := defaults.SummarizeMessageThreshold
 	if summarizeMessageThreshold == 0 {
 		summarizeMessageThreshold = 20
@@ -169,6 +176,7 @@ func NewAgentInstance(
 		MaxIterations:             maxIter,
 		MaxTokens:                 maxTokens,
 		Temperature:               temperature,
+		ThinkingLevel:             thinkingLevel,
 		ContextWindow:             maxTokens,
 		SummarizeMessageThreshold: summarizeMessageThreshold,
 		SummarizeTokenPercent:     summarizeTokenPercent,
@@ -834,23 +834,29 @@ func (al *AgentLoop) runLLMIteration(
 		var response *providers.LLMResponse
 		var err error

+		llmOpts := map[string]any{
+			"max_tokens":       agent.MaxTokens,
+			"temperature":      agent.Temperature,
+			"prompt_cache_key": agent.ID,
+		}
+		// parseThinkingLevel guarantees ThinkingOff for empty/unknown values,
+		// so checking != ThinkingOff is sufficient.
+		if agent.ThinkingLevel != ThinkingOff {
+			if tc, ok := agent.Provider.(providers.ThinkingCapable); ok && tc.SupportsThinking() {
+				llmOpts["thinking_level"] = string(agent.ThinkingLevel)
+			} else {
+				logger.WarnCF("agent", "thinking_level is set but current provider does not support it, ignoring",
+					map[string]any{"agent_id": agent.ID, "thinking_level": string(agent.ThinkingLevel)})
+			}
+		}
+
 		callLLM := func() (*providers.LLMResponse, error) {
 			if len(agent.Candidates) > 1 && al.fallback != nil {
 				fbResult, fbErr := al.fallback.Execute(
 					ctx,
 					agent.Candidates,
 					func(ctx context.Context, provider, model string) (*providers.LLMResponse, error) {
-						return agent.Provider.Chat(
-							ctx,
-							messages,
-							providerToolDefs,
-							model,
-							map[string]any{
-								"max_tokens":       agent.MaxTokens,
-								"temperature":      agent.Temperature,
-								"prompt_cache_key": agent.ID,
-							},
-						)
+						return agent.Provider.Chat(ctx, messages, providerToolDefs, model, llmOpts)
 					},
 				)
 				if fbErr != nil {
@@ -866,11 +872,7 @@ func (al *AgentLoop) runLLMIteration(
 				}
 				return fbResult.Response, nil
 			}
-			return agent.Provider.Chat(ctx, messages, providerToolDefs, agent.Model, map[string]any{
-				"max_tokens":       agent.MaxTokens,
-				"temperature":      agent.Temperature,
-				"prompt_cache_key": agent.ID,
-			})
+			return agent.Provider.Chat(ctx, messages, providerToolDefs, agent.Model, llmOpts)
 		}

 		// Retry loop for context/token errors
@@ -0,0 +1,39 @@
+package agent
+
+import "strings"
+
+// ThinkingLevel controls how the provider sends thinking parameters.
+//
+//   - "adaptive": sends {thinking: {type: "adaptive"}} + output_config.effort (Claude 4.6+)
+//   - "low"/"medium"/"high"/"xhigh": sends {thinking: {type: "enabled", budget_tokens: N}} (all models)
+//   - "off": disables thinking
+type ThinkingLevel string
+
+const (
+	ThinkingOff      ThinkingLevel = "off"
+	ThinkingLow      ThinkingLevel = "low"
+	ThinkingMedium   ThinkingLevel = "medium"
+	ThinkingHigh     ThinkingLevel = "high"
+	ThinkingXHigh    ThinkingLevel = "xhigh"
+	ThinkingAdaptive ThinkingLevel = "adaptive"
+)
+
+// parseThinkingLevel normalizes a config string to a ThinkingLevel.
+// Case-insensitive and whitespace-tolerant for user-facing config values.
+// Returns ThinkingOff for unknown or empty values.
+func parseThinkingLevel(level string) ThinkingLevel {
+	switch strings.ToLower(strings.TrimSpace(level)) {
+	case "adaptive":
+		return ThinkingAdaptive
+	case "low":
+		return ThinkingLow
+	case "medium":
+		return ThinkingMedium
+	case "high":
+		return ThinkingHigh
+	case "xhigh":
+		return ThinkingXHigh
+	default:
+		return ThinkingOff
+	}
+}
@@ -0,0 +1,35 @@
+package agent
+
+import "testing"
+
+func TestParseThinkingLevel(t *testing.T) {
+	tests := []struct {
+		name  string
+		input string
+		want  ThinkingLevel
+	}{
+		{"off", "off", ThinkingOff},
+		{"empty", "", ThinkingOff},
+		{"low", "low", ThinkingLow},
+		{"medium", "medium", ThinkingMedium},
+		{"high", "high", ThinkingHigh},
+		{"xhigh", "xhigh", ThinkingXHigh},
+		{"adaptive", "adaptive", ThinkingAdaptive},
+		{"unknown", "unknown", ThinkingOff},
+		// Case-insensitive and whitespace-tolerant
+		{"upper_Medium", "Medium", ThinkingMedium},
+		{"upper_HIGH", "HIGH", ThinkingHigh},
+		{"mixed_Adaptive", "Adaptive", ThinkingAdaptive},
+		{"leading_space", " high", ThinkingHigh},
+		{"trailing_space", "low ", ThinkingLow},
+		{"both_spaces", " medium ", ThinkingMedium},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := parseThinkingLevel(tt.input); got != tt.want {
+				t.Errorf("parseThinkingLevel(%q) = %q, want %q", tt.input, got, tt.want)
+			}
+		})
+	}
+}
@@ -507,6 +507,7 @@ type ModelConfig struct {
 	RPM            int    `json:"rpm,omitempty"`              // Requests per minute limit
 	MaxTokensField string `json:"max_tokens_field,omitempty"` // Field name for max tokens (e.g., "max_completion_tokens")
 	RequestTimeout int    `json:"request_timeout,omitempty"`
+	ThinkingLevel  string `json:"thinking_level,omitempty"` // Extended thinking: off|low|medium|high|xhigh|adaptive
 }

 // Validate checks if the ModelConfig has all required fields.
@@ -31,6 +31,9 @@ type Provider struct {
 	baseURL     string
 }

+// SupportsThinking implements providers.ThinkingCapable.
+func (p *Provider) SupportsThinking() bool { return true }
+
 func NewProvider(token string) *Provider {
 	return NewProviderWithBaseURL(token, "")
 }
@@ -182,9 +185,80 @@ func buildParams(
 		params.Tools = translateTools(tools)
 	}

+	// Extended Thinking / Adaptive Thinking
+	// The thinking_level value directly determines the API parameter format:
+	//   "adaptive" → {thinking: {type: "adaptive"}} + output_config.effort
+	//   "low/medium/high/xhigh" → {thinking: {type: "enabled", budget_tokens: N}}
+	if level, ok := options["thinking_level"].(string); ok && level != "" && level != "off" {
+		applyThinkingConfig(&params, level)
+	}
+
 	return params, nil
 }

+// applyThinkingConfig sets thinking parameters on params based on level.
+//
+//   - "adaptive" selects the adaptive thinking API (Claude 4.6+) with high effort.
+//   - "low", "medium", "high" and "xhigh" enable thinking with the budget_tokens
+//     value returned by levelToBudget.
+//   - Any other level leaves params untouched.
+//
+// Anthropic API constraints handled here:
+//   - temperature must not be set when thinking is enabled, so it is cleared,
+//     but only once it is certain that a thinking config will be sent;
+//   - budget_tokens must be strictly less than max_tokens, so it is clamped;
+//   - budget_tokens must be at least 1024, so thinking is skipped (and the
+//     temperature kept) when max_tokens leaves no room for a valid budget.
+func applyThinkingConfig(params *anthropic.MessageNewParams, level string) {
+	// Smallest budget_tokens value the Anthropic API accepts.
+	const minThinkingBudget = 1024
+
+	isAdaptive := level == "adaptive"
+	budget := int64(levelToBudget(level))
+	if !isAdaptive {
+		if budget <= 0 {
+			// Unrecognized level: thinking stays off, so the caller's
+			// temperature must be preserved.
+			return
+		}
+		if params.MaxTokens <= minThinkingBudget {
+			// Clamping to max_tokens-1 would fall below the API minimum and
+			// the request would be rejected; send it without thinking instead.
+			log.Printf("anthropic: thinking disabled (level=%s): max_tokens (%d) leaves no room for the minimum budget_tokens (%d)",
+				level, params.MaxTokens, minThinkingBudget)
+			return
+		}
+	}
+
+	// Anthropic API rejects requests with temperature set alongside thinking.
+	// Reset to zero value (omitted from JSON serialization).
+	if params.Temperature.Valid() {
+		log.Printf("anthropic: temperature cleared because thinking is enabled (level=%s)", level)
+	}
+	params.Temperature = anthropic.MessageNewParams{}.Temperature
+
+	if isAdaptive {
+		adaptive := anthropic.NewThinkingConfigAdaptiveParam()
+		params.Thinking = anthropic.ThinkingConfigParamUnion{OfAdaptive: &adaptive}
+		params.OutputConfig = anthropic.OutputConfigParam{
+			Effort: anthropic.OutputConfigEffortHigh,
+		}
+		return
+	}
+
+	// budget_tokens must be < max_tokens; clamp to respect user's max_tokens setting.
+	if budget >= params.MaxTokens {
+		log.Printf("anthropic: budget_tokens (%d) clamped to %d (max_tokens-1)", budget, params.MaxTokens-1)
+		budget = params.MaxTokens - 1
+	} else if budget > params.MaxTokens*80/100 {
+		log.Printf("anthropic: thinking budget (%d) exceeds 80%% of max_tokens (%d), output may be truncated",
+			budget, params.MaxTokens)
+	}
+	params.Thinking = anthropic.ThinkingConfigParamOfEnabled(budget)
+}
+
+// levelToBudget maps a thinking level to budget_tokens.
+// Values are based on Anthropic's recommendations and community best practices:
+//
+//	low    =  4,096  — simple reasoning, quick debugging (Claude Code "think")
+//	medium = 16,384  — Anthropic recommended sweet spot for most tasks
+//	high   = 32,000  — complex architecture, deep analysis (diminishing returns above this)
+//	xhigh  = 64,000  — extreme reasoning, research problems, benchmarks
+//
+// Note: For Claude 4.6+, prefer adaptive thinking over manual budget_tokens.
+func levelToBudget(level string) int {
+	switch level {
+	case "low":
+		return 4096
+	case "medium":
+		return 16384
+	case "high":
+		return 32000
+	case "xhigh":
+		return 64000
+	default:
+		return 0
+	}
+}
+
 func translateTools(tools []ToolDefinition) []anthropic.ToolUnionParam {
 	result := make([]anthropic.ToolUnionParam, 0, len(tools))
 	for _, t := range tools {
@@ -213,10 +287,14 @@ func translateTools(tools []ToolDefinition) []anthropic.ToolUnionParam {

 func parseResponse(resp *anthropic.Message) *LLMResponse {
 	var content strings.Builder
+	var reasoning strings.Builder
 	var toolCalls []ToolCall

 	for _, block := range resp.Content {
 		switch block.Type {
+		case "thinking":
+			tb := block.AsThinking()
+			reasoning.WriteString(tb.Thinking)
 		case "text":
 			tb := block.AsText()
 			content.WriteString(tb.Text)
@@ -247,6 +325,7 @@ func parseResponse(resp *anthropic.Message) *LLMResponse {

 	return &LLMResponse{
 		Content:      content.String(),
+		Reasoning:    reasoning.String(),
 		ToolCalls:    toolCalls,
 		FinishReason: finishReason,
 		Usage: &UsageInfo{
@@ -0,0 +1,212 @@
+package anthropicprovider
+
+import (
+	"encoding/json"
+	"testing"
+
+	"github.com/anthropics/anthropic-sdk-go"
+)
+
+func TestApplyThinkingConfig_Adaptive(t *testing.T) {
+	params := anthropic.MessageNewParams{
+		MaxTokens:   16000,
+		Temperature: anthropic.Float(0.7),
+	}
+	applyThinkingConfig(&params, "adaptive")
+
+	if params.Thinking.OfAdaptive == nil {
+		t.Fatal("expected adaptive thinking")
+	}
+	if params.Thinking.OfEnabled != nil {
+		t.Error("should not set enabled thinking in adaptive mode")
+	}
+	if params.OutputConfig.Effort != anthropic.OutputConfigEffortHigh {
+		t.Errorf("effort = %q, want %q", params.OutputConfig.Effort, anthropic.OutputConfigEffortHigh)
+	}
+	if params.Temperature.Valid() {
+		t.Error("temperature should be cleared when thinking is enabled")
+	}
+}
+
+func TestApplyThinkingConfig_BudgetLevels(t *testing.T) {
+	tests := []struct {
+		level      string
+		wantBudget int64
+	}{
+		{"low", 4096},
+		{"medium", 16384},
+		{"high", 32000},
+		{"xhigh", 64000},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.level, func(t *testing.T) {
+			params := anthropic.MessageNewParams{
+				MaxTokens:   200000,
+				Temperature: anthropic.Float(0.5),
+			}
+			applyThinkingConfig(&params, tt.level)
+
+			if params.Thinking.OfEnabled == nil {
+				t.Fatal("expected enabled thinking")
+			}
+			if params.Thinking.OfAdaptive != nil {
+				t.Error("should not set adaptive thinking")
+			}
+			if params.Thinking.OfEnabled.BudgetTokens != tt.wantBudget {
+				t.Errorf("budget_tokens = %d, want %d", params.Thinking.OfEnabled.BudgetTokens, tt.wantBudget)
+			}
+			if params.OutputConfig.Effort != "" {
+				t.Errorf("effort = %q, want empty", params.OutputConfig.Effort)
+			}
+			if params.Temperature.Valid() {
+				t.Error("temperature should be cleared when thinking is enabled")
+			}
+		})
+	}
+}
+
+func TestApplyThinkingConfig_BudgetClamp(t *testing.T) {
+	// budget_tokens must be < max_tokens; clamp budget down to respect user's max_tokens.
+	params := anthropic.MessageNewParams{MaxTokens: 4096}
+	applyThinkingConfig(&params, "high") // budget=32000 > maxTokens=4096
+
+	if params.Thinking.OfEnabled == nil {
+		t.Fatal("expected enabled thinking")
+	}
+	if params.Thinking.OfEnabled.BudgetTokens != 4095 {
+		t.Errorf("budget_tokens = %d, want 4095 (maxTokens-1)", params.Thinking.OfEnabled.BudgetTokens)
+	}
+	if params.MaxTokens != 4096 {
+		t.Errorf("max_tokens should not be modified, got %d", params.MaxTokens)
+	}
+}
+
+func TestApplyThinkingConfig_UnknownLevel(t *testing.T) {
+	params := anthropic.MessageNewParams{MaxTokens: 16000}
+	applyThinkingConfig(&params, "unknown")
+
+	if params.Thinking.OfEnabled != nil {
+		t.Error("should not set enabled thinking for unknown level")
+	}
+	if params.Thinking.OfAdaptive != nil {
+		t.Error("should not set adaptive thinking for unknown level")
+	}
+}
+
+func TestLevelToBudget(t *testing.T) {
+	tests := []struct {
+		name  string
+		level string
+		want  int
+	}{
+		{"low", "low", 4096},
+		{"medium", "medium", 16384},
+		{"high", "high", 32000},
+		{"xhigh", "xhigh", 64000},
+		{"off", "off", 0},
+		{"empty", "", 0},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := levelToBudget(tt.level); got != tt.want {
+				t.Errorf("levelToBudget(%q) = %d, want %d", tt.level, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestBuildParams_ThinkingClearsTemperature(t *testing.T) {
+	msgs := []Message{{Role: "user", Content: "hello"}}
+	opts := map[string]any{
+		"max_tokens":     200000,
+		"temperature":    0.8,
+		"thinking_level": "medium",
+	}
+
+	params, err := buildParams(msgs, nil, "claude-sonnet-4-6", opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if params.Temperature.Valid() {
+		t.Error("temperature should be cleared when thinking_level is set")
+	}
+	if params.Thinking.OfEnabled == nil {
+		t.Fatal("expected enabled thinking")
+	}
+	if params.Thinking.OfEnabled.BudgetTokens != 16384 {
+		t.Errorf("budget_tokens = %d, want 16384", params.Thinking.OfEnabled.BudgetTokens)
+	}
+}
+
+// unmarshalBlocks constructs []ContentBlockUnion via JSON round-trip so that
+// the internal JSON.raw field is populated (required by AsText/AsThinking).
+func unmarshalBlocks(t *testing.T, jsonStr string) []anthropic.ContentBlockUnion {
+	t.Helper()
+	var blocks []anthropic.ContentBlockUnion
+	if err := json.Unmarshal([]byte(jsonStr), &blocks); err != nil {
+		t.Fatalf("unmarshalBlocks: %v", err)
+	}
+	return blocks
+}
+
+func TestParseResponse_ThinkingBlock(t *testing.T) {
+	resp := &anthropic.Message{
+		Content: unmarshalBlocks(t, `[
+			{"type":"thinking","thinking":"Let me reason step by step...","signature":"sig"},
+			{"type":"text","text":"The answer is 42."}
+		]`),
+		StopReason: anthropic.StopReasonEndTurn,
+	}
+
+	result := parseResponse(resp)
+
+	if result.Reasoning != "Let me reason step by step..." {
+		t.Errorf("Reasoning = %q, want thinking content", result.Reasoning)
+	}
+	if result.Content != "The answer is 42." {
+		t.Errorf("Content = %q, want text content", result.Content)
+	}
+	if result.FinishReason != "stop" {
+		t.Errorf("FinishReason = %q, want stop", result.FinishReason)
+	}
+}
+
+func TestParseResponse_NoThinkingBlock(t *testing.T) {
+	resp := &anthropic.Message{
+		Content: unmarshalBlocks(t, `[
+			{"type":"text","text":"Just a normal response."}
+		]`),
+		StopReason: anthropic.StopReasonEndTurn,
+	}
+
+	result := parseResponse(resp)
+
+	if result.Reasoning != "" {
+		t.Errorf("Reasoning = %q, want empty", result.Reasoning)
+	}
+	if result.Content != "Just a normal response." {
+		t.Errorf("Content = %q, want text content", result.Content)
+	}
+}
+
+func TestBuildParams_NoThinkingKeepsTemperature(t *testing.T) {
+	msgs := []Message{{Role: "user", Content: "hello"}}
+	opts := map[string]any{
+		"temperature": 0.8,
+	}
+
+	params, err := buildParams(msgs, nil, "claude-sonnet-4-6", opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if !params.Temperature.Valid() {
+		t.Error("temperature should be preserved when thinking is not set")
+	}
+	if params.Temperature.Value != 0.8 {
+		t.Errorf("temperature = %f, want 0.8", params.Temperature.Value)
+	}
+}
@@ -37,6 +37,13 @@ type StatefulProvider interface {
 	Close()
 }

+// ThinkingCapable is an optional interface for providers that support
+// extended thinking (e.g. Anthropic). The agent loop forwards thinking_level
+// only to providers that report support, and warns when it is configured otherwise.
+type ThinkingCapable interface {
+	SupportsThinking() bool
+}
+
 // FailoverReason classifies why an LLM request failed for fallback decisions.
 type FailoverReason string