feat: add extended thinking support for Anthropic models (#1076)

* feat: add extended thinking support for Anthropic models

  Support configurable thinking levels (off/low/medium/high/xhigh/adaptive) via the `agents.defaults.thinking_level` config field.

  - "adaptive": uses Anthropic's adaptive thinking API (Claude 4.6+)
  - "low/medium/high/xhigh": uses budget_tokens (all thinking-capable models)
  - "off": disables thinking (default)

  API constraints handled:

  - Temperature cleared when thinking is enabled
  - budget_tokens clamped to max_tokens-1
  - Thinking response blocks parsed into Reasoning field

  Relates to #645, #966

* fix: address PR review feedback for thinking support

  - Add ThinkingCapable interface for provider capability detection
  - Warn when thinking_level is set but the provider doesn't support it
  - Warn when temperature is cleared because thinking is enabled
  - Adjust budget values per Anthropic best practices (medium=16K, xhigh=64K)
  - Add budget clamp warning and 80% threshold warning
  - Add parseResponse thinking block tests
  - Add thinking_level field to config.example.json

* refactor: move ThinkingLevel from AgentDefaults to ModelConfig

  Thinking is a model-level capability, not a global agent property. Per-model config avoids silently ignoring the setting on non-Anthropic providers and eliminates spurious warning logs in multi-provider setups.

  Addresses PR #1076 review feedback from @yinwm.
2026-06-12 18:08:54 +00:00 · 2026-03-05 09:51:18 +08:00
parent 325af2163b
commit 204038ec60
9 changed files with 401 additions and 17 deletions
@@ -31,6 +31,9 @@ type Provider struct {
 	baseURL     string
 }

+// SupportsThinking implements providers.ThinkingCapable; this provider always
+// reports that it supports extended thinking.
+func (p *Provider) SupportsThinking() bool {
+	return true
+}
+
 // NewProvider builds a Provider for token by delegating to
 // NewProviderWithBaseURL with an empty base URL.
 func NewProvider(token string) *Provider {
 	const emptyBaseURL = ""
 	return NewProviderWithBaseURL(token, emptyBaseURL)
 }
@@ -182,9 +185,80 @@ func buildParams(
 		params.Tools = translateTools(tools)
 	}

+	// Extended Thinking / Adaptive Thinking
+	// The thinking_level value directly determines the API parameter format:
+	//   "adaptive" → {thinking: {type: "adaptive"}} + output_config.effort
+	//   "low/medium/high/xhigh" → {thinking: {type: "enabled", budget_tokens: N}}
+	if level, ok := options["thinking_level"].(string); ok && level != "" && level != "off" {
+		applyThinkingConfig(&params, level)
+	}
+
 	return params, nil
 }

+// applyThinkingConfig sets thinking parameters based on the level value.
+// "adaptive" uses the adaptive thinking API (Claude 4.6+).
+// All other levels use budget_tokens which is universally supported.
+//
+// Anthropic API constraint: temperature must not be set when thinking is enabled.
+// budget_tokens must be strictly less than max_tokens.
+func applyThinkingConfig(params *anthropic.MessageNewParams, level string) {
+	// Anthropic API rejects requests with temperature set alongside thinking.
+	// Reset to zero value (omitted from JSON serialization).
+	if params.Temperature.Valid() {
+		log.Printf("anthropic: temperature cleared because thinking is enabled (level=%s)", level)
+	}
+	params.Temperature = anthropic.MessageNewParams{}.Temperature
+
+	if level == "adaptive" {
+		adaptive := anthropic.NewThinkingConfigAdaptiveParam()
+		params.Thinking = anthropic.ThinkingConfigParamUnion{OfAdaptive: &adaptive}
+		params.OutputConfig = anthropic.OutputConfigParam{
+			Effort: anthropic.OutputConfigEffortHigh,
+		}
+		return
+	}
+
+	budget := int64(levelToBudget(level))
+	if budget <= 0 {
+		return
+	}
+
+	// budget_tokens must be < max_tokens; clamp to respect user's max_tokens setting.
+	if budget >= params.MaxTokens {
+		log.Printf("anthropic: budget_tokens (%d) clamped to %d (max_tokens-1)", budget, params.MaxTokens-1)
+		budget = params.MaxTokens - 1
+	} else if budget > params.MaxTokens*80/100 {
+		log.Printf("anthropic: thinking budget (%d) exceeds 80%% of max_tokens (%d), output may be truncated",
+			budget, params.MaxTokens)
+	}
+	params.Thinking = anthropic.ThinkingConfigParamOfEnabled(budget)
+}
+
+// levelToBudget converts a named thinking level into a budget_tokens value.
+// The figures follow Anthropic's recommendations and community best practices:
+//
+//	low    =  4,096  — simple reasoning, quick debugging (Claude Code "think")
+//	medium = 16,384  — Anthropic recommended sweet spot for most tasks
+//	high   = 32,000  — complex architecture, deep analysis (diminishing returns above this)
+//	xhigh  = 64,000  — extreme reasoning, research problems, benchmarks
+//
+// Any other level yields 0.
+//
+// Note: For Claude 4.6+, prefer adaptive thinking over manual budget_tokens.
+func levelToBudget(level string) int {
+	budget := 0
+	switch level {
+	case "low":
+		budget = 4096
+	case "medium":
+		budget = 16384
+	case "high":
+		budget = 32000
+	case "xhigh":
+		budget = 64000
+	}
+	return budget
+}
+
 func translateTools(tools []ToolDefinition) []anthropic.ToolUnionParam {
 	result := make([]anthropic.ToolUnionParam, 0, len(tools))
 	for _, t := range tools {
@@ -213,10 +287,14 @@ func translateTools(tools []ToolDefinition) []anthropic.ToolUnionParam {

 func parseResponse(resp *anthropic.Message) *LLMResponse {
 	var content strings.Builder
+	var reasoning strings.Builder
 	var toolCalls []ToolCall

 	for _, block := range resp.Content {
 		switch block.Type {
+		case "thinking":
+			tb := block.AsThinking()
+			reasoning.WriteString(tb.Thinking)
 		case "text":
 			tb := block.AsText()
 			content.WriteString(tb.Text)
@@ -247,6 +325,7 @@ func parseResponse(resp *anthropic.Message) *LLMResponse {

 	return &LLMResponse{
 		Content:      content.String(),
+		Reasoning:    reasoning.String(),
 		ToolCalls:    toolCalls,
 		FinishReason: finishReason,
 		Usage: &UsageInfo{
@@ -0,0 +1,212 @@
+package anthropicprovider
+
+import (
+	"encoding/json"
+	"testing"
+
+	"github.com/anthropics/anthropic-sdk-go"
+)
+
+// TestApplyThinkingConfig_Adaptive checks that the "adaptive" level selects
+// adaptive thinking with high effort and clears a configured temperature.
+func TestApplyThinkingConfig_Adaptive(t *testing.T) {
+	params := anthropic.MessageNewParams{
+		Temperature: anthropic.Float(0.7),
+		MaxTokens:   16000,
+	}
+	applyThinkingConfig(&params, "adaptive")
+
+	thinking := params.Thinking
+	if thinking.OfAdaptive == nil {
+		t.Fatal("expected adaptive thinking")
+	}
+	if thinking.OfEnabled != nil {
+		t.Error("should not set enabled thinking in adaptive mode")
+	}
+	if got, want := params.OutputConfig.Effort, anthropic.OutputConfigEffortHigh; got != want {
+		t.Errorf("effort = %q, want %q", got, want)
+	}
+	if params.Temperature.Valid() {
+		t.Error("temperature should be cleared when thinking is enabled")
+	}
+}
+
+// TestApplyThinkingConfig_BudgetLevels checks that each budget-based level
+// enables thinking with its expected budget_tokens, leaves the effort unset,
+// and clears a configured temperature.
+func TestApplyThinkingConfig_BudgetLevels(t *testing.T) {
+	cases := []struct {
+		level  string
+		budget int64
+	}{
+		{level: "low", budget: 4096},
+		{level: "medium", budget: 16384},
+		{level: "high", budget: 32000},
+		{level: "xhigh", budget: 64000},
+	}
+
+	for i := range cases {
+		tc := cases[i]
+		t.Run(tc.level, func(t *testing.T) {
+			// max_tokens is large enough that no level gets clamped.
+			params := anthropic.MessageNewParams{
+				Temperature: anthropic.Float(0.5),
+				MaxTokens:   200000,
+			}
+			applyThinkingConfig(&params, tc.level)
+
+			enabled := params.Thinking.OfEnabled
+			if enabled == nil {
+				t.Fatal("expected enabled thinking")
+			}
+			if params.Thinking.OfAdaptive != nil {
+				t.Error("should not set adaptive thinking")
+			}
+			if got := enabled.BudgetTokens; got != tc.budget {
+				t.Errorf("budget_tokens = %d, want %d", got, tc.budget)
+			}
+			if effort := params.OutputConfig.Effort; effort != "" {
+				t.Errorf("effort = %q, want empty", effort)
+			}
+			if params.Temperature.Valid() {
+				t.Error("temperature should be cleared when thinking is enabled")
+			}
+		})
+	}
+}
+
+func TestApplyThinkingConfig_BudgetClamp(t *testing.T) {
+	// budget_tokens must be < max_tokens; clamp budget down to respect user's max_tokens.
+	params := anthropic.MessageNewParams{MaxTokens: 4096}
+	applyThinkingConfig(&params, "high") // budget=32000 > maxTokens=4096
+
+	if params.Thinking.OfEnabled == nil {
+		t.Fatal("expected enabled thinking")
+	}
+	if params.Thinking.OfEnabled.BudgetTokens != 4095 {
+		t.Errorf("budget_tokens = %d, want 4095 (maxTokens-1)", params.Thinking.OfEnabled.BudgetTokens)
+	}
+	if params.MaxTokens != 4096 {
+		t.Errorf("max_tokens should not be modified, got %d", params.MaxTokens)
+	}
+}
+
+// TestApplyThinkingConfig_UnknownLevel checks that an unrecognized level
+// enables neither budget-based nor adaptive thinking.
+func TestApplyThinkingConfig_UnknownLevel(t *testing.T) {
+	params := anthropic.MessageNewParams{MaxTokens: 16000}
+	applyThinkingConfig(&params, "unknown")
+
+	thinking := params.Thinking
+	if thinking.OfEnabled != nil {
+		t.Error("should not set enabled thinking for unknown level")
+	}
+	if thinking.OfAdaptive != nil {
+		t.Error("should not set adaptive thinking for unknown level")
+	}
+}
+
+// TestLevelToBudget checks the level-to-budget mapping, including the levels
+// that map to no budget at all.
+func TestLevelToBudget(t *testing.T) {
+	cases := []struct {
+		label  string
+		level  string
+		budget int
+	}{
+		{label: "low", level: "low", budget: 4096},
+		{label: "medium", level: "medium", budget: 16384},
+		{label: "high", level: "high", budget: 32000},
+		{label: "xhigh", level: "xhigh", budget: 64000},
+		{label: "off", level: "off", budget: 0},
+		{label: "empty", level: "", budget: 0},
+	}
+
+	for i := range cases {
+		tc := cases[i]
+		t.Run(tc.label, func(t *testing.T) {
+			got := levelToBudget(tc.level)
+			if got != tc.budget {
+				t.Errorf("levelToBudget(%q) = %d, want %d", tc.level, got, tc.budget)
+			}
+		})
+	}
+}
+
+// TestBuildParams_ThinkingClearsTemperature checks that buildParams with a
+// thinking_level option enables budget-based thinking and clears the
+// temperature passed in the same options.
+func TestBuildParams_ThinkingClearsTemperature(t *testing.T) {
+	opts := map[string]any{
+		"max_tokens":     200000,
+		"temperature":    0.8,
+		"thinking_level": "medium",
+	}
+	msgs := []Message{{Role: "user", Content: "hello"}}
+
+	params, err := buildParams(msgs, nil, "claude-sonnet-4-6", opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if params.Temperature.Valid() {
+		t.Error("temperature should be cleared when thinking_level is set")
+	}
+	enabled := params.Thinking.OfEnabled
+	if enabled == nil {
+		t.Fatal("expected enabled thinking")
+	}
+	if got := enabled.BudgetTokens; got != 16384 {
+		t.Errorf("budget_tokens = %d, want 16384", got)
+	}
+}
+
+// unmarshalBlocks decodes jsonStr into a []ContentBlockUnion. Going through
+// JSON rather than struct literals populates the internal JSON.raw field
+// (required by AsText/AsThinking). A decoding error fails the test at once.
+func unmarshalBlocks(t *testing.T, jsonStr string) []anthropic.ContentBlockUnion {
+	t.Helper()
+
+	var decoded []anthropic.ContentBlockUnion
+	err := json.Unmarshal([]byte(jsonStr), &decoded)
+	if err != nil {
+		t.Fatalf("unmarshalBlocks: %v", err)
+	}
+	return decoded
+}
+
+// TestParseResponse_ThinkingBlock checks that a thinking block ends up in
+// Reasoning, a text block ends up in Content, and an end_turn stop reason is
+// reported as "stop".
+func TestParseResponse_ThinkingBlock(t *testing.T) {
+	blocks := unmarshalBlocks(t, `[
+			{"type":"thinking","thinking":"Let me reason step by step...","signature":"sig"},
+			{"type":"text","text":"The answer is 42."}
+		]`)
+	resp := &anthropic.Message{
+		Content:    blocks,
+		StopReason: anthropic.StopReasonEndTurn,
+	}
+
+	result := parseResponse(resp)
+
+	if got := result.Reasoning; got != "Let me reason step by step..." {
+		t.Errorf("Reasoning = %q, want thinking content", got)
+	}
+	if got := result.Content; got != "The answer is 42." {
+		t.Errorf("Content = %q, want text content", got)
+	}
+	if got := result.FinishReason; got != "stop" {
+		t.Errorf("FinishReason = %q, want stop", got)
+	}
+}
+
+// TestParseResponse_NoThinkingBlock checks that a response with only a text
+// block yields an empty Reasoning and the text as Content.
+func TestParseResponse_NoThinkingBlock(t *testing.T) {
+	blocks := unmarshalBlocks(t, `[
+			{"type":"text","text":"Just a normal response."}
+		]`)
+	resp := &anthropic.Message{
+		Content:    blocks,
+		StopReason: anthropic.StopReasonEndTurn,
+	}
+
+	result := parseResponse(resp)
+
+	if got := result.Reasoning; got != "" {
+		t.Errorf("Reasoning = %q, want empty", got)
+	}
+	if got := result.Content; got != "Just a normal response." {
+		t.Errorf("Content = %q, want text content", got)
+	}
+}
+
+// TestBuildParams_NoThinkingKeepsTemperature checks that buildParams passes
+// the configured temperature through when no thinking_level is given.
+func TestBuildParams_NoThinkingKeepsTemperature(t *testing.T) {
+	opts := map[string]any{"temperature": 0.8}
+	msgs := []Message{{Role: "user", Content: "hello"}}
+
+	params, err := buildParams(msgs, nil, "claude-sonnet-4-6", opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	temperature := params.Temperature
+	if !temperature.Valid() {
+		t.Error("temperature should be preserved when thinking is not set")
+	}
+	if temperature.Value != 0.8 {
+		t.Errorf("temperature = %f, want 0.8", temperature.Value)
+	}
+}
@@ -37,6 +37,13 @@ type StatefulProvider interface {
 	Close()
 }

+// ThinkingCapable is an optional interface implemented by providers that
+// support extended thinking (e.g. Anthropic). The agent loop uses it to warn
+// when thinking_level is configured but the active provider cannot use it.
+type ThinkingCapable interface {
+	SupportsThinking() bool
+}
+
 // FailoverReason classifies why an LLM request failed for fallback decisions.
 type FailoverReason string