fix(openai): stop max_tokens streaming retry loop on reasoning models (#9716) (#10448)

fix(openai): stop max_tokens streaming retry loop on reasoning models

When a thinking model spends its entire max_tokens budget on the reasoning block, the C++ autoparser clears the raw Response and delivers reasoning-only ChatDeltas (no content, no tool calls). ComputeChoices' empty-response retry then fires and regenerates from scratch up to maxRetries times, each re-consuming the whole budget, instead of terminating with finish_reason "length" (issue #9716).

Add a reachedTokenBudget helper and suppress both the built-in and caller-driven retries when the completion count has reached the configured max_tokens ceiling. Report finish_reason "length" instead of "stop" in the streaming and non-streaming chat paths when the budget was exhausted.

Adds a deterministic regression test that counts backend invocations (previously 6, now 1) plus boundary tests for the helper.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Dennisadira <dennisadira@gmail.com>
2026-06-30 11:26:32 -04:00 · 2026-06-30 10:01:53 +03:00
parent 5d0c43ec6e
commit 28d7397743
4 changed files with 106 additions and 2 deletions
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -618,6 +618,10 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					finishReason = FinishReasonToolCalls
 				} else if toolsCalled {
 					finishReason = FinishReasonFunctionCall
+				} else if reachedTokenBudget(finalUsage.Completion, config.Maxtokens) {
+					// Generation stopped because it hit the max_tokens ceiling
+					// rather than a natural stop — report "length" (issue #9716).
+					finishReason = FinishReasonLength
 				}

 				// Final delta chunk: empty delta with finish_reason set. Per
@@ -984,6 +988,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					}
 				}

+				// If generation hit the max_tokens ceiling, report "length"
+				// instead of a natural "stop" (issue #9716). Mirrors the
+				// streaming path; tool/function finish reasons are untouched.
+				if reachedTokenBudget(tokenUsage.Completion, config.Maxtokens) {
+					for i := range result {
+						if result[i].FinishReason != nil && *result[i].FinishReason == FinishReasonStop {
+							lengthReason := FinishReasonLength
+							result[i].FinishReason = &lengthReason
+						}
+					}
+				}
+
 				// No MCP tools to execute (or no MCP tools configured), return response
 				usage := schema.OpenAIUsage{
 					PromptTokens:     tokenUsage.Prompt,
--- a/core/http/endpoints/openai/constants.go
+++ b/core/http/endpoints/openai/constants.go
@@ -5,4 +5,7 @@ const (
 	FinishReasonStop         = "stop"
 	FinishReasonToolCalls    = "tool_calls"
 	FinishReasonFunctionCall = "function_call"
+	// FinishReasonLength is reported when generation stopped because it
+	// reached the max_tokens budget rather than a natural stop (issue #9716).
+	FinishReasonLength = "length"
 )
--- a/core/http/endpoints/openai/inference.go
+++ b/core/http/endpoints/openai/inference.go
@@ -13,6 +13,14 @@ import (
 	"github.com/mudler/xlog"
 )

+// reachedTokenBudget reports whether the completion token count has reached or
+// exceeded the max_tokens ceiling. A nil or non-positive maxTokens means "no
+// limit" and always yields false. Callers use it to skip regeneration retries
+// that would hit the same ceiling and to report finish_reason "length" (#9716).
+func reachedTokenBudget(completion int, maxTokens *int) bool {
+	return maxTokens != nil && *maxTokens > 0 && completion >= *maxTokens
+}
+
 func ComputeChoices(
 	req *schema.OpenAIRequest,
 	predInput string,
@@ -113,11 +121,21 @@ func ComputeChoices(
 			}
 			prediction = p

+			// budgetExhausted is true when the model stopped because it reached
+			// the configured max_tokens ceiling. None of the retry paths below
+			// should fire in that case: regenerating would just hit the same
+			// ceiling again and multiply token consumption (issue #9716). A
+			// thinking model that spends its whole budget on the reasoning block
+			// produces an empty content / reasoning-only response, which would
+			// otherwise look like a failed generation worth retrying. This is a
+			// "length" finish, not an empty one.
+			budgetExhausted := reachedTokenBudget(prediction.Usage.Completion, config.Maxtokens)
+
 			// Built-in: retry on truly empty response (no tokens at all).
 			// However, when the C++ autoparser is active, it clears the raw
 			// message and delivers content via ChatDeltas instead. Do NOT
 			// retry if ChatDeltas contain tool calls or content.
-			if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries {
+			if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries && !budgetExhausted {
 				hasChatDeltaData := false
 				for _, d := range prediction.ChatDeltas {
 					if d.Content != "" || len(d.ToolCalls) > 0 {
@@ -159,7 +177,7 @@ func ComputeChoices(
 					}
 				}
 			}
-			if shouldRetryFn != nil && !skipCallerRetry && shouldRetryFn(attempt) && attempt < maxRetries {
+			if shouldRetryFn != nil && !skipCallerRetry && !budgetExhausted && shouldRetryFn(attempt) && attempt < maxRetries {
 				// Caller has already reset its state inside shouldRetry
 				result = result[:0]
 				allChatDeltas = nil
--- a/core/http/endpoints/openai/inference_test.go
+++ b/core/http/endpoints/openai/inference_test.go
@@ -393,6 +393,73 @@ var _ = Describe("ComputeChoices", func() {
 		})
 	})

+	Context("reachedTokenBudget", func() {
+		ptr := func(i int) *int { return &i }
+		It("is false when no limit is configured", func() {
+			Expect(reachedTokenBudget(1000, nil)).To(BeFalse())
+			Expect(reachedTokenBudget(1000, ptr(0))).To(BeFalse())
+			Expect(reachedTokenBudget(1000, ptr(-1))).To(BeFalse())
+		})
+		It("is false when generation stopped below the limit", func() {
+			Expect(reachedTokenBudget(99, ptr(100))).To(BeFalse())
+		})
+		It("is true when generation reached or exceeded the limit", func() {
+			Expect(reachedTokenBudget(100, ptr(100))).To(BeTrue())
+			Expect(reachedTokenBudget(101, ptr(100))).To(BeTrue())
+		})
+	})
+
+	Context("max_tokens budget exhausted on reasoning (issue #9716)", func() {
+		// Regression test for the streaming retry loop: when a thinking model
+		// spends its entire max_tokens budget on the reasoning block, the C++
+		// autoparser clears the raw Response and delivers reasoning-only
+		// ChatDeltas (no content, no tool calls). Before the fix, the built-in
+		// empty-response retry fired and regenerated from scratch up to maxRetries
+		// times, each re-consuming the budget, instead of finishing with "length".
+		It("should NOT retry when the token budget was exhausted", func() {
+			maxTokens := 100
+			cfg.Maxtokens = &maxTokens
+
+			calls := 0
+			backend.ModelInferenceFunc = func(
+				ctx context.Context, s string, messages schema.Messages,
+				images, videos, audios []string,
+				loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader,
+				o *config.ApplicationConfig,
+				tokenCallback func(string, backend.TokenUsage) bool,
+				tools, toolChoice string,
+				logprobs, topLogprobs *int,
+				logitBias map[string]float64,
+				metadata map[string]string,
+			) (func() (backend.LLMResponse, error), error) {
+				predFunc := func() (backend.LLMResponse, error) {
+					calls++
+					// Autoparser cleared Response; only reasoning was produced,
+					// and the completion count reached the max_tokens budget.
+					return backend.LLMResponse{
+						Response:   "",
+						ChatDeltas: []*pb.ChatDelta{{ReasoningContent: "thinking..."}},
+						Usage:      backend.TokenUsage{Prompt: 5, Completion: maxTokens},
+					}, nil
+				}
+				return predFunc, nil
+			}
+
+			_, usage, _, err := ComputeChoices(
+				makeReq(), "test", cfg, nil, appCfg, nil,
+				func(s string, c *[]schema.Choice) {
+					*c = append(*c, schema.Choice{Text: s})
+				},
+				nil,
+			)
+			Expect(err).ToNot(HaveOccurred())
+			// The model hit its token ceiling; regenerating would just hit it
+			// again and multiply token consumption. Exactly one call expected.
+			Expect(calls).To(Equal(1), "budget-exhausted generation must not be retried")
+			Expect(usage.Completion).To(Equal(maxTokens))
+		})
+	})
+
 	Context("with streaming token callback", func() {
 		It("should call tokenCallback for streaming responses", func() {
 			var streamedTokens []string