fix(openai): stop max_tokens streaming retry loop on reasoning models (#9716) (#10448)

fix(openai): stop max_tokens streaming retry loop on reasoning models

When a thinking model spends its entire max_tokens budget on the reasoning
block, the C++ autoparser clears the raw Response and delivers reasoning-only
ChatDeltas (no content, no tool calls). ComputeChoices' empty-response retry
then fires and regenerates from scratch up to maxRetries times, each
re-consuming the whole budget, instead of terminating with finish_reason
"length" (issue #9716).

Add a reachedTokenBudget helper and suppress both the built-in and
caller-driven retries when the completion count has reached the configured
max_tokens ceiling. Report finish_reason "length" instead of "stop" in the
streaming and non-streaming chat paths when the budget was exhausted.

Add a deterministic regression test that counts backend invocations
(previously 6, now 1), plus boundary tests for the helper.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Dennisadira <dennisadira@gmail.com>
This commit is contained in:
Adira
2026-06-30 10:01:53 +03:00
committed by GitHub
parent 5d0c43ec6e
commit 28d7397743
4 changed files with 106 additions and 2 deletions

View File

@@ -618,6 +618,10 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
finishReason = FinishReasonToolCalls
} else if toolsCalled {
finishReason = FinishReasonFunctionCall
} else if reachedTokenBudget(finalUsage.Completion, config.Maxtokens) {
// Generation stopped because it hit the max_tokens ceiling
// rather than a natural stop — report "length" (issue #9716).
finishReason = FinishReasonLength
}
// Final delta chunk: empty delta with finish_reason set. Per
@@ -984,6 +988,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
}
}
// If generation hit the max_tokens ceiling, report "length"
// instead of a natural "stop" (issue #9716). Mirrors the
// streaming path; tool/function finish reasons are untouched.
if reachedTokenBudget(tokenUsage.Completion, config.Maxtokens) {
for i := range result {
if result[i].FinishReason != nil && *result[i].FinishReason == FinishReasonStop {
lengthReason := FinishReasonLength
result[i].FinishReason = &lengthReason
}
}
}
// No MCP tools to execute (or no MCP tools configured), return response
usage := schema.OpenAIUsage{
PromptTokens: tokenUsage.Prompt,

View File

@@ -5,4 +5,7 @@ const (
FinishReasonStop = "stop"
FinishReasonToolCalls = "tool_calls"
FinishReasonFunctionCall = "function_call"
// FinishReasonLength is reported when generation stopped because it
// reached the max_tokens budget rather than a natural stop (issue #9716).
FinishReasonLength = "length"
)

View File

@@ -13,6 +13,14 @@ import (
"github.com/mudler/xlog"
)
// reachedTokenBudget tells whether the completion token count has hit the
// configured max_tokens ceiling.
//
// completion is the number of completion tokens produced; maxTokens is the
// optional ceiling. A nil, zero, or negative maxTokens is treated as "no
// limit", so the result is always false in that case. Otherwise the result is
// true once completion is equal to or greater than the ceiling.
//
// Callers use this to skip regeneration retries (a retry would only run into
// the same ceiling) and to report finish_reason "length" rather than "stop"
// (issue #9716).
func reachedTokenBudget(completion int, maxTokens *int) bool {
	// No ceiling configured at all.
	if maxTokens == nil {
		return false
	}
	// Non-positive values also mean "unlimited".
	limit := *maxTokens
	if limit <= 0 {
		return false
	}
	return completion >= limit
}
func ComputeChoices(
req *schema.OpenAIRequest,
predInput string,
@@ -113,11 +121,21 @@ func ComputeChoices(
}
prediction = p
// budgetExhausted is true when the model stopped because it reached
// the configured max_tokens ceiling. None of the retry paths below
// should fire in that case: regenerating would just hit the same
// ceiling again and multiply token consumption (issue #9716). A
// thinking model that spends its whole budget on the reasoning block
// produces an empty content / reasoning-only response, which would
// otherwise look like a failed generation worth retrying. This is a
// "length" finish, not an empty one.
budgetExhausted := reachedTokenBudget(prediction.Usage.Completion, config.Maxtokens)
// Built-in: retry on truly empty response (no tokens at all).
// However, when the C++ autoparser is active, it clears the raw
// message and delivers content via ChatDeltas instead. Do NOT
// retry if ChatDeltas contain tool calls or content.
if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries {
if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries && !budgetExhausted {
hasChatDeltaData := false
for _, d := range prediction.ChatDeltas {
if d.Content != "" || len(d.ToolCalls) > 0 {
@@ -159,7 +177,7 @@ func ComputeChoices(
}
}
}
if shouldRetryFn != nil && !skipCallerRetry && shouldRetryFn(attempt) && attempt < maxRetries {
if shouldRetryFn != nil && !skipCallerRetry && !budgetExhausted && shouldRetryFn(attempt) && attempt < maxRetries {
// Caller has already reset its state inside shouldRetry
result = result[:0]
allChatDeltas = nil

View File

@@ -393,6 +393,73 @@ var _ = Describe("ComputeChoices", func() {
})
})
Context("reachedTokenBudget", func() {
	// limitOf wraps an int in a pointer, matching the *int shape of the
	// max_tokens setting the helper receives.
	limitOf := func(n int) *int { return &n }

	It("is false when no limit is configured", func() {
		// nil, zero, and negative ceilings all mean "unlimited".
		for _, unlimited := range []*int{nil, limitOf(0), limitOf(-1)} {
			Expect(reachedTokenBudget(1000, unlimited)).To(BeFalse())
		}
	})

	It("is false when generation stopped below the limit", func() {
		ceiling := limitOf(100)
		Expect(reachedTokenBudget(99, ceiling)).To(BeFalse())
	})

	It("is true when generation reached or exceeded the limit", func() {
		// Exactly at the ceiling, then one token past it.
		for _, produced := range []int{100, 101} {
			Expect(reachedTokenBudget(produced, limitOf(100))).To(BeTrue())
		}
	})
})
Context("max_tokens budget exhausted on reasoning (issue #9716)", func() {
	// Regression scenario for the streaming retry loop. The stubbed backend
	// below answers with an empty Response and a single ChatDelta that carries
	// only ReasoningContent (no content, no tool calls), while reporting a
	// completion count equal to max_tokens. Such a reply looks "empty" to the
	// built-in empty-response retry, which must nevertheless stay quiet
	// because the budget was exhausted: the expected outcome is a single
	// backend call, not a regeneration per retry attempt.
	It("should NOT retry when the token budget was exhausted", func() {
		budget := 100
		cfg.Maxtokens = &budget

		// invocations counts how many times the predictor closure ran.
		invocations := 0
		backend.ModelInferenceFunc = func(
			ctx context.Context, s string, messages schema.Messages,
			images, videos, audios []string,
			loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader,
			o *config.ApplicationConfig,
			tokenCallback func(string, backend.TokenUsage) bool,
			tools, toolChoice string,
			logprobs, topLogprobs *int,
			logitBias map[string]float64,
			metadata map[string]string,
		) (func() (backend.LLMResponse, error), error) {
			return func() (backend.LLMResponse, error) {
				invocations++
				// Reasoning-only delta with an empty raw Response, and a
				// completion count that equals the configured budget.
				reasoningOnly := []*pb.ChatDelta{{ReasoningContent: "thinking..."}}
				return backend.LLMResponse{
					Response:   "",
					ChatDeltas: reasoningOnly,
					Usage:      backend.TokenUsage{Prompt: 5, Completion: budget},
				}, nil
			}, nil
		}

		// collect appends each produced text as a Choice.
		collect := func(s string, c *[]schema.Choice) {
			*c = append(*c, schema.Choice{Text: s})
		}

		_, usage, _, err := ComputeChoices(makeReq(), "test", cfg, nil, appCfg, nil, collect, nil)
		Expect(err).ToNot(HaveOccurred())

		// Regenerating after hitting the ceiling would only hit it again and
		// multiply token consumption, so exactly one backend call is expected.
		Expect(invocations).To(Equal(1), "budget-exhausted generation must not be retried")
		Expect(usage.Completion).To(Equal(budget))
	})
})
Context("with streaming token callback", func() {
It("should call tokenCallback for streaming responses", func() {
var streamedTokens []string