mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-30 11:26:32 -04:00
fix(openai): stop max_tokens streaming retry loop on reasoning models

When a thinking model spends its entire max_tokens budget on the reasoning block, the C++ autoparser clears the raw Response and delivers reasoning-only ChatDeltas (no content, no tool calls). ComputeChoices' empty-response retry then fires and regenerates from scratch up to maxRetries times, each re-consuming the whole budget, instead of terminating with finish_reason "length" (issue #9716).

Add a reachedTokenBudget helper and suppress both the built-in and caller-driven retries when the completion count has reached the configured max_tokens ceiling. Report finish_reason "length" instead of "stop" in the streaming and non-streaming chat paths when the budget was exhausted.

Adds a deterministic regression test that counts backend invocations (previously 6, now 1) plus boundary tests for the helper.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Dennisadira <dennisadira@gmail.com>
This commit is contained in:
@@ -618,6 +618,10 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
finishReason = FinishReasonToolCalls
|
||||
} else if toolsCalled {
|
||||
finishReason = FinishReasonFunctionCall
|
||||
} else if reachedTokenBudget(finalUsage.Completion, config.Maxtokens) {
|
||||
// Generation stopped because it hit the max_tokens ceiling
|
||||
// rather than a natural stop — report "length" (issue #9716).
|
||||
finishReason = FinishReasonLength
|
||||
}
|
||||
|
||||
// Final delta chunk: empty delta with finish_reason set. Per
|
||||
@@ -984,6 +988,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
}
|
||||
}
|
||||
|
||||
// If generation hit the max_tokens ceiling, report "length"
|
||||
// instead of a natural "stop" (issue #9716). Mirrors the
|
||||
// streaming path; tool/function finish reasons are untouched.
|
||||
if reachedTokenBudget(tokenUsage.Completion, config.Maxtokens) {
|
||||
for i := range result {
|
||||
if result[i].FinishReason != nil && *result[i].FinishReason == FinishReasonStop {
|
||||
lengthReason := FinishReasonLength
|
||||
result[i].FinishReason = &lengthReason
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// No MCP tools to execute (or no MCP tools configured), return response
|
||||
usage := schema.OpenAIUsage{
|
||||
PromptTokens: tokenUsage.Prompt,
|
||||
|
||||
@@ -5,4 +5,7 @@ const (
|
||||
FinishReasonStop = "stop"
|
||||
FinishReasonToolCalls = "tool_calls"
|
||||
FinishReasonFunctionCall = "function_call"
|
||||
// FinishReasonLength is reported when generation stopped because it
|
||||
// reached the max_tokens budget rather than a natural stop (issue #9716).
|
||||
FinishReasonLength = "length"
|
||||
)
|
||||
|
||||
@@ -13,6 +13,14 @@ import (
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
// reachedTokenBudget reports whether generation stopped because it reached the
// configured max_tokens ceiling.
//
// completion is the completion-token count of the finished generation and
// maxTokens is the configured ceiling. A maxTokens of nil or <= 0 means
// "no limit", in which case the result is always false. Otherwise the result
// is true as soon as completion >= *maxTokens.
//
// Used to suppress regeneration retries (which would just hit the same ceiling
// again) and to report finish_reason "length" instead of "stop" (issue #9716).
func reachedTokenBudget(completion int, maxTokens *int) bool {
	if maxTokens == nil || *maxTokens <= 0 {
		// No ceiling configured, so it cannot have been reached.
		return false
	}
	return completion >= *maxTokens
}
|
||||
|
||||
func ComputeChoices(
|
||||
req *schema.OpenAIRequest,
|
||||
predInput string,
|
||||
@@ -113,11 +121,21 @@ func ComputeChoices(
|
||||
}
|
||||
prediction = p
|
||||
|
||||
// budgetExhausted is true when the model stopped because it reached
|
||||
// the configured max_tokens ceiling. None of the retry paths below
|
||||
// should fire in that case: regenerating would just hit the same
|
||||
// ceiling again and multiply token consumption (issue #9716). A
|
||||
// thinking model that spends its whole budget on the reasoning block
|
||||
// produces an empty content / reasoning-only response, which would
|
||||
// otherwise look like a failed generation worth retrying. This is a
|
||||
// "length" finish, not an empty one.
|
||||
budgetExhausted := reachedTokenBudget(prediction.Usage.Completion, config.Maxtokens)
|
||||
|
||||
// Built-in: retry on truly empty response (no tokens at all).
|
||||
// However, when the C++ autoparser is active, it clears the raw
|
||||
// message and delivers content via ChatDeltas instead. Do NOT
|
||||
// retry if ChatDeltas contain tool calls or content.
|
||||
- if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries {
|
||||
+ if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries && !budgetExhausted {
|
||||
hasChatDeltaData := false
|
||||
for _, d := range prediction.ChatDeltas {
|
||||
if d.Content != "" || len(d.ToolCalls) > 0 {
|
||||
@@ -159,7 +177,7 @@ func ComputeChoices(
|
||||
}
|
||||
}
|
||||
}
|
||||
- if shouldRetryFn != nil && !skipCallerRetry && shouldRetryFn(attempt) && attempt < maxRetries {
|
||||
+ if shouldRetryFn != nil && !skipCallerRetry && !budgetExhausted && shouldRetryFn(attempt) && attempt < maxRetries {
|
||||
// Caller has already reset its state inside shouldRetry
|
||||
result = result[:0]
|
||||
allChatDeltas = nil
|
||||
|
||||
@@ -393,6 +393,73 @@ var _ = Describe("ComputeChoices", func() {
|
||||
})
|
||||
})
|
||||
|
||||
Context("reachedTokenBudget", func() {
|
||||
ptr := func(i int) *int { return &i }
|
||||
It("is false when no limit is configured", func() {
|
||||
Expect(reachedTokenBudget(1000, nil)).To(BeFalse())
|
||||
Expect(reachedTokenBudget(1000, ptr(0))).To(BeFalse())
|
||||
Expect(reachedTokenBudget(1000, ptr(-1))).To(BeFalse())
|
||||
})
|
||||
It("is false when generation stopped below the limit", func() {
|
||||
Expect(reachedTokenBudget(99, ptr(100))).To(BeFalse())
|
||||
})
|
||||
It("is true when generation reached or exceeded the limit", func() {
|
||||
Expect(reachedTokenBudget(100, ptr(100))).To(BeTrue())
|
||||
Expect(reachedTokenBudget(101, ptr(100))).To(BeTrue())
|
||||
})
|
||||
})
|
||||
|
||||
Context("max_tokens budget exhausted on reasoning (issue #9716)", func() {
|
||||
// Reproduces the streaming retry loop: when a thinking model spends its
|
||||
// entire max_tokens budget on the reasoning block, the C++ autoparser
|
||||
// clears the raw Response and delivers reasoning-only ChatDeltas (no
|
||||
// content, no tool calls). The built-in empty-response retry then fires
|
||||
// and regenerates from scratch up to maxRetries times, each re-consuming
|
||||
// the whole budget — instead of terminating with finish_reason "length".
|
||||
It("should NOT retry when the token budget was exhausted", func() {
|
||||
maxTokens := 100
|
||||
cfg.Maxtokens = &maxTokens
|
||||
|
||||
calls := 0
|
||||
backend.ModelInferenceFunc = func(
|
||||
ctx context.Context, s string, messages schema.Messages,
|
||||
images, videos, audios []string,
|
||||
loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader,
|
||||
o *config.ApplicationConfig,
|
||||
tokenCallback func(string, backend.TokenUsage) bool,
|
||||
tools, toolChoice string,
|
||||
logprobs, topLogprobs *int,
|
||||
logitBias map[string]float64,
|
||||
metadata map[string]string,
|
||||
) (func() (backend.LLMResponse, error), error) {
|
||||
predFunc := func() (backend.LLMResponse, error) {
|
||||
calls++
|
||||
// Autoparser cleared Response; only reasoning was produced,
|
||||
// and the completion count reached the max_tokens budget.
|
||||
return backend.LLMResponse{
|
||||
Response: "",
|
||||
ChatDeltas: []*pb.ChatDelta{{ReasoningContent: "thinking..."}},
|
||||
Usage: backend.TokenUsage{Prompt: 5, Completion: maxTokens},
|
||||
}, nil
|
||||
}
|
||||
return predFunc, nil
|
||||
}
|
||||
|
||||
_, usage, _, err := ComputeChoices(
|
||||
makeReq(), "test", cfg, nil, appCfg, nil,
|
||||
func(s string, c *[]schema.Choice) {
|
||||
*c = append(*c, schema.Choice{Text: s})
|
||||
},
|
||||
nil,
|
||||
)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
// The model hit its token ceiling; regenerating would just hit it
|
||||
// again and multiply token consumption. Exactly one call expected.
|
||||
Expect(calls).To(Equal(1), "budget-exhausted generation must not be retried")
|
||||
Expect(usage.Completion).To(Equal(maxTokens))
|
||||
})
|
||||
})
|
||||
|
||||
Context("with streaming token callback", func() {
|
||||
It("should call tokenCallback for streaming responses", func() {
|
||||
var streamedTokens []string
|
||||
|
||||
Reference in New Issue
Block a user