Mirror of https://github.com/mudler/LocalAI.git, synced 2026-03-31 21:25:59 -04:00
fix: retry when LLM returns empty messages (#8704)
* debug

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* retry instead of re-computing a response

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
commit c7c4a20a9e
parent 94539f3992
committed by GitHub
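The same retry loop lands in four handlers below (Anthropic, chat completions, and the two Open Responses paths): instead of falling through with an empty message, or synthesizing a reply with a second, differently-prompted inference pass, the handler simply calls the same prediction function again, up to a bounded number of attempts. A minimal self-contained sketch of the pattern follows; predict and mustBeNonEmpty are illustrative stand-ins for the handlers' predFunc/ComputeChoices and shouldUseFn, not LocalAI's API.

package main

import (
	"fmt"
	"log"
)

const maxEmptyRetries = 5 // same bound the diff uses in every handler

// predictUntilNonEmpty re-invokes predict() while it yields empty output.
// Hard errors are returned immediately and never retried, mirroring the diff.
func predictUntilNonEmpty(predict func() (string, error), mustBeNonEmpty bool) (string, error) {
	var result string
	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
		out, err := predict()
		if err != nil {
			return "", err
		}
		result = out
		if result != "" || !mustBeNonEmpty {
			break // got content, or empty output is acceptable on this path
		}
		log.Printf("retrying prediction due to empty backend response, attempt=%d maxRetries=%d",
			attempt+1, maxEmptyRetries)
	}
	// As in the diff, exhausting the retries falls through with the last
	// (possibly still empty) result; the caller decides what to do with it.
	return result, nil
}

func main() {
	calls := 0
	predict := func() (string, error) { // fake backend: empty twice, then text
		calls++
		if calls < 3 {
			return "", nil
		}
		return "hello", nil
	}
	out, _ := predictUntilNonEmpty(predict, true)
	fmt.Printf("result=%q after %d calls\n", out, calls)
}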
@@ -125,13 +125,21 @@ func handleAnthropicNonStream(c echo.Context, id string, input *schema.Anthropic
 		return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err))
 	}
 
-	prediction, err := predFunc()
-	if err != nil {
-		xlog.Error("Anthropic prediction failed", "error", err)
-		return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("prediction failed: %v", err))
+	const maxEmptyRetries = 5
+	var prediction backend.LLMResponse
+	var result string
+	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
+		prediction, err = predFunc()
+		if err != nil {
+			xlog.Error("Anthropic prediction failed", "error", err)
+			return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("prediction failed: %v", err))
+		}
+		result = backend.Finetune(*cfg, predInput, prediction.Response)
+		if result != "" || !shouldUseFn {
+			break
+		}
+		xlog.Warn("Anthropic: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
 	}
 
-	result := backend.Finetune(*cfg, predInput, prediction.Response)
-
 	// Check if the result contains tool calls
 	toolCalls := functions.ParseFunctionCall(result, cfg.FunctionsConfig)
@@ -270,7 +270,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 			}
 			responses <- initialMessage
 
-			result, err := handleQuestion(config, cl, req, ml, startupOptions, functionResults, result, prompt)
+			result, err := handleQuestion(config, functionResults, result, prompt)
 			if err != nil {
 				xlog.Error("error handling question", "error", err)
 				return err
@@ -388,6 +388,14 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 	shouldUseFn := len(input.Functions) > 0 && config.ShouldUseFunctions()
 	strictMode := false
 
+	xlog.Debug("Tool call routing decision",
+		"shouldUseFn", shouldUseFn,
+		"len(input.Functions)", len(input.Functions),
+		"len(input.Tools)", len(input.Tools),
+		"config.ShouldUseFunctions()", config.ShouldUseFunctions(),
+		"config.FunctionToCall()", config.FunctionToCall(),
+	)
+
 	for _, f := range input.Functions {
 		if f.Strict {
 			strictMode = true
@@ -648,12 +656,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 
 	xlog.Debug("Thinking start token", "thinkingStartToken", thinkingStartToken, "template", template)
 
+	var emptyRetryNeeded bool
 	tokenCallback := func(s string, c *[]schema.Choice) {
 		// Prepend thinking token if needed, then extract reasoning from the response
 		reasoning, s := reason.ExtractReasoningWithConfig(s, thinkingStartToken, config.ReasoningConfig)
 
 		if !shouldUseFn {
 			// no function is called, just reply and use stop as finish reason
 			stopReason := FinishReasonStop
 			message := &schema.Message{Role: "assistant", Content: &s}
 			if reasoning != "" {
@@ -671,9 +680,15 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 
 		switch {
 		case noActionsToRun:
-			result, err := handleQuestion(config, cl, input, ml, startupOptions, results, s, predInput)
+			if s == "" && textContentToReturn == "" {
+				xlog.Warn("Backend returned empty content in tool-calling context, will retry")
+				emptyRetryNeeded = true
+				return
+			}
+			result, err := handleQuestion(config, results, s, predInput)
 			if err != nil {
 				xlog.Error("error handling question", "error", err)
+				emptyRetryNeeded = true
 				return
 			}
 
@@ -745,19 +760,42 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 	// Echo properly supports context cancellation via c.Request().Context()
 	// No workaround needed!
 
-	result, tokenUsage, err := ComputeChoices(
-		input,
-		predInput,
-		config,
-		cl,
-		startupOptions,
-		ml,
-		tokenCallback,
-		nil,
-	)
+	const maxEmptyRetries = 5
+	var result []schema.Choice
+	var tokenUsage backend.TokenUsage
+	var err error
+
+	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
+		emptyRetryNeeded = false
+		result, tokenUsage, err = ComputeChoices(
+			input,
+			predInput,
+			config,
+			cl,
+			startupOptions,
+			ml,
+			tokenCallback,
+			nil,
+		)
+		if err != nil || !emptyRetryNeeded {
+			break
+		}
+		xlog.Warn("Retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
+	}
 	if err != nil {
 		return err
 	}
+
+	if emptyRetryNeeded {
+		xlog.Warn("All retries exhausted, backend still returning empty content")
+		stopReason := FinishReasonStop
+		empty := ""
+		result = append(result, schema.Choice{
+			FinishReason: &stopReason,
+			Index:        0,
+			Message:      &schema.Message{Role: "assistant", Content: &empty},
+		})
+	}
 	usage := schema.OpenAIUsage{
 		PromptTokens:     tokenUsage.Prompt,
 		CompletionTokens: tokenUsage.Completion,
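Worth noting in the hunk above: tokenCallback cannot return an error to ComputeChoices, so the retry is coordinated through the captured emptyRetryNeeded flag, reset at the top of each attempt; when every attempt stays empty, an explicit empty assistant choice is appended so clients still receive a well-formed response. A toy reduction of that closure/flag handshake (callback and compute are stand-ins, not LocalAI's API):

package main

import "fmt"

func main() {
	const maxEmptyRetries = 5
	emptyRetryNeeded := false
	attempts := 0

	// Stand-in for tokenCallback: it can't return an error, so it
	// signals "please retry" through the captured flag instead.
	callback := func(s string) {
		if s == "" {
			emptyRetryNeeded = true
		}
	}

	// Stand-in for ComputeChoices: empty on the first two attempts.
	compute := func() string {
		attempts++
		if attempts < 3 {
			return ""
		}
		return "answer"
	}

	var result string
	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
		emptyRetryNeeded = false // reset before every attempt
		result = compute()
		callback(result)
		if !emptyRetryNeeded {
			break
		}
	}
	fmt.Printf("result=%q attempts=%d\n", result, attempts)
}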
@@ -785,7 +823,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 	}
 }
 
-func handleQuestion(config *config.ModelConfig, cl *config.ModelConfigLoader, input *schema.OpenAIRequest, ml *model.ModelLoader, o *config.ApplicationConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) {
+func handleQuestion(config *config.ModelConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) {
 
 	if len(funcResults) == 0 && result != "" {
 		xlog.Debug("nothing function results but we had a message from the LLM")
@@ -818,73 +856,6 @@ func handleQuestion(config *config.ModelConfig, cl *config.ModelConfigLoader, in
 	}
 
 	xlog.Debug("No action received from LLM, without a message, computing a reply")
-	// Otherwise ask the LLM to understand the JSON output and the context, and return a message
-	// Note: This costs (in term of CPU/GPU) another computation
-	config.Grammar = ""
-	images := []string{}
-	for _, m := range input.Messages {
-		images = append(images, m.StringImages...)
-	}
-	videos := []string{}
-	for _, m := range input.Messages {
-		videos = append(videos, m.StringVideos...)
-	}
-	audios := []string{}
-	for _, m := range input.Messages {
-		audios = append(audios, m.StringAudios...)
-	}
-
-	// Serialize tools and tool_choice to JSON strings
-	toolsJSON := ""
-	if len(input.Tools) > 0 {
-		toolsBytes, err := json.Marshal(input.Tools)
-		if err == nil {
-			toolsJSON = string(toolsBytes)
-		}
-	}
-	toolChoiceJSON := ""
-	if input.ToolsChoice != nil {
-		toolChoiceBytes, err := json.Marshal(input.ToolsChoice)
-		if err == nil {
-			toolChoiceJSON = string(toolChoiceBytes)
-		}
-	}
-
-	// Extract logprobs from request
-	// According to OpenAI API: logprobs is boolean, top_logprobs (0-20) controls how many top tokens per position
-	var logprobs *int
-	var topLogprobs *int
-	if input.Logprobs.IsEnabled() {
-		// If logprobs is enabled, use top_logprobs if provided, otherwise default to 1
-		if input.TopLogprobs != nil {
-			topLogprobs = input.TopLogprobs
-			// For backend compatibility, set logprobs to the top_logprobs value
-			logprobs = input.TopLogprobs
-		} else {
-			// Default to 1 if logprobs is true but top_logprobs not specified
-			val := 1
-			logprobs = &val
-			topLogprobs = &val
-		}
-	}
-
-	// Extract logit_bias from request
-	// According to OpenAI API: logit_bias is a map of token IDs (as strings) to bias values (-100 to 100)
-	var logitBias map[string]float64
-	if len(input.LogitBias) > 0 {
-		logitBias = input.LogitBias
-	}
-
-	predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, config, cl, o, nil, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias)
-	if err != nil {
-		xlog.Error("model inference failed", "error", err)
-		return "", err
-	}
-
-	prediction, err := predFunc()
-	if err != nil {
-		xlog.Error("prediction failed", "error", err)
-		return "", err
-	}
-	return backend.Finetune(*config, prompt, prediction.Response), nil
+	return "", fmt.Errorf("no action received from LLM, without a message, computing a reply")
 }
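This is the "retry instead of re-computing a response" half of the commit message: the old handleQuestion tail ran a second, grammar-free inference pass (with its own image/tool/logprobs/logit_bias plumbing) to synthesize a reply when the model produced neither a tool call nor text. That whole pass is removed; the function now returns an error, and the callers shown earlier treat the empty outcome as a signal to retry the original prediction. The slimmer signature drops the loader, request, and application-config parameters that only the removed pass needed.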
@@ -800,13 +800,26 @@ func handleBackgroundNonStream(ctx context.Context, store *ResponseStore, respon
 	default:
 	}
 
-	prediction, err := predFunc()
-	if err != nil {
-		return nil, fmt.Errorf("prediction failed: %w", err)
+	const maxEmptyRetries = 5
+	var prediction backend.LLMResponse
+	var result string
+	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
+		prediction, err = predFunc()
+		if err != nil {
+			return nil, fmt.Errorf("prediction failed: %w", err)
+		}
+		result = backend.Finetune(*cfg, predInput, prediction.Response)
+		if result != "" || !shouldUseFn {
+			break
+		}
+		select {
+		case <-ctx.Done():
+			return nil, ctx.Err()
+		default:
+		}
+		xlog.Warn("Open Responses background: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
 	}
 
-	result := backend.Finetune(*cfg, predInput, prediction.Response)
-
 	// Parse tool calls if using functions (same logic as regular handler)
 	var outputItems []schema.ORItemField
 	var toolCalls []schema.ToolCall
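The background variant adds one twist to the shared loop: between attempts it re-checks ctx.Done(), so a cancelled or abandoned background response stops retrying immediately instead of burning up to five more inference calls.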
@@ -1475,13 +1488,21 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
 		return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("model inference failed: %v", err), "")
 	}
 
-	prediction, err := predFunc()
-	if err != nil {
-		xlog.Error("Open Responses prediction failed", "error", err)
-		return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("prediction failed: %v", err), "")
+	const maxEmptyRetries = 5
+	var prediction backend.LLMResponse
+	var result string
+	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
+		prediction, err = predFunc()
+		if err != nil {
+			xlog.Error("Open Responses prediction failed", "error", err)
+			return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("prediction failed: %v", err), "")
+		}
+		result = backend.Finetune(*cfg, predInput, prediction.Response)
+		if result != "" || !shouldUseFn {
+			break
+		}
+		xlog.Warn("Open Responses: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
 	}
 
-	result := backend.Finetune(*cfg, predInput, prediction.Response)
 	xlog.Debug("Open Responses - Raw model result", "result", result, "shouldUseFn", shouldUseFn)
 
 	// Detect if thinking token is already in prompt or template