From c7c4a20a9e380cbd566a9f8c8b5366a563c6a2d1 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 1 Mar 2026 21:32:38 +0100 Subject: [PATCH] fix: retry when LLM returns empty messages (#8704) * debug Signed-off-by: Ettore Di Giacinto * retry instead of re-computing a response Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- core/http/endpoints/anthropic/messages.go | 20 ++- core/http/endpoints/openai/chat.go | 135 +++++++----------- .../http/endpoints/openresponses/responses.go | 43 ++++-- 3 files changed, 99 insertions(+), 99 deletions(-) diff --git a/core/http/endpoints/anthropic/messages.go b/core/http/endpoints/anthropic/messages.go index 12d500125..f2acc524f 100644 --- a/core/http/endpoints/anthropic/messages.go +++ b/core/http/endpoints/anthropic/messages.go @@ -125,13 +125,21 @@ func handleAnthropicNonStream(c echo.Context, id string, input *schema.Anthropic return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err)) } - prediction, err := predFunc() - if err != nil { - xlog.Error("Anthropic prediction failed", "error", err) - return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("prediction failed: %v", err)) + const maxEmptyRetries = 5 + var prediction backend.LLMResponse + var result string + for attempt := 0; attempt <= maxEmptyRetries; attempt++ { + prediction, err = predFunc() + if err != nil { + xlog.Error("Anthropic prediction failed", "error", err) + return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("prediction failed: %v", err)) + } + result = backend.Finetune(*cfg, predInput, prediction.Response) + if result != "" || !shouldUseFn { + break + } + xlog.Warn("Anthropic: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries) } - - result := backend.Finetune(*cfg, predInput, prediction.Response) // Check if the result contains tool calls toolCalls := functions.ParseFunctionCall(result, cfg.FunctionsConfig) diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 753936124..8f4a44a07 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -270,7 +270,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator } responses <- initialMessage - result, err := handleQuestion(config, cl, req, ml, startupOptions, functionResults, result, prompt) + result, err := handleQuestion(config, functionResults, result, prompt) if err != nil { xlog.Error("error handling question", "error", err) return err @@ -388,6 +388,14 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator shouldUseFn := len(input.Functions) > 0 && config.ShouldUseFunctions() strictMode := false + xlog.Debug("Tool call routing decision", + "shouldUseFn", shouldUseFn, + "len(input.Functions)", len(input.Functions), + "len(input.Tools)", len(input.Tools), + "config.ShouldUseFunctions()", config.ShouldUseFunctions(), + "config.FunctionToCall()", config.FunctionToCall(), + ) + for _, f := range input.Functions { if f.Strict { strictMode = true @@ -648,12 +656,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator xlog.Debug("Thinking start token", "thinkingStartToken", thinkingStartToken, "template", template) + var emptyRetryNeeded bool + tokenCallback := func(s string, c *[]schema.Choice) { // Prepend thinking token if needed, then extract reasoning from the response reasoning, s := reason.ExtractReasoningWithConfig(s, thinkingStartToken, config.ReasoningConfig) if !shouldUseFn { - // no function is called, just reply and use stop as finish reason stopReason := FinishReasonStop message := &schema.Message{Role: "assistant", Content: &s} if reasoning != "" { @@ -671,9 +680,15 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator switch { case noActionsToRun: - result, err := handleQuestion(config, cl, input, ml, startupOptions, results, s, predInput) + if s == "" && textContentToReturn == "" { + xlog.Warn("Backend returned empty content in tool-calling context, will retry") + emptyRetryNeeded = true + return + } + result, err := handleQuestion(config, results, s, predInput) if err != nil { xlog.Error("error handling question", "error", err) + emptyRetryNeeded = true return } @@ -745,19 +760,42 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator // Echo properly supports context cancellation via c.Request().Context() // No workaround needed! - result, tokenUsage, err := ComputeChoices( - input, - predInput, - config, - cl, - startupOptions, - ml, - tokenCallback, - nil, - ) + const maxEmptyRetries = 5 + var result []schema.Choice + var tokenUsage backend.TokenUsage + var err error + + for attempt := 0; attempt <= maxEmptyRetries; attempt++ { + emptyRetryNeeded = false + result, tokenUsage, err = ComputeChoices( + input, + predInput, + config, + cl, + startupOptions, + ml, + tokenCallback, + nil, + ) + if err != nil || !emptyRetryNeeded { + break + } + xlog.Warn("Retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries) + } if err != nil { return err } + + if emptyRetryNeeded { + xlog.Warn("All retries exhausted, backend still returning empty content") + stopReason := FinishReasonStop + empty := "" + result = append(result, schema.Choice{ + FinishReason: &stopReason, + Index: 0, + Message: &schema.Message{Role: "assistant", Content: &empty}, + }) + } usage := schema.OpenAIUsage{ PromptTokens: tokenUsage.Prompt, CompletionTokens: tokenUsage.Completion, @@ -785,7 +823,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator } } -func handleQuestion(config *config.ModelConfig, cl *config.ModelConfigLoader, input *schema.OpenAIRequest, ml *model.ModelLoader, o *config.ApplicationConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) { +func handleQuestion(config *config.ModelConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) { if len(funcResults) == 0 && result != "" { xlog.Debug("nothing function results but we had a message from the LLM") @@ -818,73 +856,6 @@ func handleQuestion(config *config.ModelConfig, cl *config.ModelConfigLoader, in } xlog.Debug("No action received from LLM, without a message, computing a reply") - // Otherwise ask the LLM to understand the JSON output and the context, and return a message - // Note: This costs (in term of CPU/GPU) another computation - config.Grammar = "" - images := []string{} - for _, m := range input.Messages { - images = append(images, m.StringImages...) - } - videos := []string{} - for _, m := range input.Messages { - videos = append(videos, m.StringVideos...) - } - audios := []string{} - for _, m := range input.Messages { - audios = append(audios, m.StringAudios...) - } - // Serialize tools and tool_choice to JSON strings - toolsJSON := "" - if len(input.Tools) > 0 { - toolsBytes, err := json.Marshal(input.Tools) - if err == nil { - toolsJSON = string(toolsBytes) - } - } - toolChoiceJSON := "" - if input.ToolsChoice != nil { - toolChoiceBytes, err := json.Marshal(input.ToolsChoice) - if err == nil { - toolChoiceJSON = string(toolChoiceBytes) - } - } - - // Extract logprobs from request - // According to OpenAI API: logprobs is boolean, top_logprobs (0-20) controls how many top tokens per position - var logprobs *int - var topLogprobs *int - if input.Logprobs.IsEnabled() { - // If logprobs is enabled, use top_logprobs if provided, otherwise default to 1 - if input.TopLogprobs != nil { - topLogprobs = input.TopLogprobs - // For backend compatibility, set logprobs to the top_logprobs value - logprobs = input.TopLogprobs - } else { - // Default to 1 if logprobs is true but top_logprobs not specified - val := 1 - logprobs = &val - topLogprobs = &val - } - } - - // Extract logit_bias from request - // According to OpenAI API: logit_bias is a map of token IDs (as strings) to bias values (-100 to 100) - var logitBias map[string]float64 - if len(input.LogitBias) > 0 { - logitBias = input.LogitBias - } - - predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, config, cl, o, nil, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias) - if err != nil { - xlog.Error("model inference failed", "error", err) - return "", err - } - - prediction, err := predFunc() - if err != nil { - xlog.Error("prediction failed", "error", err) - return "", err - } - return backend.Finetune(*config, prompt, prediction.Response), nil + return "", fmt.Errorf("no action received from LLM, without a message, computing a reply") } diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go index 38f4c6538..2a939c730 100644 --- a/core/http/endpoints/openresponses/responses.go +++ b/core/http/endpoints/openresponses/responses.go @@ -800,13 +800,26 @@ func handleBackgroundNonStream(ctx context.Context, store *ResponseStore, respon default: } - prediction, err := predFunc() - if err != nil { - return nil, fmt.Errorf("prediction failed: %w", err) + const maxEmptyRetries = 5 + var prediction backend.LLMResponse + var result string + for attempt := 0; attempt <= maxEmptyRetries; attempt++ { + prediction, err = predFunc() + if err != nil { + return nil, fmt.Errorf("prediction failed: %w", err) + } + result = backend.Finetune(*cfg, predInput, prediction.Response) + if result != "" || !shouldUseFn { + break + } + select { + case <-ctx.Done(): + return nil, ctx.Err() + default: + } + xlog.Warn("Open Responses background: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries) } - result := backend.Finetune(*cfg, predInput, prediction.Response) - // Parse tool calls if using functions (same logic as regular handler) var outputItems []schema.ORItemField var toolCalls []schema.ToolCall @@ -1475,13 +1488,21 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("model inference failed: %v", err), "") } - prediction, err := predFunc() - if err != nil { - xlog.Error("Open Responses prediction failed", "error", err) - return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("prediction failed: %v", err), "") + const maxEmptyRetries = 5 + var prediction backend.LLMResponse + var result string + for attempt := 0; attempt <= maxEmptyRetries; attempt++ { + prediction, err = predFunc() + if err != nil { + xlog.Error("Open Responses prediction failed", "error", err) + return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("prediction failed: %v", err), "") + } + result = backend.Finetune(*cfg, predInput, prediction.Response) + if result != "" || !shouldUseFn { + break + } + xlog.Warn("Open Responses: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries) } - - result := backend.Finetune(*cfg, predInput, prediction.Response) xlog.Debug("Open Responses - Raw model result", "result", result, "shouldUseFn", shouldUseFn) // Detect if thinking token is already in prompt or template