fix: retry when LLM returns empty messages (#8704)

* debug

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* retry instead of re-computing a response

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Ettore Di Giacinto
2026-03-01 21:32:38 +01:00
committed by GitHub
parent 94539f3992
commit c7c4a20a9e
3 changed files with 99 additions and 99 deletions
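All three handlers touched below (the Anthropic endpoint, the OpenAI chat endpoint, and the two Open Responses paths) gain the same shape of fix: when the backend yields an empty message in a context where a tool call is expected, the handler re-runs the existing prediction closure up to five times instead of building a second, costlier inference request. A minimal standalone sketch of the shared pattern; retryOnEmpty, predict, postprocess, and wantTools are hypothetical stand-ins for the diff's predFunc, backend.Finetune, and shouldUseFn:

package main

import (
	"fmt"
	"log"
	"strings"
)

// retryOnEmpty re-runs predict until it yields a non-empty result or the
// retry budget is exhausted, mirroring the loops this commit adds.
func retryOnEmpty(predict func() (string, error), postprocess func(string) string, wantTools bool) (string, error) {
	const maxEmptyRetries = 5
	var result string
	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
		raw, err := predict()
		if err != nil {
			// Hard errors are returned immediately; only empty output is retried.
			return "", fmt.Errorf("prediction failed: %w", err)
		}
		result = postprocess(raw)
		if result != "" || !wantTools {
			break
		}
		log.Printf("retrying empty backend response, attempt=%d/%d", attempt+1, maxEmptyRetries)
	}
	return result, nil
}

func main() {
	calls := 0
	// A flaky predictor: whitespace twice, then a tool-call payload.
	predict := func() (string, error) {
		calls++
		if calls < 3 {
			return "   ", nil
		}
		return `{"name":"get_weather","arguments":{"city":"Rome"}}`, nil
	}
	out, err := retryOnEmpty(predict, strings.TrimSpace, true)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("non-empty after %d call(s): %s\n", calls, out)
}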

View File

@@ -125,13 +125,21 @@ func handleAnthropicNonStream(c echo.Context, id string, input *schema.Anthropic
 		return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err))
 	}
-	prediction, err := predFunc()
-	if err != nil {
-		xlog.Error("Anthropic prediction failed", "error", err)
-		return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("prediction failed: %v", err))
-	}
+	const maxEmptyRetries = 5
+	var prediction backend.LLMResponse
+	var result string
+	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
+		prediction, err = predFunc()
+		if err != nil {
+			xlog.Error("Anthropic prediction failed", "error", err)
+			return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("prediction failed: %v", err))
+		}
+		result = backend.Finetune(*cfg, predInput, prediction.Response)
+		if result != "" || !shouldUseFn {
+			break
+		}
+		xlog.Warn("Anthropic: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
+	}
-	result := backend.Finetune(*cfg, predInput, prediction.Response)
 
 	// Check if the result contains tool calls
 	toolCalls := functions.ParseFunctionCall(result, cfg.FunctionsConfig)

View File

@@ -270,7 +270,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 			}
 
 			responses <- initialMessage
-			result, err := handleQuestion(config, cl, req, ml, startupOptions, functionResults, result, prompt)
+			result, err := handleQuestion(config, functionResults, result, prompt)
 			if err != nil {
 				xlog.Error("error handling question", "error", err)
 				return err
@@ -388,6 +388,14 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 	shouldUseFn := len(input.Functions) > 0 && config.ShouldUseFunctions()
 	strictMode := false
+
+	xlog.Debug("Tool call routing decision",
+		"shouldUseFn", shouldUseFn,
+		"len(input.Functions)", len(input.Functions),
+		"len(input.Tools)", len(input.Tools),
+		"config.ShouldUseFunctions()", config.ShouldUseFunctions(),
+		"config.FunctionToCall()", config.FunctionToCall(),
+	)
 
 	for _, f := range input.Functions {
 		if f.Strict {
 			strictMode = true
@@ -648,12 +656,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
xlog.Debug("Thinking start token", "thinkingStartToken", thinkingStartToken, "template", template)
var emptyRetryNeeded bool
tokenCallback := func(s string, c *[]schema.Choice) {
// Prepend thinking token if needed, then extract reasoning from the response
reasoning, s := reason.ExtractReasoningWithConfig(s, thinkingStartToken, config.ReasoningConfig)
if !shouldUseFn {
// no function is called, just reply and use stop as finish reason
stopReason := FinishReasonStop
message := &schema.Message{Role: "assistant", Content: &s}
if reasoning != "" {
@@ -671,9 +680,15 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		switch {
 		case noActionsToRun:
-			result, err := handleQuestion(config, cl, input, ml, startupOptions, results, s, predInput)
+			if s == "" && textContentToReturn == "" {
+				xlog.Warn("Backend returned empty content in tool-calling context, will retry")
+				emptyRetryNeeded = true
+				return
+			}
+			result, err := handleQuestion(config, results, s, predInput)
 			if err != nil {
 				xlog.Error("error handling question", "error", err)
+				emptyRetryNeeded = true
 				return
 			}
@@ -745,19 +760,42 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 	// Echo properly supports context cancellation via c.Request().Context()
 	// No workaround needed!
-	result, tokenUsage, err := ComputeChoices(
-		input,
-		predInput,
-		config,
-		cl,
-		startupOptions,
-		ml,
-		tokenCallback,
-		nil,
-	)
+	const maxEmptyRetries = 5
+	var result []schema.Choice
+	var tokenUsage backend.TokenUsage
+	var err error
+	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
+		emptyRetryNeeded = false
+		result, tokenUsage, err = ComputeChoices(
+			input,
+			predInput,
+			config,
+			cl,
+			startupOptions,
+			ml,
+			tokenCallback,
+			nil,
+		)
+		if err != nil || !emptyRetryNeeded {
+			break
+		}
+		xlog.Warn("Retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
+	}
 	if err != nil {
 		return err
 	}
+
+	if emptyRetryNeeded {
+		xlog.Warn("All retries exhausted, backend still returning empty content")
+		stopReason := FinishReasonStop
+		empty := ""
+		result = append(result, schema.Choice{
+			FinishReason: &stopReason,
+			Index:        0,
+			Message:      &schema.Message{Role: "assistant", Content: &empty},
+		})
+	}
 
 	usage := schema.OpenAIUsage{
 		PromptTokens:     tokenUsage.Prompt,
 		CompletionTokens: tokenUsage.Completion,
@@ -785,7 +823,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 	}
 }
 
-func handleQuestion(config *config.ModelConfig, cl *config.ModelConfigLoader, input *schema.OpenAIRequest, ml *model.ModelLoader, o *config.ApplicationConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) {
+func handleQuestion(config *config.ModelConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) {
 	if len(funcResults) == 0 && result != "" {
 		xlog.Debug("nothing function results but we had a message from the LLM")
@@ -818,73 +856,6 @@ func handleQuestion(config *config.ModelConfig, cl *config.ModelConfigLoader, in
 	}
 
 	xlog.Debug("No action received from LLM, without a message, computing a reply")
-	// Otherwise ask the LLM to understand the JSON output and the context, and return a message
-	// Note: This costs (in term of CPU/GPU) another computation
-	config.Grammar = ""
-	images := []string{}
-	for _, m := range input.Messages {
-		images = append(images, m.StringImages...)
-	}
-	videos := []string{}
-	for _, m := range input.Messages {
-		videos = append(videos, m.StringVideos...)
-	}
-	audios := []string{}
-	for _, m := range input.Messages {
-		audios = append(audios, m.StringAudios...)
-	}
-
-	// Serialize tools and tool_choice to JSON strings
-	toolsJSON := ""
-	if len(input.Tools) > 0 {
-		toolsBytes, err := json.Marshal(input.Tools)
-		if err == nil {
-			toolsJSON = string(toolsBytes)
-		}
-	}
-	toolChoiceJSON := ""
-	if input.ToolsChoice != nil {
-		toolChoiceBytes, err := json.Marshal(input.ToolsChoice)
-		if err == nil {
-			toolChoiceJSON = string(toolChoiceBytes)
-		}
-	}
-
-	// Extract logprobs from request
-	// According to OpenAI API: logprobs is boolean, top_logprobs (0-20) controls how many top tokens per position
-	var logprobs *int
-	var topLogprobs *int
-	if input.Logprobs.IsEnabled() {
-		// If logprobs is enabled, use top_logprobs if provided, otherwise default to 1
-		if input.TopLogprobs != nil {
-			topLogprobs = input.TopLogprobs
-			// For backend compatibility, set logprobs to the top_logprobs value
-			logprobs = input.TopLogprobs
-		} else {
-			// Default to 1 if logprobs is true but top_logprobs not specified
-			val := 1
-			logprobs = &val
-			topLogprobs = &val
-		}
-	}
-
-	// Extract logit_bias from request
-	// According to OpenAI API: logit_bias is a map of token IDs (as strings) to bias values (-100 to 100)
-	var logitBias map[string]float64
-	if len(input.LogitBias) > 0 {
-		logitBias = input.LogitBias
-	}
-
-	predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, config, cl, o, nil, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias)
-	if err != nil {
-		xlog.Error("model inference failed", "error", err)
-		return "", err
-	}
-
-	prediction, err := predFunc()
-	if err != nil {
-		xlog.Error("prediction failed", "error", err)
-		return "", err
-	}
-	return backend.Finetune(*config, prompt, prediction.Response), nil
+	return "", fmt.Errorf("no action received from LLM, without a message, computing a reply")
 }
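The hunk above is the heart of the change: the old handleQuestion re-ran a full inference (resetting the grammar, re-collecting media, re-serializing tools, logprobs, and logit bias) whenever the model produced neither an action nor a message. The new version just returns an error, and the retry loop around ComputeChoices earlier in this file re-runs the original prediction instead. A sketch of the inverted control flow, with hypothetical names (answerOrErr for the slimmed handleQuestion, computeWithRetry for the loop around ComputeChoices):

package main

import (
	"errors"
	"fmt"
)

var errNoAction = errors.New("no action received from LLM, without a message")

// answerOrErr stands in for the slimmed-down handleQuestion: it no longer
// performs a second inference pass, it only reports what it was given.
func answerOrErr(result string) (string, error) {
	if result != "" {
		return result, nil
	}
	return "", errNoAction
}

// computeWithRetry stands in for the loop now wrapped around ComputeChoices:
// the caller retries the original prediction rather than handleQuestion
// computing a fresh, costlier reply.
func computeWithRetry(predict func() string) (string, error) {
	const maxEmptyRetries = 5
	var lastErr error
	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
		out, err := answerOrErr(predict())
		if err == nil {
			return out, nil
		}
		lastErr = err
	}
	return "", fmt.Errorf("after %d attempts: %w", maxEmptyRetries+1, lastErr)
}

func main() {
	n := 0
	out, err := computeWithRetry(func() string {
		n++
		if n < 2 {
			return "" // empty first pass, as when the backend emits nothing
		}
		return "final answer"
	})
	fmt.Println(out, err)
}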

View File

@@ -800,13 +800,26 @@ func handleBackgroundNonStream(ctx context.Context, store *ResponseStore, respon
 	default:
 	}
 
-	prediction, err := predFunc()
-	if err != nil {
-		return nil, fmt.Errorf("prediction failed: %w", err)
+	const maxEmptyRetries = 5
+	var prediction backend.LLMResponse
+	var result string
+	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
+		prediction, err = predFunc()
+		if err != nil {
+			return nil, fmt.Errorf("prediction failed: %w", err)
+		}
+		result = backend.Finetune(*cfg, predInput, prediction.Response)
+		if result != "" || !shouldUseFn {
+			break
+		}
+		select {
+		case <-ctx.Done():
+			return nil, ctx.Err()
+		default:
+		}
+		xlog.Warn("Open Responses background: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
 	}
-	result := backend.Finetune(*cfg, predInput, prediction.Response)
 
 	// Parse tool calls if using functions (same logic as regular handler)
 	var outputItems []schema.ORItemField
 	var toolCalls []schema.ToolCall
@@ -1475,13 +1488,21 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
 		return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("model inference failed: %v", err), "")
 	}
-	prediction, err := predFunc()
-	if err != nil {
-		xlog.Error("Open Responses prediction failed", "error", err)
-		return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("prediction failed: %v", err), "")
-	}
+	const maxEmptyRetries = 5
+	var prediction backend.LLMResponse
+	var result string
+	for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
+		prediction, err = predFunc()
+		if err != nil {
+			xlog.Error("Open Responses prediction failed", "error", err)
+			return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("prediction failed: %v", err), "")
+		}
+		result = backend.Finetune(*cfg, predInput, prediction.Response)
+		if result != "" || !shouldUseFn {
+			break
+		}
+		xlog.Warn("Open Responses: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
+	}
-	result := backend.Finetune(*cfg, predInput, prediction.Response)
 
 	xlog.Debug("Open Responses - Raw model result", "result", result, "shouldUseFn", shouldUseFn)
 
 	// Detect if thinking token is already in prompt or template
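One detail from the chat-endpoint hunk is easy to miss: when every retry still comes back empty, the endpoint does not fail the request; it appends a syntactically valid choice holding an empty assistant message with a stop finish reason, so clients always receive a well-formed response. A minimal sketch of that fallback, with local stand-ins for the schema.Choice and schema.Message types:

package main

import "fmt"

// Local stand-ins for schema.Message and schema.Choice.
type Message struct {
	Role    string
	Content *string
}

type Choice struct {
	FinishReason *string
	Index        int
	Message      *Message
}

// emptyFallback mirrors the post-retry branch in the chat handler: when all
// retries are exhausted, append a valid choice with empty content rather
// than failing the request.
func emptyFallback(result []Choice) []Choice {
	stopReason := "stop" // the handler uses its FinishReasonStop constant
	empty := ""
	return append(result, Choice{
		FinishReason: &stopReason,
		Index:        0,
		Message:      &Message{Role: "assistant", Content: &empty},
	})
}

func main() {
	choices := emptyFallback(nil)
	fmt.Printf("%d choice(s), content=%q, finish_reason=%s\n",
		len(choices), *choices[0].Message.Content, *choices[0].FinishReason)
}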