Respect explicit reasoning config during GGUF thinking probe (#9463)

Signed-off-by: leinasi2014 <leinasi2014@gmail.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
leinasi2014
2026-04-22 03:53:10 +08:00
committed by GitHub
parent 39573ecd2a
commit d18d434bb2
4 changed files with 170 additions and 17 deletions

View File

@@ -40,6 +40,12 @@ type TokenUsage struct {
ChatDeltas []*proto.ChatDelta // per-chunk deltas from C++ autoparser (only set during streaming)
}
// needsThinkingProbe reports whether the backend must still be probed for
// thinking/reasoning defaults: tokenizer templates have to be active, and at
// least one of the two reasoning settings must not have been set explicitly.
func needsThinkingProbe(c *config.ModelConfig) bool {
	if !c.TemplateConfig.UseTokenizerTemplate {
		return false
	}
	reasoningUnset := c.ReasoningConfig.DisableReasoning == nil
	prefillUnset := c.ReasoningConfig.DisableReasoningTagPrefill == nil
	return reasoningUnset || prefillUnset
}
// HasChatDeltaContent returns true if any chat delta carries content or reasoning text.
// Used to decide whether to prefer C++ autoparser deltas over Go-side tag extraction.
func (t TokenUsage) HasChatDeltaContent() bool {
@@ -100,11 +106,9 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
// tokenizer template path is active) and the multimodal media marker (needed
// by custom chat templates so markers line up with what mtmd expects).
// We probe whenever any of those slots is still empty.
needsThinkingProbe := c.TemplateConfig.UseTokenizerTemplate &&
c.ReasoningConfig.DisableReasoning == nil &&
c.ReasoningConfig.DisableReasoningTagPrefill == nil
shouldProbeThinking := needsThinkingProbe(c)
needsMarkerProbe := c.MediaMarker == ""
if needsThinkingProbe || needsMarkerProbe {
if shouldProbeThinking || needsMarkerProbe {
modelOpts := grpcModelOpts(*c, o.SystemState.Model.ModelsPath)
config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts)
// Update the config in the loader so it persists for future requests

View File

@@ -0,0 +1,29 @@
package backend
import (
"github.com/mudler/LocalAI/core/config"
"github.com/gpustack/gguf-parser-go/util/ptr"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for the gating helper that decides whether the GGUF backend is probed
// for thinking/reasoning defaults.
var _ = Describe("thinking probe gating", func() {
	It("probes tokenizer-template models when any reasoning default is still unset", func() {
		mc := &config.ModelConfig{
			TemplateConfig: config.TemplateConfig{UseTokenizerTemplate: true},
		}
		// Both reasoning defaults unset: a probe is required.
		Expect(needsThinkingProbe(mc)).To(BeTrue())

		// Only one default set: the other still needs probing.
		mc.ReasoningConfig.DisableReasoning = ptr.To(true)
		Expect(needsThinkingProbe(mc)).To(BeTrue())

		// Everything explicit: nothing left for the probe to fill in.
		mc.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true)
		Expect(needsThinkingProbe(mc)).To(BeFalse())
	})

	It("does not probe when tokenizer templates are disabled", func() {
		mc := &config.ModelConfig{}
		Expect(needsThinkingProbe(mc)).To(BeFalse())
	})
})

View File

@@ -125,19 +125,7 @@ func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, bac
return
}
cfg.ReasoningConfig.DisableReasoning = ptr.To(!metadata.SupportsThinking)
// Use the rendered template to detect if thinking token is at the end
// This reuses the existing DetectThinkingStartToken function
if metadata.RenderedTemplate != "" {
thinkingStartToken := reasoning.DetectThinkingStartToken(metadata.RenderedTemplate, &cfg.ReasoningConfig)
thinkingForcedOpen := thinkingStartToken != ""
cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(!thinkingForcedOpen)
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", thinkingForcedOpen, "thinking_start_token", thinkingStartToken)
} else {
cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true)
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", false)
}
applyDetectedThinkingConfig(cfg, metadata)
// Extract tool format markers from autoparser analysis
if tf := metadata.GetToolFormat(); tf != nil && tf.FormatType != "" {
@@ -180,3 +168,34 @@ func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, bac
}
}
}
// applyDetectedThinkingConfig fills in the reasoning defaults derived from the
// backend metadata probe, but only for settings the user left unset. Explicit
// YAML/user configuration always wins over detection.
func applyDetectedThinkingConfig(cfg *ModelConfig, metadata *pb.ModelMetadataResponse) {
	if cfg == nil || metadata == nil {
		return
	}
	// Only infer the reasoning on/off default when it was not set explicitly.
	if cfg.ReasoningConfig.DisableReasoning == nil {
		cfg.ReasoningConfig.DisableReasoning = ptr.To(!metadata.SupportsThinking)
	}
	// If the prefill behavior is already pinned, keep it and record that
	// template-based inference was skipped. Both pointers are non-nil here:
	// DisableReasoning was either explicit or filled just above.
	if cfg.ReasoningConfig.DisableReasoningTagPrefill != nil {
		xlog.Debug("[gguf] DetectThinkingSupportFromBackend: preserving explicit reasoning config", "supports_thinking", metadata.SupportsThinking, "disable_reasoning", *cfg.ReasoningConfig.DisableReasoning, "disable_reasoning_tag_prefill", *cfg.ReasoningConfig.DisableReasoningTagPrefill)
		return
	}
	// Prefill default is still unset: without a rendered template there is
	// nothing to inspect, so default to disabling the tag prefill.
	if metadata.RenderedTemplate == "" {
		cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true)
		xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", false)
		return
	}
	// Reuse the existing DetectThinkingStartToken function on the rendered
	// template to decide whether the model leaves a thinking tag open.
	startToken := reasoning.DetectThinkingStartToken(metadata.RenderedTemplate, &cfg.ReasoningConfig)
	forcedOpen := startToken != ""
	cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(!forcedOpen)
	xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", forcedOpen, "thinking_start_token", startToken)
}

View File

@@ -0,0 +1,101 @@
package config
import (
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/reasoning"
"github.com/gpustack/gguf-parser-go/util/ptr"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for applyDetectedThinkingConfig: probe-derived defaults must only fill
// unset reasoning settings and never override explicit user configuration.
var _ = Describe("GGUF backend metadata reasoning defaults", func() {
	// thinkingMetadata mimics a probe result for a model that supports
	// thinking and whose rendered template opens a <think> block.
	thinkingMetadata := func() *pb.ModelMetadataResponse {
		return &pb.ModelMetadataResponse{
			SupportsThinking: true,
			RenderedTemplate: "{{ bos_token }}<think>",
		}
	}

	It("fills reasoning defaults when unset", func() {
		mc := &ModelConfig{
			TemplateConfig: TemplateConfig{UseTokenizerTemplate: true},
		}
		applyDetectedThinkingConfig(mc, thinkingMetadata())
		// Both defaults inferred: reasoning enabled, prefill enabled.
		Expect(mc.ReasoningConfig.DisableReasoning).ToNot(BeNil())
		Expect(*mc.ReasoningConfig.DisableReasoning).To(BeFalse())
		Expect(mc.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil())
		Expect(*mc.ReasoningConfig.DisableReasoningTagPrefill).To(BeFalse())
	})

	It("preserves fully explicit reasoning settings", func() {
		mc := &ModelConfig{
			TemplateConfig: TemplateConfig{UseTokenizerTemplate: true},
			ReasoningConfig: reasoning.Config{
				DisableReasoning:           ptr.To(true),
				DisableReasoningTagPrefill: ptr.To(true),
			},
		}
		applyDetectedThinkingConfig(mc, thinkingMetadata())
		// User settings survive even though the probe says thinking works.
		Expect(mc.ReasoningConfig.DisableReasoning).ToNot(BeNil())
		Expect(*mc.ReasoningConfig.DisableReasoning).To(BeTrue())
		Expect(mc.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil())
		Expect(*mc.ReasoningConfig.DisableReasoningTagPrefill).To(BeTrue())
	})

	It("preserves explicit disable while still inferring missing prefill", func() {
		mc := &ModelConfig{
			TemplateConfig: TemplateConfig{UseTokenizerTemplate: true},
			ReasoningConfig: reasoning.Config{
				DisableReasoning: ptr.To(true),
			},
		}
		applyDetectedThinkingConfig(mc, thinkingMetadata())
		Expect(mc.ReasoningConfig.DisableReasoning).ToNot(BeNil())
		Expect(*mc.ReasoningConfig.DisableReasoning).To(BeTrue())
		Expect(mc.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil())
		Expect(*mc.ReasoningConfig.DisableReasoningTagPrefill).To(BeFalse())
	})

	It("preserves explicit prefill while still inferring missing disable flag", func() {
		mc := &ModelConfig{
			TemplateConfig: TemplateConfig{UseTokenizerTemplate: true},
			ReasoningConfig: reasoning.Config{
				DisableReasoningTagPrefill: ptr.To(true),
			},
		}
		applyDetectedThinkingConfig(mc, thinkingMetadata())
		Expect(mc.ReasoningConfig.DisableReasoning).ToNot(BeNil())
		Expect(*mc.ReasoningConfig.DisableReasoning).To(BeFalse())
		Expect(mc.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil())
		Expect(*mc.ReasoningConfig.DisableReasoningTagPrefill).To(BeTrue())
	})

	It("defaults to disabling reasoning when backend does not support thinking", func() {
		mc := &ModelConfig{
			TemplateConfig: TemplateConfig{UseTokenizerTemplate: true},
		}
		applyDetectedThinkingConfig(mc, &pb.ModelMetadataResponse{
			SupportsThinking: false,
		})
		// No thinking support and no template: both flags default to true.
		Expect(mc.ReasoningConfig.DisableReasoning).ToNot(BeNil())
		Expect(*mc.ReasoningConfig.DisableReasoning).To(BeTrue())
		Expect(mc.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil())
		Expect(*mc.ReasoningConfig.DisableReasoningTagPrefill).To(BeTrue())
	})
})