diff --git a/core/backend/llm.go b/core/backend/llm.go index 4c6c1874d..ae550755a 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -40,6 +40,12 @@ type TokenUsage struct { ChatDeltas []*proto.ChatDelta // per-chunk deltas from C++ autoparser (only set during streaming) } +func needsThinkingProbe(c *config.ModelConfig) bool { + return c.TemplateConfig.UseTokenizerTemplate && + (c.ReasoningConfig.DisableReasoning == nil || + c.ReasoningConfig.DisableReasoningTagPrefill == nil) +} + // HasChatDeltaContent returns true if any chat delta carries content or reasoning text. // Used to decide whether to prefer C++ autoparser deltas over Go-side tag extraction. func (t TokenUsage) HasChatDeltaContent() bool { @@ -100,11 +106,9 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima // tokenizer template path is active) and the multimodal media marker (needed // by custom chat templates so markers line up with what mtmd expects). // We probe whenever any of those slots is still empty. - needsThinkingProbe := c.TemplateConfig.UseTokenizerTemplate && - c.ReasoningConfig.DisableReasoning == nil && - c.ReasoningConfig.DisableReasoningTagPrefill == nil + shouldProbeThinking := needsThinkingProbe(c) needsMarkerProbe := c.MediaMarker == "" - if needsThinkingProbe || needsMarkerProbe { + if shouldProbeThinking || needsMarkerProbe { modelOpts := grpcModelOpts(*c, o.SystemState.Model.ModelsPath) config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts) // Update the config in the loader so it persists for future requests diff --git a/core/backend/llm_probe_test.go b/core/backend/llm_probe_test.go new file mode 100644 index 000000000..73ed9f967 --- /dev/null +++ b/core/backend/llm_probe_test.go @@ -0,0 +1,29 @@ +package backend + +import ( + "github.com/mudler/LocalAI/core/config" + + "github.com/gpustack/gguf-parser-go/util/ptr" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("thinking probe gating", func() { + It("probes tokenizer-template models when any reasoning default is still unset", func() { + cfg := &config.ModelConfig{ + TemplateConfig: config.TemplateConfig{UseTokenizerTemplate: true}, + } + Expect(needsThinkingProbe(cfg)).To(BeTrue()) + + cfg.ReasoningConfig.DisableReasoning = ptr.To(true) + Expect(needsThinkingProbe(cfg)).To(BeTrue()) + + cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true) + Expect(needsThinkingProbe(cfg)).To(BeFalse()) + }) + + It("does not probe when tokenizer templates are disabled", func() { + cfg := &config.ModelConfig{} + Expect(needsThinkingProbe(cfg)).To(BeFalse()) + }) +}) diff --git a/core/config/gguf.go b/core/config/gguf.go index 14d95d4ce..2d5d3f7c9 100644 --- a/core/config/gguf.go +++ b/core/config/gguf.go @@ -125,19 +125,7 @@ func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, bac return } - cfg.ReasoningConfig.DisableReasoning = ptr.To(!metadata.SupportsThinking) - - // Use the rendered template to detect if thinking token is at the end - // This reuses the existing DetectThinkingStartToken function - if metadata.RenderedTemplate != "" { - thinkingStartToken := reasoning.DetectThinkingStartToken(metadata.RenderedTemplate, &cfg.ReasoningConfig) - thinkingForcedOpen := thinkingStartToken != "" - cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(!thinkingForcedOpen) - xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", thinkingForcedOpen, "thinking_start_token", thinkingStartToken) - } else { - cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true) - xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", false) - } + applyDetectedThinkingConfig(cfg, metadata) // Extract tool format markers from autoparser analysis if tf := metadata.GetToolFormat(); tf != nil && tf.FormatType != "" { @@ -180,3 +168,34 @@ func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, bac } } } + +func applyDetectedThinkingConfig(cfg *ModelConfig, metadata *pb.ModelMetadataResponse) { + if cfg == nil || metadata == nil { + return + } + + // Respect explicit YAML/user config. Backend probing should only fill defaults + // when the reasoning mode has not already been set. + if cfg.ReasoningConfig.DisableReasoning == nil { + cfg.ReasoningConfig.DisableReasoning = ptr.To(!metadata.SupportsThinking) + } + + // Respect explicit prefill config for the same reason. Only infer the + // default prefill behavior when the user did not set it. + if cfg.ReasoningConfig.DisableReasoningTagPrefill == nil { + // Use the rendered template to detect if thinking token is at the end. + // This reuses the existing DetectThinkingStartToken function. + if metadata.RenderedTemplate != "" { + thinkingStartToken := reasoning.DetectThinkingStartToken(metadata.RenderedTemplate, &cfg.ReasoningConfig) + thinkingForcedOpen := thinkingStartToken != "" + cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(!thinkingForcedOpen) + xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", thinkingForcedOpen, "thinking_start_token", thinkingStartToken) + } else { + cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true) + xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", false) + } + return + } + + xlog.Debug("[gguf] DetectThinkingSupportFromBackend: preserving explicit reasoning config", "supports_thinking", metadata.SupportsThinking, "disable_reasoning", *cfg.ReasoningConfig.DisableReasoning, "disable_reasoning_tag_prefill", *cfg.ReasoningConfig.DisableReasoningTagPrefill) +} diff --git a/core/config/gguf_reasoning_test.go b/core/config/gguf_reasoning_test.go new file mode 100644 index 000000000..a55930521 --- /dev/null +++ b/core/config/gguf_reasoning_test.go @@ -0,0 +1,101 @@ +package config + +import ( + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + "github.com/mudler/LocalAI/pkg/reasoning" + + "github.com/gpustack/gguf-parser-go/util/ptr" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("GGUF backend metadata reasoning defaults", func() { + It("fills reasoning defaults when unset", func() { + cfg := &ModelConfig{ + TemplateConfig: TemplateConfig{UseTokenizerTemplate: true}, + } + + applyDetectedThinkingConfig(cfg, &pb.ModelMetadataResponse{ + SupportsThinking: true, + RenderedTemplate: "{{ bos_token }}", + }) + + Expect(cfg.ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*cfg.ReasoningConfig.DisableReasoning).To(BeFalse()) + Expect(cfg.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil()) + Expect(*cfg.ReasoningConfig.DisableReasoningTagPrefill).To(BeFalse()) + }) + + It("preserves fully explicit reasoning settings", func() { + cfg := &ModelConfig{ + TemplateConfig: TemplateConfig{UseTokenizerTemplate: true}, + ReasoningConfig: reasoning.Config{ + DisableReasoning: ptr.To(true), + DisableReasoningTagPrefill: ptr.To(true), + }, + } + + applyDetectedThinkingConfig(cfg, &pb.ModelMetadataResponse{ + SupportsThinking: true, + RenderedTemplate: "{{ bos_token }}", + }) + + Expect(cfg.ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*cfg.ReasoningConfig.DisableReasoning).To(BeTrue()) + Expect(cfg.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil()) + Expect(*cfg.ReasoningConfig.DisableReasoningTagPrefill).To(BeTrue()) + }) + + It("preserves explicit disable while still inferring missing prefill", func() { + cfg := &ModelConfig{ + TemplateConfig: TemplateConfig{UseTokenizerTemplate: true}, + ReasoningConfig: reasoning.Config{ + DisableReasoning: ptr.To(true), + }, + } + + applyDetectedThinkingConfig(cfg, &pb.ModelMetadataResponse{ + SupportsThinking: true, + RenderedTemplate: "{{ bos_token }}", + }) + + Expect(cfg.ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*cfg.ReasoningConfig.DisableReasoning).To(BeTrue()) + Expect(cfg.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil()) + Expect(*cfg.ReasoningConfig.DisableReasoningTagPrefill).To(BeFalse()) + }) + + It("preserves explicit prefill while still inferring missing disable flag", func() { + cfg := &ModelConfig{ + TemplateConfig: TemplateConfig{UseTokenizerTemplate: true}, + ReasoningConfig: reasoning.Config{ + DisableReasoningTagPrefill: ptr.To(true), + }, + } + + applyDetectedThinkingConfig(cfg, &pb.ModelMetadataResponse{ + SupportsThinking: true, + RenderedTemplate: "{{ bos_token }}", + }) + + Expect(cfg.ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*cfg.ReasoningConfig.DisableReasoning).To(BeFalse()) + Expect(cfg.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil()) + Expect(*cfg.ReasoningConfig.DisableReasoningTagPrefill).To(BeTrue()) + }) + + It("defaults to disabling reasoning when backend does not support thinking", func() { + cfg := &ModelConfig{ + TemplateConfig: TemplateConfig{UseTokenizerTemplate: true}, + } + + applyDetectedThinkingConfig(cfg, &pb.ModelMetadataResponse{ + SupportsThinking: false, + }) + + Expect(cfg.ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*cfg.ReasoningConfig.DisableReasoning).To(BeTrue()) + Expect(cfg.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil()) + Expect(*cfg.ReasoningConfig.DisableReasoningTagPrefill).To(BeTrue()) + }) +})