LocalAI/core/config/gguf.go

package config

import (
	"context"

	"github.com/mudler/LocalAI/pkg/functions"
	"github.com/mudler/LocalAI/pkg/grpc"
	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
	"github.com/mudler/LocalAI/pkg/reasoning"
	"github.com/mudler/LocalAI/pkg/xsysinfo"
	"github.com/mudler/xlog"

	gguf "github.com/gpustack/gguf-parser-go"
	"github.com/gpustack/gguf-parser-go/util/ptr"
)

const (
	// defaultContextSize is the context size guessGGUFFromFile falls back to
	// when the GGUF run estimate does not report a positive value.
	defaultContextSize = 1024
	// defaultNGPULayers is the layer count applied when NGPULayers is unset.
	// It is deliberately huge: the importer assumes every layer should be
	// offloaded.
	defaultNGPULayers  = 99999999
)

// reservedNonChatModel tells whether cfg declares one of the internal,
// non-chat usecases: the router score classifier (FLAG_SCORE) or the PII NER
// token_classify tier (FLAG_TOKEN_CLASSIFY).
//
// Models reserved this way ship without a chat template, so the GGUF importer
// must not apply its generative-chat defaults (FLAG_CHAT, jinja templating) to
// them; doing so would surface the model in chat pickers and undo the
// reservation. An operator who wants a model to serve both roles can declare
// both usecases explicitly, which is a valid combination.
//
// A cfg without any known usecases is never considered reserved.
func reservedNonChatModel(cfg *ModelConfig) bool {
	declared := cfg.KnownUsecases
	if declared == nil {
		return false
	}

	reservedMask := FLAG_SCORE | FLAG_TOKEN_CLASSIFY
	return *declared&reservedMask != 0
}

// guessGGUFFromFile fills in the settings of cfg that are still unset, using
// the parsed GGUF file f.
//
// defaultCtx is the caller's context-size override: a context size is only
// derived here when defaultCtx is 0 and cfg.ContextSize is nil.
//
// Regardless of the template state it seeds the "gpu" option (when no options
// are configured and an NVIDIA or AMD GPU is detected), NGPULayers, the
// embedded chat template and the MTP defaults. When cfg has no template of its
// own it additionally guesses the model name, switches to the tokenizer
// (jinja) template unless the model is reserved for a non-chat primitive, and
// applies the per-model-family inference defaults.
func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
	if defaultCtx == 0 && cfg.ContextSize == nil {
		// Prefer the size estimated from the file; fall back to the package
		// default when the estimate is not positive.
		ctxSize := defaultContextSize
		if estimated := f.EstimateLLaMACppRun().ContextSize; estimated > 0 {
			ctxSize = int(estimated)
		}
		cfg.ContextSize = &ctxSize
	}

	// GPU options
	if cfg.Options == nil {
		if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") {
			cfg.Options = []string{"gpu"}
		}
	}

	if cfg.NGPULayers == nil {
		// we assume we want to offload all layers
		defaultHigh := defaultNGPULayers
		cfg.NGPULayers = &defaultHigh
	}

	// The metadata name is needed several times below; read it once.
	modelName := f.Metadata().Name

	// NGPULayers is guaranteed non-nil here; log the value, not the pointer.
	xlog.Debug("[gguf] guessDefaultsFromFile: NGPULayers set", "NGPULayers", *cfg.NGPULayers, "modelName", modelName)

	// Keep the raw jinja chat template embedded in the file, if there is one.
	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
	if found {
		// fill jinja template
		cfg.modelTemplate = chatTemplate.ValueString()
	}

	// Auto-enable Multi-Token Prediction (ggml-org/llama.cpp#22673) when the
	// GGUF carries an embedded MTP head. Skipped silently for non-MTP models
	// and when the user already configured a spec_type.
	if n, ok := HasEmbeddedMTPHead(f); ok {
		ApplyMTPDefaults(cfg, n)
	}

	// Thinking support detection is done after model load via DetectThinkingSupportFromBackend

	// template estimations
	if cfg.HasTemplate() {
		// nothing to guess here
		xlog.Debug("[gguf] guessDefaultsFromFile: template already set", "name", cfg.Name, "modelName", modelName)
		return
	}

	tokenizer := f.Tokenizer()
	xlog.Debug("[gguf] Model file loaded", "file", cfg.ModelFileName(), "eosTokenID", tokenizer.EOSTokenID, "bosTokenID", tokenizer.BOSTokenID, "modelName", modelName, "architecture", f.Architecture().Architecture)

	// guess the name
	if cfg.Name == "" {
		cfg.Name = modelName
	}

	// A model the operator reserved for an internal primitive (the router
	// score classifier, or the PII NER token_classify tier) is not a chat
	// model: it carries no chat template and must not be painted with the
	// generative-chat defaults — appending FLAG_CHAT here would fold chat
	// into KnownUsecases on the next sync and surface the model in every
	// chat picker. Respect the declaration.
	if !reservedNonChatModel(cfg) {
		// Instruct to use template from llama.cpp
		cfg.TemplateConfig.UseTokenizerTemplate = true
		cfg.FunctionsConfig.GrammarConfig.NoGrammar = true
		cfg.Options = append(cfg.Options, "use_jinja:true")
		cfg.KnownUsecaseStrings = append(cfg.KnownUsecaseStrings, "FLAG_CHAT")
	}

	// Apply per-model-family inference parameter defaults (temperature, top_p, etc.)
	ApplyInferenceDefaults(cfg, modelName)
}

// DetectThinkingSupportFromBackend calls the backend's ModelMetadata gRPC
// method and uses the response to fill in defaults on cfg.
// This should be called after the model is loaded.
//
// Detection is skipped when cfg, backendClient or modelOptions is nil, or when
// cfg.Backend is not "llama-cpp". A failing ModelMetadata call is logged as a
// warning and otherwise ignored.
//
// From a non-nil response:
//   - a non-empty backend-reported multimodal marker is captured into
//     cfg.MediaMarker;
//   - when cfg.TemplateConfig.UseTokenizerTemplate is set, the thinking
//     defaults are stored in cfg.ReasoningConfig.DisableReasoning and
//     cfg.ReasoningConfig.DisableReasoningTagPrefill (see
//     applyDetectedThinkingConfig), and a reported tool format is stored in
//     cfg.FunctionsConfig.ToolFormatMarkers.
func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, backendClient grpc.Backend, modelOptions *pb.ModelOptions) {
	// Guard cfg like the other inputs: it is dereferenced below.
	if cfg == nil {
		xlog.Debug("[gguf] DetectThinkingSupportFromBackend: model config is nil, skipping detection")
		return
	}

	if backendClient == nil {
		xlog.Debug("[gguf] DetectThinkingSupportFromBackend: backend client is nil, skipping detection")
		return
	}

	if modelOptions == nil {
		xlog.Debug("[gguf] DetectThinkingSupportFromBackend: model options is nil, skipping detection")
		return
	}

	// Only llama-cpp exposes ModelMetadata today. Other backends will either error
	// or return an empty response — both are fine, we just bail before calling.
	if cfg.Backend != "llama-cpp" {
		xlog.Debug("[gguf] DetectThinkingSupportFromBackend: skipping detection", "backend", cfg.Backend)
		return
	}

	metadata, err := backendClient.ModelMetadata(ctx, modelOptions)
	if err != nil {
		xlog.Warn("[gguf] DetectThinkingSupportFromBackend: failed to get model metadata", "error", err)
		return
	}

	if metadata == nil {
		return
	}

	// The multimodal media marker is backend-controlled (llama.cpp may pick a
	// random per-server string). Empty means "no mtmd context" — Go falls back
	// to templates.DefaultMultiMediaMarker at render time.
	if metadata.MediaMarker != "" {
		cfg.MediaMarker = metadata.MediaMarker
		xlog.Debug("[gguf] DetectThinkingSupportFromBackend: media marker captured", "marker", metadata.MediaMarker)
	}

	// Thinking / tool-format detection only applies when we rely on the
	// backend-side tokenizer template — otherwise the rendered-template based
	// heuristics below aren't meaningful.
	if !cfg.TemplateConfig.UseTokenizerTemplate {
		return
	}

	applyDetectedThinkingConfig(cfg, metadata)

	// Extract tool format markers from autoparser analysis; an absent or
	// untyped format leaves the existing configuration untouched.
	tf := metadata.GetToolFormat()
	if tf == nil || tf.FormatType == "" {
		return
	}

	cfg.FunctionsConfig.ToolFormatMarkers = &functions.ToolFormatMarkers{
		FormatType:        tf.FormatType,
		SectionStart:      tf.SectionStart,
		SectionEnd:        tf.SectionEnd,
		PerCallStart:      tf.PerCallStart,
		PerCallEnd:        tf.PerCallEnd,
		FuncNamePrefix:    tf.FuncNamePrefix,
		FuncNameSuffix:    tf.FuncNameSuffix,
		FuncClose:         tf.FuncClose,
		ArgNamePrefix:     tf.ArgNamePrefix,
		ArgNameSuffix:     tf.ArgNameSuffix,
		ArgValuePrefix:    tf.ArgValuePrefix,
		ArgValueSuffix:    tf.ArgValueSuffix,
		ArgSeparator:      tf.ArgSeparator,
		ArgsStart:         tf.ArgsStart,
		ArgsEnd:           tf.ArgsEnd,
		NameField:         tf.NameField,
		ArgsField:         tf.ArgsField,
		IDField:           tf.IdField,
		FunNameIsKey:      tf.FunNameIsKey,
		ToolsArrayWrapped: tf.ToolsArrayWrapped,
		FunctionField:     tf.FunctionField,
		ParameterOrder:    tf.ParameterOrder,
		GenIDField:        tf.GenIdField,
		CallIDPosition:    tf.CallIdPosition,
		CallIDPrefix:      tf.CallIdPrefix,
		CallIDSuffix:      tf.CallIdSuffix,
		ReasoningStart:    tf.ReasoningStart,
		ReasoningEnd:      tf.ReasoningEnd,
		ContentStart:      tf.ContentStart,
		ContentEnd:        tf.ContentEnd,
	}
	xlog.Debug("[gguf] DetectThinkingSupportFromBackend: tool format markers detected",
		"format_type", tf.FormatType,
		"section_start", tf.SectionStart,
		"func_name_prefix", tf.FuncNamePrefix)
}

// applyDetectedThinkingConfig copies the backend's thinking detection from
// metadata into cfg.ReasoningConfig, filling only the fields that are still
// nil so that values set explicitly (e.g. in the YAML config) win.
//
// DisableReasoning defaults to the inverse of metadata.SupportsThinking.
// DisableReasoningTagPrefill defaults to true unless a thinking start token is
// detected in the rendered template. A nil cfg or metadata is a no-op.
func applyDetectedThinkingConfig(cfg *ModelConfig, metadata *pb.ModelMetadataResponse) {
	if cfg == nil || metadata == nil {
		return
	}

	rc := &cfg.ReasoningConfig

	// Backend probing is only a source of defaults for the reasoning mode.
	if rc.DisableReasoning == nil {
		rc.DisableReasoning = ptr.To(!metadata.SupportsThinking)
	}

	// An explicit prefill setting is kept as-is; nothing left to infer.
	if rc.DisableReasoningTagPrefill != nil {
		xlog.Debug("[gguf] DetectThinkingSupportFromBackend: preserving explicit reasoning config", "supports_thinking", metadata.SupportsThinking, "disable_reasoning", *rc.DisableReasoning, "disable_reasoning_tag_prefill", *rc.DisableReasoningTagPrefill)
		return
	}

	// Without a rendered template there is nothing to inspect, so prefill is
	// disabled.
	if metadata.RenderedTemplate == "" {
		rc.DisableReasoningTagPrefill = ptr.To(true)
		xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", false)
		return
	}

	// A non-empty start token from the rendered template means the thinking
	// block is forced open, so prefill stays enabled.
	startToken := reasoning.DetectThinkingStartToken(metadata.RenderedTemplate, rc)
	forcedOpen := startToken != ""
	rc.DisableReasoningTagPrefill = ptr.To(!forcedOpen)
	xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", forcedOpen, "thinking_start_token", startToken)
}