mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-20 22:59:09 -04:00
Squashed feat/pii-ner-tier-engine rebased onto master (was 45 commits; see backup/pii-ner-tier-engine-prerebase). Net change: - privacy-filter.cpp: standalone GGML engine for the openai-privacy-filter PII/NER token classifier, wired as a LocalAI gRPC backend (CPU/CUDA/Vulkan). TokenClassify moves off the patched llama.cpp path onto this backend. - PII filter reworked to be NER-centric (encoder/NER detection tier scanning whole conversations as one document), with a recreated bounded restricted- regex secret-matching pattern detector tier alongside it (per-model pii_detection.builtins / .patterns + core/services/routing/piipattern). - Detection labelled by source (ner vs pattern); backend trace / confidence / debug observability; analyze/redact exposed as a synchronous API. - Instance-wide default detector policy + per-usecase default-on; request filtering extended to completions, embeddings, edits & Ollama. - React UI: NER-centric PII editor, detector-models table, pattern/builtins editor, middleware default-policy UI. - Gallery: privacy-filter-multilingual token-classify model + NER install filter; token_classify known_usecase; batch sized to context for NER models. privacy-filter backend registered in the backend gallery (cpu/vulkan/cuda-13 meta + image entries with a capabilities map) matching its CI matrix jobs, and an /import-model auto-detect importer (PrivacyFilterImporter, narrow privacy-filter GGUF detection) replacing the prior pref-only registration. Reconciled against master's independent evolution: - Dropped master's PIIPatternOverrides feature (global-pattern runtime overrides + /api/pii/patterns API + runtime_settings.json persistence). The per-model NER + pattern-detector design supersedes it; it was built on the global redactor pattern set this branch replaced. - Reverted the llama.cpp Score carry-patch (0006-server-task-type-score): removed the patch and restored master's grpc-server.cpp Score RPC (direct llama_decode, slot-loop bypass) and LLAMA_VERSION pin, plus master's model_config validation forbidding score + chat/completion/embeddings on llama-cpp. token_classify is unaffected (it runs on the privacy-filter backend, not llama-cpp). Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com>
228 lines
9.0 KiB
Go
228 lines
9.0 KiB
Go
package config
|
|
|
|
import (
|
|
"context"
|
|
|
|
"github.com/mudler/LocalAI/pkg/functions"
|
|
"github.com/mudler/LocalAI/pkg/grpc"
|
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
|
"github.com/mudler/LocalAI/pkg/reasoning"
|
|
"github.com/mudler/LocalAI/pkg/xsysinfo"
|
|
"github.com/mudler/xlog"
|
|
|
|
gguf "github.com/gpustack/gguf-parser-go"
|
|
"github.com/gpustack/gguf-parser-go/util/ptr"
|
|
)
|
|
|
|
const (
|
|
defaultContextSize = 1024
|
|
defaultNGPULayers = 99999999
|
|
)
|
|
|
|
// reservedNonChatModel reports whether the operator reserved this model for an
|
|
// internal primitive — the router score classifier or the PII NER
|
|
// token_classify tier. Such a model has no chat template and must not be
|
|
// given the generative-chat defaults the GGUF importer otherwise applies
|
|
// (FLAG_CHAT, jinja templating): surfacing it in chat pickers defeats the
|
|
// reservation. Operators who do want a combined model declare both usecases
|
|
// explicitly — the combination is valid.
|
|
func reservedNonChatModel(cfg *ModelConfig) bool {
|
|
return cfg.KnownUsecases != nil &&
|
|
(*cfg.KnownUsecases&(FLAG_SCORE|FLAG_TOKEN_CLASSIFY)) != 0
|
|
}
|
|
|
|
func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
|
|
if defaultCtx == 0 && cfg.ContextSize == nil {
|
|
ctxSize := f.EstimateLLaMACppRun().ContextSize
|
|
if ctxSize > 0 {
|
|
cSize := int(ctxSize)
|
|
cfg.ContextSize = &cSize
|
|
} else {
|
|
defaultCtx = defaultContextSize
|
|
cfg.ContextSize = &defaultCtx
|
|
}
|
|
}
|
|
|
|
// GPU options
|
|
if cfg.Options == nil {
|
|
if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") {
|
|
cfg.Options = []string{"gpu"}
|
|
}
|
|
}
|
|
|
|
if cfg.NGPULayers == nil {
|
|
// we assume we want to offload all layers
|
|
defaultHigh := defaultNGPULayers
|
|
cfg.NGPULayers = &defaultHigh
|
|
}
|
|
|
|
xlog.Debug("[gguf] guessDefaultsFromFile: NGPULayers set", "NGPULayers", cfg.NGPULayers, "modelName", f.Metadata().Name)
|
|
|
|
// identify from well known templates first, otherwise use the raw jinja template
|
|
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
|
|
if found {
|
|
// fill jinja template
|
|
cfg.modelTemplate = chatTemplate.ValueString()
|
|
}
|
|
|
|
// Auto-enable Multi-Token Prediction (ggml-org/llama.cpp#22673) when the
|
|
// GGUF carries an embedded MTP head. Skipped silently for non-MTP models
|
|
// and when the user already configured a spec_type.
|
|
if n, ok := HasEmbeddedMTPHead(f); ok {
|
|
ApplyMTPDefaults(cfg, n)
|
|
}
|
|
|
|
// Thinking support detection is done after model load via DetectThinkingSupportFromBackend
|
|
|
|
// template estimations
|
|
if cfg.HasTemplate() {
|
|
// nothing to guess here
|
|
xlog.Debug("[gguf] guessDefaultsFromFile: template already set", "name", cfg.Name, "modelName", f.Metadata().Name)
|
|
return
|
|
}
|
|
|
|
xlog.Debug("[gguf] Model file loaded", "file", cfg.ModelFileName(), "eosTokenID", f.Tokenizer().EOSTokenID, "bosTokenID", f.Tokenizer().BOSTokenID, "modelName", f.Metadata().Name, "architecture", f.Architecture().Architecture)
|
|
|
|
// guess the name
|
|
if cfg.Name == "" {
|
|
cfg.Name = f.Metadata().Name
|
|
}
|
|
|
|
// A model the operator reserved for an internal primitive (the router
|
|
// score classifier, or the PII NER token_classify tier) is not a chat
|
|
// model: it carries no chat template and must not be painted with the
|
|
// generative-chat defaults — appending FLAG_CHAT here would fold chat
|
|
// into KnownUsecases on the next sync and surface the model in every
|
|
// chat picker. Respect the declaration.
|
|
if !reservedNonChatModel(cfg) {
|
|
// Instruct to use template from llama.cpp
|
|
cfg.TemplateConfig.UseTokenizerTemplate = true
|
|
cfg.FunctionsConfig.GrammarConfig.NoGrammar = true
|
|
cfg.Options = append(cfg.Options, "use_jinja:true")
|
|
cfg.KnownUsecaseStrings = append(cfg.KnownUsecaseStrings, "FLAG_CHAT")
|
|
}
|
|
|
|
// Apply per-model-family inference parameter defaults (temperature, top_p, etc.)
|
|
ApplyInferenceDefaults(cfg, f.Metadata().Name)
|
|
}
|
|
|
|
// DetectThinkingSupportFromBackend calls the ModelMetadata gRPC method to detect
|
|
// if the model supports thinking mode and if the template ends with a thinking start token.
|
|
// This should be called after the model is loaded.
|
|
// The results are stored in cfg.SupportsThinking and cfg.ThinkingForcedOpen.
|
|
// The backend-reported multimodal marker is also captured into cfg.MediaMarker.
|
|
func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, backendClient grpc.Backend, modelOptions *pb.ModelOptions) {
|
|
if backendClient == nil {
|
|
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: backend client is nil, skipping detection")
|
|
return
|
|
}
|
|
|
|
if modelOptions == nil {
|
|
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: model options is nil, skipping detection")
|
|
return
|
|
}
|
|
|
|
// Only llama-cpp exposes ModelMetadata today. Other backends will either error
|
|
// or return an empty response — both are fine, we just bail before calling.
|
|
if cfg.Backend != "llama-cpp" {
|
|
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: skipping detection", "backend", cfg.Backend)
|
|
return
|
|
}
|
|
|
|
metadata, err := backendClient.ModelMetadata(ctx, modelOptions)
|
|
if err != nil {
|
|
xlog.Warn("[gguf] DetectThinkingSupportFromBackend: failed to get model metadata", "error", err)
|
|
return
|
|
}
|
|
|
|
if metadata != nil {
|
|
// The multimodal media marker is backend-controlled (llama.cpp may pick a
|
|
// random per-server string). Empty means "no mtmd context" — Go falls back
|
|
// to templates.DefaultMultiMediaMarker at render time.
|
|
if metadata.MediaMarker != "" {
|
|
cfg.MediaMarker = metadata.MediaMarker
|
|
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: media marker captured", "marker", metadata.MediaMarker)
|
|
}
|
|
|
|
// Thinking / tool-format detection only applies when we rely on the
|
|
// backend-side tokenizer template — otherwise the rendered-template based
|
|
// heuristics below aren't meaningful.
|
|
if !cfg.TemplateConfig.UseTokenizerTemplate {
|
|
return
|
|
}
|
|
|
|
applyDetectedThinkingConfig(cfg, metadata)
|
|
|
|
// Extract tool format markers from autoparser analysis
|
|
if tf := metadata.GetToolFormat(); tf != nil && tf.FormatType != "" {
|
|
cfg.FunctionsConfig.ToolFormatMarkers = &functions.ToolFormatMarkers{
|
|
FormatType: tf.FormatType,
|
|
SectionStart: tf.SectionStart,
|
|
SectionEnd: tf.SectionEnd,
|
|
PerCallStart: tf.PerCallStart,
|
|
PerCallEnd: tf.PerCallEnd,
|
|
FuncNamePrefix: tf.FuncNamePrefix,
|
|
FuncNameSuffix: tf.FuncNameSuffix,
|
|
FuncClose: tf.FuncClose,
|
|
ArgNamePrefix: tf.ArgNamePrefix,
|
|
ArgNameSuffix: tf.ArgNameSuffix,
|
|
ArgValuePrefix: tf.ArgValuePrefix,
|
|
ArgValueSuffix: tf.ArgValueSuffix,
|
|
ArgSeparator: tf.ArgSeparator,
|
|
ArgsStart: tf.ArgsStart,
|
|
ArgsEnd: tf.ArgsEnd,
|
|
NameField: tf.NameField,
|
|
ArgsField: tf.ArgsField,
|
|
IDField: tf.IdField,
|
|
FunNameIsKey: tf.FunNameIsKey,
|
|
ToolsArrayWrapped: tf.ToolsArrayWrapped,
|
|
FunctionField: tf.FunctionField,
|
|
ParameterOrder: tf.ParameterOrder,
|
|
GenIDField: tf.GenIdField,
|
|
CallIDPosition: tf.CallIdPosition,
|
|
CallIDPrefix: tf.CallIdPrefix,
|
|
CallIDSuffix: tf.CallIdSuffix,
|
|
ReasoningStart: tf.ReasoningStart,
|
|
ReasoningEnd: tf.ReasoningEnd,
|
|
ContentStart: tf.ContentStart,
|
|
ContentEnd: tf.ContentEnd,
|
|
}
|
|
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: tool format markers detected",
|
|
"format_type", tf.FormatType,
|
|
"section_start", tf.SectionStart,
|
|
"func_name_prefix", tf.FuncNamePrefix)
|
|
}
|
|
}
|
|
}
|
|
|
|
func applyDetectedThinkingConfig(cfg *ModelConfig, metadata *pb.ModelMetadataResponse) {
|
|
if cfg == nil || metadata == nil {
|
|
return
|
|
}
|
|
|
|
// Respect explicit YAML/user config. Backend probing should only fill defaults
|
|
// when the reasoning mode has not already been set.
|
|
if cfg.ReasoningConfig.DisableReasoning == nil {
|
|
cfg.ReasoningConfig.DisableReasoning = ptr.To(!metadata.SupportsThinking)
|
|
}
|
|
|
|
// Respect explicit prefill config for the same reason. Only infer the
|
|
// default prefill behavior when the user did not set it.
|
|
if cfg.ReasoningConfig.DisableReasoningTagPrefill == nil {
|
|
// Use the rendered template to detect if thinking token is at the end.
|
|
// This reuses the existing DetectThinkingStartToken function.
|
|
if metadata.RenderedTemplate != "" {
|
|
thinkingStartToken := reasoning.DetectThinkingStartToken(metadata.RenderedTemplate, &cfg.ReasoningConfig)
|
|
thinkingForcedOpen := thinkingStartToken != ""
|
|
cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(!thinkingForcedOpen)
|
|
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", thinkingForcedOpen, "thinking_start_token", thinkingStartToken)
|
|
} else {
|
|
cfg.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true)
|
|
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: thinking support detected", "supports_thinking", metadata.SupportsThinking, "thinking_forced_open", false)
|
|
}
|
|
return
|
|
}
|
|
|
|
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: preserving explicit reasoning config", "supports_thinking", metadata.SupportsThinking, "disable_reasoning", *cfg.ReasoningConfig.DisableReasoning, "disable_reasoning_tag_prefill", *cfg.ReasoningConfig.DisableReasoningTagPrefill)
|
|
}
|