Mirror of https://github.com/mudler/LocalAI.git
fix(reasoning): support models with reasoning without starting thinking tag (#8132)
* chore: extract reasoning to its own package
* make sure we detect thinking tokens from template
* Allow to override via config, add tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Commit 34e054f607 (parent e886bb291a), committed by GitHub.
@@ -62,16 +62,23 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
         cfg.NGPULayers = &defaultHigh
     }

-    xlog.Debug("guessDefaultsFromFile: NGPULayers set", "NGPULayers", cfg.NGPULayers)
+    xlog.Debug("[gguf] guessDefaultsFromFile: NGPULayers set", "NGPULayers", cfg.NGPULayers, "modelName", f.Metadata().Name)
+
+    // identify from well known templates first, otherwise use the raw jinja template
+    chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
+    if found {
+        // fill jinja template
+        cfg.modelTemplate = chatTemplate.ValueString()
+    }

     // template estimations
     if cfg.HasTemplate() {
         // nothing to guess here
-        xlog.Debug("guessDefaultsFromFile: template already set", "name", cfg.Name)
+        xlog.Debug("[gguf] guessDefaultsFromFile: template already set", "name", cfg.Name, "modelName", f.Metadata().Name)
         return
     }

-    xlog.Debug("Model file loaded", "file", cfg.ModelFileName(), "eosTokenID", f.Tokenizer().EOSTokenID, "bosTokenID", f.Tokenizer().BOSTokenID, "modelName", f.Metadata().Name, "architecture", f.Architecture().Architecture)
+    xlog.Debug("[gguf] Model file loaded", "file", cfg.ModelFileName(), "eosTokenID", f.Tokenizer().EOSTokenID, "bosTokenID", f.Tokenizer().BOSTokenID, "modelName", f.Metadata().Name, "architecture", f.Architecture().Architecture)

     // guess the name
     if cfg.Name == "" {
@@ -83,4 +90,5 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
     cfg.FunctionsConfig.GrammarConfig.NoGrammar = true
     cfg.Options = append(cfg.Options, "use_jinja:true")
+    cfg.KnownUsecaseStrings = append(cfg.KnownUsecaseStrings, "FLAG_CHAT")

 }
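These hunks store the raw tokenizer.chat_template from the GGUF metadata on the config, so the chat endpoint can later scan it for a thinking start tag. For orientation, a minimal sketch of what that detection could look like — an assumption, not the repo's code: the actual token list, package layout, and signature live in pkg/reasoning:

package reasoning

import "strings"

// Hypothetical list of well-known thinking start tags; the real list is
// defined in pkg/reasoning.
var knownStartTokens = []string{"<think>", "<thinking>", "<|thinking|>"}

// DetectThinkingStartToken returns the first known start tag that appears
// in the template (or raw prompt), or "" when none is found.
func DetectThinkingStartToken(template string) string {
    for _, tok := range knownStartTokens {
        if strings.Contains(template, tok) {
            return tok
        }
    }
    return ""
}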
@@ -10,6 +10,7 @@ import (
     "github.com/mudler/LocalAI/core/schema"
     "github.com/mudler/LocalAI/pkg/downloader"
     "github.com/mudler/LocalAI/pkg/functions"
+    "github.com/mudler/LocalAI/pkg/reasoning"
     "github.com/mudler/cogito"
     "gopkg.in/yaml.v3"
 )
@@ -30,6 +31,7 @@ type TTSConfig struct {

 // @Description ModelConfig represents a model configuration
 type ModelConfig struct {
     modelConfigFile string `yaml:"-" json:"-"`
+    modelTemplate string `yaml:"-" json:"-"`
     schema.PredictionOptions `yaml:"parameters,omitempty" json:"parameters,omitempty"`
     Name string `yaml:"name,omitempty" json:"name,omitempty"`
@@ -51,6 +53,7 @@ type ModelConfig struct {
     ResponseFormatMap map[string]interface{} `yaml:"-" json:"-"`

     FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"`
+    ReasoningConfig reasoning.Config `yaml:"reasoning,omitempty" json:"reasoning,omitempty"`

     FeatureFlag FeatureFlag `yaml:"feature_flags,omitempty" json:"feature_flags,omitempty"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
     // LLM configs (GPT4ALL, Llama.cpp, ...)
@@ -521,6 +524,11 @@ func (c *ModelConfig) GetModelConfigFile() string {
     return c.modelConfigFile
 }

+// GetModelTemplate returns the model's chat template if available
+func (c *ModelConfig) GetModelTemplate() string {
+    return c.modelTemplate
+}
+
 type ModelConfigUsecase int

 const (
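With ReasoningConfig hanging off the model config under the reasoning key, the tag-prefill behavior can be toggled per model from YAML ("Allow to override via config" in the commit message). A hypothetical snippet — the reasoning: key comes from the struct tag above, but the inner key name is an assumption (snake_case of DisableReasoningTagPrefill; check pkg/reasoning for the exact tag):

name: my-reasoning-model
reasoning:
  # hypothetical key, assuming conventional snake_case yaml tags
  disable_reasoning_tag_prefill: true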
@@ -13,6 +13,7 @@ import (
     "github.com/mudler/LocalAI/core/http/middleware"
     "github.com/mudler/LocalAI/core/schema"
     "github.com/mudler/LocalAI/pkg/functions"
+    reason "github.com/mudler/LocalAI/pkg/reasoning"

     "github.com/mudler/LocalAI/core/templates"
     "github.com/mudler/LocalAI/pkg/model"
@@ -38,6 +39,16 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
         }
         responses <- initialMessage

+        // Detect if thinking token is already in prompt or template
+        // When UseTokenizerTemplate is enabled, predInput is empty, so we check the template
+        var template string
+        if config.TemplateConfig.UseTokenizerTemplate {
+            template = config.GetModelTemplate()
+        } else {
+            template = s
+        }
+        thinkingStartToken := reason.DetectThinkingStartToken(template)
+
         // Track accumulated content for reasoning extraction
         accumulatedContent := ""
         lastEmittedReasoning := ""
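The detected token drives the prefill step used below: when the template itself opens the thinking block, the model never emits the start tag, so the endpoint has to add it back before extraction. A plausible sketch of PrependThinkingTokenIfNeeded, continuing the hypothetical package above (the real implementation may also handle partially streamed tags):

// Prepend the detected tag when the model's output does not open the
// thinking block itself (because the template already did).
func PrependThinkingTokenIfNeeded(content, startToken string) string {
    // Nothing detected in the template, or the output already starts
    // with the tag: leave the content untouched.
    if startToken == "" || strings.HasPrefix(content, startToken) {
        return content
    }
    return startToken + content
}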
@@ -45,8 +56,12 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator

         _, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
             accumulatedContent += s
-            // Extract reasoning from accumulated content
-            currentReasoning, cleanedContent := functions.ExtractReasoning(accumulatedContent)
+            content := accumulatedContent
+            // Prepend thinking token if needed, then extract reasoning
+            if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
+                content = reason.PrependThinkingTokenIfNeeded(content, thinkingStartToken)
+            }
+            currentReasoning, cleanedContent := reason.ExtractReasoning(content)

             // Calculate new reasoning delta (what we haven't emitted yet)
             var reasoningDelta *string
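reason.ExtractReasoning then splits the (possibly prefixed) accumulated content into a reasoning part and the user-visible remainder. An illustrative sketch in the same hypothetical package, assuming <think>/</think> delimiters and treating a still-open block as reasoning so streaming deltas work before the closing tag arrives:

func ExtractReasoning(content string) (reasoning string, cleaned string) {
    const openTag, closeTag = "<think>", "</think>"
    start := strings.Index(content, openTag)
    if start == -1 {
        return "", content
    }
    rest := content[start+len(openTag):]
    if end := strings.Index(rest, closeTag); end != -1 {
        // Closed block: reasoning is inside the tags, the rest is content.
        return rest[:end], content[:start] + rest[end+len(closeTag):]
    }
    // No closing tag yet (mid-stream): everything after the tag is reasoning.
    return rest, content[:start]
}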
@@ -118,6 +133,15 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
         return err
     }
     processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.ModelConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool) error {
+        // Detect if thinking token is already in prompt or template
+        var template string
+        if config.TemplateConfig.UseTokenizerTemplate {
+            template = config.GetModelTemplate()
+        } else {
+            template = prompt
+        }
+        thinkingStartToken := reason.DetectThinkingStartToken(template)
+
         result := ""
         lastEmittedCount := 0
         _, tokenUsage, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
@@ -229,8 +253,12 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
         if err != nil {
             return err
         }
-        // Extract reasoning before processing tool calls
-        reasoning, cleanedResult := functions.ExtractReasoning(result)
+        // Prepend thinking token if needed, then extract reasoning before processing tool calls
+        resultWithToken := result
+        if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
+            resultWithToken = reason.PrependThinkingTokenIfNeeded(result, thinkingStartToken)
+        }
+        reasoning, cleanedResult := reason.ExtractReasoning(resultWithToken)
         result = cleanedResult

         textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig)
@@ -617,10 +645,24 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator

     // no streaming mode
     default:
+        // Detect if thinking token is already in prompt or template
+        var template string
+        if config.TemplateConfig.UseTokenizerTemplate {
+            template = config.GetModelTemplate() // TODO: this should be the parsed jinja template. But for now this is the best we can do.
+        } else {
+            template = predInput
+        }
+        thinkingStartToken := reason.DetectThinkingStartToken(template)
+
+        xlog.Debug("Thinking start token", "thinkingStartToken", thinkingStartToken, "template", template)
+
         tokenCallback := func(s string, c *[]schema.Choice) {
-            // Extract reasoning from the response
-            reasoning, cleanedS := functions.ExtractReasoning(s)
+            // Prepend thinking token if needed, then extract reasoning from the response
+            sWithToken := s
+            if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
+                sWithToken = reason.PrependThinkingTokenIfNeeded(s, thinkingStartToken)
+            }
+            reasoning, cleanedS := reason.ExtractReasoning(sWithToken)
            s = cleanedS

            if !shouldUseFn {
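Taken together, the non-streaming path mirrors the streaming one: detect the tag from the template, optionally prefill it, then extract. An illustrative end-to-end use of the pieces from this diff (the model output string is made up; the sketch functions above stand in for pkg/reasoning):

startToken := reason.DetectThinkingStartToken(template) // e.g. "<think>" if the template opens it
out := "The user wants X, so...</think>The answer is Y." // model omitted the start tag
if config.ReasoningConfig.DisableReasoningTagPrefill == nil || !*config.ReasoningConfig.DisableReasoningTagPrefill {
    out = reason.PrependThinkingTokenIfNeeded(out, startToken)
}
reasoning, answer := reason.ExtractReasoning(out)
// reasoning: "The user wants X, so...", answer: "The answer is Y."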