mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-13 03:09:03 -04:00
fix(router): production-ready request router + auto-size batch for embedding/rerank (#10104)
* fix(router): score classifier production-readiness Conversation trimming runs through the classifier model's chat template and trims by exact token count, sized to the model's n_batch which is now scaled to context so long probes can't crash the backend. Missing chat_message templates are a hard error at router build time. Router-facing factories (Embedder/Scorer/Reranker/TokenCounter) re-resolve ModelConfig per call so a model installed post-startup doesn't bind a stub Backend="" config and silently fall into the loader's auto-iterate path. New 'vector_store' backend trace recorded inside localVectorStore on every Search/Insert — including the backend-load-failure path that previously vanished into an xlog.Warn — with outcome tagging (hit/miss/empty_store/backend_load_error/find_error/insert_error/ok). Companion cleanup drops misleading similarity:0 and input_tokens_count:0 from non-hit and text-mode traces. Gallery local-store-development aliases to 'local-store' so the master image satisfies pkg/model.LocalStoreBackend lookups from the embedding cache. Misc: llama-cpp TokenizeString reads the correct 'prompt' JSON key (the original bug); ModelTokenize nil-guard; non-fatal mitm proxy startup; PII 'route_local' renamed to 'allow' with docs/UI in sync; model-editor footer no longer eats the edit area on small screens; several config-editor template/dropdown/section fixes. Tests: e2e router specs (casual/code-hint + long-conversation trim), vector_store trace specs, lazy-factory specs, gallery dev-alias resolution, Playwright trace badge + scroll regression. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * feat(backend): auto-size batch to context for embedding and rerank models Embedding and rerank models pool over the whole input in a single physical batch (n_ubatch).
With batch left at the 512 default, the backend rejects longer inputs with "input is too large to process", silently capping a large-context embedder (e.g. 8k/32k) at 512 tokens. Size n_batch to the context for these single-pass usecases, mirroring the existing FLAG_SCORE behaviour; an explicit batch: still wins. Extracts EffectiveContextSize/EffectiveBatchSize from grpcModelOpts so the effective decode window has one home for other callers to reuse. Adds an e2e-aio regression test that embeds a >512-token input. The AIO embedding model is switched to nomic-embed-text-v1.5 (2048 context) because the previous granite model was capped at 512 tokens and could not exercise the larger batch. Assisted-by: claude-code:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * fix(gallery): raise arch-router scoring output cap via parallel:64 Scoring decodes the whole prompt+candidate in a single llama_decode and reads one logit row per candidate token. The vendored llama.cpp server caps causal output rows at n_parallel, so the default of 1 aborts with GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) on multi-token route labels. Set options: [parallel:64] on both arch-router quant entries to lift the cap; kv_unified (the grpc-server default) keeps the full context per sequence, so this does not split the KV cache. Assisted-by: claude-code:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com>
This commit is contained in:
committed by
GitHub
parent
56cc4f63fc
commit
085fc53bbc
139
core/http/middleware/probe_trim_test.go
Normal file
139
core/http/middleware/probe_trim_test.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package middleware
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("routerConfigFingerprint", func() {
|
||||
rc := config.RouterConfig{Classifier: "score", ClassifierModel: "arch-router"}
|
||||
ctx4096 := 4096
|
||||
ctx8192 := 8192
|
||||
|
||||
// Regression: the score classifier bakes context_size into its token
|
||||
// budget at build time, and the built classifier is cached by this
|
||||
// fingerprint. If context_size weren't hashed, editing it and reloading
|
||||
// would return a classifier carrying the stale budget.
|
||||
It("changes when the classifier model's context_size changes", func() {
|
||||
cfgA := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx4096}}
|
||||
cfgB := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx8192}}
|
||||
Expect(routerConfigFingerprint(rc, cfgA)).NotTo(Equal(routerConfigFingerprint(rc, cfgB)))
|
||||
})
|
||||
|
||||
It("is stable for identical classifier configs", func() {
|
||||
cfgA := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx4096}}
|
||||
cfgB := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx4096}}
|
||||
Expect(routerConfigFingerprint(rc, cfgA)).To(Equal(routerConfigFingerprint(rc, cfgB)))
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("routing probe extraction and trimming", func() {
|
||||
Describe("OpenAIProbeFromRequest", func() {
|
||||
It("keeps a short conversation intact, newline-terminated per message", func() {
|
||||
req := &schema.OpenAIRequest{Messages: []schema.Message{
|
||||
{Role: "user", Content: "first"},
|
||||
{Role: "assistant", Content: "second"},
|
||||
{Role: "user", Content: "third"},
|
||||
}}
|
||||
Expect(OpenAIProbeFromRequest(req).Prompt).To(Equal("first\nsecond\nthird\n"))
|
||||
})
|
||||
|
||||
It("flattens text blocks and skips image-only messages", func() {
|
||||
req := &schema.OpenAIRequest{Messages: []schema.Message{
|
||||
{Role: "user", Content: []any{
|
||||
map[string]any{"type": "text", "text": "describe this"},
|
||||
map[string]any{"type": "image_url", "image_url": map[string]any{"url": "data:..."}},
|
||||
}},
|
||||
{Role: "user", Content: []any{
|
||||
map[string]any{"type": "image_url", "image_url": map[string]any{"url": "data:..."}},
|
||||
}},
|
||||
}}
|
||||
// Second message contributes no text, so it neither adds a blank
|
||||
// line nor a stray newline.
|
||||
Expect(OpenAIProbeFromRequest(req).Prompt).To(Equal("describe this\n"))
|
||||
})
|
||||
|
||||
It("carries the full conversation untrimmed — trimming is each classifier's job", func() {
|
||||
// The middleware no longer caps the probe by a fixed rune budget;
|
||||
// every turn reaches the Probe and each classifier trims to its own
|
||||
// model's context (see modelTokenTrim / promptTrimmer).
|
||||
block := strings.Repeat("x", 999)
|
||||
msgs := make([]schema.Message, 0, 20)
|
||||
msgs = append(msgs, schema.Message{Role: "user", Content: "OLDEST" + strings.Repeat("o", 994)})
|
||||
for range 18 {
|
||||
msgs = append(msgs, schema.Message{Role: "user", Content: block})
|
||||
}
|
||||
msgs = append(msgs, schema.Message{Role: "user", Content: "NEWEST" + strings.Repeat("n", 994)})
|
||||
|
||||
probe := OpenAIProbeFromRequest(&schema.OpenAIRequest{Messages: msgs})
|
||||
Expect(probe.Prompt).To(ContainSubstring("OLDEST"), "no turn is dropped at probe-build time")
|
||||
Expect(probe.Prompt).To(ContainSubstring("NEWEST"))
|
||||
// Messages preserves the per-turn split the classifier trims from.
|
||||
Expect(probe.Messages).To(HaveLen(20))
|
||||
Expect(probe.Messages[0]).To(ContainSubstring("OLDEST"))
|
||||
Expect(probe.Messages[19]).To(ContainSubstring("NEWEST"))
|
||||
})
|
||||
})
|
||||
|
||||
Describe("AnthropicProbe", func() {
|
||||
It("extracts and trims the same way as the OpenAI path", func() {
|
||||
req := &schema.AnthropicRequest{Messages: []schema.AnthropicMessage{
|
||||
{Role: "user", Content: "alpha"},
|
||||
{Role: "assistant", Content: []any{
|
||||
map[string]any{"type": "text", "text": "beta"},
|
||||
}},
|
||||
}}
|
||||
probe, ok := AnthropicProbe(req)
|
||||
Expect(ok).To(BeTrue())
|
||||
Expect(probe.Prompt).To(Equal("alpha\nbeta\n"))
|
||||
})
|
||||
|
||||
It("returns ok=false for a non-Anthropic payload", func() {
|
||||
_, ok := AnthropicProbe(&schema.OpenAIRequest{})
|
||||
Expect(ok).To(BeFalse())
|
||||
})
|
||||
})
|
||||
|
||||
Describe("modelTokenTrim", func() {
|
||||
tok := func(string) (int, error) { return 1, nil }
|
||||
depsFor := func(cfg *config.ModelConfig) ClassifierDeps {
|
||||
return ClassifierDeps{
|
||||
ModelLookup: func(string) *config.ModelConfig { return cfg },
|
||||
TokenCounter: func(string) func(string) (int, error) { return tok },
|
||||
}
|
||||
}
|
||||
|
||||
It("still trims to the backend default when context_size is unset", func() {
|
||||
// Regression: with the fixed middleware rune cap gone, an unset
|
||||
// context_size must NOT disable trimming — otherwise a non-trivial
|
||||
// prompt overflows the default 4096 window and every score fails.
|
||||
score := config.FLAG_SCORE
|
||||
cfg := &config.ModelConfig{KnownUsecases: &score} // FLAG_SCORE → batch follows context
|
||||
count, ceiling := modelTokenTrim("classifier", depsFor(cfg))
|
||||
Expect(count).NotTo(BeNil())
|
||||
Expect(ceiling).To(Equal(4096), "unset context_size falls back to the backend default, not 0")
|
||||
})
|
||||
|
||||
It("is bounded by the batch when the batch is smaller than the context", func() {
|
||||
// The probe is one decode (n_tokens <= n_batch). A model with a
|
||||
// large context but a small batch can only process the batch — the
|
||||
// ceiling must follow it, not the context.
|
||||
ctx8k := 8192
|
||||
cfg := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx8k}}
|
||||
cfg.Batch = 512
|
||||
_, ceiling := modelTokenTrim("embedder", depsFor(cfg))
|
||||
Expect(ceiling).To(Equal(512), "batch is the binding single-decode limit")
|
||||
})
|
||||
|
||||
It("disables trimming only when no tokenizer is available", func() {
|
||||
count, ceiling := modelTokenTrim("x", ClassifierDeps{ModelLookup: func(string) *config.ModelConfig { return &config.ModelConfig{} }})
|
||||
Expect(count).To(BeNil())
|
||||
Expect(ceiling).To(Equal(0))
|
||||
})
|
||||
})
|
||||
})
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -86,6 +87,12 @@ type ClassifierDeps struct {
|
||||
// templates.Evaluator so any model the operator points at gets
|
||||
// its own chat template applied.
|
||||
Evaluator *templates.Evaluator
|
||||
|
||||
// TokenCounter binds the classifier model's tokenizer for the score
|
||||
// classifier's token-trim path. Optional; nil falls back to the
|
||||
// backend's n_ctx guard. Plain func type so core/application supplies
|
||||
// it as a method value without importing this package.
|
||||
TokenCounter func(modelName string) func(text string) (int, error)
|
||||
}
|
||||
|
||||
// ProbeExtractor pulls the prompt content out of a parsed request so
|
||||
@@ -212,7 +219,6 @@ func recordHTTPDecision(c echo.Context, store router.DecisionStore, result *rout
|
||||
_ = store.Record(context.Background(), result.ToDecisionRecord(newDecisionID(), correlationID, userID, source))
|
||||
}
|
||||
|
||||
|
||||
// GetOrBuildClassifier looks up a built Classifier for the named router
|
||||
// model in the registry and builds it on miss. Exported so the
|
||||
// /api/router/decide decision-oracle endpoint can share the same
|
||||
@@ -262,9 +268,10 @@ func routerConfigFingerprint(rc config.RouterConfig, classifierCfg *config.Model
|
||||
h := fnv.New64a()
|
||||
h.Write(bytes)
|
||||
if classifierCfg != nil {
|
||||
// Narrow projection: only the fields newTemplateRenderer and
|
||||
// firstStopWord actually read. Hashing the whole ModelConfig
|
||||
// would invalidate the cache on irrelevant parameter changes.
|
||||
// Narrow projection: only the fields buildClassifier reads (renderer,
|
||||
// stop tokens, context_size → MaxContextTokens). Hashing the whole
|
||||
// ModelConfig would invalidate the cache on irrelevant changes;
|
||||
// omitting context_size would let a reload leave a stale token budget.
|
||||
h.Write([]byte{0}) // separator so empty fields don't collide
|
||||
h.Write([]byte(classifierCfg.TemplateConfig.Chat))
|
||||
h.Write([]byte{0})
|
||||
@@ -274,6 +281,10 @@ func routerConfigFingerprint(rc config.RouterConfig, classifierCfg *config.Model
|
||||
h.Write([]byte(sw))
|
||||
h.Write([]byte{0})
|
||||
}
|
||||
h.Write([]byte{0})
|
||||
if classifierCfg.ContextSize != nil {
|
||||
h.Write([]byte(strconv.Itoa(*classifierCfg.ContextSize)))
|
||||
}
|
||||
}
|
||||
return h.Sum64()
|
||||
}
|
||||
@@ -319,11 +330,30 @@ func buildClassifier(cfg *config.ModelConfig, deps ClassifierDeps) (router.Class
|
||||
if deps.ModelLookup != nil {
|
||||
if classifierCfg := deps.ModelLookup(rc.ClassifierModel); classifierCfg != nil {
|
||||
if deps.Evaluator != nil {
|
||||
opts.PromptRenderer = newTemplateRenderer(deps.Evaluator, classifierCfg)
|
||||
// The router renders the scoring prompt client-side, so the
|
||||
// classifier model MUST carry a chat template — refusing
|
||||
// here beats silently falling back to a generic ChatML
|
||||
// envelope the model may not have been trained on.
|
||||
renderer := newTemplateRenderer(deps.Evaluator, classifierCfg)
|
||||
if renderer == nil {
|
||||
return nil, fmt.Errorf(
|
||||
"router classifier score: classifier_model %q has no chat template "+
|
||||
"(set template.chat and template.chat_message in its config). The router "+
|
||||
"renders the scoring prompt with the classifier model's own template; "+
|
||||
"without it the prompt format would not match the model",
|
||||
rc.ClassifierModel)
|
||||
}
|
||||
opts.PromptRenderer = renderer
|
||||
}
|
||||
if st := pickAssistantTurnEnd(classifierCfg.StopWords, classifierCfg.TemplateConfig.ChatMessage); st != "" {
|
||||
opts.StopToken = st
|
||||
}
|
||||
// Token-exact conversation trim — score classifier drops the
|
||||
// oldest turns using the model's own tokenizer.
|
||||
if count, ctxTokens := modelTokenTrim(rc.ClassifierModel, deps); count != nil {
|
||||
opts.TokenCounter = count
|
||||
opts.MaxContextTokens = ctxTokens
|
||||
}
|
||||
}
|
||||
}
|
||||
inner = router.NewScoreClassifier(policies, scorer, opts)
|
||||
@@ -335,7 +365,11 @@ func buildClassifier(cfg *config.ModelConfig, deps ClassifierDeps) (router.Class
|
||||
if reranker == nil {
|
||||
return nil, fmt.Errorf("router classifier colbert: classifier_model %q not loadable", rc.ClassifierModel)
|
||||
}
|
||||
inner = router.NewRerankClassifier(policies, reranker, cacheCap, rc.ActivationThreshold)
|
||||
rerankClassifier := router.NewRerankClassifier(policies, reranker, cacheCap, rc.ActivationThreshold)
|
||||
if count, ctxTokens := modelTokenTrim(rc.ClassifierModel, deps); count != nil {
|
||||
rerankClassifier = rerankClassifier.WithTokenTrim(count, ctxTokens)
|
||||
}
|
||||
inner = rerankClassifier
|
||||
default:
|
||||
return nil, fmt.Errorf("router: unknown classifier %q (supported: %s)", name, strings.Join([]string{router.ClassifierScore, router.ClassifierColbert}, ", "))
|
||||
}
|
||||
@@ -523,7 +557,41 @@ func wrapWithEmbeddingCache(cfg *config.ModelConfig, inner router.Classifier, de
|
||||
if vstore == nil {
|
||||
return nil, fmt.Errorf("vector store %q not loadable", storeName)
|
||||
}
|
||||
return router.NewEmbeddingCacheClassifier(inner, embedder, vstore, ec.SimilarityThreshold, ec.ConfidenceThreshold), nil
|
||||
cache := router.NewEmbeddingCacheClassifier(inner, embedder, vstore, ec.SimilarityThreshold, ec.ConfidenceThreshold)
|
||||
// Trim the probe to the embedder model's own context (e.g. nomic-embed at
|
||||
// 8k) rather than a fixed guess — otherwise the cache key is an embedding
|
||||
// of a silently-truncated conversation.
|
||||
if count, ctxTokens := modelTokenTrim(ec.EmbeddingModel, deps); count != nil {
|
||||
cache = cache.WithTokenTrim(count, ctxTokens)
|
||||
}
|
||||
return cache, nil
|
||||
}
|
||||
|
||||
// modelTokenTrim returns a model's own tokenizer and the token ceiling its
|
||||
// probe must fit, or (nil, 0) when no tokenizer is available (only then can we
|
||||
// not trim exactly). The ceiling is min(effective context, effective batch):
|
||||
// score/embed/rerank all decode the whole prompt in one pass, so it must fit
|
||||
// both the context window and a single batch. Using the backend's *effective*
|
||||
// values — not the raw config fields — means trimming still works when
|
||||
// context_size and batch are unset; otherwise a non-trivial prompt overflows
|
||||
// the default window and every classification fails.
|
||||
func modelTokenTrim(modelName string, deps ClassifierDeps) (func(string) (int, error), int) {
|
||||
if deps.TokenCounter == nil || deps.ModelLookup == nil {
|
||||
return nil, 0
|
||||
}
|
||||
cfg := deps.ModelLookup(modelName)
|
||||
if cfg == nil {
|
||||
return nil, 0
|
||||
}
|
||||
count := deps.TokenCounter(modelName)
|
||||
if count == nil {
|
||||
return nil, 0
|
||||
}
|
||||
ceiling := backend.EffectiveContextSize(*cfg)
|
||||
if b := backend.EffectiveBatchSize(*cfg); b < ceiling {
|
||||
ceiling = b
|
||||
}
|
||||
return count, ceiling
|
||||
}
|
||||
|
||||
func newDecisionID() string {
|
||||
@@ -545,6 +613,41 @@ func OpenAIProbe(parsed any) (router.Probe, bool) {
|
||||
return OpenAIProbeFromRequest(req), true
|
||||
}
|
||||
|
||||
// messageText flattens a chat message's Content to plain text.
//
// String content is returned verbatim. Structured []any content contributes
// only its non-empty "text" blocks, joined by a single newline; blocks of any
// other type (e.g. image_url) and malformed blocks are ignored. Any other
// content type, including nil, yields "".
func messageText(content any) string {
	switch ct := content.(type) {
	case string:
		return ct
	case []any:
		var b strings.Builder
		for _, block := range ct {
			bm, ok := block.(map[string]any)
			if !ok || bm["type"] != "text" {
				continue
			}
			t, ok := bm["text"].(string)
			if !ok || t == "" {
				// An empty text block carries nothing; writing a separator
				// for it would leave a dangling or doubled newline.
				continue
			}
			if b.Len() > 0 {
				b.WriteByte('\n')
			}
			b.WriteString(t)
		}
		return b.String()
	}
	return ""
}
|
||||
|
||||
// messageProbeParts returns the non-empty entries of texts in their original
// order. Empty entries (e.g. image-only messages) are dropped so they don't
// consume budget or emit blank lines. The result is a fresh, non-nil slice
// even when nothing survives.
func messageProbeParts(texts []string) []string {
	parts := make([]string, 0, len(texts))
	for _, t := range texts {
		if t == "" {
			continue
		}
		parts = append(parts, t)
	}
	return parts
}
|
||||
|
||||
// OpenAIProbeFromRequest is the typed counterpart of OpenAIProbe — same
|
||||
// extraction logic, but takes the request struct directly. Realtime and
|
||||
// other non-HTTP callers use it to feed a probe to router.Resolve
|
||||
@@ -553,24 +656,15 @@ func OpenAIProbeFromRequest(req *schema.OpenAIRequest) router.Probe {
|
||||
if req == nil {
|
||||
return router.Probe{}
|
||||
}
|
||||
var b strings.Builder
|
||||
texts := make([]string, len(req.Messages))
|
||||
for i := range req.Messages {
|
||||
switch ct := req.Messages[i].Content.(type) {
|
||||
case string:
|
||||
b.WriteString(ct)
|
||||
b.WriteByte('\n')
|
||||
case []any:
|
||||
for _, block := range ct {
|
||||
if bm, ok := block.(map[string]any); ok && bm["type"] == "text" {
|
||||
if t, ok := bm["text"].(string); ok {
|
||||
b.WriteString(t)
|
||||
b.WriteByte('\n')
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
texts[i] = messageText(req.Messages[i].Content)
|
||||
}
|
||||
return router.Probe{Prompt: b.String()}
|
||||
parts := messageProbeParts(texts)
|
||||
// Prompt carries the full conversation; each classifier trims it to its own
|
||||
// model's context (see modelTokenTrim). Messages preserves the per-turn
|
||||
// split the trimmer drops oldest-first.
|
||||
return router.Probe{Prompt: router.JoinTurns(parts), Messages: parts}
|
||||
}
|
||||
|
||||
// AnthropicProbe is the AnthropicRequest analogue of OpenAIProbe.
|
||||
@@ -579,25 +673,10 @@ func AnthropicProbe(parsed any) (router.Probe, bool) {
|
||||
if !ok || req == nil {
|
||||
return router.Probe{}, false
|
||||
}
|
||||
var b strings.Builder
|
||||
texts := make([]string, len(req.Messages))
|
||||
for i := range req.Messages {
|
||||
switch ct := req.Messages[i].Content.(type) {
|
||||
case string:
|
||||
b.WriteString(ct)
|
||||
b.WriteByte('\n')
|
||||
case []any:
|
||||
for _, block := range ct {
|
||||
if bm, ok := block.(map[string]any); ok && bm["type"] == "text" {
|
||||
if t, ok := bm["text"].(string); ok {
|
||||
b.WriteString(t)
|
||||
b.WriteByte('\n')
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
texts[i] = messageText(req.Messages[i].Content)
|
||||
}
|
||||
return router.Probe{
|
||||
Prompt: b.String(),
|
||||
}, true
|
||||
parts := messageProbeParts(texts)
|
||||
return router.Probe{Prompt: router.JoinTurns(parts), Messages: parts}, true
|
||||
}
|
||||
|
||||
|
||||
@@ -246,11 +246,12 @@ var _ = Describe("RouteModel rendered classifier prompt", func() {
|
||||
"rendered prompt must end at assistant-open marker. got: %q", s.lastPrompt)
|
||||
})
|
||||
|
||||
It("falls back to chatMLRenderer when the classifier model has no chat_message template", func() {
|
||||
// Partial template config: only outer Chat, no per-role
|
||||
// piece. The renderer must refuse rather than emit a prompt
|
||||
// that drops the system turn, so the score classifier's
|
||||
// built-in ChatML default takes over.
|
||||
It("refuses to build the router when the classifier model has no chat_message template", func() {
|
||||
// Partial template config: only the outer Chat, no per-role piece.
|
||||
// The router renders the scoring prompt client-side from the
|
||||
// classifier model's own template, so a missing template is a hard
|
||||
// error rather than a silent fall back to a generic ChatML envelope
|
||||
// the model may not have been trained on.
|
||||
writePartialClassifierModel(modelDir, "arch-router")
|
||||
routerCfg := newScoreRouterModel(modelDir, "smart-router")
|
||||
|
||||
@@ -266,19 +267,9 @@ var _ = Describe("RouteModel rendered classifier prompt", func() {
|
||||
ModelLookup: loaderLookup(loader, appConfig),
|
||||
Evaluator: eval,
|
||||
})
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
|
||||
// chatMLRenderer fallback emits its own envelope and still
|
||||
// embeds the routing system prompt. OpenAIProbeFromRequest
|
||||
// appends "\n" after each message body, so the user content
|
||||
// reaches the renderer as "hello world\n" — the substring
|
||||
// match accounts for that.
|
||||
Expect(s.lastPrompt).To(ContainSubstring("<routes>"),
|
||||
"fallback renderer also dropped the system prompt")
|
||||
Expect(s.lastPrompt).To(ContainSubstring("<|im_start|>system\n"))
|
||||
Expect(s.lastPrompt).To(ContainSubstring("<|im_start|>user\nhello world\n<|im_end|>"))
|
||||
Expect(strings.HasSuffix(s.lastPrompt, "<|im_start|>assistant\n")).To(BeTrue(),
|
||||
"chatMLRenderer fallback must end at assistant-open marker. got: %q", s.lastPrompt)
|
||||
Expect(err).To(HaveOccurred())
|
||||
Expect(err.Error()).To(ContainSubstring("no chat template"),
|
||||
"missing classifier template must surface as a clear config error. got: %v", err)
|
||||
})
|
||||
|
||||
It("uses the classifier model's first stopword as the candidate suffix", func() {
|
||||
@@ -533,8 +524,8 @@ template:
|
||||
|
||||
// writePartialClassifierModel writes a classifier model that has the
|
||||
// outer Chat template but no ChatMessage — exercises the
|
||||
// newTemplateRenderer "refuse partial templating" branch that hands
|
||||
// off to chatMLRenderer.
|
||||
// newTemplateRenderer "refuse partial templating" branch, which makes
|
||||
// buildClassifier reject the router with a missing-template error.
|
||||
func writePartialClassifierModel(modelDir, name string) {
|
||||
body := `name: ` + name + `
|
||||
backend: llama-cpp
|
||||
|
||||
Reference in New Issue
Block a user