fix(router): production-ready request router + auto-size batch for embedding/rerank (#10104)

* fix(router): score classifier production-readiness Conversation trimming runs through the classifier model's chat template and trims by exact token count, sized to the model's n_batch which is now scaled to context so long probes can't crash the backend. Missing chat_message templates are a hard error at router build time. Router-facing factories (Embedder/Scorer/Reranker/TokenCounter) re-resolve ModelConfig per call so a model installed post-startup doesn't bind a stub Backend="" config and silently fall into the loader's auto-iterate path. New 'vector_store' backend trace recorded inside localVectorStore on every Search/Insert — including the backend-load-failure path that previously vanished into an xlog.Warn — with outcome tagging (hit/miss/empty_store/backend_load_error/find_error/insert_error/ok). Companion cleanup drops misleading similarity:0 and input_tokens_count:0 from non-hit and text-mode traces. Gallery local-store-development aliases to 'local-store' so the master image satisfies pkg/model.LocalStoreBackend lookups from the embedding cache. Misc: llama-cpp TokenizeString reads the correct 'prompt' JSON key (the original bug); ModelTokenize nil-guard; non-fatal mitm proxy startup; PII 'route_local' renamed to 'allow' with docs/UI in sync; model-editor footer no longer eats the edit area on small screens; several config-editor template/dropdown/section fixes. Tests: e2e router specs (casual/code-hint + long-conversation trim), vector_store trace specs, lazy-factory specs, gallery dev-alias resolution, Playwright trace badge + scroll regression. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * feat(backend): auto-size batch to context for embedding and rerank models Embedding and rerank models pool over the whole input in a single physical batch (n_ubatch). 
With batch left at the 512 default, the backend rejects longer inputs with "input is too large to process", silently capping a large-context embedder (e.g. 8k/32k) at 512 tokens. Size n_batch to the context for these single-pass usecases, mirroring the existing FLAG_SCORE behaviour; an explicit batch: still wins. Extracts EffectiveContextSize/EffectiveBatchSize from grpcModelOpts so the effective decode window has one home for other callers to reuse. Adds an e2e-aio regression test that embeds a >512-token input. The AIO embedding model is switched to nomic-embed-text-v1.5 (2048 context) because the previous granite model was capped at 512 tokens and could not exercise the larger batch. Assisted-by: claude-code:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * fix(gallery): raise arch-router scoring output cap via parallel:64 Scoring decodes the whole prompt+candidate in a single llama_decode and reads one logit row per candidate token. The vendored llama.cpp server caps causal output rows at n_parallel, so the default of 1 aborts with GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) on multi-token route labels. Set options: [parallel:64] on both arch-router quant entries to lift the cap; kv_unified (the grpc-server default) keeps the full context per sequence, so this does not split the KV cache. Assisted-by: claude-code:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-06-13 03:09:03 -04:00 · 2026-06-12 15:21:15 +01:00
parent 56cc4f63fc
commit 085fc53bbc
86 changed files with 2305 additions and 387 deletions
--- a/core/http/middleware/probe_trim_test.go
+++ b/core/http/middleware/probe_trim_test.go
@@ -0,0 +1,139 @@
+package middleware
+
+import (
+	"strings"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("routerConfigFingerprint", func() {
+	rc := config.RouterConfig{Classifier: "score", ClassifierModel: "arch-router"}
+	ctx4096 := 4096
+	ctx8192 := 8192
+
+	// Regression: the score classifier bakes context_size into its token
+	// budget at build time, and the built classifier is cached by this
+	// fingerprint. If context_size weren't hashed, editing it and reloading
+	// would return a classifier carrying the stale budget.
+	It("changes when the classifier model's context_size changes", func() {
+		cfgA := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx4096}}
+		cfgB := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx8192}}
+		Expect(routerConfigFingerprint(rc, cfgA)).NotTo(Equal(routerConfigFingerprint(rc, cfgB)))
+	})
+
+	It("is stable for identical classifier configs", func() {
+		cfgA := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx4096}}
+		cfgB := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx4096}}
+		Expect(routerConfigFingerprint(rc, cfgA)).To(Equal(routerConfigFingerprint(rc, cfgB)))
+	})
+})
+
+var _ = Describe("routing probe extraction and trimming", func() {
+	Describe("OpenAIProbeFromRequest", func() {
+		It("keeps a short conversation intact, newline-terminated per message", func() {
+			req := &schema.OpenAIRequest{Messages: []schema.Message{
+				{Role: "user", Content: "first"},
+				{Role: "assistant", Content: "second"},
+				{Role: "user", Content: "third"},
+			}}
+			Expect(OpenAIProbeFromRequest(req).Prompt).To(Equal("first\nsecond\nthird\n"))
+		})
+
+		It("flattens text blocks and skips image-only messages", func() {
+			req := &schema.OpenAIRequest{Messages: []schema.Message{
+				{Role: "user", Content: []any{
+					map[string]any{"type": "text", "text": "describe this"},
+					map[string]any{"type": "image_url", "image_url": map[string]any{"url": "data:..."}},
+				}},
+				{Role: "user", Content: []any{
+					map[string]any{"type": "image_url", "image_url": map[string]any{"url": "data:..."}},
+				}},
+			}}
+			// Second message contributes no text, so it neither adds a blank
+			// line nor a stray newline.
+			Expect(OpenAIProbeFromRequest(req).Prompt).To(Equal("describe this\n"))
+		})
+
+		It("carries the full conversation untrimmed — trimming is each classifier's job", func() {
+			// The middleware no longer caps the probe by a fixed rune budget;
+			// every turn reaches the Probe and each classifier trims to its own
+			// model's context (see modelTokenTrim / promptTrimmer).
+			block := strings.Repeat("x", 999)
+			msgs := make([]schema.Message, 0, 20)
+			msgs = append(msgs, schema.Message{Role: "user", Content: "OLDEST" + strings.Repeat("o", 994)})
+			for range 18 {
+				msgs = append(msgs, schema.Message{Role: "user", Content: block})
+			}
+			msgs = append(msgs, schema.Message{Role: "user", Content: "NEWEST" + strings.Repeat("n", 994)})
+
+			probe := OpenAIProbeFromRequest(&schema.OpenAIRequest{Messages: msgs})
+			Expect(probe.Prompt).To(ContainSubstring("OLDEST"), "no turn is dropped at probe-build time")
+			Expect(probe.Prompt).To(ContainSubstring("NEWEST"))
+			// Messages preserves the per-turn split the classifier trims from.
+			Expect(probe.Messages).To(HaveLen(20))
+			Expect(probe.Messages[0]).To(ContainSubstring("OLDEST"))
+			Expect(probe.Messages[19]).To(ContainSubstring("NEWEST"))
+		})
+	})
+
+	Describe("AnthropicProbe", func() {
+		It("extracts and trims the same way as the OpenAI path", func() {
+			req := &schema.AnthropicRequest{Messages: []schema.AnthropicMessage{
+				{Role: "user", Content: "alpha"},
+				{Role: "assistant", Content: []any{
+					map[string]any{"type": "text", "text": "beta"},
+				}},
+			}}
+			probe, ok := AnthropicProbe(req)
+			Expect(ok).To(BeTrue())
+			Expect(probe.Prompt).To(Equal("alpha\nbeta\n"))
+		})
+
+		It("returns ok=false for a non-Anthropic payload", func() {
+			_, ok := AnthropicProbe(&schema.OpenAIRequest{})
+			Expect(ok).To(BeFalse())
+		})
+	})
+
+	Describe("modelTokenTrim", func() {
+		tok := func(string) (int, error) { return 1, nil }
+		depsFor := func(cfg *config.ModelConfig) ClassifierDeps {
+			return ClassifierDeps{
+				ModelLookup:  func(string) *config.ModelConfig { return cfg },
+				TokenCounter: func(string) func(string) (int, error) { return tok },
+			}
+		}
+
+		It("still trims to the backend default when context_size is unset", func() {
+			// Regression: with the fixed middleware rune cap gone, an unset
+			// context_size must NOT disable trimming — otherwise a non-trivial
+			// prompt overflows the default 4096 window and every score fails.
+			score := config.FLAG_SCORE
+			cfg := &config.ModelConfig{KnownUsecases: &score} // FLAG_SCORE → batch follows context
+			count, ceiling := modelTokenTrim("classifier", depsFor(cfg))
+			Expect(count).NotTo(BeNil())
+			Expect(ceiling).To(Equal(4096), "unset context_size falls back to the backend default, not 0")
+		})
+
+		It("is bounded by the batch when the batch is smaller than the context", func() {
+			// The probe is one decode (n_tokens <= n_batch). A model with a
+			// large context but a small batch can only process the batch — the
+			// ceiling must follow it, not the context.
+			ctx8k := 8192
+			cfg := &config.ModelConfig{LLMConfig: config.LLMConfig{ContextSize: &ctx8k}}
+			cfg.Batch = 512
+			_, ceiling := modelTokenTrim("embedder", depsFor(cfg))
+			Expect(ceiling).To(Equal(512), "batch is the binding single-decode limit")
+		})
+
+		It("disables trimming only when no tokenizer is available", func() {
+			count, ceiling := modelTokenTrim("x", ClassifierDeps{ModelLookup: func(string) *config.ModelConfig { return &config.ModelConfig{} }})
+			Expect(count).To(BeNil())
+			Expect(ceiling).To(Equal(0))
+		})
+	})
+})
--- a/core/http/middleware/route_model.go
+++ b/core/http/middleware/route_model.go
@@ -6,6 +6,7 @@ import (
 	"encoding/hex"
 	"fmt"
 	"hash/fnv"
+	"strconv"
 	"strings"
 	"time"

@@ -86,6 +87,12 @@ type ClassifierDeps struct {
 	// templates.Evaluator so any model the operator points at gets
 	// its own chat template applied.
 	Evaluator *templates.Evaluator
+
+	// TokenCounter binds the classifier model's tokenizer for the score
+	// classifier's token-trim path. Optional; nil falls back to the
+	// backend's n_ctx guard. Plain func type so core/application supplies
+	// it as a method value without importing this package.
+	TokenCounter func(modelName string) func(text string) (int, error)
 }

 // ProbeExtractor pulls the prompt content out of a parsed request so
@@ -212,7 +219,6 @@ func recordHTTPDecision(c echo.Context, store router.DecisionStore, result *rout
 	_ = store.Record(context.Background(), result.ToDecisionRecord(newDecisionID(), correlationID, userID, source))
 }

-
 // GetOrBuildClassifier looks up a built Classifier for the named router
 // model in the registry and builds it on miss. Exported so the
 // /api/router/decide decision-oracle endpoint can share the same
@@ -262,9 +268,10 @@ func routerConfigFingerprint(rc config.RouterConfig, classifierCfg *config.Model
 	h := fnv.New64a()
 	h.Write(bytes)
 	if classifierCfg != nil {
-		// Narrow projection: only the fields newTemplateRenderer and
-		// firstStopWord actually read. Hashing the whole ModelConfig
-		// would invalidate the cache on irrelevant parameter changes.
+		// Narrow projection: only the fields buildClassifier reads (renderer,
+		// stop tokens, context_size → MaxContextTokens). Hashing the whole
+		// ModelConfig would invalidate the cache on irrelevant changes;
+		// omitting context_size would let a reload leave a stale token budget.
 		h.Write([]byte{0}) // separator so empty fields don't collide
 		h.Write([]byte(classifierCfg.TemplateConfig.Chat))
 		h.Write([]byte{0})
@@ -274,6 +281,10 @@ func routerConfigFingerprint(rc config.RouterConfig, classifierCfg *config.Model
 			h.Write([]byte(sw))
 			h.Write([]byte{0})
 		}
+		h.Write([]byte{0})
+		if classifierCfg.ContextSize != nil {
+			h.Write([]byte(strconv.Itoa(*classifierCfg.ContextSize)))
+		}
 	}
 	return h.Sum64()
 }
@@ -319,11 +330,30 @@ func buildClassifier(cfg *config.ModelConfig, deps ClassifierDeps) (router.Class
 		if deps.ModelLookup != nil {
 			if classifierCfg := deps.ModelLookup(rc.ClassifierModel); classifierCfg != nil {
 				if deps.Evaluator != nil {
-					opts.PromptRenderer = newTemplateRenderer(deps.Evaluator, classifierCfg)
+					// The router renders the scoring prompt client-side, so the
+					// classifier model MUST carry a chat template — refusing
+					// here beats silently falling back to a generic ChatML
+					// envelope the model may not have been trained on.
+					renderer := newTemplateRenderer(deps.Evaluator, classifierCfg)
+					if renderer == nil {
+						return nil, fmt.Errorf(
+							"router classifier score: classifier_model %q has no chat template "+
+								"(set template.chat and template.chat_message in its config). The router "+
+								"renders the scoring prompt with the classifier model's own template; "+
+								"without it the prompt format would not match the model",
+							rc.ClassifierModel)
+					}
+					opts.PromptRenderer = renderer
 				}
 				if st := pickAssistantTurnEnd(classifierCfg.StopWords, classifierCfg.TemplateConfig.ChatMessage); st != "" {
 					opts.StopToken = st
 				}
+				// Token-exact conversation trim — score classifier drops the
+				// oldest turns using the model's own tokenizer.
+				if count, ctxTokens := modelTokenTrim(rc.ClassifierModel, deps); count != nil {
+					opts.TokenCounter = count
+					opts.MaxContextTokens = ctxTokens
+				}
 			}
 		}
 		inner = router.NewScoreClassifier(policies, scorer, opts)
@@ -335,7 +365,11 @@ func buildClassifier(cfg *config.ModelConfig, deps ClassifierDeps) (router.Class
 		if reranker == nil {
 			return nil, fmt.Errorf("router classifier colbert: classifier_model %q not loadable", rc.ClassifierModel)
 		}
-		inner = router.NewRerankClassifier(policies, reranker, cacheCap, rc.ActivationThreshold)
+		rerankClassifier := router.NewRerankClassifier(policies, reranker, cacheCap, rc.ActivationThreshold)
+		if count, ctxTokens := modelTokenTrim(rc.ClassifierModel, deps); count != nil {
+			rerankClassifier = rerankClassifier.WithTokenTrim(count, ctxTokens)
+		}
+		inner = rerankClassifier
 	default:
 		return nil, fmt.Errorf("router: unknown classifier %q (supported: %s)", name, strings.Join([]string{router.ClassifierScore, router.ClassifierColbert}, ", "))
 	}
@@ -523,7 +557,41 @@ func wrapWithEmbeddingCache(cfg *config.ModelConfig, inner router.Classifier, de
 	if vstore == nil {
 		return nil, fmt.Errorf("vector store %q not loadable", storeName)
 	}
-	return router.NewEmbeddingCacheClassifier(inner, embedder, vstore, ec.SimilarityThreshold, ec.ConfidenceThreshold), nil
+	cache := router.NewEmbeddingCacheClassifier(inner, embedder, vstore, ec.SimilarityThreshold, ec.ConfidenceThreshold)
+	// Trim the probe to the embedder model's own context (e.g. nomic-embed at
+	// 8k) rather than a fixed guess — otherwise the cache key is an embedding
+	// of a silently-truncated conversation.
+	if count, ctxTokens := modelTokenTrim(ec.EmbeddingModel, deps); count != nil {
+		cache = cache.WithTokenTrim(count, ctxTokens)
+	}
+	return cache, nil
+}
+
+// modelTokenTrim returns a model's own tokenizer and the token ceiling its
+// probe must fit. It returns (nil, 0) — telling the caller to skip token-exact
+// trimming — when deps carries no TokenCounter or ModelLookup, when the model's
+// config cannot be resolved, or when no tokenizer is bound for the model.
+// The ceiling is min(effective context, effective batch): score/embed/rerank
+// all decode the whole prompt in one pass, so it must fit both the context
+// window and a single batch. Using the backend's *effective* values — not the
+// raw config fields — means trimming still works when context_size and batch
+// are unset; otherwise a non-trivial prompt overflows the default window and
+// every classification fails.
+func modelTokenTrim(modelName string, deps ClassifierDeps) (func(string) (int, error), int) {
+	if deps.TokenCounter == nil || deps.ModelLookup == nil {
+		return nil, 0
+	}
+	cfg := deps.ModelLookup(modelName)
+	if cfg == nil {
+		return nil, 0
+	}
+	count := deps.TokenCounter(modelName)
+	if count == nil {
+		return nil, 0
+	}
+	ceiling := backend.EffectiveContextSize(*cfg)
+	if b := backend.EffectiveBatchSize(*cfg); b < ceiling {
+		ceiling = b
+	}
+	return count, ceiling
 }

 func newDecisionID() string {
@@ -545,6 +613,41 @@ func OpenAIProbe(parsed any) (router.Probe, bool) {
 	return OpenAIProbeFromRequest(req), true
 }

+// messageText flattens a chat message's Content to plain text: string content
+// verbatim; []any structured content contributes only its "text" blocks.
+func messageText(content any) string {
+	switch ct := content.(type) {
+	case string:
+		return ct
+	case []any:
+		var b strings.Builder
+		for _, block := range ct {
+			if bm, ok := block.(map[string]any); ok && bm["type"] == "text" {
+				if t, ok := bm["text"].(string); ok {
+					if b.Len() > 0 {
+						b.WriteByte('\n')
+					}
+					b.WriteString(t)
+				}
+			}
+		}
+		return b.String()
+	}
+	return ""
+}
+
+// messageProbeParts drops empty (e.g. image-only) messages so they don't
+// consume budget or emit blank lines.
+func messageProbeParts(texts []string) []string {
+	parts := make([]string, 0, len(texts))
+	for _, t := range texts {
+		if t != "" {
+			parts = append(parts, t)
+		}
+	}
+	return parts
+}
+
 // OpenAIProbeFromRequest is the typed counterpart of OpenAIProbe — same
 // extraction logic, but takes the request struct directly. Realtime and
 // other non-HTTP callers use it to feed a probe to router.Resolve
@@ -553,24 +656,15 @@ func OpenAIProbeFromRequest(req *schema.OpenAIRequest) router.Probe {
 	if req == nil {
 		return router.Probe{}
 	}
-	var b strings.Builder
+	texts := make([]string, len(req.Messages))
 	for i := range req.Messages {
-		switch ct := req.Messages[i].Content.(type) {
-		case string:
-			b.WriteString(ct)
-			b.WriteByte('\n')
-		case []any:
-			for _, block := range ct {
-				if bm, ok := block.(map[string]any); ok && bm["type"] == "text" {
-					if t, ok := bm["text"].(string); ok {
-						b.WriteString(t)
-						b.WriteByte('\n')
-					}
-				}
-			}
-		}
+		texts[i] = messageText(req.Messages[i].Content)
 	}
-	return router.Probe{Prompt: b.String()}
+	parts := messageProbeParts(texts)
+	// Prompt carries the full conversation; each classifier trims it to its own
+	// model's context (see modelTokenTrim). Messages preserves the per-turn
+	// split the trimmer drops oldest-first.
+	return router.Probe{Prompt: router.JoinTurns(parts), Messages: parts}
 }

 // AnthropicProbe is the AnthropicRequest analogue of OpenAIProbe.
@@ -579,25 +673,10 @@ func AnthropicProbe(parsed any) (router.Probe, bool) {
 	if !ok || req == nil {
 		return router.Probe{}, false
 	}
-	var b strings.Builder
+	texts := make([]string, len(req.Messages))
 	for i := range req.Messages {
-		switch ct := req.Messages[i].Content.(type) {
-		case string:
-			b.WriteString(ct)
-			b.WriteByte('\n')
-		case []any:
-			for _, block := range ct {
-				if bm, ok := block.(map[string]any); ok && bm["type"] == "text" {
-					if t, ok := bm["text"].(string); ok {
-						b.WriteString(t)
-						b.WriteByte('\n')
-					}
-				}
-			}
-		}
+		texts[i] = messageText(req.Messages[i].Content)
 	}
-	return router.Probe{
-		Prompt: b.String(),
-	}, true
+	parts := messageProbeParts(texts)
+	return router.Probe{Prompt: router.JoinTurns(parts), Messages: parts}, true
 }
-
--- a/core/http/middleware/route_model_test.go
+++ b/core/http/middleware/route_model_test.go
@@ -246,11 +246,12 @@ var _ = Describe("RouteModel rendered classifier prompt", func() {
 			"rendered prompt must end at assistant-open marker. got: %q", s.lastPrompt)
 	})

-	It("falls back to chatMLRenderer when the classifier model has no chat_message template", func() {
-		// Partial template config: only outer Chat, no per-role
-		// piece. The renderer must refuse rather than emit a prompt
-		// that drops the system turn, so the score classifier's
-		// built-in ChatML default takes over.
+	It("refuses to build the router when the classifier model has no chat_message template", func() {
+		// Partial template config: only the outer Chat, no per-role piece.
+		// The router renders the scoring prompt client-side from the
+		// classifier model's own template, so a missing template is a hard
+		// error rather than a silent fall back to a generic ChatML envelope
+		// the model may not have been trained on.
 		writePartialClassifierModel(modelDir, "arch-router")
 		routerCfg := newScoreRouterModel(modelDir, "smart-router")

@@ -266,19 +267,9 @@ var _ = Describe("RouteModel rendered classifier prompt", func() {
 				ModelLookup: loaderLookup(loader, appConfig),
 				Evaluator:   eval,
 			})
-		Expect(err).NotTo(HaveOccurred())
-
-		// chatMLRenderer fallback emits its own envelope and still
-		// embeds the routing system prompt. OpenAIProbeFromRequest
-		// appends "\n" after each message body, so the user content
-		// reaches the renderer as "hello world\n" — the substring
-		// match accounts for that.
-		Expect(s.lastPrompt).To(ContainSubstring("<routes>"),
-			"fallback renderer also dropped the system prompt")
-		Expect(s.lastPrompt).To(ContainSubstring("<|im_start|>system\n"))
-		Expect(s.lastPrompt).To(ContainSubstring("<|im_start|>user\nhello world\n<|im_end|>"))
-		Expect(strings.HasSuffix(s.lastPrompt, "<|im_start|>assistant\n")).To(BeTrue(),
-			"chatMLRenderer fallback must end at assistant-open marker. got: %q", s.lastPrompt)
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("no chat template"),
+			"missing classifier template must surface as a clear config error. got: %v", err)
 	})

 	It("uses the classifier model's first stopword as the candidate suffix", func() {
@@ -533,8 +524,8 @@ template:

 // writePartialClassifierModel writes a classifier model that has the
 // outer Chat template but no ChatMessage — exercises the
-// newTemplateRenderer "refuse partial templating" branch that hands
-// off to chatMLRenderer.
+// newTemplateRenderer "refuse partial templating" branch, which makes
+// buildClassifier reject the router with a missing-template error.
 func writePartialClassifierModel(modelDir, name string) {
 	body := `name: ` + name + `
 backend: llama-cpp