fix(router): production-ready request router + auto-size batch for embedding/rerank (#10104)

* fix(router): score classifier production-readiness Conversation trimming runs through the classifier model's chat template and trims by exact token count, sized to the model's n_batch which is now scaled to context so long probes can't crash the backend. Missing chat_message templates are a hard error at router build time. Router-facing factories (Embedder/Scorer/Reranker/TokenCounter) re-resolve ModelConfig per call so a model installed post-startup doesn't bind a stub Backend="" config and silently fall into the loader's auto-iterate path. New 'vector_store' backend trace recorded inside localVectorStore on every Search/Insert — including the backend-load-failure path that previously vanished into an xlog.Warn — with outcome tagging (hit/miss/empty_store/backend_load_error/find_error/insert_error/ok). Companion cleanup drops misleading similarity:0 and input_tokens_count:0 from non-hit and text-mode traces. Gallery local-store-development aliases to 'local-store' so the master image satisfies pkg/model.LocalStoreBackend lookups from the embedding cache. Misc: llama-cpp TokenizeString reads the correct 'prompt' JSON key (the original bug); ModelTokenize nil-guard; non-fatal mitm proxy startup; PII 'route_local' renamed to 'allow' with docs/UI in sync; model-editor footer no longer eats the edit area on small screens; several config-editor template/dropdown/section fixes. Tests: e2e router specs (casual/code-hint + long-conversation trim), vector_store trace specs, lazy-factory specs, gallery dev-alias resolution, Playwright trace badge + scroll regression. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * feat(backend): auto-size batch to context for embedding and rerank models Embedding and rerank models pool over the whole input in a single physical batch (n_ubatch). 
With batch left at the 512 default, the backend rejects longer inputs with "input is too large to process", silently capping a large-context embedder (e.g. 8k/32k) at 512 tokens. Size n_batch to the context for these single-pass usecases, mirroring the existing FLAG_SCORE behaviour; an explicit batch: still wins. Extracts EffectiveContextSize/EffectiveBatchSize from grpcModelOpts so the effective decode window has one home for other callers to reuse. Adds an e2e-aio regression test that embeds a >512-token input. The AIO embedding model is switched to nomic-embed-text-v1.5 (2048 context) because the previous granite model was capped at 512 tokens and could not exercise the larger batch. Assisted-by: claude-code:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * fix(gallery): raise arch-router scoring output cap via parallel:64 Scoring decodes the whole prompt+candidate in a single llama_decode and reads one logit row per candidate token. The vendored llama.cpp server caps causal output rows at n_parallel, so the default of 1 aborts with GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) on multi-token route labels. Set options: [parallel:64] on both arch-router quant entries to lift the cap; kv_unified (the grpc-server default) keeps the full context per sequence, so this does not split the KV cache. Assisted-by: claude-code:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-06-13 03:09:03 -04:00 · 2026-06-12 15:21:15 +01:00
parent 56cc4f63fc
commit 085fc53bbc
86 changed files with 2305 additions and 387 deletions
--- a/tests/e2e/e2e_router_test.go
+++ b/tests/e2e/e2e_router_test.go
@@ -0,0 +1,90 @@
+package e2e_test
+
+import (
+	"context"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/openai/openai-go/v3"
+)
+
+// Router e2e: drives /v1/chat/completions through the RouteModel middleware
+// against a configured score classifier (mock-classifier from the suite
+// fixtures) and two candidates. The mock-backend's Score handler ranks
+// candidates by looking for a `ROUTE_HINT=<label>` marker in the prompt and
+// boosting the candidate whose label matches; without a hint, all candidates
+// score equally and the router falls back. The ECHO_SERVED_MODEL trigger
+// makes the chosen candidate echo its loaded model file path so the test can
+// verify routing decisively rather than infer it from content shape.
+var _ = Describe("Router E2E", Label("Router"), func() {
+	chat := func(message string) (*openai.ChatCompletion, error) {
+		return client.Chat.Completions.New(
+			context.TODO(),
+			openai.ChatCompletionNewParams{
+				Model: "smart-router",
+				Messages: []openai.ChatCompletionMessageParamUnion{
+					openai.UserMessage(message),
+				},
+			},
+		)
+	}
+
+	It("routes a casual probe to the casual-chat candidate", func() {
+		resp, err := chat("ROUTE_HINT=casual-chat ECHO_SERVED_MODEL")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(resp.Choices).To(HaveLen(1))
+		Expect(resp.Choices[0].Message.Content).To(ContainSubstring("SERVED_MODEL=mock-cand-casual.bin"),
+			"casual hint should have routed to mock-cand-casual; got %q", resp.Choices[0].Message.Content)
+	})
+
+	It("routes a code probe to the code-generation candidate", func() {
+		resp, err := chat("ROUTE_HINT=code-generation ECHO_SERVED_MODEL")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(resp.Choices).To(HaveLen(1))
+		Expect(resp.Choices[0].Message.Content).To(ContainSubstring("SERVED_MODEL=mock-cand-code.bin"),
+			"code hint should have routed to mock-cand-code; got %q", resp.Choices[0].Message.Content)
+	})
+
+	It("falls back when no policy label matches the probe", func() {
+		// No ROUTE_HINT marker — the mock Score handler gives every candidate
+		// the same base log-prob, softmax goes uniform, no label clears
+		// activation_threshold=0.40, so the router falls back to
+		// mock-cand-casual.
+		resp, err := chat("ECHO_SERVED_MODEL hello world")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(resp.Choices).To(HaveLen(1))
+		Expect(resp.Choices[0].Message.Content).To(ContainSubstring("SERVED_MODEL=mock-cand-casual.bin"),
+			"unhinted probe should have fallen back; got %q", resp.Choices[0].Message.Content)
+	})
+
+	It("routes correctly over a long conversation (exercises fitMessages)", func() {
+		// Build a conversation long enough that the score classifier's
+		// probeTokenBudget kicks in and fitMessages has to trim. mock-backend's
+		// TokenizeString returns ~1 token per 4 prompt characters, and the
+		// classifier ContextSize is 4096, so >40k chars guarantees the trim
+		// path. The ROUTE_HINT marker is placed ONLY in the newest message —
+		// if fitMessages dropped it during trim, no candidate would win and we
+		// would route to the fallback (mock-cand-casual) instead of the code
+		// candidate.
+		filler := strings.Repeat("background context, lorem ipsum dolor sit amet. ", 200) // 48 chars × 200 = 9,600 chars per turn; 5 turns ≈ 48k chars
+		msgs := make([]openai.ChatCompletionMessageParamUnion, 0, 6)
+		for range 5 {
+			msgs = append(msgs, openai.UserMessage(filler))
+		}
+		msgs = append(msgs, openai.UserMessage("ROUTE_HINT=code-generation ECHO_SERVED_MODEL"))
+
+		resp, err := client.Chat.Completions.New(
+			context.TODO(),
+			openai.ChatCompletionNewParams{Model: "smart-router", Messages: msgs},
+		)
+		Expect(err).ToNot(HaveOccurred(), "router must survive a long conversation without erroring")
+		Expect(resp.Choices).To(HaveLen(1))
+		// The newest turn carries the routing intent ("code"); fitMessages must
+		// keep it intact even after dropping older fillers, so the code
+		// candidate still wins.
+		Expect(resp.Choices[0].Message.Content).To(ContainSubstring("SERVED_MODEL=mock-cand-code.bin"),
+			"long-conversation routing must still resolve to the code candidate; got %q",
+			resp.Choices[0].Message.Content)
+	})
+})
--- a/tests/e2e/e2e_suite_test.go
+++ b/tests/e2e/e2e_suite_test.go
@@ -236,6 +236,65 @@ var _ = BeforeSuite(func() {
 	Expect(err).ToNot(HaveOccurred())
 	Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline.yaml"), pipelineData, 0644)).To(Succeed())

+	// Router model setup: a score classifier (mock-backend Score) selects
+	// between two candidate chat models based on an explicit ROUTE_HINT=<label>
+	// marker in the prompt. Exercises the full RouteModel middleware path
+	// — probe extraction, ScoreClassifier.fitMessages (with the classifier's
+	// real TokenizeString and ContextSize wired), Score RPC, and fanout to
+	// the chosen candidate. The classifier MUST carry a chat template, since
+	// buildClassifier now rejects routers whose classifier model has none.
+	chatMLTpl := map[string]any{
+		"chat":         "{{.Input -}}\n<|im_start|>assistant\n",
+		"chat_message": "<|im_start|>{{ .RoleName }}\n{{ if .Content }}{{ .Content }}{{ end }}<|im_end|>",
+	}
+	classifierCfg := map[string]any{
+		"name":           "mock-classifier",
+		"backend":        "mock-backend",
+		"known_usecases": []string{"score"},
+		"context_size":   4096,
+		"stopwords":      []string{"<|im_end|>"},
+		"parameters":     map[string]any{"model": "mock-classifier.bin"},
+		"template":       chatMLTpl,
+	}
+	classifierData, err := yaml.Marshal(classifierCfg)
+	Expect(err).ToNot(HaveOccurred())
+	Expect(os.WriteFile(filepath.Join(modelsPath, "mock-classifier.yaml"), classifierData, 0644)).To(Succeed())
+
+	for _, name := range []string{"mock-cand-casual", "mock-cand-code"} {
+		candCfg := map[string]any{
+			"name":           name,
+			"backend":        "mock-backend",
+			"known_usecases": []string{"chat"},
+			"parameters":     map[string]any{"model": name + ".bin"},
+		}
+		candData, err := yaml.Marshal(candCfg)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(os.WriteFile(filepath.Join(modelsPath, name+".yaml"), candData, 0644)).To(Succeed())
+	}
+
+	routerCfg := map[string]any{
+		"name":           "smart-router",
+		"known_usecases": []string{"chat"},
+		"router": map[string]any{
+			"classifier":           "score",
+			"classifier_model":     "mock-classifier",
+			"activation_threshold": 0.40,
+			"fallback":             "mock-cand-casual",
+			"policies": []map[string]any{
+				{"label": "casual-chat", "description": "small talk and general conversation"},
+				{"label": "code-generation", "description": "writing or debugging code"},
+				{"label": "math-reasoning", "description": "arithmetic and word problems"},
+			},
+			"candidates": []map[string]any{
+				{"model": "mock-cand-casual", "labels": []string{"casual-chat"}},
+				{"model": "mock-cand-code", "labels": []string{"code-generation", "math-reasoning"}},
+			},
+		},
+	}
+	routerData, err := yaml.Marshal(routerCfg)
+	Expect(err).ToNot(HaveOccurred())
+	Expect(os.WriteFile(filepath.Join(modelsPath, "smart-router.yaml"), routerData, 0644)).To(Succeed())
+
 	// If REALTIME_TEST_MODEL=realtime-test-pipeline, auto-create a pipeline
 	// config from the REALTIME_VAD/STT/LLM/TTS env vars so real-model tests
 	// can run without the user having to write a YAML file manually.
--- a/tests/e2e/mock-backend/main.go
+++ b/tests/e2e/mock-backend/main.go
@@ -109,6 +109,23 @@ func (m *MockBackend) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.R
 		}, nil
 	}

+	// ECHO_SERVED_MODEL returns the loaded model file path so router e2e
+	// tests can verify which candidate actually served the request without
+	// adding a new RPC. The router fans out to a single backend process per
+	// candidate, so lastLoadParams.Model is unique per candidate.
+	if strings.Contains(in.Prompt, "ECHO_SERVED_MODEL") {
+		opts := snapshotLoadParams()
+		modelID := ""
+		if opts != nil {
+			modelID = opts.Model
+		}
+		return &pb.Reply{
+			Message:      []byte("SERVED_MODEL=" + modelID),
+			Tokens:       2,
+			PromptTokens: 1,
+		}, nil
+	}
+
 	// Simulate C++ autoparser: tool call via ChatDeltas, empty message
 	if strings.Contains(in.Prompt, "AUTOPARSER_TOOL_CALL") {
 		toolName := mockToolNameFromRequest(in)
@@ -171,7 +188,7 @@ func (m *MockBackend) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.R
 	// Simulate multiple tool calls in a single response (Go-side JSON parser path).
 	if strings.Contains(in.Prompt, "MULTI_TOOL_CALL") {
 		return &pb.Reply{
-			Message:      []byte(`{"name": "get_weather", "arguments": {"location": "Rome"}}
+			Message: []byte(`{"name": "get_weather", "arguments": {"location": "Rome"}}
 {"name": "get_weather", "arguments": {"location": "Paris"}}`),
 			Tokens:       30,
 			PromptTokens: 10,
@@ -540,15 +557,91 @@ func (m *MockBackend) AudioTranscription(ctx context.Context, in *pb.TranscriptR
 }

 func (m *MockBackend) TokenizeString(ctx context.Context, in *pb.PredictOptions) (*pb.TokenizationResponse, error) {
-	xlog.Debug("TokenizeString called", "prompt", in.Prompt)
-	// Return mock token IDs
-	tokens := []int32{101, 2023, 2003, 1037, 3231, 1012}
+	xlog.Debug("TokenizeString called", "prompt_len", len(in.Prompt))
+	// Approximate BPE: ~4 chars/token, minimum 1. Realistic enough for the
+	// router's fitMessages to exercise the budget/rune-pretrim path with
+	// recognisable counts that scale with input size.
+	n := max((len(in.Prompt)+3)/4, 1)
+	tokens := make([]int32, n)
+	for i := range tokens {
+		tokens[i] = int32(i + 1)
+	}
 	return &pb.TokenizationResponse{
-		Length: int32(len(tokens)),
+		Length: int32(n),
 		Tokens: tokens,
 	}, nil
 }

+// Score implements deterministic marker-driven ranking for router e2e
+// tests. The Score RPC receives the full rendered routing prompt (system
+// prompt + chat envelope + user turn), and the system prompt by construction
+// lists every policy label — so any keyword-against-prompt heuristic would
+// match every candidate. Instead we look for an explicit `ROUTE_HINT=<label>`
+// marker, which only appears when a test deliberately places one in a user
+// message. The candidate whose extracted label equals the hint gets a large
+// log-prob boost; all others stay at the base. With no hint, every candidate
+// scores equally, softmax is uniform, and (with a sensible activation
+// threshold) the router falls back.
+func (m *MockBackend) Score(ctx context.Context, in *pb.ScoreRequest) (*pb.ScoreResponse, error) {
+	xlog.Debug("Score called", "candidates", len(in.Candidates))
+	hint := extractRouteHint(in.Prompt)
+	out := &pb.ScoreResponse{Candidates: make([]*pb.CandidateScore, len(in.Candidates))}
+	for i, c := range in.Candidates {
+		label := extractRouteLabel(c)
+		// Base -5 (relative weight e^-5 ≈ 0.007); a hint match is lifted by +5 to 0, so it takes ≈ 0.99 of the mass against one or two base-scored rivals (assuming the consumer applies a plain softmax over LogProb).
+		logProb := -5.0
+		if hint != "" && label == hint { // the hint != "" guard stops an unparseable candidate (label "") from matching a hint-less prompt
+			logProb = 0.0
+		}
+		// num_tokens matches TokenizeString's heuristic so per-token mean
+		// log-prob consumers see consistent values.
+		nTok := max((len(c)+3)/4, 1)
+		out.Candidates[i] = &pb.CandidateScore{
+			LogProb:                 logProb,
+			NumTokens:               int32(nTok),
+			LengthNormalizedLogProb: logProb / float64(nTok),
+		}
+	}
+	return out, nil
+}
+
+// extractRouteHint returns the label after the LAST occurrence of
+// `ROUTE_HINT=` in the prompt, terminated by whitespace, `<`, or end-of-string
+// (`<` presumably stops at a template marker such as `<|im_end|>` that directly
+// follows the message content). Using the last occurrence makes the marker stable across long
+// conversations: the *newest* user message's hint wins, mirroring how the router's fitMessages keeps the newest turn whole.
+func extractRouteHint(prompt string) string {
+	const key = "ROUTE_HINT="
+	i := strings.LastIndex(prompt, key) // last marker wins over any earlier ones
+	if i < 0 {
+		return "" // no marker anywhere in the prompt
+	}
+	rest := prompt[i+len(key):]
+	end := strings.IndexAny(rest, " \t\r\n<") // first terminator after the label
+	if end < 0 {
+		return rest // label runs to end-of-string
+	}
+	return rest[:end]
+}
+
+// extractRouteLabel returns the label inside `{"route": "<label>"}`. Returns
+// "" on any shape it doesn't recognise — the caller treats that as a no-match.
+func extractRouteLabel(candidate string) string {
+	_, rest, ok := strings.Cut(candidate, `"route"`) // plain substring match, not JSON parsing: skip past the key
+	if !ok {
+		return ""
+	}
+	_, rest, ok = strings.Cut(rest, `"`) // skip the separator up to the value's opening quote
+	if !ok {
+		return ""
+	}
+	label, _, ok := strings.Cut(rest, `"`) // take everything up to the next quote; escaped quotes are not handled
+	if !ok {
+		return ""
+	}
+	return label
+}
+
 func (m *MockBackend) Status(ctx context.Context, in *pb.HealthMessage) (*pb.StatusResponse, error) {
 	xlog.Debug("Status called")
 	return &pb.StatusResponse{