mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-13 03:09:03 -04:00
fix(router): production-ready request router + auto-size batch for embedding/rerank (#10104)
* fix(router): score classifier production-readiness Conversation trimming runs through the classifier model's chat template and trims by exact token count, sized to the model's n_batch which is now scaled to context so long probes can't crash the backend. Missing chat_message templates are a hard error at router build time. Router-facing factories (Embedder/Scorer/Reranker/TokenCounter) re-resolve ModelConfig per call so a model installed post-startup doesn't bind a stub Backend="" config and silently fall into the loader's auto-iterate path. New 'vector_store' backend trace recorded inside localVectorStore on every Search/Insert — including the backend-load-failure path that previously vanished into an xlog.Warn — with outcome tagging (hit/miss/empty_store/backend_load_error/find_error/insert_error/ok). Companion cleanup drops misleading similarity:0 and input_tokens_count:0 from non-hit and text-mode traces. Gallery local-store-development aliases to 'local-store' so the master image satisfies pkg/model.LocalStoreBackend lookups from the embedding cache. Misc: llama-cpp TokenizeString reads the correct 'prompt' JSON key (the original bug); ModelTokenize nil-guard; non-fatal mitm proxy startup; PII 'route_local' renamed to 'allow' with docs/UI in sync; model-editor footer no longer eats the edit area on small screens; several config-editor template/dropdown/section fixes. Tests: e2e router specs (casual/code-hint + long-conversation trim), vector_store trace specs, lazy-factory specs, gallery dev-alias resolution, Playwright trace badge + scroll regression. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * feat(backend): auto-size batch to context for embedding and rerank models Embedding and rerank models pool over the whole input in a single physical batch (n_ubatch). 
With batch left at the 512 default, the backend rejects longer inputs with "input is too large to process", silently capping a large-context embedder (e.g. 8k/32k) at 512 tokens. Size n_batch to the context for these single-pass usecases, mirroring the existing FLAG_SCORE behaviour; an explicit batch: still wins. Extracts EffectiveContextSize/EffectiveBatchSize from grpcModelOpts so the effective decode window has one home for other callers to reuse. Adds an e2e-aio regression test that embeds a >512-token input. The AIO embedding model is switched to nomic-embed-text-v1.5 (2048 context) because the previous granite model was capped at 512 tokens and could not exercise the larger batch. Assisted-by: claude-code:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * fix(gallery): raise arch-router scoring output cap via parallel:64 Scoring decodes the whole prompt+candidate in a single llama_decode and reads one logit row per candidate token. The vendored llama.cpp server caps causal output rows at n_parallel, so the default of 1 aborts with GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) on multi-token route labels. Set options: [parallel:64] on both arch-router quant entries to lift the cap; kv_unified (the grpc-server default) keeps the full context per sequence, so this does not split the KV cache. Assisted-by: claude-code:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com>
This commit is contained in:
committed by
GitHub
parent
56cc4f63fc
commit
085fc53bbc
90
tests/e2e/e2e_router_test.go
Normal file
90
tests/e2e/e2e_router_test.go
Normal file
@@ -0,0 +1,90 @@
|
||||
package e2e_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
"github.com/openai/openai-go/v3"
|
||||
)
|
||||
|
||||
// Router e2e: drives /v1/chat/completions through the RouteModel middleware
|
||||
// against a configured score classifier (mock-classifier from the suite
|
||||
// fixtures) and two candidates. The mock-backend's Score handler ranks
|
||||
// candidates by looking for a `ROUTE_HINT=<label>` marker in the prompt and
|
||||
// boosting the candidate whose label matches; without a hint, all candidates
|
||||
// score equally and the router falls back. The ECHO_SERVED_MODEL trigger
|
||||
// makes the chosen candidate echo its loaded model file path so the test can
|
||||
// verify routing decisively rather than infer it from content shape.
|
||||
var _ = Describe("Router E2E", Label("Router"), func() {
|
||||
chat := func(message string) (*openai.ChatCompletion, error) {
|
||||
return client.Chat.Completions.New(
|
||||
context.TODO(),
|
||||
openai.ChatCompletionNewParams{
|
||||
Model: "smart-router",
|
||||
Messages: []openai.ChatCompletionMessageParamUnion{
|
||||
openai.UserMessage(message),
|
||||
},
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
It("routes a casual probe to the casual-chat candidate", func() {
|
||||
resp, err := chat("ROUTE_HINT=casual-chat ECHO_SERVED_MODEL")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(resp.Choices).To(HaveLen(1))
|
||||
Expect(resp.Choices[0].Message.Content).To(ContainSubstring("SERVED_MODEL=mock-cand-casual.bin"),
|
||||
"casual hint should have routed to mock-cand-casual; got %q", resp.Choices[0].Message.Content)
|
||||
})
|
||||
|
||||
It("routes a code probe to the code-generation candidate", func() {
|
||||
resp, err := chat("ROUTE_HINT=code-generation ECHO_SERVED_MODEL")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(resp.Choices).To(HaveLen(1))
|
||||
Expect(resp.Choices[0].Message.Content).To(ContainSubstring("SERVED_MODEL=mock-cand-code.bin"),
|
||||
"code hint should have routed to mock-cand-code; got %q", resp.Choices[0].Message.Content)
|
||||
})
|
||||
|
||||
It("falls back when no policy label matches the probe", func() {
|
||||
// No ROUTE_HINT marker — the mock Score handler gives every candidate
|
||||
// the same base log-prob, softmax goes uniform, no label clears
|
||||
// activation_threshold=0.40, so the router falls back to
|
||||
// mock-cand-casual.
|
||||
resp, err := chat("ECHO_SERVED_MODEL hello world")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(resp.Choices).To(HaveLen(1))
|
||||
Expect(resp.Choices[0].Message.Content).To(ContainSubstring("SERVED_MODEL=mock-cand-casual.bin"),
|
||||
"unhinted probe should have fallen back; got %q", resp.Choices[0].Message.Content)
|
||||
})
|
||||
|
||||
It("routes correctly over a long conversation (exercises fitMessages)", func() {
|
||||
// Build a conversation long enough that the score classifier's
|
||||
// probeTokenBudget kicks in and fitMessages has to trim. mock-backend's
|
||||
// TokenizeString returns ~1 token per 4 prompt characters, and the
|
||||
// classifier ContextSize is 4096, so >40k chars guarantees the trim
|
||||
// path. The ROUTE_HINT marker is placed ONLY in the newest message —
|
||||
// if fitMessages dropped it during trim, no candidate would win and we
|
||||
// would route to the fallback (mock-cand-casual) instead of the code
|
||||
// candidate.
|
||||
filler := strings.Repeat("background context, lorem ipsum dolor sit amet. ", 200) // ~10k chars × 5 turns
|
||||
msgs := make([]openai.ChatCompletionMessageParamUnion, 0, 6)
|
||||
for range 5 {
|
||||
msgs = append(msgs, openai.UserMessage(filler))
|
||||
}
|
||||
msgs = append(msgs, openai.UserMessage("ROUTE_HINT=code-generation ECHO_SERVED_MODEL"))
|
||||
|
||||
resp, err := client.Chat.Completions.New(
|
||||
context.TODO(),
|
||||
openai.ChatCompletionNewParams{Model: "smart-router", Messages: msgs},
|
||||
)
|
||||
Expect(err).ToNot(HaveOccurred(), "router must survive a long conversation without erroring")
|
||||
Expect(resp.Choices).To(HaveLen(1))
|
||||
// The newest turn carries the routing intent ("code"); fitMessages must
|
||||
// keep it intact even after dropping older fillers, so the code
|
||||
// candidate still wins.
|
||||
Expect(resp.Choices[0].Message.Content).To(ContainSubstring("SERVED_MODEL=mock-cand-code.bin"),
|
||||
"long-conversation routing must still resolve to the code candidate; got %q",
|
||||
resp.Choices[0].Message.Content)
|
||||
})
|
||||
})
|
||||
@@ -236,6 +236,65 @@ var _ = BeforeSuite(func() {
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(os.WriteFile(filepath.Join(modelsPath, "realtime-pipeline.yaml"), pipelineData, 0644)).To(Succeed())
|
||||
|
||||
// Router model setup: a score classifier (mock-backend Score) selects
// between two candidate chat models. The mock Score handler boosts the
// candidate whose route label equals an explicit `ROUTE_HINT=<label>` marker
// in the prompt; it does not keyword-match against the labels, because the
// routing prompt lists every policy label. Exercises the full RouteModel
// middleware path — probe extraction, ScoreClassifier.fitMessages (with the
// classifier's real TokenizeString and ContextSize wired), Score RPC, and
// fanout to the chosen candidate. The classifier MUST carry a chat template,
// since buildClassifier now rejects routers whose classifier model has none.
|
||||
chatMLTpl := map[string]any{
|
||||
"chat": "{{.Input -}}\n<|im_start|>assistant\n",
|
||||
"chat_message": "<|im_start|>{{ .RoleName }}\n{{ if .Content }}{{ .Content }}{{ end }}<|im_end|>",
|
||||
}
|
||||
classifierCfg := map[string]any{
|
||||
"name": "mock-classifier",
|
||||
"backend": "mock-backend",
|
||||
"known_usecases": []string{"score"},
|
||||
"context_size": 4096,
|
||||
"stopwords": []string{"<|im_end|>"},
|
||||
"parameters": map[string]any{"model": "mock-classifier.bin"},
|
||||
"template": chatMLTpl,
|
||||
}
|
||||
classifierData, err := yaml.Marshal(classifierCfg)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(os.WriteFile(filepath.Join(modelsPath, "mock-classifier.yaml"), classifierData, 0644)).To(Succeed())
|
||||
|
||||
for _, name := range []string{"mock-cand-casual", "mock-cand-code"} {
|
||||
candCfg := map[string]any{
|
||||
"name": name,
|
||||
"backend": "mock-backend",
|
||||
"known_usecases": []string{"chat"},
|
||||
"parameters": map[string]any{"model": name + ".bin"},
|
||||
}
|
||||
candData, err := yaml.Marshal(candCfg)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(os.WriteFile(filepath.Join(modelsPath, name+".yaml"), candData, 0644)).To(Succeed())
|
||||
}
|
||||
|
||||
routerCfg := map[string]any{
|
||||
"name": "smart-router",
|
||||
"known_usecases": []string{"chat"},
|
||||
"router": map[string]any{
|
||||
"classifier": "score",
|
||||
"classifier_model": "mock-classifier",
|
||||
"activation_threshold": 0.40,
|
||||
"fallback": "mock-cand-casual",
|
||||
"policies": []map[string]any{
|
||||
{"label": "casual-chat", "description": "small talk and general conversation"},
|
||||
{"label": "code-generation", "description": "writing or debugging code"},
|
||||
{"label": "math-reasoning", "description": "arithmetic and word problems"},
|
||||
},
|
||||
"candidates": []map[string]any{
|
||||
{"model": "mock-cand-casual", "labels": []string{"casual-chat"}},
|
||||
{"model": "mock-cand-code", "labels": []string{"code-generation", "math-reasoning"}},
|
||||
},
|
||||
},
|
||||
}
|
||||
routerData, err := yaml.Marshal(routerCfg)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(os.WriteFile(filepath.Join(modelsPath, "smart-router.yaml"), routerData, 0644)).To(Succeed())
|
||||
|
||||
// If REALTIME_TEST_MODEL=realtime-test-pipeline, auto-create a pipeline
|
||||
// config from the REALTIME_VAD/STT/LLM/TTS env vars so real-model tests
|
||||
// can run without the user having to write a YAML file manually.
|
||||
|
||||
@@ -109,6 +109,23 @@ func (m *MockBackend) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.R
|
||||
}, nil
|
||||
}
|
||||
|
||||
// ECHO_SERVED_MODEL returns the loaded model file path so router e2e
|
||||
// tests can verify which candidate actually served the request without
|
||||
// adding a new RPC. The router fans out to a single backend process per
|
||||
// candidate, so lastLoadParams.Model is unique per candidate.
|
||||
if strings.Contains(in.Prompt, "ECHO_SERVED_MODEL") {
|
||||
opts := snapshotLoadParams()
|
||||
modelID := ""
|
||||
if opts != nil {
|
||||
modelID = opts.Model
|
||||
}
|
||||
return &pb.Reply{
|
||||
Message: []byte("SERVED_MODEL=" + modelID),
|
||||
Tokens: 2,
|
||||
PromptTokens: 1,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Simulate C++ autoparser: tool call via ChatDeltas, empty message
|
||||
if strings.Contains(in.Prompt, "AUTOPARSER_TOOL_CALL") {
|
||||
toolName := mockToolNameFromRequest(in)
|
||||
@@ -171,7 +188,7 @@ func (m *MockBackend) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.R
|
||||
// Simulate multiple tool calls in a single response (Go-side JSON parser path).
|
||||
if strings.Contains(in.Prompt, "MULTI_TOOL_CALL") {
|
||||
return &pb.Reply{
|
||||
Message: []byte(`{"name": "get_weather", "arguments": {"location": "Rome"}}
|
||||
Message: []byte(`{"name": "get_weather", "arguments": {"location": "Rome"}}
|
||||
{"name": "get_weather", "arguments": {"location": "Paris"}}`),
|
||||
Tokens: 30,
|
||||
PromptTokens: 10,
|
||||
@@ -540,15 +557,91 @@ func (m *MockBackend) AudioTranscription(ctx context.Context, in *pb.TranscriptR
|
||||
}
|
||||
|
||||
func (m *MockBackend) TokenizeString(ctx context.Context, in *pb.PredictOptions) (*pb.TokenizationResponse, error) {
|
||||
xlog.Debug("TokenizeString called", "prompt", in.Prompt)
|
||||
// Return mock token IDs
|
||||
tokens := []int32{101, 2023, 2003, 1037, 3231, 1012}
|
||||
xlog.Debug("TokenizeString called", "prompt_len", len(in.Prompt))
|
||||
// Approximate BPE: ~4 chars/token, minimum 1. Realistic enough for the
|
||||
// router's fitMessages to exercise the budget/rune-pretrim path with
|
||||
// recognisable counts that scale with input size.
|
||||
n := max((len(in.Prompt)+3)/4, 1)
|
||||
tokens := make([]int32, n)
|
||||
for i := range tokens {
|
||||
tokens[i] = int32(i + 1)
|
||||
}
|
||||
return &pb.TokenizationResponse{
|
||||
Length: int32(len(tokens)),
|
||||
Length: int32(n),
|
||||
Tokens: tokens,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Score implements deterministic marker-driven ranking for router e2e
|
||||
// tests. The Score RPC receives the full rendered routing prompt (system
|
||||
// prompt + chat envelope + user turn), and the system prompt by construction
|
||||
// lists every policy label — so any keyword-against-prompt heuristic would
|
||||
// match every candidate. Instead we look for an explicit `ROUTE_HINT=<label>`
|
||||
// marker, which only appears when a test deliberately places one in a user
|
||||
// message. The candidate whose extracted label equals the hint gets a large
|
||||
// log-prob boost; all others stay at the base. With no hint, every candidate
|
||||
// scores equally, softmax is uniform, and (with a sensible activation
|
||||
// threshold) the router falls back.
|
||||
func (m *MockBackend) Score(ctx context.Context, in *pb.ScoreRequest) (*pb.ScoreResponse, error) {
|
||||
xlog.Debug("Score called", "candidates", len(in.Candidates))
|
||||
hint := extractRouteHint(in.Prompt)
|
||||
out := &pb.ScoreResponse{Candidates: make([]*pb.CandidateScore, len(in.Candidates))}
|
||||
for i, c := range in.Candidates {
|
||||
label := extractRouteLabel(c)
|
||||
// Base -5 (softmax ≈ 0.003), hint match +5 → 0 (softmax ≈ 0.99).
|
||||
logProb := -5.0
|
||||
if hint != "" && label == hint {
|
||||
logProb = 0.0
|
||||
}
|
||||
// num_tokens matches TokenizeString's heuristic so per-token mean
|
||||
// log-prob consumers see consistent values.
|
||||
nTok := max((len(c)+3)/4, 1)
|
||||
out.Candidates[i] = &pb.CandidateScore{
|
||||
LogProb: logProb,
|
||||
NumTokens: int32(nTok),
|
||||
LengthNormalizedLogProb: logProb / float64(nTok),
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// extractRouteHint returns the label that follows the LAST `ROUTE_HINT=` in
// prompt. The label ends at the first space, tab, CR, LF or `<` (so a
// chat-template tag such as `<|im_end|>` directly after the hint is not
// swallowed), or at the end of the string. It returns "" when the marker is
// absent.
//
// Taking the last occurrence means the newest user message's hint wins in a
// long conversation, mirroring how the router's fitMessages keeps the newest
// turn whole.
func extractRouteHint(prompt string) string {
	const marker = "ROUTE_HINT="

	start := strings.LastIndex(prompt, marker)
	if start == -1 {
		return ""
	}

	tail := prompt[start+len(marker):]
	if stop := strings.IndexAny(tail, " \t\r\n<"); stop >= 0 {
		return tail[:stop]
	}
	return tail
}
|
||||
|
||||
// extractRouteLabel pulls the label out of a candidate shaped like
// `{"route": "<label>"}`: it finds the first `"route"` key and returns the
// text between the next pair of double quotes. This is a plain string scan,
// not a JSON parse. Any shape it does not recognise yields "", which the
// caller treats as a no-match.
func extractRouteLabel(candidate string) string {
	const key = `"route"`

	keyAt := strings.Index(candidate, key)
	if keyAt < 0 {
		return ""
	}
	afterKey := candidate[keyAt+len(key):]

	opening := strings.IndexByte(afterKey, '"')
	if opening < 0 {
		return ""
	}
	value := afterKey[opening+1:]

	closing := strings.IndexByte(value, '"')
	if closing < 0 {
		return ""
	}
	return value[:closing]
}
|
||||
|
||||
func (m *MockBackend) Status(ctx context.Context, in *pb.HealthMessage) (*pb.StatusResponse, error) {
|
||||
xlog.Debug("Status called")
|
||||
return &pb.StatusResponse{
|
||||
|
||||
Reference in New Issue
Block a user