mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-13 11:21:42 -04:00
fix(router): production-ready request router + auto-size batch for embedding/rerank (#10104)
* fix(router): score classifier production-readiness Conversation trimming runs through the classifier model's chat template and trims by exact token count, sized to the model's n_batch which is now scaled to context so long probes can't crash the backend. Missing chat_message templates are a hard error at router build time. Router-facing factories (Embedder/Scorer/Reranker/TokenCounter) re-resolve ModelConfig per call so a model installed post-startup doesn't bind a stub Backend="" config and silently fall into the loader's auto-iterate path. New 'vector_store' backend trace recorded inside localVectorStore on every Search/Insert — including the backend-load-failure path that previously vanished into an xlog.Warn — with outcome tagging (hit/miss/empty_store/backend_load_error/find_error/insert_error/ok). Companion cleanup drops misleading similarity:0 and input_tokens_count:0 from non-hit and text-mode traces. Gallery local-store-development aliases to 'local-store' so the master image satisfies pkg/model.LocalStoreBackend lookups from the embedding cache. Misc: llama-cpp TokenizeString reads the correct 'prompt' JSON key (the original bug); ModelTokenize nil-guard; non-fatal mitm proxy startup; PII 'route_local' renamed to 'allow' with docs/UI in sync; model-editor footer no longer eats the edit area on small screens; several config-editor template/dropdown/section fixes. Tests: e2e router specs (casual/code-hint + long-conversation trim), vector_store trace specs, lazy-factory specs, gallery dev-alias resolution, Playwright trace badge + scroll regression. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * feat(backend): auto-size batch to context for embedding and rerank models Embedding and rerank models pool over the whole input in a single physical batch (n_ubatch). 
With batch left at the 512 default, the backend rejects longer inputs with "input is too large to process", silently capping a large-context embedder (e.g. 8k/32k) at 512 tokens. Size n_batch to the context for these single-pass usecases, mirroring the existing FLAG_SCORE behaviour; an explicit batch: still wins. Extracts EffectiveContextSize/EffectiveBatchSize from grpcModelOpts so the effective decode window has one home for other callers to reuse. Adds an e2e-aio regression test that embeds a >512-token input. The AIO embedding model is switched to nomic-embed-text-v1.5 (2048 context) because the previous granite model was capped at 512 tokens and could not exercise the larger batch. Assisted-by: claude-code:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * fix(gallery): raise arch-router scoring output cap via parallel:64 Scoring decodes the whole prompt+candidate in a single llama_decode and reads one logit row per candidate token. The vendored llama.cpp server caps causal output rows at n_parallel, so the default of 1 aborts with GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) on multi-token route labels. Set options: [parallel:64] on both arch-router quant entries to lift the cap; kv_unified (the grpc-server default) keeps the full context per sequence, so this does not split the KV cache. Assisted-by: claude-code:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com>
This commit is contained in:
committed by
GitHub
parent
56cc4f63fc
commit
085fc53bbc
@@ -353,7 +353,7 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
|
||||
overrides = make(map[string]pii.Action, len(raw))
|
||||
for ovid, action := range raw {
|
||||
switch pii.Action(action) {
|
||||
case pii.ActionMask, pii.ActionBlock, pii.ActionRouteLocal:
|
||||
case pii.ActionMask, pii.ActionBlock, pii.ActionAllow:
|
||||
overrides[ovid] = pii.Action(action)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -102,7 +102,7 @@ var instructionDefs = []instructionDef{
|
||||
Name: "pii-filtering",
|
||||
Description: "Inspect and tune the regex PII filter applied to chat requests",
|
||||
Tags: []string{"pii"},
|
||||
Intro: "GET /api/pii/patterns lists the active pattern set with each one's action (mask, block, route_local). GET /api/pii/events returns recent redaction events filtered by correlation_id / user_id / pattern_id (admin or local-user only). POST /api/pii/test dry-runs the redactor against an admin-supplied string. POST /api/pii/decide is the programmatic decision oracle for external routers: send `{text}`, receive `{findings, suggested_action, redacted_preview}` without LocalAI mutating, recording, or acting on the call — caller composes the action with its own policy. Default patterns: email, phone, SSN, credit card (Luhn), IPv4, common API key prefixes (sk-, pk-, ghp_, github_pat_). PII is per-model: by default it is OFF for non-proxy backends and ON for backends starting with proxy-* (cloud passthroughs). Opt in with `pii: { enabled: true }` in a model's YAML; use `pii: { patterns: [{id, action}] }` to upgrade or downgrade individual actions for that model. Override global default actions via --pii-config pii.yaml; --disable-pii turns the filter off entirely.",
|
||||
Intro: "GET /api/pii/patterns lists the active pattern set with each one's action (mask, block, allow). GET /api/pii/events returns recent redaction events filtered by correlation_id / user_id / pattern_id (admin or local-user only). POST /api/pii/test dry-runs the redactor against an admin-supplied string. POST /api/pii/decide is the programmatic decision oracle for external routers: send `{text}`, receive `{findings, suggested_action, redacted_preview}` without LocalAI mutating, recording, or acting on the call — caller composes the action with its own policy. Default patterns: email, phone, SSN, credit card (Luhn), IPv4, common API key prefixes (sk-, pk-, ghp_, github_pat_). PII is per-model: by default it is OFF for non-proxy backends and ON for backends starting with proxy-* (cloud passthroughs). Opt in with `pii: { enabled: true }` in a model's YAML; use `pii: { patterns: [{id, action}] }` to upgrade or downgrade individual actions for that model. Override global default actions via --pii-config pii.yaml; --disable-pii turns the filter off entirely.",
|
||||
},
|
||||
{
|
||||
Name: "middleware-admin",
|
||||
|
||||
@@ -124,6 +124,8 @@ func AutocompleteEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, a
|
||||
filterFn = config.BuildUsecaseFilterFn(config.FLAG_VAD)
|
||||
case config.UsecaseTranscript:
|
||||
filterFn = config.BuildUsecaseFilterFn(config.FLAG_TRANSCRIPT)
|
||||
case "score": // router classifier usecase (FLAG_SCORE); not in UsecaseInfoMap
|
||||
filterFn = config.BuildUsecaseFilterFn(config.FLAG_SCORE)
|
||||
default:
|
||||
filterFn = config.NoFilterFn
|
||||
}
|
||||
|
||||
@@ -15,9 +15,9 @@ import (
|
||||
//
|
||||
// External routers (e.g. the localai-org/platform router) call this
|
||||
// before dispatching to learn whether to mask the prompt in place,
|
||||
// route to a local-only backend, block the request, or pass it
|
||||
// through. LocalAI's in-band PII middleware is the alternative path
|
||||
// for direct-to-LocalAI clients — same Redactor, different framing.
|
||||
// block the request, or pass it through. LocalAI's in-band PII
|
||||
// middleware is the alternative path for direct-to-LocalAI clients —
|
||||
// same Redactor, different framing.
|
||||
//
|
||||
// Takes the *pii.Redactor directly rather than the whole
|
||||
// *application.Application so the handler stays unit-testable with a
|
||||
@@ -62,24 +62,18 @@ func PIIDecideEndpoint(redactor *pii.Redactor) echo.HandlerFunc {
|
||||
}
|
||||
}
|
||||
|
||||
// actionAllow is the wire-only value for "no findings". The other
|
||||
// three map to existing pii.Action* constants; allow has no in-band
|
||||
// counterpart because the in-band middleware simply passes through.
|
||||
const actionAllow = "allow"
|
||||
|
||||
// suggestedAction collapses the Redactor's Result flags onto a single
|
||||
// wire-format action using the in-band ordering (block > route_local
|
||||
// > mask > allow). Spans-without-Blocked-or-LocalOnly means every
|
||||
// match resolved to ActionMask.
|
||||
// wire-format action using the in-band ordering (block > mask >
|
||||
// allow). "allow" covers both "nothing matched" and "matched but every
|
||||
// span resolved to the allow action" — in both cases the caller may
|
||||
// dispatch unchanged, with the Findings list reporting what was seen.
|
||||
func suggestedAction(res pii.Result) string {
|
||||
switch {
|
||||
case res.Blocked:
|
||||
return string(pii.ActionBlock)
|
||||
case res.LocalOnly:
|
||||
return string(pii.ActionRouteLocal)
|
||||
case len(res.Spans) > 0:
|
||||
case res.Masked:
|
||||
return string(pii.ActionMask)
|
||||
default:
|
||||
return actionAllow
|
||||
return string(pii.ActionAllow)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,8 +16,8 @@ import (
|
||||
|
||||
// PIIDecideEndpoint exposes the redactor as a decision oracle. These
|
||||
// specs pin the validation surface and the suggested_action mapping
|
||||
// across all four actions (allow/mask/route_local/block). The redactor
|
||||
// itself is covered in core/services/routing/pii/redactor_test.go.
|
||||
// across the three actions (allow/mask/block). The redactor itself is
|
||||
// covered in core/services/routing/pii/redactor_test.go.
|
||||
|
||||
var _ = Describe("PIIDecideEndpoint", func() {
|
||||
var redactor *pii.Redactor
|
||||
@@ -68,16 +68,17 @@ var _ = Describe("PIIDecideEndpoint", func() {
|
||||
Expect(len(body.Findings)).To(BeNumerically(">=", 1))
|
||||
})
|
||||
|
||||
It("returns route_local when an override sets that action", func() {
|
||||
// Promote the email pattern to route_local for this test —
|
||||
// exercises the route_local branch of suggestedAction without
|
||||
// needing a custom pattern set.
|
||||
Expect(redactor.SetAction("email", pii.ActionRouteLocal)).To(Succeed())
|
||||
It("returns allow when a matched pattern's action is allow", func() {
|
||||
// Downgrade the email pattern to allow for this test —
|
||||
// exercises the allow branch of suggestedAction: a match is
|
||||
// found, but the strongest action is allow so the suggestion
|
||||
// is "allow" and the text is left intact.
|
||||
Expect(redactor.SetAction("email", pii.ActionAllow)).To(Succeed())
|
||||
rec, body := invokePIIDecide(redactor, `{"text":"contact alice@example.com"}`)
|
||||
Expect(rec.Code).To(Equal(http.StatusOK))
|
||||
Expect(body.SuggestedAction).To(Equal("route_local"))
|
||||
// route_local leaves the original text intact — caller decides
|
||||
// whether to forward it to a local-only backend.
|
||||
Expect(body.SuggestedAction).To(Equal("allow"))
|
||||
Expect(body.Findings).To(HaveLen(1), "allow still reports the finding")
|
||||
// allow leaves the original text intact.
|
||||
Expect(body.RedactedPreview).To(ContainSubstring("alice@example.com"))
|
||||
})
|
||||
|
||||
|
||||
@@ -130,7 +130,7 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
|
||||
overrides = make(map[string]pii.Action, len(raw))
|
||||
for ovid, action := range raw {
|
||||
switch pii.Action(action) {
|
||||
case pii.ActionMask, pii.ActionBlock, pii.ActionRouteLocal:
|
||||
case pii.ActionMask, pii.ActionBlock, pii.ActionAllow:
|
||||
overrides[ovid] = pii.Action(action)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -451,13 +451,14 @@ func buildRealtimeRoutingContext(a *application.Application, sessionID string) *
|
||||
return nil
|
||||
}
|
||||
deps := &middleware.ClassifierDeps{
|
||||
Scorer: a.Scorer,
|
||||
Embedder: a.Embedder,
|
||||
VectorStore: a.VectorStore,
|
||||
Reranker: a.Reranker,
|
||||
ModelLookup: a.ModelConfigLookup(),
|
||||
Registry: a.RouterClassifierRegistry(),
|
||||
Evaluator: a.TemplatesEvaluator(),
|
||||
Scorer: a.Scorer,
|
||||
TokenCounter: a.TokenCounter,
|
||||
Embedder: a.Embedder,
|
||||
VectorStore: a.VectorStore,
|
||||
Reranker: a.Reranker,
|
||||
ModelLookup: a.ModelConfigLookup(),
|
||||
Registry: a.RouterClassifierRegistry(),
|
||||
Evaluator: a.TemplatesEvaluator(),
|
||||
}
|
||||
userID := ""
|
||||
if u := a.FallbackUser(); u != nil {
|
||||
|
||||
Reference in New Issue
Block a user