Files
Richard Palethorpe 085fc53bbc fix(router): production-ready request router + auto-size batch for embedding/rerank (#10104)
* fix(router): score classifier production-readiness

Conversation trimming runs through the classifier model's chat template
and trims by exact token count, sized to the model's n_batch which is
now scaled to context so long probes can't crash the backend. Missing
chat_message templates are a hard error at router build time. Router-
facing factories (Embedder/Scorer/Reranker/TokenCounter) re-resolve
ModelConfig per call so a model installed post-startup doesn't bind a
stub Backend="" config and silently fall into the loader's auto-
iterate path.

New 'vector_store' backend trace recorded inside localVectorStore on
every Search/Insert — including the backend-load-failure path that
previously vanished into an xlog.Warn — with outcome tagging
(hit/miss/empty_store/backend_load_error/find_error/insert_error/ok).
Companion cleanup drops misleading similarity:0 and input_tokens_count:0
from non-hit and text-mode traces.

Gallery local-store-development aliases to 'local-store' so the master
image satisfies pkg/model.LocalStoreBackend lookups from the embedding
cache.

Misc: llama-cpp TokenizeString reads the correct 'prompt' JSON key
(the original bug); ModelTokenize nil-guard; non-fatal mitm proxy
startup; PII 'route_local' renamed to 'allow' with docs/UI in sync;
model-editor footer no longer eats the edit area on small screens;
several config-editor template/dropdown/section fixes.

Tests: e2e router specs (casual/code-hint + long-conversation trim),
vector_store trace specs, lazy-factory specs, gallery dev-alias
resolution, Playwright trace badge + scroll regression.

Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>

* feat(backend): auto-size batch to context for embedding and rerank models

Embedding and rerank models pool over the whole input in a single physical batch (n_ubatch). With batch left at the 512 default, the backend rejects longer inputs with "input is too large to process", silently capping a large-context embedder (e.g. 8k/32k) at 512 tokens. Size n_batch to the context for these single-pass usecases, mirroring the existing FLAG_SCORE behaviour; an explicit batch: still wins.

Extracts EffectiveContextSize/EffectiveBatchSize from grpcModelOpts so the effective decode window has one home for other callers to reuse.

Adds an e2e-aio regression test that embeds a >512-token input. The AIO embedding model is switched to nomic-embed-text-v1.5 (2048 context) because the previous granite model was capped at 512 tokens and could not exercise the larger batch.

Assisted-by: claude-code:claude-opus-4-8 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(gallery): raise arch-router scoring output cap via parallel:64

Scoring decodes the whole prompt+candidate in a single llama_decode and
reads one logit row per candidate token. The vendored llama.cpp server
caps causal output rows at n_parallel, so the default of 1 aborts with
GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) on multi-token route
labels. Set options: [parallel:64] on both arch-router quant entries to
lift the cap; kv_unified (the grpc-server default) keeps the full context
per sequence, so this does not split the KV cache.

Assisted-by: claude-code:claude-opus-4-8 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-06-12 16:21:15 +02:00

179 lines
4.8 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package router
import (
"math"
"strings"
"sync"
"sync/atomic"
"unicode/utf8"
"github.com/mudler/xlog"
)
// pretrimRunesPerToken is the runes-per-token ratio assumed by the cheap rune
// pre-trim in promptTrimmer.fit. It is deliberately high (most text is 3-5
// runes/token, tokenisers rarely exceed 6) so the pre-trim keeps a superset of
// what fits before any tokenize call.
const pretrimRunesPerToken = 6
// tokenBudgetMargin is the number of tokens lazyBudget.get holds back from the
// model context when computing a budget. It absorbs BPE-boundary drift and the
// framing tokens a renderer adds, so a prompt measured at exactly the budget
// still fits n_ctx.
const tokenBudgetMargin = 16
// JoinTurns joins per-turn texts oldest→newest, terminating every turn
// (including the last one) with a single '\n'. The probe builder, the trimmer,
// and every classifier share this so the text a model sees has one canonical
// shape.
//
// A nil or empty slice yields "". Empty turns are kept and contribute just
// their newline.
func JoinTurns(turns []string) string {
	// The exact output size is known up front: every turn's bytes plus one
	// newline per turn. Reserving it avoids repeated buffer regrowth on long
	// conversations.
	size := len(turns)
	for _, m := range turns {
		size += len(m)
	}

	var b strings.Builder
	b.Grow(size)
	for _, m := range turns {
		b.WriteString(m)
		b.WriteByte('\n')
	}
	return b.String()
}
// promptTrimmer fits an oldest→newest turn list into a token budget for one
// model: optimistic rune pre-trim, tokenize once, then recalibrate with the
// real runes/token and drop whole turns oldest-first until the rendered prompt
// fits. The newest turn is never dropped — if it alone overflows it's sent
// whole and the backend's n_ctx guard is the backstop.
//
// render wraps the joined turns into what the model actually tokenizes: a chat
// template for the scorer, identityRender for an embedder/reranker on raw text.
type promptTrimmer struct {
// tokenize returns the token count of the given text, or an error; fit
// calls it on the rendered prompt and returns its current candidate text
// when it fails.
tokenize func(string) (int, error)
// render turns the joined turns into the text that is tokenized; an error
// makes fit return the unrendered joined text.
render func(joined string) (string, error)
// budget is the largest token count the rendered prompt may have and
// still be accepted by fit without dropping further turns.
budget int
}
// identityRender is the no-op renderer: it hands the joined text back
// untouched and never reports an error.
func identityRender(s string) (string, error) {
	return s, nil
}
// fit returns the joined text of the most recent turns whose rendered form
// tokenizes to at most t.budget tokens. It returns "" for an empty turn list.
//
// The returned string is always the plain JoinTurns output, never the
// rendered prompt; rendering is only used for measuring. Any render or
// tokenize failure ends trimming early and returns the candidate text built
// so far.
func (t promptTrimmer) fit(turns []string) string {
	if len(turns) == 0 {
		return ""
	}

	// measure renders text and counts the tokens of the rendered result.
	// ok is false as soon as either step fails.
	measure := func(text string) (string, int, bool) {
		out, err := t.render(text)
		if err != nil {
			return "", 0, false
		}
		n, err := t.tokenize(out)
		if err != nil {
			return "", 0, false
		}
		return out, n, true
	}

	// Cheap pass: discard old turns that cannot possibly fit, judged by rune
	// count alone, before paying for a tokenize call.
	window := turns[runePretrimStart(turns, t.budget*pretrimRunesPerToken):]
	text := JoinTurns(window)
	rendered, total, ok := measure(text)
	if !ok || total <= t.budget {
		return text
	}

	// Recalibrate with the ratio actually observed for this prompt.
	ratio := float64(utf8.RuneCountInString(rendered)) / float64(total)
	if ratio <= 0 {
		ratio = 1
	}

	// Estimate how many of the oldest turns must go, always keeping the
	// newest one.
	last := len(window) - 1
	first := 0
	for remaining := total; first < last && remaining > t.budget; first++ {
		remaining -= int(math.Ceil(float64(utf8.RuneCountInString(window[first])) / ratio))
	}

	// Verify the estimate against the tokenizer, dropping one more turn per
	// round until the prompt fits or only the newest turn is left.
	for ; ; first++ {
		text = JoinTurns(window[first:])
		_, n, ok := measure(text)
		if !ok || n <= t.budget {
			return text
		}
		if first >= last {
			xlog.Warn("router: newest turn alone exceeds model context; sending it whole — backend n_ctx guard is the backstop",
				"tokens", n, "budget", t.budget)
			return text
		}
	}
}
// runePretrimStart picks the index of the oldest turn to keep. Walking from
// the newest turn backwards, it accumulates rune counts and stops at the first
// older turn that would push the sum past budgetRunes. The newest turn is kept
// unconditionally, even when it alone is over budget.
//
// It returns 0 (keep everything) when turns is empty or budgetRunes is not
// positive.
func runePretrimStart(turns []string, budgetRunes int) int {
	newest := len(turns) - 1
	if newest < 0 || budgetRunes <= 0 {
		return 0
	}

	used := utf8.RuneCountInString(turns[newest])
	oldest := newest
	for oldest > 0 {
		next := used + utf8.RuneCountInString(turns[oldest-1])
		if next > budgetRunes {
			return oldest
		}
		used = next
		oldest--
	}
	return oldest
}
// lazyBudget computes a model's probe token budget once, on first use, caching
// the result: maxContext minus the longest per-call extra (scorer candidates,
// reranker documents; none for a plain embed) minus tokenBudgetMargin. A
// tokenizer error leaves it uncomputed so a transient failure (model still
// loading) recovers on a later call; extras that already fill the context are
// cached as disabled.
type lazyBudget struct {
// tokenize returns the token count of a text; get reports 0 while it is nil.
tokenize func(string) (int, error)
// maxContext is the context size the budget is carved from; get reports 0
// when it is not positive.
maxContext int
// extras are the per-call texts measured by get; only the largest token
// count among them is subtracted from maxContext.
extras []string
// mu is held by get while it computes and stores the budget.
mu sync.Mutex
value atomic.Int64 // 0=unset, >0=budget, -1=disabled
}
// get returns the cached token budget, computing it on the first successful
// call. It returns 0 whenever trimming should not happen: a nil receiver, no
// tokenizer, a non-positive maxContext, a tokenizer error, or extras that
// leave no room in the context.
//
// A tokenizer error is not cached, so the next call tries again; a budget
// that comes out non-positive is cached as disabled.
func (l *lazyBudget) get() int {
	if l == nil || l.tokenize == nil || l.maxContext <= 0 {
		return 0
	}

	// decode maps the stored sentinel (-1 = disabled) onto the public result.
	decode := func(stored int64) int {
		if stored < 0 {
			return 0
		}
		return int(stored)
	}

	// Fast path: already computed, no lock needed.
	if stored := l.value.Load(); stored != 0 {
		return decode(stored)
	}

	l.mu.Lock()
	defer l.mu.Unlock()

	// Another caller may have finished the computation while we waited.
	if stored := l.value.Load(); stored != 0 {
		return decode(stored)
	}

	maxExtra := 0
	for i := range l.extras {
		n, err := l.tokenize(l.extras[i])
		if err != nil {
			return 0 // transient: leave unset so a later call retries
		}
		if n > maxExtra {
			maxExtra = n
		}
	}

	budget := l.maxContext - maxExtra - tokenBudgetMargin
	if budget > 0 {
		l.value.Store(int64(budget))
		return budget
	}
	l.value.Store(-1)
	return 0
}
// trimmedProbeText chooses the text handed to a model for probe p. When the
// probe carries Messages and b yields a positive budget, the result is the
// newest turns that fit that budget, measured through render. In every other
// case (no Messages, or trimming disabled because b reports 0) p.Prompt is
// returned unchanged.
func trimmedProbeText(p Probe, b *lazyBudget, render func(string) (string, error)) string {
	if len(p.Messages) == 0 {
		return p.Prompt
	}

	budget := b.get()
	if budget <= 0 {
		return p.Prompt
	}

	trimmer := promptTrimmer{
		tokenize: b.tokenize,
		render:   render,
		budget:   budget,
	}
	return trimmer.fit(p.Messages)
}