mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-13 03:09:03 -04:00
* fix(router): score classifier production-readiness Conversation trimming runs through the classifier model's chat template and trims by exact token count, sized to the model's n_batch which is now scaled to context so long probes can't crash the backend. Missing chat_message templates are a hard error at router build time. Router- facing factories (Embedder/Scorer/Reranker/TokenCounter) re-resolve ModelConfig per call so a model installed post-startup doesn't bind a stub Backend="" config and silently fall into the loader's auto- iterate path. New 'vector_store' backend trace recorded inside localVectorStore on every Search/Insert — including the backend-load-failure path that previously vanished into an xlog.Warn — with outcome tagging (hit/miss/empty_store/backend_load_error/find_error/insert_error/ok). Companion cleanup drops misleading similarity:0 and input_tokens_count:0 from non-hit and text-mode traces. Gallery local-store-development aliases to 'local-store' so the master image satisfies pkg/model.LocalStoreBackend lookups from the embedding cache. Misc: llama-cpp TokenizeString reads the correct 'prompt' JSON key (the original bug); ModelTokenize nil-guard; non-fatal mitm proxy startup; PII 'route_local' renamed to 'allow' with docs/UI in sync; model-editor footer no longer eats the edit area on small screens; several config-editor template/dropdown/section fixes. Tests: e2e router specs (casual/code-hint + long-conversation trim), vector_store trace specs, lazy-factory specs, gallery dev-alias resolution, Playwright trace badge + scroll regression. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * feat(backend): auto-size batch to context for embedding and rerank models Embedding and rerank models pool over the whole input in a single physical batch (n_ubatch). 
With batch left at the 512 default, the backend rejects longer inputs with "input is too large to process", silently capping a large-context embedder (e.g. 8k/32k) at 512 tokens. Size n_batch to the context for these single-pass usecases, mirroring the existing FLAG_SCORE behaviour; an explicit batch: still wins. Extracts EffectiveContextSize/EffectiveBatchSize from grpcModelOpts so the effective decode window has one home for other callers to reuse. Adds an e2e-aio regression test that embeds a >512-token input. The AIO embedding model is switched to nomic-embed-text-v1.5 (2048 context) because the previous granite model was capped at 512 tokens and could not exercise the larger batch. Assisted-by: claude-code:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * fix(gallery): raise arch-router scoring output cap via parallel:64 Scoring decodes the whole prompt+candidate in a single llama_decode and reads one logit row per candidate token. The vendored llama.cpp server caps causal output rows at n_parallel, so the default of 1 aborts with GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) on multi-token route labels. Set options: [parallel:64] on both arch-router quant entries to lift the cap; kv_unified (the grpc-server default) keeps the full context per sequence, so this does not split the KV cache. Assisted-by: claude-code:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com>
144 lines
4.7 KiB
Go
144 lines
4.7 KiB
Go
package backend
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/mudler/LocalAI/core/config"
|
|
"github.com/mudler/LocalAI/core/trace"
|
|
|
|
"github.com/mudler/LocalAI/pkg/grpc"
|
|
"github.com/mudler/LocalAI/pkg/model"
|
|
"github.com/mudler/LocalAI/pkg/store"
|
|
)
|
|
|
|
// VectorStore is the narrowed KNN store used by the router's embedding
// cache.
type VectorStore interface {
	// Search returns the top-1 match for vec: its cosine similarity (in
	// [-1, 1]) and the serialised payload stored alongside it. A clean miss
	// is reported as ok=false with a nil error.
	Search(ctx context.Context, vec []float32) (similarity float64, payload []byte, ok bool, err error)

	// Insert stores payload under the key vec.
	Insert(ctx context.Context, vec []float32, payload []byte) error
}
|
|
|
|
// NewVectorStore returns a VectorStore backed by the local-store
|
|
// gRPC backend, namespaced by storeName so two routers don't collide.
|
|
func NewVectorStore(loader *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string) VectorStore {
|
|
if storeName == "" {
|
|
return nil
|
|
}
|
|
return &localVectorStore{loader: loader, appConfig: appConfig, storeName: storeName}
|
|
}
|
|
|
|
type localVectorStore struct {
|
|
loader *model.ModelLoader
|
|
appConfig *config.ApplicationConfig
|
|
storeName string
|
|
}
|
|
|
|
func (s *localVectorStore) backend(_ context.Context) (grpc.Backend, error) {
|
|
return StoreBackend(s.loader, s.appConfig, s.storeName, "")
|
|
}
|
|
|
|
func (s *localVectorStore) Search(ctx context.Context, vec []float32) (sim float64, payload []byte, ok bool, err error) {
|
|
start := time.Now()
|
|
outcome := "hit"
|
|
defer func() {
|
|
s.recordTrace(start, "search", len(vec), sim, outcome, err)
|
|
}()
|
|
be, berr := s.backend(ctx)
|
|
if berr != nil {
|
|
outcome = "backend_load_error"
|
|
return 0, nil, false, fmt.Errorf("vector store load: %w", berr)
|
|
}
|
|
_, values, similarities, ferr := store.Find(ctx, be, vec, 1)
|
|
if ferr != nil {
|
|
outcome = "find_error"
|
|
return 0, nil, false, fmt.Errorf("vector store find: %w", ferr)
|
|
}
|
|
if len(values) == 0 || len(similarities) == 0 {
|
|
outcome = "miss"
|
|
return 0, nil, false, nil
|
|
}
|
|
return float64(similarities[0]), values[0], true, nil
|
|
}
|
|
|
|
func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []byte) (err error) {
|
|
start := time.Now()
|
|
outcome := "ok"
|
|
defer func() {
|
|
s.recordTrace(start, "insert", len(vec), 0, outcome, err)
|
|
}()
|
|
be, berr := s.backend(ctx)
|
|
if berr != nil {
|
|
outcome = "backend_load_error"
|
|
return fmt.Errorf("vector store load: %w", berr)
|
|
}
|
|
if serr := store.SetSingle(ctx, be, vec, payload); serr != nil {
|
|
outcome = "insert_error"
|
|
return serr
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// recordTrace surfaces vector-store calls in /api/backend-traces, including
|
|
// the backend-load-failure path that otherwise vanishes into an xlog.Warn.
|
|
// modelName uses the store namespace (e.g. "router-cache-smart-router") so
|
|
// admins can tell which router's cache misbehaved; the backend is always
|
|
// "local-store" and can't disambiguate.
|
|
func (s *localVectorStore) recordTrace(start time.Time, op string, vecDim int, sim float64, outcome string, err error) {
|
|
if s.appConfig == nil || !s.appConfig.EnableTracing {
|
|
return
|
|
}
|
|
trace.InitBackendTracingIfEnabled(s.appConfig.TracingMaxItems, s.appConfig.TracingMaxBodyBytes)
|
|
errStr := ""
|
|
if err != nil {
|
|
errStr = err.Error()
|
|
}
|
|
summary := op + " " + outcome
|
|
if op == "search" && outcome == "hit" {
|
|
summary = fmt.Sprintf("search hit (sim=%.3f)", sim)
|
|
}
|
|
data := map[string]any{
|
|
"op": op,
|
|
"outcome": outcome,
|
|
"vector_dim": vecDim,
|
|
}
|
|
// Only include similarity for a real neighbor — miss/empty_store would
|
|
// otherwise render "similarity: 0" and read as a measured value.
|
|
if op == "search" && outcome == "hit" {
|
|
data["similarity"] = sim
|
|
}
|
|
trace.RecordBackendTrace(trace.BackendTrace{
|
|
Timestamp: start,
|
|
Duration: time.Since(start),
|
|
Type: trace.BackendTraceVectorStore,
|
|
ModelName: s.storeName,
|
|
Backend: model.LocalStoreBackend,
|
|
Summary: summary,
|
|
Error: errStr,
|
|
Data: data,
|
|
})
|
|
}
|
|
|
|
func StoreBackend(sl *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string, backend string) (grpc.Backend, error) {
|
|
if backend == "" {
|
|
backend = model.LocalStoreBackend
|
|
}
|
|
// ModelLoader caches backend processes by `modelID`, not by the `model`
|
|
// passed via WithModel. Without a distinct modelID, every StoreBackend
|
|
// call collapses to the same `modelID=""` cache slot — face (512-D) and
|
|
// voice (192-D) biometrics would then share the same local-store process
|
|
// and the second enrollment would fail with
|
|
// Try to add key with length N when existing length is M
|
|
// Use the store namespace as modelID so each namespace gets its own
|
|
// process instance and its own in-memory Store{}.
|
|
sc := []model.Option{
|
|
model.WithBackendString(backend),
|
|
model.WithModelID(storeName),
|
|
model.WithModel(storeName),
|
|
}
|
|
|
|
return sl.Load(sc...)
|
|
}
|