diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index 6c7b2b5e8..c02dcec44 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -564,7 +564,7 @@ jobs: - name: Run e2e-backends smoke env: BACKEND_IMAGE: quay.io/go-skynet/local-ai-backends:master-cpu-llama-cpp - BACKEND_TEST_CAPS: health,load,predict,stream,logprobs,logit_bias + BACKEND_TEST_CAPS: health,load,predict,stream,logprobs,logit_bias,tokenize run: | make test-extra-backend # Realtime e2e with sherpa-onnx driving VAD + STT + TTS against a mocked LLM. diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 9ec806cd1..5b032ad4e 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -3486,7 +3486,7 @@ public: if (body.count("prompt") != 0) { const bool add_special = json_value(body, "add_special", false); - llama_tokens tokens = tokenize_mixed(ctx_server.impl->vocab, body.at("content"), add_special, true); + llama_tokens tokens = tokenize_mixed(ctx_server.impl->vocab, body.at("prompt"), add_special, true); for (const auto& token : tokens) { diff --git a/backend/index.yaml b/backend/index.yaml index e641c9355..b25d0a406 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -1586,6 +1586,7 @@ - localai/localai-backends:master-metal-darwin-arm64-kitten-tts - !!merge <<: *local-store name: "local-store-development" + alias: "local-store" uri: "quay.io/go-skynet/local-ai-backends:master-cpu-local-store" mirrors: - localai/localai-backends:master-cpu-local-store @@ -1596,6 +1597,7 @@ - localai/localai-backends:latest-metal-darwin-arm64-local-store - !!merge <<: *local-store name: "metal-local-store-development" + alias: "local-store" uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-local-store" mirrors: - localai/localai-backends:master-metal-darwin-arm64-local-store diff --git a/core/application/mitm.go b/core/application/mitm.go index 293b3d449..a67a68934 100644 --- a/core/application/mitm.go +++ b/core/application/mitm.go @@ -11,6 +11,29 @@ import ( "github.com/mudler/xlog" ) +// startMITMIfConfigured brings up the cloudproxy MITM listener when an +// address is configured, treating any startup failure as non-fatal. +// +// The listener is opt-in middleware whose address is persisted in runtime +// settings (/api/settings → runtime_settings.json) and replayed on every +// boot. A bad value — e.g. a host the process can't bind, like a LAN IP +// inside a container — must NOT abort the whole server: doing so crash-loops +// with no way out, because the Settings UI used to correct the address can't +// load if startup never completes. So on failure we log loudly and carry on; +// the admin fixes the address via /api/settings, which calls RestartMITM. +func startMITMIfConfigured(app *Application, options *config.ApplicationConfig) { + if options.MITMListen == "" { + return + } + if err := startMITMProxy(app, options); err != nil { + xlog.Error("mitm: cloudproxy listener failed to start — continuing without it", + "listen", options.MITMListen, + "error", err, + "hint", "fix the address via Settings (e.g. \":8082\" to bind all interfaces) and the listener will restart", + ) + } +} + func startMITMProxy(app *Application, options *config.ApplicationConfig) error { app.mitmMutex.Lock() defer app.mitmMutex.Unlock() diff --git a/core/application/mitm_test.go b/core/application/mitm_test.go new file mode 100644 index 000000000..b7627fa2d --- /dev/null +++ b/core/application/mitm_test.go @@ -0,0 +1,58 @@ +package application + +import ( + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/system" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// minimal Application wired enough for startMITMProxy: an empty model +// config loader (no host claims), CA written under a temp DataPath. +func newMITMTestApp(dataPath string) (*Application, *config.ApplicationConfig) { + state, err := system.GetSystemState() + Expect(err).NotTo(HaveOccurred()) + state.Model.ModelsPath = dataPath + opts := config.NewApplicationConfig( + config.WithSystemState(state), + config.WithDataPath(dataPath), + ) + return newApplication(opts), opts +} + +var _ = Describe("startMITMIfConfigured", func() { + It("does nothing when no listen address is configured", func() { + app, opts := newMITMTestApp(GinkgoT().TempDir()) + opts.MITMListen = "" + + Expect(func() { startMITMIfConfigured(app, opts) }).NotTo(Panic()) + Expect(app.mitmServer.Load()).To(BeNil(), "no listener should be stored when disabled") + }) + + // Regression: a persisted-but-unbindable MITM address (e.g. a LAN host + // inside a container) must not abort startup. startMITMIfConfigured + // swallows the bind error so the rest of LocalAI still comes up and the + // admin can fix the address via the Settings UI. + It("logs and continues when the listen address cannot be bound", func() { + app, opts := newMITMTestApp(GinkgoT().TempDir()) + // 192.0.2.1 is TEST-NET-1 (RFC 5737): guaranteed not assigned to any + // local interface, so bind fails deterministically without DNS. + opts.MITMListen = "192.0.2.1:8082" + + Expect(func() { startMITMIfConfigured(app, opts) }).NotTo(Panic()) + Expect(app.mitmServer.Load()).To(BeNil(), "failed listener must not be stored") + }) + + It("starts and stores the listener on a bindable address", func() { + app, opts := newMITMTestApp(GinkgoT().TempDir()) + opts.MITMListen = "127.0.0.1:0" // OS-assigned free port + + startMITMIfConfigured(app, opts) + + srv := app.mitmServer.Load() + Expect(srv).NotTo(BeNil(), "listener should be stored on success") + DeferCleanup(srv.Stop) + Expect(srv.Addr()).NotTo(BeEmpty()) + }) +}) diff --git a/core/application/router_factories.go b/core/application/router_factories.go index d37cfb9d8..879c43a83 100644 --- a/core/application/router_factories.go +++ b/core/application/router_factories.go @@ -1,63 +1,120 @@ package application import ( + "context" + "fmt" + "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" ) -// adapterConfig resolves a model name to its runtime ModelConfig, or -// nil when the name is unknown. Shared by the router-facing factories -// below and by ModelConfigLookup. +// adapterConfig resolves a model name to its runtime ModelConfig, or nil when +// unknown. LoadModelConfigFileByNameDefaultOptions never returns nil — for an +// unknown name it returns a defaults-filled stub with an empty Name (the YAML +// `name:` field is required by Validate), which is how we tell the two apart. func (a *Application) adapterConfig(modelName string) *config.ModelConfig { cfg, err := a.backendLoader.LoadModelConfigFileByNameDefaultOptions(modelName, a.applicationConfig) - if err != nil || cfg == nil { + if err != nil || cfg == nil || cfg.Name == "" { return nil } return cfg } -// ModelConfigLookup is the lookup function the router middleware's -// classifier validator uses to confirm classifier_model declares -// FLAG_SCORE before binding it. +// ModelConfigLookup is the lookup the router middleware's classifier validator +// uses to confirm classifier_model declares FLAG_SCORE before binding it. func (a *Application) ModelConfigLookup() func(modelName string) *config.ModelConfig { return a.adapterConfig } -// Scorer returns a backend.Scorer bound to the named model, or nil -// when the model is unknown. Used as a method value (app.Scorer) by -// router.ClassifierDeps — no factory-of-factory wrapper needed. +// The router-facing factories below (Scorer, Embedder, Reranker, TokenCounter) +// bind a model NAME at construction and re-resolve the CONFIG on every call. +// Capturing the config at construction would bake in whatever state +// adapterConfig saw first — including a stub returned before the YAML reached +// bcl.configs (e.g. /import-model or gallery install racing startup). The +// classifier registry caches factories by router-config fingerprint, so a +// once-stale capture stays stale until the router config is edited. + func (a *Application) Scorer(modelName string) backend.Scorer { - cfg := a.adapterConfig(modelName) - if cfg == nil { + if a.adapterConfig(modelName) == nil { return nil } - return backend.NewScorer(a.modelLoader, *cfg, a.applicationConfig) + return &lazyScorer{app: a, modelName: modelName} +} + +type lazyScorer struct { + app *Application + modelName string +} + +func (l *lazyScorer) Score(ctx context.Context, prompt string, candidates []string) ([]backend.CandidateScore, error) { + cfg := l.app.adapterConfig(l.modelName) + if cfg == nil { + return nil, fmt.Errorf("scorer: model %q no longer available", l.modelName) + } + return backend.NewScorer(l.app.modelLoader, *cfg, l.app.applicationConfig).Score(ctx, prompt, candidates) +} + +// TokenCounter returns a func so the middleware's literal field type accepts +// it as a method value without importing core/http/middleware from here. +func (a *Application) TokenCounter(modelName string) func(string) (int, error) { + if a.adapterConfig(modelName) == nil { + return nil + } + return func(text string) (int, error) { + cfg := a.adapterConfig(modelName) + if cfg == nil { + return 0, fmt.Errorf("token counter: model %q no longer available", modelName) + } + resp, err := backend.ModelTokenize(text, a.modelLoader, *cfg, a.applicationConfig) + if err != nil { + return 0, err + } + return len(resp.Tokens), nil + } } -// Reranker returns a backend.Reranker bound to the named model, or -// nil when unknown. The reranker model's `type:` (e.g. "colbert") -// selects the scoring head inside the rerankers backend. func (a *Application) Reranker(modelName string) backend.Reranker { - cfg := a.adapterConfig(modelName) - if cfg == nil { + if a.adapterConfig(modelName) == nil { return nil } - return backend.NewReranker(a.modelLoader, *cfg, a.applicationConfig) + return &lazyReranker{app: a, modelName: modelName} +} + +type lazyReranker struct { + app *Application + modelName string +} + +func (l *lazyReranker) Rerank(ctx context.Context, query string, documents []string) ([]backend.RerankResult, error) { + cfg := l.app.adapterConfig(l.modelName) + if cfg == nil { + return nil, fmt.Errorf("reranker: model %q no longer available", l.modelName) + } + return backend.NewReranker(l.app.modelLoader, *cfg, l.app.applicationConfig).Rerank(ctx, query, documents) } -// Embedder returns a backend.Embedder bound to the named model, or -// nil when unknown. Used by the router's L2 embedding cache. func (a *Application) Embedder(modelName string) backend.Embedder { - cfg := a.adapterConfig(modelName) - if cfg == nil { + if a.adapterConfig(modelName) == nil { return nil } - return backend.NewEmbedder(a.modelLoader, *cfg, a.applicationConfig) + return &lazyEmbedder{app: a, modelName: modelName} } -// VectorStore returns a backend.VectorStore for the named collection, -// or nil when the name is empty. Each router model gets its own -// backend process via the model loader's cache keyed by storeName. +type lazyEmbedder struct { + app *Application + modelName string +} + +func (l *lazyEmbedder) Embed(ctx context.Context, text string) ([]float32, error) { + cfg := l.app.adapterConfig(l.modelName) + if cfg == nil { + return nil, fmt.Errorf("embedder: model %q no longer available", l.modelName) + } + return backend.NewEmbedder(l.app.modelLoader, *cfg, l.app.applicationConfig).Embed(ctx, text) +} + +// VectorStore takes a store name, not a model name — no adapterConfig, no +// staleness to avoid. func (a *Application) VectorStore(storeName string) backend.VectorStore { return backend.NewVectorStore(a.modelLoader, a.applicationConfig, storeName) } diff --git a/core/application/router_factories_test.go b/core/application/router_factories_test.go new file mode 100644 index 000000000..5a6988a88 --- /dev/null +++ b/core/application/router_factories_test.go @@ -0,0 +1,155 @@ +package application + +import ( + "context" + "os" + "path/filepath" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/system" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// Regression: the router-facing factories used to capture +// *config.ModelConfig at construction. A gallery install that raced +// startup left a stub (Backend="") bound for the lifetime of the +// classifier registry's cache entry, bypassing the user's `backend:` +// config. These specs pin the lazy re-resolve. +var _ = Describe("router_factories lazy config resolution", func() { + var ( + tmpDir string + app *Application + ) + + BeforeEach(func() { + var err error + tmpDir, err = os.MkdirTemp("", "router-factories-*") + Expect(err).NotTo(HaveOccurred()) + + appCfg := &config.ApplicationConfig{ + Context: context.Background(), + SystemState: &system.SystemState{Model: system.Model{ModelsPath: tmpDir}}, + } + app = &Application{ + backendLoader: config.NewModelConfigLoader(tmpDir), + modelLoader: model.NewModelLoader(appCfg.SystemState), + applicationConfig: appCfg, + } + }) + + AfterEach(func() { + _ = os.RemoveAll(tmpDir) + }) + + // writeCfg seeds both the on-disk YAML and the in-memory cache — + // removing only the cache would fall through to file-read. + writeCfg := func(name, backend string) { + yaml := "name: " + name + "\nbackend: " + backend + "\nparameters:\n model: " + name + ".bin\n" + Expect(os.WriteFile(filepath.Join(tmpDir, name+".yaml"), []byte(yaml), 0644)).To(Succeed()) + Expect(app.backendLoader.LoadModelConfigsFromPath(tmpDir)).To(Succeed()) + cfg, ok := app.backendLoader.GetModelConfig(name) + Expect(ok).To(BeTrue(), "config must be loaded before the spec runs") + Expect(cfg.Backend).To(Equal(backend)) + } + + // removeCfg purges both the cache and the YAML so LoadModelConfigFileByName + // returns the empty-stub case and adapterConfig returns nil. + removeCfg := func(name string) { + app.backendLoader.RemoveModelConfig(name) + Expect(os.Remove(filepath.Join(tmpDir, name+".yaml"))).To(Succeed()) + } + + Context("Embedder", func() { + It("returns nil at construction for an unknown model", func() { + Expect(app.Embedder("missing")).To(BeNil()) + }) + + It("re-resolves the model config on each Embed call", func() { + writeCfg("emb-test", "llama-cpp") + emb := app.Embedder("emb-test") + Expect(emb).NotTo(BeNil()) + + // The factory must hold the NAME, not a captured config — + // otherwise stale captures survive cache invalidation. + lazy, ok := emb.(*lazyEmbedder) + Expect(ok).To(BeTrue(), "Embedder must return *lazyEmbedder") + Expect(lazy.modelName).To(Equal("emb-test")) + + // Mutate the cached config. A lazy implementation sees the + // update on the next adapterConfig call; a captured-at- + // construction implementation would still see "llama-cpp". + app.backendLoader.UpdateModelConfig("emb-test", func(c *config.ModelConfig) { + c.Backend = "rerankers" + }) + Expect(lazy.app.adapterConfig("emb-test").Backend).To(Equal("rerankers")) + + // Remove the config entirely → Embed must surface the disappearance. + removeCfg("emb-test") + _, err := emb.Embed(context.Background(), "anything") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no longer available")) + }) + }) + + Context("Scorer", func() { + It("returns nil at construction for an unknown model", func() { + Expect(app.Scorer("missing")).To(BeNil()) + }) + + It("re-resolves the model config on each Score call", func() { + writeCfg("score-test", "llama-cpp") + sc := app.Scorer("score-test") + Expect(sc).NotTo(BeNil()) + + lazy, ok := sc.(*lazyScorer) + Expect(ok).To(BeTrue(), "Scorer must return *lazyScorer") + Expect(lazy.modelName).To(Equal("score-test")) + + removeCfg("score-test") + _, err := sc.Score(context.Background(), "prompt", []string{"a"}) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no longer available")) + }) + }) + + Context("Reranker", func() { + It("returns nil at construction for an unknown model", func() { + Expect(app.Reranker("missing")).To(BeNil()) + }) + + It("re-resolves the model config on each Rerank call", func() { + writeCfg("rerank-test", "rerankers") + rr := app.Reranker("rerank-test") + Expect(rr).NotTo(BeNil()) + + lazy, ok := rr.(*lazyReranker) + Expect(ok).To(BeTrue(), "Reranker must return *lazyReranker") + Expect(lazy.modelName).To(Equal("rerank-test")) + + removeCfg("rerank-test") + _, err := rr.Rerank(context.Background(), "q", []string{"d"}) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no longer available")) + }) + }) + + Context("TokenCounter", func() { + It("returns nil at construction for an unknown model", func() { + Expect(app.TokenCounter("missing")).To(BeNil()) + }) + + It("re-resolves the model config on each call", func() { + writeCfg("tok-test", "llama-cpp") + tc := app.TokenCounter("tok-test") + Expect(tc).NotTo(BeNil()) + + removeCfg("tok-test") + _, err := tc("anything") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no longer available")) + }) + }) +}) diff --git a/core/application/startup.go b/core/application/startup.go index 8ddf5a1f6..ab7b60d99 100644 --- a/core/application/startup.go +++ b/core/application/startup.go @@ -462,11 +462,7 @@ func New(opts ...config.AppOption) (*Application, error) { // traffic doesn't need a parallel config for MITM traffic. // Runs after loadRuntimeSettingsFromFile so a listener configured // via /api/settings is brought back up across restarts. - if options.MITMListen != "" { - if err := startMITMProxy(application, options); err != nil { - return nil, fmt.Errorf("mitm: startup: %w", err) - } - } + startMITMIfConfigured(application, options) application.ModelLoader().SetBackendLoggingEnabled(options.EnableBackendLogging) diff --git a/core/backend/embeddings.go b/core/backend/embeddings.go index 4be2bc346..eff88ef04 100644 --- a/core/backend/embeddings.go +++ b/core/backend/embeddings.go @@ -100,8 +100,13 @@ func ModelEmbedding(ctx context.Context, s string, tokens []int, loader *model.M trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes) traceData := map[string]any{ - "input_text": trace.TruncateString(s, 1000), - "input_tokens_count": len(tokens), + "input_text": trace.TruncateString(s, 1000), + } + // Only present for token-mode callers (pre-tokenized override); + // emitting "0" alongside input_text would read as "consumed zero + // tokens", which is wrong. + if len(tokens) > 0 { + traceData["input_tokens_count"] = len(tokens) } startTime := time.Now() diff --git a/core/backend/options.go b/core/backend/options.go index 0274bdb6e..72e622108 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -87,11 +87,47 @@ func getSeed(c config.ModelConfig) int32 { return seed } -func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { - b := 512 - if c.Batch != 0 { - b = c.Batch +// DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a +// model config leaves them unset. Exported so callers that must respect the +// effective decode window — notably the router's prompt trimmer — resolve the +// same numbers grpcModelOpts does instead of guessing. +const ( + DefaultContextSize = 4096 + DefaultBatchSize = 512 +) + +// EffectiveContextSize is the context window the backend will run with: the +// configured value, or DefaultContextSize when unset. +func EffectiveContextSize(c config.ModelConfig) int { + if c.ContextSize != nil { + return *c.ContextSize } + return DefaultContextSize +} + +// EffectiveBatchSize is the single-decode batch the backend will run with. +// Score, embedding and rerank all process the whole input in one pass: score +// decodes prompt+candidate (asserts n_tokens <= n_batch), and embedding/rerank +// pool over the full sequence in one physical batch (n_ubatch). So the batch +// is sized to the context — anything that fits the context fits one pass, +// avoiding both the GGML_ASSERT crash and the "input is too large to process" +// error. Explicit `batch:` always wins. +func EffectiveBatchSize(c config.ModelConfig) int { + if c.Batch != 0 { + return c.Batch + } + singlePass := c.HasUsecases(config.FLAG_SCORE) || + c.HasUsecases(config.FLAG_EMBEDDINGS) || + c.HasUsecases(config.FLAG_RERANK) + if ctx := EffectiveContextSize(c); singlePass && ctx > DefaultBatchSize { + return ctx + } + return DefaultBatchSize +} + +func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { + ctxSize := EffectiveContextSize(c) + b := EffectiveBatchSize(c) flashAttention := "auto" @@ -134,11 +170,6 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { } } - ctxSize := 4096 - if c.ContextSize != nil { - ctxSize = *c.ContextSize - } - mmlock := false if c.MMlock != nil { mmlock = *c.MMlock diff --git a/core/backend/options_internal_test.go b/core/backend/options_internal_test.go index 5e1848f0f..aa07b43bd 100644 --- a/core/backend/options_internal_test.go +++ b/core/backend/options_internal_test.go @@ -97,3 +97,67 @@ var _ = Describe("gRPCPredictOpts reasoning_effort metadata", func() { Expect(opts.Metadata).ToNot(HaveKey("reasoning_effort")) }) }) + +var _ = Describe("grpcModelOpts NBatch", func() { + scoreUsecase := config.FLAG_SCORE + threads := 1 + ctx := 4096 + + It("defaults to 512 for an ordinary model", func() { + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}} + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(512)) + }) + + It("sizes the batch to the context window for score models", func() { + // Score models decode the whole prompt+candidate in one + // llama_decode; n_batch must cover it or the backend aborts. + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}, KnownUsecases: &scoreUsecase} + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(4096)) + }) + + It("keeps an explicit batch over the score default", func() { + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}, KnownUsecases: &scoreUsecase} + cfg.Batch = 1024 + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(1024)) + }) + + It("sizes the batch to the context window for embedding models", func() { + // Embedding/rerank pool over the whole sequence in one physical batch + // (n_ubatch); without this the input is capped at the 512 default and + // the backend returns "input is too large to process". + embeddings := true + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}} + cfg.Embeddings = &embeddings + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(4096)) + }) + + It("sizes the batch to the context window for rerank models", func() { + reranking := true + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}} + cfg.Reranking = &reranking + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(4096)) + }) + + It("does not raise the batch when a score model's context is below the default", func() { + small := 256 + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &small}, KnownUsecases: &scoreUsecase} + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(512)) + }) + + It("sizes the batch to the effective 4096 default for a score model with no explicit context_size", func() { + // The crash case: the backend defaults n_ctx to 4096, so n_batch must + // follow even when context_size is unset — otherwise n_batch stays 512 + // against a 4096 window and the score decode hits the GGML_ASSERT. + cfg := config.ModelConfig{Threads: &threads, KnownUsecases: &scoreUsecase} + Expect(cfg.ContextSize).To(BeNil()) + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(4096)) + Expect(opts.ContextSize).To(BeEquivalentTo(4096), "n_batch must match the effective n_ctx the backend receives") + }) +}) diff --git a/core/backend/stores.go b/core/backend/stores.go index 4884765f2..8b73ee17c 100644 --- a/core/backend/stores.go +++ b/core/backend/stores.go @@ -3,9 +3,10 @@ package backend import ( "context" "fmt" - "strings" + "time" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/trace" "github.com/mudler/LocalAI/pkg/grpc" "github.com/mudler/LocalAI/pkg/model" @@ -39,34 +40,85 @@ func (s *localVectorStore) backend(_ context.Context) (grpc.Backend, error) { return StoreBackend(s.loader, s.appConfig, s.storeName, "") } -func (s *localVectorStore) Search(ctx context.Context, vec []float32) (float64, []byte, bool, error) { - be, err := s.backend(ctx) - if err != nil { - return 0, nil, false, fmt.Errorf("vector store load: %w", err) +func (s *localVectorStore) Search(ctx context.Context, vec []float32) (sim float64, payload []byte, ok bool, err error) { + start := time.Now() + outcome := "hit" + defer func() { + s.recordTrace(start, "search", len(vec), sim, outcome, err) + }() + be, berr := s.backend(ctx) + if berr != nil { + outcome = "backend_load_error" + return 0, nil, false, fmt.Errorf("vector store load: %w", berr) } - _, values, similarities, err := store.Find(ctx, be, vec, 1) - if err != nil { - // local-store's Find returns "existing length is -1" before - // any keys are inserted. Surface that as a clean miss so the - // cache layer treats it as an empty store and proceeds to - // Insert rather than skipping. - if strings.Contains(err.Error(), "existing length is -1") { - return 0, nil, false, nil - } - return 0, nil, false, fmt.Errorf("vector store find: %w", err) + _, values, similarities, ferr := store.Find(ctx, be, vec, 1) + if ferr != nil { + outcome = "find_error" + return 0, nil, false, fmt.Errorf("vector store find: %w", ferr) } if len(values) == 0 || len(similarities) == 0 { + outcome = "miss" return 0, nil, false, nil } return float64(similarities[0]), values[0], true, nil } -func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []byte) error { - be, err := s.backend(ctx) - if err != nil { - return fmt.Errorf("vector store load: %w", err) +func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []byte) (err error) { + start := time.Now() + outcome := "ok" + defer func() { + s.recordTrace(start, "insert", len(vec), 0, outcome, err) + }() + be, berr := s.backend(ctx) + if berr != nil { + outcome = "backend_load_error" + return fmt.Errorf("vector store load: %w", berr) } - return store.SetSingle(ctx, be, vec, payload) + if serr := store.SetSingle(ctx, be, vec, payload); serr != nil { + outcome = "insert_error" + return serr + } + return nil +} + +// recordTrace surfaces vector-store calls in /api/backend-traces, including +// the backend-load-failure path that otherwise vanishes into an xlog.Warn. +// modelName uses the store namespace (e.g. "router-cache-smart-router") so +// admins can tell which router's cache misbehaved; the backend is always +// "local-store" and can't disambiguate. +func (s *localVectorStore) recordTrace(start time.Time, op string, vecDim int, sim float64, outcome string, err error) { + if s.appConfig == nil || !s.appConfig.EnableTracing { + return + } + trace.InitBackendTracingIfEnabled(s.appConfig.TracingMaxItems, s.appConfig.TracingMaxBodyBytes) + errStr := "" + if err != nil { + errStr = err.Error() + } + summary := op + " " + outcome + if op == "search" && outcome == "hit" { + summary = fmt.Sprintf("search hit (sim=%.3f)", sim) + } + data := map[string]any{ + "op": op, + "outcome": outcome, + "vector_dim": vecDim, + } + // Only include similarity for a real neighbor — miss/empty_store would + // otherwise render "similarity: 0" and read as a measured value. + if op == "search" && outcome == "hit" { + data["similarity"] = sim + } + trace.RecordBackendTrace(trace.BackendTrace{ + Timestamp: start, + Duration: time.Since(start), + Type: trace.BackendTraceVectorStore, + ModelName: s.storeName, + Backend: model.LocalStoreBackend, + Summary: summary, + Error: errStr, + Data: data, + }) } func StoreBackend(sl *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string, backend string) (grpc.Backend, error) { diff --git a/core/backend/stores_test.go b/core/backend/stores_test.go new file mode 100644 index 000000000..e9d5208a3 --- /dev/null +++ b/core/backend/stores_test.go @@ -0,0 +1,88 @@ +package backend + +import ( + "context" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/trace" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/system" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// findVectorStoreTrace returns the most recent vector_store trace whose +// model_name matches storeName, or nil if none was recorded. Used by +// the specs below to assert the trace landed without relying on +// ring-buffer ordering across other tests in the suite. +func findVectorStoreTrace(storeName string) *trace.BackendTrace { + traces := trace.GetBackendTraces() + for i := range traces { + bt := &traces[i] + if bt.Type == trace.BackendTraceVectorStore && bt.ModelName == storeName { + return bt + } + } + return nil +} + +var _ = Describe("localVectorStore tracing", func() { + // Pin the trace surface admins read from /api/backend-traces. + // The original failure mode that motivated these specs — the + // local-store backend not installed — was silent on every surface + // except a per-call xlog.Warn. With tracing wired in, the row + // appears next to the embedder/score traces for the same request. + BeforeEach(func() { + trace.ClearBackendTraces() + }) + + It("records a vector_store trace with outcome=backend_load_error when the backend can't be loaded", func() { + // nil ModelLoader → s.backend → StoreBackend → panics on load. + // Use a real-but-empty loader so the failure surfaces as an + // error instead, exercising the load-failure trace path the + // admin would hit when local-store isn't installed. + appCfg := &config.ApplicationConfig{ + EnableTracing: true, + TracingMaxItems: 16, + TracingMaxBodyBytes: 1024, + } + s := &localVectorStore{ + loader: model.NewModelLoader(&system.SystemState{}), + appConfig: appCfg, + storeName: "router-cache-test", + } + + // Search must surface the error AND record a trace describing it. + _, _, _, err := s.Search(context.Background(), []float32{0.1, 0.2, 0.3}) + Expect(err).To(HaveOccurred()) + + Eventually(func() *trace.BackendTrace { + return findVectorStoreTrace("router-cache-test") + }).ShouldNot(BeNil()) + + bt := findVectorStoreTrace("router-cache-test") + Expect(bt.Backend).To(Equal(model.LocalStoreBackend)) + Expect(bt.Data["op"]).To(Equal("search")) + Expect(bt.Data["outcome"]).To(Equal("backend_load_error")) + Expect(bt.Data["vector_dim"]).To(Equal(3)) + // Error is the wrapped "vector store load: …" surfaced to the caller. + Expect(bt.Error).To(ContainSubstring("vector store load")) + }) + + It("does not record a trace when tracing is disabled", func() { + // Opt-out path: appConfig.EnableTracing=false must short-circuit + // before InitBackendTracingIfEnabled, so a workload with tracing + // turned off doesn't pay the channel-send cost per cache call. + appCfg := &config.ApplicationConfig{EnableTracing: false} + s := &localVectorStore{ + loader: model.NewModelLoader(&system.SystemState{}), + appConfig: appCfg, + storeName: "router-cache-disabled", + } + _, _, _, _ = s.Search(context.Background(), []float32{1}) + Consistently(func() *trace.BackendTrace { + return findVectorStoreTrace("router-cache-disabled") + }).Should(BeNil()) + }) +}) diff --git a/core/backend/tokenize.go b/core/backend/tokenize.go index 96618d89c..6b926b179 100644 --- a/core/backend/tokenize.go +++ b/core/backend/tokenize.go @@ -7,9 +7,23 @@ import ( "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/core/trace" "github.com/mudler/LocalAI/pkg/grpc" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" "github.com/mudler/LocalAI/pkg/model" ) +// tokenizeTokenCount returns the number of tokens in a backend response, +// treating a nil response as zero. The gRPC client returns (nil, err) on +// failure, and the tracing block below runs before that error is returned — +// so the count must be read nil-safely here. Reading resp.Tokens on a nil +// resp previously panicked the whole HTTP handler when tracing was enabled +// (e.g. a transient tokenize failure during router probe-budget sizing). +func tokenizeTokenCount(resp *pb.TokenizationResponse) int { + if resp == nil { + return 0 + } + return len(resp.Tokens) +} + func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) { var inferenceModel grpc.Backend @@ -40,10 +54,7 @@ func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.Model errStr = err.Error() } - tokenCount := 0 - if resp.Tokens != nil { - tokenCount = len(resp.Tokens) - } + tokenCount := tokenizeTokenCount(resp) trace.RecordBackendTrace(trace.BackendTrace{ Timestamp: startTime, @@ -64,8 +75,8 @@ func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.Model return schema.TokenizeResponse{}, err } - if resp.Tokens == nil { - resp.Tokens = make([]int32, 0) + if resp == nil || resp.Tokens == nil { + return schema.TokenizeResponse{Tokens: make([]int32, 0)}, nil } return schema.TokenizeResponse{ diff --git a/core/backend/tokenize_test.go b/core/backend/tokenize_test.go new file mode 100644 index 000000000..3b5c8e9fb --- /dev/null +++ b/core/backend/tokenize_test.go @@ -0,0 +1,27 @@ +package backend + +import ( + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("tokenizeTokenCount", func() { + // Regression: the gRPC client returns (nil, err) when a tokenize call + // fails, and ModelTokenize's tracing block reads the token count before + // the error is returned. Dereferencing a nil response there panicked the + // HTTP handler (nil pointer dereference) — e.g. a transient tokenize + // failure while the router sized its probe-token budget. + It("returns zero for a nil response instead of panicking", func() { + Expect(tokenizeTokenCount(nil)).To(Equal(0)) + }) + + It("returns zero when the response carries no tokens", func() { + Expect(tokenizeTokenCount(&pb.TokenizationResponse{})).To(Equal(0)) + }) + + It("counts the tokens present on the response", func() { + Expect(tokenizeTokenCount(&pb.TokenizationResponse{Tokens: []int32{1, 2, 3}})).To(Equal(3)) + }) +}) diff --git a/core/config/application_config.go b/core/config/application_config.go index 1e1bacc2c..c7113e140 100644 --- a/core/config/application_config.go +++ b/core/config/application_config.go @@ -65,7 +65,7 @@ type ApplicationConfig struct { // // patterns: // - id: email - // action: route_local # downgrade default mask -> route_local + // action: allow # downgrade default mask -> allow (log only) // - id: ssn // action: block # upgrade default mask -> block // diff --git a/core/config/meta/build.go b/core/config/meta/build.go index 24cfb86b7..39235b999 100644 --- a/core/config/meta/build.go +++ b/core/config/meta/build.go @@ -93,6 +93,9 @@ func applyOverride(f *FieldMeta, o FieldMetaOverride) { if o.Component != "" { f.Component = o.Component } + if o.Language != "" { + f.Language = o.Language + } if o.Placeholder != "" { f.Placeholder = o.Placeholder } diff --git a/core/config/meta/constants.go b/core/config/meta/constants.go index b15eb53d0..9be49fec0 100644 --- a/core/config/meta/constants.go +++ b/core/config/meta/constants.go @@ -8,6 +8,7 @@ const ( ProviderModelsTTS = "models:tts" ProviderModelsTranscript = "models:transcript" ProviderModelsVAD = "models:vad" + ProviderModelsScore = "models:score" ) // Static option lists embedded directly in field metadata. diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go index 1b4cd8580..c404effb2 100644 --- a/core/config/meta/registry.go +++ b/core/config/meta/registry.go @@ -226,6 +226,7 @@ func DefaultRegistry() map[string]FieldMetaOverride { Label: "Chat Template", Description: "Go template for chat completion requests", Component: "code-editor", + Language: "gotemplate", Order: 40, }, "template.chat_message": { @@ -233,6 +234,7 @@ func DefaultRegistry() map[string]FieldMetaOverride { Label: "Chat Message Template", Description: "Go template for individual chat messages", Component: "code-editor", + Language: "gotemplate", Order: 41, }, "template.completion": { @@ -240,13 +242,22 @@ func DefaultRegistry() map[string]FieldMetaOverride { Label: "Completion Template", Description: "Go template for completion requests", Component: "code-editor", + Language: "gotemplate", Order: 42, }, + "template.function": { + Section: "templates", + Label: "Functions Template", + Description: "Go template applied when tools/functions are present in the request", + Component: "code-editor", + Language: "gotemplate", + Order: 43, + }, "template.use_tokenizer_template": { Section: "templates", Label: "Use Tokenizer Template", Description: "Use the chat template from the model's tokenizer config", - Order: 43, + Order: 44, }, // Router section template — kept in the templates UI section // (rather than the router section under "other") so operators @@ -257,7 +268,8 @@ func DefaultRegistry() map[string]FieldMetaOverride { Label: "Router Classifier System Prompt", Description: "Go text/template (with sprig functions) for the routing system prompt the score classifier feeds to its classifier_model. Executed with `.Policies` ([]{Label, Description}). Empty falls back to the built-in Arch-Router-shaped prompt (route-listing block + JSON output schema). Override when the classifier model was trained on a different schema or you need the routing instructions in a different language. The candidate format scored against the model is fixed at `{\"route\": \"