chore: ⬆️ Update ggml-org/whisper.cpp to f24588a272ae8e23280d9c220536437164e6ed28 (#10078)

⬆️ Update ggml-org/whisper.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
chore: ⬆️ Update mudler/rf-detr.cpp to 65c0ffcc9a9bc9dae38252f63d0417c9845a6cf7 (#10075)
2026-06-07 08:16:53 -04:00 · 2026-05-30 01:09:52 +02:00 · 2026-05-30 00:55:41 +02:00 · 2026-05-30 00:24:34 +02:00 · 2026-05-30 00:11:57 +02:00 · 2026-05-30 00:11:41 +02:00
52 changed files with 1543 additions and 352 deletions
--- a/29
+++ b/29
@@ -1313,6 +1313,13 @@ build-ui-test-server: build-mock-backend react-ui protogen-go
 test-ui-e2e: build-ui-test-server
 	cd core/http/react-ui && npm install && npx playwright install --with-deps chromium && npx playwright test

+## Optional Playwright worker count for the test-ui and test-ui-coverage
+## targets below. Pass UI_TEST_WORKERS=N (e.g. `make test-ui-coverage
+## UI_TEST_WORKERS=20`) to override Playwright's default (cores/2). Empty or
+## whitespace-only (the default) lets Playwright pick its own worker count.
+UI_TEST_WORKERS ?=
+PLAYWRIGHT_WORKERS_FLAG = $(if $(strip $(UI_TEST_WORKERS)),--workers=$(strip $(UI_TEST_WORKERS)),)
+
 ## Fast Playwright e2e run used by the pre-commit hook on React UI changes.
 ## Force-rebuilds the (non-instrumented) dist so the suite tests the working
 ## tree — not a stale dist the `react-ui` skip-guard would leave — re-embeds
@@ -1322,22 +1329,24 @@ test-ui-e2e: build-ui-test-server
 test-ui: build-mock-backend protogen-go
 	cd core/http/react-ui && bun install && bun run build
 	$(GOCMD) build -o tests/e2e-ui/ui-test-server ./tests/e2e-ui
-	cd core/http/react-ui && sh $(CURDIR)/scripts/ensure-playwright-browser.sh && bunx playwright test
+	cd core/http/react-ui && sh $(CURDIR)/scripts/ensure-playwright-browser.sh && bunx playwright test $(PLAYWRIGHT_WORKERS_FLAG)

-## React UI code coverage from the Playwright e2e suite. Builds an
-## istanbul-instrumented bundle (COVERAGE=true), re-embeds it into the
-## ui-test-server (the dist is //go:embed'ed at compile time), runs the
-## Playwright specs — which harvest window.__coverage__ via the coverage
-## fixture — and writes an nyc report to core/http/react-ui/coverage/.
-## Removes the instrumented dist afterwards so normal builds aren't served
-## instrumented assets.
+## React UI code coverage from the Playwright e2e suite. Builds a
+## NON-instrumented bundle with source maps (COVERAGE_V8=true), re-embeds it
+## into the ui-test-server (the dist is //go:embed'ed at compile time), and runs
+## the Playwright specs, which collect native Chromium V8 coverage
+## (PW_V8_COVERAGE=1) — far cheaper than istanbul's build-time counters, ~40%
+## faster end-to-end — and convert it to istanbul format via v8-to-istanbul in
+## the coverage fixture. Then writes an nyc report to core/http/react-ui/coverage/
+## and removes the dist so normal builds aren't served source-mapped assets. (The
+## legacy istanbul path still exists: `bun run build:coverage` + unset PW_V8_COVERAGE.)
 test-ui-coverage: build-mock-backend protogen-go
 	trap 'rm -rf "$(CURDIR)/core/http/react-ui/dist"' EXIT; \
-	( cd core/http/react-ui && bun install && bun run build:coverage ) && \
+	( cd core/http/react-ui && bun install && bun run build:coverage-v8 ) && \
 	$(GOCMD) build -o tests/e2e-ui/ui-test-server ./tests/e2e-ui && \
 	( cd core/http/react-ui && rm -rf .nyc_output coverage && \
 	    sh $(CURDIR)/scripts/ensure-playwright-browser.sh && \
-	    bunx playwright test && bun run coverage:report )
+	    PW_V8_COVERAGE=1 bunx playwright test $(PLAYWRIGHT_WORKERS_FLAG) && bun run coverage:report )

 ## UI coverage baseline (committed) and the strict gate that compares against
 ## it — the React mirror of test-coverage-baseline / test-coverage-check.
--- a/backend/cpp/ds4/Makefile
+++ b/backend/cpp/ds4/Makefile
@@ -1,10 +1,10 @@
 # ds4 backend Makefile.
 #
-# Upstream pin lives below as DS4_VERSION?=e8e8779b261c10f36ad6270ba732c8f0be5b62e3
+# Upstream pin lives below as DS4_VERSION?=22393e770ea8eb7501d8718d6f66c6374004e03f
 # (.github/bump_deps.sh) can find and update it - matches the
 # llama-cpp / ik-llama-cpp / turboquant convention.

-DS4_VERSION?=e8e8779b261c10f36ad6270ba732c8f0be5b62e3
+DS4_VERSION?=22393e770ea8eb7501d8718d6f66c6374004e03f
 DS4_REPO?=https://github.com/antirez/ds4

 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=d2da6da05c73aeb658a3d1751f386c24e6963856
+IK_LLAMA_VERSION?=8960c5ba5ee9db30ba838304373aa4dbec9f7cbd
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=0d18aaa9d1a8af3df9abccd828e22eeaac7f840b
+LLAMA_VERSION?=751ebd17a58a8a513994509214373bb9e6a3d66c
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -573,8 +573,12 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
    // checkpoint_min_step: minimum spacing between context checkpoints in
    // tokens (0 disables the minimum). Match upstream's default (256). This
    // field was renamed from `checkpoint_every_nt` in llama.cpp; the semantics
-    // also shifted from a fixed cadence to a minimum spacing.
+    // also shifted from a fixed cadence to a minimum spacing. The turboquant
+    // fork branched before the field existed, so skip it on the legacy path
+    // (LOCALAI_LEGACY_LLAMA_CPP_SPEC is injected by patch-grpc-server.sh).
+#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC
    params.checkpoint_min_step = 256;
+#endif

     // decode options. Options are in form optname:optvale, or if booleans only optname.
    for (int i = 0; i < request->options_size(); i++) {
@@ -748,11 +752,18 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                params.cache_idle_slots = false;
            }

+#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC
        // --- minimum context-checkpoint spacing (upstream -cms / --checkpoint-min-step) ---
        // 0 disables the minimum-spacing gate. Old option names (`checkpoint_every_nt`,
        // `checkpoint_every_n_tokens`) are kept as aliases for backward compatibility
        // with existing user configs: upstream renamed the field and shifted its
        // semantics from a fixed cadence to a minimum spacing.
+        //
+        // Gated out for the turboquant fork, which lacks common_params::
+        // checkpoint_min_step. The leading `}` closing the cache_idle_slots
+        // branch is removed with this block; the next `} else if` (n_ubatch)
+        // then closes cache_idle_slots, so braces stay balanced under both
+        // preprocessor branches.
        } else if (!strcmp(optname, "checkpoint_min_step") || !strcmp(optname, "checkpoint_min_spacing") ||
                   !strcmp(optname, "checkpoint_every_nt") || !strcmp(optname, "checkpoint_every_n_tokens")) {
            if (optval != NULL) {
@@ -762,6 +773,7 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                    // If conversion fails, keep default value (256)
                }
            }
+#endif

        // --- physical batch size (upstream -ub / --ubatch-size) ---
        // Note: line ~482 already aliases n_ubatch to n_batch as a default; this
@@ -1165,9 +1177,15 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
            params.tensor_buft_overrides.push_back({nullptr, nullptr});
        }
    }
+    // The draft tensor_buft_overrides are only populated under the modern
+    // (post-#22838) layout, whose population code is itself gated by
+    // LOCALAI_LEGACY_LLAMA_CPP_SPEC above. The turboquant fork lacks
+    // common_params_speculative::draft entirely, so skip the sentinel there too.
+#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC
    if (!params.speculative.draft.tensor_buft_overrides.empty()) {
        params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr});
    }
+#endif

    // TODO: Add yarn

--- a/backend/cpp/turboquant/patch-grpc-server.sh
+++ b/backend/cpp/turboquant/patch-grpc-server.sh
@@ -124,8 +124,11 @@ fi
 # 5. Define LOCALAI_LEGACY_LLAMA_CPP_SPEC at the top of the file so the
 #    grpc-server option parser skips the new option-handler blocks (ngram_mod,
 #    ngram_map_k, ngram_map_k4v, ngram_cache, draft.cache_type_*, draft.cpuparams*,
-#    draft.tensor_buft_overrides) introduced for the post-#22838 layout. Those
-#    blocks reference struct fields that simply do not exist in the fork.
+#    draft.tensor_buft_overrides) introduced for the post-#22838 layout, the
+#    draft.tensor_buft_overrides sentinel termination, and the
+#    common_params::checkpoint_min_step default/option (added with the
+#    35c9b1f3 bump). Those blocks reference struct fields that simply do not
+#    exist in the fork.
 if grep -q '^#define LOCALAI_LEGACY_LLAMA_CPP_SPEC' "$SRC"; then
    echo "==> $SRC already defines LOCALAI_LEGACY_LLAMA_CPP_SPEC, skipping"
 else
--- a/backend/go/rfdetr-cpp/Makefile
+++ b/backend/go/rfdetr-cpp/Makefile
@@ -11,7 +11,7 @@ JOBS?=$(shell nproc --ignore=1)
 # build; leaving this on `master` always picks up the latest C-API surface
 # (incl. the per-detection accessor functions used by gorfdetrcpp.go).
 RFDETR_REPO?=https://github.com/mudler/rf-detr.cpp.git
-RFDETR_VERSION?=ecf64d77b09013e7e90af6a17b9ce884e7daa86c
+RFDETR_VERSION?=65c0ffcc9a9bc9dae38252f63d0417c9845a6cf7

 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=92dc7268fc4ffb0c0cc0bd52dfcefea91326e797
+STABLEDIFFUSION_GGML_VERSION?=0e4ee04488159b81d95a9ffcd983a077fd5dcb77

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=27101c01dcac1676e2b6422256233cd0f1f9ae28
+WHISPER_CPP_VERSION?=f24588a272ae8e23280d9c220536437164e6ed28
 SO_TARGET?=libgowhisper.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/python/vllm/requirements-cublas13-after.txt
+++ b/backend/python/vllm/requirements-cublas13-after.txt
@@ -3,5 +3,5 @@
 # on a cu130 host. Pull the cu130-flavoured wheel from vLLM's per-tag index
 # instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
 # so uv consults this index alongside PyPI.
--extra-index-url https://wheels.vllm.ai/0.21.0/cu130
-vllm==0.21.0
+--extra-index-url https://wheels.vllm.ai/0.22.0/cu130
+vllm==0.22.0
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -90,6 +90,8 @@ type Application struct {
 	// LocalAI Assistant in-process MCP server. nil when DisableLocalAIAssistant
 	// is set; otherwise initialised in start() after galleryService.
 	localAIAssistant *mcpTools.LocalAIAssistantHolder
+
+	shutdownOnce sync.Once
 }

 func newApplication(appConfig *config.ApplicationConfig) *Application {
@@ -320,6 +322,24 @@ func (a *Application) IsDistributed() bool {
 	return a.distributed != nil
 }

+// Shutdown stops backend gRPC processes and distributed services
+// synchronously on the caller's stack. The context-cancel goroutine wired
+// in New does the same work asynchronously, which races test-binary exit
+// and CLI shutdown — orphaning spawned mock-backend / llama.cpp / etc.
+// children to init. Callers that need a guarantee that cleanup has
+// finished before they proceed (AfterSuite/AfterEach, signal handlers)
+// must call this. Safe to call multiple times.
+func (a *Application) Shutdown() error {
+	var err error
+	a.shutdownOnce.Do(func() {
+		a.distributed.Shutdown()
+		if a.modelLoader != nil {
+			err = a.modelLoader.StopAllGRPC()
+		}
+	})
+	return err
+}
+
 // waitForHealthyWorker blocks until at least one healthy backend worker is registered.
 // This prevents the agent pool from failing during startup when workers haven't connected yet.
 func (a *Application) waitForHealthyWorker() {
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -449,13 +449,15 @@ func New(opts ...config.AppOption) (*Application, error) {

 	application.ModelLoader().SetBackendLoggingEnabled(options.EnableBackendLogging)

-	// turn off any process that was started by GRPC if the context is canceled
+	// Safety-net cleanup if the application context is cancelled without
+	// the caller invoking Shutdown directly. This is fire-and-forget — it
+	// races binary exit and is unreliable in tests. The deterministic path
+	// is calling application.Shutdown() directly; its sync.Once makes the
+	// call from this goroutine a no-op when that has already happened.
 	go func() {
 		<-options.Context.Done()
 		xlog.Debug("Context canceled, shutting down")
-		application.distributed.Shutdown()
-		err := application.ModelLoader().StopAllGRPC()
-		if err != nil {
+		if err := application.Shutdown(); err != nil {
 			xlog.Error("error while stopping all grpc backends", "error", err)
 		}
 	}()
--- a/core/backend/options_internal_test.go
+++ b/core/backend/options_internal_test.go
@@ -4,6 +4,7 @@ import (
 	"encoding/json"

 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/reasoning"

 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
@@ -42,3 +43,35 @@ var _ = Describe("grpcModelOpts EngineArgs", func() {
 		Expect(opts.EngineArgs).To(BeEmpty())
 	})
 })
+
+// Pins the DisableReasoning -> enable_thinking metadata translation that the
+// per-request reasoning_effort feature (issue #10072) depends on: the request
+// merge only sets ReasoningConfig.DisableReasoning, and gRPCPredictOpts turns
+// it into the gRPC PredictOptions.Metadata entry read by the backend.
+var _ = Describe("gRPCPredictOpts enable_thinking metadata", func() {
+	// reasoningCfg returns a fully-defaulted config (gRPCPredictOpts
+	// dereferences many pointer fields) with only the reasoning toggle set.
+	reasoningCfg := func(disableReasoning *bool) config.ModelConfig {
+		modelCfg := config.ModelConfig{}
+		modelCfg.SetDefaults()
+		modelCfg.ReasoningConfig = reasoning.Config{DisableReasoning: disableReasoning}
+		return modelCfg
+	}
+	reasoningOn := false // value for DisableReasoning when thinking stays on
+	reasoningOff := true // value for DisableReasoning when thinking is turned off
+
+	It("emits enable_thinking=false when reasoning is disabled", func() {
+		predictOpts := gRPCPredictOpts(reasoningCfg(&reasoningOff), "/tmp/models")
+		Expect(predictOpts.Metadata).To(HaveKeyWithValue("enable_thinking", "false"))
+	})
+
+	It("emits enable_thinking=true when reasoning is enabled", func() {
+		predictOpts := gRPCPredictOpts(reasoningCfg(&reasoningOn), "/tmp/models")
+		Expect(predictOpts.Metadata).To(HaveKeyWithValue("enable_thinking", "true"))
+	})
+
+	It("omits enable_thinking when reasoning is unset", func() {
+		predictOpts := gRPCPredictOpts(reasoningCfg(nil), "/tmp/models")
+		Expect(predictOpts.Metadata).ToNot(HaveKey("enable_thinking"))
+	})
+})
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -577,12 +577,8 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 	}

 	signals.RegisterGracefulTerminationHandler(func() {
-		if err := app.ModelLoader().StopAllGRPC(); err != nil {
-			xlog.Error("error while stopping all grpc backends", "error", err)
-		}
-		// Clean up distributed services (idempotent — safe if already called)
-		if d := app.Distributed(); d != nil {
-			d.Shutdown()
+		if err := app.Shutdown(); err != nil {
+			xlog.Error("error while shutting down application", "error", err)
 		}
 	})

--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -732,6 +732,17 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Proxy.Mode = ProxyModePassthrough
 	}

+	// When templating is delegated to the backend (use_tokenizer_template),
+	// the backend also owns tool-call grammar generation and parsing. Sending
+	// a LocalAI-generated grammar alongside overrides the backend's native
+	// (name-first) tool pipeline and makes it stream the tool-call JSON back as
+	// plain content (issue #10052). The GGUF auto-import path already couples
+	// these two flags; enforce it here so gallery and hand-written configs that
+	// set use_tokenizer_template directly stay consistent.
+	if cfg.TemplateConfig.UseTokenizerTemplate {
+		cfg.FunctionsConfig.GrammarConfig.NoGrammar = true
+	}
+
 	// Apply model-family-specific inference defaults before generic fallbacks.
 	// This ensures gallery-installed and runtime-loaded models get optimal parameters.
 	ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -471,4 +471,33 @@ concurrency_groups:
 			Expect(configs[0].GetConcurrencyGroups()).To(Equal([]string{"vram-heavy", "120b"}))
 		})
 	})
+
+	// When templating is delegated to the backend (use_tokenizer_template),
+	// the backend also owns tool-call grammar generation and parsing. A
+	// LocalAI-generated grammar sent alongside would override the backend's
+	// native (name-first) tool pipeline and make it stream the tool-call JSON
+	// back as plain content (issue #10052). SetDefaults must therefore couple
+	// the two: tokenizer template implies grammar generation is disabled.
+	Context("use_tokenizer_template couples with grammar disable (issue #10052)", func() {
+		It("disables Go grammar generation when the tokenizer template is used", func() {
+			cfg := &ModelConfig{
+				TemplateConfig: TemplateConfig{UseTokenizerTemplate: true},
+			}
+			Expect(cfg.FunctionsConfig.GrammarConfig.NoGrammar).To(BeFalse())
+
+			cfg.SetDefaults()
+
+			Expect(cfg.FunctionsConfig.GrammarConfig.NoGrammar).To(BeTrue(),
+				"use_tokenizer_template must imply grammar.disable so tools go to the backend's native pipeline")
+		})
+
+		It("leaves grammar generation enabled when the tokenizer template is not used", func() {
+			cfg := &ModelConfig{}
+
+			cfg.SetDefaults()
+
+			Expect(cfg.FunctionsConfig.GrammarConfig.NoGrammar).To(BeFalse(),
+				"models that template in Go still rely on the Go-generated grammar")
+		})
+	})
 })
--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -308,6 +308,11 @@ var _ = Describe("API test", func() {
 	var cancel context.CancelFunc
 	var tmpdir string
 	var modelDir string
+	// localAIApp captures the Application so AfterEach can synchronously
+	// stop the spawned gRPC backend processes. application.New cancels
+	// them asynchronously on context cancel, which races with test-binary
+	// exit and leaks mock-backend children to init.
+	var localAIApp *application.Application

 	commonOpts := []config.AppOption{
 		config.WithDebug(true),
@@ -736,14 +741,14 @@ parameters:
 			)
 			Expect(err).ToNot(HaveOccurred())

-			application, err := application.New(
+			localAIApp, err = application.New(
 				append(commonOpts,
 					config.WithContext(c),
 					config.WithSystemState(systemState),
 				)...)
 			Expect(err).ToNot(HaveOccurred())
-			application.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)
-			app, err = API(application)
+			localAIApp.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)
+			app, err = API(localAIApp)
 			Expect(err).ToNot(HaveOccurred())
 			go func() {
 				if err := app.Start("127.0.0.1:9090"); err != nil && err != http.ErrServerClosed {
@@ -765,6 +770,11 @@ parameters:
 			}, "2m").ShouldNot(HaveOccurred())
 		})
 		AfterEach(func() {
+			// Synchronous shutdown — context-cancel cleanup is async and races
+			// test-binary exit, orphaning mock-backend children to init.
+			if localAIApp != nil {
+				_ = localAIApp.Shutdown()
+			}
 			cancel()
 			if app != nil {
 				ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
@@ -976,15 +986,15 @@ parameters:
 			)
 			Expect(err).ToNot(HaveOccurred())

-			application, err := application.New(
+			localAIApp, err = application.New(
 				append(commonOpts,
 					config.WithContext(c),
 					config.WithSystemState(systemState),
 					config.WithConfigFile(configFile))...,
 			)
 			Expect(err).ToNot(HaveOccurred())
-			application.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)
-			app, err = API(application)
+			localAIApp.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)
+			app, err = API(localAIApp)
 			Expect(err).ToNot(HaveOccurred())

 			go func() {
@@ -1005,6 +1015,11 @@ parameters:
 			}, "2m").ShouldNot(HaveOccurred())
 		})
 		AfterEach(func() {
+			// Synchronous shutdown — context-cancel cleanup is async and races
+			// test-binary exit, orphaning mock-backend children to init.
+			if localAIApp != nil {
+				_ = localAIApp.Shutdown()
+			}
 			cancel()
 			if app != nil {
 				ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
--- a/core/http/endpoints/openai/chat_stream_workers.go
+++ b/core/http/endpoints/openai/chat_stream_workers.go
@@ -341,6 +341,19 @@ func processStreamWithTools(
 			}
 		}

+		// Issue #9722: when the C++ autoparser is already producing tool
+		// calls (it delivers them via ChatDeltas, which are flushed at
+		// end-of-stream by ToolCallsFromChatDeltas -> buildDeferredToolCallChunks),
+		// skip the Go-side iterative parser below. Running both parsers makes
+		// the same logical tool call surface at multiple `index` values.
+		// The deferred flush is guarded by lastEmittedCount, so the race where
+		// the Go parser already emitted before this flag flipped also stays
+		// single-emission. Backends without an autoparser (e.g. vLLM) keep
+		// hasChatDeltaToolCalls=false and are unaffected.
+		if hasChatDeltaToolCalls {
+			return true
+		}
+
 		// Try incremental XML parsing for streaming support using iterative parser
 		// This allows emitting partial tool calls as they're being generated
 		cleanedResult := functions.CleanupLLMResult(result, cfg.FunctionsConfig)
--- a/core/http/endpoints/openresponses/responses.go
+++ b/core/http/endpoints/openresponses/responses.go
@@ -95,7 +95,7 @@ func ResponsesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eval

 		// Add instructions as system message if provided
 		if input.Instructions != "" {
-			messages = append([]schema.Message{{Role: "system", StringContent: input.Instructions}}, messages...)
+			messages = append([]schema.Message{{Role: "system", Content: input.Instructions, StringContent: input.Instructions}}, messages...)
 		}

 		// Handle tools
@@ -299,7 +299,7 @@ func convertORInputToMessages(input any, cfg *config.ModelConfig) ([]schema.Mess
 	switch v := input.(type) {
 	case string:
 		// Simple string = user message
-		return []schema.Message{{Role: "user", StringContent: v}}, nil
+		return []schema.Message{{Role: "user", Content: v, StringContent: v}}, nil
 	case []any:
 		// Array of items
 		for _, itemRaw := range v {
@@ -309,6 +309,16 @@ func convertORInputToMessages(input any, cfg *config.ModelConfig) ([]schema.Mess
 			}

 			itemType, _ := itemMap["type"].(string)
+			// OpenAI SDK helpers (e.g. client.responses.create(input=[{"role":...,"content":...}]))
+			// send message items without a "type" discriminator. Treat a bare {role, content}
+			// object as type:"message" so the chat-completions and responses paths agree.
+			if itemType == "" {
+				if _, hasRole := itemMap["role"].(string); hasRole {
+					if _, hasContent := itemMap["content"]; hasContent {
+						itemType = "message"
+					}
+				}
+			}
 			switch itemType {
 			case "message":
 				msg, err := convertORMessageItem(itemMap, cfg)
--- a/core/http/endpoints/openresponses/responses_convert_test.go
+++ b/core/http/endpoints/openresponses/responses_convert_test.go
@@ -0,0 +1,62 @@
+package openresponses
+
+import (
+	"github.com/mudler/LocalAI/core/config"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Regression for mudler/LocalAI#10039. convertORInputToMessages must populate
+// both Content and StringContent: the templating fallback path reads
+// StringContent, while the UseTokenizerTemplate path serialises Content via
+// Messages.ToProto(). Leaving Content nil produced an empty prompt on any model
+// without a Go-side template.chat_message block (the default for imported GGUFs).
+var _ = Describe("convertORInputToMessages", func() {
+	cfg := &config.ModelConfig{}
+
+	It("populates both Content and StringContent for plain string input", func() {
+		msgs, err := convertORInputToMessages("Hello", cfg)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(msgs).To(HaveLen(1))
+		Expect(msgs[0].Role).To(Equal("user"))
+		Expect(msgs[0].StringContent).To(Equal("Hello"))
+		Expect(msgs[0].Content).To(Equal("Hello"))
+	})
+
+	It("accepts a bare {role, content} item without a type discriminator", func() {
+		// The OpenAI Python SDK helper client.responses.create(input=[{...}])
+		// sends message items with no "type" field. They must not be dropped.
+		input := []any{
+			map[string]any{"role": "user", "content": "Hi there"},
+		}
+		msgs, err := convertORInputToMessages(input, cfg)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(msgs).To(HaveLen(1))
+		Expect(msgs[0].Role).To(Equal("user"))
+		Expect(msgs[0].StringContent).To(Equal("Hi there"))
+		Expect(msgs[0].Content).To(Equal("Hi there"))
+	})
+
+	It("still populates both fields for an explicit type:message item", func() {
+		input := []any{
+			map[string]any{"type": "message", "role": "user", "content": "Typed"},
+		}
+		msgs, err := convertORInputToMessages(input, cfg)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(msgs).To(HaveLen(1))
+		Expect(msgs[0].StringContent).To(Equal("Typed"))
+		Expect(msgs[0].Content).To(Equal("Typed"))
+	})
+
+	It("does not treat a non-message item (no content key) as a message", func() {
+		// An item with neither a known type nor a {role, content} shape must
+		// keep falling through unchanged — no behaviour change for such inputs.
+		input := []any{
+			map[string]any{"role": "user"},
+		}
+		msgs, err := convertORInputToMessages(input, cfg)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(msgs).To(BeEmpty())
+	})
+})
--- a/core/http/middleware/request.go
+++ b/core/http/middleware/request.go
@@ -310,6 +310,26 @@ func mergeOpenAIRequestAndModelConfig(config *config.ModelConfig, input *schema.
 		config.Temperature = input.Temperature
 	}

+	// Map the per-request reasoning_effort onto the reasoning toggle the
+	// backend reads (enable_thinking metadata, set in gRPCPredictOpts).
+	// "none" disables thinking for this request - the use case from #10072,
+	// running a single Qwen3-style model and turning reasoning off per
+	// request. Any explicit effort level enables thinking, UNLESS the model
+	// config explicitly disabled it (DisableReasoning==true wins): an
+	// operator who deliberately turned reasoning off should not be overridden
+	// by a request. A value of "none" always disables, since that never
+	// conflicts with a config that also disables.
+	switch strings.ToLower(input.ReasoningEffort) {
+	case "none":
+		disable := true
+		config.ReasoningConfig.DisableReasoning = &disable
+	case "minimal", "low", "medium", "high":
+		if config.ReasoningConfig.DisableReasoning == nil || !*config.ReasoningConfig.DisableReasoning {
+			enable := false
+			config.ReasoningConfig.DisableReasoning = &enable
+		}
+	}
+
 	// Collapse the modern max_completion_tokens alias into the
 	// legacy Maxtokens field so downstream code reads exactly one.
 	// MaxCompletionTokens wins on conflict — it's the canonical
--- a/core/http/middleware/request_test.go
+++ b/core/http/middleware/request_test.go
@@ -597,3 +597,137 @@ var _ = Describe("SetModelAndConfig tool_choice parsing (chat completions)", fun
 		})
 	})
 })
+
+// These tests cover the per-request reasoning_effort -> enable_thinking mapping.
+// The merge lives in mergeOpenAIRequestAndModelConfig (called from
+// SetOpenAIRequest), so they drive the full middleware chain like the
+// production /v1/chat/completions route does. The block builds its own app per
+// test so the model config can be varied (some cases need reasoning.disable set
+// in the model YAML to assert that an explicit config disable wins).
+//
+// Mapping under test (issue #10072):
+//   - reasoning_effort=none                    -> DisableReasoning=true
+//   - reasoning_effort=minimal/low/medium/high -> DisableReasoning=false, UNLESS
+//     the model config explicitly set DisableReasoning=true
+//   - empty / unrecognized                     -> no change
+var _ = Describe("SetModelAndConfig reasoning_effort parsing (chat completions)", func() {
+	var modelDir string
+
+	BeforeEach(func() {
+		var err error
+		modelDir, err = os.MkdirTemp("", "localai-test-models-*")
+		Expect(err).ToNot(HaveOccurred())
+	})
+
+	AfterEach(func() {
+		_ = os.RemoveAll(modelDir)
+	})
+
+	// buildApp writes a model config with the given YAML body and returns an app
+	// plus a pointer to the captured per-request config.
+	buildApp := func(cfgYAML string) (*echo.Echo, **config.ModelConfig) {
+		Expect(os.WriteFile(filepath.Join(modelDir, "test-model.yaml"), []byte(cfgYAML), 0644)).To(Succeed())
+
+		ss := &system.SystemState{Model: system.Model{ModelsPath: modelDir}}
+		appConfig := config.NewApplicationConfig()
+		appConfig.SystemState = ss
+		mcl := config.NewModelConfigLoader(modelDir)
+		ml := model.NewModelLoader(ss)
+		re := NewRequestExtractor(mcl, ml, appConfig)
+
+		captured := new(*config.ModelConfig)
+		app := echo.New()
+		app.POST("/v1/chat/completions",
+			func(c echo.Context) error {
+				if cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig); ok {
+					*captured = cfg
+				}
+				return c.String(http.StatusOK, "ok")
+			},
+			re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
+			func(next echo.HandlerFunc) echo.HandlerFunc {
+				return func(c echo.Context) error {
+					if err := re.SetOpenAIRequest(c); err != nil {
+						return err
+					}
+					return next(c)
+				}
+			},
+		)
+		return app, captured
+	}
+
+	chatReq := func(effort string) string {
+		return `{"model":"test-model",` +
+			`"messages":[{"role":"user","content":"hi"}],` +
+			`"reasoning_effort":` + effort + `}`
+	}
+
+	plainCfg := "name: test-model\nbackend: llama-cpp\n"
+
+	It("disables thinking for reasoning_effort=none", func() {
+		app, captured := buildApp(plainCfg)
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"none"`))
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeTrue())
+	})
+
+	It("enables thinking for reasoning_effort=high when config is unset", func() {
+		app, captured := buildApp(plainCfg)
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"high"`))
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeFalse())
+	})
+
+	It("enables thinking for reasoning_effort=high when config explicitly set false", func() {
+		app, captured := buildApp(plainCfg + "reasoning:\n  disable: false\n")
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"high"`))
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeFalse())
+	})
+
+	It("config wins: reasoning_effort=high cannot re-enable when config explicitly disabled", func() {
+		app, captured := buildApp(plainCfg + "reasoning:\n  disable: true\n")
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"high"`))
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeTrue())
+	})
+
+	It("is a no-op when reasoning_effort is empty", func() {
+		app, captured := buildApp(plainCfg)
+		rec := postJSON(app, "/v1/chat/completions",
+			`{"model":"test-model","messages":[{"role":"user","content":"hi"}]}`)
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).To(BeNil())
+	})
+
+	It("is case-insensitive (None disables, HIGH enables)", func() {
+		app, captured := buildApp(plainCfg)
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"None"`))
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeTrue())
+
+		app2, captured2 := buildApp(plainCfg)
+		rec2 := postJSON(app2, "/v1/chat/completions", chatReq(`"HIGH"`))
+		Expect(rec2.Code).To(Equal(http.StatusOK))
+		Expect(*captured2).ToNot(BeNil())
+		Expect((*captured2).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured2).ReasoningConfig.DisableReasoning).To(BeFalse())
+	})
+})
--- a/core/http/react-ui/bun.lock
+++ b/core/http/react-ui/bun.lock
@@ -32,6 +32,7 @@
        "yaml": "^2.8.3",
      },
      "devDependencies": {
+        "@bcoe/v8-coverage": "^1.0.2",
        "@eslint/js": "^9.27.0",
        "@playwright/test": "1.58.2",
        "@vitejs/plugin-react": "^6.0.2",
@@ -41,6 +42,7 @@
        "globals": "^16.1.0",
        "i18next-parser": "^9.4.0",
        "nyc": "^18.0.0",
+        "v8-to-istanbul": "^9.3.0",
        "vite": "^8.0.14",
        "vite-plugin-istanbul": "^9.0.0",
      },
@@ -81,6 +83,8 @@

    "@babel/types": ["@babel/types@7.29.0", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.28.5" } }, "sha512-LwdZHpScM4Qz8Xw2iKSzS+cfglZzJGvofQICy7W7v4caru4EaAmyUuO6BGrbyQ2mYV11W0U8j5mBhd14dd3B0A=="],

+    "@bcoe/v8-coverage": ["@bcoe/v8-coverage@1.0.2", "", {}, "sha512-6zABk/ECA/QYSCQ1NGiVwwbQerUCZ+TQbp64Q3AgmfNvurHH0j8TtXa1qbShXA6qqkpAj4V5W8pP6mLe1mcMqA=="],
+
    "@codemirror/autocomplete": ["@codemirror/autocomplete@6.20.1", "", { "dependencies": { "@codemirror/language": "^6.0.0", "@codemirror/state": "^6.0.0", "@codemirror/view": "^6.17.0", "@lezer/common": "^1.0.0" } }, "sha512-1cvg3Vz1dSSToCNlJfRA2WSI4ht3K+WplO0UMOgmUYPivCyy2oueZY6Lx7M9wThm7SDUBViRmuT+OG/i8+ON9A=="],

    "@codemirror/commands": ["@codemirror/commands@6.10.3", "", { "dependencies": { "@codemirror/language": "^6.0.0", "@codemirror/state": "^6.6.0", "@codemirror/view": "^6.27.0", "@lezer/common": "^1.1.0" } }, "sha512-JFRiqhKu+bvSkDLI+rUhJwSxQxYb759W5GBezE8Uc8mHLqC9aV/9aTC7yJSqCtB3F00pylrLCwnyS91Ap5ej4Q=="],
@@ -267,6 +271,8 @@

    "@types/estree": ["@types/estree@1.0.8", "", {}, "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w=="],

+    "@types/istanbul-lib-coverage": ["@types/istanbul-lib-coverage@2.0.6", "", {}, "sha512-2QF/t/auWm0lsy8XtKVPG19v3sSOQlJe/YHZgfjb/KBBHOGSV+J2q/S671rcq9uTBrLAXmZpqJiaQbMT+zNU1w=="],
+
    "@types/json-schema": ["@types/json-schema@7.0.15", "", {}, "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA=="],

    "@types/minimatch": ["@types/minimatch@3.0.5", "", {}, "sha512-Klz949h02Gz2uZCMGwDUSDS1YBlTdDDgbWHi+81l29tQALUtvz4rAYi5uoVhE5Lagoq6DeqAUlbrHvW/mXDgdQ=="],
@@ -983,6 +989,8 @@

    "uuid": ["uuid@8.3.2", "", { "bin": { "uuid": "dist/bin/uuid" } }, "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg=="],

+    "v8-to-istanbul": ["v8-to-istanbul@9.3.0", "", { "dependencies": { "@jridgewell/trace-mapping": "^0.3.12", "@types/istanbul-lib-coverage": "^2.0.1", "convert-source-map": "^2.0.0" } }, "sha512-kiGUalWN+rgBJ/1OHZsBtU4rXZOfj/7rKQxULKlIzwzQSvMJUUNgPwJEEh7gU6xEVxC0ahoOBvN2YI8GH6FNgA=="],
+
    "value-or-function": ["value-or-function@4.0.0", "", {}, "sha512-aeVK81SIuT6aMJfNo9Vte8Dw0/FZINGBV8BfCraGtqVxIeLAEhJyoWs8SmvRVmXfGss2PmmOwZCuBPbZR+IYWg=="],

    "vary": ["vary@1.1.2", "", {}, "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg=="],
@@ -1121,6 +1129,8 @@

    "test-exclude/minimatch": ["minimatch@10.2.5", "", { "dependencies": { "brace-expansion": "^5.0.5" } }, "sha512-MULkVLfKGYDFYejP07QOurDLLQpcjk7Fw+7jXS2R2czRQzR56yHRveU5NDJEOviH+hETZKSkIk5c+T23GjFUMg=="],

+    "v8-to-istanbul/convert-source-map": ["convert-source-map@2.0.0", "", {}, "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg=="],
+
    "vinyl-sourcemap/convert-source-map": ["convert-source-map@2.0.0", "", {}, "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg=="],

    "vite-plugin-istanbul/espree": ["espree@11.2.0", "", { "dependencies": { "acorn": "^8.16.0", "acorn-jsx": "^5.3.2", "eslint-visitor-keys": "^5.0.1" } }, "sha512-7p3DrVEIopW1B1avAGLuCSh1jubc01H2JHc8B4qqGblmg5gI9yumBgACjWo4JlIc04ufug4xJ3SQI8HkS/Rgzw=="],
--- a/core/http/react-ui/coverage-baseline.txt
+++ b/core/http/react-ui/coverage-baseline.txt
@@ -1 +1 @@
-30.66
+38.29
--- a/core/http/react-ui/e2e/coverage-fixtures.js
+++ b/core/http/react-ui/e2e/coverage-fixtures.js
@@ -15,9 +15,41 @@ import { randomUUID } from 'node:crypto'
 import path from 'node:path'

 const COVERAGE_DIR = path.resolve(process.cwd(), '.nyc_output')
+const V8_COVERAGE = process.env.PW_V8_COVERAGE === '1'
+
+const withCoverage = base.extend({
+  // Worker-scoped V8 coverage accumulator: collects every test's native
+  // Chromium coverage and converts it to istanbul ONCE at worker teardown
+  // (conversion is expensive; see e2e/v8-coverage.js). null when V8 mode is off.
+  _v8acc: [
+    async ({}, use) => {
+      if (!V8_COVERAGE) {
+        await use(null)
+        return
+      }
+      const { createAccumulator } = await import('./v8-coverage.js')
+      const acc = createAccumulator()
+      await use(acc)
+      await acc.flush()
+    },
+    { scope: 'worker' },
+  ],
+
+  page: async ({ page, _v8acc }, use) => {
+    // V8 coverage path: collect native Chromium coverage (cheap), hand it to the
+    // worker accumulator on teardown. Avoids running an instrumented bundle.
+    if (V8_COVERAGE) {
+      const { startV8 } = await import('./v8-coverage.js')
+      await startV8(page)
+      await use(page)
+      try {
+        _v8acc.add(await page.coverage.stopJSCoverage())
+      } catch {
+        // page already closed — nothing to collect
+      }
+      return
+    }

-export const test = base.extend({
-  page: async ({ page }, use) => {
    await use(page)

    let coverage
@@ -37,4 +69,5 @@ export const test = base.extend({
  },
 })

+export const test = withCoverage
 export { expect }
--- a/core/http/react-ui/e2e/models-gallery.spec.js
+++ b/core/http/react-ui/e2e/models-gallery.spec.js
@@ -1,28 +1,52 @@
-import { test, expect } from './coverage-fixtures.js'
+import { test, expect } from "./coverage-fixtures.js";

 const MOCK_MODELS_RESPONSE = {
  models: [
-    { name: 'llama-model', description: 'A llama model', backend: 'llama-cpp', installed: false, tags: ['chat'] },
-    { name: 'whisper-model', description: 'A whisper model', backend: 'whisper', installed: true, tags: ['transcript'] },
-    { name: 'stablediffusion-model', description: 'An image model', backend: 'stablediffusion', installed: false, tags: ['sd'] },
-    { name: 'unknown-model', description: 'No backend', backend: '', installed: false, tags: [] },
+    {
+      name: "llama-model",
+      description: "A llama model",
+      backend: "llama-cpp",
+      installed: false,
+      tags: ["chat"],
+    },
+    {
+      name: "whisper-model",
+      description: "A whisper model",
+      backend: "whisper",
+      installed: true,
+      tags: ["transcript"],
+    },
+    {
+      name: "stablediffusion-model",
+      description: "An image model",
+      backend: "stablediffusion",
+      installed: false,
+      tags: ["sd"],
+    },
+    {
+      name: "unknown-model",
+      description: "No backend",
+      backend: "",
+      installed: false,
+      tags: [],
+    },
  ],
-  allBackends: ['llama-cpp', 'stablediffusion', 'whisper'],
-  allTags: ['chat', 'sd', 'transcript'],
+  allBackends: ["llama-cpp", "stablediffusion", "whisper"],
+  allTags: ["chat", "sd", "transcript"],
  availableModels: 4,
  installedModels: 1,
  totalPages: 1,
  currentPage: 1,
-}
+};

 const MOCK_GPU_RESOURCES_RESPONSE = {
-  type: 'gpu',
+  type: "gpu",
  available: true,
  gpus: [
    {
      index: 0,
-      name: 'Mock GPU',
-      vendor: 'nvidia',
+      name: "Mock GPU",
+      vendor: "nvidia",
      total_vram: 12 * 1024 * 1024 * 1024,
      used_vram: 2 * 1024 * 1024 * 1024,
      free_vram: 10 * 1024 * 1024 * 1024,
@@ -36,272 +60,374 @@ const MOCK_GPU_RESOURCES_RESPONSE = {
    usage_percent: 16.7,
    gpu_count: 1,
  },
-}
+};

 const MOCK_ESTIMATES = {
-  'llama-model': {
+  "llama-model": {
    sizeBytes: 4 * 1024 * 1024 * 1024,
-    sizeDisplay: '4.00 GB',
+    sizeDisplay: "4.00 GB",
    estimates: {
-      '8192': {
+      8192: {
        vramBytes: 8 * 1024 * 1024 * 1024,
-        vramDisplay: '8.00 GB',
+        vramDisplay: "8.00 GB",
      },
    },
  },
-  'whisper-model': {
+  "whisper-model": {
    sizeBytes: 1 * 1024 * 1024 * 1024,
-    sizeDisplay: '1.00 GB',
+    sizeDisplay: "1.00 GB",
    estimates: {
-      '8192': {
+      8192: {
        vramBytes: 2 * 1024 * 1024 * 1024,
-        vramDisplay: '2.00 GB',
+        vramDisplay: "2.00 GB",
      },
    },
  },
-  'stablediffusion-model': {
+  "stablediffusion-model": {
    sizeBytes: 8 * 1024 * 1024 * 1024,
-    sizeDisplay: '8.00 GB',
+    sizeDisplay: "8.00 GB",
    estimates: {
-      '8192': {
+      8192: {
        vramBytes: 16 * 1024 * 1024 * 1024,
-        vramDisplay: '16.00 GB',
+        vramDisplay: "16.00 GB",
      },
    },
  },
-}
+};

-test.describe('Models Gallery - Backend Features', () => {
+test.describe("Models Gallery - Backend Features", () => {
  test.beforeEach(async ({ page }) => {
-    await page.route('**/api/models*', (route) => {
+    await page.route("**/api/models*", (route) => {
      route.fulfill({
-        contentType: 'application/json',
+        contentType: "application/json",
        body: JSON.stringify(MOCK_MODELS_RESPONSE),
-      })
-    })
-    await page.goto('/app/models')
+      });
+    });
+    await page.goto("/app/models");
    // Wait for the table to render
-    await expect(page.locator('th', { hasText: 'Backend' })).toBeVisible({ timeout: 10_000 })
-  })
+    await expect(page.locator("th", { hasText: "Backend" })).toBeVisible({
+      timeout: 10_000,
+    });
+  });

-  test('backend column header is visible', async ({ page }) => {
-    await expect(page.locator('th', { hasText: 'Backend' })).toBeVisible()
-  })
+  test("backend column header is visible", async ({ page }) => {
+    await expect(page.locator("th", { hasText: "Backend" })).toBeVisible();
+  });

-  test('backend badges shown in table rows', async ({ page }) => {
-    const table = page.locator('table')
-    await expect(table.locator('.badge', { hasText: 'llama-cpp' })).toBeVisible()
-    await expect(table.locator('.badge', { hasText: /^whisper$/ })).toBeVisible()
-  })
+  test("backend badges shown in table rows", async ({ page }) => {
+    const table = page.locator("table");
+    await expect(
+      table.locator(".badge", { hasText: "llama-cpp" }),
+    ).toBeVisible();
+    await expect(
+      table.locator(".badge", { hasText: /^whisper$/ }),
+    ).toBeVisible();
+  });

-  test('backend dropdown is visible', async ({ page }) => {
-    await expect(page.locator('button', { hasText: 'All Backends' })).toBeVisible()
-  })
+  test("backend dropdown is visible", async ({ page }) => {
+    await expect(
+      page.locator("button", { hasText: "All Backends" }),
+    ).toBeVisible();
+  });

-  test('clicking backend dropdown opens searchable panel', async ({ page }) => {
-    await page.locator('button', { hasText: 'All Backends' }).click()
-    await expect(page.locator('input[placeholder="Search backends..."]')).toBeVisible()
-  })
+  test("clicking backend dropdown opens searchable panel", async ({ page }) => {
+    await page.locator("button", { hasText: "All Backends" }).click();
+    await expect(
+      page.locator('input[placeholder="Search backends..."]'),
+    ).toBeVisible();
+  });

-  test('typing in search filters dropdown options', async ({ page }) => {
-    await page.locator('button', { hasText: 'All Backends' }).click()
-    const searchInput = page.locator('input[placeholder="Search backends..."]')
-    await searchInput.fill('llama')
+  test("typing in search filters dropdown options", async ({ page }) => {
+    await page.locator("button", { hasText: "All Backends" }).click();
+    const searchInput = page.locator('input[placeholder="Search backends..."]');
+    await searchInput.fill("llama");

    // llama-cpp option should be visible, whisper should not
-    const dropdown = page.locator('input[placeholder="Search backends..."]').locator('..')  .locator('..')
-    await expect(dropdown.locator('text=llama-cpp')).toBeVisible()
-    await expect(dropdown.locator('text=whisper')).not.toBeVisible()
-  })
+    const dropdown = page
+      .locator('input[placeholder="Search backends..."]')
+      .locator("..")
+      .locator("..");
+    await expect(dropdown.locator("text=llama-cpp")).toBeVisible();
+    await expect(dropdown.locator("text=whisper")).not.toBeVisible();
+  });

-  test('selecting a backend updates the dropdown label', async ({ page }) => {
-    await page.locator('button', { hasText: 'All Backends' }).click()
+  test("selecting a backend updates the dropdown label", async ({ page }) => {
+    await page.locator("button", { hasText: "All Backends" }).click();
    // Click the llama-cpp option within the dropdown (not the table badge)
-    const dropdown = page.locator('input[placeholder="Search backends..."]').locator('..').locator('..')
-    await dropdown.locator('text=llama-cpp').click()
+    const dropdown = page
+      .locator('input[placeholder="Search backends..."]')
+      .locator("..")
+      .locator("..");
+    await dropdown.locator("text=llama-cpp").click();

    // The dropdown button should now show the selected backend instead of "All Backends"
-    await expect(page.locator('button span', { hasText: 'llama-cpp' })).toBeVisible()
-  })
+    await expect(
+      page.locator("button span", { hasText: "llama-cpp" }),
+    ).toBeVisible();
+  });

-  test('expanded row shows backend in detail', async ({ page }) => {
+  test("expanded row shows backend in detail", async ({ page }) => {
    // Click the first model row to expand it
-    await page.locator('tr', { hasText: 'llama-model' }).click()
+    await page.locator("tr", { hasText: "llama-model" }).click();

    // The detail view should show Backend label and value
-    const detail = page.locator('td[colspan="8"]')
-    await expect(detail.locator('text=Backend')).toBeVisible()
-    await expect(detail.locator('text=llama-cpp')).toBeVisible()
-  })
-})
+    const detail = page.locator('td[colspan="8"]');
+    await expect(detail.locator("text=Backend")).toBeVisible();
+    await expect(detail.locator("text=llama-cpp")).toBeVisible();
+  });
+});

 const BACKEND_USECASES_MOCK = {
-  'llama-cpp': ['chat', 'embeddings', 'vision'],
-  'whisper': ['transcript'],
-  'stablediffusion': ['image'],
-}
+  "llama-cpp": ["chat", "embeddings", "vision"],
+  whisper: ["transcript"],
+  stablediffusion: ["image"],
+};

-test.describe('Models Gallery - Multi-select Filters', () => {
+const EMPTY_FILTERED_RESPONSE = {
+  ...MOCK_MODELS_RESPONSE,
+  models: [],
+  availableModels: 0,
+  totalPages: 1,
+  currentPage: 1,
+};
+
+test.describe("Models Gallery - Multi-select Filters", () => {
  test.beforeEach(async ({ page }) => {
-    await page.route('**/api/models*', (route) => {
+    await page.route("**/api/models*", (route) => {
      route.fulfill({
-        contentType: 'application/json',
+        contentType: "application/json",
        body: JSON.stringify(MOCK_MODELS_RESPONSE),
-      })
-    })
-    await page.route('**/api/backends/usecases', (route) => {
+      });
+    });
+    await page.route("**/api/backends/usecases", (route) => {
      route.fulfill({
-        contentType: 'application/json',
+        contentType: "application/json",
        body: JSON.stringify(BACKEND_USECASES_MOCK),
-      })
-    })
-    await page.goto('/app/models')
-    await expect(page.locator('th', { hasText: 'Backend' })).toBeVisible({ timeout: 10_000 })
-  })
+      });
+    });
+    await page.goto("/app/models");
+    await expect(page.locator("th", { hasText: "Backend" })).toBeVisible({
+      timeout: 10_000,
+    });
+  });

-  test('multi-select toggle: click Chat, TTS, then Chat again', async ({ page }) => {
-    const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
-    const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
+  test("multi-select toggle: click Chat, TTS, then Chat again", async ({
+    page,
+  }) => {
+    const chatBtn = page.locator(".filter-btn", { hasText: "Chat" });
+    const ttsBtn = page.locator(".filter-btn", { hasText: "TTS" });

-    await chatBtn.click()
-    await expect(chatBtn).toHaveClass(/active/)
+    await chatBtn.click();
+    await expect(chatBtn).toHaveClass(/active/);

-    await ttsBtn.click()
-    await expect(chatBtn).toHaveClass(/active/)
-    await expect(ttsBtn).toHaveClass(/active/)
+    await ttsBtn.click();
+    await expect(chatBtn).toHaveClass(/active/);
+    await expect(ttsBtn).toHaveClass(/active/);

    // Click Chat again to deselect it
-    await chatBtn.click()
-    await expect(chatBtn).not.toHaveClass(/active/)
-    await expect(ttsBtn).toHaveClass(/active/)
-  })
+    await chatBtn.click();
+    await expect(chatBtn).not.toHaveClass(/active/);
+    await expect(ttsBtn).toHaveClass(/active/);
+  });

  test('"All" clears selection', async ({ page }) => {
-    const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
-    const allBtn = page.locator('.filter-btn', { hasText: 'All' })
+    const chatBtn = page.locator(".filter-btn", { hasText: "Chat" });
+    const allBtn = page.locator(".filter-btn", { hasText: "All" });

-    await chatBtn.click()
-    await expect(chatBtn).toHaveClass(/active/)
+    await chatBtn.click();
+    await expect(chatBtn).toHaveClass(/active/);

-    await allBtn.click()
-    await expect(allBtn).toHaveClass(/active/)
-    await expect(chatBtn).not.toHaveClass(/active/)
-  })
+    await allBtn.click();
+    await expect(allBtn).toHaveClass(/active/);
+    await expect(chatBtn).not.toHaveClass(/active/);
+  });

-  test('query param sent correctly with multiple filters', async ({ page }) => {
-    const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
-    const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
+  test("query param sent correctly with multiple filters", async ({ page }) => {
+    const chatBtn = page.locator(".filter-btn", { hasText: "Chat" });
+    const ttsBtn = page.locator(".filter-btn", { hasText: "TTS" });

     // Click Chat and wait for its request to settle
-    await chatBtn.click()
-    await page.waitForResponse(resp => resp.url().includes('/api/models'))
+    const chatSettled = page.waitForResponse((resp) => resp.url().includes("/api/models")); // armed before the click: a mocked reply can beat a listener registered afterwards
+    await Promise.all([chatSettled, chatBtn.click()]);

     // Now click TTS and capture the resulting request
     const [request] = await Promise.all([
-      page.waitForRequest(req => {
-        if (!req.url().includes('/api/models')) return false
-        const u = new URL(req.url())
-        const tag = u.searchParams.get('tag')
-        return tag && tag.split(',').length >= 2
+      page.waitForRequest((req) => {
+        if (!req.url().includes("/api/models")) return false;
+        const u = new URL(req.url());
+        const tag = u.searchParams.get("tag");
+        return tag && tag.split(",").length >= 2;
       }),
       ttsBtn.click(),
-    ])
+    ]);

-    const url = new URL(request.url())
-    const tags = url.searchParams.get('tag').split(',').sort()
-    expect(tags).toEqual(['chat', 'tts'])
-  })
+    const url = new URL(request.url());
+    const tags = url.searchParams.get("tag").split(",").sort();
+    expect(tags).toEqual(["chat", "tts"]);
+  });

-  test('backend greys out unavailable filters', async ({ page }) => {
+  test("backend greys out unavailable filters", async ({ page }) => {
    // Select llama-cpp backend via dropdown
-    await page.locator('button', { hasText: 'All Backends' }).click()
-    const dropdown = page.locator('input[placeholder="Search backends..."]').locator('..').locator('..')
-    await dropdown.locator('text=llama-cpp').click()
+    await page.locator("button", { hasText: "All Backends" }).click();
+    const dropdown = page
+      .locator('input[placeholder="Search backends..."]')
+      .locator("..")
+      .locator("..");
+    await dropdown.locator("text=llama-cpp").click();

    // Wait for filter state to update
-    const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
-    const sttBtn = page.locator('.filter-btn', { hasText: 'STT' })
-    const imageBtn = page.locator('.filter-btn', { hasText: 'Image' })
+    const ttsBtn = page.locator(".filter-btn", { hasText: "TTS" });
+    const sttBtn = page.locator(".filter-btn", { hasText: "STT" });
+    const imageBtn = page.locator(".filter-btn", { hasText: "Image" });

    // TTS, STT, Image should be disabled for llama-cpp
-    await expect(ttsBtn).toBeDisabled()
-    await expect(sttBtn).toBeDisabled()
-    await expect(imageBtn).toBeDisabled()
+    await expect(ttsBtn).toBeDisabled();
+    await expect(sttBtn).toBeDisabled();
+    await expect(imageBtn).toBeDisabled();

    // Chat, Embeddings, Vision should remain enabled
-    const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
-    const embBtn = page.locator('.filter-btn', { hasText: 'Embeddings' })
-    const visBtn = page.locator('.filter-btn', { hasText: 'Vision' })
-    await expect(chatBtn).toBeEnabled()
-    await expect(embBtn).toBeEnabled()
-    await expect(visBtn).toBeEnabled()
-  })
+    const chatBtn = page.locator(".filter-btn", { hasText: "Chat" });
+    const embBtn = page.locator(".filter-btn", { hasText: "Embeddings" });
+    const visBtn = page.locator(".filter-btn", { hasText: "Vision" });
+    await expect(chatBtn).toBeEnabled();
+    await expect(embBtn).toBeEnabled();
+    await expect(visBtn).toBeEnabled();
+  });

-  test('backend clears incompatible filters', async ({ page }) => {
+  test("backend clears incompatible filters", async ({ page }) => {
    // Select TTS filter first
-    const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
-    await ttsBtn.click()
-    await expect(ttsBtn).toHaveClass(/active/)
+    const ttsBtn = page.locator(".filter-btn", { hasText: "TTS" });
+    await ttsBtn.click();
+    await expect(ttsBtn).toHaveClass(/active/);

    // Now select llama-cpp backend (which doesn't support TTS)
-    await page.locator('button', { hasText: 'All Backends' }).click()
-    const dropdown = page.locator('input[placeholder="Search backends..."]').locator('..').locator('..')
-    await dropdown.locator('text=llama-cpp').click()
+    await page.locator("button", { hasText: "All Backends" }).click();
+    const dropdown = page
+      .locator('input[placeholder="Search backends..."]')
+      .locator("..")
+      .locator("..");
+    await dropdown.locator("text=llama-cpp").click();

    // TTS should be auto-removed from selection
-    await expect(ttsBtn).not.toHaveClass(/active/)
-  })
-})
+    await expect(ttsBtn).not.toHaveClass(/active/);
+  });
+});

-test.describe('Models Gallery - Fits In GPU Filter', () => {
+test.describe("Models Gallery - Fits In GPU Filter", () => {
  test.beforeEach(async ({ page }) => {
-    await page.route('**/api/models*', (route) => {
+    await page.route("**/api/models*", (route) => {
      route.fulfill({
-        contentType: 'application/json',
+        contentType: "application/json",
        body: JSON.stringify(MOCK_MODELS_RESPONSE),
-      })
-    })
+      });
+    });

-    await page.route('**/api/resources', (route) => {
+    await page.route("**/api/resources", (route) => {
      route.fulfill({
-        contentType: 'application/json',
+        contentType: "application/json",
        body: JSON.stringify(MOCK_GPU_RESOURCES_RESPONSE),
-      })
-    })
+      });
+    });

-    await page.route('**/api/models/estimate/*', (route) => {
-      const url = new URL(route.request().url())
-      const id = decodeURIComponent(url.pathname.split('/').pop() || '')
+    await page.route("**/api/models/estimate/*", (route) => {
+      const url = new URL(route.request().url());
+      const id = decodeURIComponent(url.pathname.split("/").pop() || "");
      route.fulfill({
-        contentType: 'application/json',
+        contentType: "application/json",
        body: JSON.stringify(MOCK_ESTIMATES[id] || {}),
-      })
-    })
+      });
+    });

-    await page.goto('/app/models')
-    await expect(page.locator('th', { hasText: 'Backend' })).toBeVisible({ timeout: 10_000 })
-  })
+    await page.goto("/app/models");
+    await expect(page.locator("th", { hasText: "Backend" })).toBeVisible({
+      timeout: 10_000,
+    });
+  });

-  test('fits toggle is visible when GPU resources are available', async ({ page }) => {
-    await expect(page.getByText('Fits in GPU')).toBeVisible()
-  })
+  test("fits toggle is visible when GPU resources are available", async ({
+    page,
+  }) => {
+    await expect(page.getByText("Fits in GPU")).toBeVisible();
+  });

-  test('enabling fits filter hides models that exceed available VRAM', async ({ page }) => {
-    await expect(page.locator('tr', { hasText: 'stablediffusion-model' })).toBeVisible()
+  test("enabling fits filter hides models that exceed available VRAM", async ({
+    page,
+  }) => {
+    await expect(
+      page.locator("tr", { hasText: "stablediffusion-model" }),
+    ).toBeVisible();

    // The shared <Toggle> visually hides its native input (opacity:0;w:0;h:0),
    // so .check() can't interact with it directly — click the visible track.
-    await page.locator('label.filter-bar-group__toggle', { hasText: 'Fits in GPU' }).locator('.toggle__track').click()
+    await page
+      .locator("label.filter-bar-group__toggle", { hasText: "Fits in GPU" })
+      .locator(".toggle__track")
+      .click();

-    await expect(page.locator('tr', { hasText: 'stablediffusion-model' })).toHaveCount(0)
-    await expect(page.locator('tr', { hasText: 'llama-model' })).toBeVisible()
+    await expect(
+      page.locator("tr", { hasText: "stablediffusion-model" }),
+    ).toHaveCount(0);
+    await expect(page.locator("tr", { hasText: "llama-model" })).toBeVisible();
    // Unknown estimate stays visible until an explicit non-fit verdict exists.
-    await expect(page.locator('tr', { hasText: 'unknown-model' })).toBeVisible()
-  })
+    await expect(
+      page.locator("tr", { hasText: "unknown-model" }),
+    ).toBeVisible();
+  });

-  test('fits filter state persists after reload', async ({ page }) => {
-    await page.locator('label.filter-bar-group__toggle', { hasText: 'Fits in GPU' }).locator('.toggle__track').click()
-    await page.reload()
-    await expect(page.getByLabel('Fits in GPU')).toBeChecked()
-  })
-})
+  test("fits filter state persists after reload", async ({ page }) => {
+    await page
+      .locator("label.filter-bar-group__toggle", { hasText: "Fits in GPU" })
+      .locator(".toggle__track")
+      .click();
+    await page.reload();
+    await expect(page.getByLabel("Fits in GPU")).toBeChecked();
+  });
+});
+
+test.describe("Models Gallery - Empty State", () => {
+  test.beforeEach(async ({ page }) => {
+    await page.route("**/api/models*", (route) => {
+      const url = new URL(route.request().url());
+      const tag = url.searchParams.get("tag");
+      const body =
+        tag === "chat" ? EMPTY_FILTERED_RESPONSE : MOCK_MODELS_RESPONSE;
+
+      route.fulfill({
+        contentType: "application/json",
+        body: JSON.stringify(body),
+      });
+    });
+
+    await page.goto("/app/models");
+    await expect(page.locator("th", { hasText: "Backend" })).toBeVisible({
+      timeout: 10_000,
+    });
+  });
+
+  test("shows empty state for filtered-out results and clear filters restores the gallery", async ({
+    page,
+  }) => {
+    const chatBtn = page.locator(".filter-btn", { hasText: "Chat" });
+    const allBtn = page.locator(".filter-btn", { hasText: "All" });
+
+    await chatBtn.click();
+
+    await expect(page.locator(".empty-state-title")).toHaveText(
+      "No models found",
+    );
+    await expect(page.locator(".empty-state-text")).toHaveText(
+      "No models match your current search or filters.",
+    );
+
+    const clearBtn = page.getByRole("button", { name: "Clear filters" });
+    await expect(clearBtn).toBeVisible();
+    await expect(page.locator("tr", { hasText: "llama-model" })).toHaveCount(0);
+
+    await clearBtn.click();
+
+    await expect(allBtn).toHaveClass(/active/);
+    await expect(chatBtn).not.toHaveClass(/active/);
+    await expect(page.locator(".empty-state")).toHaveCount(0);
+    await expect(page.locator("tr", { hasText: "llama-model" })).toBeVisible();
+  });
+});
--- a/core/http/react-ui/e2e/v8-coverage.js
+++ b/core/http/react-ui/e2e/v8-coverage.js
@@ -0,0 +1,104 @@
+// V8 -> istanbul coverage harvest for the Playwright suite.
+//
+// When PW_V8_COVERAGE=1 the suite runs against a NON-instrumented build (built
+// with COVERAGE_V8=true, which only adds source maps). Chromium collects native
+// V8 coverage with near-zero runtime overhead; we convert it back to per-source
+// istanbul data via v8-to-istanbul (using the on-disk source maps), filter to
+// src/**, and write the same .nyc_output/*.json the istanbul path produced — so
+// `nyc report` and the strict baseline gate are unchanged.
+//
+// Conversion (v8-to-istanbul load() parses the large bundle source map) is the
+// expensive part, so we do NOT convert per test. Instead each worker collects
+// raw V8 coverage from every test, merges it with @bcoe/v8-coverage (which sums
+// counts and reconciles overlapping ranges correctly — applyCoverage can't be
+// called repeatedly, it pushes/overwrites), and converts ONCE at worker
+// teardown. That cuts conversions from ~152 (per test) to ~1 per worker.
+import v8toIstanbul from 'v8-to-istanbul'
+import libCoverage from 'istanbul-lib-coverage'
+import { mergeProcessCovs } from '@bcoe/v8-coverage'
+import { mkdirSync, writeFileSync, existsSync } from 'node:fs'
+import { randomUUID } from 'node:crypto'
+import path from 'node:path'
+
+const COVERAGE_DIR = path.resolve(process.cwd(), '.nyc_output')
+const DIST_ASSETS = path.resolve(process.cwd(), 'dist', 'assets')
+// Absolute app source dir. Match on this (not a bare "/src/" substring) — the
+// repo itself lives under .../go/src/..., so a substring check would collide.
+const SRC_DIR = path.resolve(process.cwd(), 'src') + path.sep
+// Only our own bundle chunks under /assets/*.js carry app source maps.
+const APP_CHUNK = /\/assets\/([^/?]+\.js)(\?|$)/
+
+/**
+ * Begin native V8 JS coverage collection on a Playwright page.
+ *
+ * @param {object} page - Playwright page; its `coverage.startJSCoverage` is called.
+ * @returns {Promise<void>} resolves once `startJSCoverage` has resolved.
+ */
+export async function startV8(page) {
+  // resetOnNavigation:false so hard navigations (goto) within a test accumulate.
+  await page.coverage.startJSCoverage({ resetOnNavigation: false })
+}
+
+/**
+ * Create a coverage accumulator; one is created per worker by the
+ * worker-scoped fixture.
+ *
+ * @returns {{ add: (entries: Array<object>) => void, flush: () => Promise<void> }}
+ *   `add` buffers one test's app-chunk V8 entries; `flush` merges everything
+ *   buffered, converts it to istanbul data and writes one `v8-<uuid>.json`
+ *   under `.nyc_output` (nothing is written when no app coverage was found).
+ */
+export function createAccumulator() {
+  const processCovs = []
+
+  return {
+    // Called on each test teardown with that test's V8 coverage entries.
+    add(entries) {
+      const result = entries
+        .filter((e) => APP_CHUNK.test(e.url))
+        // Keep only structural fields (drop the ~1MB `source` per entry — it's
+        // re-read from disk at convert time — to bound per-worker memory).
+        .map((e) => ({ scriptId: e.scriptId || e.url, url: e.url, functions: e.functions }))
+      if (result.length) processCovs.push({ result })
+    },
+
+    // Called once at worker teardown: merge all tests' coverage, convert, write.
+    async flush() {
+      if (processCovs.length === 0) return
+      const merged = mergeProcessCovs(processCovs)
+      const map = libCoverage.createCoverageMap({})
+
+      for (const script of merged.result) {
+        const m = APP_CHUNK.exec(script.url)
+        if (!m) continue
+        const diskPath = path.join(DIST_ASSETS, m[1])
+        if (!existsSync(diskPath)) continue
+
+        // v8-to-istanbul auto-loads source + sibling .map from disk; the served
+        // bytes match dist, so the V8 ranges line up.
+        const converter = v8toIstanbul(diskPath, 0)
+        try {
+          await converter.load()
+          converter.applyCoverage(script.functions)
+          const data = converter.toIstanbul()
+          for (const [key, fileCov] of Object.entries(data)) {
+            // v8-to-istanbul keys are already absolute; keep only app sources.
+            if (!key.startsWith(SRC_DIR) || key.includes(`${path.sep}node_modules${path.sep}`)) continue
+            map.merge({ [key]: fileCov })
+          }
+        } catch (err) {
+          // Best-effort: one bad chunk must not discard the rest, but say so —
+          // a silently skipped chunk lowers coverage with no hint as to why.
+          console.warn(`[v8-coverage] skipped ${m[1]}: ${err instanceof Error ? err.message : String(err)}`)
+        } finally {
+          converter.destroy()
+        }
+      }
+
+      const json = map.toJSON()
+      if (Object.keys(json).length === 0) return
+      mkdirSync(COVERAGE_DIR, { recursive: true })
+      writeFileSync(path.join(COVERAGE_DIR, `v8-${randomUUID()}.json`), JSON.stringify(json))
+    },
+  }
+}
--- a/core/http/react-ui/package.json
+++ b/core/http/react-ui/package.json
@@ -12,6 +12,7 @@
    "test:e2e": "playwright test",
    "test:e2e:ui": "playwright test --ui",
    "build:coverage": "COVERAGE=true vite build",
+    "build:coverage-v8": "COVERAGE_V8=true vite build",
    "coverage:report": "nyc report"
  },
  "dependencies": {
@@ -42,6 +43,7 @@
    "yaml": "^2.8.3"
  },
  "devDependencies": {
+    "@bcoe/v8-coverage": "^1.0.2",
    "@eslint/js": "^9.27.0",
    "@playwright/test": "1.58.2",
    "@vitejs/plugin-react": "^6.0.2",
@@ -51,6 +53,7 @@
    "globals": "^16.1.0",
    "i18next-parser": "^9.4.0",
    "nyc": "^18.0.0",
+    "v8-to-istanbul": "^9.3.0",
    "vite": "^8.0.14",
    "vite-plugin-istanbul": "^9.0.0"
  }
--- a/core/http/react-ui/src/App.jsx
+++ b/core/http/react-ui/src/App.jsx
@@ -1,4 +1,4 @@
-import { useState, useEffect, useRef } from 'react'
+import { useState, useEffect, useRef, Suspense } from 'react'
 import { Outlet, useLocation, useNavigate } from 'react-router-dom'
 import { useTranslation } from 'react-i18next'
 import Sidebar from './components/Sidebar'
@@ -122,7 +122,14 @@ export default function App() {
        </header>
        <div className="main-content-inner">
          <div className="page-transition" key={location.pathname}>
-            <Outlet context={{ addToast }} />
+            {/* Per-route Suspense catches React.lazy chunk loads (router.jsx)
+                here, inside the App layout. Without it, suspension would bubble
+                up to main.jsx's outer boundary and unmount the sidebar/header
+                on every navigation. fallback={null} keeps the shell stable; the
+                page-content area briefly blanks while the chunk arrives. */}
+            <Suspense fallback={null}>
+              <Outlet context={{ addToast }} />
+            </Suspense>
          </div>
        </div>
        {!isChatRoute && (
--- a/core/http/react-ui/src/components/CanvasPanel.jsx
+++ b/core/http/react-ui/src/components/CanvasPanel.jsx
@@ -4,7 +4,7 @@ import { getArtifactIcon } from '../utils/artifacts'
 import { safeHref } from '../utils/url'
 import { copyToClipboard } from '../utils/clipboard'
 import DOMPurify from 'dompurify'
-import hljs from 'highlight.js'
+import hljs from '../utils/hljs'

 export default function CanvasPanel({ artifacts, selectedId, onSelect, onClose }) {
  const [showPreview, setShowPreview] = useState(true)
--- a/core/http/react-ui/src/components/Sidebar.jsx
+++ b/core/http/react-ui/src/components/Sidebar.jsx
@@ -6,6 +6,7 @@ import LanguageSwitcher from './LanguageSwitcher'
 import { useAuth } from '../context/AuthContext'
 import { useBranding } from '../contexts/BrandingContext'
 import { apiUrl } from '../utils/basePath'
+import { preloadRoute } from '../router'

 const COLLAPSED_KEY = 'localai_sidebar_collapsed'
 const SECTIONS_KEY = 'localai_sidebar_sections'
@@ -85,6 +86,10 @@ const sections = [
 function NavItem({ item, onClose, collapsed }) {
  const { t } = useTranslation('nav')
  const label = t(item.labelKey)
+  // Warm the route's lazy chunk before the user clicks. Touch fires ~150ms
+  // before the synthetic click on mobile; mouseenter/focus cover desktop and
+  // keyboard. The underlying import() is memoised so multiple triggers are free.
+  const preload = () => preloadRoute(item.path)
  return (
    <NavLink
      to={item.path}
@@ -93,6 +98,9 @@ function NavItem({ item, onClose, collapsed }) {
        `nav-item ${isActive ? 'active' : ''}`
      }
      onClick={onClose}
+      onMouseEnter={preload}
+      onFocus={preload}
+      onTouchStart={preload}
      title={collapsed ? label : undefined}
    >
      <i className={`${item.icon} nav-icon`} />
@@ -296,6 +304,9 @@ export default function Sidebar({ isOpen, onClose }) {
              <button
                className="sidebar-user-link"
                onClick={() => { navigate('/app/account'); onClose?.() }}
+                onMouseEnter={() => preloadRoute('/app/account')}
+                onFocus={() => preloadRoute('/app/account')}
+                onTouchStart={() => preloadRoute('/app/account')}
                title={t('accountSettings')}
              >
                {user.avatarUrl ? (
--- a/core/http/react-ui/src/router.jsx
+++ b/core/http/react-ui/src/router.jsx
@@ -1,54 +1,81 @@
+import { lazy } from 'react'
 import { createBrowserRouter, Navigate, useParams } from 'react-router-dom'
 import { routerBasename } from './utils/basePath'
 import App from './App'
-import Home from './pages/Home'
-import Chat from './pages/Chat'
-import Models from './pages/Models'
-import Manage from './pages/Manage'
-import ImageGen from './pages/ImageGen'
-import VideoGen from './pages/VideoGen'
-import TTS from './pages/TTS'
-import Sound from './pages/Sound'
-import AudioTransform from './pages/AudioTransform'
-import Talk from './pages/Talk'
-import Backends from './pages/Backends'
-import Settings from './pages/Settings'
-import Traces from './pages/Traces'
-import P2P from './pages/P2P'
-import Agents from './pages/Agents'
-import AgentCreate from './pages/AgentCreate'
-import AgentChat from './pages/AgentChat'
-import AgentStatus from './pages/AgentStatus'
-import Collections from './pages/Collections'
-import CollectionDetails from './pages/CollectionDetails'
-import Skills from './pages/Skills'
-import SkillEdit from './pages/SkillEdit'
-import AgentJobs from './pages/AgentJobs'
-import AgentTaskDetails from './pages/AgentTaskDetails'
-import AgentJobDetails from './pages/AgentJobDetails'
-import ModelEditor from './pages/ModelEditor'
-// PipelineEditor removed — the Model Editor with templates handles all model types
-import ImportModel from './pages/ImportModel'
-import BackendLogs from './pages/BackendLogs'
-import Explorer from './pages/Explorer'
-import Login from './pages/Login'
-import FineTune from './pages/FineTune'
-import Quantize from './pages/Quantize'
-import Studio from './pages/Studio'
-import FaceRecognition from './pages/FaceRecognition'
-import VoiceRecognition from './pages/VoiceRecognition'
-import Nodes from './pages/Nodes'
-import NodeBackendLogs from './pages/NodeBackendLogs'
-import NotFound from './pages/NotFound'
-import Usage from './pages/Usage'
-import Users from './pages/Users'
-import Middleware from './pages/Middleware'
-import Account from './pages/Account'
 import RequireAdmin from './components/RequireAdmin'
 import RequireAuth from './components/RequireAuth'
 import RequireAuthEnabled from './components/RequireAuthEnabled'
 import RequireFeature from './components/RequireFeature'

+// Pages are code-split: each becomes its own chunk loaded on demand, so a route
+// no longer drags every other page (and its heavy deps — CodeMirror, the MCP
+// SDK, yaml, marked) into the initial bundle. The <Suspense> boundary in
+// App.jsx (around <Outlet/>) shows nothing while a chunk loads, keeping the
+// sidebar/header mounted.
+//
+// `page(key, loader)` registers the dynamic import under a route-segment key
+// (the first segment after /app/) so a NavLink can warm the chunk on hover via
+// `preloadRoute('/app/chat')`. Dynamic import() is memoised by the module
+// loader, so a preloaded chunk is reused — not re-fetched — when the user
+// actually navigates. Pages with `key: null` aren't sidebar-reachable; they
+// still code-split, they just won't be preloaded from the nav.
+const preloaders = {}
+function page(key, loader) {
+  if (key !== null) preloaders[key] = loader
+  return lazy(loader)
+}
+
+export function preloadRoute(path) {
+  if (!path) return
+  const m = path.match(/^\/app(?:\/([^/?#]*))?/)
+  if (!m) return
+  preloaders[m[1] ?? '']?.().catch(() => { /* network blip — real click will retry */ })
+}
+
+// Lazy page components. The first argument to `page` is the preload key:
+// preloadRoute() looks loaders up by the first path segment after /app/
+// ('' for /app itself). A `null` key means the loader is not registered for
+// preloading; the page is still wrapped in React.lazy.
+const Home = page('', () => import('./pages/Home'))
+const Chat = page('chat', () => import('./pages/Chat'))
+const Models = page('models', () => import('./pages/Models'))
+const Manage = page('manage', () => import('./pages/Manage'))
+const ImageGen = page('image', () => import('./pages/ImageGen'))
+const VideoGen = page('video', () => import('./pages/VideoGen'))
+const TTS = page('tts', () => import('./pages/TTS'))
+const Sound = page('sound', () => import('./pages/Sound'))
+const AudioTransform = page('transform', () => import('./pages/AudioTransform'))
+const Talk = page('talk', () => import('./pages/Talk'))
+const Backends = page('backends', () => import('./pages/Backends'))
+const Settings = page('settings', () => import('./pages/Settings'))
+const Traces = page('traces', () => import('./pages/Traces'))
+const P2P = page('p2p', () => import('./pages/P2P'))
+const Agents = page('agents', () => import('./pages/Agents'))
+const AgentCreate = page(null, () => import('./pages/AgentCreate'))
+const AgentChat = page(null, () => import('./pages/AgentChat'))
+const AgentStatus = page(null, () => import('./pages/AgentStatus'))
+const Collections = page('collections', () => import('./pages/Collections'))
+const CollectionDetails = page(null, () => import('./pages/CollectionDetails'))
+const Skills = page('skills', () => import('./pages/Skills'))
+const SkillEdit = page(null, () => import('./pages/SkillEdit'))
+const AgentJobs = page('agent-jobs', () => import('./pages/AgentJobs'))
+const AgentTaskDetails = page(null, () => import('./pages/AgentTaskDetails'))
+const AgentJobDetails = page(null, () => import('./pages/AgentJobDetails'))
+const ModelEditor = page(null, () => import('./pages/ModelEditor'))
+// PipelineEditor removed — the Model Editor with templates handles all model types
+const ImportModel = page(null, () => import('./pages/ImportModel'))
+const BackendLogs = page(null, () => import('./pages/BackendLogs'))
+const Explorer = page(null, () => import('./pages/Explorer'))
+const Login = page(null, () => import('./pages/Login'))
+const FineTune = page('fine-tune', () => import('./pages/FineTune'))
+const Quantize = page('quantize', () => import('./pages/Quantize'))
+const Studio = page('studio', () => import('./pages/Studio'))
+const FaceRecognition = page('face', () => import('./pages/FaceRecognition'))
+const VoiceRecognition = page('voice', () => import('./pages/VoiceRecognition'))
+const Nodes = page('nodes', () => import('./pages/Nodes'))
+const NodeBackendLogs = page(null, () => import('./pages/NodeBackendLogs'))
+const NotFound = page(null, () => import('./pages/NotFound'))
+const Usage = page('usage', () => import('./pages/Usage'))
+const Users = page('users', () => import('./pages/Users'))
+const Middleware = page('middleware', () => import('./pages/Middleware'))
+const Account = page('account', () => import('./pages/Account'))
+
 function BrowseRedirect() {
  const { '*': splat } = useParams()
  return <Navigate to={`/app/${splat || ''}`} replace />
--- a/core/http/react-ui/src/utils/artifacts.js
+++ b/core/http/react-ui/src/utils/artifacts.js
@@ -1,6 +1,6 @@
 import { Marked } from 'marked'
 import DOMPurify from 'dompurify'
-import hljs from 'highlight.js'
+import hljs from './hljs'
 import { apiUrl } from './basePath'

 const FENCE_REGEX = /```(\w*)\n([\s\S]*?)```/g
@@ -119,12 +119,17 @@ export function getArtifactIcon(type, language) {
 const artifactMarked = new Marked({
  renderer: {
    code({ text, lang }) {
-      // Will be overridden per-call
+      // Match markdown.js's fallback: when the language is unknown (not in
+      // the curated hljs set, see utils/hljs.js), use highlightAuto so the
+      // block still picks up theme colors — otherwise the same fenced block
+      // would render differently in chat (auto-highlighted) vs artifact card
+      // (plain text).
      if (lang && hljs.getLanguage(lang)) {
        const highlighted = hljs.highlight(text, { language: lang }).value
        return `<pre><code class="hljs language-${lang}">${highlighted}</code></pre>`
      }
-      return `<pre><code>${text.replace(/</g, '&lt;').replace(/>/g, '&gt;')}</code></pre>`
+      const highlighted = hljs.highlightAuto(text).value
+      return `<pre><code class="hljs">${highlighted}</code></pre>`
    },
  },
  breaks: true,
--- a/core/http/react-ui/src/utils/hljs.js
+++ b/core/http/react-ui/src/utils/hljs.js
@@ -0,0 +1,53 @@
+// Curated highlight.js build.
+//
+// `import hljs from 'highlight.js'` pulls in the full bundle — ~190 language
+// grammars, ~893 KB raw / ~294 KB gzip, the single biggest item in the app
+// bundle (measured). We render code blocks from chat/markdown/canvas only, and
+// only ever for a handful of common languages, so we import the lightweight
+// core and register just the grammars below. `highlightAuto` still works — it
+// auto-detects among the registered set, which covers what an LLM realistically
+// emits. Import hljs from THIS module, never directly from 'highlight.js'.
+import hljs from 'highlight.js/lib/core'
+
+import bash from 'highlight.js/lib/languages/bash'
+import c from 'highlight.js/lib/languages/c'
+import cpp from 'highlight.js/lib/languages/cpp'
+import csharp from 'highlight.js/lib/languages/csharp'
+import css from 'highlight.js/lib/languages/css'
+import diff from 'highlight.js/lib/languages/diff'
+import dockerfile from 'highlight.js/lib/languages/dockerfile'
+import go from 'highlight.js/lib/languages/go'
+import ini from 'highlight.js/lib/languages/ini'
+import java from 'highlight.js/lib/languages/java'
+import javascript from 'highlight.js/lib/languages/javascript'
+import json from 'highlight.js/lib/languages/json'
+import kotlin from 'highlight.js/lib/languages/kotlin'
+import lua from 'highlight.js/lib/languages/lua'
+import makefile from 'highlight.js/lib/languages/makefile'
+import markdown from 'highlight.js/lib/languages/markdown'
+import php from 'highlight.js/lib/languages/php'
+import plaintext from 'highlight.js/lib/languages/plaintext'
+import powershell from 'highlight.js/lib/languages/powershell'
+import python from 'highlight.js/lib/languages/python'
+import ruby from 'highlight.js/lib/languages/ruby'
+import rust from 'highlight.js/lib/languages/rust'
+import scss from 'highlight.js/lib/languages/scss'
+import shell from 'highlight.js/lib/languages/shell'
+import sql from 'highlight.js/lib/languages/sql'
+import swift from 'highlight.js/lib/languages/swift'
+import typescript from 'highlight.js/lib/languages/typescript'
+import xml from 'highlight.js/lib/languages/xml'
+import yaml from 'highlight.js/lib/languages/yaml'
+
+// Each grammar registers its own aliases (e.g. js→javascript, ts→typescript,
+// yml→yaml, html→xml, sh→bash, py→python), so hljs.getLanguage('js') resolves.
+const languages = {
+  bash, c, cpp, csharp, css, diff, dockerfile, go, ini, java, javascript,
+  json, kotlin, lua, makefile, markdown, php, plaintext, powershell, python,
+  ruby, rust, scss, shell, sql, swift, typescript, xml, yaml,
+}
+for (const [name, lang] of Object.entries(languages)) {
+  hljs.registerLanguage(name, lang)
+}
+
+export default hljs
--- a/core/http/react-ui/src/utils/markdown.js
+++ b/core/http/react-ui/src/utils/markdown.js
@@ -1,6 +1,6 @@
 import { marked } from 'marked'
 import DOMPurify from 'dompurify'
-import hljs from 'highlight.js'
+import hljs from './hljs'

 marked.setOptions({
  highlight(code, lang) {
--- a/core/http/react-ui/vite.config.js
+++ b/core/http/react-ui/vite.config.js
@@ -9,6 +9,11 @@ const backendUrl = process.env.LOCALAI_URL || 'http://localhost:8080'
 // fixture (e2e/coverage-fixtures.js). Off by default so normal/dev/prod builds
 // carry no instrumentation overhead.
 const coverage = process.env.COVERAGE === 'true'
+// COVERAGE_V8=true produces a NON-instrumented build with source maps, so the
+// Playwright coverage fixture can collect Chromium V8 coverage (near-zero
+// runtime overhead, unlike istanbul's build-time counters) and map it back to
+// source via v8-to-istanbul. Mutually exclusive with COVERAGE.
+const coverageV8 = process.env.COVERAGE_V8 === 'true'

 export default defineConfig({
  plugins: [
@@ -50,5 +55,20 @@ export default defineConfig({
  build: {
    outDir: 'dist',
    assetsDir: 'assets',
+    // Source maps are needed only to map V8 coverage back to original sources.
+    sourcemap: coverageV8,
+    rollupOptions: {
+      output: {
+        // The coverage build inlines all dynamic imports into a single chunk.
+        // The app is route-code-split (router.jsx uses React.lazy), so a normal
+        // build emits ~50 lazy chunks. V8 coverage only sees chunks a test
+        // actually loaded, so untested pages would silently drop out of the
+        // denominator and inflate the percentage. Bundling everything into one
+        // chunk for the coverage build keeps the denominator complete and the
+        // measurement invariant to how production is split. Production builds
+        // (COVERAGE_V8 unset) keep code-splitting for fast first paint.
+        inlineDynamicImports: coverageV8,
+      },
+    },
  },
 })
--- a/core/schema/message_test.go
+++ b/core/schema/message_test.go
@@ -332,5 +332,41 @@ var _ = Describe("LLM tests", func() {
 			// Should only extract text parts
 			Expect(protoMessages[0].Content).To(Equal("Hello"))
 		})
+
+		// Regression for mudler/LocalAI#10039: ToProto is the path taken by
+		// UseTokenizerTemplate backends (e.g. imported GGUFs, where the backend
+		// applies the GGUF's jinja template to the raw messages). It reads
+		// Content, not StringContent — so a message that only populated
+		// StringContent (the shape /v1/responses produced before the fix)
+		// reached the backend with empty content. These two cases pin that
+		// contract: Content is authoritative, and producers must set it.
+		It("emits empty content when only StringContent is set (Content nil)", func() {
+			// Content is deliberately left unset; only StringContent carries text.
+			input := Messages{{Role: "user", StringContent: "Hello"}}
+
+			converted := input.ToProto()
+
+			Expect(converted).To(HaveLen(1))
+			Expect(converted[0].Content).To(BeEmpty())
+		})
+
+		It("carries Content through to proto regardless of StringContent", func() {
+			// Both fields are populated with the same text.
+			input := Messages{{Role: "user", Content: "Hello", StringContent: "Hello"}}
+
+			converted := input.ToProto()
+
+			Expect(converted).To(HaveLen(1))
+			Expect(converted[0].Content).To(Equal("Hello"))
+		})
 	})
 })
--- a/core/templates/evaluator.go
+++ b/core/templates/evaluator.go
@@ -111,7 +111,11 @@ func (e *Evaluator) TemplateMessages(input schema.OpenAIRequest, messages []sche
 			}
 		}
 		r := config.Roles[role]
-		contentExists := i.Content != nil && i.StringContent != ""
+		// Treat StringContent as the source of truth — every downstream fallback branch in this
+		// function reads StringContent, not Content. Gating on both with && silently drops
+		// messages that have StringContent set but Content nil (e.g. /v1/responses string-input
+		// before mudler/LocalAI#10039 fix).
+		contentExists := i.StringContent != ""

 		fcall := i.FunctionCall
 		if len(i.ToolCalls) > 0 {
--- a/core/templates/evaluator_test.go
+++ b/core/templates/evaluator_test.go
@@ -218,4 +218,41 @@ var _ = Describe("Templates", func() {
 			})
 		}
 	})
+	// Regression test for mudler/LocalAI#10039: when a model has no Go-side
+	// TemplateConfig.ChatMessage block (e.g. backends that rely on the GGUF's
+	// jinja template), TemplateMessages falls through to the role-prefix path.
+	// That path must still render messages whose StringContent is populated but
+	// Content (any) is nil — which is the shape /v1/responses produced before
+	// the fix to convertORInputToMessages.
+	Context("fallback path with StringContent-only message (no ChatMessage template)", func() {
+		var evaluator *Evaluator
+
+		BeforeEach(func() {
+			evaluator = NewEvaluator("")
+		})
+
+		It("renders the role prefix and content when only StringContent is set", func() {
+			// Content stays nil on purpose — reproduces /v1/responses string-input.
+			input := []schema.Message{{Role: "user", StringContent: "hello"}}
+			modelCfg := &config.ModelConfig{
+				Roles: map[string]string{"user": "USER: "},
+			}
+
+			rendered := evaluator.TemplateMessages(schema.OpenAIRequest{}, input, modelCfg, []functions.Function{}, false)
+
+			Expect(rendered).To(Equal("USER: hello"), rendered)
+		})
+
+		It("renders content even with no role mapping", func() {
+			input := []schema.Message{{Role: "user", StringContent: "hello"}}
+			modelCfg := &config.ModelConfig{}
+
+			rendered := evaluator.TemplateMessages(schema.OpenAIRequest{}, input, modelCfg, []functions.Function{}, false)
+
+			Expect(rendered).To(Equal("hello"), rendered)
+		})
+	})
 })
--- a/docs/content/advanced/model-configuration.md
+++ b/docs/content/advanced/model-configuration.md
@@ -412,7 +412,10 @@ These load-time options control how the backend parses `<think>` reasoning block
 | `prefill_assistant` | bool | `true` | When `false`, the trailing assistant message is not pre-filled by the chat template. |

 {{% notice note %}}
-This is the load-time reasoning configuration. The orthogonal per-request `enable_thinking` chat-template kwarg (set via the YAML `reasoning.disable` field) toggles thinking on/off per call without restarting the model.
+This is the load-time reasoning configuration. The orthogonal per-request `enable_thinking` chat-template kwarg toggles thinking on/off per call without restarting the model. It can be driven either by the YAML `reasoning.disable` field (model default) or per request via the OpenAI `reasoning_effort` field on `/v1/chat/completions`:
+
+- `reasoning_effort: "none"` disables thinking for that request (`enable_thinking=false`) - useful to run a single reasoning model like Qwen3 for low-latency tasks while still enabling reasoning on other requests.
+- `reasoning_effort: "minimal" | "low" | "medium" | "high"` enables thinking, unless the model config explicitly sets `reasoning.disable: true` (an operator's explicit disable wins and is never re-enabled by a request).
 {{% /notice %}}

 ### Multimodal Backend Options
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v4.3.1"
+  "version": "v4.3.4"
 }
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,37 +1,42 @@
 ---
+- name: "lfm2.5-8b-a1b"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/LiquidAI/LFM2.5-8B-A1B-GGUF
+  description: "Try LFM •\nDocs •\nLEAP •\nDiscord\n\n# LFM2.5-8B-A1B\n\nLFM2.5 is a new family of hybrid models designed for on-device deployment. It builds on the LFM2 architecture with extended pre-training and reinforcement learning.\n\n  - **On-device personal assistant**: Designed to power real-life applications, chaining tool calls, and following complex instructions on all devices.\n  - **Compressed performance**: Competitive with much larger dense and MoE models on instruction following and agentic tasks.\n  - **Unmatched throughput**: Fastest in its size class on both CPU and GPU inference, with day-one support for llama.cpp, MLX, vLLM, and SGLang.\n\nFind more information about LFM2.5-8B-A1B in our blog post.\n\n**AA-Omniscience Index (higher is better) rewards correct answers and penalizes hallucinations. Scores range from -100 to 100. See more results on Artificial Analysis.*\n\n## \U0001F5D2️ Model Details\n\nLFM2.5-8B-A1B is a general-purpose text-only model with the following features:\n\n...\n"
+  license: "other"
+  tags:
+    - llm
+    - gguf
+  icon: https://cdn-uploads.huggingface.co/production/uploads/61b8e2ba285851687028d395/qUZVGkns1bg3sZUShBbhv.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    options:
+      - use_jinja:true
+    parameters:
+      min_p: 0.15
+      model: llama-cpp/models/LFM2.5-8B-A1B-GGUF/LFM2.5-8B-A1B-Q4_K_M.gguf
+      repeat_penalty: 1.05
+      temperature: 0.1
+      top_k: 50
+      top_p: 0.1
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/LFM2.5-8B-A1B-GGUF/LFM2.5-8B-A1B-Q4_K_M.gguf
+      uri: https://huggingface.co/LiquidAI/LFM2.5-8B-A1B-GGUF/resolve/main/LFM2.5-8B-A1B-Q4_K_M.gguf
+      sha256: 4923ec14f06b968b74d663e5949867d2d9c3bf13a20b8be1a9f9af39989b2bb0
 - name: "qwopus3.5-9b-coder-mtp"
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  urls:
    - https://huggingface.co/Jackrong/Qwopus3.5-9B-Coder-MTP-GGUF
-  description: |
-    # 🌟 Qwopus3.5-9B-v3.5
-
-    ## 💡 Model Overview & v3.5 Design
-
-    Qwopus3.5-9B-v3.5 is a **data-scaled continuation** of the Qwopus3.5-9B-v3 model.
-
-    The training data in v3.5 is expanded to cover a broader range of domains, including mathematics, programming, puzzle-solving, multilingual dialogue, instruction-following, multi-turn interactions, and STEM-related tasks.
-
-    Qwopus3.5-9B-v3.5 is a reasoning-enhanced model based on **Qwen3.5-9B**, designed for:
-
-      - 🧩 Structured reasoning
-      - 🔧 Tool-augmented workflows
-      - 🔁 Multi-step agentic tasks
-      - ⚡ Token-efficient inference
-
-    Compared with Qwopus3.5-9B-v3, **3.5 version does not introduce a new architecture, RL stage, or template redesign**.
-
-    This version is trained with approximately **2× more SFT data**.
-
-    ## 🎯 Motivation & Generalization Insight
-
-    The motivation behind v3.5 comes from a simple observation:
-
-    > This work is motivated by the hypothesis that scaling high-quality SFT data may further enhance the generalization ability of large language models.
-
-    In earlier Qwopus3.5 experiments, structured reasoning was observed to improve both **accuracy and efficiency**:
-
-    ...
+  description: "# \U0001F31F Qwopus3.5-9B-v3.5\n\n## \U0001F4A1 Model Overview & v3.5 Design\n\nQwopus3.5-9B-v3.5 is a **data-scaled continuation** of the Qwopus3.5-9B-v3 model.\n\nThe training data in v3.5 is expanded to cover a broader range of domains, including mathematics, programming, puzzle-solving, multilingual dialogue, instruction-following, multi-turn interactions, and STEM-related tasks.\n\nQwopus3.5-9B-v3.5 is a reasoning-enhanced model based on **Qwen3.5-9B**, designed for:\n\n  - \U0001F9E9 Structured reasoning\n  - \U0001F527 Tool-augmented workflows\n  - \U0001F501 Multi-step agentic tasks\n  - ⚡ Token-efficient inference\n\nCompared with Qwopus3.5-9B-v3, **3.5 version does not introduce a new architecture, RL stage, or template redesign**.\n\nThis version is trained with approximately **2× more SFT data**.\n\n## \U0001F3AF Motivation & Generalization Insight\n\nThe motivation behind v3.5 comes from a simple observation:\n\n> This work is motivated by the hypothesis that scaling high-quality SFT data may further enhance the generalization ability of large language models.\n\nIn earlier Qwopus3.5 experiments, structured reasoning was observed to improve both **accuracy and efficiency**:\n\n...\n"
  license: "apache-2.0"
  tags:
    - llm
@@ -67,26 +72,7 @@
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  urls:
    - https://huggingface.co/Jackrong/Qwopus3.6-27B-v2-MTP-GGUF
-  description: |
-    🪐 Qwopus3.6-27B-v2-MTP
-    MTP Release
-
-    Multi-Token Prediction reasoning model fine-tuned from Qwen3.6-27B
-
-    🧬 Trace Inversion & Negentropy
-    🧠 27B Parameters
-    ⚡ Speculative Decoding
-    🛠️ Coding / DevOps / Math
-
-    💡 What is Qwopus3.6-27B-v2-MTP?
-    🪐 Qwopus3.6-27B-v2-MTP is a speed-oriented reasoning release built on top of Qwen3.6-27B. It keeps the Qwopus line's focus on reconstructed reasoning traces, coding discipline, DevOps procedures, and mathematical derivations, while adding Multi-Token Prediction for faster generation. The goal is simple: preserve the depth and structure of a 27B reasoning model while making real interactive use noticeably faster.
-
-    ⚡ MTP DecodingAuxiliary future-token prediction improves throughput on long reasoning, code, math, and strict-format prompts.
-    🧩 Structured ReasoningInherits the Qwopus training recipe built around reconstructed step-by-step reasoning trajectories.
-    🧪 GB10 TestedValidated on a 30-question local benchmark across Logic, Coding, DevOps, Math, and Edge tasks.
-    🚀 Practical SpeedDesigned for workflows where strong answers matter, but waiting several extra minutes per task does not.
-
-    ...
+  description: "\U0001FA90 Qwopus3.6-27B-v2-MTP\nMTP Release\n\nMulti-Token Prediction reasoning model fine-tuned from Qwen3.6-27B\n\n\U0001F9EC Trace Inversion & Negentropy\n\U0001F9E0 27B Parameters\n⚡ Speculative Decoding\n\U0001F6E0️ Coding / DevOps / Math\n\n\U0001F4A1 What is Qwopus3.6-27B-v2-MTP?\n\U0001FA90 Qwopus3.6-27B-v2-MTP is a speed-oriented reasoning release built on top of Qwen3.6-27B. It keeps the Qwopus line's focus on reconstructed reasoning traces, coding discipline, DevOps procedures, and mathematical derivations, while adding Multi-Token Prediction for faster generation. The goal is simple: preserve the depth and structure of a 27B reasoning model while making real interactive use noticeably faster.\n\n⚡ MTP DecodingAuxiliary future-token prediction improves throughput on long reasoning, code, math, and strict-format prompts.\n\U0001F9E9 Structured ReasoningInherits the Qwopus training recipe built around reconstructed step-by-step reasoning trajectories.\n\U0001F9EA GB10 TestedValidated on a 30-question local benchmark across Logic, Coding, DevOps, Math, and Edge tasks.\n\U0001F680 Practical SpeedDesigned for workflows where strong answers matter, but waiting several extra minutes per task does not.\n\n...\n"
  license: "apache-2.0"
  tags:
    - llm
@@ -6208,6 +6194,7 @@
  files:
    - filename: rfdetr-nano-q8_0.gguf
      uri: huggingface://mudler/rfdetr-cpp-nano/rfdetr-nano-q8_0.gguf
+      sha256: 940084c60a780f1a19a51458ae3a601454b3b843675fa0713ff43ae5bccc0d9b
 - name: rfdetr-cpp-base
  url: github:mudler/LocalAI/gallery/virtual.yaml@master
  urls:
@@ -6233,6 +6220,7 @@
  files:
    - filename: rfdetr-base-f16.gguf
      uri: huggingface://mudler/rfdetr-cpp-base/rfdetr-base-f16.gguf
+      sha256: 8a68b21a90478564bcbb758557069a618d96e25e7c358207fd85ba45b90faf52
 - name: rfdetr-cpp-small
  url: github:mudler/LocalAI/gallery/virtual.yaml@master
  urls:
--- a/gallery/qwen3.yaml
+++ b/gallery/qwen3.yaml
@@ -17,6 +17,13 @@ config_file: |
    # "pure content" PEG parser that leaks reasoning tags into content.
    options:
        - use_jinja:true
+    # With use_tokenizer_template the backend (llama.cpp) owns tool-call
+    # grammar generation and parsing too. Disabling LocalAI's own grammar lets
+    # llama.cpp's native name-first tool pipeline run; otherwise the generated
+    # grammar overrides it and the tool-call JSON leaks into content (#10052).
+    function:
+        grammar:
+            disable: true
    template:
        use_tokenizer_template: true
 name: qwen3
--- a/pkg/functions/grammars/json_schema.go
+++ b/pkg/functions/grammars/json_schema.go
@@ -155,12 +155,22 @@ func (sc *JSONSchemaConverter) visit(schema map[string]any, name string, rootSch
 			propName   string
 			propSchema map[string]any
 		}) int {
-			aOrder := propOrder[a.propName]
-			bOrder := propOrder[b.propName]
-			if aOrder != 0 && bOrder != 0 {
+			// Use presence in the order map (not a non-zero sentinel) so that
+			// the first listed key — index 0 — is honored. Keys present in
+			// properties_order sort by their index and ahead of any key that
+			// isn't listed; unlisted keys keep a stable alphabetical order.
+			aOrder, aOK := propOrder[a.propName]
+			bOrder, bOK := propOrder[b.propName]
+			switch {
+			case aOK && bOK:
 				return cmp.Compare(aOrder, bOrder)
+			case aOK:
+				return -1
+			case bOK:
+				return 1
+			default:
+				return cmp.Compare(a.propName, b.propName)
 			}
-			return cmp.Compare(a.propName, b.propName)
 		})

 		var rule strings.Builder
--- a/pkg/functions/grammars/json_schema_test.go
+++ b/pkg/functions/grammars/json_schema_test.go
@@ -547,3 +547,61 @@ realvalue
 		})
 	})
 })
+
+var _ = Describe("JSON schema property ordering (issue #10052)", func() {
+	// A function-call shaped schema. The grammar must honor the configured
+	// properties_order. Before the fix, the sort guard `aOrder != 0 && bOrder != 0`
+	// treated the first listed key (index 0) as "unset" and fell back to
+	// alphabetical order, so "arguments" was emitted before "name" even when
+	// properties_order put name first.
+	const schema = `{
+		"type": "object",
+		"properties": {
+			"name": {"type": "string"},
+			"arguments": {"type": "object", "properties": {"cmd": {"type": "string"}}}
+		}
+	}`
+
+	// keyIndex returns the position of an object-key literal (escaped as \"key\"
+	// in GBNF; derived rule names like root-name don't match). It fails the spec
+	// when the key is absent, so a -1 can never satisfy an ordering check vacuously.
+	keyIndex := func(grammar, key string) int {
+		idx := strings.Index(grammar, `\"`+key+`\"`)
+		ExpectWithOffset(1, idx).To(BeNumerically(">=", 0), "object key %q is missing from the grammar", key)
+		return idx
+	}
+
+	It("honors properties_order with name listed first (index 0)", func() {
+		grammar, err := NewJSONSchemaConverter("name,arguments").GrammarFromBytes([]byte(schema))
+		Expect(err).To(BeNil())
+		ni := keyIndex(grammar, "name")
+		ai := keyIndex(grammar, "arguments")
+		Expect(ni).To(BeNumerically("<", ai),
+			"properties_order lists name first, so the grammar must emit \"name\" before \"arguments\"")
+	})
+
+	It("keeps alphabetical order when properties_order is empty", func() {
+		grammar, err := NewJSONSchemaConverter("").GrammarFromBytes([]byte(schema))
+		Expect(err).To(BeNil())
+		// No explicit order: keys fall back to alphabetical, so "arguments"
+		// precedes "name". This is the documented default and must not change.
+		Expect(keyIndex(grammar, "arguments")).To(BeNumerically("<", keyIndex(grammar, "name")))
+	})
+
+	It("sorts keys present in properties_order ahead of unlisted keys", func() {
+		const schemaWithExtra = `{
+			"type": "object",
+			"properties": {
+				"name": {"type": "string"},
+				"arguments": {"type": "object", "properties": {"cmd": {"type": "string"}}},
+				"aaa_unlisted": {"type": "string"}
+			}
+		}`
+		// "aaa_unlisted" is alphabetically first but not in the order list, so
+		// it must still come after the listed name/arguments keys.
+		grammar, err := NewJSONSchemaConverter("name,arguments").GrammarFromBytes([]byte(schemaWithExtra))
+		Expect(err).To(BeNil())
+		Expect(keyIndex(grammar, "name")).To(BeNumerically("<", keyIndex(grammar, "arguments")))
+		Expect(keyIndex(grammar, "arguments")).To(BeNumerically("<", keyIndex(grammar, "aaa_unlisted")))
+	})
+})
--- a/pkg/functions/parse.go
+++ b/pkg/functions/parse.go
@@ -628,6 +628,36 @@ func buildContent(before string, parser *ChatMsgParser) string {
 // This provides better streaming and partial parsing support.
 // When format is nil or when format is set, tries "find scope/tool start, split, parse suffix"
 // first (llama.cpp PEG order) so that content before the tool block does not cause parse failure.
+
+// validToolNameRe matches a plausible function name: OpenAI tool names use
+// letters, digits, underscores and hyphens, and some providers add dots for
+// namespacing. Anything else (whitespace, braces, brackets, quotes, colons)
+// signals the XML auto-detector grabbed a JSON blob or prose, not a name.
+var validToolNameRe = regexp.MustCompile(`^[A-Za-z0-9_.\-]+$`)
+
+// plausibleToolName reports whether name, ignoring surrounding whitespace, looks like a real function name.
+func plausibleToolName(name string) bool {
+	return validToolNameRe.MatchString(strings.TrimSpace(name))
+}
+
+// filterPlausibleToolCalls drops auto-detected tool calls whose name is not a
+// plausible function name. This guards against a format (notably glm-4.5, whose
+// tool block is <tool_call>name...</tool_call>) mis-claiming a Hermes-style
+// <tool_call>JSON</tool_call> block and returning the JSON object, leading
+// prose or an array as the function name. Dropping the misparse lets
+// auto-detection fall through to the next format and ultimately to JSON
+// parsing, which handles Hermes. Replaces the leading-"{" check (PR #9940); see #9722.
+func filterPlausibleToolCalls(calls []FuncCallResults) []FuncCallResults {
+	out := calls[:0:0] // zero length and capacity: append allocates instead of overwriting calls
+	for _, c := range calls {
+		if plausibleToolName(c.Name) {
+			out = append(out, c)
+		}
+	}
+	return out
+}
+
+// ParseXMLIterative parses XML-style tool calls from s; a nil format auto-detects across the known presets.
 func ParseXMLIterative(s string, format *XMLToolCallFormat, isPartial bool) ([]FuncCallResults, error) {
 	// Try split-on-scope first so reasoning/content before tool block is skipped
 	if format != nil {
@@ -639,7 +669,12 @@ func ParseXMLIterative(s string, format *XMLToolCallFormat, isPartial bool) ([]F
 		for _, fmtPreset := range formats {
 			if fmtPreset.format != nil {
 				if pr, ok := tryParseXMLFromScopeStart(s, fmtPreset.format, isPartial); ok {
-					return pr.ToolCalls, nil
+					// Auto-detect: discard misparsed (non-name) results so a
+					// format that grabbed a JSON blob doesn't win; fall through
+					// to the next format.
+					if valid := filterPlausibleToolCalls(pr.ToolCalls); len(valid) > 0 {
+						return valid, nil
+					}
 				}
 			}
 		}
@@ -659,14 +694,19 @@ func ParseXMLIterative(s string, format *XMLToolCallFormat, isPartial bool) ([]F
 				if err != nil {
 					// Check if it's a partial exception (recoverable)
 					if _, ok := err.(*ChatMsgPartialException); ok {
-						// Partial parse, return what we have
-						return parser.ToolCalls(), nil
+						// Partial parse, return what we have — unless every
+						// result is a misparse, in which case try the next format.
+						if valid := filterPlausibleToolCalls(parser.ToolCalls()); len(valid) > 0 {
+							return valid, nil
+						}
 					}
 					// Try next format
 					continue
 				}
 				if success && len(parser.ToolCalls()) > 0 {
-					return parser.ToolCalls(), nil
+					if valid := filterPlausibleToolCalls(parser.ToolCalls()); len(valid) > 0 {
+						return valid, nil
+					}
 				}
 			}
 		}
--- a/pkg/functions/parse_glm_9722_test.go
+++ b/pkg/functions/parse_glm_9722_test.go
@@ -0,0 +1,56 @@
+package functions
+
+import (
+	"regexp"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Regression coverage for the glm-4.5 XML auto-detect false positive (relates
+// to #9722 / supersedes the brittle leading-"{" filter in #9940). If the XML
+// auto-detector mistakes a Hermes-style <tool_call>JSON</tool_call> block for
+// glm-4.5, the block body ends up as the function NAME. A real function name
+// is [A-Za-z0-9_.-]+; braces, brackets, whitespace, quotes or colons mean the
+// parse went wrong and the result must not be returned (so JSON parsing can
+// take over). Compared with a leading-"{" check this additionally rejects
+// leading prose, JSON arrays, and brace-less garbage.
+var _ = Describe("glm-4.5 auto-detect name validation (#9722/#9940)", func() {
+	// nameShape spells out the contract independently of the implementation:
+	// every auto-detected tool name has to look like a real function name.
+	nameShape := regexp.MustCompile(`^[A-Za-z0-9_.\-]+$`)
+
+	DescribeTable("auto-detect must not emit a misparsed tool name",
+		func(input string) {
+			calls, err := ParseXMLIterative(input, nil, false)
+			Expect(err).ShouldNot(HaveOccurred())
+			for _, call := range calls {
+				Expect(nameShape.MatchString(call.Name)).Should(BeTrue(),
+					"auto-detected XML tool name must look like a function name, got: %q", call.Name)
+			}
+		},
+		Entry("canonical Hermes JSON", "<tool_call>\n{\"name\": \"bash\", \"arguments\": {\"script\": \"ls\"}}\n</tool_call>"),
+		Entry("leading prose then JSON", "<tool_call>\nSure: {\"name\": \"bash\", \"arguments\": {\"script\": \"ls\"}}\n</tool_call>"),
+		Entry("JSON array (parallel calls)", "<tool_call>\n[{\"name\": \"bash\", \"arguments\": {}}]\n</tool_call>"),
+		Entry("brace-less garbage", "<tool_call>\nname: bash, arguments: {}\n</tool_call>"),
+	)
+
+	// Guard against over-filtering: a well-formed glm-4.5 call is still detected.
+	It("still parses a legitimate glm-4.5 tool call", func() {
+		payload := "<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>NYC</arg_value>\n</tool_call>"
+		calls, err := ParseXMLIterative(payload, nil, false)
+		Expect(err).ShouldNot(HaveOccurred())
+		Expect(calls).Should(HaveLen(1))
+		Expect(calls[0].Name).Should(Equal("get_weather"))
+	})
+
+	// Name filtering applies to auto-detection only; when the caller passes the
+	// glm-4.5 preset explicitly, the raw parse result is returned unfiltered.
+	It("does not filter when the glm-4.5 format is explicitly forced", func() {
+		payload := "<tool_call>\n{\"name\": \"bash\", \"arguments\": {}}\n</tool_call>"
+		calls, err := ParseXMLIterative(payload, GetXMLFormatPreset("glm-4.5"), false)
+		Expect(err).ShouldNot(HaveOccurred())
+		Expect(calls).ShouldNot(BeEmpty(),
+			"explicit format must be trusted verbatim, even if it yields a JSON-blob name")
+	})
+})
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -53,6 +53,13 @@ type ModelLoader struct {
 	modelRouter              ModelRouter // distributed mode: route to remote node
 	backendLogs              *BackendLogStore
 	backendLoggingEnabled    atomic.Bool
+	// stoppingProcs marks backend processes that LocalAI is stopping on
+	// purpose (model unload / graceful shutdown), keyed by the
+	// *process.Process pointer. The exit-watcher goroutine in startProcess
+	// consults it to decide whether an exit is an expected stop or a crash —
+	// the exit code can't, since a child killed by our own SIGTERM/SIGKILL
+	// reports -1, indistinguishable from a signal-induced crash.
+	stoppingProcs sync.Map
 }

 // NewModelLoader creates a new ModelLoader instance.
--- a/pkg/model/process.go
+++ b/pkg/model/process.go
@@ -75,6 +75,9 @@ func (ml *ModelLoader) deleteProcess(s string) error {
 		return nil
 	}

+	// Mark the stop as intentional so the exit-watcher logs it as an
+	// expected stop, not a crash (signal-terminated children report -1).
+	ml.stoppingProcs.Store(process, struct{}{})
 	err := process.Stop()
 	if err != nil {
 		xlog.Error("(deleteProcess) error while deleting process", "error", err, "model", s)
@@ -171,8 +174,16 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
 	xlog.Debug("GRPC Service state dir", "dir", grpcControlProcess.StateDir())

 	signals.RegisterGracefulTerminationHandler(func() {
-		err := grpcControlProcess.Stop()
-		if err != nil {
+		// StopAllGRPC (the deleteProcess path) is registered earlier and runs
+		// first for store-tracked backends, stopping this process and removing
+		// its pidfile. Calling Stop again then fails with "failed to read PID".
+		// Skip when it's already gone; this handler still covers processes that
+		// StopAllGRPC doesn't track (e.g. worker-supervised backends).
+		if !grpcControlProcess.IsAlive() {
+			return
+		}
+		ml.stoppingProcs.Store(grpcControlProcess, struct{}{})
+		if err := grpcControlProcess.Stop(); err != nil {
 			xlog.Error("error while shutting down grpc process", "error", err)
 		}
 	})
@@ -211,20 +222,27 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
 	// whether the child is alive.
 	go func() {
 		<-grpcControlProcess.Done()
+		// LoadAndDelete both reads the intentional-stop marker and frees the
+		// map entry so it doesn't accumulate across the process's lifetime.
+		_, intentional := ml.stoppingProcs.LoadAndDelete(grpcControlProcess)
 		fields := []any{
 			"id", id,
 			"address", serverAddress,
 			"process", filepath.Base(grpcProcess),
 		}
-		code, codeErr := grpcControlProcess.ExitCode()
-		if codeErr == nil {
+		// Report the raw exit code without interpreting it: a child killed by
+		// our own SIGTERM/SIGKILL surfaces as -1 (Go reports -1 for signal
+		// termination, not the shell's 128+signal convention), so the code
+		// alone can't tell an intended stop from a crash. The stoppingProcs
+		// marker is the reliable signal for that, so it picks the log level.
+		if code, codeErr := grpcControlProcess.ExitCode(); codeErr == nil {
 			fields = append(fields, "exitCode", code)
 		}
-		// 143 = 128 + SIGTERM, the signal sent during graceful stop / model unload.
-		// Treat that and a clean 0 as expected; everything else is a likely crash.
-		if codeErr == nil && (code == "0" || code == "143") {
-			xlog.Info("Backend process exited", fields...)
+		if intentional {
+			xlog.Info("Backend process stopped", fields...)
 		} else {
+			// A stop we didn't initiate — a SIGSEGV from a missing shared
+			// library, a Python ImportError, an OOM kill, an unexpected self-exit.
 			xlog.Warn("Backend process exited unexpectedly", fields...)
 		}
 	}()
--- a/pkg/utils/path_test.go
+++ b/pkg/utils/path_test.go
@@ -0,0 +1,157 @@
+package utils_test
+
+import (
+	"os"
+	"path/filepath"
+
+	. "github.com/mudler/LocalAI/pkg/utils"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("utils/path tests", func() {
+	Describe("VerifyPath", func() {
+		It("accepts a simple file directly inside the base path", func() {
+			Expect(VerifyPath("model.bin", "/srv/models")).ToNot(HaveOccurred())
+		})
+
+		It("accepts a nested subdirectory inside the base path", func() {
+			Expect(VerifyPath("subdir/model.bin", "/srv/models")).ToNot(HaveOccurred())
+		})
+
+		It("accepts traversal sequences that stay inside the base", func() {
+			// The ".." only cancels "b": "a/b/../c" reduces to "a/c", which
+			// never leaves the base, so verification is expected to pass.
+			Expect(VerifyPath("a/b/../c", "/srv/models")).ToNot(HaveOccurred())
+		})
+
+		It("rejects a single parent-traversal that escapes the base", func() {
+			Expect(VerifyPath("../etc/passwd", "/srv/models")).To(HaveOccurred())
+		})
+
+		It("rejects compound traversal that climbs above the base", func() {
+			Expect(VerifyPath("a/../../etc/passwd", "/srv/models")).To(HaveOccurred())
+		})
+
+		It("rejects a deeply-escaping path that lands on the filesystem root", func() {
+			Expect(VerifyPath("../../etc/passwd", "/srv/models")).To(HaveOccurred())
+		})
+
+		It("rejects the base path itself", func() {
+			// Pins the strict-descendant requirement: empty user input
+			// resolves to the base directory itself and is refused. For a
+			// download helper that expects a target file inside the base,
+			// refusing the bare directory is the safer default.
+			Expect(VerifyPath("", "/srv/models")).To(HaveOccurred())
+		})
+
+		It("treats an absolute-looking user input as relative to the base", func() {
+			// Joining onto the base keeps every segment of the input, giving
+			// "/srv/models/etc/passwd" — still inside the base. Callers that
+			// hand untrusted user paths straight to the verifier therefore
+			// cannot be steered to the real "/etc/passwd".
+			Expect(VerifyPath("/etc/passwd", "/srv/models")).ToNot(HaveOccurred())
+		})
+
+		It("is purely lexical and does not follow symlinks", func() {
+			// The check is lexical (filepath.Clean, not
+			// filepath.EvalSymlinks), so a symlink pointing outside the base
+			// goes unnoticed. Callers that need symlink-escape protection
+			// must EvalSymlinks before calling VerifyPath. This spec pins
+			// that contract so the trade-off stays explicit.
+			root := GinkgoT().TempDir()
+			trusted := filepath.Join(root, "base")
+			elsewhere := filepath.Join(root, "outside")
+			Expect(os.Mkdir(trusted, 0o755)).ToNot(HaveOccurred())
+			Expect(os.Mkdir(elsewhere, 0o755)).ToNot(HaveOccurred())
+			Expect(os.WriteFile(filepath.Join(elsewhere, "secret.txt"), []byte("x"), 0o600)).ToNot(HaveOccurred())
+			Expect(os.Symlink(elsewhere, filepath.Join(trusted, "escape"))).ToNot(HaveOccurred())
+
+			Expect(VerifyPath("escape/secret.txt", trusted)).ToNot(HaveOccurred())
+		})
+	})
+
+	Describe("InTrustedRoot", func() {
+		It("accepts a strict descendant of the trusted root", func() {
+			Expect(InTrustedRoot("/srv/models/file", "/srv/models")).ToNot(HaveOccurred())
+		})
+
+		It("accepts a deeply nested descendant", func() {
+			Expect(InTrustedRoot("/srv/models/a/b/c/file", "/srv/models")).ToNot(HaveOccurred())
+		})
+
+		It("rejects the trusted root itself", func() {
+			// Comparison starts from the parent of the input path, so only
+			// paths with at least one component below the root can match.
+			Expect(InTrustedRoot("/srv/models", "/srv/models")).To(HaveOccurred())
+		})
+
+		It("rejects a sibling directory that shares the parent", func() {
+			Expect(InTrustedRoot("/srv/other/file", "/srv/models")).To(HaveOccurred())
+		})
+
+		It("rejects an unrelated absolute path", func() {
+			Expect(InTrustedRoot("/etc/passwd", "/srv/models")).To(HaveOccurred())
+		})
+	})
+
+	Describe("SanitizeFileName", func() {
+		It("returns the original name when nothing is unsafe", func() {
+			Expect(SanitizeFileName("model.bin")).Should(Equal("model.bin"))
+		})
+
+		It("strips leading directory components", func() {
+			Expect(SanitizeFileName("subdir/model.bin")).Should(Equal("model.bin"))
+		})
+
+		It("strips absolute path prefixes", func() {
+			Expect(SanitizeFileName("/etc/passwd")).Should(Equal("passwd"))
+		})
+
+		It("collapses parent-traversal sequences and keeps only the leaf", func() {
+			Expect(SanitizeFileName("../etc/passwd")).Should(Equal("passwd"))
+		})
+
+		It("removes embedded .. sequences that Clean+Base alone do not catch", func() {
+			// Clean+Base leave "foo..bar" untouched, so this result depends
+			// on the implementation's explicit ReplaceAll of "..": the final
+			// safeguard for names that look harmless yet still carry a
+			// traversal marker.
+			Expect(SanitizeFileName("foo..bar")).Should(Equal("foobar"))
+		})
+
+		It("returns an empty string when the input is only a parent reference", func() {
+			Expect(SanitizeFileName("..")).Should(Equal(""))
+		})
+	})
+
+	Describe("GenerateUniqueFileName", func() {
+		It("returns the bare filename when no collision exists", func() {
+			dir := GinkgoT().TempDir()
+			Expect(GenerateUniqueFileName(dir, "model", ".bin")).Should(Equal("model.bin"))
+		})
+
+		It("suffixes with _2 when the bare filename already exists", func() {
+			dir := GinkgoT().TempDir()
+			Expect(os.WriteFile(filepath.Join(dir, "model.bin"), nil, 0o600)).ToNot(HaveOccurred())
+
+			Expect(GenerateUniqueFileName(dir, "model", ".bin")).Should(Equal("model_2.bin"))
+		})
+
+		It("advances the counter past every existing collision", func() {
+			dir := GinkgoT().TempDir()
+			for _, existing := range []string{"model.bin", "model_2.bin", "model_3.bin"} {
+				Expect(os.WriteFile(filepath.Join(dir, existing), nil, 0o600)).ToNot(HaveOccurred())
+			}
+
+			Expect(GenerateUniqueFileName(dir, "model", ".bin")).Should(Equal("model_4.bin"))
+		})
+
+		It("preserves an empty extension when generating the suffixed name", func() {
+			dir := GinkgoT().TempDir()
+			Expect(os.WriteFile(filepath.Join(dir, "README"), nil, 0o600)).ToNot(HaveOccurred())
+
+			Expect(GenerateUniqueFileName(dir, "README", "")).Should(Equal("README_2"))
+		})
+	})
+})
--- a/scripts/ui-coverage-check.sh
+++ b/scripts/ui-coverage-check.sh
@@ -4,13 +4,16 @@
 #
 # Compares the total line coverage in an nyc coverage-summary.json against a
 # committed baseline and fails (exit 1) if it dropped by more than
-# UI_COVERAGE_TOLERANCE percentage points (default 1.0). The React UI e2e suite
+# UI_COVERAGE_TOLERANCE percentage points (default 0.8). The React UI e2e suite
 # drives the real app, so a removed feature or deleted spec shows up as a
 # coverage drop here.
 #
 # UI e2e line coverage is NOT deterministic: async/debounced paths (e.g. the
-# VRAM estimate's 500ms debounce) mean identical specs vary ~0.5pp run-to-run.
-# The tolerance absorbs that jitter; keep it just above the observed wobble.
+# VRAM estimate's 500ms debounce) mean identical specs vary run-to-run. With the
+# V8 path's single-chunk coverage build (vite.config.js inlineDynamicImports)
+# the observed wobble is ~0.5pp, similar to the old istanbul path. The tolerance
+# absorbs that jitter — keep it just above the observed wobble so a real ~1pp
+# regression still trips the gate.
 # (The Go gate carries a smaller tolerance for the same reason — its e2e slice.)
 #
 # When coverage rises meaningfully, regenerate and commit the baseline with:
@@ -19,7 +22,7 @@ set -eu

 summary="${1:?usage: ui-coverage-check.sh SUMMARY_JSON BASELINE_FILE}"
 baseline_file="${2:?usage: ui-coverage-check.sh SUMMARY_JSON BASELINE_FILE}"
-tolerance="${UI_COVERAGE_TOLERANCE:-1.0}"
+tolerance="${UI_COVERAGE_TOLERANCE:-0.8}"

 if [ ! -f "$summary" ]; then
 	echo "ui-coverage-check: coverage summary not found: $summary" >&2
--- a/tests/e2e/e2e_suite_test.go
+++ b/tests/e2e/e2e_suite_test.go
@@ -10,7 +10,7 @@ import (
 	"time"

 	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/application"
+	localaiapp "github.com/mudler/LocalAI/core/application"
 	"github.com/mudler/LocalAI/core/config"
 	httpapi "github.com/mudler/LocalAI/core/http"
 	"github.com/mudler/LocalAI/pkg/system"
@@ -41,6 +41,7 @@ var (
 	cloudProxyPath    string
 	mcpServerURL      string
 	mcpServerShutdown func()
+	localAIApp        *localaiapp.Application

 	// Cloud-proxy fake upstreams. Live for the whole suite so the four
 	// cloud-proxy model YAMLs can point at their URLs at startup time.
@@ -390,7 +391,7 @@ var _ = BeforeSuite(func() {
 	// Create application instance (GeneratedContentDir so sound-generation/TTS can write files the handler sends)
 	generatedDir := filepath.Join(tmpDir, "generated")
 	Expect(os.MkdirAll(generatedDir, 0750)).To(Succeed())
-	application, err := application.New(
+	localAIApp, err = localaiapp.New(
 		config.WithContext(appCtx),
 		config.WithSystemState(systemState),
 		config.WithDebug(true),
@@ -399,14 +400,14 @@ var _ = BeforeSuite(func() {
 	Expect(err).ToNot(HaveOccurred())

 	// Register mock backend (always available for non-realtime tests).
-	application.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)
-	application.ModelLoader().SetExternalBackend("opus", mockBackendPath)
+	localAIApp.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)
+	localAIApp.ModelLoader().SetExternalBackend("opus", mockBackendPath)
 	if cloudProxyPath != "" {
-		application.ModelLoader().SetExternalBackend("cloud-proxy", cloudProxyPath)
+		localAIApp.ModelLoader().SetExternalBackend("cloud-proxy", cloudProxyPath)
 	}

 	// Create HTTP app
-	app, err = httpapi.API(application)
+	app, err = httpapi.API(localAIApp)
 	Expect(err).ToNot(HaveOccurred())

 	// Get free port
@@ -436,6 +437,14 @@ var _ = BeforeSuite(func() {
 })

 var _ = AfterSuite(func() {
+	// Synchronous shutdown — the context-cancel goroutine in application.New
+	// runs the same cleanup asynchronously, which races test-binary exit and
+	// orphans spawned mock-backend children to init.
+	if localAIApp != nil {
+		if err := localAIApp.Shutdown(); err != nil {
+			xlog.Error("error shutting down application", "error", err)
+		}
+	}
 	if appCancel != nil {
 		appCancel()
 	}
@@ -1 +1 @@
 .66
 .29