From f01a969f7bd40aac3c1b75281136700ec21cdea4 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 27 Jun 2026 00:29:29 +0200
Subject: [PATCH 1/8] docs: :arrow_up: update docs version mudler/LocalAI
 (#10531)

:arrow_up: Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 docs/data/version.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/data/version.json b/docs/data/version.json
index f8cc39cee..944cb9836 100644
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v4.5.0"
+  "version": "v4.5.2"
 }

From 2c96c2d08ee0441b235128f1b555047f04531ca2 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 27 Jun 2026 00:50:51 +0200
Subject: [PATCH 2/8] chore: :arrow_up: Update mudler/parakeet.cpp to
 `f469a57270a1cc4554acb15febf60e56619673b9` (#10530)

:arrow_up: Update mudler/parakeet.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 backend/go/parakeet-cpp/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/go/parakeet-cpp/Makefile b/backend/go/parakeet-cpp/Makefile
index 7fc46f8e2..bf7450531 100644
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -1,6 +1,6 @@
 # parakeet-cpp backend Makefile.
 #
-# Upstream pin lives below as PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
+# Upstream pin lives below as PARAKEET_VERSION?=f469a57270a1cc4554acb15febf60e56619673b9
 # (.github/bump_deps.sh) can find and update it - matches the
 # whisper.cpp / ds4 / vibevoice-cpp convention.
 #
@@ -15,7 +15,7 @@
 # That's what the L0 smoke test uses. The default target below does the
 # proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
 
-PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
+PARAKEET_VERSION?=f469a57270a1cc4554acb15febf60e56619673b9
 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
 
 GOCMD?=go

From f98b0f1c1efc1d4e38e967f69e047af79784591a Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 27 Jun 2026 01:36:33 +0200
Subject: [PATCH 3/8] fix(gpu-libs): bundle transitive deps of GPU runtime libs
 (#10537) (#10539)

fix(gpu-libs): bundle transitive deps of GPU runtime libs

The per-vendor packagers in package-gpu-libs.sh copy an explicit allowlist
of top-level GPU runtime libraries (libamdhip64, libhipblas, librocblas, the
CUDA/Intel equivalents, ...) but never resolve their transitive
dependencies. Backends run through the bundled lib/ld.so with
LD_LIBRARY_PATH=lib, so any transitive dep not in the allowlist is a fatal
"cannot open shared object file" at load time.

On recent ROCm (base image rocm 7.2.1) the runtime libs link against
librocprofiler-register.so.0, which is not in the allowlist, so the rocm
llama-cpp backend (and every other GPU backend sharing this script) failed
to load with:

  librocprofiler-register.so.0: cannot open shared object file

The Vulkan path already solved this class of problem with copy_elf_deps
(ldd-based transitive resolution), but that sweep was only wired into the
Vulkan ICD path. This adds a generic sweep_transitive_deps that runs the
same ldd resolution over everything the allowlist already bundled, and wires
it into the ROCm, CUDA and Intel packagers. ldd returns the full recursive
closure, so one pass suffices; core libc-family deps are skipped via
is_core_lib so we never shadow the loader's own libc/libstdc++.

Adds a self-contained regression test (gcc + ldd) that fabricates a primary
lib linking a transitive lib and asserts the sweep bundles the dependency.

Fixes #10537

Assisted-by: Claude:opus-4.8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 scripts/build/package-gpu-libs.sh      | 45 +++++++++++++++++++++
 scripts/build/package-gpu-libs_test.sh | 54 ++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100755 scripts/build/package-gpu-libs_test.sh

diff --git a/scripts/build/package-gpu-libs.sh b/scripts/build/package-gpu-libs.sh
index 40f410173..17c0d0ca8 100755
--- a/scripts/build/package-gpu-libs.sh
+++ b/scripts/build/package-gpu-libs.sh
@@ -141,6 +141,38 @@ copy_elf_deps() {
     done < <(ldd "$elf" 2>/dev/null | awk '/=>/ && $3 ~ /^\// {print $3}')
 }
 
+# Sweep the transitive shared-library dependencies of everything already
+# bundled in a lib dir. The per-vendor packagers below copy an explicit
+# allowlist of top-level runtime libs, but those libs pull in transitive deps
+# that aren't in the list (e.g. ROCm's librocprofiler-register.so.0, libnuma,
+# libdrm_amdgpu). Because backends run through the bundled lib/ld.so with
+# LD_LIBRARY_PATH=lib (see run.sh), an unbundled transitive dep is a hard load
+# failure (issue #10537: "librocprofiler-register.so.0: cannot open shared
+# object file"). ldd resolves the full recursive closure, so a single pass over
+# the already-bundled libs is enough; core libc-family deps are skipped via
+# copy_elf_deps/is_core_lib so we never shadow the loader's own libc/libstdc++.
+sweep_transitive_deps() {
+    local dir="${1:-$TARGET_LIB_DIR}"  # $1 = lib dir to sweep; falls back to TARGET_LIB_DIR
+    command -v ldd >/dev/null 2>&1 || return 0  # no ldd: skip the sweep, report success
+
+    # Snapshot the current set first: copy_elf_deps adds files as it runs, and
+    # ldd already returns the full recursive closure, so we only need to sweep
+    # the libs that were present before the sweep started.
+    # `local x=$(...)` masks shopt -p's nonzero exit (nullglob off) from set -e.
+    local old_nullglob=$(shopt -p nullglob)
+    shopt -s nullglob
+    local libs=("$dir"/*.so*)
+    eval "$old_nullglob"  # put nullglob back the way the caller had it
+
+    local lib
+    for lib in "${libs[@]}"; do
+        [ -e "$lib" ] || continue  # -e follows links, so dangling symlinks are dropped
+        # Skip symlinks: assumes the target lives in $dir too (so it is swept) - TODO confirm.
+        [ -L "$lib" ] && continue
+        copy_elf_deps "$lib"
+    done
+}
+
 # Package NVIDIA CUDA libraries
 package_cuda_libs() {
     echo "Packaging CUDA libraries for BUILD_TYPE=${BUILD_TYPE}..."
@@ -185,6 +217,10 @@ package_cuda_libs() {
     #     cp -arfL /usr/local/cuda/targets "$TARGET_LIB_DIR/../cuda/" 2>/dev/null || true
     # fi
 
+    # Pull in transitive deps the allowlist misses so the backend is
+    # self-contained (same class of failure as #10537).
+    sweep_transitive_deps "$TARGET_LIB_DIR"
+
     echo "CUDA libraries packaged successfully"
 }
 
@@ -261,6 +297,10 @@ package_rocm_libs() {
         fi
     done
 
+    # Pull in transitive deps the allowlist misses (librocprofiler-register.so.0,
+    # libnuma, libdrm_amdgpu, ...) so the backend is self-contained. See #10537.
+    sweep_transitive_deps "$TARGET_LIB_DIR"
+
     echo "ROCm libraries packaged successfully"
 }
 
@@ -303,6 +343,10 @@ package_intel_libs() {
         fi
     done
 
+    # Pull in transitive deps the allowlist misses so the backend is
+    # self-contained (same class of failure as #10537).
+    sweep_transitive_deps "$TARGET_LIB_DIR"
+
     echo "Intel oneAPI libraries packaged successfully"
 }
 
@@ -432,6 +476,7 @@ export -f copy_lib
 export -f copy_libs_glob
 export -f is_core_lib
 export -f copy_elf_deps
+export -f sweep_transitive_deps
 export -f package_cuda_libs
 export -f package_rocm_libs
 export -f package_intel_libs
diff --git a/scripts/build/package-gpu-libs_test.sh b/scripts/build/package-gpu-libs_test.sh
new file mode 100755
index 000000000..39f8331c0
--- /dev/null
+++ b/scripts/build/package-gpu-libs_test.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Regression test for scripts/build/package-gpu-libs.sh.
+#
+# Guards issue #10537: the per-vendor packagers copy an explicit allowlist of
+# top-level GPU runtime libs but used to miss their transitive dependencies
+# (e.g. ROCm's librocprofiler-register.so.0). Since backends run through the
+# bundled lib/ld.so with LD_LIBRARY_PATH=lib, an unbundled transitive dep is a
+# fatal "cannot open shared object file" at load time.
+#
+# This test fabricates a primary lib that links a transitive lib, simulates the
+# allowlist step (primary copied, transitive not), and asserts the transitive
+# sweep pulls the dependency in. Requires gcc + ldd (present in build images).
+set -euo pipefail
+
+CURDIR=$(dirname "$(realpath "$0")")
+SCRIPT="$CURDIR/package-gpu-libs.sh"
+
+if ! command -v gcc >/dev/null 2>&1 || ! command -v ldd >/dev/null 2>&1; then
+    echo "SKIP: gcc/ldd not available"
+    exit 0
+fi
+
+WORK=$(mktemp -d)
+trap 'rm -rf "$WORK"' EXIT
+
+# Transitive dependency (stand-in for librocprofiler-register.so.0).
+echo 'int transitive_fn(void){return 42;}' > "$WORK/transitive.c"
+gcc -shared -fPIC -o "$WORK/libfaketransitive.so.0" "$WORK/transitive.c"
+
+# Primary allowlisted lib (stand-in for libhipblas.so); -l: links the dep by exact file name.
+echo 'int transitive_fn(void); int primary_fn(void){return transitive_fn();}' > "$WORK/primary.c"
+gcc -shared -fPIC -o "$WORK/libfakeprimary.so.0" "$WORK/primary.c" \
+    -L"$WORK" -l:libfaketransitive.so.0 -Wl,-rpath,"$WORK"
+
+# Simulate the allowlist step: primary already bundled, transitive not.
+TARGET="$WORK/target"
+mkdir -p "$TARGET"
+cp "$WORK/libfakeprimary.so.0" "$TARGET/"
+
+# Make the transitive dep resolvable like /opt/rocm libs are in the build image.
+export LD_LIBRARY_PATH="$WORK:${LD_LIBRARY_PATH:-}"
+
+# shellcheck source=/dev/null
+source "$SCRIPT" "$TARGET"  # NOTE(review): assumes sourcing only defines functions and reads $1 as the lib dir - confirm
+sweep_transitive_deps "$TARGET"
+
+if [ -e "$TARGET/libfaketransitive.so.0" ]; then
+    echo "PASS: transitive dependency was bundled by sweep_transitive_deps"
+    exit 0
+fi
+
+echo "FAIL: transitive dependency was NOT bundled (regression of #10537)"
+ls -la "$TARGET"
+exit 1

From 64150ca7abf32fc0695eabbbabe7ea685d075462 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 27 Jun 2026 01:36:57 +0200
Subject: [PATCH 4/8] fix(distributed): broadcast admin model-config changes
 across replicas (#10540)

In distributed mode the admin model endpoints (/models/edit, /models/import,
/models/toggle-state and the PATCH config-json endpoint) wrote the YAML to the
shared models dir but reloaded only the local replica's in-memory
ModelConfigLoader. With multiple frontend replicas behind one service, a save
landed on whichever replica handled the request; peers kept serving their stale
in-memory view, so a load-balanced request was a coin-flip between old and new
config (a created alias visible on one replica and missing on the other, an
edited alias target diverging, etc.).

The NATS cache-invalidation channel (SubjectCacheInvalidateModels +
OnModelsChanged) already existed for the gallery install/delete path; these
admin endpoints simply never published on it. Wire them up via a new
GalleryService.BroadcastModelsChanged helper (no-op in standalone mode).

Also fix delete propagation: LoadModelConfigsFromPath is additive and never
drops an entry whose file is gone, so the subscriber hook (which only reloaded
from disk) could not propagate a removal. ApplyRemoteChange now honors the
event op - pruning the element on "delete" and reloading otherwise - and shuts
down any running instance of the affected model so the new config takes effect.
This closes the same latent gap on the gallery delete path.


Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/application/startup.go                   | 12 ++-
 core/http/endpoints/localai/config_meta.go    | 10 ++-
 .../endpoints/localai/config_meta_test.go     |  2 +-
 core/http/endpoints/localai/edit_model.go     | 14 +++-
 .../http/endpoints/localai/edit_model_test.go |  8 +-
 core/http/endpoints/localai/import_model.go   |  9 ++-
 core/http/endpoints/localai/toggle_model.go   | 11 ++-
 core/http/routes/localai.go                   |  6 +-
 core/http/routes/ui_api.go                    |  2 +-
 .../galleryop/distributed_sync_test.go        | 30 +++++++
 core/services/galleryop/service.go            | 18 +++++
 core/services/modeladmin/remote_sync.go       | 53 ++++++++++++
 core/services/modeladmin/remote_sync_test.go  | 80 +++++++++++++++++++
 13 files changed, 239 insertions(+), 16 deletions(-)
 create mode 100644 core/services/modeladmin/remote_sync.go
 create mode 100644 core/services/modeladmin/remote_sync_test.go

diff --git a/core/application/startup.go b/core/application/startup.go
index 1e5a7a73b..a71f8d0ea 100644
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -16,6 +16,7 @@ import (
 	"github.com/mudler/LocalAI/core/services/galleryop"
 	"github.com/mudler/LocalAI/core/services/jobs"
 	"github.com/mudler/LocalAI/core/services/messaging"
+	"github.com/mudler/LocalAI/core/services/modeladmin"
 	"github.com/mudler/LocalAI/core/services/monitoring"
 	"github.com/mudler/LocalAI/core/services/nodes"
 	"github.com/mudler/LocalAI/core/services/routing/admission"
@@ -330,9 +331,14 @@ func New(opts ...config.AppOption) (*Application, error) {
 			gs := application.galleryService
 			sys := options.SystemState
 			cfgLoaderOpts := options.ToConfigLoaderOptions()
-			gs.OnModelsChanged = func(_ messaging.CacheInvalidateEvent) {
-				if err := application.ModelConfigLoader().LoadModelConfigsFromPath(sys.Model.ModelsPath, cfgLoaderOpts...); err != nil {
-					xlog.Warn("Failed to reload model configs after peer invalidation", "error", err)
+			gs.OnModelsChanged = func(evt messaging.CacheInvalidateEvent) {
+				// ApplyRemoteChange honors the op: a "delete" prunes the element
+				// (a reload-from-path is additive and cannot drop it), anything
+				// else reloads from disk; a named element's running instance is
+				// shut down so the new config takes effect. The originating
+				// replica reloads inline and never depends on this path.
+				if err := modeladmin.ApplyRemoteChange(application.ModelConfigLoader(), application.modelLoader, sys.Model.ModelsPath, evt, cfgLoaderOpts...); err != nil {
+					xlog.Warn("Failed to apply peer model config change", "error", err)
 				}
 			}
 			if err := application.galleryService.SubscribeBroadcasts(); err != nil {
diff --git a/core/http/endpoints/localai/config_meta.go b/core/http/endpoints/localai/config_meta.go
index b45720b78..3db694512 100644
--- a/core/http/endpoints/localai/config_meta.go
+++ b/core/http/endpoints/localai/config_meta.go
@@ -155,7 +155,7 @@ func AutocompleteEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, a
 // @Param name path string true "Model name"
 // @Success 200 {object} map[string]any "success message"
 // @Router /api/models/config-json/{name} [patch]
-func PatchConfigEndpoint(cl *config.ModelConfigLoader, _ *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+func PatchConfigEndpoint(cl *config.ModelConfigLoader, _ *model.ModelLoader, gs *galleryop.GalleryService, appConfig *config.ApplicationConfig) echo.HandlerFunc {
 	svc := modeladmin.NewConfigService(cl, appConfig)
 	return func(c echo.Context) error {
 		modelName := c.Param("name")
@@ -173,6 +173,14 @@ func PatchConfigEndpoint(cl *config.ModelConfigLoader, _ *model.ModelLoader, app
 		if _, err := svc.PatchConfig(c.Request().Context(), modelName, patchMap); err != nil {
 			return c.JSON(httpStatusForModelAdminError(err), map[string]any{"error": err.Error()})
 		}
+
+		// Patch rewrites the config on disk and reloads only the local loader;
+		// tell peers to refresh so the change is consistent across replicas.
+		// No-op in standalone mode.
+		if gs != nil {
+			gs.BroadcastModelsChanged(modelName, "install")
+		}
+
 		return c.JSON(http.StatusOK, map[string]any{
 			"success": true,
 			"message": fmt.Sprintf("Model '%s' updated successfully", modelName),
diff --git a/core/http/endpoints/localai/config_meta_test.go b/core/http/endpoints/localai/config_meta_test.go
index f56c14b00..e60f7e08d 100644
--- a/core/http/endpoints/localai/config_meta_test.go
+++ b/core/http/endpoints/localai/config_meta_test.go
@@ -45,7 +45,7 @@ var _ = Describe("Config Metadata Endpoints", func() {
 		app = echo.New()
 		app.GET("/api/models/config-metadata", ConfigMetadataEndpoint())
 		app.GET("/api/models/config-metadata/autocomplete/:provider", AutocompleteEndpoint(configLoader, modelLoader, appConfig))
-		app.PATCH("/api/models/config-json/:name", PatchConfigEndpoint(configLoader, modelLoader, appConfig))
+		app.PATCH("/api/models/config-json/:name", PatchConfigEndpoint(configLoader, modelLoader, nil, appConfig))
 	})
 
 	AfterEach(func() {
diff --git a/core/http/endpoints/localai/edit_model.go b/core/http/endpoints/localai/edit_model.go
index 4cc0477fb..5dd573751 100644
--- a/core/http/endpoints/localai/edit_model.go
+++ b/core/http/endpoints/localai/edit_model.go
@@ -10,6 +10,7 @@ import (
 	"github.com/labstack/echo/v4"
 	"github.com/mudler/LocalAI/core/config"
 	httpUtils "github.com/mudler/LocalAI/core/http/middleware"
+	"github.com/mudler/LocalAI/core/services/galleryop"
 	"github.com/mudler/LocalAI/core/services/modeladmin"
 	"github.com/mudler/LocalAI/internal"
 	"github.com/mudler/LocalAI/pkg/model"
@@ -55,7 +56,7 @@ func GetEditModelPage(cl *config.ModelConfigLoader, appConfig *config.Applicatio
 }
 
 // EditModelEndpoint handles updating existing model configurations
-func EditModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+func EditModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, gs *galleryop.GalleryService, appConfig *config.ApplicationConfig) echo.HandlerFunc {
 	svc := modeladmin.NewConfigService(cl, appConfig)
 	return func(c echo.Context) error {
 		modelName := c.Param("name")
@@ -70,6 +71,17 @@ func EditModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appC
 		if err != nil {
 			return c.JSON(httpStatusForModelAdminError(err), ModelResponse{Success: false, Error: err.Error()})
 		}
+
+		// Tell peer replicas to refresh their in-memory config: this endpoint
+		// only reloaded the local loader. A rename is a delete of the old name
+		// plus an install of the new one. No-op in standalone mode.
+		if gs != nil {
+			if result.Renamed {
+				gs.BroadcastModelsChanged(result.OldName, "delete")
+			}
+			gs.BroadcastModelsChanged(result.NewName, "install")
+		}
+
 		msg := fmt.Sprintf("Model '%s' updated successfully. Model has been reloaded with new configuration.", result.NewName)
 		if result.Renamed {
 			msg = fmt.Sprintf("Model '%s' renamed to '%s' and updated successfully.", result.OldName, result.NewName)
diff --git a/core/http/endpoints/localai/edit_model_test.go b/core/http/endpoints/localai/edit_model_test.go
index 55328dc39..54ad2d5ec 100644
--- a/core/http/endpoints/localai/edit_model_test.go
+++ b/core/http/endpoints/localai/edit_model_test.go
@@ -56,7 +56,7 @@ var _ = Describe("Edit Model test", func() {
 			app := echo.New()
 			// Set up a simple renderer for the test
 			app.Renderer = &testRenderer{}
-			app.POST("/import-model", ImportModelEndpoint(modelConfigLoader, applicationConfig))
+			app.POST("/import-model", ImportModelEndpoint(modelConfigLoader, nil, applicationConfig))
 			app.GET("/edit-model/:name", GetEditModelPage(modelConfigLoader, applicationConfig))
 
 			requestBody := bytes.NewBufferString(`{"name": "foo", "backend": "foo", "model": "foo"}`)
@@ -106,7 +106,7 @@ var _ = Describe("Edit Model test", func() {
 			Expect(exists).To(BeTrue())
 
 			app := echo.New()
-			app.POST("/models/edit/:name", EditModelEndpoint(modelConfigLoader, modelLoader, applicationConfig))
+			app.POST("/models/edit/:name", EditModelEndpoint(modelConfigLoader, modelLoader, nil, applicationConfig))
 
 			newYAML := "name: newname\nbackend: llama\nmodel: foo\n"
 			req := httptest.NewRequest("POST", "/models/edit/oldname", bytes.NewBufferString(newYAML))
@@ -163,7 +163,7 @@ var _ = Describe("Edit Model test", func() {
 			Expect(modelConfigLoader.LoadModelConfigsFromPath(tempDir)).To(Succeed())
 
 			app := echo.New()
-			app.POST("/models/edit/:name", EditModelEndpoint(modelConfigLoader, modelLoader, applicationConfig))
+			app.POST("/models/edit/:name", EditModelEndpoint(modelConfigLoader, modelLoader, nil, applicationConfig))
 
 			req := httptest.NewRequest(
 				"POST",
@@ -204,7 +204,7 @@ var _ = Describe("Edit Model test", func() {
 			Expect(modelConfigLoader.LoadModelConfigsFromPath(tempDir)).To(Succeed())
 
 			app := echo.New()
-			app.POST("/models/edit/:name", EditModelEndpoint(modelConfigLoader, modelLoader, applicationConfig))
+			app.POST("/models/edit/:name", EditModelEndpoint(modelConfigLoader, modelLoader, nil, applicationConfig))
 
 			req := httptest.NewRequest(
 				"POST",
diff --git a/core/http/endpoints/localai/import_model.go b/core/http/endpoints/localai/import_model.go
index 54a80a9cc..21b7673da 100644
--- a/core/http/endpoints/localai/import_model.go
+++ b/core/http/endpoints/localai/import_model.go
@@ -125,7 +125,7 @@ func ImportModelURIEndpoint(cl *config.ModelConfigLoader, appConfig *config.Appl
 }
 
 // ImportModelEndpoint handles creating new model configurations
-func ImportModelEndpoint(cl *config.ModelConfigLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+func ImportModelEndpoint(cl *config.ModelConfigLoader, gs *galleryop.GalleryService, appConfig *config.ApplicationConfig) echo.HandlerFunc {
 	return func(c echo.Context) error {
 		// Get the raw body
 		body, err := io.ReadAll(c.Request().Body)
@@ -245,6 +245,13 @@ func ImportModelEndpoint(cl *config.ModelConfigLoader, appConfig *config.Applica
 			}
 			return c.JSON(http.StatusInternalServerError, response)
 		}
+		// Tell peer replicas to load the newly-created config from the shared
+		// models dir: this endpoint only reloaded the local loader. No-op in
+		// standalone mode.
+		if gs != nil {
+			gs.BroadcastModelsChanged(modelConfig.Name, "install")
+		}
+
 		// Return success response
 		response := ModelResponse{
 			Success:  true,
diff --git a/core/http/endpoints/localai/toggle_model.go b/core/http/endpoints/localai/toggle_model.go
index e4e71ca64..545fdc8af 100644
--- a/core/http/endpoints/localai/toggle_model.go
+++ b/core/http/endpoints/localai/toggle_model.go
@@ -7,6 +7,7 @@ import (
 
 	"github.com/labstack/echo/v4"
 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/galleryop"
 	"github.com/mudler/LocalAI/core/services/modeladmin"
 	"github.com/mudler/LocalAI/pkg/model"
 )
@@ -24,7 +25,7 @@ import (
 // @Failure      404  {object}  ModelResponse
 // @Failure      500  {object}  ModelResponse
 // @Router       /api/models/{name}/{action} [put]
-func ToggleStateModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+func ToggleStateModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, gs *galleryop.GalleryService, appConfig *config.ApplicationConfig) echo.HandlerFunc {
 	svc := modeladmin.NewConfigService(cl, appConfig)
 	return func(c echo.Context) error {
 		modelName := c.Param("name")
@@ -36,6 +37,14 @@ func ToggleStateModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoade
 		if err != nil {
 			return c.JSON(httpStatusForModelAdminError(err), ModelResponse{Success: false, Error: err.Error()})
 		}
+
+		// Enabling/disabling rewrites the config on disk and reloads only the
+		// local loader; tell peers to refresh so the model's availability is
+		// consistent across replicas. No-op in standalone mode.
+		if gs != nil {
+			gs.BroadcastModelsChanged(modelName, "install")
+		}
+
 		msg := fmt.Sprintf("Model '%s' has been %sd successfully.", modelName, action)
 		if action == modeladmin.ActionDisable {
 			msg += " The model will not be loaded on demand until re-enabled."
diff --git a/core/http/routes/localai.go b/core/http/routes/localai.go
index 212f379f0..763623a7f 100644
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -72,19 +72,19 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 		router.POST("/backends/upgrades/check", backendGalleryEndpointService.CheckUpgradesEndpoint(), adminMiddleware)
 		router.POST("/backends/upgrade/:name", backendGalleryEndpointService.UpgradeBackendEndpoint(), adminMiddleware)
 		// Custom model import endpoint
-		router.POST("/models/import", localai.ImportModelEndpoint(cl, appConfig), adminMiddleware)
+		router.POST("/models/import", localai.ImportModelEndpoint(cl, galleryService, appConfig), adminMiddleware)
 
 		// URI model import endpoint
 		router.POST("/models/import-uri", localai.ImportModelURIEndpoint(cl, appConfig, galleryService, opcache), adminMiddleware)
 
 		// Custom model edit endpoint
-		router.POST("/models/edit/:name", localai.EditModelEndpoint(cl, ml, appConfig), adminMiddleware)
+		router.POST("/models/edit/:name", localai.EditModelEndpoint(cl, ml, galleryService, appConfig), adminMiddleware)
 
 		// List model aliases endpoint
 		router.GET("/api/aliases", localai.ListAliasesEndpoint(cl), adminMiddleware)
 
 		// Toggle model enable/disable endpoint
-		router.PUT("/models/toggle-state/:name/:action", localai.ToggleStateModelEndpoint(cl, ml, appConfig), adminMiddleware)
+		router.PUT("/models/toggle-state/:name/:action", localai.ToggleStateModelEndpoint(cl, ml, galleryService, appConfig), adminMiddleware)
 
 		// Toggle model pinned status endpoint
 		router.PUT("/models/toggle-pinned/:name/:action", localai.TogglePinnedModelEndpoint(cl, appConfig, func() {
diff --git a/core/http/routes/ui_api.go b/core/http/routes/ui_api.go
index e26894273..d9c99c6b9 100644
--- a/core/http/routes/ui_api.go
+++ b/core/http/routes/ui_api.go
@@ -922,7 +922,7 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
 	app.GET("/api/models/config-metadata/autocomplete/:provider", localai.AutocompleteEndpoint(cl, ml, appConfig), adminMiddleware)
 
 	// PATCH config endpoint - partial update using nested JSON merge
-	app.PATCH("/api/models/config-json/:name", localai.PatchConfigEndpoint(cl, ml, appConfig), adminMiddleware)
+	app.PATCH("/api/models/config-json/:name", localai.PatchConfigEndpoint(cl, ml, galleryService, appConfig), adminMiddleware)
 
 	// VRAM estimation endpoint
 	app.POST("/api/models/vram-estimate", localai.VRAMEstimateEndpoint(cl, appConfig), adminMiddleware)
diff --git a/core/services/galleryop/distributed_sync_test.go b/core/services/galleryop/distributed_sync_test.go
index 7c1087de8..71a96c7ae 100644
--- a/core/services/galleryop/distributed_sync_test.go
+++ b/core/services/galleryop/distributed_sync_test.go
@@ -404,6 +404,36 @@ var _ = Describe("GalleryService cache invalidation broadcasts", func() {
 			Element: "x", Op: "install",
 		})).To(Succeed())
 	})
+
+	It("BroadcastModelsChanged delivers the element and op to a peer's OnModelsChanged", func() {
+		var (
+			mu   sync.Mutex
+			seen []messaging.CacheInvalidateEvent
+		)
+		svcB.OnModelsChanged = func(evt messaging.CacheInvalidateEvent) {
+			mu.Lock()
+			seen = append(seen, evt)
+			mu.Unlock()
+		}
+		Expect(svcA.SubscribeBroadcasts()).To(Succeed())
+		Expect(svcB.SubscribeBroadcasts()).To(Succeed())
+
+		// An admin edit on replica A must reach replica B over the same subject
+		// the gallery path uses, so B refreshes its in-memory config loader.
+		svcA.BroadcastModelsChanged("my-alias", "install")
+		// NOTE(review): asserts right after publishing - assumes the test bus delivers synchronously; else use Eventually.
+		mu.Lock()
+		defer mu.Unlock()
+		Expect(seen).To(ContainElement(messaging.CacheInvalidateEvent{
+			Element: "my-alias", Op: "install",
+		}))
+	})
+
+	It("BroadcastModelsChanged is a no-op when NATS is not wired (standalone)", func() {
+		standalone := galleryop.NewGalleryService(&config.ApplicationConfig{}, nil)
+		// No SetNATSClient: must not panic and must simply do nothing.
+		Expect(func() { standalone.BroadcastModelsChanged("x", "delete") }).ToNot(Panic())
+	})
 })
 
 var _ = Describe("GalleryService PostgreSQL hydration", func() {
diff --git a/core/services/galleryop/service.go b/core/services/galleryop/service.go
index d01d9cc19..abe399088 100644
--- a/core/services/galleryop/service.go
+++ b/core/services/galleryop/service.go
@@ -201,6 +201,24 @@ func (g *GalleryService) publishCacheInvalidate(subject string, evt messaging.Ca
 	}
 }
 
+// BroadcastModelsChanged notifies peer replicas that a model config was
+// created, edited, or removed out-of-band of the gallery install/delete
+// channel (e.g. the admin /models/edit, /models/import, /models/toggle-state
+// and PATCH /api/models/config-json endpoints, which write the YAML and reload
+// only the local in-memory loader). Peers receive it via OnModelsChanged and
+// refresh their own ModelConfigLoader so a request load-balanced to any replica
+// sees the same config. No-op in standalone mode (no NATS client).
+//
+// op is "install" for a create/edit (the element must be (re)loaded from
+// disk) or "delete" for a removal (the element must be pruned from memory,
+// which a reload-from-path cannot do because the loader is additive).
+func (g *GalleryService) BroadcastModelsChanged(element, op string) {
+	g.publishCacheInvalidate(messaging.SubjectCacheInvalidateModels, messaging.CacheInvalidateEvent{
+		Element: element,
+		Op:      op,
+	})
+}
+
 // mergeStatus is the broadcast-side merge: it updates the in-memory map from
 // a peer's GalleryProgressEvent without re-publishing to NATS or re-writing
 // to PostgreSQL. UpdateStatus is the local-write entry point and does both;
diff --git a/core/services/modeladmin/remote_sync.go b/core/services/modeladmin/remote_sync.go
new file mode 100644
index 000000000..5acf5bf9a
--- /dev/null
+++ b/core/services/modeladmin/remote_sync.go
@@ -0,0 +1,53 @@
+package modeladmin
+
+import (
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/messaging"
+	"github.com/mudler/LocalAI/pkg/model"
+
+	"github.com/mudler/xlog"
+)
+
+// opDelete is the CacheInvalidateEvent.Op value the gallery delete path and the
+// admin delete endpoint use; a delete must prune (a reload-from-path cannot).
+const opDelete = "delete"
+
+// ApplyRemoteChange refreshes this replica's in-memory model state from a peer
+// replica's model-config change broadcast (messaging.CacheInvalidateEvent on
+// SubjectCacheInvalidateModels). It is the subscriber-side counterpart to
+// GalleryService.BroadcastModelsChanged.
+//
+// The op matters because LoadModelConfigsFromPath is additive: it loads every
+// YAML on disk into the loader but never removes an entry whose file is gone.
+// So a delete cannot be propagated by a plain reload - the deleted element must
+// be explicitly pruned. Specifically:
+//
+//   - op == "delete" with a named element: prune that element from the loader.
+//   - otherwise: reload all configs from disk (picks up creates and edits).
+//
+// In both cases, when an element is named, any running instance on this replica
+// is shut down (best-effort) so the next request rebuilds it from the new
+// config instead of serving the stale one - mirroring what the originating
+// replica does on a local edit/delete.
+//
+// ml may be nil (the shutdown step is skipped). modelsPath and opts are passed
+// to LoadModelConfigsFromPath; if it fails, its error is returned immediately.
+func ApplyRemoteChange(cl *config.ModelConfigLoader, ml *model.ModelLoader, modelsPath string, evt messaging.CacheInvalidateEvent, opts ...config.ConfigLoaderOption) error {
+	if evt.Op == opDelete && evt.Element != "" {
+		cl.RemoveModelConfig(evt.Element)
+	} else if err := cl.LoadModelConfigsFromPath(modelsPath, opts...); err != nil {
+		return err
+	}
+
+	// Drop any running instance of the affected model so the next request
+	// rebuilds it from the refreshed config instead of serving the stale one.
+	// Best-effort: the model may not be loaded on this replica, so a shutdown
+	// error is only logged at debug level and never returned.
+	if ml != nil && evt.Element != "" {
+		if err := ml.ShutdownModel(evt.Element); err != nil {
+			xlog.Debug("ApplyRemoteChange: could not shut down model instance (likely not loaded)",
+				"model", evt.Element, "error", err)
+		}
+	}
+	return nil
+}
diff --git a/core/services/modeladmin/remote_sync_test.go b/core/services/modeladmin/remote_sync_test.go
new file mode 100644
index 000000000..df4907a02
--- /dev/null
+++ b/core/services/modeladmin/remote_sync_test.go
@@ -0,0 +1,80 @@
+package modeladmin
+
+import (
+	"os"
+	"path/filepath"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"gopkg.in/yaml.v3"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services/messaging"
+)
+
+var _ = Describe("ApplyRemoteChange", func() {
+	var (
+		modelsDir string
+		cl        *config.ModelConfigLoader
+	)
+
+	BeforeEach(func() {
+		modelsDir = GinkgoT().TempDir()
+		cl = config.NewModelConfigLoader(modelsDir)
+	})
+
+	// putConfig writes <name>.yaml into modelsDir, forcing the "name" key.
+	putConfig := func(name string, fields map[string]any) {
+		fields["name"] = name
+		raw, err := yaml.Marshal(fields)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(os.WriteFile(filepath.Join(modelsDir, name+".yaml"), raw, 0644)).To(Succeed())
+	}
+
+	// inMemory reports whether the loader currently holds a config by that name.
+	inMemory := func(name string) bool {
+		_, found := cl.GetModelConfig(name)
+		return found
+	}
+
+	It("loads a peer-created config from disk on an install event", func() {
+		// The peer already wrote the YAML to the shared models dir; this replica's loader is still empty.
+		putConfig("peer-alias", map[string]any{"alias": "qwen"})
+		Expect(inMemory("peer-alias")).To(BeFalse(), "precondition: not yet in memory")
+
+		evt := messaging.CacheInvalidateEvent{Element: "peer-alias", Op: "install"}
+		applyErr := ApplyRemoteChange(cl, nil, modelsDir, evt)
+		Expect(applyErr).NotTo(HaveOccurred())
+
+		Expect(inMemory("peer-alias")).To(BeTrue(), "install event must reload the new config from disk")
+	})
+
+	It("prunes a peer-deleted config that a reload-from-path cannot drop", func() {
+		// Load the model first, then remove its file from the shared dir.
+		// LoadModelConfigsFromPath is additive, so only an explicit prune can
+		// drop the in-memory entry - this is the cross-replica delete bug.
+		putConfig("doomed", map[string]any{"alias": "qwen"})
+		Expect(cl.LoadModelConfigsFromPath(modelsDir)).To(Succeed())
+		Expect(inMemory("doomed")).To(BeTrue(), "precondition: in memory")
+		Expect(os.Remove(filepath.Join(modelsDir, "doomed.yaml"))).To(Succeed())
+
+		evt := messaging.CacheInvalidateEvent{Element: "doomed", Op: "delete"}
+		applyErr := ApplyRemoteChange(cl, nil, modelsDir, evt)
+		Expect(applyErr).NotTo(HaveOccurred())
+
+		Expect(inMemory("doomed")).To(BeFalse(), "delete event must prune the element from memory")
+	})
+
+	It("does a full reload when no element is named", func() {
+		for _, name := range []string{"m1", "m2"} {
+			putConfig(name, map[string]any{"alias": "qwen"})
+		}
+
+		applyErr := ApplyRemoteChange(cl, nil, modelsDir, messaging.CacheInvalidateEvent{})
+		Expect(applyErr).NotTo(HaveOccurred())
+
+		haveM1, haveM2 := inMemory("m1"), inMemory("m2")
+		Expect(haveM1).To(BeTrue())
+		Expect(haveM2).To(BeTrue())
+	})
+})

From f0d0bff2320c7a7b0a93623949c4f00fc6c61613 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 27 Jun 2026 01:42:05 +0200
Subject: [PATCH 5/8] fix(llama-cpp): stop reinterpreting plain-string message
 content as JSON (#10524) (#10538)

The llama-cpp gRPC backend reconstructs OpenAI messages from proto for the
tokenizer-template path, and it used to blindly json::parse each message's
content string. LocalAI's Go layer always flattens content to a plain string,
so a user prompt that merely looks like JSON (e.g. mealie's ingredient array
["1/4 cup brown sugar", ...]) was reinterpreted as structured content parts and
rejected by oaicompat_chat_params_parse with "unsupported content[].type".

Normalize content per role instead: user/system/developer content is opaque
text and is never JSON-sniffed; assistant/tool content still collapses a literal
JSON null/object (tool-call bookkeeping) to a string, but a plain string is
never turned into an array/scalar. The array defense is role-independent, so the
role gate only governs the benign null/object case.

While here, extract the duplicated per-message reconstruction and the
pre-template content sanitization into shared, unit-tested helpers
(message_content.h) so the streaming (PredictStream) and non-streaming (Predict)
paths cannot drift. This removes ~490 lines of copy-pasted defensive code, the
dead tool-role parse branches, and the redundant Predict-only tool_calls branch,
while preserving the prior #7324 (null content -> "") and #7528 (tool array
content -> string) fixes.

Tests and build wiring:
- backend/cpp/llama-cpp/message_content_test.cpp: standalone C++ unit tests for
  all three helpers (#10524, #7324, #7528, multimodal), discovered and run by
  `make test-backend-cpp` and a new generic tests-backend-cpp CI job. Also wired
  as an opt-in CMake/ctest target (-DLLAMA_GRPC_BUILD_TESTS=ON).
- core/schema/message_test.go: Go regression pinning that ToProto flattens a
  JSON-array-looking text part to the verbatim string.
- prepare.sh now copies message_content.h into the build tree.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 .github/workflows/test.yml                    |  16 +
 Makefile                                      |   9 +-
 backend/cpp/llama-cpp/CMakeLists.txt          |  15 +
 backend/cpp/llama-cpp/grpc-server.cpp         | 581 +-----------------
 backend/cpp/llama-cpp/message_content.h       | 192 ++++++
 .../cpp/llama-cpp/message_content_test.cpp    | 234 +++++++
 backend/cpp/llama-cpp/prepare.sh              |   4 +
 backend/cpp/run-unit-tests.sh                 |  71 +++
 core/schema/message_test.go                   |  26 +
 9 files changed, 595 insertions(+), 553 deletions(-)
 create mode 100644 backend/cpp/llama-cpp/message_content.h
 create mode 100644 backend/cpp/llama-cpp/message_content_test.cpp
 create mode 100755 backend/cpp/run-unit-tests.sh

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index df5512283..e18e38b62 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -121,3 +121,19 @@ jobs:
           detached: true
           connect-timeout-seconds: 180
           limit-access-to-actor: true
+
+  # Fast standalone unit tests for the backends' pure C++ helpers - currently the
+  # llama-cpp message reconstruction (backend/cpp/llama-cpp/message_content.h),
+  # which guards the OpenAI chat content normalization (mudler/LocalAI#10524,
+  # #7324, #7528). The runner discovers every *_test.cpp under backend/cpp/, so
+  # new pure-C++ unit tests are picked up with no CI changes. These need only the
+  # C++ stdlib + nlohmann/json, so they run on every PR without the full
+  # llama.cpp + gRPC backend build. (The same suite is also wired as an opt-in
+  # CMake/ctest target, -DLLAMA_GRPC_BUILD_TESTS=ON, for in-backend-build runs.)
+  tests-backend-cpp:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v7
+      - name: Run backend C++ unit tests
+        run: make test-backend-cpp
diff --git a/Makefile b/Makefile
index 600f6cbbe..2a8edc3fc 100644
--- a/Makefile
+++ b/Makefile
@@ -103,7 +103,7 @@ COVERAGE_E2E_LABELS?=!real-models
 COVERAGE_EXCLUDE_RE?=grpc/proto/.*[.]pb[.]go
 
 
-.PHONY: all test test-coverage test-coverage-baseline test-coverage-check test-ui test-ui-coverage-baseline test-ui-coverage-check install-hooks build vendor lint lint-all
+.PHONY: all test test-coverage test-coverage-baseline test-coverage-check test-backend-cpp test-ui test-ui-coverage-baseline test-ui-coverage-check install-hooks build vendor lint lint-all
 
 all: help
 
@@ -201,6 +201,13 @@ test: prepare-test
 	OPUS_SHIM_LIBRARY=$(abspath ./pkg/opus/shim/libopusshim.so) \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
 
+## Compiles and runs the standalone C++ unit tests for the backends (pure
+## helpers that depend only on the stdlib + nlohmann/json, no full backend
+## build). Discovers every *_test.cpp under backend/cpp/ - see
+## backend/cpp/run-unit-tests.sh. Set NLOHMANN_INCLUDE to skip the header fetch.
+test-backend-cpp:
+	bash backend/cpp/run-unit-tests.sh
+
 ## Runs the core suite ($(TEST_PATHS)) with statement-coverage instrumentation
 ## and writes a merged profile to $(COVERAGE_PROFILE). Deliberately omits
 ## --fail-fast so a single failure doesn't truncate the coverage number, and
diff --git a/backend/cpp/llama-cpp/CMakeLists.txt b/backend/cpp/llama-cpp/CMakeLists.txt
index bdf20802a..8b8d2e2d5 100644
--- a/backend/cpp/llama-cpp/CMakeLists.txt
+++ b/backend/cpp/llama-cpp/CMakeLists.txt
@@ -87,3 +87,18 @@ target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
   add_dependencies(${TARGET} BUILD_INFO)
 endif()
+
+# Unit test for the message-content normalization helper (message_content.h).
+# Off by default so the normal backend build is untouched; enable with
+# -DLLAMA_GRPC_BUILD_TESTS=ON and run via ctest. It reuses llama.cpp's vendored
+# <nlohmann/json.hpp> (propagated by the common helpers library) so it has no
+# extra dependency beyond what the backend already builds against.
+option(LLAMA_GRPC_BUILD_TESTS "Build grpc-server unit tests" OFF)
+if(LLAMA_GRPC_BUILD_TESTS)
+    enable_testing()
+    add_executable(message_content_test message_content_test.cpp message_content.h)
+    target_include_directories(message_content_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+    target_link_libraries(message_content_test PRIVATE ${_LLAMA_COMMON_TARGET})
+    target_compile_features(message_content_test PRIVATE cxx_std_17)
+    add_test(NAME message_content_test COMMAND message_content_test)
+endif()
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 6907b9122..9d17e23b1 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -39,6 +39,7 @@
 #include "common.h"
 #include "arg.h"
 #include "chat-auto-parser.h"
+#include "message_content.h"
 #include <getopt.h>
 #include <grpcpp/ext/proto_server_reflection_plugin.h>
 #include <grpcpp/grpcpp.h>
@@ -1616,242 +1617,20 @@ public:
 
                 for (int i = 0; i < request->messages_size(); i++) {
                     const auto& msg = request->messages(i);
-                    json msg_json;
-                    msg_json["role"] = msg.role();
-
-                    bool is_last_user_msg = (i == last_user_msg_idx);
-                    bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0);
-
-                    // Handle content - can be string, null, or array
-                    // For multimodal content, we'll embed images/audio from separate fields
-                    if (!msg.content().empty()) {
-                        // Try to parse content as JSON to see if it's already an array
-                        json content_val;
-                        try {
-                            content_val = json::parse(msg.content());
-                            // Handle null values - convert to empty string to avoid template errors
-                            if (content_val.is_null()) {
-                                content_val = "";
-                            }
-                        } catch (const json::parse_error&) {
-                            // Not JSON, treat as plain string
-                            content_val = msg.content();
-                        }
-
-                        // If content is an object (e.g., from tool call failures), convert to string
-                        if (content_val.is_object()) {
-                            content_val = content_val.dump();
-                        }
-
-                        // If content is a string and this is the last user message with images/audio, combine them
-                        if (content_val.is_string() && is_last_user_msg && has_images_or_audio) {
-                            json content_array = json::array();
-                            // Add text first
-                            content_array.push_back({{"type", "text"}, {"text", content_val.get<std::string>()}});
-                            // Add images
-                            if (request->images_size() > 0) {
-                                for (int j = 0; j < request->images_size(); j++) {
-                                    json image_chunk;
-                                    image_chunk["type"] = "image_url";
-                                    json image_url;
-                                    image_url["url"] = "data:image/jpeg;base64," + request->images(j);
-                                    image_chunk["image_url"] = image_url;
-                                    content_array.push_back(image_chunk);
-                                }
-                            }
-                            // Add audios
-                            if (request->audios_size() > 0) {
-                                for (int j = 0; j < request->audios_size(); j++) {
-                                    json audio_chunk;
-                                    audio_chunk["type"] = "input_audio";
-                                    json input_audio;
-                                    input_audio["data"] = request->audios(j);
-                                    input_audio["format"] = "wav"; // default, could be made configurable
-                                    audio_chunk["input_audio"] = input_audio;
-                                    content_array.push_back(audio_chunk);
-                                }
-                            }
-                            if (request->videos_size() > 0) {
-                                for (int j = 0; j < request->videos_size(); j++) {
-                                    json video_chunk;
-                                    video_chunk["type"] = "input_video";
-                                    json input_video;
-                                    input_video["data"] = request->videos(j);
-                                    video_chunk["input_video"] = input_video;
-                                    content_array.push_back(video_chunk);
-                                }
-                            }
-                            msg_json["content"] = content_array;
-                        } else {
-                            // Use content as-is (already array or not last user message)
-                            // Ensure null values are converted to empty string
-                            if (content_val.is_null()) {
-                                msg_json["content"] = "";
-                            } else {
-                                msg_json["content"] = content_val;
-                            }
-                        }
-                    } else if (is_last_user_msg && has_images_or_audio) {
-                        // If no content but this is the last user message with images/audio, create content array
-                        json content_array = json::array();
-                        if (request->images_size() > 0) {
-                            for (int j = 0; j < request->images_size(); j++) {
-                                json image_chunk;
-                                image_chunk["type"] = "image_url";
-                                json image_url;
-                                image_url["url"] = "data:image/jpeg;base64," + request->images(j);
-                                image_chunk["image_url"] = image_url;
-                                content_array.push_back(image_chunk);
-                            }
-                        }
-                        if (request->audios_size() > 0) {
-                            for (int j = 0; j < request->audios_size(); j++) {
-                                json audio_chunk;
-                                audio_chunk["type"] = "input_audio";
-                                json input_audio;
-                                input_audio["data"] = request->audios(j);
-                                input_audio["format"] = "wav"; // default, could be made configurable
-                                audio_chunk["input_audio"] = input_audio;
-                                content_array.push_back(audio_chunk);
-                            }
-                        }
-                        if (request->videos_size() > 0) {
-                            for (int j = 0; j < request->videos_size(); j++) {
-                                json video_chunk;
-                                video_chunk["type"] = "input_video";
-                                json input_video;
-                                input_video["data"] = request->videos(j);
-                                video_chunk["input_video"] = input_video;
-                                content_array.push_back(video_chunk);
-                            }
-                        }
-                        msg_json["content"] = content_array;
-                    } else if (msg.role() == "tool") {
-                        // Tool role messages must have content field set, even if empty
-                        // Jinja templates expect content to be a string, not null or object
-                        SRV_INF("[CONTENT DEBUG] PredictStream: Message %d is tool role, content_empty=%d\n", i, msg.content().empty() ? 1 : 0);
-                        if (msg.content().empty()) {
-                            msg_json["content"] = "";
-                            SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): empty content, set to empty string\n", i);
-                        } else {
-                            SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): content exists: %s\n",
-                                    i, msg.content().substr(0, std::min<size_t>(200, msg.content().size())).c_str());
-                            // Content exists, parse and ensure it's a string
-                            json content_val;
-                            try {
-                                content_val = json::parse(msg.content());
-                                SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): parsed JSON, type=%s\n",
-                                        i, content_val.is_null() ? "null" :
-                                           content_val.is_object() ? "object" :
-                                           content_val.is_string() ? "string" :
-                                           content_val.is_array() ? "array" : "other");
-                                // Handle null values - Jinja templates expect content to be a string, not null
-                                if (content_val.is_null()) {
-                                    msg_json["content"] = "";
-                                    SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): null content, converted to empty string\n", i);
-                                } else if (content_val.is_object()) {
-                                    // If content is an object (e.g., from tool call failures/errors), convert to string
-                                    msg_json["content"] = content_val.dump();
-                                    SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): object content, converted to string: %s\n",
-                                            i, content_val.dump().substr(0, std::min<size_t>(200, content_val.dump().size())).c_str());
-                                } else if (content_val.is_string()) {
-                                    msg_json["content"] = content_val.get<std::string>();
-                                    SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): string content, using as-is\n", i);
-                                } else {
-                                    // For arrays or other types, convert to string
-                                    msg_json["content"] = content_val.dump();
-                                    SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): %s content, converted to string\n",
-                                            i, content_val.is_array() ? "array" : "other type");
-                                }
-                            } catch (const json::parse_error&) {
-                                // Not JSON, treat as plain string
-                                msg_json["content"] = msg.content();
-                                SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): not JSON, using as string\n", i);
-                            }
-                        }
-                    } else {
-                        // Ensure all messages have content set (fallback for any unhandled cases)
-                        // Jinja templates expect content to be present, default to empty string if not set
-                        if (!msg_json.contains("content")) {
-                            SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (role=%s): no content field, adding empty string\n",
-                                    i, msg.role().c_str());
-                            msg_json["content"] = "";
-                        }
+                    llama_grpc::ReconstructedMessageInput rin;
+                    rin.role = msg.role();
+                    rin.content = msg.content();
+                    rin.name = msg.name();
+                    rin.tool_call_id = msg.tool_call_id();
+                    rin.reasoning_content = msg.reasoning_content();
+                    rin.tool_calls = msg.tool_calls();
+                    rin.is_last_user_msg = (i == last_user_msg_idx);
+                    if (rin.is_last_user_msg) {
+                        for (int j = 0; j < request->images_size(); j++) rin.images.push_back(request->images(j));
+                        for (int j = 0; j < request->audios_size(); j++) rin.audios.push_back(request->audios(j));
+                        for (int j = 0; j < request->videos_size(); j++) rin.videos.push_back(request->videos(j));
                     }
-
-                    // Add optional fields for OpenAI-compatible message format
-                    if (!msg.name().empty()) {
-                        msg_json["name"] = msg.name();
-                    }
-                    if (!msg.tool_call_id().empty()) {
-                        msg_json["tool_call_id"] = msg.tool_call_id();
-                    }
-                    if (!msg.reasoning_content().empty()) {
-                        msg_json["reasoning_content"] = msg.reasoning_content();
-                    }
-                    if (!msg.tool_calls().empty()) {
-                        // Parse tool_calls JSON string and add to message
-                        try {
-                            json tool_calls = json::parse(msg.tool_calls());
-                            msg_json["tool_calls"] = tool_calls;
-                            SRV_INF("[TOOL CALLS DEBUG] PredictStream: Message %d has tool_calls: %s\n", i, tool_calls.dump().c_str());
-                            // IMPORTANT: If message has tool_calls but content is empty or not set,
-                            // set content to space " " instead of empty string "", because llama.cpp's
-                            // common_chat_msgs_to_json_oaicompat converts empty strings to null (line 312),
-                            // which causes template errors when accessing message.content[:tool_start_length]
-                            if (!msg_json.contains("content") || (msg_json.contains("content") && msg_json["content"].is_string() && msg_json["content"].get<std::string>().empty())) {
-                                SRV_INF("[CONTENT DEBUG] PredictStream: Message %d has tool_calls but empty content, setting to space\n", i);
-                                msg_json["content"] = " ";
-                            }
-                            // Log each tool call with name and arguments
-                            if (tool_calls.is_array()) {
-                                for (size_t tc_idx = 0; tc_idx < tool_calls.size(); tc_idx++) {
-                                    const auto& tc = tool_calls[tc_idx];
-                                    std::string tool_name = "unknown";
-                                    std::string tool_args = "{}";
-                                    if (tc.contains("function")) {
-                                        const auto& func = tc["function"];
-                                        if (func.contains("name")) {
-                                            tool_name = func["name"].get<std::string>();
-                                        }
-                                        if (func.contains("arguments")) {
-                                            tool_args = func["arguments"].is_string() ?
-                                                func["arguments"].get<std::string>() :
-                                                func["arguments"].dump();
-                                        }
-                                    } else if (tc.contains("name")) {
-                                        tool_name = tc["name"].get<std::string>();
-                                        if (tc.contains("arguments")) {
-                                            tool_args = tc["arguments"].is_string() ?
-                                                tc["arguments"].get<std::string>() :
-                                                tc["arguments"].dump();
-                                        }
-                                    }
-                                    SRV_INF("[TOOL CALLS DEBUG] PredictStream: Message %d, tool_call %zu: name=%s, arguments=%s\n",
-                                            i, tc_idx, tool_name.c_str(), tool_args.c_str());
-                                }
-                            }
-                        } catch (const json::parse_error& e) {
-                            SRV_WRN("Failed to parse tool_calls JSON: %s\n", e.what());
-                        }
-                    }
-
-                    // Debug: Log final content state before adding to array
-                    if (msg_json.contains("content")) {
-                        if (msg_json["content"].is_null()) {
-                            SRV_INF("[CONTENT DEBUG] PredictStream: Message %d FINAL STATE: content is NULL - THIS WILL CAUSE ERROR!\n", i);
-                        } else {
-                            SRV_INF("[CONTENT DEBUG] PredictStream: Message %d FINAL STATE: content type=%s, has_value=%d\n",
-                                    i, msg_json["content"].is_string() ? "string" :
-                                       msg_json["content"].is_array() ? "array" :
-                                       msg_json["content"].is_object() ? "object" : "other",
-                                    msg_json["content"].is_null() ? 0 : 1);
-                        }
-                    } else {
-                        SRV_INF("[CONTENT DEBUG] PredictStream: Message %d FINAL STATE: NO CONTENT FIELD - THIS WILL CAUSE ERROR!\n", i);
-                    }
-
-                    messages_json.push_back(msg_json);
+                    messages_json.push_back(llama_grpc::build_reconstructed_message(rin));
                 }
 
                 // Final safety check: Ensure no message has null content (Jinja templates require strings)
@@ -2072,36 +1851,7 @@ public:
                 if (body_json.contains("messages") && body_json["messages"].is_array()) {
                     SRV_INF("[CONTENT DEBUG] PredictStream: Before oaicompat_chat_params_parse - checking %zu messages\n", body_json["messages"].size());
                     for (size_t idx = 0; idx < body_json["messages"].size(); idx++) {
-                        auto& msg = body_json["messages"][idx];
-                        std::string role_str = msg.contains("role") ? msg["role"].get<std::string>() : "unknown";
-                        if (msg.contains("content")) {
-                            if (msg["content"].is_null()) {
-                                SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s) has NULL content - FIXING!\n", idx, role_str.c_str());
-                                msg["content"] = ""; // Fix null content
-                            } else if (role_str == "tool" && msg["content"].is_array()) {
-                                // Tool messages must have string content, not array
-                                // oaicompat_chat_params_parse expects tool messages to have string content
-                                SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=tool) has array content, converting to string\n", idx);
-                                msg["content"] = msg["content"].dump();
-                            } else if (!msg["content"].is_string() && !msg["content"].is_array()) {
-                                // If content is object or other non-string type, convert to string for templates
-                                SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s) content is not string/array, converting\n", idx, role_str.c_str());
-                                if (msg["content"].is_object()) {
-                                    msg["content"] = msg["content"].dump();
-                                } else {
-                                    msg["content"] = "";
-                                }
-                            } else {
-                                SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s): content type=%s\n",
-                                        idx, role_str.c_str(),
-                                        msg["content"].is_string() ? "string" :
-                                        msg["content"].is_array() ? "array" :
-                                        msg["content"].is_object() ? "object" : "other");
-                            }
-                        } else {
-                            SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s) MISSING content field - ADDING!\n", idx, role_str.c_str());
-                            msg["content"] = ""; // Add missing content
-                        }
+                        llama_grpc::normalize_template_message(body_json["messages"][idx]);
                     }
                 }
 
@@ -2433,264 +2183,20 @@ public:
                 SRV_INF("[CONTENT DEBUG] Predict: Processing %d messages\n", request->messages_size());
                 for (int i = 0; i < request->messages_size(); i++) {
                     const auto& msg = request->messages(i);
-                    json msg_json;
-                    msg_json["role"] = msg.role();
-
-                    SRV_INF("[CONTENT DEBUG] Predict: Message %d: role=%s, content_empty=%d, content_length=%zu\n",
-                            i, msg.role().c_str(), msg.content().empty() ? 1 : 0, msg.content().size());
-                    if (!msg.content().empty()) {
-                        SRV_INF("[CONTENT DEBUG] Predict: Message %d content (first 200 chars): %s\n",
-                                i, msg.content().substr(0, std::min<size_t>(200, msg.content().size())).c_str());
+                    llama_grpc::ReconstructedMessageInput rin;
+                    rin.role = msg.role();
+                    rin.content = msg.content();
+                    rin.name = msg.name();
+                    rin.tool_call_id = msg.tool_call_id();
+                    rin.reasoning_content = msg.reasoning_content();
+                    rin.tool_calls = msg.tool_calls();
+                    rin.is_last_user_msg = (i == last_user_msg_idx);
+                    if (rin.is_last_user_msg) {
+                        for (int j = 0; j < request->images_size(); j++) rin.images.push_back(request->images(j));
+                        for (int j = 0; j < request->audios_size(); j++) rin.audios.push_back(request->audios(j));
+                        for (int j = 0; j < request->videos_size(); j++) rin.videos.push_back(request->videos(j));
                     }
-
-                    bool is_last_user_msg = (i == last_user_msg_idx);
-                    bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0);
-
-                    // Handle content - can be string, null, or array
-                    // For multimodal content, we'll embed images/audio from separate fields
-                    if (!msg.content().empty()) {
-                        // Try to parse content as JSON to see if it's already an array
-                        json content_val;
-                        try {
-                            content_val = json::parse(msg.content());
-                            // Handle null values - convert to empty string to avoid template errors
-                            if (content_val.is_null()) {
-                                SRV_INF("[CONTENT DEBUG] Predict: Message %d parsed JSON is null, converting to empty string\n", i);
-                                content_val = "";
-                            }
-                        } catch (const json::parse_error&) {
-                            // Not JSON, treat as plain string
-                            content_val = msg.content();
-                        }
-
-                        // If content is an object (e.g., from tool call failures), convert to string
-                        if (content_val.is_object()) {
-                            SRV_INF("[CONTENT DEBUG] Predict: Message %d content is object, converting to string\n", i);
-                            content_val = content_val.dump();
-                        }
-
-                        // If content is a string and this is the last user message with images/audio, combine them
-                        if (content_val.is_string() && is_last_user_msg && has_images_or_audio) {
-                            json content_array = json::array();
-                            // Add text first
-                            content_array.push_back({{"type", "text"}, {"text", content_val.get<std::string>()}});
-                            // Add images
-                            if (request->images_size() > 0) {
-                                for (int j = 0; j < request->images_size(); j++) {
-                                    json image_chunk;
-                                    image_chunk["type"] = "image_url";
-                                    json image_url;
-                                    image_url["url"] = "data:image/jpeg;base64," + request->images(j);
-                                    image_chunk["image_url"] = image_url;
-                                    content_array.push_back(image_chunk);
-                                }
-                            }
-                            // Add audios
-                            if (request->audios_size() > 0) {
-                                for (int j = 0; j < request->audios_size(); j++) {
-                                    json audio_chunk;
-                                    audio_chunk["type"] = "input_audio";
-                                    json input_audio;
-                                    input_audio["data"] = request->audios(j);
-                                    input_audio["format"] = "wav"; // default, could be made configurable
-                                    audio_chunk["input_audio"] = input_audio;
-                                    content_array.push_back(audio_chunk);
-                                }
-                            }
-                            if (request->videos_size() > 0) {
-                                for (int j = 0; j < request->videos_size(); j++) {
-                                    json video_chunk;
-                                    video_chunk["type"] = "input_video";
-                                    json input_video;
-                                    input_video["data"] = request->videos(j);
-                                    video_chunk["input_video"] = input_video;
-                                    content_array.push_back(video_chunk);
-                                }
-                            }
-                            msg_json["content"] = content_array;
-                        } else {
-                            // Use content as-is (already array or not last user message)
-                            // Ensure null values are converted to empty string
-                            if (content_val.is_null()) {
-                                SRV_INF("[CONTENT DEBUG] Predict: Message %d content_val was null, setting to empty string\n", i);
-                                msg_json["content"] = "";
-                            } else {
-                                msg_json["content"] = content_val;
-                                SRV_INF("[CONTENT DEBUG] Predict: Message %d content set, type=%s\n",
-                                        i, content_val.is_string() ? "string" :
-                                           content_val.is_array() ? "array" :
-                                           content_val.is_object() ? "object" : "other");
-                            }
-                        }
-                    } else if (is_last_user_msg && has_images_or_audio) {
-                        // If no content but this is the last user message with images/audio, create content array
-                        json content_array = json::array();
-                        if (request->images_size() > 0) {
-                            for (int j = 0; j < request->images_size(); j++) {
-                                json image_chunk;
-                                image_chunk["type"] = "image_url";
-                                json image_url;
-                                image_url["url"] = "data:image/jpeg;base64," + request->images(j);
-                                image_chunk["image_url"] = image_url;
-                                content_array.push_back(image_chunk);
-                            }
-                        }
-                        if (request->audios_size() > 0) {
-                            for (int j = 0; j < request->audios_size(); j++) {
-                                json audio_chunk;
-                                audio_chunk["type"] = "input_audio";
-                                json input_audio;
-                                input_audio["data"] = request->audios(j);
-                                input_audio["format"] = "wav"; // default, could be made configurable
-                                audio_chunk["input_audio"] = input_audio;
-                                content_array.push_back(audio_chunk);
-                            }
-                        }
-                        if (request->videos_size() > 0) {
-                            for (int j = 0; j < request->videos_size(); j++) {
-                                json video_chunk;
-                                video_chunk["type"] = "input_video";
-                                json input_video;
-                                input_video["data"] = request->videos(j);
-                                video_chunk["input_video"] = input_video;
-                                content_array.push_back(video_chunk);
-                            }
-                        }
-                        msg_json["content"] = content_array;
-                        SRV_INF("[CONTENT DEBUG] Predict: Message %d created content array with media\n", i);
-                    } else if (!msg.tool_calls().empty()) {
-                        // Tool call messages may have null content, but templates expect string
-                        // IMPORTANT: Set to space " " instead of empty string "", because llama.cpp's
-                        // common_chat_msgs_to_json_oaicompat converts empty strings to null (line 312),
-                        // which causes template errors when accessing message.content[:tool_start_length]
-                        SRV_INF("[CONTENT DEBUG] Predict: Message %d has tool_calls, setting content to space (not empty string)\n", i);
-                        msg_json["content"] = " ";
-                    } else if (msg.role() == "tool") {
-                        // Tool role messages must have content field set, even if empty
-                        // Jinja templates expect content to be a string, not null or object
-                        SRV_INF("[CONTENT DEBUG] Predict: Message %d is tool role, content_empty=%d\n", i, msg.content().empty() ? 1 : 0);
-                        if (msg.content().empty()) {
-                            msg_json["content"] = "";
-                            SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): empty content, set to empty string\n", i);
-                        } else {
-                            SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): content exists: %s\n",
-                                    i, msg.content().substr(0, std::min<size_t>(200, msg.content().size())).c_str());
-                            // Content exists, parse and ensure it's a string
-                            json content_val;
-                            try {
-                                content_val = json::parse(msg.content());
-                                SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): parsed JSON, type=%s\n",
-                                        i, content_val.is_null() ? "null" :
-                                           content_val.is_object() ? "object" :
-                                           content_val.is_string() ? "string" :
-                                           content_val.is_array() ? "array" : "other");
-                                // Handle null values - Jinja templates expect content to be a string, not null
-                                if (content_val.is_null()) {
-                                    msg_json["content"] = "";
-                                    SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): null content, converted to empty string\n", i);
-                                } else if (content_val.is_object()) {
-                                    // If content is an object (e.g., from tool call failures/errors), convert to string
-                                    msg_json["content"] = content_val.dump();
-                                    SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): object content, converted to string: %s\n",
-                                            i, content_val.dump().substr(0, std::min<size_t>(200, content_val.dump().size())).c_str());
-                                } else if (content_val.is_string()) {
-                                    msg_json["content"] = content_val.get<std::string>();
-                                    SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): string content, using as-is\n", i);
-                                } else {
-                                    // For arrays or other types, convert to string
-                                    msg_json["content"] = content_val.dump();
-                                    SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): %s content, converted to string\n",
-                                            i, content_val.is_array() ? "array" : "other type");
-                                }
-                            } catch (const json::parse_error&) {
-                                // Not JSON, treat as plain string
-                                msg_json["content"] = msg.content();
-                                SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): not JSON, using as string\n", i);
-                            }
-                        }
-                    } else {
-                        // Ensure all messages have content set (fallback for any unhandled cases)
-                        // Jinja templates expect content to be present, default to empty string if not set
-                        if (!msg_json.contains("content")) {
-                            SRV_INF("[CONTENT DEBUG] Predict: Message %d (role=%s): no content field, adding empty string\n",
-                                    i, msg.role().c_str());
-                            msg_json["content"] = "";
-                        }
-                    }
-
-                    // Add optional fields for OpenAI-compatible message format
-                    if (!msg.name().empty()) {
-                        msg_json["name"] = msg.name();
-                    }
-                    if (!msg.tool_call_id().empty()) {
-                        msg_json["tool_call_id"] = msg.tool_call_id();
-                    }
-                    if (!msg.reasoning_content().empty()) {
-                        msg_json["reasoning_content"] = msg.reasoning_content();
-                    }
-                    if (!msg.tool_calls().empty()) {
-                        // Parse tool_calls JSON string and add to message
-                        try {
-                            json tool_calls = json::parse(msg.tool_calls());
-                            msg_json["tool_calls"] = tool_calls;
-                            SRV_INF("[TOOL CALLS DEBUG] Predict: Message %d has tool_calls: %s\n", i, tool_calls.dump().c_str());
-                            // IMPORTANT: If message has tool_calls but content is empty or not set,
-                            // set content to space " " instead of empty string "", because llama.cpp's
-                            // common_chat_msgs_to_json_oaicompat converts empty strings to null (line 312),
-                            // which causes template errors when accessing message.content[:tool_start_length]
-                            if (!msg_json.contains("content") || (msg_json.contains("content") && msg_json["content"].is_string() && msg_json["content"].get<std::string>().empty())) {
-                                SRV_INF("[CONTENT DEBUG] Predict: Message %d has tool_calls but empty content, setting to space\n", i);
-                                msg_json["content"] = " ";
-                            }
-                            // Log each tool call with name and arguments
-                            if (tool_calls.is_array()) {
-                                for (size_t tc_idx = 0; tc_idx < tool_calls.size(); tc_idx++) {
-                                    const auto& tc = tool_calls[tc_idx];
-                                    std::string tool_name = "unknown";
-                                    std::string tool_args = "{}";
-                                    if (tc.contains("function")) {
-                                        const auto& func = tc["function"];
-                                        if (func.contains("name")) {
-                                            tool_name = func["name"].get<std::string>();
-                                        }
-                                        if (func.contains("arguments")) {
-                                            tool_args = func["arguments"].is_string() ?
-                                                func["arguments"].get<std::string>() :
-                                                func["arguments"].dump();
-                                        }
-                                    } else if (tc.contains("name")) {
-                                        tool_name = tc["name"].get<std::string>();
-                                        if (tc.contains("arguments")) {
-                                            tool_args = tc["arguments"].is_string() ?
-                                                tc["arguments"].get<std::string>() :
-                                                tc["arguments"].dump();
-                                        }
-                                    }
-                                    SRV_INF("[TOOL CALLS DEBUG] Predict: Message %d, tool_call %zu: name=%s, arguments=%s\n",
-                                            i, tc_idx, tool_name.c_str(), tool_args.c_str());
-                                }
-                            }
-                        } catch (const json::parse_error& e) {
-                            SRV_WRN("Failed to parse tool_calls JSON: %s\n", e.what());
-                        }
-                    }
-
-                    // Debug: Log final content state before adding to array
-                    if (msg_json.contains("content")) {
-                        if (msg_json["content"].is_null()) {
-                            SRV_INF("[CONTENT DEBUG] Predict: Message %d FINAL STATE: content is NULL - THIS WILL CAUSE ERROR!\n", i);
-                        } else {
-                            SRV_INF("[CONTENT DEBUG] Predict: Message %d FINAL STATE: content type=%s, has_value=%d\n",
-                                    i, msg_json["content"].is_string() ? "string" :
-                                       msg_json["content"].is_array() ? "array" :
-                                       msg_json["content"].is_object() ? "object" : "other",
-                                    msg_json["content"].is_null() ? 0 : 1);
-                        }
-                    } else {
-                        SRV_INF("[CONTENT DEBUG] Predict: Message %d FINAL STATE: NO CONTENT FIELD - THIS WILL CAUSE ERROR!\n", i);
-                    }
-
-                    messages_json.push_back(msg_json);
+                    messages_json.push_back(llama_grpc::build_reconstructed_message(rin));
                 }
 
                 // Final safety check: Ensure no message has null content (Jinja templates require strings)
@@ -2911,36 +2417,7 @@ public:
                 if (body_json.contains("messages") && body_json["messages"].is_array()) {
                     SRV_INF("[CONTENT DEBUG] Predict: Before oaicompat_chat_params_parse - checking %zu messages\n", body_json["messages"].size());
                     for (size_t idx = 0; idx < body_json["messages"].size(); idx++) {
-                        auto& msg = body_json["messages"][idx];
-                        std::string role_str = msg.contains("role") ? msg["role"].get<std::string>() : "unknown";
-                        if (msg.contains("content")) {
-                            if (msg["content"].is_null()) {
-                                SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s) has NULL content - FIXING!\n", idx, role_str.c_str());
-                                msg["content"] = ""; // Fix null content
-                            } else if (role_str == "tool" && msg["content"].is_array()) {
-                                // Tool messages must have string content, not array
-                                // oaicompat_chat_params_parse expects tool messages to have string content
-                                SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=tool) has array content, converting to string\n", idx);
-                                msg["content"] = msg["content"].dump();
-                            } else if (!msg["content"].is_string() && !msg["content"].is_array()) {
-                                // If content is object or other non-string type, convert to string for templates
-                                SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s) content is not string/array, converting\n", idx, role_str.c_str());
-                                if (msg["content"].is_object()) {
-                                    msg["content"] = msg["content"].dump();
-                                } else {
-                                    msg["content"] = "";
-                                }
-                            } else {
-                                SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s): content type=%s\n",
-                                        idx, role_str.c_str(),
-                                        msg["content"].is_string() ? "string" :
-                                        msg["content"].is_array() ? "array" :
-                                        msg["content"].is_object() ? "object" : "other");
-                            }
-                        } else {
-                            SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s) MISSING content field - ADDING!\n", idx, role_str.c_str());
-                            msg["content"] = ""; // Add missing content
-                        }
+                        llama_grpc::normalize_template_message(body_json["messages"][idx]);
                     }
                 }
 
diff --git a/backend/cpp/llama-cpp/message_content.h b/backend/cpp/llama-cpp/message_content.h
new file mode 100644
index 000000000..4c7317ecd
--- /dev/null
+++ b/backend/cpp/llama-cpp/message_content.h
@@ -0,0 +1,192 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include <nlohmann/json.hpp>
+
+namespace llama_grpc {
+
+// Normalizes a proto message's content string into the JSON value used when
+// reconstructing OpenAI-format messages for the tokenizer (jinja) template.
+//
+// Shared by the streaming (PredictStream) and non-streaming (Predict) message
+// reconstruction paths so the two cannot drift.
+//
+// LocalAI's Go layer (schema.Messages.ToProto) always sends content as a plain
+// text string; multimodal media travels in separate proto fields, never inside
+// content. So user/system/developer content is *only ever* opaque text and must
+// NOT be JSON-sniffed: a prompt that merely looks like JSON (e.g. an ingredient
+// list ["1/4 cup sugar", ...]) would otherwise be reinterpreted as structured
+// content parts and rejected by oaicompat_chat_params_parse with
+// "unsupported content[].type" (https://github.com/mudler/LocalAI/issues/10524).
+// (developer is OpenAI's modern system alias - same "human-authored text" nature.)
+//
+// For assistant/tool messages we still collapse a literal JSON null/object
+// (tool-call bookkeeping) to a string, but we never turn a plain string into an
+// array/scalar. The array defense is therefore role-independent (arrays/scalars
+// fall through for every role); the role gate only governs the null/object case.
+inline nlohmann::ordered_json normalize_message_content(const std::string& role,
+                                                        const std::string& content) {
+    // Human-authored roles are opaque text: return the string without sniffing.
+    if (role == "user" || role == "system" || role == "developer") {
+        return nlohmann::ordered_json(content);
+    }
+    // Non-throwing parse: bad JSON and number overflow (out_of_range.406, e.g.
+    // "1e999", which a parse_error handler misses) both yield a discarded value.
+    const nlohmann::ordered_json parsed =
+        nlohmann::ordered_json::parse(content, nullptr, /*allow_exceptions=*/false);
+    if (parsed.is_null()) {
+        return nlohmann::ordered_json(""); // literal JSON null -> empty string
+    }
+    if (parsed.is_object()) {
+        return nlohmann::ordered_json(parsed.dump()); // object -> its JSON text
+    }
+    return nlohmann::ordered_json(content); // not JSON / array / scalar: keep text
+}
+
+// Final safety pass applied to each reconstructed OpenAI message right before it
+// is handed to oaicompat_chat_params_parse (jinja templating). Jinja templates
+// assume content is a string: a literal null breaks slicing such as
+// message.content[:N] (#7324), and a tool message with array content is rejected
+// (#7528). A multimodal user message legitimately carries a typed-part array
+// ({type:text}, {type:image_url}, ...), which must be left intact. Shared by the
+// streaming and non-streaming paths so this invariant cannot drift between them.
+inline void normalize_template_message(nlohmann::ordered_json& msg) {
+    // Absent field: templates still dereference message.content, so create it.
+    if (!msg.contains("content")) {
+        msg["content"] = "";
+        return;
+    }
+    nlohmann::ordered_json& body = msg["content"];
+    // Strings are already what jinja templates expect.
+    if (body.is_string()) {
+        return;
+    }
+    if (body.is_array()) {
+        // Only tool messages need flattening (#7528); a multimodal typed-part
+        // array on any other role (or a missing/non-string role) stays intact.
+        const bool from_tool = msg.contains("role") && msg["role"].is_string() &&
+                               msg["role"].get<std::string>() == "tool";
+        if (from_tool) body = body.dump();
+        return;
+    }
+    // #7324: null -> "" (content[:N] slicing); object -> JSON text; scalar -> "".
+    body = body.is_object() ? nlohmann::ordered_json(body.dump()) : nlohmann::ordered_json("");
+}
+
+// One proto message's data, flattened to plain types so the reconstruction logic
+// can be shared and unit-tested without protobuf. The streaming and non-streaming
+// predict paths both populate this from proto::Message + the request's media.
+struct ReconstructedMessageInput {
+    std::string role;               // copied verbatim into the message's "role"
+    std::string content;            // proto.Message.content (always a plain string)
+    std::string name;               // emitted as "name" only when non-empty
+    std::string tool_call_id;       // emitted as "tool_call_id" only when non-empty
+    std::string reasoning_content;  // emitted as "reasoning_content" only when non-empty
+    std::string tool_calls;         // tool_calls as a JSON string, or empty
+    bool is_last_user_msg = false;  // attach request media to this message
+    std::vector<std::string> images; // base64 (jpeg); wrapped in a data:image/jpeg URL
+    std::vector<std::string> audios; // base64 (wav); sent as input_audio, format "wav"
+    std::vector<std::string> videos; // base64; sent as input_video data
+};
+
+// Appends the request's media as OpenAI typed content parts. Imperative (not
+// brace-init) to avoid nlohmann's object-vs-array initializer-list ambiguity.
+inline void append_media_parts(nlohmann::ordered_json& content_array,
+                               const std::vector<std::string>& images,
+                               const std::vector<std::string>& audios,
+                               const std::vector<std::string>& videos) {
+    // Parts are appended in a fixed order: images, then audios, then videos.
+    // Each part gets "type" first; indexing the still-null payload slot with a
+    // key turns it into an object, so no temporary payload value is needed.
+
+    for (const std::string& encoded : images) {
+        nlohmann::ordered_json part;
+        part["type"] = "image_url";
+        part["image_url"]["url"] = "data:image/jpeg;base64," + encoded; // always labelled JPEG
+        content_array.push_back(part);
+    }
+
+    for (const std::string& encoded : audios) {
+        nlohmann::ordered_json part;
+        part["type"] = "input_audio";
+        part["input_audio"]["data"] = encoded;
+        part["input_audio"]["format"] = "wav"; // default; could be made configurable
+        content_array.push_back(part);
+    }
+
+    for (const std::string& encoded : videos) {
+        nlohmann::ordered_json part;
+        part["type"] = "input_video";
+        part["input_video"]["data"] = encoded; // raw payload, no format tag
+        content_array.push_back(part);
+    }
+}
+
+// Reconstructs a single OpenAI-format message (the object fed to
+// oaicompat_chat_params_parse) from a proto message. Shared by PredictStream and
+// Predict so the content/multimodal/tool_calls handling cannot drift between the
+// two stream modes (it previously lived as two ~150-line copies with a redundant
+// Predict-only tool_calls->" " branch). Guarantees content is always a string or
+// a typed-part array, never null/missing.
+inline nlohmann::ordered_json build_reconstructed_message(const ReconstructedMessageInput& in) {
+    nlohmann::ordered_json msg_json;
+    msg_json["role"] = in.role;
+    const bool has_media = !in.images.empty() || !in.audios.empty() || !in.videos.empty();
+
+    if (!in.content.empty()) {
+        nlohmann::ordered_json content_val = normalize_message_content(in.role, in.content);
+        if (content_val.is_string() && in.is_last_user_msg && has_media) {
+            // Last user message + media: build a typed-part array (text first).
+            nlohmann::ordered_json content_array = nlohmann::ordered_json::array();
+            nlohmann::ordered_json text_part;
+            text_part["type"] = "text";
+            text_part["text"] = content_val.get<std::string>();
+            content_array.push_back(text_part);
+            append_media_parts(content_array, in.images, in.audios, in.videos);
+            msg_json["content"] = content_array;
+        } else if (content_val.is_null()) {
+            msg_json["content"] = ""; // defensive: templates must never see null
+        } else {
+            msg_json["content"] = content_val;
+        }
+    } else if (in.is_last_user_msg && has_media) {
+        // No text but media on the last user message: media-only typed array.
+        nlohmann::ordered_json content_array = nlohmann::ordered_json::array();
+        append_media_parts(content_array, in.images, in.audios, in.videos);
+        msg_json["content"] = content_array;
+    } else {
+        // Empty content (any role, incl. tool/assistant): templates need a string.
+        msg_json["content"] = "";
+    }
+
+    // Optional OpenAI fields are emitted only when the input carried a value.
+    if (!in.name.empty()) msg_json["name"] = in.name;
+    if (!in.tool_call_id.empty()) msg_json["tool_call_id"] = in.tool_call_id;
+    if (!in.reasoning_content.empty()) msg_json["reasoning_content"] = in.reasoning_content;
+
+    if (!in.tool_calls.empty()) {
+        // Parse without exceptions. With exceptions enabled, nlohmann reports a
+        // number overflow (e.g. 1e999) as out_of_range.406, which is not a
+        // parse_error, so a parse_error-only handler would let it propagate to
+        // the caller. A discarded result covers every failure mode uniformly.
+        nlohmann::ordered_json tool_calls =
+            nlohmann::ordered_json::parse(in.tool_calls, nullptr, /*allow_exceptions=*/false);
+        if (!tool_calls.is_discarded()) {
+            msg_json["tool_calls"] = tool_calls;
+            // tool_calls + empty content: use " " not "", because llama.cpp's
+            // common_chat_msgs_to_json_oaicompat turns "" into null, which breaks
+            // templates that slice message.content[:tool_start_length] (#7324).
+            if (!msg_json.contains("content") ||
+                (msg_json["content"].is_string() && msg_json["content"].get<std::string>().empty())) {
+                msg_json["content"] = " ";
+            }
+        }
+        // Malformed tool_calls JSON: omit the field and leave content as-is.
+    }
+
+    return msg_json;
+}
+
+}  // namespace llama_grpc
diff --git a/backend/cpp/llama-cpp/message_content_test.cpp b/backend/cpp/llama-cpp/message_content_test.cpp
new file mode 100644
index 000000000..7e9a5383a
--- /dev/null
+++ b/backend/cpp/llama-cpp/message_content_test.cpp
@@ -0,0 +1,234 @@
+// Unit tests for the shared message-reconstruction helpers (message_content.h).
+//
+// Build & run standalone (nlohmann/json single header on the include path):
+//   g++ -std=c++17 -I<dir-with-nlohmann> message_content_test.cpp -o t && ./t
+// or via CMake: -DLLAMA_GRPC_BUILD_TESTS=ON then ctest.
+//
+// Regression coverage for:
+//   #10524 - a user/system prompt that is itself a JSON-array string must stay
+//            plain text, never be reinterpreted as OpenAI structured parts.
+//   #7324  - assistant/tool null content -> "" (templates slice content[:N]);
+//            assistant+tool_calls+empty content -> " " (not "", which becomes null).
+//   #7528  - tool message array content must reach the template as a string.
+//   multimodal - last user message text + media -> typed-part array, media kept.
+
+#include <cassert>
+#include <iostream>
+#include <string>
+
+#include "message_content.h"
+
+using nlohmann::ordered_json;
+using llama_grpc::normalize_message_content;
+using llama_grpc::normalize_template_message;
+using llama_grpc::build_reconstructed_message;
+using llama_grpc::ReconstructedMessageInput;
+
+static int failures = 0;
+
+static void check(bool ok, const std::string& name, const std::string& detail = "") {
+    if (ok) return;  // passing checks are silent; only failures are reported and counted
+    const std::string suffix = detail.empty() ? std::string() : ": " + detail;
+    std::cerr << "FAIL " << name << suffix << "\n";
+    ++failures;
+}
+
+// ---- normalize_message_content -------------------------------------------
+
+static void expect_norm_string(const char* name, const std::string& role,
+                               const std::string& content, const std::string& want) {
+    const auto result = normalize_message_content(role, content);  // value under test
+    if (result.is_string()) {
+        const std::string actual = result.get<std::string>();
+        check(actual == want, name, "expected \"" + want + "\", got \"" + actual + "\"");
+    } else {  // wrong JSON type: name the type and dump the value in the failure detail
+        const std::string kind = result.is_array() ? "array" : result.is_object() ? "object" : "other";
+        check(false, name, "expected a JSON string, got " + kind + " (" + result.dump() + ")");
+    }
+}
+
+static void test_normalize() {  // table of normalize_message_content(role, content) expectations
+    const std::string ingredients = R"(["1/4 cup brown sugar, packed","1 pound ground beef"])";
+
+    // #10524 - JSON-array text must stay a string for every role (role-independent array defense).
+    for (const char* role : {"user", "system", "developer", "function", "assistant", "tool"}) {
+        expect_norm_string((std::string("json_array_stays_text:") + role).c_str(), role, ingredients, ingredients);
+    }
+
+    // #10524 - user/system/developer JSON-object text stays verbatim (NOT re-dumped).
+    expect_norm_string("user_json_object_verbatim", "user", R"({"a":1})", R"({"a":1})");
+    expect_norm_string("system_json_object_verbatim", "system", R"({"a":1})", R"({"a":1})");
+    expect_norm_string("developer_json_object_verbatim", "developer", R"({"a":1})", R"({"a":1})");
+
+    // Plain text (including text with an unbalanced '[') is returned unchanged.
+    expect_norm_string("user_plain_text", "user", "hello world", "hello world");
+    expect_norm_string("assistant_non_json_text_kept", "assistant", "hi [unclosed", "hi [unclosed");
+
+    // #7324 boundary - user/system/developer literal "null" preserved (never parsed).
+    expect_norm_string("user_literal_null_stays", "user", "null", "null");
+    expect_norm_string("system_literal_null_stays", "system", "null", "null");
+    expect_norm_string("developer_literal_null_stays", "developer", "null", "null");
+
+    // #7324 - assistant/tool literal null collapses to empty string.
+    expect_norm_string("assistant_null_to_empty", "assistant", "null", "");
+    expect_norm_string("tool_null_to_empty", "tool", "null", "");
+
+    // #7324/#7528 - assistant/tool object content must come back as a string; only the JSON type is asserted here.
+    check(normalize_message_content("assistant", R"({"tool":"x"})").is_string(), "assistant_object_stringified");
+    check(normalize_message_content("tool", R"({"error":"boom"})").is_string(), "tool_object_stringified");
+
+    // #10524-family - a bare scalar that parses as a JSON number stays the string.
+    expect_norm_string("assistant_scalar_number_stays_string", "assistant", "42", "42");
+
+    // baseline - empty content stays empty.
+    expect_norm_string("user_empty_stays_empty", "user", "", "");
+}
+
+// ---- normalize_template_message (BEFORE TEMPLATE sanitizer) ---------------
+
+static void test_template_sanitizer() {  // each case calls normalize_template_message(msg), then inspects msg["content"]
+    // #7528 - a tool message with an ACTUAL array becomes a string.
+    {
+        ordered_json msg = {{"role", "tool"}, {"content", ordered_json::array({{{"type", "text"}, {"text", "r"}}})}};
+        normalize_template_message(msg);
+        check(msg["content"].is_string(), "before_template_tool_array_to_string", "got " + msg["content"].dump());
+    }
+    // #7324 - null content -> "" (exercised here with an assistant message).
+    {
+        ordered_json msg = {{"role", "assistant"}, {"content", nullptr}};
+        normalize_template_message(msg);
+        check(msg["content"].is_string() && msg["content"] == "", "before_template_null_to_empty");
+    }
+    // object content -> string (only the resulting JSON type is asserted).
+    {
+        ordered_json msg = {{"role", "assistant"}, {"content", {{"x", 1}}}};
+        normalize_template_message(msg);
+        check(msg["content"].is_string(), "before_template_object_to_string", "got " + msg["content"].dump());
+    }
+    // missing content field -> "".
+    {
+        ordered_json msg = {{"role", "user"}};
+        normalize_template_message(msg);
+        check(msg.contains("content") && msg["content"] == "", "before_template_missing_to_empty");
+    }
+    // multimodal: a well-typed user array (text part + image_url part) must keep both parts.
+    {
+        ordered_json parts = ordered_json::array();
+        parts.push_back({{"type", "text"}, {"text", "x"}});
+        ordered_json img; img["type"] = "image_url"; img["image_url"] = {{"url", "data:..."}};
+        parts.push_back(img);
+        ordered_json msg = {{"role", "user"}, {"content", parts}};
+        normalize_template_message(msg);
+        check(msg["content"].is_array() && msg["content"].size() == 2, "before_template_user_typed_array_preserved",
+              "got " + msg["content"].dump());
+    }
+    // a plain string is left untouched.
+    {
+        ordered_json msg = {{"role", "user"}, {"content", "hello"}};
+        normalize_template_message(msg);
+        check(msg["content"] == "hello", "before_template_string_untouched");
+    }
+}
+
+// ---- build_reconstructed_message ----------------------------------------
+
+static void test_reconstruction() {  // each case fills a ReconstructedMessageInput and checks the built message
+    const std::string ingredients = R"(["1/4 cup brown sugar","1 pound ground beef"])";
+
+    // #10524 end-state - user JSON-array text, no media -> string content.
+    {
+        ReconstructedMessageInput in;
+        in.role = "user"; in.content = ingredients;
+        auto m = build_reconstructed_message(in);
+        check(m["content"].is_string() && m["content"] == ingredients, "recon_user_json_array_string",
+              "got " + m["content"].dump());
+    }
+    // multimodal - user text + one image on last user msg -> typed array, image kept.
+    // Element access is guarded by two_parts: indexing a non-array would throw instead of reporting FAIL.
+    {
+        ReconstructedMessageInput in;
+        in.role = "user"; in.content = ingredients; in.is_last_user_msg = true;
+        in.images.push_back("BASE64IMG");
+        auto m = build_reconstructed_message(in);
+        const bool two_parts = m["content"].is_array() && m["content"].size() == 2;
+        check(two_parts, "recon_multimodal_text_plus_image", "got " + m["content"].dump());
+        check(two_parts && m["content"][0]["type"] == "text" && m["content"][0]["text"] == ingredients, "recon_multimodal_text_first");
+        check(two_parts && m["content"][1]["type"] == "image_url", "recon_multimodal_image_kept");
+    }
+    // multimodal media-only - empty text + image on last user msg.
+    {
+        ReconstructedMessageInput in;
+        in.role = "user"; in.content = ""; in.is_last_user_msg = true;
+        in.images.push_back("BASE64IMG");
+        auto m = build_reconstructed_message(in);
+        check(m["content"].is_array() && m["content"].size() == 1 && m["content"][0]["type"] == "image_url",
+              "recon_media_only", "got " + m["content"].dump());
+    }
+    // #7528 - tool array-string content stays a string.
+    {
+        ReconstructedMessageInput in;
+        in.role = "tool"; in.content = R"(["a","b"])"; in.tool_call_id = "call_1";
+        auto m = build_reconstructed_message(in);
+        check(m["content"].is_string() && m["content"] == R"(["a","b"])", "recon_tool_array_string",
+              "got " + m["content"].dump());
+        check(m["tool_call_id"] == "call_1", "recon_tool_call_id_set");
+    }
+    // tool empty content -> "".
+    {
+        ReconstructedMessageInput in;
+        in.role = "tool"; in.content = "";
+        auto m = build_reconstructed_message(in);
+        check(m["content"].is_string() && m["content"] == "", "recon_tool_empty_to_string");
+    }
+    // #7324 - assistant + tool_calls + empty content -> " " (single space, not "").
+    {
+        ReconstructedMessageInput in;
+        in.role = "assistant"; in.content = "";
+        in.tool_calls = R"([{"id":"c1","type":"function","function":{"name":"f","arguments":"{}"}}])";
+        auto m = build_reconstructed_message(in);
+        check(m["content"].is_string() && m["content"] == " ", "recon_toolcalls_empty_content_space",
+              "got " + m["content"].dump());
+        check(m["tool_calls"].is_array() && m["tool_calls"].size() == 1, "recon_toolcalls_parsed");
+    }
+    // assistant + tool_calls + real content keeps the content.
+    {
+        ReconstructedMessageInput in;
+        in.role = "assistant"; in.content = "I'll call f";
+        in.tool_calls = R"([{"id":"c1","type":"function","function":{"name":"f","arguments":"{}"}}])";
+        auto m = build_reconstructed_message(in);
+        check(m["content"] == "I'll call f", "recon_toolcalls_with_content_kept");
+    }
+    // assistant null content -> "".
+    {
+        ReconstructedMessageInput in;
+        in.role = "assistant"; in.content = "null";
+        auto m = build_reconstructed_message(in);
+        check(m["content"] == "", "recon_assistant_null_to_empty");
+    }
+    // malformed tool_calls JSON must not throw; content preserved.
+    {
+        ReconstructedMessageInput in;
+        in.role = "assistant"; in.content = "hi"; in.tool_calls = "{not json";
+        auto m = build_reconstructed_message(in);
+        check(m["content"] == "hi" && !m.contains("tool_calls"), "recon_malformed_toolcalls_safe");
+    }
+    // optional fields: name + reasoning carried through.
+    {
+        ReconstructedMessageInput in;
+        in.role = "tool"; in.content = "result"; in.name = "get_weather"; in.reasoning_content = "thinking";
+        auto m = build_reconstructed_message(in);
+        check(m["name"] == "get_weather" && m["reasoning_content"] == "thinking", "recon_optional_fields");
+    }
+}
+
+int main() {  // runs all test groups; exit status 0 only when no check failed
+    try {
+        test_normalize();
+        test_template_sanitizer();
+        test_reconstruction();
+    } catch (const ordered_json::exception& e) {  // e.g. operator[] on an unexpected JSON type
+        check(false, "uncaught_json_exception", e.what());  // report as a failure instead of aborting
+    }
+    if (failures == 0) std::cout << "OK: all message_content tests passed\n";
+    else std::cerr << failures << " test(s) failed\n";
+    return failures == 0 ? 0 : 1;
+}
diff --git a/backend/cpp/llama-cpp/prepare.sh b/backend/cpp/llama-cpp/prepare.sh
index f9b7e3dd2..4da45ea9d 100644
--- a/backend/cpp/llama-cpp/prepare.sh
+++ b/backend/cpp/llama-cpp/prepare.sh
@@ -18,6 +18,10 @@ done
 
 cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
 cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
+# Shared message-reconstruction helpers (included by grpc-server.cpp) and their
+# unit test (compiled only when -DLLAMA_GRPC_BUILD_TESTS=ON).
+cp -r message_content.h llama.cpp/tools/grpc-server/
+cp -r message_content_test.cpp llama.cpp/tools/grpc-server/
 cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/
 cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/
 
diff --git a/backend/cpp/run-unit-tests.sh b/backend/cpp/run-unit-tests.sh
new file mode 100755
index 000000000..3f63faa40
--- /dev/null
+++ b/backend/cpp/run-unit-tests.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+#
+# Discovers and runs every standalone C++ unit test under backend/cpp/.
+#
+# A "standalone" unit test is a *_test.cpp that depends only on the C++ standard
+# library and nlohmann/json (single header) - i.e. it exercises pure helpers and
+# does not need the full llama.cpp + gRPC backend build. Tests that DO need the
+# backend build use the CMake/ctest path (e.g. -DLLAMA_GRPC_BUILD_TESTS=ON)
+# instead and are skipped here.
+#
+# This keeps CI generic: adding a new pure-C++ unit test file named *_test.cpp in
+# an active backend source dir is picked up automatically, with no CI edits.
+#
+# Env:
+#   NLOHMANN_INCLUDE  include dir that contains nlohmann/json.hpp. If unset, the
+#                     nlohmann/json single header is fetched to a temp dir.
+#   CXX               compiler (default: g++).
+#   JSON_VERSION      nlohmann/json tag to fetch when NLOHMANN_INCLUDE is unset
+#                     (default: v3.11.3).
+#
+# Exit status: 0 if every test compiled and passed (or none were found),
+# 1 otherwise.
+#
+# -e is deliberately not set: a failing test must not stop the remaining tests;
+# failures are accumulated in $fail and reported through the exit status.
+set -uo pipefail
+
+ROOT="$(cd "$(dirname "$0")" && pwd)"
+CXX="${CXX:-g++}"
+JSON_VERSION="${JSON_VERSION:-v3.11.3}"
+
+# Single scratch dir for the fetched header and every test binary, removed on
+# exit so repeated runs do not leave temp dirs behind.
+WORK_DIR="$(mktemp -d)" || { echo "ERROR: mktemp failed" >&2; exit 1; }
+trap 'rm -rf -- "$WORK_DIR"' EXIT
+
+JSON_INC="${NLOHMANN_INCLUDE:-}"
+if [ -z "$JSON_INC" ]; then
+    JSON_INC="$WORK_DIR/include"
+    mkdir -p "$JSON_INC/nlohmann"
+    echo "Fetching nlohmann/json ${JSON_VERSION} single header..."
+    if ! curl -L -sf \
+        "https://raw.githubusercontent.com/nlohmann/json/${JSON_VERSION}/single_include/nlohmann/json.hpp" \
+        -o "$JSON_INC/nlohmann/json.hpp"; then
+        echo "ERROR: failed to fetch nlohmann/json header" >&2
+        exit 1
+    fi
+fi
+
+# Active source dirs only - exclude per-variant build copies, dev snapshots and
+# the vendored upstream llama.cpp tree.
+# The search runs relative to $ROOT so the exclusion globs only see paths below
+# backend/cpp; matched against absolute paths, a checkout living under e.g.
+# ".../LocalAI-dev/" would hit '*-dev/*' and silently skip every test.
+mapfile -t tests < <(cd "$ROOT" && find . -name '*_test.cpp' \
+    -not -path '*/llama.cpp/*' \
+    -not -path '*-build/*' \
+    -not -path '*-dev/*' \
+    -not -path '*fallback*' | sort)
+
+if [ "${#tests[@]}" -eq 0 ]; then
+    echo "No standalone C++ unit tests found under $ROOT"
+    exit 0
+fi
+
+fail=0
+for rel_src in "${tests[@]}"; do
+    test_src="$ROOT/${rel_src#./}"
+    name="$(basename "$test_src" .cpp)"
+    # Per-test output dir: two tests may share a basename in different dirs.
+    bin="$(mktemp -d "$WORK_DIR/bin.XXXXXX")/$name"
+    echo "==> $test_src"
+    if ! "$CXX" -std=c++17 -Wall -Wextra \
+        -I"$JSON_INC" -I"$(dirname "$test_src")" \
+        "$test_src" -o "$bin"; then
+        echo "COMPILE FAILED: $test_src" >&2
+        fail=1
+        continue
+    fi
+    if ! "$bin"; then
+        echo "TEST FAILED: $test_src" >&2
+        fail=1
+    fi
+done
+
+echo "Ran ${#tests[@]} standalone C++ unit test file(s)"
+exit "$fail"
diff --git a/core/schema/message_test.go b/core/schema/message_test.go
index da11d9d20..eb471b57b 100644
--- a/core/schema/message_test.go
+++ b/core/schema/message_test.go
@@ -68,6 +68,32 @@ var _ = Describe("LLM tests", func() {
 			Expect(protoMessages[0].Content).To(Equal("Hello World"))
 		})
 
+		// Regression for mudler/LocalAI#10524: a text part whose inner text is
+		// itself a JSON-array string (mealie sends an ingredient list) must
+		// flatten to that exact string verbatim. ToProto must NOT escape or
+		// restructure it - the C++ backend then treats it as opaque text. This
+		// pins the precise Go-side input that produced the "unsupported
+		// content[].type" gRPC error before the backend stopped re-parsing it.
+		It("flattens a JSON-array-looking text part to the verbatim string (#10524)", func() {
+			ingredients := `["1/4 cup brown sugar, packed","1 pound ground beef"]`
+			messages := Messages{
+				{
+					Role: "user",
+					Content: []any{
+						map[string]any{
+							"type": "text",
+							"text": ingredients,
+						},
+					},
+				},
+			}
+
+			protoMessages := messages.ToProto()
+
+			Expect(protoMessages).To(HaveLen(1))
+			Expect(protoMessages[0].Content).To(Equal(ingredients))
+		})
+
 		It("should convert message with tool_calls", func() {
 			messages := Messages{
 				{

From 14b29ebf4e9d6359e1709c0ba0d6d9c12690ede9 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 27 Jun 2026 02:05:40 +0200
Subject: [PATCH 6/8] fix(backends): derive darwin RUN_BINARY from the exec
 line only (#10541)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

golang-darwin.sh's packaging check derived the launch binary by grepping every
$CURDIR/... reference in run.sh and taking the last one. Backends that pick a
runtime CPU variant assign it via unquoted `LIBRARY=$CURDIR/libgo<x>-avx512.so`
lines, so the heuristic returned `libgo<x>-avx512.so` — a variant Darwin never
builds (arm64 builds only fallback) — and the check then failed with
"package/libgo<x>-avx512.so not found ... refusing to package (#10267)",
breaking the darwin builds for whisper, sam3-cpp, vibevoice-cpp and friends.

Scan only the `exec` line(s) (the actual launch contract) and tolerate a
quoted `exec "$CURDIR"/<binary>`. parakeet-cpp's parakeet-cpp-grpc and the
quoted-only backends (sherpa/piper/opus) resolve correctly; no Linux change.

Assisted-by: Claude:claude-opus-4-8

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 scripts/build/golang-darwin.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/scripts/build/golang-darwin.sh b/scripts/build/golang-darwin.sh
index fddd4bc4f..d2e72eac9 100644
--- a/scripts/build/golang-darwin.sh
+++ b/scripts/build/golang-darwin.sh
@@ -17,9 +17,15 @@ rm -rf "${BACKEND_DIR}"/build-*
 # run.sh's final `exec $CURDIR/<binary>` is the contract for what gets launched;
 # the binary is not always named after the backend (e.g. parakeet-cpp launches
 # parakeet-cpp-grpc), so derive it from run.sh and fall back to ${BACKEND}.
+#
+# Only scan the `exec` line(s): many run.sh select a runtime CPU variant via
+# unquoted `LIBRARY=$CURDIR/libgo<x>-avx512.so` lines, and a whole-file grep
+# would pick the last of those (avx512, which Darwin never builds) instead of
+# the binary — failing the check below for whisper/sam3-cpp/vibevoice-cpp/...
+# Also tolerate the exec being quoted (`exec "$CURDIR"/<binary>`).
 RUN_BINARY=""
 if [ -f "${BACKEND_DIR}/run.sh" ]; then
-        RUN_BINARY=$(grep -oE '\$CURDIR/[A-Za-z0-9._-]+' "${BACKEND_DIR}/run.sh" | grep -v 'ld\.so' | tail -1 | sed 's|\$CURDIR/||')
+        RUN_BINARY=$(grep -E '^[[:space:]]*exec[[:space:]]' "${BACKEND_DIR}/run.sh" | grep -oE '"?\$CURDIR"?/[A-Za-z0-9._-]+' | grep -v 'ld\.so' | tail -1 | sed -E 's|"?\$CURDIR"?/||')
 fi
 RUN_BINARY="${RUN_BINARY:-${BACKEND}}"
 

From 0258f8af555e806f11b30ea0984cdd4afbbc3403 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 27 Jun 2026 09:42:22 +0200
Subject: [PATCH 7/8] fix(backends): repair release CI build/test breaks
 (kokoros, fish-speech, llama-cpp-quantization, sglang) (#10547)

* fix(kokoros): implement new Backend RPCs to fix the build

The backend.proto grew five RPCs (SoundDetection, Depth, TokenClassify,
Score and the bidi-streaming Forward) that the kokoros gRPC service never
implemented, so the trait impl no longer satisfies `Backend` (six trait
items are missing: the five methods plus the `ForwardStream` associated type):

    error[E0046]: not all trait items implemented, missing:
      `sound_detection`, `depth`, `token_classify`, `score`,
      `ForwardStream`, `forward`

kokoros is a TTS backend with no use for these, so add `unimplemented`
stubs (plus the `ForwardStream` associated type) matching the existing
pattern for every other unsupported RPC in this file.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(fish-speech): add setuptools-rust for the editable source install

install.sh installs the fish-speech source tree editable with
`--no-build-isolation`, which means the build backends of its transitive
dependencies must already be present in the venv. One of them builds a
Rust extension and its metadata step fails with:

    ModuleNotFoundError: No module named 'setuptools_rust'

Add setuptools-rust to requirements.txt so installRequirements provisions
it before the editable install runs.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(llama-cpp-quantization): vendor convert_hf_to_gguf.py with conversion/

Upstream llama.cpp split the model-specific logic out of the single
convert_hf_to_gguf.py file into a sibling `conversion/` package, so the
script now starts with `from conversion import ...`. Downloading just the
one file therefore fails at runtime with:

    ModuleNotFoundError: No module named 'conversion'

Clone the repo (reusing the clone already needed to build llama-quantize)
and copy both the script and the `conversion/` package into the backend
dir. Python puts the script's own directory on sys.path[0], so the package
resolves when it sits beside the script.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(sglang): pin the CPU source build to sglang v0.5.11

The CPU profile builds sgl-kernel from a `git clone` of sglang with no
ref, so it always tracks master. Recent master added CPU kernels (e.g.
mamba/fla.cpp) that fail to compile in our builder:

    constexpr variable 'scale' must be initialized by a constant
    static library kineto_LIBRARY-NOTFOUND not found

Pin the clone to v0.5.11, the same release the GPU path already floors on
(requirements-cublas12-after.txt). Overridable via SGLANG_VERSION so the
pin can be bumped deliberately.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/python/fish-speech/requirements.txt   |  4 ++
 .../python/llama-cpp-quantization/install.sh  | 36 ++++++++++++------
 backend/python/sglang/install.sh              |  8 +++-
 backend/rust/kokoros/src/service.rs           | 37 +++++++++++++++++++
 4 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/backend/python/fish-speech/requirements.txt b/backend/python/fish-speech/requirements.txt
index 1be3c8250..528abf737 100644
--- a/backend/python/fish-speech/requirements.txt
+++ b/backend/python/fish-speech/requirements.txt
@@ -7,3 +7,7 @@ setuptools
 six
 scipy
 numpy
+# fish-speech is installed editable with --no-build-isolation, so the build
+# backends of its transitive deps must already be in the venv. One of them
+# builds a Rust extension and needs setuptools-rust present at metadata time.
+setuptools-rust
diff --git a/backend/python/llama-cpp-quantization/install.sh b/backend/python/llama-cpp-quantization/install.sh
index 05ac24f70..a9001ffaa 100755
--- a/backend/python/llama-cpp-quantization/install.sh
+++ b/backend/python/llama-cpp-quantization/install.sh
@@ -11,14 +11,31 @@ fi
 EXTRA_PIP_INSTALL_FLAGS+=" --upgrade "
 installRequirements
 
-# Fetch convert_hf_to_gguf.py from llama.cpp
+# Fetch convert_hf_to_gguf.py from llama.cpp.
+# Upstream split the model-specific logic out of the single file into a
+# sibling `conversion/` package (convert_hf_to_gguf.py now does
+# `from conversion import ...`), so a single-file download no longer runs —
+# it fails with `ModuleNotFoundError: No module named 'conversion'`. We clone
+# the repo and copy both the script and the package; Python puts the script's
+# own directory on sys.path[0], so the package resolves when placed beside it.
 LLAMA_CPP_CONVERT_VERSION="${LLAMA_CPP_CONVERT_VERSION:-master}"
+LLAMA_CPP_SRC="${EDIR}/llama.cpp"
 CONVERT_SCRIPT="${EDIR}/convert_hf_to_gguf.py"
-if [ ! -f "${CONVERT_SCRIPT}" ]; then
-    echo "Downloading convert_hf_to_gguf.py from llama.cpp (${LLAMA_CPP_CONVERT_VERSION})..."
-    curl -L --fail --retry 3 \
-        "https://raw.githubusercontent.com/ggml-org/llama.cpp/${LLAMA_CPP_CONVERT_VERSION}/convert_hf_to_gguf.py" \
-        -o "${CONVERT_SCRIPT}" || echo "Warning: Failed to download convert_hf_to_gguf.py."
+
+cloneLlamaCpp() {
+    if [ ! -d "${LLAMA_CPP_SRC}/.git" ]; then
+        git clone --depth 1 --branch "${LLAMA_CPP_CONVERT_VERSION}" \
+            https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}" 2>/dev/null || \
+        git clone --depth 1 https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}"
+    fi
+}
+
+if [ ! -f "${CONVERT_SCRIPT}" ] || [ ! -d "${EDIR}/conversion" ]; then
+    echo "Fetching convert_hf_to_gguf.py + conversion/ from llama.cpp (${LLAMA_CPP_CONVERT_VERSION})..."
+    cloneLlamaCpp
+    cp "${LLAMA_CPP_SRC}/convert_hf_to_gguf.py" "${CONVERT_SCRIPT}"
+    rm -rf "${EDIR}/conversion"
+    cp -r "${LLAMA_CPP_SRC}/conversion" "${EDIR}/conversion"
 fi
 
 # Install gguf package from the same llama.cpp commit to keep them in sync
@@ -41,12 +58,7 @@ QUANTIZE_BIN="${EDIR}/llama-quantize"
 if [ ! -x "${QUANTIZE_BIN}" ] && ! command -v llama-quantize &>/dev/null; then
     if command -v cmake &>/dev/null; then
         echo "Building llama-quantize from llama.cpp (${LLAMA_CPP_CONVERT_VERSION})..."
-        LLAMA_CPP_SRC="${EDIR}/llama.cpp"
-        if [ ! -d "${LLAMA_CPP_SRC}" ]; then
-            git clone --depth 1 --branch "${LLAMA_CPP_CONVERT_VERSION}" \
-                https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}" 2>/dev/null || \
-            git clone --depth 1 https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}"
-        fi
+        cloneLlamaCpp  # reuses the clone fetched for convert_hf_to_gguf.py
         cmake -B "${LLAMA_CPP_SRC}/build" -S "${LLAMA_CPP_SRC}" -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF
         cmake --build "${LLAMA_CPP_SRC}/build" --target llama-quantize -j"$(nproc 2>/dev/null || echo 2)"
         cp "${LLAMA_CPP_SRC}/build/bin/llama-quantize" "${QUANTIZE_BIN}"
diff --git a/backend/python/sglang/install.sh b/backend/python/sglang/install.sh
index 928f7bd11..68812f8a7 100755
--- a/backend/python/sglang/install.sh
+++ b/backend/python/sglang/install.sh
@@ -85,9 +85,15 @@ if [ "x${BUILD_TYPE}" == "x" ] || [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
     # The resulting binary still requires an AVX-512 capable CPU at runtime,
     # same constraint sglang upstream documents in docker/xeon.Dockerfile.
 
+    # Pin the source build to the same release the GPU path floors on
+    # (0.5.11, see requirements-cublas12-after.txt). An unpinned master clone
+    # pulls in newer CPU kernels (e.g. mamba/fla.cpp) that fail to compile
+    # (constexpr non-constant + kineto_LIBRARY-NOTFOUND). Bump deliberately.
+    SGLANG_VERSION="${SGLANG_VERSION:-v0.5.11}"
     _sgl_src=$(mktemp -d)
     trap 'rm -rf "${_sgl_src}"' EXIT
-    git clone --depth 1 https://github.com/sgl-project/sglang "${_sgl_src}/sglang"
+    git clone --depth 1 --branch "${SGLANG_VERSION}" \
+        https://github.com/sgl-project/sglang "${_sgl_src}/sglang"
 
     # Patch -march=native → -march=sapphirerapids in the CPU kernel CMakeLists
     sed -i 's/-march=native/-march=sapphirerapids/g' \
diff --git a/backend/rust/kokoros/src/service.rs b/backend/rust/kokoros/src/service.rs
index b980feb52..ef361b9dc 100644
--- a/backend/rust/kokoros/src/service.rs
+++ b/backend/rust/kokoros/src/service.rs
@@ -570,6 +570,43 @@ impl Backend for KokorosService {
     ) -> Result<Response<backend::Result>, Status> {
         Err(Status::unimplemented("Not supported"))
     }
+
+    async fn sound_detection(
+        &self,
+        _: Request<backend::SoundDetectionRequest>,
+    ) -> Result<Response<backend::SoundDetectionResponse>, Status> {
+        Err(Status::unimplemented("Not supported")) // stub: request is ignored, always answers UNIMPLEMENTED
+    }
+
+    async fn depth(
+        &self,
+        _: Request<backend::DepthRequest>,
+    ) -> Result<Response<backend::DepthResponse>, Status> {
+        Err(Status::unimplemented("Not supported")) // stub: request is ignored, always answers UNIMPLEMENTED
+    }
+
+    async fn token_classify(
+        &self,
+        _: Request<backend::TokenClassifyRequest>,
+    ) -> Result<Response<backend::TokenClassifyResponse>, Status> {
+        Err(Status::unimplemented("Not supported")) // stub: request is ignored, always answers UNIMPLEMENTED
+    }
+
+    async fn score(
+        &self,
+        _: Request<backend::ScoreRequest>,
+    ) -> Result<Response<backend::ScoreResponse>, Status> {
+        Err(Status::unimplemented("Not supported")) // stub: request is ignored, always answers UNIMPLEMENTED
+    }
+
+    type ForwardStream = ReceiverStream<Result<backend::ForwardReply, Status>>;
+
+    async fn forward(
+        &self,
+        _: Request<tonic::Streaming<backend::ForwardRequest>>,
+    ) -> Result<Response<Self::ForwardStream>, Status> {
+        Err(Status::unimplemented("Not supported")) // stub: the inbound stream is never read, always answers UNIMPLEMENTED
+    }
 }
 
 #[cfg(test)]

From e95018ef7036c3ac9400c3dd2c897b3c151960ee Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 27 Jun 2026 09:42:46 +0200
Subject: [PATCH 8/8] chore(model gallery): :robot: add 1 new models via
 gallery agent (#10544)

chore(model gallery): :robot: add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 gallery/index.yaml | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index ffac8e85f..cc975a83a 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,52 @@
 ---
+- name: "qwen-agentworld-35b-a3b"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/unsloth/Qwen-AgentWorld-35B-A3B-GGUF
+  description: |
+    # Qwen-AgentWorld-35B-A3B
+
+    📑 Technical Report |
+    📖 Blog |
+    🤗 Hugging Face |
+    🤖 ModelScope |
+    💻 GitHub |
+    🖥️ Demo
+
+    > [!Note]
+    > This repository contains the model weights and configuration files for **Qwen-AgentWorld-35B-A3B**, a native language world model trained for agentic environment simulation.
+    >
+    > These artifacts are compatible with Hugging Face Transformers, vLLM, SGLang, etc.
+
+    **Qwen-AgentWorld** is the first language world model to cover seven agent interaction domains within a single model. It simulates agentic environments via long chain-of-thought reasoning, predicting the next environment state given an agent's action and interaction history. Trained through a three-stage pipeline — CPT injects environment knowledge, SFT activates next-state-prediction reasoning, RL sharpens simulation fidelity — Qwen-AgentWorld is a **native world model**: environment modeling is the training objective from the CPT stage onward, not a post-hoc add-on.
+
+    ## Highlights
+
+    ...
+  license: "apache-2.0"
+  tags:
+    - llm
+    - gguf
+    - qwen
+  icon: https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen-AgentWorld/logo.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    options:
+      - use_jinja:true
+    parameters:
+      model: llama-cpp/models/Qwen-AgentWorld-35B-A3B-GGUF/Qwen-AgentWorld-35B-A3B-UD-Q4_K_M.gguf
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/Qwen-AgentWorld-35B-A3B-GGUF/Qwen-AgentWorld-35B-A3B-UD-Q4_K_M.gguf
+      sha256: e7a8eafdd8013443b6bcc4b6fb47b2d2025f772d359650b9ceb7d75971e22cad
+      uri: https://huggingface.co/unsloth/Qwen-AgentWorld-35B-A3B-GGUF/resolve/main/Qwen-AgentWorld-35B-A3B-UD-Q4_K_M.gguf
 - name: "ornith-1.0-9b"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls: