From a7a7bd646b61fa2e376e1a62d779db1baf440407 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Fri, 12 Jun 2026 23:12:42 +0200
Subject: [PATCH] fix(mlx): route vision-language models to the mlx-vlm backend
 (#10274)

Vision-language checkpoints such as mlx-community/gemma-4-E4B-it-qat-4bit
declare the "image-text-to-text" pipeline tag on HuggingFace. The mlx
importer hardcoded backend "mlx" for every mlx-community model, so these
VLMs were served by the text-only mlx-lm backend whose tokenizer does not
carry the processor chat template. The template was never applied and the
model produced degenerate, looping output that echoed the prompt.

Detect the "image-text-to-text" pipeline tag in the importer and route those
models to mlx-vlm, which applies the processor-aware chat template. An
explicit backend preference still wins.

As a defensive backstop, the mlx backend now warns on stderr whenever it
templates chat messages for a model whose tokenizer has no chat template,
so a misrouted VLM surfaces the problem instead of silently looping.

Fixes #10269


Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/python/mlx/backend.py      | 18 ++++++++++
 core/gallery/importers/mlx.go      | 10 ++++++
 core/gallery/importers/mlx_test.go | 55 ++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+)

diff --git a/backend/python/mlx/backend.py b/backend/python/mlx/backend.py
index a71da522c..6a4602c01 100644
--- a/backend/python/mlx/backend.py
+++ b/backend/python/mlx/backend.py
@@ -407,6 +407,24 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
             messages = messages_to_dicts(request.Messages)
 
+            # The mlx-lm tokenizer only carries a text-LM chat template. A
+            # vision-language checkpoint (e.g. gemma-4 E4B) loaded here has no
+            # usable template, so apply_chat_template silently passes the raw
+            # text through and the model just echoes/loops (issue #10269).
+            # Warn loudly so the misroute is visible; such models belong on the
+            # mlx-vlm backend.
+            chat_template = getattr(self.tokenizer, "chat_template", None)
+            if not chat_template:
+                underlying = getattr(self.tokenizer, "_tokenizer", None)
+                chat_template = getattr(underlying, "chat_template", None)
+            if not chat_template:
+                print(
+                    "WARNING: this model has no chat template; output may be "
+                    "degenerate. Vision-language models (e.g. gemma-4 E4B) must "
+                    "use the 'mlx-vlm' backend instead of 'mlx'.",
+                    file=sys.stderr,
+                )
+
             kwargs = {"tokenize": False, "add_generation_prompt": True}
             if request.Tools:
                 try:
diff --git a/core/gallery/importers/mlx.go b/core/gallery/importers/mlx.go
index fc841bd45..079ef4ed8 100644
--- a/core/gallery/importers/mlx.go
+++ b/core/gallery/importers/mlx.go
@@ -64,7 +64,17 @@ func (i *MLXImporter) Import(details Details) (gallery.ModelConfig, error) {
 		description = "Imported from " + details.URI
 	}
 
+	// Vision-language checkpoints (e.g. gemma-4 E4B) declare the
+	// "image-text-to-text" pipeline tag on HuggingFace. The text-only mlx-lm
+	// tokenizer does not carry their processor chat template, so routing them
+	// through the plain mlx backend yields degenerate looping output
+	// (issue #10269). Send them to the mlx-vlm backend, which applies the
+	// processor-aware chat template.
 	backend := "mlx"
+	if details.HuggingFace != nil && details.HuggingFace.PipelineTag == "image-text-to-text" {
+		backend = "mlx-vlm"
+	}
+	// An explicit backend preference always wins.
 	b, ok := preferencesMap["backend"].(string)
 	if ok {
 		backend = b
diff --git a/core/gallery/importers/mlx_test.go b/core/gallery/importers/mlx_test.go
index 82e02aff0..dc4c1e6c2 100644
--- a/core/gallery/importers/mlx_test.go
+++ b/core/gallery/importers/mlx_test.go
@@ -4,6 +4,7 @@ import (
 	"encoding/json"
 
 	"github.com/mudler/LocalAI/core/gallery/importers"
+	hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
@@ -122,6 +123,60 @@ var _ = Describe("MLXImporter", func() {
 			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx-vlm"))
 		})
 
+		It("should auto-route vision-language models to the mlx-vlm backend", func() {
+			// gemma-4 E4B and similar VLMs declare pipeline_tag
+			// "image-text-to-text" on HuggingFace. The text-only mlx-lm
+			// tokenizer does not carry their processor chat template, so
+			// routing them through the plain mlx backend produces degenerate
+			// looping output (issue #10269). They must go to mlx-vlm.
+			details := importers.Details{
+				URI: "https://huggingface.co/mlx-community/gemma-4-E4B-it-qat-4bit",
+				HuggingFace: &hfapi.ModelDetails{
+					ModelID:     "mlx-community/gemma-4-E4B-it-qat-4bit",
+					PipelineTag: "image-text-to-text",
+				},
+			}
+
+			modelConfig, err := importer.Import(details)
+
+			Expect(err).ToNot(HaveOccurred())
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx-vlm"))
+		})
+
+		It("should keep text-only models on the plain mlx backend", func() {
+			details := importers.Details{
+				URI: "https://huggingface.co/mlx-community/Llama-3.2-1B-Instruct-4bit",
+				HuggingFace: &hfapi.ModelDetails{
+					ModelID:     "mlx-community/Llama-3.2-1B-Instruct-4bit",
+					PipelineTag: "text-generation",
+				},
+			}
+
+			modelConfig, err := importer.Import(details)
+
+			Expect(err).ToNot(HaveOccurred())
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx"))
+			Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("backend: mlx-vlm"))
+		})
+
+		It("should honor an explicit backend preference even for a VLM", func() {
+			preferences := json.RawMessage(`{"backend": "mlx"}`)
+			details := importers.Details{
+				URI:         "https://huggingface.co/mlx-community/gemma-4-E4B-it-qat-4bit",
+				Preferences: preferences,
+				HuggingFace: &hfapi.ModelDetails{
+					ModelID:     "mlx-community/gemma-4-E4B-it-qat-4bit",
+					PipelineTag: "image-text-to-text",
+				},
+			}
+
+			modelConfig, err := importer.Import(details)
+
+			Expect(err).ToNot(HaveOccurred())
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx"))
+			Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("backend: mlx-vlm"))
+		})
+
 		It("should handle invalid JSON preferences", func() {
 			preferences := json.RawMessage(`invalid json`)
 			details := importers.Details{