From a7a7bd646b61fa2e376e1a62d779db1baf440407 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Fri, 12 Jun 2026 23:12:42 +0200 Subject: [PATCH] fix(mlx): route vision-language models to the mlx-vlm backend (#10274) Vision-language checkpoints such as mlx-community/gemma-4-E4B-it-qat-4bit declare the "image-text-to-text" pipeline tag on HuggingFace. The mlx importer hardcoded backend "mlx" for every mlx-community model, so these VLMs were served by the text-only mlx-lm backend whose tokenizer does not carry the processor chat template. The template was never applied and the model produced degenerate, looping output that echoed the prompt. Detect the "image-text-to-text" pipeline tag in the importer and route those models to mlx-vlm, which applies the processor-aware chat template. An explicit backend preference still wins. As a defensive backstop, the mlx backend now warns loudly when the loaded model has no chat template, so a misrouted VLM surfaces the problem instead of silently looping. Fixes #10269 Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- backend/python/mlx/backend.py | 18 ++++++++++ core/gallery/importers/mlx.go | 10 ++++++ core/gallery/importers/mlx_test.go | 55 ++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+) diff --git a/backend/python/mlx/backend.py b/backend/python/mlx/backend.py index a71da522c..6a4602c01 100644 --- a/backend/python/mlx/backend.py +++ b/backend/python/mlx/backend.py @@ -407,6 +407,24 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if not request.Prompt and request.UseTokenizerTemplate and request.Messages: messages = messages_to_dicts(request.Messages) + # The mlx-lm tokenizer only carries a text-LM chat template. A + # vision-language checkpoint (e.g. gemma-4 E4B) loaded here has no + # usable template, so apply_chat_template silently passes the raw + # text through and the model just echoes/loops (issue #10269). + # Warn loudly so the misroute is visible; such models belong on the + # mlx-vlm backend. + chat_template = getattr(self.tokenizer, "chat_template", None) + if not chat_template: + underlying = getattr(self.tokenizer, "_tokenizer", None) + chat_template = getattr(underlying, "chat_template", None) + if not chat_template: + print( + "WARNING: this model has no chat template; output may be " + "degenerate. Vision-language models (e.g. gemma-4 E4B) must " + "use the 'mlx-vlm' backend instead of 'mlx'.", + file=sys.stderr, + ) + kwargs = {"tokenize": False, "add_generation_prompt": True} if request.Tools: try: diff --git a/core/gallery/importers/mlx.go b/core/gallery/importers/mlx.go index fc841bd45..079ef4ed8 100644 --- a/core/gallery/importers/mlx.go +++ b/core/gallery/importers/mlx.go @@ -64,7 +64,17 @@ func (i *MLXImporter) Import(details Details) (gallery.ModelConfig, error) { description = "Imported from " + details.URI } + // Vision-language checkpoints (e.g. gemma-4 E4B) declare the + // "image-text-to-text" pipeline tag on HuggingFace. The text-only mlx-lm + // tokenizer does not carry their processor chat template, so routing them + // through the plain mlx backend yields degenerate looping output + // (issue #10269). Send them to the mlx-vlm backend, which applies the + // processor-aware chat template. backend := "mlx" + if details.HuggingFace != nil && details.HuggingFace.PipelineTag == "image-text-to-text" { + backend = "mlx-vlm" + } + // An explicit backend preference always wins. b, ok := preferencesMap["backend"].(string) if ok { backend = b diff --git a/core/gallery/importers/mlx_test.go b/core/gallery/importers/mlx_test.go index 82e02aff0..dc4c1e6c2 100644 --- a/core/gallery/importers/mlx_test.go +++ b/core/gallery/importers/mlx_test.go @@ -4,6 +4,7 @@ import ( "encoding/json" "github.com/mudler/LocalAI/core/gallery/importers" + hfapi "github.com/mudler/LocalAI/pkg/huggingface-api" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) @@ -122,6 +123,60 @@ var _ = Describe("MLXImporter", func() { Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx-vlm")) }) + It("should auto-route vision-language models to the mlx-vlm backend", func() { + // gemma-4 E4B and similar VLMs declare pipeline_tag + // "image-text-to-text" on HuggingFace. The text-only mlx-lm + // tokenizer does not carry their processor chat template, so + // routing them through the plain mlx backend produces degenerate + // looping output (issue #10269). They must go to mlx-vlm. + details := importers.Details{ + URI: "https://huggingface.co/mlx-community/gemma-4-E4B-it-qat-4bit", + HuggingFace: &hfapi.ModelDetails{ + ModelID: "mlx-community/gemma-4-E4B-it-qat-4bit", + PipelineTag: "image-text-to-text", + }, + } + + modelConfig, err := importer.Import(details) + + Expect(err).ToNot(HaveOccurred()) + Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx-vlm")) + }) + + It("should keep text-only models on the plain mlx backend", func() { + details := importers.Details{ + URI: "https://huggingface.co/mlx-community/Llama-3.2-1B-Instruct-4bit", + HuggingFace: &hfapi.ModelDetails{ + ModelID: "mlx-community/Llama-3.2-1B-Instruct-4bit", + PipelineTag: "text-generation", + }, + } + + modelConfig, err := importer.Import(details) + + Expect(err).ToNot(HaveOccurred()) + Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx")) + Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("backend: mlx-vlm")) + }) + + It("should honor an explicit backend preference even for a VLM", func() { + preferences := json.RawMessage(`{"backend": "mlx"}`) + details := importers.Details{ + URI: "https://huggingface.co/mlx-community/gemma-4-E4B-it-qat-4bit", + Preferences: preferences, + HuggingFace: &hfapi.ModelDetails{ + ModelID: "mlx-community/gemma-4-E4B-it-qat-4bit", + PipelineTag: "image-text-to-text", + }, + } + + modelConfig, err := importer.Import(details) + + Expect(err).ToNot(HaveOccurred()) + Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx")) + Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("backend: mlx-vlm")) + }) + It("should handle invalid JSON preferences", func() { preferences := json.RawMessage(`invalid json`) details := importers.Details{