mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-12 18:58:49 -04:00
fix(mlx): route vision-language models to the mlx-vlm backend (#10274)
Vision-language checkpoints such as mlx-community/gemma-4-E4B-it-qat-4bit declare the "image-text-to-text" pipeline tag on HuggingFace. The mlx importer hardcoded backend "mlx" for every mlx-community model, so these VLMs were served by the text-only mlx-lm backend, whose tokenizer does not carry the processor chat template. The template was never applied and the model produced degenerate, looping output that echoed the prompt.

Detect the "image-text-to-text" pipeline tag in the importer and route those models to mlx-vlm, which applies the processor-aware chat template. An explicit backend preference still wins.

As a defensive backstop, the mlx backend now warns loudly when the loaded model has no chat template, so a misrouted VLM surfaces the problem instead of silently looping.

Fixes #10269

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -407,6 +407,24 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
|
||||
messages = messages_to_dicts(request.Messages)
|
||||
|
||||
# The mlx-lm tokenizer only carries a text-LM chat template. A
|
||||
# vision-language checkpoint (e.g. gemma-4 E4B) loaded here has no
|
||||
# usable template, so apply_chat_template silently passes the raw
|
||||
# text through and the model just echoes/loops (issue #10269).
|
||||
# Warn loudly so the misroute is visible; such models belong on the
|
||||
# mlx-vlm backend.
|
||||
chat_template = getattr(self.tokenizer, "chat_template", None)
|
||||
if not chat_template:
|
||||
underlying = getattr(self.tokenizer, "_tokenizer", None)
|
||||
chat_template = getattr(underlying, "chat_template", None)
|
||||
if not chat_template:
|
||||
print(
|
||||
"WARNING: this model has no chat template; output may be "
|
||||
"degenerate. Vision-language models (e.g. gemma-4 E4B) must "
|
||||
"use the 'mlx-vlm' backend instead of 'mlx'.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
kwargs = {"tokenize": False, "add_generation_prompt": True}
|
||||
if request.Tools:
|
||||
try:
|
||||
|
||||
@@ -64,7 +64,17 @@ func (i *MLXImporter) Import(details Details) (gallery.ModelConfig, error) {
|
||||
description = "Imported from " + details.URI
|
||||
}
|
||||
|
||||
// Vision-language checkpoints (e.g. gemma-4 E4B) declare the
|
||||
// "image-text-to-text" pipeline tag on HuggingFace. The text-only mlx-lm
|
||||
// tokenizer does not carry their processor chat template, so routing them
|
||||
// through the plain mlx backend yields degenerate looping output
|
||||
// (issue #10269). Send them to the mlx-vlm backend, which applies the
|
||||
// processor-aware chat template.
|
||||
backend := "mlx"
|
||||
if details.HuggingFace != nil && details.HuggingFace.PipelineTag == "image-text-to-text" {
|
||||
backend = "mlx-vlm"
|
||||
}
|
||||
// An explicit backend preference always wins.
|
||||
b, ok := preferencesMap["backend"].(string)
|
||||
if ok {
|
||||
backend = b
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"encoding/json"
|
||||
|
||||
"github.com/mudler/LocalAI/core/gallery/importers"
|
||||
hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
@@ -122,6 +123,60 @@ var _ = Describe("MLXImporter", func() {
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx-vlm"))
|
||||
})
|
||||
|
||||
It("should auto-route vision-language models to the mlx-vlm backend", func() {
|
||||
// gemma-4 E4B and similar VLMs declare pipeline_tag
|
||||
// "image-text-to-text" on HuggingFace. The text-only mlx-lm
|
||||
// tokenizer does not carry their processor chat template, so
|
||||
// routing them through the plain mlx backend produces degenerate
|
||||
// looping output (issue #10269). They must go to mlx-vlm.
|
||||
details := importers.Details{
|
||||
URI: "https://huggingface.co/mlx-community/gemma-4-E4B-it-qat-4bit",
|
||||
HuggingFace: &hfapi.ModelDetails{
|
||||
ModelID: "mlx-community/gemma-4-E4B-it-qat-4bit",
|
||||
PipelineTag: "image-text-to-text",
|
||||
},
|
||||
}
|
||||
|
||||
modelConfig, err := importer.Import(details)
|
||||
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx-vlm"))
|
||||
})
|
||||
|
||||
It("should keep text-only models on the plain mlx backend", func() {
|
||||
details := importers.Details{
|
||||
URI: "https://huggingface.co/mlx-community/Llama-3.2-1B-Instruct-4bit",
|
||||
HuggingFace: &hfapi.ModelDetails{
|
||||
ModelID: "mlx-community/Llama-3.2-1B-Instruct-4bit",
|
||||
PipelineTag: "text-generation",
|
||||
},
|
||||
}
|
||||
|
||||
modelConfig, err := importer.Import(details)
|
||||
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx"))
|
||||
Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("backend: mlx-vlm"))
|
||||
})
|
||||
|
||||
It("should honor an explicit backend preference even for a VLM", func() {
|
||||
preferences := json.RawMessage(`{"backend": "mlx"}`)
|
||||
details := importers.Details{
|
||||
URI: "https://huggingface.co/mlx-community/gemma-4-E4B-it-qat-4bit",
|
||||
Preferences: preferences,
|
||||
HuggingFace: &hfapi.ModelDetails{
|
||||
ModelID: "mlx-community/gemma-4-E4B-it-qat-4bit",
|
||||
PipelineTag: "image-text-to-text",
|
||||
},
|
||||
}
|
||||
|
||||
modelConfig, err := importer.Import(details)
|
||||
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: mlx"))
|
||||
Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("backend: mlx-vlm"))
|
||||
})
|
||||
|
||||
It("should handle invalid JSON preferences", func() {
|
||||
preferences := json.RawMessage(`invalid json`)
|
||||
details := importers.Details{
|
||||
|
||||
Reference in New Issue
Block a user