From 63bde9ff7380b7952573bb466982ca5cefdcee4b Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 19 Apr 2026 14:04:37 -0700
Subject: [PATCH] llama/compat: add mistral3 vision (clip) support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Translates Ollama's monolithic mistral3 GGUF into the upstream pixtral
mmproj shape so `--mmproj <same-blob>` loads alongside the text model.

KV synthesis: rewrite arch to `clip`, copy `mistral3.vision.*` →
`clip.vision.*`, set `projector_type=pixtral`, `use_silu=true`,
inject pixtral image_mean/std defaults, copy
`mistral3.spatial_merge_size` → `clip.vision.spatial_merge_size`.

Tensor renames: `v.patch_conv`→`v.patch_embd`, `v.encoder_norm`→
`v.pre_ln`, `attn_output`→`attn_out`, `attn_norm`/`ffn_norm`→
`ln1`/`ln2`, `mm.linear_{1,2}`→`mm.{1,2}`, `mm.norm`→`mm.input_norm`,
`mm.patch_merger.merging_layer`→`mm.patch_merger`.

img_break: pixtral's loader strictly requires `v.token_embd.img_break`
(the embedding row for the [IMG_BREAK] separator token). Ollama
doesn't ship it as a separate tensor; the "right" value is row 12 of
token_embd.weight, but token_embd is Q4_K and per-row dequant is
heavyweight. Instead, reclaim the orphan `output_norm.weight` slot
(already [n_embd] F32) and zero-fill it via a load op — pixtral.cpp
adds img_break to row separators, so a zero embedding makes
[IMG_BREAK] insertion a no-op without breaking the rest of the vision
graph.

Adds `mistral3` to the Go-side `compatClipArches` allowlist so
ollama auto-passes `--mmproj <same-blob>` for vision requests.

Tested with ministral-3 (8B, just released): text + vision both work;
solid-colored test images are correctly identified (red/green/blue).
---
 llama/compat/README.md               |  1 +
 llama/compat/llama-ollama-compat.cpp | 90 ++++++++++++++++++++++++++++
 llm/llama_server.go                  |  1 +
 3 files changed, 92 insertions(+)
diff --git a/llama/compat/README.md b/llama/compat/README.md
index be84a0600..a63e1ed92 100644
--- a/llama/compat/README.md
+++ b/llama/compat/README.md
@@ -43,6 +43,7 @@ an immediate no-op.
 | `qwen35moe` | head_count_kv array → scalar, rope dimension_sections pad 3→4, `ssm_dt`→`ssm_dt.bias` rename, drop `v.*`/`mm.*`/`mtp.*` tensors | Arch rewrite to `clip`, KV synthesis (`clip.vision.*`, `clip.projector_type=qwen3vl_merger`), per-block QKV merge (concat at load time), patch_embed reshape + F16→F32 + slice-as-temporal-pair (reclaiming an orphan `v.blk.0.attn_k` slot for the second pair) |
 | `gptoss` | Arch rename `gptoss`→`gpt-oss` (incl. KV prefix), inject `gpt-oss.expert_feed_forward_length` from `ffn_gate_exps` shape, tensor renames (`attn_out`→`attn_output`, `attn_sinks`→`attn_sinks.weight`, `ffn_norm`→`post_attention_norm`) | n/a |
 | `lfm2` | Tensor rename `output_norm.weight`→`token_embd_norm.weight`, fix stale `lfm2.feed_forward_length` from `ffn_gate` shape | n/a |
+| `mistral3` | RoPE YaRN renames (`rope.scaling.beta_*`→`rope.scaling.yarn_beta_*`), `rope.scaling_beta`→`attention.temperature_scale`, drop `v.*`/`mm.*` tensors | Arch rewrite to `clip`, KV synthesis (`clip.vision.*`, `clip.projector_type=pixtral`), tensor renames (`v.patch_conv`→`v.patch_embd`, `v.encoder_norm`→`v.pre_ln`, `attn_output`→`attn_out`, `attn_norm`/`ffn_norm`→`ln1`/`ln2`, `mm.linear_{1,2}`→`mm.{1,2}`, `mm.norm`→`mm.input_norm`, `mm.patch_merger.merging_layer`→`mm.patch_merger`), zero-fill `v.token_embd.img_break` (reclaims `output_norm.weight` slot — Ollama's monolithic blob doesn't ship this tensor and per-row dequant of token_embd Q4_K is heavyweight; zero-fill makes [IMG_BREAK] insertion a no-op) |
 
 Usage:
 
diff --git a/llama/compat/llama-ollama-compat.cpp b/llama/compat/llama-ollama-compat.cpp
index 681e4e741..df592c129 100644
--- a/llama/compat/llama-ollama-compat.cpp
+++ b/llama/compat/llama-ollama-compat.cpp
@@ -510,6 +510,92 @@ void handle_qwen35moe_clip(gguf_context * meta, ggml_context * ctx) {
     promote_tensor_to_f32(meta, ctx, "v.position_embd.weight");
 }
 
+// =========================================================================
+// mistral3 (clip side — pixtral projector)
+// =========================================================================
+//
+// Tensor renames Ollama → upstream pixtral:
+//   v.patch_conv                       -> v.patch_embd
+//   v.encoder_norm                     -> v.pre_ln
+//   v.blk.X.attn_output                -> v.blk.X.attn_out
+//   v.blk.X.attn_norm                  -> v.blk.X.ln1
+//   v.blk.X.ffn_norm                   -> v.blk.X.ln2
+//   mm.linear_1                        -> mm.1
+//   mm.linear_2                        -> mm.2
+//   mm.norm                            -> mm.input_norm
+//   mm.patch_merger.merging_layer      -> mm.patch_merger
+//
+// img_break: pixtral's loader requires `v.token_embd.img_break` (the
+// embedding row for the [IMG_BREAK] token, used as a row separator).
+// Ollama's monolithic blob doesn't ship it as a separate tensor; the
+// "ideal" value is row 12 of token_embd.weight, but token_embd is
+// quantized (Q4_K) and per-row dequant is heavyweight. Reclaim the
+// orphan output_norm.weight slot (already [n_embd] F32) and zero-fill
+// it — pixtral.cpp adds img_break to row separator embeddings, so a
+// zero embedding makes [IMG_BREAK] insertion a no-op without breaking
+// the rest of the vision graph.
+constexpr std::pair<const char *, const char *> kMistral3ClipRenames[] = {  // {from, to}, applied in this order
+    {"v.patch_conv",                  "v.patch_embd"},
+    {"v.encoder_norm",                "v.pre_ln"},
+    {".attn_output",                  ".attn_out"},    // per-block (v.blk.X.*); stored without the block prefix
+    {".attn_norm",                    ".ln1"},         // per-block, as above
+    {".ffn_norm",                     ".ln2"},         // per-block, as above
+    {"mm.linear_1",                   "mm.1"},
+    {"mm.linear_2",                   "mm.2"},
+    {"mm.patch_merger.merging_layer", "mm.patch_merger"},
+    {"mm.norm",                       "mm.input_norm"},
+};
+
+void handle_mistral3_clip(gguf_context * meta, ggml_context * ctx) {
+    LLAMA_LOG_INFO("%s: detected Ollama-format mistral3 GGUF used as mmproj; translating\n", __func__);
+    // Copy each mistral3.* hyperparameter to its clip.* destination key.
+    copy_u32_kv(meta, "mistral3.vision.block_count",            "clip.vision.block_count");
+    copy_u32_kv(meta, "mistral3.vision.embedding_length",       "clip.vision.embedding_length");
+    copy_u32_kv(meta, "mistral3.vision.feed_forward_length",    "clip.vision.feed_forward_length");
+    copy_u32_kv(meta, "mistral3.vision.attention.head_count",   "clip.vision.attention.head_count");
+    copy_u32_kv(meta, "mistral3.vision.image_size",             "clip.vision.image_size");
+    copy_u32_kv(meta, "mistral3.vision.patch_size",             "clip.vision.patch_size");
+    copy_u32_kv(meta, "mistral3.vision.num_channels",           "clip.vision.num_channels");
+    copy_u32_kv(meta, "mistral3.spatial_merge_size",            "clip.vision.spatial_merge_size");
+    copy_f32_kv(meta, "mistral3.vision.rope.freq_base",         "clip.rope.freq_base");
+    // projection_dim is required by the loader but pixtral derives the
+    // actual output dim from mm_2_w shape — any non-zero value works.
+    // Mirror the LM embedding length for diagnostics-friendliness.
+    copy_u32_kv(meta, "mistral3.embedding_length",              "clip.vision.projection_dim");
+
+    inject_f32_if_missing(meta, "clip.vision.attention.layer_norm_epsilon", 1e-5f);
+
+    // Pixtral image normalization stats (CLIP-style mean and std, 3 values each).
+    static const float kPixtralMean[3] = {0.48145467f, 0.45782750f, 0.40821072f};
+    static const float kPixtralStd [3] = {0.26862955f, 0.26130259f, 0.27577710f};
+    inject_f32_arr_if_missing(meta, "clip.vision.image_mean", kPixtralMean, 3);
+    inject_f32_arr_if_missing(meta, "clip.vision.image_std",  kPixtralStd,  3);
+
+    inject_bool_if_missing(meta, "clip.has_vision_encoder", true);
+    inject_bool_if_missing(meta, "clip.use_silu",           true);
+    gguf_set_val_str(meta, "clip.projector_type",  "pixtral");
+    gguf_set_val_str(meta, "general.architecture", "clip");
+
+    // Reclaim output_norm.weight as v.token_embd.img_break, zero-filled by a load op.
+    const int64_t lm_embd_kid = gguf_find_key(meta, "mistral3.embedding_length");
+    const uint32_t lm_embd = lm_embd_kid >= 0 ? gguf_get_val_u32(meta, lm_embd_kid) : 0;  // 0 if key absent: reclaim is skipped
+    if (lm_embd > 0 && reclaim_slot_as(meta, ctx,
+                                       "output_norm.weight", "v.token_embd.img_break",
+                                       {(int64_t) lm_embd}, GGML_TYPE_F32)) {
+        register_load_op("v.token_embd.img_break", LoadOp{  // registered only when the reclaim succeeded
+            [](const char *, void * dst, size_t dst_size) {
+                std::memset(dst, 0, dst_size);
+                return true;
+            },
+            "img_break zero-fill",
+        });
+    }
+
+    for (const auto & [from, to] : kMistral3ClipRenames) {  // one rename pass per table entry, in table order
+        rename_tensors_containing(meta, ctx, from, to);
+    }
+}
+
 } // anonymous namespace
 
 // =========================================================================
@@ -542,6 +628,10 @@ void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
         handle_qwen35moe_clip(meta, ctx);
         return;
     }
+    if (detect_ollama_mistral3(meta, ctx)) {
+        handle_mistral3_clip(meta, ctx);
+        return;
+    }
 }
 
 bool should_skip_tensor(const llama_model_loader * ml, const char * tensor_name) {
diff --git a/llm/llama_server.go b/llm/llama_server.go
index 8ed5a4613..aefd51ed6 100644
--- a/llm/llama_server.go
+++ b/llm/llama_server.go
@@ -435,6 +435,7 @@ func NewLlamaServerRunner(
 	compatClipArches := map[string]bool{
 		"gemma3":    true,
 		"qwen35moe": true,
+		"mistral3":  true,
 		// Add entries as llama/compat grows clip handlers.
 	}
 	if len(projectors) == 0 &&