diff --git a/llama/compat/README.md b/llama/compat/README.md index 2c6fab9c9..193002bac 100644 --- a/llama/compat/README.md +++ b/llama/compat/README.md @@ -47,6 +47,7 @@ an immediate no-op. | `qwen35` | Same fixes as `qwen35moe` (head_count_kv array→scalar, rope dimension_sections pad 3→4, `ssm_dt`→`ssm_dt.bias`, drop `v.*`/`mm.*`/`mtp.*`) but for the non-MoE qwen3.5 (e.g. 9B). Both arches share `apply_qwen35_text_fixes`. | n/a | | `gemma4` | Drop `a.*`/`v.*`/`mm.*` (audio + vision + projector) from the text loader. Covers both E2B/E4B (dense) and 26B-A4B (MoE). | n/a | | `deepseekocr` | Arch rename `deepseekocr`→`deepseek2-ocr` (incl. KV prefix), inject `expert_feed_forward_length` from `ffn_down_exps` shape, `expert_shared_count` from `ffn_down_shexp` shape, default `attention.layer_norm_rms_epsilon`, drop `s.*`/`v.*`/`mm.*` | Arch rewrite to `clip`, KV synthesis (`clip.vision.*`, `clip.vision.sam.*`, `clip.projector_type=deepseekocr`, defaults for `feed_forward_length`/`projection_dim`/`window_size`/image stats), prefix-only rename `s.*`→`v.sam.*` (substring rename would corrupt `mm.layers`), CLIP leaf renames (`self_attn.{out,qkv}_proj`→`attn_{out,qkv}`, `layer_norm{1,2}`→`ln{1,2}`, `mlp.fc{1,2}`→`ffn_{up,down}`, `pre_layrnorm`→`pre_ln`), SAM leaf renames (`attn.proj`→`attn.out`, `attn.rel_pos_{h,w}`→`attn.pos_{h,w}.weight`, `norm{1,2}`→`{pre,post}_ln`), projector renames (`mm.layers`→`mm.model.fc`, `mm.image_newline`/`view_seperator`→`v.*`), F32 promote of `v.patch_embd.weight`, `v.sam.patch_embd.weight`, `v.position_embd.weight` | +| `nemotron_h_moe` | For latent-FFN variants (e.g. nemotron-3-super 120B-A12B): inject `moe_latent_size` from `ffn_latent_in.weight` ne[1], rename `ffn_latent_{in,out}`→`ffn_latent_{down,up}`. For all variants: drop `mtp.*` (Multi-Token Prediction tensors that Ollama emits as one-tensor-per-expert; ~1040 extras on the 120B). Standard variants (e.g. nemotron-cascade-2 30B-A3B) load with no rename, only the MTP skip. | n/a | Usage: diff --git a/llama/compat/llama-ollama-compat.cpp b/llama/compat/llama-ollama-compat.cpp index 37869ea9d..ca6d045da 100644 --- a/llama/compat/llama-ollama-compat.cpp +++ b/llama/compat/llama-ollama-compat.cpp @@ -299,6 +299,57 @@ void handle_deepseekocr(const llama_model_loader * ml, gguf_context * meta, add_skip_prefix(ml, "mm."); } +// ========================================================================= +// nemotron_h_moe (text only) +// ========================================================================= +// +// Same arch name on both sides. Most variants (e.g. nemotron-cascade-2) +// load as-is. The latent-FFN variants (e.g. nemotron-3-super 120B-A12B) +// rename `ffn_latent_in` / `ffn_latent_out` to `ffn_latent_down` / +// `ffn_latent_up`, and need `moe_latent_size` injected (derived from +// the latent tensor shape). + +bool detect_ollama_nemotron_h_moe(const gguf_context * meta, const ggml_context * ctx) { + const int64_t arch_kid = gguf_find_key(meta, "general.architecture"); + if (arch_kid < 0) return false; + if (std::strcmp(gguf_get_val_str(meta, arch_kid), "nemotron_h_moe") != 0) return false; + return any_tensor_with_prefix(ctx, "blk.1.ffn_latent_in") + || any_tensor_with_prefix(ctx, "blk.0.ffn_latent_in") + || any_tensor_with_prefix(ctx, "mtp."); +} + +void handle_nemotron_h_moe(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) { + if (!detect_ollama_nemotron_h_moe(meta, ctx)) return; + + LLAMA_LOG_INFO("%s: detected Ollama-format nemotron_h_moe GGUF; applying compatibility fixes\n", __func__); + + // Inject moe_latent_size for latent-FFN variants (e.g. super 120B-A12B). + // Standard variants (e.g. cascade-2 30B-A3B) have no latent tensors and + // use n_embd as the MoE inner dim — leave the key absent. + if (!has_key(meta, "nemotron_h_moe.moe_latent_size")) { + for (uint32_t b = 0; b < 1024; ++b) { + char name[64]; + std::snprintf(name, sizeof(name), "blk.%u.ffn_latent_in.weight", b); + if (ggml_tensor * t = ggml_get_tensor(ctx, name)) { + gguf_set_val_u32(meta, "nemotron_h_moe.moe_latent_size", + (uint32_t) t->ne[1]); + break; + } + } + } + + // Rename the latent projection tensors to upstream's naming (no-op when + // the file has no latent tensors). + rename_tensors_containing(meta, ctx, ".ffn_latent_in", ".ffn_latent_down"); + rename_tensors_containing(meta, ctx, ".ffn_latent_out", ".ffn_latent_up"); + + // Drop MTP (Multi-Token Prediction) tensors — Ollama's converter emits + // them as one-tensor-per-expert (`mtp.layers.X.mixer.experts.Y.{up,down}_proj`) + // which upstream's nemotron_h_moe loader doesn't claim. Total: ~1040 extra + // tensors on super 120B. + add_skip_prefix(ml, "mtp."); +} + // ========================================================================= // gpt-oss (text only) // ========================================================================= @@ -925,10 +976,11 @@ void translate_metadata(const llama_model_loader * ml, if (arch_name == "gemma4") handle_gemma4 (ml, meta, ctx); if (arch_name == "qwen35moe") handle_qwen35moe(ml, meta, ctx); if (arch_name == "qwen35") handle_qwen35 (ml, meta, ctx); - if (arch_name == "gptoss") handle_gptoss (ml, meta, ctx, arch_name); - if (arch_name == "lfm2") handle_lfm2 (ml, meta, ctx); - if (arch_name == "mistral3") handle_mistral3 (ml, meta, ctx); - if (arch_name == "deepseekocr") handle_deepseekocr(ml, meta, ctx, arch_name); + if (arch_name == "gptoss") handle_gptoss (ml, meta, ctx, arch_name); + if (arch_name == "lfm2") handle_lfm2 (ml, meta, ctx); + if (arch_name == "mistral3") handle_mistral3 (ml, meta, ctx); + if (arch_name == "deepseekocr") handle_deepseekocr (ml, meta, ctx, arch_name); + if (arch_name == "nemotron_h_moe") handle_nemotron_h_moe(ml, meta, ctx); // Dispatch. Add more arches as they are wired up. }