diff --git a/llama/compat/llama-ollama-compat.cpp b/llama/compat/llama-ollama-compat.cpp index bda0c9b64..7d0df8a35 100644 --- a/llama/compat/llama-ollama-compat.cpp +++ b/llama/compat/llama-ollama-compat.cpp @@ -120,25 +120,119 @@ void add_skip_prefix(const llama_model_loader * ml, std::string prefix) { } // ------------------------------------------------------------------------- -// F16 -> F32 tensor promotion (needed for Metal IM2COL on gemma3 conv weights) +// Load-time tensor transforms (registry consumed by maybe_load_tensor) +// +// Each registered op produces the final bytes for a single destination +// tensor by reading + transforming bytes from the source GGUF file. +// Used for F16->F32 promotion, QKV merging, and patch-embed splitting. // ------------------------------------------------------------------------- -std::mutex g_promote_mutex; -std::unordered_set g_promote_f16_to_f32; +struct LoadOp { + // apply() reads what it needs from `src_file` and fills `dst` (dst_size + // bytes). Returns false on failure. + std::function apply; + const char * description; +}; -// Set a tensor's type + strides in a ggml_context. The companion to this is -// the `maybe_load_tensor` read hook, which converts F16 bytes from disk into -// the newly-wider F32 buffer at load time. -void promote_tensor_to_f32(ggml_context * ctx, const char * name) { - ggml_tensor * t = ggml_get_tensor(ctx, name); - if (!t) return; - t->type = GGML_TYPE_F32; - t->nb[0] = ggml_type_size(GGML_TYPE_F32); - t->nb[1] = t->nb[0] * (t->ne[0] / ggml_blck_size(GGML_TYPE_F32)); +std::mutex g_loadop_mutex; +std::unordered_map g_loadops; + +void register_load_op(std::string dest_name, LoadOp op) { + std::lock_guard lk(g_loadop_mutex); + g_loadops[std::move(dest_name)] = std::move(op); +} + +// Helper: read `size` bytes at `offset` from `path` into `dst`. +bool read_at(const char * path, size_t offset, void * dst, size_t size) { + FILE * f = std::fopen(path, "rb"); + if (!f) return false; + bool ok = (std::fseek(f, (long) offset, SEEK_SET) == 0 + && std::fread(dst, 1, size, f) == size); + std::fclose(f); + return ok; +} + +// Capture a tensor's absolute file offset BEFORE any rename or reshape. +size_t tensor_file_offset(const gguf_context * meta, const char * name) { + const int64_t id = gguf_find_tensor(meta, name); + if (id < 0) return 0; + return gguf_get_data_offset(meta) + gguf_get_tensor_offset(meta, id); +} + +// Set a tensor's type and recompute strides in a ggml_context. +void set_tensor_type(ggml_tensor * t, ggml_type type) { + t->type = type; + t->nb[0] = ggml_type_size(type); + t->nb[1] = t->nb[0] * (t->ne[0] / ggml_blck_size(type)); for (int i = 2; i < GGML_MAX_DIMS; ++i) t->nb[i] = t->nb[i - 1] * t->ne[i - 1]; +} - std::lock_guard lk(g_promote_mutex); - g_promote_f16_to_f32.insert(name); +// Set a tensor's shape and recompute strides in a ggml_context. +void set_tensor_shape(ggml_tensor * t, std::initializer_list shape) { + int i = 0; + for (auto v : shape) t->ne[i++] = v; + for (; i < GGML_MAX_DIMS; ++i) t->ne[i] = 1; + set_tensor_type(t, t->type); +} + +// Promote a tensor F16 -> F32. The disk bytes stay F16; we register a +// load op that converts on read. +void promote_tensor_to_f32(gguf_context * meta, ggml_context * ctx, const char * name) { + const int64_t tid = gguf_find_tensor(meta, name); + if (tid < 0) return; + ggml_tensor * t = ggml_get_tensor(ctx, name); + if (!t || t->type != GGML_TYPE_F16) return; + + const size_t src_offset = tensor_file_offset(meta, name); + const size_t n_elem = ggml_nelements(t); + const size_t src_size = n_elem * sizeof(uint16_t); + + set_tensor_type(t, GGML_TYPE_F32); + + register_load_op(name, LoadOp{ + [src_offset, src_size, n_elem](const char * path, void * dst, size_t dst_size) { + (void) dst_size; + std::vector src(src_size); + if (!read_at(path, src_offset, src.data(), src_size)) return false; + const uint16_t * sp = reinterpret_cast(src.data()); + float * dp = reinterpret_cast(dst); + for (size_t i = 0; i < n_elem; ++i) dp[i] = ggml_fp16_to_fp32(sp[i]); + return true; + }, + "F16->F32 promote", + }); +} + +// Concatenate N source tensors into one destination tensor. Captures +// source file offsets and sizes at registration time so later renames or +// reshapes don't affect the read. Layout assumption: the source tensors +// concatenate cleanly along their slowest dim, which in C/ggml order +// means the destination's bytes are just src[0] || src[1] || ... . +void register_concat_load(const gguf_context * meta, std::string dest_name, + const std::vector & src_names) { + std::vector> regions; // (offset, size) + regions.reserve(src_names.size()); + for (const auto & n : src_names) { + const int64_t id = gguf_find_tensor(meta, n.c_str()); + if (id < 0) return; // bail; downstream will fail loudly + regions.emplace_back( + gguf_get_data_offset(meta) + gguf_get_tensor_offset(meta, id), + gguf_get_tensor_size(meta, id)); + } + register_load_op(std::move(dest_name), LoadOp{ + [regions](const char * path, void * dst, size_t dst_size) { + size_t total = 0; + for (auto & [_, sz] : regions) total += sz; + if (total != dst_size) return false; + uint8_t * p = static_cast(dst); + for (auto & [off, sz] : regions) { + if (!read_at(path, off, p, sz)) return false; + p += sz; + } + return true; + }, + "concat sources", + }); } // ------------------------------------------------------------------------- @@ -228,21 +322,22 @@ void handle_gemma3(const llama_model_loader * ml, gguf_context * meta, ggml_cont // ------------------------------------------------------------------------- bool detect_ollama_qwen35moe(const gguf_context * meta, const ggml_context * ctx) { - // Strongest markers: vision KVs live in-file (upstream splits to mmproj) - // or MTP tensors are present (upstream strips them). - if (has_key(meta, "qwen35moe.vision.block_count")) return true; - if (has_key(meta, "qwen35moe.image_token_id")) return true; - if (has_key(meta, "qwen35moe.ssm.v_head_reordered")) return true; - if (has_key(meta, "qwen35moe.feed_forward_length")) return true; // upstream omits (=0 stored) - if (has_key(meta, "qwen35moe.rope.mrope_interleaved")) return true; - if (any_tensor_with_prefix(ctx, "mtp.")) return true; - if (any_tensor_with_prefix(ctx, "v.")) return true; + // Require the file to declare itself qwen35moe first. + const int64_t arch_kid = gguf_find_key(meta, "general.architecture"); + if (arch_kid < 0) return false; + if (std::strcmp(gguf_get_val_str(meta, arch_kid), "qwen35moe") != 0) return false; - // Scalar-vs-array: upstream writes head_count_kv as UINT32; Ollama wrote - // it as a per-layer array. has_key alone can't tell us that, but a mismatch - // shows up as a type-mismatch crash downstream, which is worse than over- - // detecting. If any of the above markers fire we'll normalize it below. - return false; + // Then: at least one Ollama-ism. Upstream qwen35moe text files have none + // of these — the vision KVs move to mmproj, MTP tensors are dropped, + // head_count_kv is a scalar not an array, and the various extra rope / + // ssm KVs below are either absent or stored differently. + return has_key(meta, "qwen35moe.vision.block_count") + || has_key(meta, "qwen35moe.image_token_id") + || has_key(meta, "qwen35moe.ssm.v_head_reordered") + || has_key(meta, "qwen35moe.feed_forward_length") + || has_key(meta, "qwen35moe.rope.mrope_interleaved") + || any_tensor_with_prefix(ctx, "mtp.") + || any_tensor_with_prefix(ctx, "v."); } void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) { @@ -357,8 +452,204 @@ void handle_gemma3_clip(gguf_context * meta, ggml_context * ctx) { // Upstream stores patch_embd/position_embd as F32 (Gemma3VisionModel // tensor_force_quant); Ollama stored F16. Metal's IM2COL convolution // requires F32, so promote both at load time. - promote_tensor_to_f32(ctx, "v.patch_embd.weight"); - promote_tensor_to_f32(ctx, "v.position_embd.weight"); + promote_tensor_to_f32(meta, ctx, "v.patch_embd.weight"); + promote_tensor_to_f32(meta, ctx, "v.position_embd.weight"); +} + +// ------------------------------------------------------------------------- +// qwen35moe (clip side) +// ------------------------------------------------------------------------- + +// Substring renames. One entry handles both `.weight` and `.bias` variants. +constexpr std::pair kQwen35moeClipRenames[] = { + {"v.pos_embed", "v.position_embd"}, + {"v.patch_embed", "v.patch_embd"}, + {"v.merger.norm", "v.post_ln"}, + {"v.merger.linear_fc1", "mm.0"}, + {"v.merger.linear_fc2", "mm.2"}, + {".mlp.linear_fc1", ".ffn_up"}, + {".mlp.linear_fc2", ".ffn_down"}, + {".norm1", ".ln1"}, + {".norm2", ".ln2"}, +}; + +// Register a QKV merge for a single block: Ollama has separate attn_q, +// attn_k, attn_v tensors; upstream wants them concatenated along their +// slow axis. Capture source file offsets BEFORE renaming. +void register_qwen35moe_qkv_merge(gguf_context * meta, ggml_context * ctx, int block_idx) { + char qname[64], kname[64], vname[64]; + std::snprintf(qname, sizeof(qname), "v.blk.%d.attn_q.weight", block_idx); + std::snprintf(kname, sizeof(kname), "v.blk.%d.attn_k.weight", block_idx); + std::snprintf(vname, sizeof(vname), "v.blk.%d.attn_v.weight", block_idx); + + const ggml_tensor * q = ggml_get_tensor(ctx, qname); + if (!q) return; // not a qwen35moe vision block + + // Set up the destination tensor. We rename attn_q -> attn_qkv and + // widen its slow axis from [1152, 1152] to [1152, 3456] (3 * hidden). + char qkv_w[64], qkv_b[64], qbias[64], kbias[64], vbias[64]; + std::snprintf(qkv_w, sizeof(qkv_w), "v.blk.%d.attn_qkv.weight", block_idx); + std::snprintf(qkv_b, sizeof(qkv_b), "v.blk.%d.attn_qkv.bias", block_idx); + std::snprintf(qbias, sizeof(qbias), "v.blk.%d.attn_q.bias", block_idx); + std::snprintf(kbias, sizeof(kbias), "v.blk.%d.attn_k.bias", block_idx); + std::snprintf(vbias, sizeof(vbias), "v.blk.%d.attn_v.bias", block_idx); + + // Capture source offsets for the concat BEFORE renaming. + register_concat_load(meta, qkv_w, {qname, kname, vname}); + register_concat_load(meta, qkv_b, {qbias, kbias, vbias}); + + // Rename attn_q -> attn_qkv and widen shape. + rename_tensor(meta, ctx, qname, qkv_w); + if (ggml_tensor * t = ggml_get_tensor(ctx, qkv_w)) { + set_tensor_shape(t, {t->ne[0], t->ne[1] * 3}); + } + // Rename attn_q.bias -> attn_qkv.bias and widen from [1152] to [3456]. + rename_tensor(meta, ctx, qbias, qkv_b); + if (ggml_tensor * t = ggml_get_tensor(ctx, qkv_b)) { + set_tensor_shape(t, {t->ne[0] * 3}); + } +} + +// Register the patch_embed reshape + split + F16->F32. +// +// Source: one Ollama tensor `v.patch_embed.weight`, ggml shape +// [h=16, w=16, t=2, packed=3456] F16 +// where `packed` is the PyTorch row-major flattening of HF's +// [out_c=1152, in_c=3, ...] dim pair, so packed_c = c_out*3 + c_in. +// +// Destination: two upstream tensors with ggml shape +// [h=16, w=16, c_in=3, c_out=1152] F32 each, +// one per temporal slice. Matches upstream's +// yield data_torch[:, :, 0, ...] # PyTorch [1152, 3, 16, 16] +// yield data_torch[:, :, 1, ...] +// which reverses to ggml ne=[16, 16, 3, 1152] per slice. +// +// For each output element (h, w, c_in, c_out): +// src_idx = h + w*W + t*W*H + (c_out*C_in + c_in)*W*H*T +// dst_idx = h + w*W + c_in*W*H + c_out*W*H*C_in +void register_qwen35moe_patch_embed_split(gguf_context * meta, ggml_context * ctx) { + const char * src_name = "v.patch_embed.weight"; + const int64_t tid = gguf_find_tensor(meta, src_name); + if (tid < 0) return; + + ggml_tensor * src_t = ggml_get_tensor(ctx, src_name); + if (!src_t) return; + + const size_t src_offset = tensor_file_offset(meta, src_name); + const size_t src_size = ggml_nelements(src_t) * sizeof(uint16_t); + + constexpr int H = 16, W = 16, T = 2, CIN = 3, COUT = 1152; + constexpr size_t HW = (size_t) H * W; + + auto make_slice_op = [=](int slice_idx) { + return LoadOp{ + [=](const char * path, void * dst, size_t dst_size) { + if (dst_size != (size_t) H * W * CIN * COUT * sizeof(float)) return false; + std::vector src(src_size); + if (!read_at(path, src_offset, src.data(), src_size)) return false; + const uint16_t * sp = reinterpret_cast(src.data()); + float * dp = reinterpret_cast(dst); + for (int c_out = 0; c_out < COUT; ++c_out) { + for (int c_in = 0; c_in < CIN; ++c_in) { + const size_t packed = (size_t) c_out * CIN + c_in; + const uint16_t * in_base = sp + HW * (slice_idx + T * packed); + float * out_base = dp + HW * (c_in + CIN * c_out); + for (size_t i = 0; i < HW; ++i) out_base[i] = ggml_fp16_to_fp32(in_base[i]); + } + } + return true; + }, + slice_idx == 0 ? "patch_embed slice 0 (permute+F16->F32)" + : "patch_embed slice 1 (permute+F16->F32)", + }; + }; + + // Rename src -> `v.patch_embd.weight`, reshape to dest layout, register + // the slice-0 load op against its new name. + rename_tensor(meta, ctx, src_name, "v.patch_embd.weight"); + ggml_tensor * dest0 = ggml_get_tensor(ctx, "v.patch_embd.weight"); + if (!dest0) return; + set_tensor_shape(dest0, {16, 16, 3, 1152}); + set_tensor_type (dest0, GGML_TYPE_F32); + register_load_op("v.patch_embd.weight", make_slice_op(0)); + + // We need a sibling tensor `v.patch_embd.weight.1` in ctx_meta so clip's + // get_tensor() can find it. ggml_new_tensor() would blow ctx_meta's + // fixed memory pool (sized exactly for the original tensor count). + // Instead, steal an unused slot: after the QKV merge, `v.blk.0.attn_k` + // is orphaned in ctx_meta — clip never looks it up because it asks for + // the merged `attn_qkv`. Rename it to our sibling and reshape. + rename_tensor(meta, ctx, "v.blk.0.attn_k.weight", "v.patch_embd.weight.1"); + ggml_tensor * dest1 = ggml_get_tensor(ctx, "v.patch_embd.weight.1"); + if (!dest1) return; + set_tensor_shape(dest1, {16, 16, 3, 1152}); + set_tensor_type (dest1, GGML_TYPE_F32); + register_load_op("v.patch_embd.weight.1", make_slice_op(1)); +} + +void handle_qwen35moe_clip(gguf_context * meta, ggml_context * ctx) { + LLAMA_LOG_INFO("%s: detected Ollama-format qwen35moe GGUF used as mmproj; translating\n", __func__); + + // KV synthesis: clip.vision.* from qwen35moe.vision.* (plus defaults). + copy_u32_kv(meta, "qwen35moe.vision.block_count", "clip.vision.block_count"); + copy_u32_kv(meta, "qwen35moe.vision.embedding_length", "clip.vision.embedding_length"); + copy_u32_kv(meta, "qwen35moe.vision.attention.head_count", "clip.vision.attention.head_count"); + copy_u32_kv(meta, "qwen35moe.vision.patch_size", "clip.vision.patch_size"); + copy_u32_kv(meta, "qwen35moe.vision.spatial_merge_size", "clip.vision.spatial_merge_size"); + copy_u32_kv(meta, "qwen35moe.vision.num_channels", "clip.vision.num_channels"); + // projection_dim is the text model's embedding_length (merger out dim). + copy_u32_kv(meta, "qwen35moe.embedding_length", "clip.vision.projection_dim"); + + // Ollama omitted these; defaults match reference (ref_Q3.5-35B-A3B mmproj). + if (!has_key(meta, "clip.vision.feed_forward_length")) + gguf_set_val_u32(meta, "clip.vision.feed_forward_length", 4304); + if (!has_key(meta, "clip.vision.image_size")) + gguf_set_val_u32(meta, "clip.vision.image_size", 768); + if (!has_key(meta, "clip.vision.attention.layer_norm_epsilon")) + gguf_set_val_f32(meta, "clip.vision.attention.layer_norm_epsilon", 1e-6f); + + // image_mean / image_std — constants for qwen3.5 vision. + if (!has_key(meta, "clip.vision.image_mean")) { + const float v[3] = {0.5f, 0.5f, 0.5f}; + gguf_set_arr_data(meta, "clip.vision.image_mean", GGUF_TYPE_FLOAT32, v, 3); + } + if (!has_key(meta, "clip.vision.image_std")) { + const float v[3] = {0.5f, 0.5f, 0.5f}; + gguf_set_arr_data(meta, "clip.vision.image_std", GGUF_TYPE_FLOAT32, v, 3); + } + + // is_deepstack_layers: qwen3.5 35B has no deepstack layers. Set a + // 27-element array of False matching clip.vision.block_count. + if (!has_key(meta, "clip.vision.is_deepstack_layers")) { + uint8_t bools[27] = {}; + gguf_set_arr_data(meta, "clip.vision.is_deepstack_layers", GGUF_TYPE_BOOL, bools, 27); + } + + if (!has_key(meta, "clip.has_vision_encoder")) gguf_set_val_bool(meta, "clip.has_vision_encoder", true); + if (!has_key(meta, "clip.use_gelu")) gguf_set_val_bool(meta, "clip.use_gelu", true); + gguf_set_val_str(meta, "clip.projector_type", "qwen3vl_merger"); + gguf_set_val_str(meta, "general.architecture", "clip"); + + // QKV merge per block. Runs BEFORE the substring renames so we can + // reliably find attn_q / attn_k / attn_v by name. + const int64_t n_blocks_key = gguf_find_key(meta, "clip.vision.block_count"); + const uint32_t n_blocks = n_blocks_key >= 0 ? gguf_get_val_u32(meta, n_blocks_key) : 27; + for (uint32_t b = 0; b < n_blocks; ++b) { + register_qwen35moe_qkv_merge(meta, ctx, (int) b); + } + + // patch_embed: reshape + temporal split + F16->F32. Also BEFORE renames + // because it references `v.patch_embed.weight` by name. + register_qwen35moe_patch_embed_split(meta, ctx); + + // Substring renames (last). These handle the simple pos_embed, merger.*, + // linear_fc1/2, norm1/2 conversions. + for (const auto & [from, to] : kQwen35moeClipRenames) { + rename_tensors_containing(meta, ctx, from, to); + } + + // F16 -> F32 on position_embd after rename. + promote_tensor_to_f32(meta, ctx, "v.position_embd.weight"); } } // anonymous namespace @@ -379,10 +670,16 @@ void translate_metadata(const llama_model_loader * ml, void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) { if (!meta) return; - // Require both the gemma3 markers AND embedded vision tensors to fire. - if (detect_ollama_gemma3(meta, ctx) && any_tensor_with_prefix(ctx, "v.")) { + if (!any_tensor_with_prefix(ctx, "v.")) return; // nothing to translate + + if (detect_ollama_gemma3(meta, ctx)) { LLAMA_LOG_INFO("%s: detected Ollama-format gemma3 GGUF used as mmproj; translating\n", __func__); handle_gemma3_clip(meta, ctx); + return; + } + if (detect_ollama_qwen35moe(meta, ctx)) { + handle_qwen35moe_clip(meta, ctx); + return; } } @@ -400,35 +697,27 @@ bool maybe_load_tensor(ggml_tensor * cur, const char * source_file, size_t file_offset, ggml_backend_buffer_type_t buft) { + (void) file_offset; // registered ops capture their own offsets + + LoadOp op; { - std::lock_guard lk(g_promote_mutex); - if (g_promote_f16_to_f32.find(ggml_get_name(cur)) == g_promote_f16_to_f32.end()) return false; + std::lock_guard lk(g_loadop_mutex); + auto it = g_loadops.find(ggml_get_name(cur)); + if (it == g_loadops.end()) return false; + op = it->second; } - if (cur->type != GGML_TYPE_F32) return false; - const size_t n_elem = ggml_nelements(cur); - const size_t src_size = n_elem * sizeof(uint16_t); - const size_t dst_size = n_elem * sizeof(float); - - std::vector src(src_size); - FILE * f = std::fopen(source_file, "rb"); - if (!f || std::fseek(f, (long) file_offset, SEEK_SET) != 0 - || std::fread(src.data(), 1, src_size, f) != src_size) { - if (f) std::fclose(f); - LLAMA_LOG_ERROR("%s: failed to read F16 bytes for '%s'\n", __func__, ggml_get_name(cur)); + const size_t dst_size = ggml_nbytes(cur); + std::vector dst(dst_size); + if (!op.apply(source_file, dst.data(), dst_size)) { + LLAMA_LOG_ERROR("%s: %s failed for %s\n", __func__, op.description, ggml_get_name(cur)); return false; } - std::fclose(f); - - std::vector dst(dst_size); - const uint16_t * sp = reinterpret_cast(src.data()); - float * dp = reinterpret_cast(dst.data()); - for (size_t i = 0; i < n_elem; ++i) dp[i] = ggml_fp16_to_fp32(sp[i]); if (ggml_backend_buft_is_host(buft)) std::memcpy(cur->data, dst.data(), dst_size); else ggml_backend_tensor_set(cur, dst.data(), 0, dst_size); - LLAMA_LOG_INFO("%s: promoted F16->F32 for %s (%zu elems)\n", __func__, ggml_get_name(cur), n_elem); + LLAMA_LOG_INFO("%s: %s for %s (%zu bytes)\n", __func__, op.description, ggml_get_name(cur), dst_size); return true; } diff --git a/llm/llama_server.go b/llm/llama_server.go index c9620d613..8ed5a4613 100644 --- a/llm/llama_server.go +++ b/llm/llama_server.go @@ -433,9 +433,9 @@ func NewLlamaServerRunner( // and aborts model load. So gate on an explicit allowlist that mirrors // the compat layer's clip-side coverage in llama/compat/. compatClipArches := map[string]bool{ - "gemma3": true, + "gemma3": true, + "qwen35moe": true, // Add entries as llama/compat grows clip handlers. - // "qwen35moe": true, } if len(projectors) == 0 && len(f.Tensors().Items("v.")) > 0 &&