From 021389f7bbbcba28f34ed8f950f25ae2bdd03517 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sun, 19 Apr 2026 10:50:34 -0700 Subject: [PATCH] llama/compat: shrink clip.cpp injection from 18 lines to 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The clip.cpp tensor-read loop was the fattest hook in the patch — it duplicated the host-vs-device buffer dispatch around a call into the compat layer. Move that dispatch into our code (maybe_load_tensor), so the upstream patch is a single conditional call. Net: upstream patch drops from 48 lines across 6 files to 34 lines. Every remaining edit is either a 1-line include, a 1-line function call, or the gguf_rename_tensor shim (which accesses gguf_context internals and has to live in gguf.cpp). Verified end-to-end: text + vision both still correct after rebuild. --- llama/compat/llama-ollama-compat.cpp | 45 +++++++++++++++------------- llama/compat/llama-ollama-compat.h | 18 ++++++----- llama/compat/upstream-edits.patch | 25 ++-------------- 3 files changed, 38 insertions(+), 50 deletions(-) diff --git a/llama/compat/llama-ollama-compat.cpp b/llama/compat/llama-ollama-compat.cpp index 83994d14f..311299ed4 100644 --- a/llama/compat/llama-ollama-compat.cpp +++ b/llama/compat/llama-ollama-compat.cpp @@ -393,50 +393,53 @@ void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) { } } -bool supply_promoted_tensor_data(const ggml_tensor * cur, - const char * source_file, - size_t file_offset, - std::vector<uint8_t> & out) { +bool maybe_load_tensor(ggml_tensor * cur, + const char * source_file, + size_t file_offset, + ggml_backend_buffer_type_t buft) { + // Check registry: is this tensor marked for F16→F32 promotion? { std::lock_guard lk(g_promote_mutex); if (g_promote_f16_to_f32.find(ggml_get_name(cur)) == g_promote_f16_to_f32.end()) { return false; } } - // cur->type is F32 (after promotion). Source bytes are F16 at file_offset.
- if (cur->type != GGML_TYPE_F32) { - return false; - } + // Destination was promoted to F32 by translate_clip_metadata. Source + // bytes on disk are still F16 at file_offset. + if (cur->type != GGML_TYPE_F32) return false; - const size_t n_elem = ggml_nelements(cur); - const size_t src_bytes = n_elem * sizeof(uint16_t); - const size_t dst_bytes = n_elem * sizeof(float); + const size_t n_elem = ggml_nelements(cur); + const size_t src_size = n_elem * sizeof(uint16_t); + const size_t dst_size = n_elem * sizeof(float); - std::vector<uint8_t> src(src_bytes); + std::vector<uint8_t> src(src_size); FILE * f = std::fopen(source_file, "rb"); if (!f) { LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, source_file); return false; } - if (std::fseek(f, (long) file_offset, SEEK_SET) != 0) { - std::fclose(f); - LLAMA_LOG_ERROR("%s: failed to seek in '%s'\n", __func__, source_file); - return false; - } - if (std::fread(src.data(), 1, src_bytes, f) != src_bytes) { + if (std::fseek(f, (long) file_offset, SEEK_SET) != 0 || + std::fread(src.data(), 1, src_size, f) != src_size) { std::fclose(f); LLAMA_LOG_ERROR("%s: failed to read %zu bytes for '%s'\n", - __func__, src_bytes, ggml_get_name(cur)); + __func__, src_size, ggml_get_name(cur)); return false; } std::fclose(f); - out.resize(dst_bytes); + std::vector<uint8_t> dst(dst_size); convert_f16_to_f32(reinterpret_cast<const uint16_t *>(src.data()), - reinterpret_cast<float *>(out.data()), + reinterpret_cast<float *>(dst.data()), n_elem); + // Deliver the converted bytes to the tensor's final backend buffer.
+ if (ggml_backend_buft_is_host(buft)) { + std::memcpy(cur->data, dst.data(), dst_size); + } else { + ggml_backend_tensor_set(cur, dst.data(), 0, dst_size); + } + LLAMA_LOG_INFO("%s: promoted F16->F32 for %s (%zu elems)\n", __func__, ggml_get_name(cur), n_elem); return true; diff --git a/llama/compat/llama-ollama-compat.h b/llama/compat/llama-ollama-compat.h index 13b73c493..0519bfefd 100644 --- a/llama/compat/llama-ollama-compat.h +++ b/llama/compat/llama-ollama-compat.h @@ -18,6 +18,8 @@ #include #include +#include "ggml-backend.h" // for ggml_backend_buffer_type_t + struct gguf_context; struct ggml_context; struct ggml_tensor; @@ -64,15 +66,17 @@ void apply_tensor_transforms(const llama_model_loader * ml, ggml_context * ctx); void translate_clip_metadata(gguf_context * meta, ggml_context * ctx); // Called from clip.cpp's tensor-loading loop, before reading bytes from the -// file. If this tensor was marked for type promotion by translate_clip_metadata, -// fills `out` with the promoted data (e.g. F16→F32) and returns true. The -// caller should then use `out` instead of reading from the file. +// file. If this tensor was marked for type promotion by translate_clip_metadata +// (e.g. F16→F32), reads the source bytes, converts them, and writes the +// result directly into `cur` (choosing host copy vs. backend upload based +// on `buft`). Returns true if the tensor was handled — caller should skip +// its normal file-read path. Returns false otherwise; caller loads normally. // // `file_offset` is the absolute file offset of the original (pre-promotion) // tensor data in the source GGUF. 
-bool supply_promoted_tensor_data(const ggml_tensor * cur, - const char * source_file, - size_t file_offset, - std::vector<uint8_t> & out); +bool maybe_load_tensor(ggml_tensor * cur, + const char * source_file, + size_t file_offset, + ggml_backend_buffer_type_t buft); } // namespace llama_ollama_compat diff --git a/llama/compat/upstream-edits.patch b/llama/compat/upstream-edits.patch index f4c5985df..5343dac93 100644 --- a/llama/compat/upstream-edits.patch +++ b/llama/compat/upstream-edits.patch @@ -108,7 +108,7 @@ index 4ded484dd..7d3509c23 100644 if (use_mmap_buffer) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp -index f0e8786b6..ec2a7d320 100644 +index f0e8786b6..1e6319ca0 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -10,6 +10,8 @@ @@ -132,30 +132,11 @@ index f0e8786b6..ec2a7d320 100644 const int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); // print gguf info -@@ -2358,11 +2365,25 @@ struct clip_model_loader { +@@ -2358,6 +2365,7 @@ struct clip_model_loader { auto it_off = tensor_offset.find(t->name); GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor"); const size_t offset = it_off->second; -+ size_t num_bytes = ggml_nbytes(cur); -+ -+ // Ollama-compat: let the compat layer supply promoted tensor -+ // data (e.g. F16→F32 for conv weights) instead of reading -+ // bytes directly from the file.
-+ std::vector<uint8_t> compat_buf; -+ if (llama_ollama_compat::supply_promoted_tensor_data(cur, fname.c_str(), offset, compat_buf)) { -+ if (ggml_backend_buft_is_host(buft)) { -+ std::memcpy(cur->data, compat_buf.data(), num_bytes); -+ } else { -+ ggml_backend_tensor_set(cur, compat_buf.data(), 0, num_bytes); -+ } -+ continue; -+ } -+ ++ if (llama_ollama_compat::maybe_load_tensor(cur, fname.c_str(), offset, buft)) continue; fin.seekg(offset, std::ios::beg); if (!fin) { throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name)); - } -- size_t num_bytes = ggml_nbytes(cur); - if (ggml_backend_buft_is_host(buft)) { - // for the CPU and Metal backend, we can read directly into the tensor - fin.read(reinterpret_cast<char *>(cur->data), num_bytes);