From 021389f7bbbcba28f34ed8f950f25ae2bdd03517 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 19 Apr 2026 10:50:34 -0700
Subject: [PATCH] llama/compat: shrink clip.cpp injection from 18 lines to 1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The clip.cpp tensor-read loop was the fattest hook in the patch — it
duplicated the host-vs-device buffer dispatch around a call into the
compat layer. Move that dispatch into our code (maybe_load_tensor),
so the upstream patch is a single conditional call.

Net: upstream patch drops from 48 lines across 6 files to 34 lines.
Every remaining edit is either a 1-line include, a 1-line function call,
or the gguf_rename_tensor shim (which accesses gguf_context internals
and has to live in gguf.cpp).

Verified end-to-end: text + vision both still correct after rebuild.
---
 llama/compat/llama-ollama-compat.cpp | 45 +++++++++++++++-------------
 llama/compat/llama-ollama-compat.h   | 18 ++++++-----
 llama/compat/upstream-edits.patch    | 25 ++--------------
 3 files changed, 38 insertions(+), 50 deletions(-)
diff --git a/llama/compat/llama-ollama-compat.cpp b/llama/compat/llama-ollama-compat.cpp
index 83994d14f..311299ed4 100644
--- a/llama/compat/llama-ollama-compat.cpp
+++ b/llama/compat/llama-ollama-compat.cpp
@@ -393,50 +393,53 @@ void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
     }
 }
 
-bool supply_promoted_tensor_data(const ggml_tensor * cur,
-                                 const char * source_file,
-                                 size_t file_offset,
-                                 std::vector<uint8_t> & out) {
+bool maybe_load_tensor(ggml_tensor * cur,
+                       const char * source_file,
+                       size_t file_offset,
+                       ggml_backend_buffer_type_t buft) {
+    // Check registry: is this tensor marked for F16→F32 promotion?
     {
         std::lock_guard<std::mutex> lk(g_promote_mutex);
         if (g_promote_f16_to_f32.find(ggml_get_name(cur)) == g_promote_f16_to_f32.end()) {
             return false;
         }
     }
-    // cur->type is F32 (after promotion). Source bytes are F16 at file_offset.
-    if (cur->type != GGML_TYPE_F32) {
-        return false;
-    }
+    // Destination was promoted to F32 by translate_clip_metadata. Source
+    // bytes on disk are still F16 at file_offset.
+    if (cur->type != GGML_TYPE_F32) return false;
 
-    const size_t n_elem = ggml_nelements(cur);
-    const size_t src_bytes = n_elem * sizeof(uint16_t);
-    const size_t dst_bytes = n_elem * sizeof(float);
+    const size_t n_elem   = ggml_nelements(cur);
+    const size_t src_size = n_elem * sizeof(uint16_t);
+    const size_t dst_size = n_elem * sizeof(float);
 
-    std::vector<uint8_t> src(src_bytes);
+    std::vector<uint8_t> src(src_size);
 
     FILE * f = std::fopen(source_file, "rb");
     if (!f) {
         LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, source_file);
         return false;
     }
-    if (std::fseek(f, (long) file_offset, SEEK_SET) != 0) {
-        std::fclose(f);
-        LLAMA_LOG_ERROR("%s: failed to seek in '%s'\n", __func__, source_file);
-        return false;
-    }
-    if (std::fread(src.data(), 1, src_bytes, f) != src_bytes) {
+    if (std::fseek(f, (long) file_offset, SEEK_SET) != 0 ||
+        std::fread(src.data(), 1, src_size, f) != src_size) {
         std::fclose(f);
         LLAMA_LOG_ERROR("%s: failed to read %zu bytes for '%s'\n",
-                        __func__, src_bytes, ggml_get_name(cur));
+                        __func__, src_size, ggml_get_name(cur));
         return false;
     }
     std::fclose(f);
 
-    out.resize(dst_bytes);
+    std::vector<uint8_t> dst(dst_size);
     convert_f16_to_f32(reinterpret_cast<const uint16_t *>(src.data()),
-                       reinterpret_cast<float *>(out.data()),
+                       reinterpret_cast<float *>(dst.data()),
                        n_elem);
 
+    // Deliver the converted bytes to the tensor's final backend buffer.
+    if (ggml_backend_buft_is_host(buft)) {
+        std::memcpy(cur->data, dst.data(), dst_size);
+    } else {
+        ggml_backend_tensor_set(cur, dst.data(), 0, dst_size);
+    }
+
     LLAMA_LOG_INFO("%s: promoted F16->F32 for %s (%zu elems)\n",
                    __func__, ggml_get_name(cur), n_elem);
     return true;
diff --git a/llama/compat/llama-ollama-compat.h b/llama/compat/llama-ollama-compat.h
index 13b73c493..0519bfefd 100644
--- a/llama/compat/llama-ollama-compat.h
+++ b/llama/compat/llama-ollama-compat.h
@@ -18,6 +18,8 @@
 #include <string>
 #include <vector>
 
+#include "ggml-backend.h" // for ggml_backend_buffer_type_t
+
 struct gguf_context;
 struct ggml_context;
 struct ggml_tensor;
@@ -64,15 +66,17 @@ void apply_tensor_transforms(const llama_model_loader * ml, ggml_context * ctx);
 void translate_clip_metadata(gguf_context * meta, ggml_context * ctx);
 
 // Called from clip.cpp's tensor-loading loop, before reading bytes from the
-// file. If this tensor was marked for type promotion by translate_clip_metadata,
-// fills `out` with the promoted data (e.g. F16→F32) and returns true. The
-// caller should then use `out` instead of reading from the file.
+// file. If this tensor was marked for type promotion by translate_clip_metadata
+// (e.g. F16→F32), reads the source bytes, converts them, and writes the
+// result directly into `cur` (choosing host copy vs. backend upload based
+// on `buft`). Returns true if the tensor was handled — caller should skip
+// its normal file-read path. Returns false otherwise (tensor not marked for promotion, type not F32, or open/seek/read failed); caller loads normally.
 //
 // `file_offset` is the absolute file offset of the original (pre-promotion)
 // tensor data in the source GGUF.
-bool supply_promoted_tensor_data(const ggml_tensor * cur,
-                                 const char * source_file,
-                                 size_t file_offset,
-                                 std::vector<uint8_t> & out);
+bool maybe_load_tensor(ggml_tensor * cur,
+                       const char * source_file,
+                       size_t file_offset,
+                       ggml_backend_buffer_type_t buft);
 
 } // namespace llama_ollama_compat
diff --git a/llama/compat/upstream-edits.patch b/llama/compat/upstream-edits.patch
index f4c5985df..5343dac93 100644
--- a/llama/compat/upstream-edits.patch
+++ b/llama/compat/upstream-edits.patch
@@ -108,7 +108,7 @@ index 4ded484dd..7d3509c23 100644
  
      if (use_mmap_buffer) {
 diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
-index f0e8786b6..ec2a7d320 100644
+index f0e8786b6..1e6319ca0 100644
 --- a/tools/mtmd/clip.cpp
 +++ b/tools/mtmd/clip.cpp
 @@ -10,6 +10,8 @@
@@ -132,30 +132,11 @@ index f0e8786b6..ec2a7d320 100644
          const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
  
          // print gguf info
-@@ -2358,11 +2365,25 @@ struct clip_model_loader {
+@@ -2358,6 +2365,7 @@ struct clip_model_loader {
                  auto it_off = tensor_offset.find(t->name);
                  GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor");
                  const size_t offset = it_off->second;
-+                size_t num_bytes = ggml_nbytes(cur);
-+
-+                // Ollama-compat: let the compat layer supply promoted tensor
-+                // data (e.g. F16→F32 for conv weights) instead of reading
-+                // bytes directly from the file.
-+                std::vector<uint8_t> compat_buf;
-+                if (llama_ollama_compat::supply_promoted_tensor_data(cur, fname.c_str(), offset, compat_buf)) {
-+                    if (ggml_backend_buft_is_host(buft)) {
-+                        std::memcpy(cur->data, compat_buf.data(), num_bytes);
-+                    } else {
-+                        ggml_backend_tensor_set(cur, compat_buf.data(), 0, num_bytes);
-+                    }
-+                    continue;
-+                }
-+
++                if (llama_ollama_compat::maybe_load_tensor(cur, fname.c_str(), offset, buft)) continue;
                  fin.seekg(offset, std::ios::beg);
                  if (!fin) {
                      throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
-                 }
--                size_t num_bytes = ggml_nbytes(cur);
-                 if (ggml_backend_buft_is_host(buft)) {
-                     // for the CPU and Metal backend, we can read directly into the tensor
-                     fin.read(reinterpret_cast<char *>(cur->data), num_bytes);