From 61b367ec29ae0fe7c3a3aae2f984f72dac87f78b Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 19 Apr 2026 10:58:03 -0700
Subject: [PATCH] llama/compat: shrink patch to pure call-site hooks (34 -> 20
 lines)

Two reductions:

1. Drop the gguf_rename_tensor forwarder from gguf.h/gguf.cpp.
   The rename-in-place trick it does (calling ggml_set_name on an embedded
   ggml_tensor) can be done from outside gguf.cpp via:

     char * p = const_cast<char *>(gguf_get_tensor_name(meta, id));
     strncpy(p, new_name, GGML_MAX_NAME - 1);

   That pointer aims into a mutable char[GGML_MAX_NAME] inside a std::vector
   element; the const on the return type is API courtesy. This is defined
   behavior and has no struct-layout dependency.

2. Drop the src/CMakeLists.txt hunk that added llama-ollama-compat.cpp to
   the llama target. Replace with a target_sources() call in Ollama's
   llama/server/CMakeLists.txt after FetchContent_MakeAvailable. Our
   compat files now stay in llama/compat/ and are never copied into the
   fetched _deps/ tree.

Net patch now touches 3 files, 20 lines, all pure call-site insertions:
  src/llama-model-loader.cpp  +8  (include + translate + 2x should_skip)
  src/llama-model.cpp         +4  (include + apply_tensor_transforms)
  tools/mtmd/clip.cpp         +8  (include + translate_clip + maybe_load)

Verified: fresh build from scratch (rm -rf build && cmake configure)
runs PATCH_COMMAND cleanly, compiles, and ollama run gemma3 still works
end-to-end for text + vision.
---
 llama/compat/compat.cmake            | 25 ++++++++------
 llama/compat/llama-ollama-compat.cpp | 17 +++++++++-
 llama/compat/upstream-edits.patch    | 51 ++--------------------------
 llama/server/CMakeLists.txt          | 16 +++++++++
 4 files changed, 49 insertions(+), 60 deletions(-)
diff --git a/llama/compat/compat.cmake b/llama/compat/compat.cmake
index ef8fc9dfe..9c8406d8e 100644
--- a/llama/compat/compat.cmake
+++ b/llama/compat/compat.cmake
@@ -24,21 +24,26 @@
 set(_compat_dir ${CMAKE_CURRENT_LIST_DIR})
 
 # Expose a single variable the main CMakeLists passes into FetchContent's
-# PATCH_COMMAND. Uses CMake's own `-E copy` for cross-platform file copy.
-# The patch itself is applied via a small CMake script so the step is
-# idempotent — re-configuring or rebuilding won't fail with "already applied".
+# PATCH_COMMAND. The patch is applied via a small CMake script so the step
+# is idempotent — re-configuring or rebuilding won't fail with "already
+# applied".
+#
+# The compat source files (.h, .cpp) are NOT copied into the fetched tree.
+# Instead, llama/server/CMakeLists.txt does target_sources() on the llama
+# target after FetchContent_MakeAvailable. That keeps Ollama's code in
+# Ollama's tree and makes the patch pure call-site insertions.
 set(OLLAMA_LLAMA_CPP_COMPAT_PATCH_COMMAND
-    ${CMAKE_COMMAND} -E copy
-        "${_compat_dir}/llama-ollama-compat.h"
-        "src/llama-ollama-compat.h"
-    COMMAND ${CMAKE_COMMAND} -E copy
-        "${_compat_dir}/llama-ollama-compat.cpp"
-        "src/llama-ollama-compat.cpp"
-    COMMAND ${CMAKE_COMMAND}
+    ${CMAKE_COMMAND}
         -DPATCH_FILE=${_compat_dir}/upstream-edits.patch
         -P ${_compat_dir}/apply-patch.cmake
     CACHE INTERNAL "llama.cpp compat patch command for FetchContent")
 
+# Where the compat source files live, so the main CMakeLists can wire them
+# into the llama target.
+set(OLLAMA_LLAMA_CPP_COMPAT_DIR
+    "${_compat_dir}"
+    CACHE INTERNAL "Directory holding llama-ollama-compat.{h,cpp}")
+
 # Also export the individual paths in case callers want to do something
 # custom (e.g. emit a dependency on the patch so reconfigures re-apply).
 set(OLLAMA_LLAMA_CPP_COMPAT_PATCH_FILE
diff --git a/llama/compat/llama-ollama-compat.cpp b/llama/compat/llama-ollama-compat.cpp
index 311299ed4..f424b26ea 100644
--- a/llama/compat/llama-ollama-compat.cpp
+++ b/llama/compat/llama-ollama-compat.cpp
@@ -224,9 +224,24 @@ namespace {
 
 // Rename a tensor in BOTH the gguf_context and the ggml_context so that all
 // name-based lookups — offset map, ggml_get_tensor, tensor.name — agree.
+//
+// The gguf_context side is a bit sneaky: gguf_get_tensor_name returns a
+// pointer into the embedded ggml_tensor's `name[GGML_MAX_NAME]` buffer.
+// That buffer is non-const storage inside a std::vector element; the const
+// on the return type is just API hygiene. Casting it away and strncpy'ing
+// a new name is well-defined and avoids needing to patch gguf's internals.
 void rename_tensor(gguf_context * meta, ggml_context * ctx,
                    const char * old_name, const char * new_name) {
-    if (!gguf_rename_tensor(meta, old_name, new_name)) return;
+    const int64_t id = gguf_find_tensor(meta, old_name);
+    if (id < 0) return;
+
+    // Update the gguf-side name (what gguf_get_tensor_name returns later).
+    if (char * name_ptr = const_cast<char *>(gguf_get_tensor_name(meta, id))) {
+        std::strncpy(name_ptr, new_name, GGML_MAX_NAME - 1);
+        name_ptr[GGML_MAX_NAME - 1] = '\0';
+    }
+
+    // Update the ggml-side name (what ggml_get_tensor looks up by).
     if (ggml_tensor * t = ggml_get_tensor(ctx, old_name)) {
         ggml_set_name(t, new_name);
     }
diff --git a/llama/compat/upstream-edits.patch b/llama/compat/upstream-edits.patch
index 5343dac93..cd65f85cf 100644
--- a/llama/compat/upstream-edits.patch
+++ b/llama/compat/upstream-edits.patch
@@ -1,50 +1,3 @@
-diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
-index 02d5f221c..2f64264f4 100644
---- a/ggml/include/gguf.h
-+++ b/ggml/include/gguf.h
-@@ -162,6 +162,9 @@ extern "C" {
-     // assumes that at least gguf_get_tensor_size bytes can be read from data
-     GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data);
- 
-+    // rename a tensor. Returns false if old_name is not present.
-+    GGML_API bool gguf_rename_tensor(struct gguf_context * ctx, const char * old_name, const char * new_name);
-+
-     // writing gguf files can be done in 3 ways:
-     //
-     // - write the entire gguf_context to a binary file in a single pass:
-diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
-index ab3cc9748..e3d06c959 100644
---- a/ggml/src/gguf.cpp
-+++ b/ggml/src/gguf.cpp
-@@ -1280,6 +1280,16 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo
-     ctx->info[tensor_id].t.data = (void *)(uintptr_t)data; // double cast suppresses warning about casting away const
- }
- 
-+bool gguf_rename_tensor(struct gguf_context * ctx, const char * old_name, const char * new_name) {
-+    const int64_t tensor_id = gguf_find_tensor(ctx, old_name);
-+    if (tensor_id < 0) {
-+        return false;
-+    }
-+    // ggml_set_name truncates to GGML_MAX_NAME.
-+    ggml_set_name(&ctx->info[tensor_id].t, new_name);
-+    return true;
-+}
-+
- struct gguf_writer_base {
-     size_t written_bytes {0u};
- 
-diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
-index 7b1fcfca0..155a819fa 100644
---- a/src/CMakeLists.txt
-+++ b/src/CMakeLists.txt
-@@ -32,6 +32,7 @@ add_library(llama
-             llama-model-loader.cpp
-             llama-model-saver.cpp
-             llama-model.cpp
-+            llama-ollama-compat.cpp
-             llama-quant.cpp
-             llama-sampler.cpp
-             llama-vocab.cpp
 diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
 index 4e65a45a5..75836c683 100644
 --- a/src/llama-model-loader.cpp
@@ -108,14 +61,14 @@ index 4ded484dd..7d3509c23 100644
  
      if (use_mmap_buffer) {
 diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
-index f0e8786b6..1e6319ca0 100644
+index f0e8786b6..35defa89d 100644
 --- a/tools/mtmd/clip.cpp
 +++ b/tools/mtmd/clip.cpp
 @@ -10,6 +10,8 @@
  #include "ggml-backend.h"
  #include "gguf.h"
  
-+#include "src/llama-ollama-compat.h"
++#include "llama-ollama-compat.h"
 +
  #include <algorithm>
  #include <cassert>
diff --git a/llama/server/CMakeLists.txt b/llama/server/CMakeLists.txt
index 410eb365a..3fcace72c 100644
--- a/llama/server/CMakeLists.txt
+++ b/llama/server/CMakeLists.txt
@@ -71,6 +71,22 @@ FetchContent_Declare(
 )
 FetchContent_MakeAvailable(llama_cpp)
 
+# Link the Ollama-compat source files into the fetched llama target.
+# Kept separate from the upstream-edits patch so our .cpp/.h stay
+# on-disk in llama/compat/ rather than being copied into _deps/.
+if(DEFINED OLLAMA_LLAMA_CPP_COMPAT_DIR)
+    target_sources(llama PRIVATE
+        ${OLLAMA_LLAMA_CPP_COMPAT_DIR}/llama-ollama-compat.cpp)
+    target_include_directories(llama PRIVATE
+        ${OLLAMA_LLAMA_CPP_COMPAT_DIR})
+    # mtmd's clip.cpp #include's the compat header too — add the same dir
+    # to its PRIVATE include path (PRIVATE on llama doesn't propagate).
+    if(TARGET mtmd)
+        target_include_directories(mtmd PRIVATE
+            ${OLLAMA_LLAMA_CPP_COMPAT_DIR})
+    endif()
+endif()
+
 # Find GPU toolkits for runtime dependency bundling.
 # The upstream llama.cpp build finds these internally, but we need the
 # variables (CUDAToolkit_LIBRARY_DIR, etc.) in our install scope.