From 61b367ec29ae0fe7c3a3aae2f984f72dac87f78b Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sun, 19 Apr 2026 10:58:03 -0700 Subject: [PATCH] llama/compat: shrink patch to pure call-site hooks (34 -> 20 lines) Two reductions: 1. Drop the gguf_rename_tensor forwarder from gguf.h/gguf.cpp. The rename-in-place trick it does (calling ggml_set_name on an embedded ggml_tensor) can be done from outside gguf.cpp via: char * p = const_cast(gguf_get_tensor_name(meta, id)); strncpy(p, new_name, GGML_MAX_NAME - 1); That pointer aims into a mutable char[GGML_MAX_NAME] inside a std::vector element; the const on the return type is API courtesy. This is defined behavior and has no struct-layout dependency. 2. Drop the src/CMakeLists.txt hunk that added llama-ollama-compat.cpp to the llama target. Replace with a target_sources() call in Ollama's llama/server/CMakeLists.txt after FetchContent_MakeAvailable. Our compat files now stay in llama/compat/ and are never copied into the fetched _deps/ tree. Net patch now touches 3 files, 20 lines, all pure call-site insertions: src/llama-model-loader.cpp +8 (include + translate + 2x should_skip) src/llama-model.cpp +4 (include + apply_tensor_transforms) tools/mtmd/clip.cpp +8 (include + translate_clip + maybe_load) Verified: fresh build from scratch (rm -rf build && cmake configure) runs PATCH_COMMAND cleanly, compiles, and ollama run gemma3 still works end-to-end for text + vision. --- llama/compat/compat.cmake | 25 ++++++++------ llama/compat/llama-ollama-compat.cpp | 17 +++++++++- llama/compat/upstream-edits.patch | 51 ++-------------------------- llama/server/CMakeLists.txt | 16 +++++++++ 4 files changed, 49 insertions(+), 60 deletions(-) diff --git a/llama/compat/compat.cmake b/llama/compat/compat.cmake index ef8fc9dfe..9c8406d8e 100644 --- a/llama/compat/compat.cmake +++ b/llama/compat/compat.cmake @@ -24,21 +24,26 @@ set(_compat_dir ${CMAKE_CURRENT_LIST_DIR}) # Expose a single variable the main CMakeLists passes into FetchContent's -# PATCH_COMMAND. Uses CMake's own `-E copy` for cross-platform file copy. -# The patch itself is applied via a small CMake script so the step is -# idempotent — re-configuring or rebuilding won't fail with "already applied". +# PATCH_COMMAND. The patch is applied via a small CMake script so the step +# is idempotent — re-configuring or rebuilding won't fail with "already +# applied". +# +# The compat source files (.h, .cpp) are NOT copied into the fetched tree. +# Instead, llama/server/CMakeLists.txt does target_sources() on the llama +# target after FetchContent_MakeAvailable. That keeps Ollama's code in +# Ollama's tree and makes the patch pure call-site insertions. set(OLLAMA_LLAMA_CPP_COMPAT_PATCH_COMMAND - ${CMAKE_COMMAND} -E copy - "${_compat_dir}/llama-ollama-compat.h" - "src/llama-ollama-compat.h" - COMMAND ${CMAKE_COMMAND} -E copy - "${_compat_dir}/llama-ollama-compat.cpp" - "src/llama-ollama-compat.cpp" - COMMAND ${CMAKE_COMMAND} + ${CMAKE_COMMAND} -DPATCH_FILE=${_compat_dir}/upstream-edits.patch -P ${_compat_dir}/apply-patch.cmake CACHE INTERNAL "llama.cpp compat patch command for FetchContent") +# Where the compat source files live, so the main CMakeLists can wire them +# into the llama target. +set(OLLAMA_LLAMA_CPP_COMPAT_DIR + "${_compat_dir}" + CACHE INTERNAL "Directory holding llama-ollama-compat.{h,cpp}") + # Also export the individual paths in case callers want to do something # custom (e.g. emit a dependency on the patch so reconfigures re-apply). set(OLLAMA_LLAMA_CPP_COMPAT_PATCH_FILE diff --git a/llama/compat/llama-ollama-compat.cpp b/llama/compat/llama-ollama-compat.cpp index 311299ed4..f424b26ea 100644 --- a/llama/compat/llama-ollama-compat.cpp +++ b/llama/compat/llama-ollama-compat.cpp @@ -224,9 +224,24 @@ namespace { // Rename a tensor in BOTH the gguf_context and the ggml_context so that all // name-based lookups — offset map, ggml_get_tensor, tensor.name — agree. +// +// The gguf_context side is a bit sneaky: gguf_get_tensor_name returns a +// pointer into the embedded ggml_tensor's `name[GGML_MAX_NAME]` buffer. +// That buffer is non-const storage inside a std::vector element; the const +// on the return type is just API hygiene. Casting it away and strncpy'ing +// a new name is well-defined and avoids needing to patch gguf's internals. void rename_tensor(gguf_context * meta, ggml_context * ctx, const char * old_name, const char * new_name) { - if (!gguf_rename_tensor(meta, old_name, new_name)) return; + const int64_t id = gguf_find_tensor(meta, old_name); + if (id < 0) return; + + // Update the gguf-side name (what gguf_get_tensor_name returns later). + if (char * name_ptr = const_cast(gguf_get_tensor_name(meta, id))) { + std::strncpy(name_ptr, new_name, GGML_MAX_NAME - 1); + name_ptr[GGML_MAX_NAME - 1] = '\0'; + } + + // Update the ggml-side name (what ggml_get_tensor looks up by). if (ggml_tensor * t = ggml_get_tensor(ctx, old_name)) { ggml_set_name(t, new_name); } diff --git a/llama/compat/upstream-edits.patch b/llama/compat/upstream-edits.patch index 5343dac93..cd65f85cf 100644 --- a/llama/compat/upstream-edits.patch +++ b/llama/compat/upstream-edits.patch @@ -1,50 +1,3 @@ -diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h -index 02d5f221c..2f64264f4 100644 ---- a/ggml/include/gguf.h -+++ b/ggml/include/gguf.h -@@ -162,6 +162,9 @@ extern "C" { - // assumes that at least gguf_get_tensor_size bytes can be read from data - GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data); - -+ // rename a tensor. Returns false if old_name is not present. -+ GGML_API bool gguf_rename_tensor(struct gguf_context * ctx, const char * old_name, const char * new_name); -+ - // writing gguf files can be done in 3 ways: - // - // - write the entire gguf_context to a binary file in a single pass: -diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp -index ab3cc9748..e3d06c959 100644 ---- a/ggml/src/gguf.cpp -+++ b/ggml/src/gguf.cpp -@@ -1280,6 +1280,16 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo - ctx->info[tensor_id].t.data = (void *)(uintptr_t)data; // double cast suppresses warning about casting away const - } - -+bool gguf_rename_tensor(struct gguf_context * ctx, const char * old_name, const char * new_name) { -+ const int64_t tensor_id = gguf_find_tensor(ctx, old_name); -+ if (tensor_id < 0) { -+ return false; -+ } -+ // ggml_set_name truncates to GGML_MAX_NAME. -+ ggml_set_name(&ctx->info[tensor_id].t, new_name); -+ return true; -+} -+ - struct gguf_writer_base { - size_t written_bytes {0u}; - -diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt -index 7b1fcfca0..155a819fa 100644 ---- a/src/CMakeLists.txt -+++ b/src/CMakeLists.txt -@@ -32,6 +32,7 @@ add_library(llama - llama-model-loader.cpp - llama-model-saver.cpp - llama-model.cpp -+ llama-ollama-compat.cpp - llama-quant.cpp - llama-sampler.cpp - llama-vocab.cpp diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 4e65a45a5..75836c683 100644 --- a/src/llama-model-loader.cpp @@ -108,14 +61,14 @@ index 4ded484dd..7d3509c23 100644 if (use_mmap_buffer) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp -index f0e8786b6..1e6319ca0 100644 +index f0e8786b6..35defa89d 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -10,6 +10,8 @@ #include "ggml-backend.h" #include "gguf.h" -+#include "src/llama-ollama-compat.h" ++#include "llama-ollama-compat.h" + #include #include diff --git a/llama/server/CMakeLists.txt b/llama/server/CMakeLists.txt index 410eb365a..3fcace72c 100644 --- a/llama/server/CMakeLists.txt +++ b/llama/server/CMakeLists.txt @@ -71,6 +71,22 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(llama_cpp) +# Link the Ollama-compat source files into the fetched llama target. +# Kept separate from the upstream-edits patch so our .cpp/.h stay +# on-disk in llama/compat/ rather than being copied into _deps/. +if(DEFINED OLLAMA_LLAMA_CPP_COMPAT_DIR) + target_sources(llama PRIVATE + ${OLLAMA_LLAMA_CPP_COMPAT_DIR}/llama-ollama-compat.cpp) + target_include_directories(llama PRIVATE + ${OLLAMA_LLAMA_CPP_COMPAT_DIR}) + # mtmd's clip.cpp #include's the compat header too — add the same dir + # to its PRIVATE include path (PRIVATE on llama doesn't propagate). + if(TARGET mtmd) + target_include_directories(mtmd PRIVATE + ${OLLAMA_LLAMA_CPP_COMPAT_DIR}) + endif() +endif() + # Find GPU toolkits for runtime dependency bundling. # The upstream llama.cpp build finds these internally, but we need the # variables (CUDAToolkit_LIBRARY_DIR, etc.) in our install scope.