diff --git a/backend/cpp/ik-llama-cpp/CMakeLists.txt b/backend/cpp/ik-llama-cpp/CMakeLists.txt
index 545dc59db..c0157a0c6 100644
--- a/backend/cpp/ik-llama-cpp/CMakeLists.txt
+++ b/backend/cpp/ik-llama-cpp/CMakeLists.txt
@@ -1,15 +1,6 @@
-## Clip/LLaVA library for multimodal support — built locally from copied sources
-set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
-install(TARGETS ${TARGET} LIBRARY)
-target_include_directories(myclip PUBLIC .)
-target_include_directories(myclip PUBLIC ../..)
-target_include_directories(myclip PUBLIC ../../common)
-target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if (NOT MSVC)
-    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual)
-endif()
+## Multimodal support is provided by the in-tree `mtmd` library target
+## (examples/mtmd/), which the grpc-server links and includes below. clip/llava
+## were pruned upstream; the high-level mtmd_* / mtmd_helper_* API is used instead.
 
 set(TARGET grpc-server)
 set(CMAKE_CXX_STANDARD 17)
@@ -67,12 +58,16 @@ add_library(hw_grpc_proto
   ${hw_proto_hdrs} )
 
 add_executable(${TARGET} grpc-server.cpp json.hpp)
-target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
+# grpc-server.cpp includes mtmd.h / mtmd-helper.h, which live in examples/mtmd/.
+# The linked mtmd target may already propagate this include directory; it is
+# listed explicitly so the dependency is visible here.
+target_include_directories(${TARGET} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../mtmd")
+target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
   absl::flags_parse
   gRPC::${_REFLECTION}
   gRPC::${_GRPC_GRPCPP}
   protobuf::${_PROTOBUF_LIBPROTOBUF})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 if(TARGET BUILD_INFO)
   add_dependencies(${TARGET} BUILD_INFO)
 endif()
diff --git a/backend/cpp/ik-llama-cpp/Makefile b/backend/cpp/ik-llama-cpp/Makefile
index d76a07854..ef261a0a6 100644
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@
 
-IK_LLAMA_VERSION?=b84902d2ad27c34f989f23947200c4b91b1568fd
+IK_LLAMA_VERSION?=f96eaddba8bed6a9a5e628bbf6a566775c70b49c
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
 
 CMAKE_ARGS?=
diff --git a/backend/cpp/ik-llama-cpp/grpc-server.cpp b/backend/cpp/ik-llama-cpp/grpc-server.cpp
index ff1408630..578d69034 100644
--- a/backend/cpp/ik-llama-cpp/grpc-server.cpp
+++ b/backend/cpp/ik-llama-cpp/grpc-server.cpp
@@ -11,8 +11,8 @@
 #include <memory>
 #include <string>
 #include <getopt.h>
-#include "clip.h"
-#include "llava.h"
+#include "mtmd.h"
+#include "mtmd-helper.h"
 #include "log.h"
 #include "common.h"
 #include "json.hpp"
@@ -219,6 +219,11 @@ struct llama_client_slot
 
     // multimodal
     std::vector<slot_image> images;
+    // Full prompt with mtmd media markers (mtmd_default_marker()) substituted in
+    // place of the legacy [img-N] tags, covering the text up to and including the
+    // last image. The text after the last image is kept in params.input_suffix and
+    // decoded through the normal token path so the sampling loop is unchanged.
+    std::string mtmd_prompt;
 
     // stats
     size_t sent_count = 0;
@@ -252,14 +257,14 @@ struct llama_client_slot
 
         for (slot_image & img : images)
         {
-            free(img.image_embedding);
-            if (img.img_data) {
-                clip_image_u8_free(img.img_data);
+            if (img.bitmap) {
+                mtmd_bitmap_free(img.bitmap);
+                img.bitmap = nullptr;
             }
-            img.prefix_prompt = "";
         }
 
         images.clear();
+        mtmd_prompt = "";
     }
 
     bool has_budget(gpt_params &global_params) {
@@ -396,46 +401,13 @@ struct llama_metrics {
     }
 };
 
-struct llava_embd_batch {
-    std::vector<llama_pos>      pos;
-    std::vector<int32_t>        n_seq_id;
-    std::vector<llama_seq_id>   seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t>         logits;
-    llama_batch batch;
-    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos     .resize(n_tokens);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits  .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens       =*/ n_tokens,
-            /*tokens         =*/ nullptr,
-            /*embd           =*/ embd,
-            /*pos            =*/ pos.data(),
-            /*n_seq_id       =*/ n_seq_id.data(),
-            /*seq_id         =*/ seq_ids.data(),
-            /*logits         =*/ logits.data(),
-        };
-        for (int i = 0; i < n_tokens; i++) {
-            batch.pos     [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id  [i] = seq_id_0.data();
-            batch.logits  [i] = false;
-        }
-    }
-};
-
 struct llama_server_context
 {
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
     const llama_vocab * vocab = nullptr;
 
-    clip_ctx *clp_ctx = nullptr;
+    mtmd_context *mctx = nullptr;
 
     gpt_params params;
 
@@ -491,11 +463,6 @@ struct llama_server_context
         if (!params.mmproj.path.empty()) {
             multimodal = true;
             LOG_INFO("Multi Modal Mode Enabled", {});
-            clp_ctx = clip_model_load(params.mmproj.path.c_str(), /*verbosity=*/ 1);
-            if(clp_ctx == nullptr) {
-                LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
-                return false;
-            }
 
             if (params.n_ctx < 2048) { // request larger context for the image embedding
                 params.n_ctx = 2048;
@@ -512,10 +479,24 @@ struct llama_server_context
         }
 
         if (multimodal) {
-            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
-            const int n_embd_llm  = llama_model_n_embd(model);
-            if (n_embd_clip != n_embd_llm) {
-                LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
+            // mtmd_init_from_file requires the already-loaded text model, so it must
+            // run AFTER llama_init_from_gpt_params. It validates the projector
+            // against the model internally and returns nullptr on dim mismatch, so
+            // the explicit clip_n_mmproj_embd check is no longer needed.
+            mtmd_context_params mparams = mtmd_context_params_default();
+            mparams.use_gpu         = params.mmproj_use_gpu;
+            mparams.print_timings   = false;
+            mparams.n_threads       = params.n_threads_mtmd != -1 ? params.n_threads_mtmd
+                                      : params.n_threads_batch != -1 ? params.n_threads_batch
+                                                                     : params.n_threads;
+            mparams.verbosity       = GGML_LOG_LEVEL_INFO;
+            mparams.flash_attn_type = params.flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED
+                                                        : LLAMA_FLASH_ATTN_TYPE_DISABLED;
+            mparams.image_min_tokens = params.image_min_tokens;
+            mparams.image_max_tokens = params.image_max_tokens;
+            mctx = mtmd_init_from_file(params.mmproj.path.c_str(), model, mparams);
+            if (mctx == nullptr) {
+                LOG_ERR("unable to load multimodal projector: %s", params.mmproj.path.c_str());
                 llama_free(ctx);
                 llama_free_model(model);
                 return false;
@@ -865,8 +846,8 @@ struct llama_server_context
 
                     slot_image img_sl;
                     img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
-                    img_sl.img_data = clip_image_u8_init();
-                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
+                    img_sl.bitmap = mtmd_helper_bitmap_init_from_buf(mctx, image_buffer.data(), image_buffer.size());
+                    if (img_sl.bitmap == nullptr)
                     {
                         LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
                              __func__,
@@ -879,50 +860,88 @@ struct llama_server_context
                         {"slot_id",   slot->id},
                         {"img_sl_id", img_sl.id}
                     });
-                    img_sl.request_encode_image = true;
                     slot->images.push_back(img_sl);
                 }
-                // process prompt
-                // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
+                // Translate the legacy [img-N] tags into mtmd media markers, in
+                // order, and collect the matching bitmaps in marker order so they
+                // line up with the markers passed to mtmd_tokenize(). The text after
+                // the last image stays in input_suffix and is decoded through the
+                // normal token path, so the sampling loop is unchanged.
+                // example: system prompt [img-102] user [img-103] describe [img-134]
                 if (slot->images.size() > 0 && !slot->prompt.is_array())
                 {
+                    const std::string marker = mtmd_default_marker();
                     std::string prompt = slot->prompt.get<std::string>();
-                    size_t pos = 0, begin_prefix = 0;
+                    std::string built_prompt;
+                    std::vector<slot_image> ordered;
+                    size_t pos = 0, copy_from = 0;
                     std::string pattern = "[img-";
-                    while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
-                        size_t end_prefix = pos;
-                        pos += pattern.length();
-                        size_t end_pos = prompt.find(']', pos);
-                        if (end_pos != std::string::npos)
-                        {
-                            std::string image_id = prompt.substr(pos, end_pos - pos);
-                            try
-                            {
-                                int img_id = std::stoi(image_id);
-                                bool found = false;
-                                for (slot_image &img : slot->images)
-                                {
-                                    if (img.id == img_id) {
-                                        found = true;
-                                        img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
-                                        begin_prefix = end_pos + 1;
-                                        break;
-                                    }
-                                }
-                                if (!found) {
-                                    LOG("ERROR: Image with id: %i, not found.\n", img_id);
-                                    slot->images.clear();
-                                    return false;
-                                }
-                            } catch (const std::invalid_argument& e) {
-                                LOG("Invalid image number id in prompt\n");
-                                slot->images.clear();
-                                return false;
+
+                    // Free every bitmap still owned by `images`, then empty the vector.
+                    auto free_bitmaps = [](std::vector<slot_image> &images) {
+                        for (slot_image &img : images) {
+                            if (img.bitmap) {
+                                mtmd_bitmap_free(img.bitmap);
+                                img.bitmap = nullptr;
                             }
                         }
+                        images.clear();
+                    };
+                    // Error path: release both the not-yet-referenced bitmaps and
+                    // the ones already moved into `ordered`.
+                    auto free_images = [&]() {
+                        free_bitmaps(ordered);
+                        free_bitmaps(slot->images);
+                    };
+
+                    while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
+                        size_t tag_begin = pos;
+                        pos += pattern.length();
+                        size_t end_pos = prompt.find(']', pos);
+                        if (end_pos == std::string::npos) {
+                            break;
+                        }
+                        std::string image_id = prompt.substr(pos, end_pos - pos);
+                        try
+                        {
+                            int img_id = std::stoi(image_id);
+                            bool found = false;
+                            for (slot_image &img : slot->images)
+                            {
+                                // bitmap == nullptr: already claimed by an earlier tag
+                                if (img.id == img_id && img.bitmap != nullptr) {
+                                    found = true;
+                                    // text before this tag, then the media marker
+                                    built_prompt += prompt.substr(copy_from, tag_begin - copy_from);
+                                    built_prompt += marker;
+                                    copy_from = end_pos + 1;
+                                    ordered.push_back(img);
+                                    // ownership moves to `ordered`, so each bitmap has
+                                    // exactly one owner and is freed exactly once
+                                    img.bitmap = nullptr;
+                                    break;
+                                }
+                            }
+                            if (!found) {
+                                LOG("ERROR: Image with id: %i, not found or already used.\n", img_id);
+                                free_images();
+                                return false;
+                            }
+                        } catch (const std::logic_error& e) {
+                            // std::stoi throws invalid_argument or out_of_range
+                            LOG("Invalid image number id in prompt\n");
+                            free_images();
+                            return false;
+                        }
+                        pos = end_pos + 1;
                     }
+                    // Drop bitmaps no tag referenced, then keep the referenced ones in
+                    // marker order: mtmd_tokenize() consumes them in that order.
+                    free_bitmaps(slot->images);
+                    slot->images.swap(ordered);
+                    slot->mtmd_prompt = built_prompt;
                     slot->prompt = "";
-                    slot->params.input_suffix = prompt.substr(begin_prefix);
+                    slot->params.input_suffix = prompt.substr(copy_from);
                     slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
                 }
             }
@@ -1176,21 +1181,10 @@ struct llama_server_context
 
     bool process_images(llama_client_slot &slot) const
     {
-        for (slot_image &img : slot.images)
-        {
-            if (!img.request_encode_image)
-            {
-                continue;
-            }
-
-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-                LOG("Error processing the given image");
-                return false;
-            }
-
-            img.request_encode_image = false;
-        }
-
-        return slot.images.size() > 0;
+        // Nothing is encoded here any more. ingest_images() hands the slot's
+        // bitmaps to mtmd_tokenize() and decodes the resulting chunks with
+        // mtmd_helper_eval_chunks(), so this only tells the caller whether the
+        // slot has any images queued.
+        return !slot.images.empty();
     }
 
@@ -1435,69 +1429,70 @@ struct llama_server_context
         }
     }
 
-    // for multiple images processing
+    // Tokenize the multimodal prompt (text interleaved with media markers) together
+    // with the slot's bitmaps, then decode the resulting chunks into the llama
+    // context via the high-level mtmd helper. The helper runs llama_decode() on the
+    // text chunks and mtmd_encode() + llama_decode() on the image chunks, handling
+    // batching and any pre/post decode setup (e.g. non-causal attention for gemma3).
+    // Advances slot.n_past by the number of positions consumed, then leaves the
+    // post-image suffix tokens in `batch` so the normal decode + sampling loop
+    // produces the first generated token.
     bool ingest_images(llama_client_slot &slot, int n_batch)
     {
-        int image_idx = 0;
-
-        while (image_idx < (int) slot.images.size())
+        if (mctx == nullptr)
         {
-            slot_image &img = slot.images[image_idx];
+            LOG("%s : multimodal context is not initialized\n", __func__);
+            return false;
+        }
 
-            // process prefix prompt
-            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
-            {
-                const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-                llama_batch batch_view = {
-                    n_tokens,
-                    batch.token    + i,
-                    nullptr,
-                    batch.pos      + i,
-                    batch.n_seq_id + i,
-                    batch.seq_id   + i,
-                    batch.logits   + i,
-                };
-                if (llama_decode(ctx, batch_view))
-                {
-                    LOG("%s : failed to eval\n", __func__);
-                    return false;
-                }
-            }
+        // bitmaps stay owned by slot.images (freed on reset()); pass non-owning ptrs
+        std::vector<const mtmd_bitmap *> bitmaps;
+        bitmaps.reserve(slot.images.size());
+        for (const slot_image &img : slot.images)
+        {
+            bitmaps.push_back(img.bitmap);
+        }
 
-            // process image with llm
-            for (int i = 0; i < img.image_tokens; i += n_batch)
-            {
-                int n_eval = img.image_tokens - i;
-                if (n_eval > n_batch)
-                {
-                    n_eval = n_batch;
-                }
+        mtmd_input_text inp_txt;
+        inp_txt.text          = slot.mtmd_prompt.c_str();
+        inp_txt.add_special   = add_bos_token;
+        inp_txt.parse_special = true;
 
-                const int n_embd = llama_model_n_embd(model);
-                float * embd = img.image_embedding + i * n_embd;
-                llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
-                if (llama_decode(ctx, llava_batch.batch))
-                {
-                    LOG("%s : failed to eval image\n", __func__);
-                    return false;
-                }
-                slot.n_past += n_eval;
-            }
-            image_idx++;
+        mtmd::input_chunks chunks(mtmd_input_chunks_init());
+        int32_t res = mtmd_tokenize(mctx,
+                                    chunks.ptr.get(),
+                                    &inp_txt,
+                                    bitmaps.data(),
+                                    bitmaps.size());
+        if (res != 0)
+        {
+            LOG("%s : failed to tokenize multimodal prompt, res = %d\n", __func__, res);
+            return false;
+        }
 
-            common_batch_clear(batch);
+        const llama_pos start_pos = (llama_pos) system_tokens.size() + slot.n_past;
+        llama_pos new_n_past = start_pos;
+        if (mtmd_helper_eval_chunks(mctx,
+                                    ctx,
+                                    chunks.ptr.get(),
+                                    start_pos,
+                                    slot.id,
+                                    n_batch,
+                                    /*logits_last=*/ false,
+                                    &new_n_past) != 0)
+        {
+            LOG("%s : failed to eval multimodal chunks\n", __func__);
+            return false;
+        }
+        slot.n_past += (int32_t) (new_n_past - start_pos);
 
-            // append prefix of next image
-            const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
-                slot.params.input_suffix : // no more images, then process suffix prompt
-                (json)(slot.images[image_idx].prefix_prompt);
-
-            std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
-            for (int i = 0; i < (int) append_tokens.size(); ++i)
-            {
-                common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
-                slot.n_past += 1;
-            }
+        // queue the post-image suffix text for the normal decode + sampling path
+        common_batch_clear(batch);
+        std::vector<llama_token> suffix_tokens = tokenize(slot.params.input_suffix, false);
+        for (llama_token tok : suffix_tokens)
+        {
+            common_batch_add(batch, tok, system_tokens.size() + slot.n_past, { slot.id }, false);
+            slot.n_past += 1;
         }
 
         return true;
@@ -1884,8 +1879,11 @@ struct llama_server_context
 
                     const bool has_images = process_images(slot);
 
-                    // process the prefix of first image
-                    std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
+                    // For the multimodal path the whole pre-image / inter-image text is
+                    // tokenized and decoded inside ingest_images() via mtmd, so no prefix
+                    // tokens are queued here; the post-image suffix is appended by
+                    // ingest_images() for the normal decode + sampling loop.
+                    std::vector<llama_token> prefix_tokens = has_images ? std::vector<llama_token>() : prompt_tokens;
 
                     int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
 
diff --git a/backend/cpp/ik-llama-cpp/patches/0002-clip-ggml-quantize-chunk-user-data.patch b/backend/cpp/ik-llama-cpp/patches/0002-clip-ggml-quantize-chunk-user-data.patch
deleted file mode 100644
index 5724f4d06..000000000
--- a/backend/cpp/ik-llama-cpp/patches/0002-clip-ggml-quantize-chunk-user-data.patch
+++ /dev/null
@@ -1,11 +0,0 @@
---- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
-@@ -2494,7 +2494,7 @@
-             }
-             new_data = work.data();
-
--            new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
-+            new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr, nullptr);
-         } else {
-             new_type = cur->type;
-             new_data = cur->data;
diff --git a/backend/cpp/ik-llama-cpp/prepare.sh b/backend/cpp/ik-llama-cpp/prepare.sh
index fb0ba7624..b6c03c0f9 100644
--- a/backend/cpp/ik-llama-cpp/prepare.sh
+++ b/backend/cpp/ik-llama-cpp/prepare.sh
@@ -17,28 +17,9 @@ cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
 cp -r utils.hpp llama.cpp/examples/grpc-server/
 cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/examples/grpc-server/
 
-## Copy clip/llava files for multimodal support (built as myclip library)
-cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
-cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
-cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
-# Prepend llama.h include to llava.h
-echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
-cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
-# Copy clip-impl.h if it exists
-if [ -f llama.cpp/examples/llava/clip-impl.h ]; then
-    cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
-fi
-# Copy stb_image.h
-if [ -f llama.cpp/vendor/stb/stb_image.h ]; then
-    cp -rfv llama.cpp/vendor/stb/stb_image.h llama.cpp/examples/grpc-server/stb_image.h
-elif [ -f llama.cpp/common/stb_image.h ]; then
-    cp -rfv llama.cpp/common/stb_image.h llama.cpp/examples/grpc-server/stb_image.h
-fi
-
-## Fix API compatibility in llava.cpp (llama_n_embd -> llama_model_n_embd)
-if [ -f llama.cpp/examples/grpc-server/llava.cpp ]; then
-    sed -i 's/llama_n_embd(/llama_model_n_embd(/g' llama.cpp/examples/grpc-server/llava.cpp
-fi
+## Multimodal support is provided by the `mtmd` library target (examples/mtmd/),
+## which the grpc-server links and includes directly. No source copy is needed:
+## clip/llava were pruned upstream and the high-level mtmd_* API is used instead.
 
 set +e
 if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
diff --git a/backend/cpp/ik-llama-cpp/utils.hpp b/backend/cpp/ik-llama-cpp/utils.hpp
index e5cf2a009..4427d4b91 100644
--- a/backend/cpp/ik-llama-cpp/utils.hpp
+++ b/backend/cpp/ik-llama-cpp/utils.hpp
@@ -11,7 +11,7 @@
 
 #include "json.hpp"
 
-#include "clip.h"
+#include "mtmd.h"
 
 using json = nlohmann::json;
 
@@ -111,13 +111,12 @@ struct slot_image
 {
     int32_t id;
 
-    bool request_encode_image = false;
-    float * image_embedding = nullptr;
-    int32_t image_tokens = 0;
-
-    clip_image_u8 * img_data;
-
-    std::string prefix_prompt; // before of this image
+    // mtmd bitmap (image/audio) decoded from the request buffer. Owned by the
+    // slot; freed via mtmd_bitmap_free() on reset. The high-level mtmd pipeline
+    // (mtmd_tokenize + mtmd_helper_eval_chunks) consumes these directly, so the
+    // legacy eager-encode fields (embedding/tokens) and per-image prefix prompt
+    // are no longer needed.
+    mtmd_bitmap * bitmap = nullptr;
 };
 
 // completion token output with probabilities