diff --git a/backend/cpp/ik-llama-cpp/CMakeLists.txt b/backend/cpp/ik-llama-cpp/CMakeLists.txt index 545dc59db..c0157a0c6 100644 --- a/backend/cpp/ik-llama-cpp/CMakeLists.txt +++ b/backend/cpp/ik-llama-cpp/CMakeLists.txt @@ -1,15 +1,6 @@ -## Clip/LLaVA library for multimodal support — built locally from copied sources -set(TARGET myclip) -add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h) -install(TARGETS ${TARGET} LIBRARY) -target_include_directories(myclip PUBLIC .) -target_include_directories(myclip PUBLIC ../..) -target_include_directories(myclip PUBLIC ../../common) -target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) -if (NOT MSVC) - target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) -endif() +## Multimodal support is provided by the in-tree `mtmd` library target +## (examples/mtmd/), which the grpc-server links and includes below. clip/llava +## were pruned upstream; the high-level mtmd_* / mtmd_helper_* API is used instead. set(TARGET grpc-server) set(CMAKE_CXX_STANDARD 17) @@ -67,12 +58,16 @@ add_library(hw_grpc_proto ${hw_proto_hdrs} ) add_executable(${TARGET} grpc-server.cpp json.hpp) -target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto +# mtmd public headers (mtmd.h / mtmd-helper.h) live in examples/mtmd/. +# Linking the mtmd target also propagates this include dir, but we add it +# explicitly for clarity. +target_include_directories(${TARGET} PRIVATE ../mtmd) +target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto absl::flags_parse gRPC::${_REFLECTION} gRPC::${_GRPC_GRPCPP} protobuf::${_PROTOBUF_LIBPROTOBUF}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) if(TARGET BUILD_INFO) add_dependencies(${TARGET} BUILD_INFO) endif() diff --git a/backend/cpp/ik-llama-cpp/Makefile b/backend/cpp/ik-llama-cpp/Makefile index d76a07854..ef261a0a6 100644 --- a/backend/cpp/ik-llama-cpp/Makefile +++ b/backend/cpp/ik-llama-cpp/Makefile @@ -1,5 +1,5 @@ -IK_LLAMA_VERSION?=b84902d2ad27c34f989f23947200c4b91b1568fd +IK_LLAMA_VERSION?=f96eaddba8bed6a9a5e628bbf6a566775c70b49c LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp CMAKE_ARGS?= diff --git a/backend/cpp/ik-llama-cpp/grpc-server.cpp b/backend/cpp/ik-llama-cpp/grpc-server.cpp index ff1408630..578d69034 100644 --- a/backend/cpp/ik-llama-cpp/grpc-server.cpp +++ b/backend/cpp/ik-llama-cpp/grpc-server.cpp @@ -11,8 +11,8 @@ #include #include #include -#include "clip.h" -#include "llava.h" +#include "mtmd.h" +#include "mtmd-helper.h" #include "log.h" #include "common.h" #include "json.hpp" @@ -219,6 +219,11 @@ struct llama_client_slot // multimodal std::vector images; + // Full prompt with mtmd media markers (mtmd_default_marker()) substituted in + // place of the legacy [img-N] tags, covering the text up to and including the + // last image. The text after the last image is kept in params.input_suffix and + // decoded through the normal token path so the sampling loop is unchanged. + std::string mtmd_prompt; // stats size_t sent_count = 0; @@ -252,14 +257,14 @@ struct llama_client_slot for (slot_image & img : images) { - free(img.image_embedding); - if (img.img_data) { - clip_image_u8_free(img.img_data); + if (img.bitmap) { + mtmd_bitmap_free(img.bitmap); + img.bitmap = nullptr; } - img.prefix_prompt = ""; } images.clear(); + mtmd_prompt = ""; } bool has_budget(gpt_params &global_params) { @@ -396,46 +401,13 @@ struct llama_metrics { } }; -struct llava_embd_batch { - std::vector pos; - std::vector n_seq_id; - std::vector seq_id_0; - std::vector seq_ids; - std::vector logits; - llama_batch batch; - llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); - logits .resize(n_tokens); - seq_id_0.resize(1); - seq_id_0[0] = seq_id; - seq_ids [n_tokens] = nullptr; - batch = { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), - /*logits =*/ logits.data(), - }; - for (int i = 0; i < n_tokens; i++) { - batch.pos [i] = pos_0 + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = false; - } - } -}; - struct llama_server_context { llama_model *model = nullptr; llama_context *ctx = nullptr; const llama_vocab * vocab = nullptr; - clip_ctx *clp_ctx = nullptr; + mtmd_context *mctx = nullptr; gpt_params params; @@ -491,11 +463,6 @@ struct llama_server_context if (!params.mmproj.path.empty()) { multimodal = true; LOG_INFO("Multi Modal Mode Enabled", {}); - clp_ctx = clip_model_load(params.mmproj.path.c_str(), /*verbosity=*/ 1); - if(clp_ctx == nullptr) { - LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str()); - return false; - } if (params.n_ctx < 2048) { // request larger context for the image embedding params.n_ctx = 2048; @@ -512,10 +479,24 @@ struct llama_server_context } if (multimodal) { - const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); - const int n_embd_llm = llama_model_n_embd(model); - if (n_embd_clip != n_embd_llm) { - LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm); + // mtmd_init_from_file requires the already-loaded text model, so it must + // run AFTER llama_init_from_gpt_params. It validates the projector + // against the model internally and returns nullptr on dim mismatch, so + // the explicit clip_n_mmproj_embd check is no longer needed. + mtmd_context_params mparams = mtmd_context_params_default(); + mparams.use_gpu = params.mmproj_use_gpu; + mparams.print_timings = false; + mparams.n_threads = params.n_threads_mtmd != -1 ? params.n_threads_mtmd + : params.n_threads_batch != -1 ? params.n_threads_batch + : params.n_threads; + mparams.verbosity = GGML_LOG_LEVEL_INFO; + mparams.flash_attn_type = params.flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED + : LLAMA_FLASH_ATTN_TYPE_DISABLED; + mparams.image_min_tokens = params.image_min_tokens; + mparams.image_max_tokens = params.image_max_tokens; + mctx = mtmd_init_from_file(params.mmproj.path.c_str(), model, mparams); + if (mctx == nullptr) { + LOG_ERR("unable to load multimodal projector: %s", params.mmproj.path.c_str()); llama_free(ctx); llama_free_model(model); return false; @@ -865,8 +846,8 @@ struct llama_server_context slot_image img_sl; img_sl.id = img.count("id") != 0 ? img["id"].get() : slot->images.size(); - img_sl.img_data = clip_image_u8_init(); - if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data)) + img_sl.bitmap = mtmd_helper_bitmap_init_from_buf(mctx, image_buffer.data(), image_buffer.size()); + if (img_sl.bitmap == nullptr) { LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d", __func__, @@ -879,50 +860,74 @@ struct llama_server_context {"slot_id", slot->id}, {"img_sl_id", img_sl.id} }); - img_sl.request_encode_image = true; slot->images.push_back(img_sl); } - // process prompt - // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]} + // Translate the legacy [img-N] tags into mtmd media markers, in + // order, and collect the matching bitmaps in marker order so they + // line up with the markers passed to mtmd_tokenize(). The text after + // the last image stays in input_suffix and is decoded through the + // normal token path, so the sampling loop is unchanged. + // example: system prompt [img-102] user [img-103] describe [img-134] if (slot->images.size() > 0 && !slot->prompt.is_array()) { + const std::string marker = mtmd_default_marker(); std::string prompt = slot->prompt.get(); - size_t pos = 0, begin_prefix = 0; + std::string built_prompt; + std::vector ordered; + size_t pos = 0, copy_from = 0; std::string pattern = "[img-"; - while ((pos = prompt.find(pattern, pos)) != std::string::npos) { - size_t end_prefix = pos; - pos += pattern.length(); - size_t end_pos = prompt.find(']', pos); - if (end_pos != std::string::npos) - { - std::string image_id = prompt.substr(pos, end_pos - pos); - try - { - int img_id = std::stoi(image_id); - bool found = false; - for (slot_image &img : slot->images) - { - if (img.id == img_id) { - found = true; - img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix); - begin_prefix = end_pos + 1; - break; - } - } - if (!found) { - LOG("ERROR: Image with id: %i, not found.\n", img_id); - slot->images.clear(); - return false; - } - } catch (const std::invalid_argument& e) { - LOG("Invalid image number id in prompt\n"); - slot->images.clear(); - return false; + + auto free_images = [&]() { + for (slot_image &img : slot->images) { + if (img.bitmap) { + mtmd_bitmap_free(img.bitmap); + img.bitmap = nullptr; } } + slot->images.clear(); + }; + + while ((pos = prompt.find(pattern, pos)) != std::string::npos) { + size_t tag_begin = pos; + pos += pattern.length(); + size_t end_pos = prompt.find(']', pos); + if (end_pos == std::string::npos) { + break; + } + std::string image_id = prompt.substr(pos, end_pos - pos); + try + { + int img_id = std::stoi(image_id); + bool found = false; + for (slot_image &img : slot->images) + { + if (img.id == img_id) { + found = true; + // text before this tag, then the media marker + built_prompt += prompt.substr(copy_from, tag_begin - copy_from); + built_prompt += marker; + copy_from = end_pos + 1; + ordered.push_back(img); + break; + } + } + if (!found) { + LOG("ERROR: Image with id: %i, not found.\n", img_id); + free_images(); + return false; + } + } catch (const std::invalid_argument& e) { + LOG("Invalid image number id in prompt\n"); + free_images(); + return false; + } + pos = end_pos + 1; } + // bitmaps are consumed in marker order by mtmd_tokenize() + slot->images = ordered; + slot->mtmd_prompt = built_prompt; slot->prompt = ""; - slot->params.input_suffix = prompt.substr(begin_prefix); + slot->params.input_suffix = prompt.substr(copy_from); slot->params.cache_prompt = false; // multimodal doesn't support cache prompt } } @@ -1176,21 +1181,10 @@ struct llama_server_context bool process_images(llama_client_slot &slot) const { - for (slot_image &img : slot.images) - { - if (!img.request_encode_image) - { - continue; - } - - if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) { - LOG("Error processing the given image"); - return false; - } - - img.request_encode_image = false; - } - + // With the mtmd pipeline, image encoding is no longer eager: the bitmaps + // are tokenized and encoded together with the surrounding text inside + // ingest_images() via mtmd_tokenize() + mtmd_helper_eval_chunks(). This + // just reports whether the slot carries any images to process. return slot.images.size() > 0; } @@ -1435,69 +1429,70 @@ struct llama_server_context } } - // for multiple images processing + // Tokenize the multimodal prompt (text interleaved with media markers) together + // with the slot's bitmaps, then decode the resulting chunks into the llama + // context via the high-level mtmd helper. The helper runs llama_decode() on the + // text chunks and mtmd_encode() + llama_decode() on the image chunks, handling + // batching and any pre/post decode setup (e.g. non-causal attention for gemma3). + // Advances slot.n_past by the number of positions consumed, then leaves the + // post-image suffix tokens in `batch` so the normal decode + sampling loop + // produces the first generated token. bool ingest_images(llama_client_slot &slot, int n_batch) { - int image_idx = 0; - - while (image_idx < (int) slot.images.size()) + if (mctx == nullptr) { - slot_image &img = slot.images[image_idx]; + LOG("%s : multimodal context is not initialized\n", __func__); + return false; + } - // process prefix prompt - for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) - { - const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - }; - if (llama_decode(ctx, batch_view)) - { - LOG("%s : failed to eval\n", __func__); - return false; - } - } + // bitmaps stay owned by slot.images (freed on reset()); pass non-owning ptrs + std::vector bitmaps; + bitmaps.reserve(slot.images.size()); + for (const slot_image &img : slot.images) + { + bitmaps.push_back(img.bitmap); + } - // process image with llm - for (int i = 0; i < img.image_tokens; i += n_batch) - { - int n_eval = img.image_tokens - i; - if (n_eval > n_batch) - { - n_eval = n_batch; - } + mtmd_input_text inp_txt; + inp_txt.text = slot.mtmd_prompt.c_str(); + inp_txt.add_special = add_bos_token; + inp_txt.parse_special = true; - const int n_embd = llama_model_n_embd(model); - float * embd = img.image_embedding + i * n_embd; - llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0); - if (llama_decode(ctx, llava_batch.batch)) - { - LOG("%s : failed to eval image\n", __func__); - return false; - } - slot.n_past += n_eval; - } - image_idx++; + mtmd::input_chunks chunks(mtmd_input_chunks_init()); + int32_t res = mtmd_tokenize(mctx, + chunks.ptr.get(), + &inp_txt, + bitmaps.data(), + bitmaps.size()); + if (res != 0) + { + LOG("%s : failed to tokenize multimodal prompt, res = %d\n", __func__, res); + return false; + } - common_batch_clear(batch); + const llama_pos start_pos = (llama_pos) system_tokens.size() + slot.n_past; + llama_pos new_n_past = start_pos; + if (mtmd_helper_eval_chunks(mctx, + ctx, + chunks.ptr.get(), + start_pos, + slot.id, + n_batch, + /*logits_last=*/ false, + &new_n_past) != 0) + { + LOG("%s : failed to eval multimodal chunks\n", __func__); + return false; + } + slot.n_past += (int32_t) (new_n_past - start_pos); - // append prefix of next image - const auto json_prompt = (image_idx >= (int) slot.images.size()) ? - slot.params.input_suffix : // no more images, then process suffix prompt - (json)(slot.images[image_idx].prefix_prompt); - - std::vector append_tokens = tokenize(json_prompt, false); // has next image - for (int i = 0; i < (int) append_tokens.size(); ++i) - { - common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true); - slot.n_past += 1; - } + // queue the post-image suffix text for the normal decode + sampling path + common_batch_clear(batch); + std::vector suffix_tokens = tokenize(slot.params.input_suffix, false); + for (llama_token tok : suffix_tokens) + { + common_batch_add(batch, tok, system_tokens.size() + slot.n_past, { slot.id }, false); + slot.n_past += 1; } return true; @@ -1884,8 +1879,11 @@ struct llama_server_context const bool has_images = process_images(slot); - // process the prefix of first image - std::vector prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens; + // For the multimodal path the whole pre-image / inter-image text is + // tokenized and decoded inside ingest_images() via mtmd, so no prefix + // tokens are queued here; the post-image suffix is appended by + // ingest_images() for the normal decode + sampling loop. + std::vector prefix_tokens = has_images ? std::vector() : prompt_tokens; int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; diff --git a/backend/cpp/ik-llama-cpp/patches/0002-clip-ggml-quantize-chunk-user-data.patch b/backend/cpp/ik-llama-cpp/patches/0002-clip-ggml-quantize-chunk-user-data.patch deleted file mode 100644 index 5724f4d06..000000000 --- a/backend/cpp/ik-llama-cpp/patches/0002-clip-ggml-quantize-chunk-user-data.patch +++ /dev/null @@ -1,11 +0,0 @@ ---- a/examples/llava/clip.cpp -+++ b/examples/llava/clip.cpp -@@ -2494,7 +2494,7 @@ - } - new_data = work.data(); - -- new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr); -+ new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr, nullptr); - } else { - new_type = cur->type; - new_data = cur->data; diff --git a/backend/cpp/ik-llama-cpp/prepare.sh b/backend/cpp/ik-llama-cpp/prepare.sh index fb0ba7624..b6c03c0f9 100644 --- a/backend/cpp/ik-llama-cpp/prepare.sh +++ b/backend/cpp/ik-llama-cpp/prepare.sh @@ -17,28 +17,9 @@ cp -r grpc-server.cpp llama.cpp/examples/grpc-server/ cp -r utils.hpp llama.cpp/examples/grpc-server/ cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/examples/grpc-server/ -## Copy clip/llava files for multimodal support (built as myclip library) -cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h -cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp -cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp -# Prepend llama.h include to llava.h -echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h -cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h -# Copy clip-impl.h if it exists -if [ -f llama.cpp/examples/llava/clip-impl.h ]; then - cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h -fi -# Copy stb_image.h -if [ -f llama.cpp/vendor/stb/stb_image.h ]; then - cp -rfv llama.cpp/vendor/stb/stb_image.h llama.cpp/examples/grpc-server/stb_image.h -elif [ -f llama.cpp/common/stb_image.h ]; then - cp -rfv llama.cpp/common/stb_image.h llama.cpp/examples/grpc-server/stb_image.h -fi - -## Fix API compatibility in llava.cpp (llama_n_embd -> llama_model_n_embd) -if [ -f llama.cpp/examples/grpc-server/llava.cpp ]; then - sed -i 's/llama_n_embd(/llama_model_n_embd(/g' llama.cpp/examples/grpc-server/llava.cpp -fi +## Multimodal support is provided by the `mtmd` library target (examples/mtmd/), +## which the grpc-server links and includes directly. No source copy is needed: +## clip/llava were pruned upstream and the high-level mtmd_* API is used instead. set +e if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then diff --git a/backend/cpp/ik-llama-cpp/utils.hpp b/backend/cpp/ik-llama-cpp/utils.hpp index e5cf2a009..4427d4b91 100644 --- a/backend/cpp/ik-llama-cpp/utils.hpp +++ b/backend/cpp/ik-llama-cpp/utils.hpp @@ -11,7 +11,7 @@ #include "json.hpp" -#include "clip.h" +#include "mtmd.h" using json = nlohmann::json; @@ -111,13 +111,12 @@ struct slot_image { int32_t id; - bool request_encode_image = false; - float * image_embedding = nullptr; - int32_t image_tokens = 0; - - clip_image_u8 * img_data; - - std::string prefix_prompt; // before of this image + // mtmd bitmap (image/audio) decoded from the request buffer. Owned by the + // slot; freed via mtmd_bitmap_free() on reset. The high-level mtmd pipeline + // (mtmd_tokenize + mtmd_helper_eval_chunks) consumes these directly, so the + // legacy eager-encode fields (embedding/tokens) and per-image prefix prompt + // are no longer needed. + mtmd_bitmap * bitmap = nullptr; }; // completion token output with probabilities