From 0a168830ea9b84e6955dcbced9477567eaef878a Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Wed, 24 Dec 2025 00:28:27 +0100
Subject: [PATCH] chore(deps): Bump llama.cpp to '5b6c9bc0f3c8f55598b9999b65aff7ce4119bc15' and refactor usage of base params (#7706)

* chore(deps): Bump llama.cpp to '5b6c9bc0f3c8f55598b9999b65aff7ce4119bc15' and refactor usage of base params

Signed-off-by: Ettore Di Giacinto

* chore: update AGENTS.md

Signed-off-by: Ettore Di Giacinto

---------

Signed-off-by: Ettore Di Giacinto
---
 AGENTS.md                             | 32 +++++++++++++++
 backend/cpp/llama-cpp/Makefile        |  2 +-
 backend/cpp/llama-cpp/grpc-server.cpp | 57 +++++++++++++--------------
 3 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index be78bed39..227e68258 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -45,3 +45,35 @@ trim_trailing_whitespace = false
 # Logging
 
 Use `github.com/mudler/xlog` for logging which has the same API as slog.
+
+# llama.cpp Backend
+
+The llama.cpp backend (`backend/cpp/llama-cpp/grpc-server.cpp`) is a gRPC adaptation of the upstream HTTP server (`llama.cpp/tools/server/server.cpp`). It uses the same underlying server infrastructure from `llama.cpp/tools/server/server-context.cpp`.
+
+## Building and Testing
+
+- Test llama.cpp backend compilation: `make backends/llama-cpp`
+- The backend is built as part of the main build process
+- Check `backend/cpp/llama-cpp/Makefile` for build configuration
+
+## Architecture
+
+- **grpc-server.cpp**: gRPC server implementation, adapts HTTP server patterns to gRPC
+- Uses shared server infrastructure: `server-context.cpp`, `server-task.cpp`, `server-queue.cpp`, `server-common.cpp`
+- The gRPC server mirrors the HTTP server's functionality but uses gRPC instead of HTTP
+
+## Common Issues When Updating llama.cpp
+
+When fixing compilation errors after upstream changes:
+1. Check how `server.cpp` (HTTP server) handles the same change
+2. Look for new public APIs or getter methods
+3. Store copies of needed data instead of accessing private members
+4. Update function calls to match new signatures
+5. Test with `make backends/llama-cpp`
+
+## Key Differences from HTTP Server
+
+- gRPC uses `BackendServiceImpl` class with gRPC service methods
+- HTTP server uses `server_routes` with HTTP handlers
+- Both use the same `server_context` and task queue infrastructure
+- gRPC methods: `LoadModel`, `Predict`, `PredictStream`, `Embedding`, `Rerank`, `TokenizeString`, `GetMetrics`, `Health`
diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index 09496dd39..101676a7a 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
-LLAMA_VERSION?=0e1ccf15c7b6d05c720551b537857ecf6194d420
+LLAMA_VERSION?=5b6c9bc0f3c8f55598b9999b65aff7ce4119bc15
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
 
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index dca3e3ae2..3f33c74bf 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -351,7 +351,7 @@ static void add_rpc_devices(std::string servers) {
     }
 }
 
-static void params_parse(server_context& ctx_server, const backend::ModelOptions* request,
+static void params_parse(server_context& /*ctx_server*/, const backend::ModelOptions* request,
                          common_params & params) {
 
     // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
@@ -683,18 +683,18 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
 class BackendServiceImpl final : public backend::Backend::Service {
 private:
     server_context& ctx_server;
-    const common_params* params_base_ptr; // Store pointer to params_base, set after model load
+    common_params params_base; // Store copy of params_base, set after model load
 
 public:
-    BackendServiceImpl(server_context& ctx) : ctx_server(ctx), params_base_ptr(nullptr) {}
+    BackendServiceImpl(server_context& ctx) : ctx_server(ctx) {}
 
-    grpc::Status Health(ServerContext* context, const backend::HealthMessage* request, backend::Reply* reply) {
+    grpc::Status Health(ServerContext* /*context*/, const backend::HealthMessage* /*request*/, backend::Reply* reply) {
         // Implement Health RPC
         reply->set_message("OK");
         return Status::OK;
     }
 
-    grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
+    grpc::Status LoadModel(ServerContext* /*context*/, const backend::ModelOptions* request, backend::Result* result) {
         // Implement LoadModel RPC
         common_params params;
         params_parse(ctx_server, request, params);
@@ -745,19 +745,16 @@ public:
                     processed_triggers.push_back(trigger);
                 }
             }
-            // Update the grammar triggers in params_base
-            ctx_server.impl->params_base.sampling.grammar_triggers = std::move(processed_triggers);
-            // Also update preserved_tokens in params_base
-            ctx_server.impl->params_base.sampling.preserved_tokens = params.sampling.preserved_tokens;
+            // Update the grammar triggers in params
+            params.sampling.grammar_triggers = std::move(processed_triggers);
         }
 
         //ctx_server.init();
 
         result->set_message("Loading succeeded");
         result->set_success(true);
         loaded_model = true;
-        ctx_server.impl->slot_prompt_similarity = params.slot_prompt_similarity;
-        // Store pointer to params_base for use in parse_options
-        params_base_ptr = &ctx_server.impl->params_base;
+        // Store copy of params_base for use in parse_options and other methods
+        params_base = params;
         return Status::OK;
     }
@@ -785,14 +782,14 @@ public:
     }
 
     grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter<backend::Reply>* writer) override {
-        if (!params_base_ptr) {
+        if (params_base.model.path.empty()) {
             return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
         }
 
-        json data = parse_options(true, request, *params_base_ptr, ctx_server.get_llama_context());
+        json data = parse_options(true, request, params_base, ctx_server.get_llama_context());
 
         //Raise error if embeddings is set to true
-        if (ctx_server.impl->params_base.embedding) {
+        if (params_base.embedding) {
             return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Embedding is not supported in streaming mode");
         }
 
@@ -1332,8 +1329,9 @@ public:
             task.tokens = std::move(inputs[i]);
 
             task.params = server_task::params_from_json_cmpl(
-                    ctx_server.get_llama_context(),
-                    ctx_server.impl->params_base,
+                    ctx_server.impl->vocab,
+                    params_base,
+                    ctx_server.get_meta().slot_n_ctx,
                     data);
 
             task.id_slot = json_value(data, "id_slot", -1);
@@ -1497,14 +1495,14 @@ public:
     }
 
     grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
-        if (!params_base_ptr) {
+        if (params_base.model.path.empty()) {
             return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
         }
 
-        json data = parse_options(true, request, *params_base_ptr, ctx_server.get_llama_context());
+        json data = parse_options(true, request, params_base, ctx_server.get_llama_context());
         data["stream"] = false;
         //Raise error if embeddings is set to true
-        if (ctx_server.impl->params_base.embedding) {
+        if (params_base.embedding) {
             return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Embedding is not supported in Predict mode");
         }
         std::cout << "[PREDICT] Received result: " << data.dump(2) << std::endl;
@@ -2070,8 +2068,9 @@ public:
             task.tokens = std::move(inputs[i]);
 
             task.params = server_task::params_from_json_cmpl(
-                    ctx_server.get_llama_context(),
-                    ctx_server.impl->params_base,
+                    ctx_server.impl->vocab,
+                    params_base,
+                    ctx_server.get_meta().slot_n_ctx,
                     data);
 
             task.id_slot = json_value(data, "id_slot", -1);
@@ -2167,10 +2166,10 @@ public:
     }
 
     grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) {
-        if (!params_base_ptr) {
+        if (params_base.model.path.empty()) {
             return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
         }
-        json body = parse_options(false, request, *params_base_ptr, ctx_server.get_llama_context());
+        json body = parse_options(false, request, params_base, ctx_server.get_llama_context());
 
         body["stream"] = false;
 
@@ -2262,7 +2261,7 @@ public:
     }
 
     grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) {
-        if (!ctx_server.impl->params_base.embedding || ctx_server.impl->params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
+        if (!params_base.embedding || params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
             return grpc::Status(grpc::StatusCode::UNIMPLEMENTED, "This server does not support reranking. Start it with `--reranking` and without `--embedding`");
         }
 
@@ -2347,11 +2346,11 @@ public:
         return grpc::Status::OK;
     }
 
-    grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response) {
-        if (!params_base_ptr) {
+    grpc::Status TokenizeString(ServerContext* /*context*/, const backend::PredictOptions* request, backend::TokenizationResponse* response) {
+        if (params_base.model.path.empty()) {
             return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
         }
-        json body = parse_options(false, request, *params_base_ptr, ctx_server.get_llama_context());
+        json body = parse_options(false, request, params_base, ctx_server.get_llama_context());
 
         body["stream"] = false;
         json tokens_response = json::array();
@@ -2370,7 +2369,7 @@ public:
         return grpc::Status::OK;
    }
 
-    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
+    grpc::Status GetMetrics(ServerContext* /*context*/, const backend::MetricsRequest* /*request*/, backend::MetricsResponse* response) {
         // request slots data using task queue
         auto rd = ctx_server.get_response_reader();
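
The grpc-server.cpp changes above replace the `params_base_ptr` pointer (which reached into `server_context`'s private `params_base`) with a `common_params` copy owned by `BackendServiceImpl`: `LoadModel` stores the parsed params, and the request handlers treat an empty `params_base.model.path` as "model not loaded". Below is a minimal sketch of that ownership pattern only, not the actual backend code; the simplified `Params` struct and `BackendService` class are hypothetical stand-ins for `common_params` and `BackendServiceImpl`.

```cpp
#include <iostream>
#include <string>

// Hypothetical stand-in for common_params: only the fields the sketch touches.
struct Params {
    struct { std::string path; } model;
    bool embedding = false;
};

class BackendService {
public:
    // Keep a private value copy after a successful load instead of holding a
    // pointer into another object's private members (no dangling pointer risk).
    void load(const Params & params) { params_base_ = params; }

    bool can_predict() const {
        // An empty model path doubles as the "model not loaded" check,
        // mirroring params_base.model.path.empty() in grpc-server.cpp.
        if (params_base_.model.path.empty()) {
            return false; // the gRPC server returns FAILED_PRECONDITION here
        }
        return !params_base_.embedding; // prediction is rejected in embedding mode
    }

private:
    Params params_base_; // copy set after model load
};

int main() {
    BackendService svc;
    std::cout << std::boolalpha << svc.can_predict() << "\n"; // false: nothing loaded yet

    Params p;
    p.model.path = "/models/example.gguf"; // hypothetical path, for illustration only
    svc.load(p);
    std::cout << std::boolalpha << svc.can_predict() << "\n"; // true
}
```

The trade-off of a value copy is that it can go stale, which is why the patch finishes adjusting `params` (for example, the processed grammar triggers) before taking the copy with `params_base = params;` at the end of `LoadModel`.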