chore(deps): Bump llama.cpp to '5b6c9bc0f3c8f55598b9999b65aff7ce4119bc15' and refactor usage of base params (#7706)

* chore(deps): Bump llama.cpp to '5b6c9bc0f3c8f55598b9999b65aff7ce4119bc15' and refactor usage of base params

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore: update AGENTS.md

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Author: Ettore Di Giacinto
Committed by: GitHub
Date: 2025-12-24 00:28:27 +01:00
Parent: 96d3f0ebc8
Commit: 0a168830ea
3 changed files with 61 additions and 30 deletions

AGENTS.md

@@ -45,3 +45,35 @@ trim_trailing_whitespace = false
 # Logging
 Use `github.com/mudler/xlog` for logging which has the same API as slog.
+# llama.cpp Backend
+The llama.cpp backend (`backend/cpp/llama-cpp/grpc-server.cpp`) is a gRPC adaptation of the upstream HTTP server (`llama.cpp/tools/server/server.cpp`). It uses the same underlying server infrastructure from `llama.cpp/tools/server/server-context.cpp`.
+## Building and Testing
+- Test llama.cpp backend compilation: `make backends/llama-cpp`
+- The backend is built as part of the main build process
+- Check `backend/cpp/llama-cpp/Makefile` for build configuration
+## Architecture
+- **grpc-server.cpp**: gRPC server implementation, adapts HTTP server patterns to gRPC
+- Uses shared server infrastructure: `server-context.cpp`, `server-task.cpp`, `server-queue.cpp`, `server-common.cpp`
+- The gRPC server mirrors the HTTP server's functionality but uses gRPC instead of HTTP
+## Common Issues When Updating llama.cpp
+When fixing compilation errors after upstream changes:
+1. Check how `server.cpp` (HTTP server) handles the same change
+2. Look for new public APIs or getter methods
+3. Store copies of needed data instead of accessing private members
+4. Update function calls to match new signatures
+5. Test with `make backends/llama-cpp`
+## Key Differences from HTTP Server
+- gRPC uses `BackendServiceImpl` class with gRPC service methods
+- HTTP server uses `server_routes` with HTTP handlers
+- Both use the same `server_context` and task queue infrastructure
+- gRPC methods: `LoadModel`, `Predict`, `PredictStream`, `Embedding`, `Rerank`, `TokenizeString`, `GetMetrics`, `Health`
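Point 3 above ("store copies of needed data instead of accessing private members") is the pattern the grpc-server.cpp changes below apply. A minimal sketch of the idea, using hypothetical stand-in types rather than the real llama.cpp/LocalAI ones:

```cpp
#include <string>

// Hypothetical stand-ins: "engine" plays the role of server_context, whose
// internals (impl, params_base) are private upstream and should not be reached into.
struct engine_params {
    std::string model_path;
    bool embedding = false;
};

class engine {
    engine_params params_;                 // private upstream state
public:
    void load(const engine_params& p) { params_ = p; }
};

class grpc_adapter {                       // plays the role of BackendServiceImpl
    engine& eng_;
    engine_params params_copy_;            // adapter's own copy, captured at load time
public:
    explicit grpc_adapter(engine& e) : eng_(e) {}

    void load_model(const engine_params& p) {
        eng_.load(p);
        params_copy_ = p;                  // copy instead of a pointer into eng_'s internals
    }

    // Later calls read the copy, so no private upstream member is touched.
    bool model_loaded() const { return !params_copy_.model_path.empty(); }
};
```

The copy stays valid however upstream reorganizes its private state, which is what keeps the backend resilient to llama.cpp refactors.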

backend/cpp/llama-cpp/Makefile

@@ -1,5 +1,5 @@
-LLAMA_VERSION?=0e1ccf15c7b6d05c720551b537857ecf6194d420
+LLAMA_VERSION?=5b6c9bc0f3c8f55598b9999b65aff7ce4119bc15
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 CMAKE_ARGS?=

backend/cpp/llama-cpp/grpc-server.cpp

@@ -351,7 +351,7 @@ static void add_rpc_devices(std::string servers) {
 }
 }
-static void params_parse(server_context& ctx_server, const backend::ModelOptions* request,
+static void params_parse(server_context& /*ctx_server*/, const backend::ModelOptions* request,
 common_params & params) {
 // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
@@ -683,18 +683,18 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
 class BackendServiceImpl final : public backend::Backend::Service {
 private:
 server_context& ctx_server;
-const common_params* params_base_ptr; // Store pointer to params_base, set after model load
+common_params params_base; // Store copy of params_base, set after model load
 public:
-BackendServiceImpl(server_context& ctx) : ctx_server(ctx), params_base_ptr(nullptr) {}
+BackendServiceImpl(server_context& ctx) : ctx_server(ctx) {}
-grpc::Status Health(ServerContext* context, const backend::HealthMessage* request, backend::Reply* reply) {
+grpc::Status Health(ServerContext* /*context*/, const backend::HealthMessage* /*request*/, backend::Reply* reply) {
 // Implement Health RPC
 reply->set_message("OK");
 return Status::OK;
 }
-grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
+grpc::Status LoadModel(ServerContext* /*context*/, const backend::ModelOptions* request, backend::Result* result) {
 // Implement LoadModel RPC
 common_params params;
 params_parse(ctx_server, request, params);
@@ -745,19 +745,16 @@ public:
 processed_triggers.push_back(trigger);
 }
 }
-// Update the grammar triggers in params_base
-ctx_server.impl->params_base.sampling.grammar_triggers = std::move(processed_triggers);
-// Also update preserved_tokens in params_base
-ctx_server.impl->params_base.sampling.preserved_tokens = params.sampling.preserved_tokens;
+// Update the grammar triggers in params
+params.sampling.grammar_triggers = std::move(processed_triggers);
 }
 //ctx_server.init();
 result->set_message("Loading succeeded");
 result->set_success(true);
 loaded_model = true;
-ctx_server.impl->slot_prompt_similarity = params.slot_prompt_similarity;
-// Store pointer to params_base for use in parse_options
-params_base_ptr = &ctx_server.impl->params_base;
+// Store copy of params_base for use in parse_options and other methods
+params_base = params;
 return Status::OK;
 }
@@ -785,14 +782,14 @@ public:
 }
 grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter<backend::Reply>* writer) override {
-if (!params_base_ptr) {
+if (params_base.model.path.empty()) {
 return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
 }
-json data = parse_options(true, request, *params_base_ptr, ctx_server.get_llama_context());
+json data = parse_options(true, request, params_base, ctx_server.get_llama_context());
 //Raise error if embeddings is set to true
-if (ctx_server.impl->params_base.embedding) {
+if (params_base.embedding) {
 return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Embedding is not supported in streaming mode");
 }
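The hunk above (and the matching ones below for Predict, Embedding and TokenizeString) replaces the old null-pointer test with an emptiness check on the copied params. If the repeated guard were ever factored out, it could look roughly like this hypothetical helper (not part of this commit; only `grpc::Status`, `FAILED_PRECONDITION` and the model-path check are taken from the diff itself):

```cpp
#include <grpcpp/grpcpp.h>
#include <string>

// Hypothetical helper: after the refactor there is no params_base_ptr to test,
// so "a model path was recorded at load time" stands in for "LoadModel succeeded".
static grpc::Status require_loaded(const std::string& model_path) {
    if (model_path.empty()) {
        return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
    }
    return grpc::Status::OK;
}

// Possible use at the top of each RPC method:
//   if (auto s = require_loaded(params_base.model.path); !s.ok()) return s;
```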
@@ -1332,8 +1329,9 @@ public:
 task.tokens = std::move(inputs[i]);
 task.params = server_task::params_from_json_cmpl(
 ctx_server.get_llama_context(),
-ctx_server.impl->params_base,
-ctx_server.impl->vocab,
+params_base,
+ctx_server.get_meta().slot_n_ctx,
 data);
 task.id_slot = json_value(data, "id_slot", -1);
@@ -1497,14 +1495,14 @@ public:
 }
 grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
-if (!params_base_ptr) {
+if (params_base.model.path.empty()) {
 return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
 }
-json data = parse_options(true, request, *params_base_ptr, ctx_server.get_llama_context());
+json data = parse_options(true, request, params_base, ctx_server.get_llama_context());
 data["stream"] = false;
 //Raise error if embeddings is set to true
-if (ctx_server.impl->params_base.embedding) {
+if (params_base.embedding) {
 return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Embedding is not supported in Predict mode");
 }
 std::cout << "[PREDICT] Received result: " << data.dump(2) << std::endl;
@@ -2070,8 +2068,9 @@ public:
 task.tokens = std::move(inputs[i]);
 task.params = server_task::params_from_json_cmpl(
 ctx_server.get_llama_context(),
-ctx_server.impl->params_base,
-ctx_server.impl->vocab,
+params_base,
+ctx_server.get_meta().slot_n_ctx,
 data);
 task.id_slot = json_value(data, "id_slot", -1);
@@ -2167,10 +2166,10 @@ public:
 }
 grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) {
-if (!params_base_ptr) {
+if (params_base.model.path.empty()) {
 return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
 }
-json body = parse_options(false, request, *params_base_ptr, ctx_server.get_llama_context());
+json body = parse_options(false, request, params_base, ctx_server.get_llama_context());
 body["stream"] = false;
@@ -2262,7 +2261,7 @@ public:
 }
 grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) {
-if (!ctx_server.impl->params_base.embedding || ctx_server.impl->params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
+if (!params_base.embedding || params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
 return grpc::Status(grpc::StatusCode::UNIMPLEMENTED, "This server does not support reranking. Start it with `--reranking` and without `--embedding`");
 }
@@ -2347,11 +2346,11 @@ public:
 return grpc::Status::OK;
 }
-grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response) {
-if (!params_base_ptr) {
+grpc::Status TokenizeString(ServerContext* /*context*/, const backend::PredictOptions* request, backend::TokenizationResponse* response) {
+if (params_base.model.path.empty()) {
 return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
 }
-json body = parse_options(false, request, *params_base_ptr, ctx_server.get_llama_context());
+json body = parse_options(false, request, params_base, ctx_server.get_llama_context());
 body["stream"] = false;
 json tokens_response = json::array();
@@ -2370,7 +2369,7 @@ public:
 return grpc::Status::OK;
 }
-grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
+grpc::Status GetMetrics(ServerContext* /*context*/, const backend::MetricsRequest* /*request*/, backend::MetricsResponse* response) {
 // request slots data using task queue
 auto rd = ctx_server.get_response_reader();
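For orientation on how the gRPC methods listed in AGENTS.md (`LoadModel`, `Predict`, ...) are reached, here is a hypothetical client-side sketch against the `backend.Backend` service; the generated header name, the address, and the omitted message fields are assumptions, not part of this commit:

```cpp
#include <grpcpp/grpcpp.h>
#include "backend.grpc.pb.h"  // assumed name of the generated gRPC header

int main() {
    // Address is illustrative only.
    auto channel = grpc::CreateChannel("localhost:50051", grpc::InsecureChannelCredentials());
    auto stub = backend::Backend::NewStub(channel);

    grpc::ClientContext load_ctx;
    backend::ModelOptions opts;   // model path and other options would be set here (fields not shown in this diff)
    backend::Result load_result;
    grpc::Status s = stub->LoadModel(&load_ctx, opts, &load_result);
    if (!s.ok() || !load_result.success()) return 1;

    grpc::ClientContext predict_ctx;
    backend::PredictOptions req;  // prompt and sampling options would be set here
    backend::Reply reply;
    s = stub->Predict(&predict_ctx, req, &reply);
    return s.ok() ? 0 : 1;
}
```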