Mirror of https://github.com/mudler/LocalAI.git (synced 2025-12-23 22:49:10 -05:00)
chore(deps): Bump llama.cpp to '5b6c9bc0f3c8f55598b9999b65aff7ce4119bc15' and refactor usage of base params (#7706)
* chore(deps): Bump llama.cpp to '5b6c9bc0f3c8f55598b9999b65aff7ce4119bc15' and refactor usage of base params

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore: update AGENTS.md

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Committed by: GitHub
Parent: 96d3f0ebc8
Commit: 0a168830ea

AGENTS.md: 32 lines changed
AGENTS.md
@@ -45,3 +45,35 @@ trim_trailing_whitespace = false
 # Logging
 
 Use `github.com/mudler/xlog` for logging which has the same API as slog.
+
+# llama.cpp Backend
+
+The llama.cpp backend (`backend/cpp/llama-cpp/grpc-server.cpp`) is a gRPC adaptation of the upstream HTTP server (`llama.cpp/tools/server/server.cpp`). It uses the same underlying server infrastructure from `llama.cpp/tools/server/server-context.cpp`.
+
+## Building and Testing
+
+- Test llama.cpp backend compilation: `make backends/llama-cpp`
+- The backend is built as part of the main build process
+- Check `backend/cpp/llama-cpp/Makefile` for build configuration
+
+## Architecture
+
+- **grpc-server.cpp**: gRPC server implementation, adapts HTTP server patterns to gRPC
+- Uses shared server infrastructure: `server-context.cpp`, `server-task.cpp`, `server-queue.cpp`, `server-common.cpp`
+- The gRPC server mirrors the HTTP server's functionality but uses gRPC instead of HTTP
+
+## Common Issues When Updating llama.cpp
+
+When fixing compilation errors after upstream changes:
+1. Check how `server.cpp` (HTTP server) handles the same change
+2. Look for new public APIs or getter methods
+3. Store copies of needed data instead of accessing private members
+4. Update function calls to match new signatures
+5. Test with `make backends/llama-cpp`
+
+## Key Differences from HTTP Server
+
+- gRPC uses `BackendServiceImpl` class with gRPC service methods
+- HTTP server uses `server_routes` with HTTP handlers
+- Both use the same `server_context` and task queue infrastructure
+- gRPC methods: `LoadModel`, `Predict`, `PredictStream`, `Embedding`, `Rerank`, `TokenizeString`, `GetMetrics`, `Health`
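For orientation only, and not part of this commit, a stripped-down sketch of the service shape the Architecture and Key Differences sections describe. It assumes the classes generated from LocalAI's backend.proto (`backend::Backend::Service`, `backend::HealthMessage`, `backend::Reply`) and a generated header named `backend.grpc.pb.h`; the real `BackendServiceImpl` additionally holds a `server_context&` and implements the full method list.

```cpp
// Illustrative sketch, not the actual grpc-server.cpp: a gRPC service with the
// same shape as BackendServiceImpl, trimmed down to the Health RPC.
#include <memory>
#include <grpcpp/grpcpp.h>
#include "backend.grpc.pb.h"  // assumed name of the header generated from backend.proto

class MinimalBackendService final : public backend::Backend::Service {
public:
    // Mirrors the real Health handler: stateless, always replies "OK".
    grpc::Status Health(grpc::ServerContext* /*context*/,
                        const backend::HealthMessage* /*request*/,
                        backend::Reply* reply) override {
        reply->set_message("OK");
        return grpc::Status::OK;
    }
    // LoadModel, Predict, PredictStream, Embedding, Rerank, TokenizeString and
    // GetMetrics would be added the same way, delegating to server_context.
};

int main() {
    MinimalBackendService service;
    grpc::ServerBuilder builder;
    builder.AddListeningPort("127.0.0.1:50051", grpc::InsecureServerCredentials());  // address is arbitrary
    builder.RegisterService(&service);
    std::unique_ptr<grpc::Server> server = builder.BuildAndStart();
    server->Wait();
    return 0;
}
```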
backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
-LLAMA_VERSION?=0e1ccf15c7b6d05c720551b537857ecf6194d420
+LLAMA_VERSION?=5b6c9bc0f3c8f55598b9999b65aff7ce4119bc15
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
backend/cpp/llama-cpp/grpc-server.cpp
@@ -351,7 +351,7 @@ static void add_rpc_devices(std::string servers) {
     }
 }
 
-static void params_parse(server_context& ctx_server, const backend::ModelOptions* request,
+static void params_parse(server_context& /*ctx_server*/, const backend::ModelOptions* request,
                          common_params & params) {
 
     // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
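The only change in this hunk is commenting out the name of the now-unused `ctx_server` parameter. As an aside, a minimal sketch of that idiom (the `scale` function and its parameters are made up for illustration): keeping the parameter in the signature leaves every caller untouched, while dropping just the name keeps `-Wunused-parameter` quiet.

```cpp
// Sketch of the unused-parameter idiom applied throughout this commit.
// `scale` and its parameters are hypothetical; only the idiom matters.
static int scale(int /*flags_no_longer_used*/, int value) {
    return value * 2;  // the first argument is accepted but intentionally ignored
}

// Equivalent alternatives, if the name should stay visible:
//   static int scale([[maybe_unused]] int flags, int value) { return value * 2; }
//   static int scale(int flags, int value) { (void)flags; return value * 2; }

int main() { return scale(0, 21) == 42 ? 0 : 1; }
```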
@@ -683,18 +683,18 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
 class BackendServiceImpl final : public backend::Backend::Service {
 private:
     server_context& ctx_server;
-    const common_params* params_base_ptr; // Store pointer to params_base, set after model load
+    common_params params_base; // Store copy of params_base, set after model load
 
 public:
-    BackendServiceImpl(server_context& ctx) : ctx_server(ctx), params_base_ptr(nullptr) {}
+    BackendServiceImpl(server_context& ctx) : ctx_server(ctx) {}
 
-    grpc::Status Health(ServerContext* context, const backend::HealthMessage* request, backend::Reply* reply) {
+    grpc::Status Health(ServerContext* /*context*/, const backend::HealthMessage* /*request*/, backend::Reply* reply) {
         // Implement Health RPC
         reply->set_message("OK");
         return Status::OK;
     }
 
-    grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
+    grpc::Status LoadModel(ServerContext* /*context*/, const backend::ModelOptions* request, backend::Result* result) {
         // Implement LoadModel RPC
         common_params params;
         params_parse(ctx_server, request, params);
@@ -745,19 +745,16 @@ public:
                     processed_triggers.push_back(trigger);
                 }
             }
-            // Update the grammar triggers in params_base
-            ctx_server.impl->params_base.sampling.grammar_triggers = std::move(processed_triggers);
-            // Also update preserved_tokens in params_base
-            ctx_server.impl->params_base.sampling.preserved_tokens = params.sampling.preserved_tokens;
+            // Update the grammar triggers in params
+            params.sampling.grammar_triggers = std::move(processed_triggers);
         }
 
         //ctx_server.init();
         result->set_message("Loading succeeded");
         result->set_success(true);
         loaded_model = true;
         ctx_server.impl->slot_prompt_similarity = params.slot_prompt_similarity;
-        // Store pointer to params_base for use in parse_options
-        params_base_ptr = &ctx_server.impl->params_base;
+        // Store copy of params_base for use in parse_options and other methods
+        params_base = params;
 
         return Status::OK;
     }
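The two hunks above carry the core of the refactor: instead of holding `params_base_ptr` into `server_context`'s now-private `params_base`, `LoadModel` keeps its own `common_params` copy, and later handlers infer "model loaded" from that copy. A minimal sketch of the pattern, using stand-in types rather than the real llama.cpp structs:

```cpp
// Stand-in types for illustration; common_params and server_context in
// llama.cpp are far richer than this.
#include <string>

struct stub_params {                       // plays the role of common_params
    struct { std::string path; } model;
    bool embedding = false;
};

class StubService {
    stub_params params_base;               // value copy owned by the service

public:
    // Before the refactor the service held `const stub_params* params_base_ptr`
    // pointing into another object's private member and guarded with
    // `if (!params_base_ptr)`.
    void load_model(const stub_params& parsed) {
        params_base = parsed;              // keep our own copy for later requests
    }

    // After the refactor, "is a model loaded?" falls out of the copy itself.
    bool model_loaded() const {
        return !params_base.model.path.empty();
    }
};

int main() {
    StubService svc;
    stub_params parsed;
    parsed.model.path = "/models/example.gguf";  // hypothetical path
    svc.load_model(parsed);
    return svc.model_loaded() ? 0 : 1;
}
```

The cost is one copy of the parsed parameters per `LoadModel` call; the gain is following the AGENTS.md guidance of storing copies of needed data instead of reaching into private members that upstream no longer exposes.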
@@ -785,14 +782,14 @@ public:
     }
 
     grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter<backend::Reply>* writer) override {
-        if (!params_base_ptr) {
+        if (params_base.model.path.empty()) {
             return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
         }
-        json data = parse_options(true, request, *params_base_ptr, ctx_server.get_llama_context());
+        json data = parse_options(true, request, params_base, ctx_server.get_llama_context());
 
 
         //Raise error if embeddings is set to true
-        if (ctx_server.impl->params_base.embedding) {
+        if (params_base.embedding) {
             return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Embedding is not supported in streaming mode");
         }
 
@@ -1332,8 +1329,9 @@ public:
                 task.tokens = std::move(inputs[i]);
                 task.params = server_task::params_from_json_cmpl(
                     ctx_server.get_llama_context(),
-                    ctx_server.impl->params_base,
-                    ctx_server.impl->vocab,
+                    params_base,
+                    ctx_server.get_meta().slot_n_ctx,
                     data);
                 task.id_slot = json_value(data, "id_slot", -1);
 
@@ -1497,14 +1495,14 @@ public:
     }
 
     grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
-        if (!params_base_ptr) {
+        if (params_base.model.path.empty()) {
             return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
         }
-        json data = parse_options(true, request, *params_base_ptr, ctx_server.get_llama_context());
+        json data = parse_options(true, request, params_base, ctx_server.get_llama_context());
 
         data["stream"] = false;
         //Raise error if embeddings is set to true
-        if (ctx_server.impl->params_base.embedding) {
+        if (params_base.embedding) {
             return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Embedding is not supported in Predict mode");
         }
         std::cout << "[PREDICT] Received result: " << data.dump(2) << std::endl;
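To see the guard above from a client's point of view, a hypothetical probe built on the stub that gRPC codegen would produce for this service; the header name, the address, and the bare default `PredictOptions` are assumptions for illustration, not part of the commit.

```cpp
// Hypothetical client probe: calling Predict before LoadModel should surface
// the "Model not loaded" guard as grpc::StatusCode::FAILED_PRECONDITION.
#include <iostream>
#include <memory>
#include <grpcpp/grpcpp.h>
#include "backend.grpc.pb.h"  // assumed generated header name

int main() {
    auto channel = grpc::CreateChannel("127.0.0.1:50051", grpc::InsecureChannelCredentials());
    std::unique_ptr<backend::Backend::Stub> stub = backend::Backend::NewStub(channel);

    grpc::ClientContext ctx;
    backend::PredictOptions request;  // deliberately left at defaults
    backend::Reply reply;

    grpc::Status status = stub->Predict(&ctx, request, &reply);
    if (status.error_code() == grpc::StatusCode::FAILED_PRECONDITION) {
        std::cout << "no model loaded yet: " << status.error_message() << std::endl;
    }
    return 0;
}
```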
@@ -2070,8 +2068,9 @@ public:
                 task.tokens = std::move(inputs[i]);
                 task.params = server_task::params_from_json_cmpl(
                     ctx_server.get_llama_context(),
-                    ctx_server.impl->params_base,
-                    ctx_server.impl->vocab,
+                    params_base,
+                    ctx_server.get_meta().slot_n_ctx,
                     data);
                 task.id_slot = json_value(data, "id_slot", -1);
 
@@ -2167,10 +2166,10 @@ public:
     }
 
     grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) {
-        if (!params_base_ptr) {
+        if (params_base.model.path.empty()) {
             return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
         }
-        json body = parse_options(false, request, *params_base_ptr, ctx_server.get_llama_context());
+        json body = parse_options(false, request, params_base, ctx_server.get_llama_context());
 
         body["stream"] = false;
 
@@ -2262,7 +2261,7 @@ public:
     }
 
     grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) {
-        if (!ctx_server.impl->params_base.embedding || ctx_server.impl->params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
+        if (!params_base.embedding || params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
             return grpc::Status(grpc::StatusCode::UNIMPLEMENTED, "This server does not support reranking. Start it with `--reranking` and without `--embedding`");
         }
 
@@ -2347,11 +2346,11 @@ public:
         return grpc::Status::OK;
     }
 
-    grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response) {
-        if (!params_base_ptr) {
+    grpc::Status TokenizeString(ServerContext* /*context*/, const backend::PredictOptions* request, backend::TokenizationResponse* response) {
+        if (params_base.model.path.empty()) {
             return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
         }
-        json body = parse_options(false, request, *params_base_ptr, ctx_server.get_llama_context());
+        json body = parse_options(false, request, params_base, ctx_server.get_llama_context());
         body["stream"] = false;
 
         json tokens_response = json::array();
@@ -2370,7 +2369,7 @@ public:
         return grpc::Status::OK;
     }
 
-    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
+    grpc::Status GetMetrics(ServerContext* /*context*/, const backend::MetricsRequest* /*request*/, backend::MetricsResponse* response) {
 
         // request slots data using task queue
         auto rd = ctx_server.get_response_reader();