mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-25 00:59:28 -04:00
Compare commits
4 Commits
master
...
feat/llama
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
28beac9a18 | ||
|
|
977ccd88f0 | ||
|
|
74e6c60045 | ||
|
|
811f0db2e3 |
@@ -37,6 +37,7 @@
|
|||||||
#include "backend.pb.h"
|
#include "backend.pb.h"
|
||||||
#include "backend.grpc.pb.h"
|
#include "backend.grpc.pb.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "arg.h"
|
||||||
#include "chat-auto-parser.h"
|
#include "chat-auto-parser.h"
|
||||||
#include <getopt.h>
|
#include <getopt.h>
|
||||||
#include <grpcpp/ext/proto_server_reflection_plugin.h>
|
#include <grpcpp/ext/proto_server_reflection_plugin.h>
|
||||||
@@ -592,6 +593,10 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
|
|||||||
params.checkpoint_min_step = 256;
|
params.checkpoint_min_step = 256;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Raw upstream llama-server flags collected from any option entry that
|
||||||
|
// starts with '-'. Applied once after the loop via common_params_parse.
|
||||||
|
std::vector<std::string> extra_argv;
|
||||||
|
|
||||||
// decode options. Options are in form optname:optvalue, or if booleans only optname.
|
// decode options. Options are in form optname:optvalue, or if booleans only optname.
|
||||||
for (int i = 0; i < request->options_size(); i++) {
|
for (int i = 0; i < request->options_size(); i++) {
|
||||||
std::string opt = request->options(i);
|
std::string opt = request->options(i);
|
||||||
@@ -1080,6 +1085,31 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
|
|||||||
} catch (...) {}
|
} catch (...) {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- main model MoE on CPU (upstream --cpu-moe / --n-cpu-moe) ---
|
||||||
|
} else if (!strcmp(optname, "cpu_moe")) {
|
||||||
|
// Bool-style flag: keep all MoE expert weights on CPU.
|
||||||
|
const bool enable = (optval == NULL) ||
|
||||||
|
optval_str == "true" || optval_str == "1" || optval_str == "yes" ||
|
||||||
|
optval_str == "on" || optval_str == "enabled";
|
||||||
|
if (enable) {
|
||||||
|
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
|
||||||
|
}
|
||||||
|
} else if (!strcmp(optname, "n_cpu_moe")) {
|
||||||
|
if (optval != NULL) {
|
||||||
|
try {
|
||||||
|
int n = std::stoi(optval_str);
|
||||||
|
if (n < 0) n = 0;
|
||||||
|
// Keep override-name storage alive for the lifetime of the
|
||||||
|
// params struct (mirrors upstream arg.cpp's function-local static).
|
||||||
|
static std::list<std::string> buft_overrides_main;
|
||||||
|
for (int i = 0; i < n; ++i) {
|
||||||
|
buft_overrides_main.push_back(llm_ffn_exps_block_regex(i));
|
||||||
|
params.tensor_buft_overrides.push_back(
|
||||||
|
{buft_overrides_main.back().c_str(), ggml_backend_cpu_buffer_type()});
|
||||||
|
}
|
||||||
|
} catch (...) {}
|
||||||
|
}
|
||||||
|
|
||||||
// --- draft model tensor buffer overrides (upstream --spec-draft-override-tensor) ---
|
// --- draft model tensor buffer overrides (upstream --spec-draft-override-tensor) ---
|
||||||
} else if (!strcmp(optname, "draft_override_tensor") || !strcmp(optname, "spec_draft_override_tensor")) {
|
} else if (!strcmp(optname, "draft_override_tensor") || !strcmp(optname, "spec_draft_override_tensor")) {
|
||||||
// Format: <tensor regex>=<buffer type>,<tensor regex>=<buffer type>,...
|
// Format: <tensor regex>=<buffer type>,<tensor regex>=<buffer type>,...
|
||||||
@@ -1111,6 +1141,30 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
|
|||||||
else { cur.push_back(c); }
|
else { cur.push_back(c); }
|
||||||
}
|
}
|
||||||
if (!cur.empty()) flush(cur);
|
if (!cur.empty()) flush(cur);
|
||||||
|
|
||||||
|
// --- generic passthrough: any entry starting with '-' is a raw
|
||||||
|
// upstream llama-server flag, forwarded verbatim to the parser. ---
|
||||||
|
} else if (optname[0] == '-') {
|
||||||
|
std::string flag = optname;
|
||||||
|
// These flags make upstream's parser exit() (printing usage /
|
||||||
|
// completion), which would kill the backend process. Skip them.
|
||||||
|
if (flag == "-h" || flag == "--help" || flag == "--usage" ||
|
||||||
|
flag == "--version" || flag == "--license" ||
|
||||||
|
flag == "--list-devices" || flag == "-cl" ||
|
||||||
|
flag == "--cache-list" ||
|
||||||
|
flag.rfind("--completion", 0) == 0) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"[llama-cpp] ignoring passthrough flag that would exit: %s\n",
|
||||||
|
flag.c_str());
|
||||||
|
} else {
|
||||||
|
extra_argv.push_back(flag);
|
||||||
|
// Preserve the whole value after the first ':' so embedded
|
||||||
|
// colons (e.g. host:port) survive strtok's truncation of optval.
|
||||||
|
auto colon = opt.find(':');
|
||||||
|
if (colon != std::string::npos) {
|
||||||
|
extra_argv.push_back(opt.substr(colon + 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1146,27 +1200,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.kv_overrides.empty()) {
|
|
||||||
params.kv_overrides.emplace_back();
|
|
||||||
params.kv_overrides.back().key[0] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// tensor_buft_overrides sentinel termination (mirrors upstream common/arg.cpp).
|
|
||||||
// Real entries are pushed during option parsing; here we pad/terminate so the
|
|
||||||
// model loader sees back().pattern == nullptr (GGML_ASSERT at common.cpp:1543)
|
|
||||||
// and so llama_params_fit has the placeholder slots it requires.
|
|
||||||
{
|
|
||||||
const size_t ntbo = llama_max_tensor_buft_overrides();
|
|
||||||
while (params.tensor_buft_overrides.size() < ntbo) {
|
|
||||||
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Terminate the draft tensor_buft_overrides list with a sentinel, mirroring
|
|
||||||
// the main-model handling above.
|
|
||||||
if (!params.speculative.draft.tensor_buft_overrides.empty()) {
|
|
||||||
params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr});
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: Add yarn
|
// TODO: Add yarn
|
||||||
|
|
||||||
if (!request->tensorsplit().empty()) {
|
if (!request->tensorsplit().empty()) {
|
||||||
@@ -1259,6 +1292,69 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
|
|||||||
params.sampling.grammar_triggers.push_back(std::move(trigger));
|
params.sampling.grammar_triggers.push_back(std::move(trigger));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Apply any raw upstream flags last so an explicit passthrough flag wins
|
||||||
|
// over the LocalAI-resolved field it maps to (e.g. --ctx-size beats
|
||||||
|
// context_size). This is the same parser llama-server itself uses.
|
||||||
|
if (!extra_argv.empty()) {
|
||||||
|
// common_params_parser_init resets a few fields for the SERVER example
|
||||||
|
// (n_parallel -> -1, use_color). Snapshot n_parallel so an unrelated
|
||||||
|
// passthrough flag can't silently clobber LocalAI's resolved value.
|
||||||
|
const int saved_n_parallel = params.n_parallel;
|
||||||
|
|
||||||
|
std::vector<char *> argv;
|
||||||
|
std::string prog = "llama-server";
|
||||||
|
argv.push_back(prog.data());
|
||||||
|
for (auto & a : extra_argv) {
|
||||||
|
argv.push_back(a.data());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ctx_arg.params is a reference, so this overlays the given flags onto
|
||||||
|
// `params` in place. Returns false on a recoverable parse error (and
|
||||||
|
// self-restores params); may exit() on a hard error, exactly as
|
||||||
|
// passing the same bad flag to llama-server would.
|
||||||
|
if (!common_params_parse((int)argv.size(), argv.data(), params,
|
||||||
|
LLAMA_EXAMPLE_SERVER)) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"[llama-cpp] failed to parse passthrough options; ignoring them\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Restore n_parallel unless a passthrough flag explicitly set it
|
||||||
|
// (parser_init's reset sentinel for SERVER is -1).
|
||||||
|
if (params.n_parallel == -1) {
|
||||||
|
params.n_parallel = saved_n_parallel;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Terminate/pad the override vectors only after BOTH the named-option loop
|
||||||
|
// and the generic passthrough (common_params_parse above) have pushed their
|
||||||
|
// real entries, so back() is the null sentinel the model loader asserts on.
|
||||||
|
// Running these before the passthrough let a passthrough flag (--cpu-moe,
|
||||||
|
// --override-tensor, --override-kv, ...) append a real entry after the
|
||||||
|
// sentinel: a GGML_ASSERT crash for tensor_buft_overrides, a silent drop for
|
||||||
|
// kv_overrides. Double-termination is harmless (the while is a no-op if the
|
||||||
|
// passthrough parse already padded; an extra trailing null is ignored).
|
||||||
|
|
||||||
|
if (!params.kv_overrides.empty()) {
|
||||||
|
params.kv_overrides.emplace_back();
|
||||||
|
params.kv_overrides.back().key[0] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// tensor_buft_overrides sentinel termination (mirrors upstream common/arg.cpp).
|
||||||
|
// Real entries are pushed during option parsing; here we pad/terminate so the
|
||||||
|
// model loader sees back().pattern == nullptr (GGML_ASSERT at common.cpp:1543)
|
||||||
|
// and so llama_params_fit has the placeholder slots it requires.
|
||||||
|
{
|
||||||
|
const size_t ntbo = llama_max_tensor_buft_overrides();
|
||||||
|
while (params.tensor_buft_overrides.size() < ntbo) {
|
||||||
|
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Terminate the draft tensor_buft_overrides list with a sentinel, mirroring
|
||||||
|
// the main-model handling above.
|
||||||
|
if (!params.speculative.draft.tensor_buft_overrides.empty()) {
|
||||||
|
params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -494,6 +494,39 @@ These llama.cpp options are passed through the `options:` array.
|
|||||||
| `direct_io` / `use_direct_io` | bool | `false` | Open the model with `O_DIRECT` (faster cold loads on NVMe; ignored if not supported). |
|
| `direct_io` / `use_direct_io` | bool | `false` | Open the model with `O_DIRECT` (faster cold loads on NVMe; ignored if not supported). |
|
||||||
| `verbosity` | int | `3` | llama.cpp internal log verbosity threshold. Higher = more verbose. |
|
| `verbosity` | int | `3` | llama.cpp internal log verbosity threshold. Higher = more verbose. |
|
||||||
| `override_tensor` / `tensor_buft_overrides` | string | "" | Per-tensor buffer-type overrides for the main model. Format: `<tensor regex>=<buffer type>,<tensor regex>=<buffer type>,...`. Mirrors the existing `draft_override_tensor` syntax for the draft model. |
|
| `override_tensor` / `tensor_buft_overrides` | string | "" | Per-tensor buffer-type overrides for the main model. Format: `<tensor regex>=<buffer type>,<tensor regex>=<buffer type>,...`. Mirrors the existing `draft_override_tensor` syntax for the draft model. |
|
||||||
|
| `cpu_moe` | bool | `false` | Keep all MoE expert weights of the main model on CPU (upstream `--cpu-moe`). Frees VRAM on large MoE models (DeepSeek, Qwen3 `*-A3B`). |
|
||||||
|
| `n_cpu_moe` | int | `0` | Keep MoE expert weights of the first N main-model layers on CPU (upstream `--n-cpu-moe`). |
|
||||||
|
|
||||||
|
#### Generic option passthrough
|
||||||
|
|
||||||
|
Any `options:` entry whose name starts with `-` is forwarded **verbatim** to
|
||||||
|
upstream llama.cpp's own `llama-server` argument parser. This means any flag the
|
||||||
|
bundled llama.cpp supports works without LocalAI needing a dedicated option,
|
||||||
|
even ones added after your LocalAI version was built. See the upstream
|
||||||
|
[server flags reference](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md).
|
||||||
|
|
||||||
|
Format mirrors the rest of the array - `--flag` for a boolean, or `--flag:value`
|
||||||
|
for a flag that takes a value. Everything after the first `:` is the value, so
|
||||||
|
embedded colons (e.g. `host:port`) are preserved:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
options:
|
||||||
|
- "--cpu-moe" # boolean flag
|
||||||
|
- "--n-cpu-moe:4" # flag with a value
|
||||||
|
- "--override-tensor:exps=CPU"
|
||||||
|
```
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
|
||||||
|
- **Precedence:** passthrough flags are applied last, so an explicit flag
|
||||||
|
overrides the LocalAI option it maps to (e.g. `--ctx-size:8192` overrides
|
||||||
|
`context_size`).
|
||||||
|
- **Power-user territory:** an invalid flag or value is rejected by the upstream
|
||||||
|
parser exactly as it would be by `llama-server`, which can fail model loading or, on a hard parse error, terminate the backend process.
|
||||||
|
Prefer the named options above when one exists.
|
||||||
|
- Flags that would terminate the process (such as `--help`, `--usage`,
|
||||||
|
`--version`, `--license`, `--list-devices`, `--cache-list`, and
|
||||||
|
`--completion*`) are ignored.
|
||||||
|
|
||||||
### Prompt Caching
|
### Prompt Caching
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user