Compare commits


1 commit

Author: jmorganca
SHA1: e23ddd84b8
Message: x/grammar: add experimental GPU accelerated constrained decoding package
Date: 2026-01-11 00:50:11 -08:00
224 changed files with 8677 additions and 14671 deletions


@@ -1,6 +1,6 @@
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=b1377188784f9aea26b8abde56d4aee8c733eec7
FETCH_HEAD=ec98e2002
.PHONY: help
help:

go.mod

@@ -15,8 +15,8 @@ require (
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.9.0
github.com/x448/float16 v0.8.4
golang.org/x/sync v0.17.0
golang.org/x/sys v0.37.0
golang.org/x/sync v0.19.0
golang.org/x/sys v0.39.0
)
require (
@@ -30,8 +30,8 @@ require (
github.com/tkrajina/typescriptify-golang-structs v0.2.0
github.com/wk8/go-ordered-map/v2 v2.1.8
golang.org/x/image v0.22.0
golang.org/x/mod v0.30.0
golang.org/x/tools v0.38.0
golang.org/x/mod v0.31.0
golang.org/x/tools v0.40.0
gonum.org/v1/gonum v0.15.0
)
@@ -81,11 +81,11 @@ require (
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
golang.org/x/arch v0.8.0 // indirect
golang.org/x/crypto v0.43.0
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect
golang.org/x/net v0.46.0 // indirect
golang.org/x/term v0.36.0
golang.org/x/text v0.30.0
golang.org/x/crypto v0.46.0
golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93
golang.org/x/net v0.48.0 // indirect
golang.org/x/term v0.38.0
golang.org/x/text v0.32.0
google.golang.org/protobuf v1.34.1
gopkg.in/yaml.v3 v3.0.1 // indirect
)

go.sum

@@ -233,16 +233,16 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04=
golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0=
golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU=
golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3/go.mod h1:NOZ3BPKG0ec/BKJQgnvsSFpcKLM5xXVWnvZS97DWHgE=
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa h1:t2QcU6V556bFjYgu4L6C+6VrCPyJZ+eyRsABUPs1mz4=
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa/go.mod h1:BHOTPb3L19zxehTsLoJXVaTktb06DFgmdW6Wb9s8jqk=
golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 h1:fQsdNF2N+/YewlRZiricy4P1iimyPKZ/xwniHj8Q2a0=
golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93/go.mod h1:EPRbTFwzwjXj9NpYyyrvenVh9Y+GFeEvMNh7Xuz7xgU=
golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
@@ -264,8 +264,8 @@ golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzB
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk=
golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc=
golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI=
golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@@ -278,8 +278,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU=
golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -289,8 +289,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -306,17 +306,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk=
golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q=
golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss=
golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q=
golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k=
golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM=
golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU=
golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -330,8 +330,8 @@ golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapK
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ=
golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs=
golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA=
golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=


@@ -14,28 +14,25 @@ make -f Makefile.sync apply-patches
### Updating Base Commit
To update to a new base commit:
**Pin to new base commit**
1. **Update FETCH_HEAD** in `Makefile.sync` to the new commit hash.
To change the base commit, update `FETCH_HEAD` in Makefile.sync.
2. **Check for upstreamed patches**: Before applying, review if any patches have been merged upstream. Remove those patches from `./patches/` to avoid conflicts.
When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.
3. **Apply patches**:
```shell
make -f Makefile.sync apply-patches
```
Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
4. **Resolve conflicts** (if any): When `git am` fails on a patch:
- Fix conflicts in `./vendor/`
- Stage the resolved files: `git -C llama/vendor add <file>`
- Continue: `git -C llama/vendor am --continue`
- Re-run: `make -f Makefile.sync apply-patches`
- Repeat until all patches are applied.
```shell
make -f Makefile.sync apply-patches
```
5. **Regenerate patches and sync**:
```shell
make -f Makefile.sync format-patches sync
```
If there are conflicts, you will see an error message. Resolve the conflicts in `./vendor/`, and continue the patch series with `git am --continue` and rerun `make -f Makefile.sync apply-patches`. Repeat until all patches are successfully applied.
Once all patches are applied, commit the changes to the tracking repository.
```shell
make -f Makefile.sync format-patches sync
```
### Generating Patches

llama/build-info.cpp (generated, vendored)

@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "b1377188784f9aea26b8abde56d4aee8c733eec7";
char const *LLAMA_COMMIT = "ec98e2002";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";


@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
case GGML_SCHED_PRIO_REALTIME: p = -20; break;
}
if (setpriority(PRIO_PROCESS, 0, p) != 0) {
if (!setpriority(PRIO_PROCESS, 0, p)) {
LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
return false;
}
@@ -1078,15 +1078,12 @@ struct common_init_result::impl {
impl() = default;
~impl() = default;
// note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
llama_model_ptr model;
llama_context_ptr context;
std::vector<llama_adapter_lora_ptr> lora;
std::vector<common_sampler_ptr> samplers;
std::vector<llama_sampler_seq_config> samplers_seq_config;
};
common_init_result::common_init_result(common_params & params) :
@@ -1095,9 +1092,9 @@ common_init_result::common_init_result(common_params & params) :
auto cparams = common_context_params_to_llama(params);
if (params.fit_params) {
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
}
@@ -1110,25 +1107,6 @@ common_init_result::common_init_result(common_params & params) :
const llama_vocab * vocab = llama_model_get_vocab(model);
// load and optionally apply lora adapters (must be loaded before context creation)
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
pimpl->model.reset(model);
return;
}
char buf[1024];
la.ptr = lora.get();
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf;
pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
}
// updates params.sampling
// TODO: fix naming
common_init_sampler_from_model(model, params.sampling);
@@ -1163,19 +1141,10 @@ common_init_result::common_init_result(common_params & params) :
// params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
//}
// init the backend samplers as part of the context creation
pimpl->samplers.resize(cparams.n_seq_max);
pimpl->samplers_seq_config.resize(cparams.n_seq_max);
for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
}
// TODO: temporarily gated behind a flag
if (params.sampling.backend_sampling) {
cparams.samplers = pimpl->samplers_seq_config.data();
cparams.n_samplers = pimpl->samplers_seq_config.size();
}
llama_context * lctx = llama_init_from_model(model, cparams);
@@ -1199,12 +1168,6 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
return pimpl->samplers[seq_id].get();
}
void common_init_result::reset_samplers() {
for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
}
}
std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
return pimpl->lora;
}
@@ -1280,6 +1243,24 @@ common_init_result_ptr common_init_from_params(common_params & params) {
}
}
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
return res;
}
char buf[1024];
la.ptr = lora.get();
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf;
res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
}
if (!params.lora_init_without_apply) {
common_set_adapter_lora(lctx, params.lora_adapters);
}
@@ -1320,9 +1301,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
llama_synchronize(lctx);
llama_perf_context_reset(lctx);
llama_set_warmup(lctx, false);
// reset samplers to reset RNG state after warmup to the seeded state
res->reset_samplers();
}
return res;
@@ -1361,12 +1339,14 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.devices = params.devices.data();
}
mparams.n_gpu_layers = params.n_gpu_layers;
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
mparams.use_direct_io = params.use_direct_io;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;


@@ -80,7 +80,6 @@ int32_t cpu_get_num_math();
//
enum llama_example {
LLAMA_EXAMPLE_DEBUG,
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_COMPLETION,
@@ -217,8 +216,6 @@ struct common_params_sampling {
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
bool backend_sampling = false;
bool has_logit_bias() const {
return !logit_bias.empty();
}
@@ -332,14 +329,12 @@ struct common_params {
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
// margin per device in bytes for fitting parameters to free memory:
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
@@ -375,11 +370,6 @@ struct common_params {
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
// llama-debug specific options
std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
bool save_logits = false; // whether to save logits to files // NOLINT
std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
@@ -430,8 +420,7 @@ struct common_params {
bool kv_unified = false; // enable unified KV cache
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // enable mmap to use filesystem cache
bool use_direct_io = true; // read from disk without buffering for faster model loading
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
@@ -486,8 +475,7 @@ struct common_params {
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
std::vector<std::string> api_keys;
@@ -496,11 +484,8 @@ struct common_params {
std::map<std::string, std::string> default_template_kwargs;
// webui configs
bool webui = true;
std::string webui_config_json;
// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = true;
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;
@@ -700,9 +685,7 @@ struct common_init_result {
llama_model * model();
llama_context * context();
common_sampler * sampler(llama_seq_id seq_id);
void reset_samplers();
std::vector<llama_adapter_lora_ptr> & lora();


@@ -104,9 +104,10 @@ struct ring_buffer {
struct common_sampler {
common_params_sampling params;
struct llama_sampler * grmr;
struct llama_sampler * chain;
bool grammar;
ring_buffer<llama_token> prev;
std::vector<llama_token_data> cur;
@@ -120,34 +121,17 @@ struct common_sampler {
}
void set_logits(struct llama_context * ctx, int idx) {
const float * sampled_probs = llama_get_sampled_probs_ith (ctx, idx);
const float * sampled_logits = llama_get_sampled_logits_ith (ctx, idx);
const llama_token * sampled_ids = llama_get_sampled_candidates_ith(ctx, idx);
const auto * logits = llama_get_logits_ith(ctx, idx);
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
if (sampled_probs) {
const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
cur.resize(sampled_probs_count);
for (uint32_t i = 0; i < sampled_probs_count; ++i) {
cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
}
} else if (sampled_logits) {
const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
cur.resize(sampled_logits_count);
for (uint32_t i = 0; i < sampled_logits_count; i++) {
cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
}
} else {
const auto * logits = llama_get_logits_ith(ctx, idx);
GGML_ASSERT(logits != nullptr);
cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}
cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}
cur_p = { cur.data(), cur.size(), -1, false };
@@ -176,50 +160,45 @@ std::string common_params_sampling::print() const {
return std::string(result);
}
struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) {
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = params.no_perf;
llama_sampler * grmr = nullptr;
llama_sampler * chain = llama_sampler_chain_init(lparams);
bool grammar = false;
std::vector<llama_sampler *> samplers;
if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
grammar = true;
#else
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
} else {
std::vector<std::string> trigger_patterns;
std::vector<std::string> patterns_anywhere;
std::vector<llama_token> trigger_tokens;
for (const auto & trigger : params.grammar_triggers) {
switch (trigger.type) {
case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
{
const auto & word = trigger.value;
trigger_patterns.push_back(regex_escape(word));
patterns_anywhere.push_back(regex_escape(word));
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
{
trigger_patterns.push_back(trigger.value);
patterns_anywhere.push_back(trigger.value);
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
{
const auto & pattern = trigger.value;
std::string anchored = "^$";
if (!pattern.empty()) {
anchored = (pattern.front() != '^' ? "^" : "")
+ pattern
+ (pattern.back() != '$' ? "$" : "");
}
trigger_patterns.push_back(anchored);
trigger_patterns.push_back(trigger.value);
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -233,6 +212,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
}
}
if (!patterns_anywhere.empty()) {
trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
}
std::vector<const char *> trigger_patterns_c;
trigger_patterns_c.reserve(trigger_patterns.size());
for (const auto & regex : trigger_patterns) {
@@ -241,12 +224,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
if (!params.grammar.empty()) {
if (params.grammar_lazy) {
grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size());
samplers.push_back(
llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size()));
} else {
grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
}
grammar = true;
}
}
@@ -315,16 +301,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
llama_sampler_chain_add(chain, smpl);
}
if (grmr && params.backend_sampling) {
LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);
params.backend_sampling = false;
}
auto * result = new common_sampler {
/* .params = */ params,
/* .grmr = */ grmr,
/* .chain = */ chain,
/* .grammar = */ grammar,
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
/* .cur_p = */ {},
@@ -335,7 +315,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
void common_sampler_free(struct common_sampler * gsmpl) {
if (gsmpl) {
llama_sampler_free(gsmpl->grmr);
llama_sampler_free(gsmpl->chain);
delete gsmpl;
@@ -345,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
const auto tm = gsmpl->tm();
if (gsmpl->grmr && accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
}
if (gsmpl->grammar) {
const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
llama_sampler_accept(gsmpl->chain, token);
for (int i = 0; i < n_smpl; i++) {
auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
// the grammar sampler is always the first one
if (i == 0) {
if (accept_grammar) {
llama_sampler_accept(smpl, token);
}
} else {
llama_sampler_accept(smpl, token);
}
}
} else {
llama_sampler_accept(gsmpl->chain, token);
}
gsmpl->prev.push_back(token);
}
@@ -361,8 +353,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
return new common_sampler {
/* .params = */ gsmpl->params,
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
/* .chain = */ llama_sampler_clone(gsmpl->chain),
/* .grammar = */ gsmpl->grammar,
/* .prev = */ gsmpl->prev,
/* .cur = */ gsmpl->cur,
/* .cur_p = */ gsmpl->cur_p,
@@ -418,7 +410,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
return gsmpl->chain;
}
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
llama_synchronize(ctx);
// start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
@@ -426,61 +418,11 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
llama_token id = LLAMA_TOKEN_NULL;
auto & grmr = gsmpl->grmr;
auto & chain = gsmpl->chain;
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
// Check if a backend sampler has already sampled a token in which case we
// return that token id directly.
{
id = llama_get_sampled_token_ith(ctx, idx);
if (id != LLAMA_TOKEN_NULL) {
LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
// TODO: simplify
gsmpl->cur.resize(1);
gsmpl->cur[0] = { id, 0.0f, 1.0f };
cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
return id;
}
}
gsmpl->set_logits(ctx, idx);
if (grammar_first) {
llama_sampler_apply(grmr, &cur_p);
}
llama_sampler_apply(chain, &cur_p);
id = cur_p.data[cur_p.selected].id;
if (grammar_first) {
return id;
}
// check if it the sampled token fits the grammar (grammar-based rejection sampling)
{
llama_token_data single_token_data = { id, 1.0f, 0.0f };
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
llama_sampler_apply(grmr, &single_token_data_array);
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
if (is_valid) {
return id;
}
}
// resampling:
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
gsmpl->set_logits(ctx, idx);
llama_sampler_apply(grmr, &cur_p);
llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
@@ -490,7 +432,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
return id;
}
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
std::vector<llama_token> result;
@@ -498,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
size_t i = 0;
for (; i < draft.size(); i++) {
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
common_sampler_accept(gsmpl, id, true);
@@ -510,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
}
if (i == draft.size()) {
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
common_sampler_accept(gsmpl, id, true);
@@ -520,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
return result;
}
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
std::vector<int> idxs(draft.size() + 1);
for (size_t i = 0; i < idxs.size(); ++i) {
idxs[i] = i;
}
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
}
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {


@@ -36,8 +36,7 @@ struct common_sampler;
// llama_sampler API overloads
// note: can mutate params in some cases
struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params);
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
void common_sampler_free(struct common_sampler * gsmpl);
@@ -49,7 +48,6 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
// arguments can be nullptr to skip printing
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
// get the underlying llama_sampler_chain
struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
// extended sampling implementation:
@@ -59,10 +57,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
// generalized version of common_sampler_sample
//
@@ -80,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
//
// returns at least 1 token, up to idxs.size()
//
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
// assume idxs == [ 0, 1, 2, ..., draft.size() ]
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
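The comments in this header describe the extended sampling flow: sample with the chain, check the token against the grammar, and resample through the grammar on rejection. As a hedged illustration of how the `grammar_first`-free signatures above might be driven from a plain generation loop, the sketch below uses only calls visible in this diff plus a few standard llama.cpp functions (`llama_decode`, `llama_batch_get_one`, `llama_vocab_is_eog`) that are assumed to be present in the vendored headers; it is not taken from the ollama integration.

```cpp
// Hedged sketch: a plain generation loop driving the grammar_first-free variant of
// common_sampler_sample shown above. Assumes a loaded model/context; llama_decode,
// llama_batch_get_one and llama_vocab_is_eog are standard llama.cpp calls assumed
// to be available in the vendored headers.
#include "common.h"
#include "sampling.h"

static void generate_n(llama_model * model, llama_context * ctx,
                       const common_params_sampling & sparams,
                       llama_token first, int n_predict) {
    common_sampler * smpl = common_sampler_init(model, sparams);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_token cur = first;
    for (int i = 0; i < n_predict; ++i) {
        llama_batch batch = llama_batch_get_one(&cur, 1);
        if (llama_decode(ctx, batch) != 0) {
            break;
        }
        // grammar checking and resampling happen inside common_sampler_sample
        cur = common_sampler_sample(smpl, ctx, /* idx = */ -1);
        common_sampler_accept(smpl, cur, /* accept_grammar = */ true);
        if (llama_vocab_is_eog(vocab, cur)) {
            break;
        }
    }
    common_sampler_free(smpl);
}
```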


@@ -286,7 +286,7 @@ extern "C" {
// NULL-terminated list of buffer types to use for tensors that match a pattern
const struct llama_model_tensor_buft_override * tensor_buft_overrides;
int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
@@ -309,7 +309,6 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
bool use_direct_io; // use direct io, takes precedence over use_mmap
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
@@ -317,11 +316,6 @@ extern "C" {
bool no_alloc; // only load metadata and simulate memory allocations
};
struct llama_sampler_seq_config {
llama_seq_id seq_id;
struct llama_sampler * sampler;
};
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
// https://github.com/ggml-org/llama.cpp/pull/7544
struct llama_context_params {
@@ -370,12 +364,6 @@ extern "C" {
bool kv_unified; // use a unified buffer across the input sequences when computing the attention
// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
// ref: https://github.com/ggml-org/llama.cpp/pull/14363
// [EXPERIMENTAL]
// backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
// note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
struct llama_sampler_seq_config * samplers;
size_t n_samplers;
};
// model quantization parameters
@@ -479,23 +467,16 @@ extern "C" {
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
enum llama_params_fit_status {
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occured, e.g. because no model could be found at the specified path
};
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
// - returns true if the parameters could be successfully modified to fit device memory
// - this function is NOT thread safe because it modifies the global llama logger state
// - only parameters that have the same value as in llama_default_model_params are modified
LLAMA_API enum llama_params_fit_status llama_params_fit(
// returns true if the parameters could be successfully modified to fit device memory
// this function is NOT thread safe because it modifies the global llama logger state
LLAMA_API bool llama_params_fit(
const char * path_model,
struct llama_model_params * mparams,
struct llama_context_params * cparams,
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
size_t * margins, // margins of memory to leave per device in bytes
size_t margin, // margin of memory to leave per device in bytes
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
@@ -536,7 +517,6 @@ extern "C" {
LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
@@ -620,8 +600,6 @@ extern "C" {
//
// Load a LoRA adapter from file
// The adapter is valid as long as the associated model is not freed
// All adapters must be loaded before context creation
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
struct llama_model * model,
const char * path_lora);
@@ -1005,32 +983,6 @@ extern "C" {
// otherwise: float[n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
//
// backend sampling API [EXPERIMENTAL]
// note: use only if the llama_context was created with at least one llama_sampler_seq_config
//
// Get the backend sampled token for the ith token.
// Returns LLAMA_TOKEN_NULL if no token was sampled.
LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled probabilites for the ith token
// The index matches llama_get_sampled_token_ith().
// Returns NULL if no probabilites were generated.
LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled logits for the ith token
// Returns NULL if no logits were sampled.
LLAMA_API float * llama_get_sampled_logits_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled candidates (token ids) for the ith token
// These are needed to map probability/logit indices to vocab token ids.
// Returns NULL if no candidates were sampled.
LLAMA_API llama_token * llama_get_sampled_candidates_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
//
// Vocab
//
@@ -1202,16 +1154,11 @@ extern "C" {
//
// llama_sampler_free(smpl);
//
// TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
//
typedef void * llama_sampler_context_t;
struct llama_sampler_data {
struct ggml_tensor * logits;
struct ggml_tensor * probs;
struct ggml_tensor * sampled;
struct ggml_tensor * candidates;
};
// user code can implement the interface below in order to create custom llama_sampler
struct llama_sampler_i {
const char * (*name) (const struct llama_sampler * smpl); // can be NULL
@@ -1221,45 +1168,17 @@ extern "C" {
struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL
// [EXPERIMENTAL]
// backend sampling interface:
// return true if the backend supports all ops needed by the sampler
// note: call once per sampler
bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
// call after .backend_apply()
void (*backend_accept)(
struct llama_sampler * smpl,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_tensor * selected_token);
// call after .backend_init()
void (*backend_apply)(
struct llama_sampler * smpl,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct llama_sampler_data * data);
// called before graph execution to set inputs for the current ubatch
void (*backend_set_input)(struct llama_sampler * smpl);
// TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
//void (*apply_ggml) (struct llama_sampler * smpl, ...);
};
struct llama_sampler {
struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
const struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
};
// [EXPERIMENTAL]
// attach a sampler to the context
// note: prefer initializing the context with llama_context_params.samplers when possible
// note: changing the samplers of a context can cause graph reallocations and degraded performance
LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
// mirror of llama_sampler_i:
LLAMA_API struct llama_sampler * llama_sampler_init ( struct llama_sampler_i * iface, llama_sampler_context_t ctx);
LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
@@ -1275,15 +1194,7 @@ extern "C" {
// important: takes ownership of the sampler object and will free it when llama_sampler_free is called
LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl);
// return NULL if:
// - the sampler is NULL
// - the sampler is not a llama_sampler_chain
// - the index is out of bounds, unless i == -1
// - if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
LLAMA_API struct llama_sampler * llama_sampler_chain_get( struct llama_sampler * chain, int32_t i);
// the total number of samplers in the chain
LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
// after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
@@ -1292,9 +1203,7 @@ extern "C" {
// available samplers:
LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
/// seed == LLAMA_DEFAULT_SEED to use a random seed.
LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
/// Setting k <= 0 makes this a noop
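For context on the `llama_params_fit` change earlier in this section: one side of that hunk takes a per-device `margins` array and returns a `llama_params_fit_status`, the other takes a single `margin` and returns `bool`. Below is a hedged sketch of calling the single-margin, bool-returning variant, with buffers sized per the comments in the header; `llama_max_tensor_buft_overrides` is assumed to be a sizing function analogous to `llama_max_devices`, and the model path and 1 GiB margin are placeholder values.

```cpp
// Hedged sketch of the single-margin llama_params_fit variant shown in this section.
// Buffer sizing follows the header comments (at least llama_max_devices and
// llama_max_tensor_buft_overrides elements); path and margin are placeholders.
#include <vector>
#include "llama.h"

static bool fit_to_free_memory(const char * model_path) {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

    const size_t margin = 1024ull * 1024 * 1024; // leave roughly 1 GiB free per device

    return llama_params_fit(model_path, &mparams, &cparams,
                            tensor_split.data(), overrides.data(),
                            margin, /* n_ctx_min = */ 4096,
                            GGML_LOG_LEVEL_ERROR);
}
```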


@@ -146,11 +146,9 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
return nullptr;
}
static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
llama_model & model = adapter.model;
ggml_context * ctx_init;
gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
@@ -413,17 +411,14 @@ static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_l
}
}
// update number of nodes used
model.n_lora_nodes += adapter.get_n_nodes();
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
llama_adapter_lora * adapter = new llama_adapter_lora(*model);
llama_adapter_lora * adapter = new llama_adapter_lora();
try {
llama_adapter_lora_init_impl(path_lora, *adapter);
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
return adapter;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -474,10 +469,6 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
}
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
// update number of nodes used
GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
adapter->model.n_lora_nodes -= adapter->get_n_nodes();
delete adapter;
}


@@ -59,8 +59,6 @@ struct llama_adapter_lora_weight {
};
struct llama_adapter_lora {
llama_model & model;
// map tensor name to lora_a_b
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
@@ -75,14 +73,10 @@ struct llama_adapter_lora {
// activated lora (aLoRA)
std::vector<llama_token> alora_invocation_tokens;
llama_adapter_lora(llama_model & model) : model(model) {}
llama_adapter_lora() = default;
~llama_adapter_lora() = default;
llama_adapter_lora_weight * get_weight(ggml_tensor * w);
uint32_t get_n_nodes() const {
return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
}
};
using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
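The adapter hunks in this section drop the back-reference from `llama_adapter_lora` to its model and leave metadata reading to the caller, as the `common_init_from_params` hunk earlier does. A hedged sketch of that pattern follows, using only calls visible in this diff (`llama_adapter_lora_init`, `llama_adapter_meta_val_str`, `llama_adapter_lora_free`); `llama_set_adapter_lora` is assumed to be the context-level apply call and does not appear in this diff.

```cpp
// Hedged sketch: load a LoRA adapter, read its metadata (as the common code in
// this diff does), and attach it to a context. llama_set_adapter_lora is assumed
// to exist with this signature; the adapter stays valid as long as the model does.
#include <cstdio>
#include "llama.h"

static llama_adapter_lora * attach_lora(llama_model * model, llama_context * ctx,
                                        const char * path, float scale) {
    llama_adapter_lora * adapter = llama_adapter_lora_init(model, path);
    if (adapter == nullptr) {
        fprintf(stderr, "failed to load lora adapter '%s'\n", path);
        return nullptr;
    }

    char buf[1024];
    if (llama_adapter_meta_val_str(adapter, "adapter.lora.task_name", buf, sizeof(buf)) >= 0) {
        printf("lora task name: %s\n", buf);
    }

    llama_set_adapter_lora(ctx, adapter, scale); // assumed apply call, not shown in this diff

    // release later with llama_adapter_lora_free(adapter) once it is no longer needed
    return adapter;
}
```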


@@ -20,7 +20,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_STARCODER, "starcoder" },
{ LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_MODERN_BERT, "modern-bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_NEO_BERT, "neo-bert" },
@@ -42,7 +41,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_PHIMOE, "phimoe" },
{ LLM_ARCH_PLAMO, "plamo" },
{ LLM_ARCH_PLAMO2, "plamo2" },
{ LLM_ARCH_PLAMO3, "plamo3" },
{ LLM_ARCH_CODESHELL, "codeshell" },
{ LLM_ARCH_ORION, "orion" },
{ LLM_ARCH_INTERNLM2, "internlm2" },
@@ -117,9 +115,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_RND1, "rnd1" },
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
{ LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_MIMO2, "mimo2" },
{ LLM_ARCH_LLAMA_EMBED, "llama-embed" },
{ LLM_ARCH_MAINCODER, "maincoder" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -153,7 +148,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
{ LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" },
{ LLM_KV_FEATURES_LENGTH, "%s.features_length" },
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
@@ -211,7 +205,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
@@ -223,7 +216,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
@@ -508,7 +500,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_MISTRAL3:
case LLM_ARCH_LLAMA_EMBED:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
@@ -790,20 +781,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
};
case LLM_ARCH_MODERN_BERT:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_TOKEN_EMBD_NORM,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
};
case LLM_ARCH_JINA_BERT_V2:
return {
LLM_TENSOR_TOKEN_EMBD,
@@ -1083,22 +1060,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_FFN_POST_NORM,
};
case LLM_ARCH_PLAMO3:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
};
case LLM_ARCH_CODESHELL:
return {
LLM_TENSOR_TOKEN_EMBD,
@@ -2079,7 +2040,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM_LFM2,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_DENSE_2_OUT,
};
case LLM_ARCH_LFM2MOE:
return {
@@ -2098,7 +2058,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_SHORTCONV_INPROJ,
LLM_TENSOR_SHORTCONV_OUTPROJ,
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM_LFM2,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
@@ -2214,49 +2174,11 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_VISEXP_FFN_DOWN,
LLM_TENSOR_VISEXP_FFN_UP,
};
case LLM_ARCH_MIMO2:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_SINKS,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
};
case LLM_ARCH_GPTJ:
case LLM_ARCH_UNKNOWN:
return {
LLM_TENSOR_TOKEN_EMBD,
};
case LLM_ARCH_MAINCODER:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
};
case LLM_ARCH_SOLAR:
return {
LLM_TENSOR_TOKEN_EMBD,


@@ -24,7 +24,6 @@ enum llm_arch {
LLM_ARCH_STARCODER,
LLM_ARCH_REFACT,
LLM_ARCH_BERT,
LLM_ARCH_MODERN_BERT,
LLM_ARCH_NOMIC_BERT,
LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_NEO_BERT,
@@ -46,7 +45,6 @@ enum llm_arch {
LLM_ARCH_PHIMOE,
LLM_ARCH_PLAMO,
LLM_ARCH_PLAMO2,
LLM_ARCH_PLAMO3,
LLM_ARCH_CODESHELL,
LLM_ARCH_ORION,
LLM_ARCH_INTERNLM2,
@@ -121,9 +119,6 @@ enum llm_arch {
LLM_ARCH_RND1,
LLM_ARCH_PANGU_EMBED,
LLM_ARCH_MISTRAL3,
LLM_ARCH_MIMO2,
LLM_ARCH_LLAMA_EMBED,
LLM_ARCH_MAINCODER,
LLM_ARCH_UNKNOWN,
};
@@ -157,7 +152,6 @@ enum llm_kv {
LLM_KV_VOCAB_SIZE,
LLM_KV_CONTEXT_LENGTH,
LLM_KV_EMBEDDING_LENGTH,
LLM_KV_EMBEDDING_LENGTH_OUT,
LLM_KV_FEATURES_LENGTH,
LLM_KV_BLOCK_COUNT,
LLM_KV_LEADING_DENSE_BLOCK_COUNT,
@@ -215,7 +209,6 @@ enum llm_kv {
LLM_KV_ATTENTION_GATE_LORA_RANK,
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -227,7 +220,6 @@ enum llm_kv {
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_FREQ_BASE_SWA,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR,


@@ -74,7 +74,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
{ "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
{ "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
};
llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -217,8 +216,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_GROK_2;
} else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
return LLM_CHAT_TEMPLATE_PANGU_EMBED;
} else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
}
return LLM_CHAT_TEMPLATE_UNKNOWN;
}
@@ -848,14 +845,6 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "[unused9]助手:";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
for (auto message : chat) {
std::string role(message->role);
ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
}
if (add_ass) {
ss << "<|begin|>assistant";
}
} else {
// template not supported
return -1;


@@ -54,7 +54,6 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2,
LLM_CHAT_TEMPLATE_PANGU_EMBED,
LLM_CHAT_TEMPLATE_SOLAR_OPEN,
LLM_CHAT_TEMPLATE_UNKNOWN,
};


@@ -60,25 +60,6 @@ llama_context::llama_context(
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
// Initialize backend samplers here so they are part of the sampling graph
// before the reserve passes run later in this function. This avoids a later
// re-reserve when graph nodes change.
if (params.samplers != nullptr && params.n_samplers > 0) {
for (size_t i = 0; i < params.n_samplers; ++i) {
const auto & config = params.samplers[i];
if (llama_sampler_chain_get(config.sampler, -1) == nullptr) {
throw std::runtime_error("the backend samplers must be of type llama_sampler_chain");
}
if (set_sampler(config.seq_id, config.sampler)) {
const int n_samplers = llama_sampler_chain_n(config.sampler);
LLAMA_LOG_INFO("%s: setting backend sampler for seq_id %d (n = %d)\n", __func__, config.seq_id, n_samplers);
}
}
}
auto rope_scaling_type = params.rope_scaling_type;
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
rope_scaling_type = hparams.rope_scaling_type_train;
@@ -250,10 +231,7 @@ llama_context::llama_context(
// graph outputs buffer
{
// resized during inference when a batch uses more outputs
// Create a dummy batch for initialization.
llama_batch dummy_batch = {};
dummy_batch.n_tokens = 0;
if (output_reserve(params.n_seq_max, dummy_batch) < params.n_seq_max) {
if (output_reserve(params.n_seq_max) < params.n_seq_max) {
throw std::runtime_error("failed to reserve initial output buffer");
}
@@ -316,8 +294,8 @@ llama_context::llama_context(
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
model.n_devices() > 1 &&
model.n_gpu_layers() > model.hparams.n_layer &&
model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
model.params.n_gpu_layers > (int) model.hparams.n_layer &&
model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
cparams.offload_kqv &&
!model.has_tensor_overrides();
@@ -478,35 +456,26 @@ llama_context::llama_context(
LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
}
}
// Initialize the full vocabulary token ids for backend samplers.
{
const int n_vocab = model.vocab.n_tokens();
sampling.token_ids_full_vocab.resize(n_vocab);
for (int i = 0; i < n_vocab; ++i) {
sampling.token_ids_full_vocab[i] = i;
}
}
}
llama_context::~llama_context() {
if (!model.hparams.no_alloc) {
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
ggml_backend_t backend = backend_ptrs[i];
ggml_backend_buffer_type_t buft = backend_buft[i];
// FIXME this currently results in a use-after-free bug if the model is freed before the context
// if (!model.hparams.no_alloc) {
// for (size_t i = 0; i < backend_ptrs.size(); ++i) {
// ggml_backend_t backend = backend_ptrs[i];
// ggml_backend_buffer_type_t buft = backend_buft[i];
const size_t size_exp = backend_buf_exp_size[i];
const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
if (size_exp == size_act) {
LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
__func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
} else {
LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
__func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
}
}
}
// const size_t size_exp = backend_buf_exp_size[i];
// const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
// if (size_exp == size_act) {
// LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
// } else {
// LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
// }
// }
// }
ggml_opt_free(opt_ctx);
}
@@ -648,35 +617,6 @@ float * llama_context::get_logits() {
return logits;
}
int64_t llama_context::output_resolve_row(int32_t i) const {
int64_t j = -1;
// support negative indices (last output row)
if (i < 0) {
j = n_outputs + i;
if (j < 0) {
throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
}
} else if ((size_t) i >= output_ids.size()) {
throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
} else {
// use output_ids to translate the batch token index into a row number
// that holds this token's data.
j = output_ids[i];
}
if (j < 0) {
// the batch token was not configured to output anything
throw std::runtime_error(format("batch.logits[%d] != true", i));
}
if (j >= n_outputs) {
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
}
return j;
}
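// A minimal standalone sketch (hypothetical data, not the real llama_context
// state) of what the removed output_resolve_row() does: a negative index counts
// back from the last output row, a non-negative index is translated through
// output_ids, and tokens that never requested output resolve to -1 and throw.
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

static int64_t resolve_row_sketch(int32_t i, const std::vector<int32_t> & output_ids, int32_t n_outputs) {
    if (i < 0) {
        const int64_t j = n_outputs + i; // e.g. i == -1 selects the last output row
        if (j < 0) {
            throw std::out_of_range("negative index out of range");
        }
        return j;
    }
    if ((size_t) i >= output_ids.size()) {
        throw std::out_of_range("index out of range");
    }
    const int64_t j = output_ids[i]; // batch token index -> output row
    if (j < 0) {
        throw std::out_of_range("token did not request output");
    }
    if (j >= n_outputs) {
        throw std::out_of_range("corrupt output buffer");
    }
    return j;
}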
float * llama_context::get_logits_ith(int32_t i) {
int64_t j = -1;
@@ -687,7 +627,6 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error("no logits");
}
// TODO: use output_resolve_row()
if (i < 0) {
j = n_outputs + i;
if (j < 0) {
@@ -724,10 +663,6 @@ float * llama_context::get_embeddings() {
return embd;
}
llama_token * llama_context::get_sampled_tokens() const{
return sampling.sampled;
}
float * llama_context::get_embeddings_ith(int32_t i) {
int64_t j = -1;
@@ -738,7 +673,6 @@ float * llama_context::get_embeddings_ith(int32_t i) {
throw std::runtime_error("no embeddings");
}
// TODO: use output_resolve_row()
if (i < 0) {
j = n_outputs + i;
if (j < 0) {
@@ -758,8 +692,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
}
const uint32_t n_embd_out = model.hparams.get_n_embd_out();
return embd + j*n_embd_out;
return embd + j*model.hparams.n_embd;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -779,136 +712,6 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) {
return it->second.data();
}
llama_token llama_context::get_sampled_token_ith(int32_t idx) {
output_reorder();
if (sampling.sampled == nullptr) {
return LLAMA_TOKEN_NULL;
}
try {
const int64_t row = output_resolve_row(idx);
GGML_ASSERT(row < (int64_t) sampling.sampled_size);
return sampling.sampled[row];
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled token id %d, reason: %s\n", __func__, idx, err.what());
return LLAMA_TOKEN_NULL;
}
}
float * llama_context::get_sampled_probs_ith(int32_t idx) {
output_reorder();
if (sampling.probs == nullptr) {
return nullptr;
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.probs_count.size() || sampling.probs_count[row] == 0) {
return nullptr;
}
return sampling.probs + row*model.vocab.n_tokens();
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled probs id %d, reason: %s\n", __func__, idx, err.what());
return nullptr;
}
}
float * llama_context::get_sampled_logits_ith(int32_t idx) {
output_reorder();
if (sampling.logits == nullptr) {
return nullptr;
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.logits_count.size() || sampling.logits_count[row] == 0) {
return nullptr;
}
return sampling.logits + row*model.vocab.n_tokens();
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled logits id %d, reason: %s\n", __func__, idx, err.what());
return nullptr;
}
}
const llama_token * llama_context::get_sampled_candidates_ith(int32_t idx) {
output_reorder();
try {
const int64_t row = output_resolve_row(idx);
if (sampling.candidates != nullptr &&
(size_t) row < sampling.candidates_count.size() &&
sampling.candidates_count[row] > 0) {
return sampling.candidates + row*model.vocab.n_tokens();
}
} catch (const std::exception & err) {
// fallback to full vocab list
}
return sampling.token_ids_full_vocab.data();
}
size_t llama_context::get_sampled_candidates_count(int32_t idx) {
output_reorder();
if (sampling.candidates == nullptr) {
return 0;
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.candidates_count.size()) {
return 0;
}
return sampling.candidates_count[row];
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled candidates count id %d, reason: %s\n", __func__, idx, err.what());
return 0;
}
}
size_t llama_context::get_sampled_logits_count(int32_t idx) {
output_reorder();
if (sampling.logits == nullptr) {
return model.vocab.n_tokens();
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.logits_count.size()) {
return 0;
}
return sampling.logits_count[row];
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled logits count id %d, reason: %s\n", __func__, idx, err.what());
return 0;
}
}
size_t llama_context::get_sampled_probs_count(int32_t idx) {
output_reorder();
if (sampling.probs == nullptr) {
return 0;
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.probs_count.size()) {
return 0;
}
return sampling.probs_count[row];
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled probs count id %d, reason: %s\n", __func__, idx, err.what());
return 0;
}
}
void llama_context::attach_threadpool(
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch) {
@@ -965,42 +768,6 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
}
bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
const bool can_offload =
sampler &&
sampler->iface->backend_init &&
sampler->iface->backend_apply &&
llama_sampler_chain_n(sampler) > 0;
if (sampler && can_offload) {
ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(model.dev_output());
auto * host_buft = ggml_backend_dev_host_buffer_type(model.dev_output());
if (host_buft) {
buft = host_buft;
}
sampler->iface->backend_init(sampler, buft);
sampling.samplers[seq_id] = sampler;
return true;
}
if (sampler && !can_offload) {
LLAMA_LOG_WARN("%s: sampler '%s' for seq_id = %d, cannot be offloaded to the backend\n", __func__, llama_sampler_name(sampler), seq_id);
sampling.samplers.erase(seq_id);
return false;
}
sampling.samplers.erase(seq_id);
return true;
}
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@@ -1141,7 +908,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
n_queued_tokens += n_tokens;
// reserve output buffer
if (output_reserve(n_tokens, batch_inp) < n_tokens) {
if (output_reserve(n_tokens) < n_tokens) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
return -2;
};
@@ -1195,10 +962,9 @@ int llama_context::encode(const llama_batch & batch_inp) {
{
// extract token embeddings
GGML_ASSERT(embd != nullptr);
const uint32_t n_embd_out = hparams.get_n_embd_out();
GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float));
GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
} break;
case LLAMA_POOLING_TYPE_MEAN:
case LLAMA_POOLING_TYPE_CLS:
@@ -1266,112 +1032,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
return 0;
}
static std::map<llama_seq_id, uint32_t> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) {
std::map<llama_seq_id, uint32_t> seq_to_row;
// how many output tokens we have seen so far for this ubatch.
uint32_t local = 0;
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
// skip tokens that are not output.
if (!ubatch.output[i]) {
continue;
}
const llama_seq_id seq_id = ubatch.seq_id[i][0];
// row_offset is the number of output tokens before this ubatch.
seq_to_row[seq_id] = row_offset + local;
++local;
}
return seq_to_row;
}
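// Worked example (hypothetical ubatch) for the helper above: with n_tokens = 4,
// output = {0, 1, 0, 1}, seq_id[1][0] = 0, seq_id[3][0] = 2 and row_offset = 5,
// the resulting map is { 0 -> 5, 2 -> 6 }, i.e. each sequence that produced an
// output token in this ubatch gets the global output row that token occupies.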
static void copy_tensor_async_ints(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
llama_token * sampled,
size_t sampled_size,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
ggml_backend_sched_t sched) {
if (sampled == nullptr) {
return;
}
for (const auto & [seq_id, tensor] : tensor_map) {
auto it = seq_to_row.find(seq_id);
if (it == seq_to_row.end()) {
continue;
}
const uint32_t row = it->second;
GGML_ASSERT(row < sampled_size);
GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
ggml_backend_tensor_get_async(backend, tensor, sampled + row, 0, sizeof(sampled[row]));
}
}
static void copy_tensor_async_floats(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
float * dst,
size_t stride,
std::vector<uint32_t> & counts,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
ggml_backend_sched_t sched) {
if (dst == nullptr) {
return;
}
for (const auto & [seq_id, tensor] : tensor_map) {
auto it = seq_to_row.find(seq_id);
if (it == seq_to_row.end()) {
continue;
}
const uint32_t row = it->second;
GGML_ASSERT(row < counts.size());
GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
float * row_ptr = dst + (size_t) row * stride;
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
// Update the actual number of logits/probabilities that were written for this row.
counts[row] = ggml_nelements(tensor);
}
}
static void copy_tensor_async_candidates(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
llama_token * dst,
size_t stride,
std::vector<uint32_t> & counts,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
ggml_backend_sched_t sched) {
if (dst == nullptr) {
return;
}
for (const auto & [seq_id, tensor] : tensor_map) {
auto it = seq_to_row.find(seq_id);
if (it == seq_to_row.end()) {
continue;
}
const uint32_t row = it->second;
GGML_ASSERT(row < counts.size());
GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
llama_token * row_ptr = dst + (size_t) row * stride;
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
// Update the actual number of candidates that were written.
counts[row] = ggml_nelements(tensor);
}
}
int llama_context::decode(const llama_batch & batch_inp) {
GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
@@ -1392,35 +1052,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_embd = hparams.n_embd_inp();
const bool output_all = false;
const bool has_samplers = !sampling.samplers.empty();
const uint32_t n_seq_max = cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max;
// TODO: avoid this workaround in the future
if (has_samplers && batch_inp.logits) {
std::vector<int32_t> seq_output_count(n_seq_max, 0);
for (int32_t i = 0; i < batch_inp.n_tokens; ++i) {
if (batch_inp.logits[i] == 0) {
continue;
}
const int ns = batch_inp.n_seq_id ? batch_inp.n_seq_id[i] : 1;
for (int32_t s = 0; s < ns; ++s) {
const llama_seq_id seq_id = batch_inp.seq_id ? batch_inp.seq_id[i][s] : 0;
seq_output_count[seq_id]++;
if (seq_output_count[seq_id] > 1) {
LLAMA_LOG_ERROR("%s: backend sampling requires at most one output token per sequence (seq_id %d had %d)\n",
__func__, seq_id, seq_output_count[seq_id]);
return -1;
}
}
}
}
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, n_seq_max, output_all)) {
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
return -1;
}
@@ -1501,7 +1134,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
// reserve output buffer
if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
if (output_reserve(n_outputs_all) < n_outputs_all) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
return -2;
};
@@ -1574,10 +1207,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
// extract logits
// For multi-sequence batches that mix backend samplers and CPU sampler
// this is currently inefficient as we copy all logits even for the
// backend sampled tokens.
if (logits && t_logits && n_outputs > 0) {
if (t_logits && n_outputs > 0) {
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
GGML_ASSERT(backend_res != nullptr);
GGML_ASSERT(logits != nullptr);
@@ -1592,7 +1222,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
// extract embeddings
if (embd && t_embd && n_outputs > 0) {
if (t_embd && n_outputs > 0) {
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
GGML_ASSERT(backend_embd != nullptr);
@@ -1601,13 +1231,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
{
// extract token embeddings
GGML_ASSERT(embd != nullptr);
const uint32_t n_embd_out = hparams.get_n_embd_out();
float * embd_out = embd + n_outputs_prev*n_embd_out;
float * embd_out = embd + n_outputs_prev*n_embd;
if (n_outputs) {
GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd_out <= (int64_t) embd_size);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd_out*sizeof(float));
GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float));
}
} break;
case LLAMA_POOLING_TYPE_MEAN:
@@ -1647,22 +1276,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
}
// This flag indicates whether a backend sampler has actually sampled a specific
// token, or if it has produced probabilities. If true, we can skip the normal copying of logits and embeddings.
const bool has_sampled = !res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty();
if (has_samplers && has_sampled) {
const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev);
const auto stride = n_vocab;
// async copy the sampling data from the backend to the host
copy_tensor_async_ints(res->t_sampled, sampling.sampled, sampling.sampled_size, seq_to_output_row, sched.get());
copy_tensor_async_floats (res->t_sampled_logits, sampling.logits, stride, sampling.logits_count, seq_to_output_row, sched.get());
copy_tensor_async_floats (res->t_sampled_probs, sampling.probs, stride, sampling.probs_count, seq_to_output_row, sched.get());
copy_tensor_async_candidates(res->t_candidates, sampling.candidates, stride, sampling.candidates_count, seq_to_output_row, sched.get());
}
n_outputs_prev += n_outputs;
} while (mctx->next());
@@ -1726,15 +1339,15 @@ int llama_context::decode(const llama_batch & batch_inp) {
// output
//
uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & batch) {
uint32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
const auto & vocab = model.vocab;
const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
const auto n_batch = cparams.n_batch;
const auto n_vocab = vocab.n_tokens();
const auto n_embd_out = hparams.get_n_embd_out();
const auto n_batch = cparams.n_batch;
const auto n_vocab = vocab.n_tokens();
const auto n_embd = hparams.n_embd;
bool has_logits = true;
bool has_embd = cparams.embeddings;
@@ -1745,53 +1358,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
has_embd = true;
}
// Check which sampling modes are needed for the current batch.
// TODO: avoid this branching by working with the worst-case
bool has_sampling = false;
bool cpu_logits = false;
if (batch.logits) {
for (int32_t i = 0; i < batch.n_tokens; i++) {
if (!batch.logits[i]) {
continue;
}
for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
llama_seq_id seq_id = batch.seq_id[i][j];
if (sampling.samplers.find(seq_id) != sampling.samplers.end()) {
has_sampling = true;
} else {
cpu_logits = true;
}
}
}
} else {
// When batch.logits is nullptr (when loading state with a dummy batch),
// allocate CPU logits.
cpu_logits = true;
}
size_t backend_float_count = 0;
size_t backend_token_count = 0;
// Allocate CPU logits buffer only if needed by sequences in this batch
logits_size = (has_logits && cpu_logits) ? n_vocab*n_outputs_max : 0;
embd_size = has_embd ? n_embd_out*n_outputs_max : 0;
// TODO: avoid this branching by working with the worst-case
if (!has_sampling) {
sampling.logits_size = 0;
sampling.probs_size = 0;
sampling.sampled_size = 0;
sampling.candidates_size = 0;
} else {
sampling.logits_size = n_vocab*n_outputs_max;
sampling.probs_size = n_vocab*n_outputs_max;
sampling.sampled_size = n_outputs_max;
sampling.candidates_size = n_vocab*n_outputs_max;
backend_float_count = sampling.logits_size + sampling.probs_size;
backend_token_count = sampling.sampled_size + sampling.candidates_size;
}
logits_size = has_logits ? n_vocab*n_outputs_max : 0;
embd_size = has_embd ? n_embd*n_outputs_max : 0;
if (output_ids.empty()) {
// init, never resized afterwards
@@ -1799,9 +1367,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
}
const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
const size_t new_size =
(logits_size + embd_size + backend_float_count) * sizeof(float) +
( backend_token_count) * sizeof(llama_token);
const size_t new_size = (logits_size + embd_size) * sizeof(float);
// alloc only when more than the current capacity is required
// TODO: also consider shrinking the buffer
@@ -1809,11 +1375,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
if (buf_output) {
#ifndef NDEBUG
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
LLAMA_LOG_DEBUG("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
synchronize();
// TODO: not needed?
buf_output = nullptr;
logits = nullptr;
embd = nullptr;
@@ -1835,49 +1399,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get());
logits = nullptr;
embd = nullptr;
size_t offset = 0;
uint8_t * base = (uint8_t *) output_base;
logits = (has_logits && cpu_logits) ? output_base : nullptr;
offset += logits_size * sizeof(float);
embd = has_embd ? (float *) (base + offset) : nullptr;
offset += embd_size * sizeof(float);
sampling.logits = nullptr;
sampling.probs = nullptr;
sampling.sampled = nullptr;
sampling.candidates = nullptr;
if (has_sampling) {
sampling.logits = (float *) (base + offset);
offset += sampling.logits_size * sizeof(float);
sampling.probs = (float *) (base + offset);
offset += sampling.probs_size * sizeof(float);
sampling.sampled = (llama_token *) (base + offset);
offset += sampling.sampled_size * sizeof(llama_token);
sampling.candidates = (llama_token *) (base + offset);
offset += sampling.candidates_size * sizeof(llama_token);
// The count vectors keep track of the actual number of logits/probs/candidates
// copied from the backend for each output row.
sampling.logits_count.resize(n_outputs_max);
sampling.probs_count.resize(n_outputs_max);
sampling.candidates_count.resize(n_outputs_max);
std::fill(sampling.logits_count.begin(), sampling.logits_count.end(), 0);
std::fill(sampling.probs_count.begin(), sampling.probs_count.end(), 0);
std::fill(sampling.candidates_count.begin(), sampling.candidates_count.end(), 0);
std::fill_n(sampling.sampled, sampling.sampled_size, LLAMA_TOKEN_NULL);
}
logits = has_logits ? output_base : nullptr;
embd = has_embd ? output_base + logits_size : nullptr;
// set all ids as invalid (negative)
std::fill(output_ids.begin(), output_ids.end(), -1);
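// A minimal standalone sketch (hypothetical sizes, plain malloc instead of a
// ggml backend buffer, int32_t standing in for llama_token) of the offset
// arithmetic used above to carve the CPU logits, embedding and backend-sampling
// regions out of one contiguous allocation: float regions first, token ids last.
#include <cstddef>
#include <cstdint>
#include <cstdlib>

static void carve_output_buffers_sketch() {
    const size_t n_vocab = 8, n_embd = 4, n_outputs_max = 2; // hypothetical sizes

    const size_t logits_size  = n_vocab * n_outputs_max; // floats
    const size_t embd_size    = n_embd  * n_outputs_max; // floats
    const size_t sampled_size = n_outputs_max;           // token ids

    const size_t new_size = (logits_size + embd_size) * sizeof(float) + sampled_size * sizeof(int32_t);
    uint8_t * base = (uint8_t *) std::malloc(new_size);

    size_t offset = 0;
    float   * logits  = (float   *) (base + offset); offset += logits_size  * sizeof(float);
    float   * embd    = (float   *) (base + offset); offset += embd_size    * sizeof(float);
    int32_t * sampled = (int32_t *) (base + offset); offset += sampled_size * sizeof(int32_t);

    (void) logits; (void) embd; (void) sampled;
    std::free(base);
}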
@@ -1906,40 +1429,6 @@ void llama_context::output_reorder() {
std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
}
}
if (sampling.logits && sampling.logits_size > 0) {
for (uint64_t k = 0; k < n_vocab; ++k) {
std::swap(sampling.logits[i0*n_vocab + k], sampling.logits[i1*n_vocab + k]);
}
}
if (sampling.probs && sampling.probs_size > 0) {
for (uint64_t k = 0; k < n_vocab; ++k) {
std::swap(sampling.probs[i0*n_vocab + k], sampling.probs[i1*n_vocab + k]);
}
}
if (sampling.candidates && sampling.candidates_size > 0) {
for (uint64_t k = 0; k < n_vocab; ++k) {
std::swap(sampling.candidates[i0*n_vocab + k], sampling.candidates[i1*n_vocab + k]);
}
}
if (sampling.sampled && sampling.sampled_size > 0) {
std::swap(sampling.sampled[i0], sampling.sampled[i1]);
}
if (!sampling.logits_count.empty()) {
std::swap(sampling.logits_count[i0], sampling.logits_count[i1]);
}
if (!sampling.probs_count.empty()) {
std::swap(sampling.probs_count[i0], sampling.probs_count[i1]);
}
if (!sampling.candidates_count.empty()) {
std::swap(sampling.candidates_count[i0], sampling.candidates_count[i1]);
}
}
output_swaps.clear();
@@ -1953,9 +1442,7 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
if (model.arch == LLM_ARCH_QWEN3NEXT) {
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
}
uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
res += model.n_lora_nodes;
return res;
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
}
llm_graph_result * llama_context::get_gf_res_reserve() const {
@@ -1969,7 +1456,7 @@ ggml_cgraph * llama_context::graph_reserve(
if (n_tokens % n_seqs != 0) {
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
n_outputs = std::max(n_outputs, n_tokens);
n_outputs = std::min(n_outputs, n_tokens);
LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
}
@@ -1988,15 +1475,6 @@ ggml_cgraph * llama_context::graph_reserve(
llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
// set one output token per sequence in order to activate all backend samplers
std::vector<llama_seq_id> seq_ids(n_seqs);
for (uint32_t i = 0; i < n_seqs; ++i) {
seq_ids[i] = i;
ubatch.n_seq_id[i] = 1;
ubatch.seq_id[i] = &seq_ids[i];
ubatch.output[i] = true;
}
auto * res = gf_res_reserve.get();
const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
@@ -2027,7 +1505,7 @@ llm_graph_params llama_context::graph_params(
llm_graph_result * res,
const llama_ubatch & ubatch,
const llama_memory_context_i * mctx,
llm_graph_type gtype) const {
llm_graph_type gtype) const {
return {
/*.arch =*/ model.arch,
/*.hparams =*/ model.hparams,
@@ -2040,7 +1518,6 @@ llm_graph_params llama_context::graph_params(
/*.loras =*/ &loras,
/*.mctx =*/ mctx,
/*.cross =*/ &cross,
/*.samplers =*/ sampling.samplers,
/*.n_outputs =*/ n_outputs,
/*.cb =*/ graph_get_cb(),
/*.res =*/ res,
@@ -2093,7 +1570,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
if (ubatch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
const auto & dev_layer = model.dev_layer(il);
@@ -2496,9 +1973,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
}
}
// TODO: handle sampling buffers and samplers state ?
// https://github.com/ggml-org/llama.cpp/pull/17004
if (memory != nullptr) {
LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
memory->state_write(io);
@@ -2531,10 +2005,7 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
auto n_outputs = this->n_outputs;
io.read_to(&n_outputs, sizeof(n_outputs));
// Create a dummy batch for state loading.
llama_batch dummy_batch = {};
dummy_batch.n_tokens = 0;
if (n_outputs > output_reserve(n_outputs, dummy_batch)) {
if (n_outputs > output_reserve(n_outputs)) {
throw std::runtime_error("could not reserve outputs");
}
@@ -2588,9 +2059,6 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
}
}
// TODO: handle sampling buffers and samplers state ?
// https://github.com/ggml-org/llama.cpp/pull/17004
if (memory) {
LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);
@@ -2779,7 +2247,7 @@ void llama_context::opt_epoch_iter(
}
// reserve output buffer
if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
if (output_reserve(n_outputs_all) < n_outputs_all) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
GGML_ABORT("TODO: handle this error");
};
@@ -2924,8 +2392,6 @@ llama_context_params llama_context_default_params() {
/*.op_offload =*/ true,
/*.swa_full =*/ true,
/*.kv_unified =*/ false,
/*.sampler =*/ nullptr,
/*.n_sampler =*/ 0,
};
return result;
@@ -3085,15 +2551,7 @@ float * llama_get_logits(llama_context * ctx) {
float * llama_get_logits_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
float * res = nullptr;
res = ctx->get_sampled_logits_ith(i);
if (!res) {
res = ctx->get_logits_ith(i);
}
return res;
return ctx->get_logits_ith(i);
}
float * llama_get_embeddings(llama_context * ctx) {
@@ -3114,52 +2572,6 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
return ctx->get_embeddings_seq(seq_id);
}
bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
return ctx->set_sampler(seq_id, smpl);
}
llama_token llama_get_sampled_token_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return ctx->get_sampled_token_ith(i);
}
float * llama_get_sampled_probs_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return ctx->get_sampled_probs_ith(i);
}
float * llama_get_sampled_logits_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return ctx->get_sampled_logits_ith(i);
}
llama_token * llama_get_sampled_candidates_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return const_cast<llama_token *>(ctx->get_sampled_candidates_ith(i));
}
uint32_t llama_get_sampled_candidates_count_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return static_cast<uint32_t>(ctx->get_sampled_candidates_count(i));
}
uint32_t llama_get_sampled_logits_count_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return static_cast<uint32_t>(ctx->get_sampled_logits_count(i));
}
uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
}
// llama adapter API
int32_t llama_set_adapter_lora(

View File

@@ -70,18 +70,6 @@ struct llama_context {
float * get_embeddings_ith(int32_t i);
float * get_embeddings_seq(llama_seq_id seq_id);
llama_token * get_sampled_tokens() const;
llama_token get_sampled_token_ith(int32_t idx);
float * get_sampled_logits_ith(int32_t idx);
size_t get_sampled_logits_count(int32_t idx);
float * get_sampled_probs_ith(int32_t idx);
size_t get_sampled_probs_count(int32_t idx);
const llama_token * get_sampled_candidates_ith(int32_t idx);
size_t get_sampled_candidates_count(int32_t idx);
void attach_threadpool(
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch);
@@ -204,13 +192,10 @@ private:
// Make sure enough space is available for outputs.
// Returns max number of outputs for which space was reserved.
uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch);
uint32_t output_reserve(int32_t n_outputs);
void output_reorder();
// map the batch token index `i` to its row in the output buffer
int64_t output_resolve_row(int32_t i) const;
//
// graph
//
@@ -228,8 +213,6 @@ public:
ggml_cgraph * graph_reserve(
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
private:
llm_graph_params graph_params(
llm_graph_result * res,
@@ -269,31 +252,6 @@ private:
size_t embd_size = 0; // capacity (of floats) for embeddings
float * embd = nullptr;
// TODO: simplify
struct sampling_info {
std::map<llama_seq_id, llama_sampler *> samplers;
float * logits = nullptr;
size_t logits_size = 0;
llama_token * sampled = nullptr;
size_t sampled_size = 0;
float * probs = nullptr;
size_t probs_size = 0;
llama_token * candidates = nullptr;
size_t candidates_size = 0;
std::vector<uint32_t> logits_count;
std::vector<uint32_t> probs_count;
std::vector<uint32_t> candidates_count;
std::vector<llama_token> token_ids_full_vocab;
};
sampling_info sampling;
// sequence embeddings output (map of [n_embd] vectors)
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
std::map<llama_seq_id, std::vector<float>> embd_seq;

View File

@@ -369,44 +369,6 @@ static void print_rule(
fprintf(file, "\n");
}
//
// Regex utilities
//
size_t llama_grammar_trigger_pattern::find(const std::string & input) const {
auto find_start_pos = [](const std::smatch & match) {
// get from the first matched capturing group to the end of the string
size_t start = std::string::npos;
for (auto i = 1u; i < match.size(); i++) {
if (match.length(i) > 0) {
start = match.position(i);
break;
}
}
if (start == std::string::npos) {
start = match.position(0);
}
return start;
};
if (!pattern.empty() && pattern.front() == '^' && pattern.back() == '$') {
// match against the entire input
std::smatch match;
if (std::regex_match(input, match, regex)) {
return find_start_pos(match);
}
}
// search anywhere
std::smatch match;
if (std::regex_search(input, match, regex)) {
return find_start_pos(match);
}
return std::string::npos;
}
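// A small standalone illustration (hypothetical pattern and input) of the
// behaviour encoded above: fully anchored patterns ("^...$") are tested against
// the whole buffered text with std::regex_match, anything else may fire
// anywhere via std::regex_search, and the reported start position is taken from
// the first non-empty capturing group when one exists (falling back to the whole match).
#include <cstdio>
#include <regex>
#include <string>

int main() {
    const std::string input = "some text <tool_call>{\"a\":1}";
    const std::regex  re("(<tool_call>)"); // unanchored -> regex_search path
    std::smatch m;
    if (std::regex_search(input, m, re)) {
        std::printf("trigger starts at %ld\n", (long) m.position(1));
    }
    return 0;
}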
//
// implementation
//
@@ -1359,10 +1321,21 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
grammar.trigger_buffer += piece;
std::smatch match;
for (const auto & trigger_pattern : grammar.trigger_patterns) {
auto start = trigger_pattern.find(grammar.trigger_buffer);
if (start != std::string::npos) {
if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
grammar.awaiting_trigger = false;
// get from the first matched capturing group to the end of the string
size_t start = std::string::npos;
for (auto i = 1u; i < match.size(); i++) {
if (match.length(i) > 0) {
start = match.position(i);
break;
}
}
if (start == std::string::npos) {
start = match.position(0);
}
// replay tokens that overlap with [start, end)
for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {

View File

@@ -130,8 +130,6 @@ struct llama_grammar_parser {
struct llama_grammar_trigger_pattern {
std::string pattern;
std::regex regex;
size_t find(const std::string & input) const;
};
struct llama_grammar {

View File

@@ -12,7 +12,6 @@
#include <cassert>
#include <cmath>
#include <cstring>
#include <unordered_set>
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
if (ubatch->token) {
@@ -33,7 +32,7 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
bool res = true;
res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens);
res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens);
return res;
}
@@ -63,7 +62,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
bool res = true;
res &= pos->ne[0] == params.ubatch.n_tokens*n_pos_per_embd;
res &= pos->ne[0] == params.ubatch.n_tokens;
return res;
}
@@ -522,43 +521,6 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
return res;
}
void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) {
// set the inputs only for the active samplers in the current ubatch
std::unordered_set<llama_seq_id> active_samplers;
for (uint32_t i = 0; i < ubatch->n_tokens; i++) {
if (ubatch->output[i]) {
llama_seq_id seq_id = ubatch->seq_id[i][0];
active_samplers.insert(seq_id);
}
}
for (auto seq_id : active_samplers) {
if (samplers.find(seq_id) == samplers.end()) {
continue;
}
auto & sampler = samplers[seq_id];
if (sampler->iface->backend_set_input) {
sampler->iface->backend_set_input(sampler);
}
}
}
bool llm_graph_input_sampling::can_reuse(const llm_graph_params & params) {
if (samplers.size() != params.samplers.size()) {
return false;
}
for (const auto & [seq_id, sampler] : params.samplers) {
if (samplers[seq_id] != sampler) {
return false;
}
}
return true;
}
//
// llm_graph_result
//
@@ -579,10 +541,6 @@ void llm_graph_result::reset() {
t_logits = nullptr;
t_embd = nullptr;
t_embd_pooled = nullptr;
t_sampled.clear();
t_sampled_probs.clear();
t_sampled_logits.clear();
t_candidates.clear();
params = {};
@@ -607,38 +565,6 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
}
}
void llm_graph_result::set_outputs() {
if (t_logits != nullptr) {
ggml_set_output(t_logits);
}
if (t_embd != nullptr) {
ggml_set_output(t_embd);
}
if (t_embd_pooled != nullptr) {
ggml_set_output(t_embd_pooled);
}
for (auto & [seq_id, t] : t_sampled) {
if (t != nullptr) {
ggml_set_output(t);
}
}
for (auto & [seq_id, t] : t_sampled_probs) {
if (t != nullptr) {
ggml_set_output(t);
}
}
for (auto & [seq_id, t] : t_sampled_logits) {
if (t != nullptr) {
ggml_set_output(t);
}
}
for (auto & [seq_id, t] : t_candidates) {
if (t != nullptr) {
ggml_set_output(t);
}
}
}
bool llm_graph_result::can_reuse(const llm_graph_params & params) {
if (!this->params.allow_reuse(params)) {
if (debug > 1) {
@@ -720,7 +646,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
loras (params.loras),
mctx (params.mctx),
cross (params.cross),
samplers (params.samplers),
cb_func (params.cb),
res (params.res),
ctx0 (res->get_ctx()),
@@ -1326,10 +1251,6 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
res->add_input(std::move(inp));
// make sure the produced embeddings are immediately materialized in the ggml graph
// ref: https://github.com/ggml-org/llama.cpp/pull/18599
ggml_build_forward_expand(gf, cur);
return cur;
}
@@ -1913,10 +1834,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask);
ggml_set_name(inp->self_kq_mask, "self_kq_mask");
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
}
{
@@ -1929,10 +1848,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask_swa);
ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
}
return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
@@ -2071,18 +1988,14 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
void llm_graph_context::build_dense_out(
ggml_tensor * dense_2,
ggml_tensor * dense_3) const {
if (!cparams.embeddings || !(dense_2 || dense_3)) {
if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
return;
}
ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
if (dense_2) {
cur = ggml_mul_mat(ctx0, dense_2, cur);
}
if (dense_3) {
cur = ggml_mul_mat(ctx0, dense_3, cur);
}
cur = ggml_mul_mat(ctx0, dense_2, cur);
cur = ggml_mul_mat(ctx0, dense_3, cur);
cb(cur, "result_embd_pooled", -1);
res->t_embd_pooled = cur;
ggml_build_forward_expand(gf, cur);
@@ -2173,87 +2086,6 @@ void llm_graph_context::build_pooling(
ggml_build_forward_expand(gf, cur);
}
void llm_graph_context::build_sampling() const {
if (samplers.empty() || !res->t_logits) {
return;
}
auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
res->add_input(std::move(inp_sampling));
std::map<llama_seq_id, int32_t> seq_to_logit_row;
int32_t logit_row_idx = 0;
for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
if (ubatch.output[i]) {
llama_seq_id seq_id = ubatch.seq_id[i][0];
seq_to_logit_row[seq_id] = logit_row_idx;
logit_row_idx++;
}
}
// res->t_logits will contain logits for all tokens that want the logits calculated (logits=1 or output=1)
GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
// add a dummy row of logits
// this trick makes the graph static, regardless of which samplers are activated
// this is important in order to minimize graph reallocations
// TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550)
ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
for (const auto & [seq_id, sampler] : samplers) {
const auto it = seq_to_logit_row.find(seq_id);
// inactive samplers always work on the first row
const auto row_idx = seq_to_logit_row.find(seq_id) != seq_to_logit_row.end() ? it->second : 0;
ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
struct llama_sampler_data data = {
/*.logits =*/ logits_seq,
/*.probs =*/ nullptr,
/*.sampled =*/ nullptr,
/*.candidates =*/ nullptr,
};
assert(sampler->iface->backend_apply);
sampler->iface->backend_apply(sampler, ctx0, gf, &data);
if (data.sampled != nullptr) {
res->t_sampled[seq_id] = data.sampled;
ggml_build_forward_expand(gf, data.sampled);
}
if (data.probs != nullptr) {
res->t_sampled_probs[seq_id] = data.probs;
ggml_build_forward_expand(gf, data.probs);
}
if (data.logits != nullptr) {
res->t_sampled_logits[seq_id] = data.logits;
ggml_build_forward_expand(gf, data.logits);
}
if (data.candidates != nullptr) {
res->t_candidates[seq_id] = data.candidates;
ggml_build_forward_expand(gf, data.candidates);
}
}
// TODO: Call llama_sampler_accept_ggml after all samplers have been applied.
/*
for (const auto & [seq_id, sampler] : samplers) {
if (auto it = res->t_sampled.find(seq_id); it != res->t_sampled.end()) {
ggml_tensor * selected_token = it->second;
if (selected_token != nullptr) {
llama_sampler_accept_ggml(sampler, ctx0, gf, selected_token);
}
}
}
*/
}
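// A minimal standalone sketch (hypothetical sizes, assuming the vendored ggml.h
// is on the include path) of the "dummy row" trick used above: padding the
// logits tensor with one extra row keeps the sampling graph topology identical
// whether or not a sequence produced an output row in this batch, so inactive
// samplers can always be pointed at a valid row and the graph does not need to
// be re-reserved.
#include "ggml.h"

static void dummy_row_sketch(int64_t n_vocab, int64_t n_outputs) {
    ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // only build tensor metadata, no data buffers
    };
    ggml_context * ctx = ggml_init(ip);

    ggml_tensor * logits = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_vocab, n_outputs);
    // always n_outputs + 1 rows, even when n_outputs == 0
    ggml_tensor * padded = ggml_pad(ctx, logits, 0, 1, 0, 0);
    // a sampler with no output row in this batch can still view row 0 safely
    ggml_tensor * row0 = ggml_view_1d(ctx, padded, padded->ne[0], 0*padded->nb[1]);
    (void) row0;

    ggml_free(ctx);
}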
int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
// TODO move to hparams if a T5 variant appears that uses a different value
const int64_t max_distance = 128;

View File

@@ -10,7 +10,6 @@
#include <memory>
#include <set>
#include <functional>
#include <map>
struct ggml_cgraph;
struct ggml_context;
@@ -397,18 +396,6 @@ public:
const llama_memory_hybrid_context * mctx;
};
class llm_graph_input_sampling : public llm_graph_input_i {
public:
llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
samplers(std::move(samplers)) { }
virtual ~llm_graph_input_sampling() = default;
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
std::map<llama_seq_id, llama_sampler *> samplers;
};
//
// llm_graph_result
//
@@ -442,23 +429,6 @@ struct llm_graph_params {
const llama_memory_context_i * mctx;
const llama_cross * cross;
std::map<llama_seq_id, llama_sampler *> samplers;
static bool samplers_equal(
const std::map<llama_seq_id, llama_sampler *> & lhs,
const std::map<llama_seq_id, llama_sampler *> & rhs) {
if (lhs.size() != rhs.size()) {
return false;
}
for (const auto & [seq_id, sampler] : lhs) {
auto it = rhs.find(seq_id);
if (it == rhs.end() || it->second != sampler) {
return false;
}
}
return true;
}
uint32_t n_outputs;
llm_graph_cb cb;
@@ -498,36 +468,15 @@ struct llm_graph_params {
return false;
}
if (n_outputs != other.n_outputs) {
return false;
}
if (!samplers_equal(samplers, other.samplers)) {
return false;
}
if (samplers.size() > 0) {
if (!ubatch.data || !other.ubatch.data) {
return false;
}
// check that the outputs are the same for all samplers
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
if (ubatch.output[i] != other.ubatch.output[i] ||
ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
return false;
}
}
}
return
cparams.embeddings == other.cparams.embeddings &&
cparams.causal_attn == other.cparams.causal_attn &&
arch == other.arch &&
gtype == other.gtype &&
cvec == other.cvec &&
loras == other.loras &&
cross == other.cross;
arch == other.arch &&
gtype == other.gtype &&
cvec == other.cvec &&
loras == other.loras &&
cross == other.cross &&
n_outputs == other.n_outputs;
}
};
@@ -550,7 +499,6 @@ public:
void reset();
void set_inputs(const llama_ubatch * ubatch);
void set_outputs();
// try to update the existing graph result using the new graph parameters in order to reuse it
// this can only be done if we determine that the resulting graph using the new graph parameters
@@ -569,11 +517,6 @@ public:
ggml_tensor * t_embd = nullptr;
ggml_tensor * t_embd_pooled = nullptr;
std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
std::map<llama_seq_id, ggml_tensor*> t_candidates;
std::map<llama_seq_id, ggml_tensor*> t_sampled;
std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
std::vector<llm_graph_input_ptr> inputs;
ggml_context_ptr ctx_compute;
@@ -649,8 +592,6 @@ struct llm_graph_context {
const llama_memory_context_i * mctx;
const llama_cross * cross;
std::map<llama_seq_id, llama_sampler *> samplers;
const llm_graph_cb & cb_func;
llm_graph_result * res;
@@ -891,12 +832,6 @@ struct llm_graph_context {
ggml_tensor * cls_out,
ggml_tensor * cls_out_b) const;
//
// sampling (backend sampling)
//
void build_sampling() const;
//
// dense (out)
//

View File

@@ -72,10 +72,6 @@ uint32_t llama_hparams::n_embd_inp() const {
return n_embd_inp;
}
uint32_t llama_hparams::get_n_embd_out() const {
return n_embd_out > 0 ? n_embd_out : n_embd;
}
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il);

View File

@@ -107,9 +107,9 @@ struct llama_hparams {
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
float rope_freq_base_train_swa = 10000.0f;
float rope_freq_base_train_swa;
float rope_freq_scale_train;
float rope_freq_scale_train_swa = 1.0f;
float rope_freq_scale_train_swa;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f;
@@ -125,11 +125,10 @@ struct llama_hparams {
llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
// the size of the sliding window (0 - no SWA)
uint32_t n_swa = 0;
// if swa_layers[il] == 1, then layer il is SWA
// if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
// if swa_layers[il] == true, then layer il is SWA
// if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
// by default, all layers are dense
// note: using uint32_t type for compatibility reason
std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
// for State Space Models
uint32_t ssm_d_conv = 0;
@@ -164,9 +163,6 @@ struct llama_hparams {
// for Classifiers
uint32_t n_cls_out = 1;
// output embedding dimension (0 = use n_embd)
uint32_t n_embd_out = 0;
// llama4 smallthinker
uint32_t n_moe_layer_step = 0;
uint32_t n_no_rope_layer_step = 4;
@@ -239,9 +235,6 @@ struct llama_hparams {
// dimension of main + auxiliary input embeddings
uint32_t n_embd_inp() const;
// dimension of output embeddings
uint32_t get_n_embd_out() const;
// dimension of key embeddings across all k-v heads
uint32_t n_embd_k_gqa(uint32_t il = 0) const;

View File

@@ -305,7 +305,7 @@ public:
bool do_shift,
stream_copy_info sc_info);
// used to create a batch processing context from a batch
// used to create a batch procesing context from a batch
llama_kv_cache_context(
llama_kv_cache * kv,
slot_info_vec_t sinfos,

View File

@@ -13,10 +13,9 @@
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#include <fcntl.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
@@ -75,7 +74,7 @@ struct llama_file::impl {
return ret;
}
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
impl(const char * fname, const char * mode) {
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -110,7 +109,7 @@ struct llama_file::impl {
}
}
void read_raw(void * ptr, size_t len) {
void read_raw(void * ptr, size_t len) const {
size_t bytes_read = 0;
while (bytes_read < len) {
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
@@ -127,7 +126,7 @@ struct llama_file::impl {
}
}
uint32_t read_u32() {
uint32_t read_u32() const {
uint32_t val;
read_raw(&val, sizeof(val));
return val;
@@ -154,55 +153,16 @@ struct llama_file::impl {
write_raw(&val, sizeof(val));
}
bool has_direct_io() const {
return true;
}
~impl() {
if (fp) {
std::fclose(fp);
}
}
#else
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
#ifdef __linux__
// Try unbuffered I/O for read only
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
if (init_fd()) {
return;
}
LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
fname, strerror(errno));
}
#endif
init_fp(mode);
}
#ifdef __linux__
bool init_fd() {
fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
if (fd != -1) {
struct stat file_stats{};
fstat(fd, &file_stats);
size = file_stats.st_size;
alignment = file_stats.st_blksize;
off_t ret = lseek(fd, 0, SEEK_SET);
if (ret == -1) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
return true;
}
return false;
}
#endif
void init_fp(const char * mode) {
fp = ggml_fopen(fname.c_str(), mode);
impl(const char * fname, const char * mode) {
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}
seek(0, SEEK_END);
size = tell();
@@ -210,118 +170,46 @@ struct llama_file::impl {
}
size_t tell() const {
if (fd == -1) {
long ret = std::ftell(fp);
if (ret == -1) {
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
}
return (size_t) ret;
// TODO: this ifdef is never true?
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
if (ret == -1) {
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
}
off_t pos = lseek(fd, 0, SEEK_CUR);
if (pos == -1) {
throw std::runtime_error(format("lseek error: %s", strerror(errno)));
}
return (size_t) pos;
return (size_t) ret;
}
void seek(size_t offset, int whence) const {
off_t ret = 0;
if (fd == -1) {
ret = std::fseek(fp, (long) offset, whence);
} else {
ret = lseek(fd, offset, whence);
}
if (ret == -1) {
// TODO: this ifdef is never true?
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
if (ret != 0) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
}
void read_raw_unsafe(void * ptr, size_t len) {
void read_raw(void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
if (fd == -1) {
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error("unexpectedly reached end of file");
}
} else {
size_t bytes_read = 0;
while (bytes_read < len) {
const size_t to_read = len - bytes_read;
ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);
if (ret == -1) {
if (errno == EINTR) {
continue; // Interrupted by signal, retry
}
// Fallback to std::fread in case the DMA controller cannot access the buffer
if (errno == EFAULT) {
auto curr_off = tell();
close(fd);
fd = -1;
alignment = 1;
init_fp("rb");
seek(curr_off, SEEK_SET);
read_raw_unsafe(ptr, len);
return;
}
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret == 0) {
// EOF: allow if this read was only pulling alignment padding past file end
off_t pos = lseek(fd, 0, SEEK_CUR);
if (pos != -1 && (size_t) pos == size) {
std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
return;
}
throw std::runtime_error("unexpectedly reached end of file");
}
bytes_read += (size_t) ret;
}
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error("unexpectedly reached end of file");
}
}
void read_aligned_chunk(void * dest, size_t size) {
size_t offset = tell();
off_t aligned_offset = offset & ~(alignment - 1);
off_t offset_from_alignment = offset - aligned_offset;
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
void * raw_buffer = nullptr;
int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
if (ret != 0) {
throw std::runtime_error(format("posix_memalign failed with error %d", ret));
}
struct aligned_buffer_deleter {
void operator()(void * p) const { free(p); }
};
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
seek(aligned_offset, SEEK_SET);
read_raw_unsafe(buffer.get(), bytes_to_read);
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
}
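// A minimal sketch of the alignment arithmetic used by read_aligned_chunk()
// above: O_DIRECT reads must start and end on multiples of the device block
// size, so the requested [offset, offset + size) range is widened to aligned
// boundaries and the leading `skip` bytes are discarded after the read
// (alignment is assumed to be a power of two, e.g. st_blksize == 4096).
#include <cstddef>

struct aligned_range {
    size_t aligned_offset; // requested offset rounded down to the alignment
    size_t skip;           // bytes between aligned_offset and the requested offset
    size_t bytes_to_read;  // widened length, ends on an aligned boundary
};

static aligned_range widen_for_direct_io(size_t offset, size_t size, size_t alignment) {
    aligned_range r;
    r.aligned_offset = offset & ~(alignment - 1);
    r.skip           = offset - r.aligned_offset;
    r.bytes_to_read  = (r.skip + size + alignment - 1) & ~(alignment - 1);
    return r;
}

// e.g. widen_for_direct_io(5000, 100, 4096) -> { 4096, 904, 4096 }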
void read_raw(void * ptr, size_t len) {
if (has_direct_io()) {
read_aligned_chunk(ptr, len);
} else {
read_raw_unsafe(ptr, len);
}
}
uint32_t read_u32() {
uint32_t read_u32() const {
uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
@@ -342,41 +230,23 @@ struct llama_file::impl {
write_raw(&val, sizeof(val));
}
bool has_direct_io() const {
return fd != -1 && alignment > 1;
}
~impl() {
if (fd != -1) {
close(fd);
} else {
if (fp) {
std::fclose(fp);
}
}
int fd = -1;
std::string fname;
#endif
size_t read_alignment() const {
return alignment;
}
size_t alignment = 1;
FILE * fp{};
size_t size{};
FILE * fp;
size_t size;
};
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
llama_file::~llama_file() = default;
size_t llama_file::tell() const { return pimpl->tell(); }
size_t llama_file::size() const { return pimpl->size; }
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
int llama_file::file_id() const {
#ifdef _WIN32
return _fileno(pimpl->fp);
@@ -390,14 +260,9 @@ int llama_file::file_id() const {
}
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
#ifdef _WIN32
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
#else
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
#endif
void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }

View File

@@ -3,7 +3,6 @@
#include <cstdint>
#include <memory>
#include <vector>
#include <cstdio>
struct llama_file;
struct llama_mmap;
@@ -14,7 +13,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
struct llama_file {
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
llama_file(const char * fname, const char * mode);
~llama_file();
size_t tell() const;
@@ -24,16 +23,12 @@ struct llama_file {
void seek(size_t offset, int whence) const;
void read_raw(void * ptr, size_t len);
void read_raw_unsafe(void * ptr, size_t len);
void read_aligned_chunk(void * dest, size_t size);
uint32_t read_u32();
void read_raw(void * ptr, size_t len) const;
uint32_t read_u32() const;
void write_raw(const void * ptr, size_t len) const;
void write_u32(uint32_t val) const;
size_t read_alignment() const;
bool has_direct_io() const;
private:
struct impl;
std::unique_ptr<impl> pimpl;

View File

@@ -462,29 +462,6 @@ namespace GGUFMeta {
return get_key_or_arr(llm_kv(kid), result, n, required);
}
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
const std::string key = llm_kv(kid);
const int id = gguf_find_key(meta.get(), key.c_str());
if (id < 0) {
if (required) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
}
return false;
}
// throw an error if the type is an array
if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
if (required) {
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
}
return false;
}
return get_key(key, result, required);
}
// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
@@ -495,7 +472,6 @@ llama_model_loader::llama_model_loader(
const std::string & fname,
std::vector<std::string> & splits,
bool use_mmap,
bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
@@ -528,17 +504,9 @@ llama_model_loader::llama_model_loader(
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
files.emplace_back(new llama_file(fname.c_str(), "rb"));
contexts.emplace_back(ctx);
use_direct_io = use_direct_io && files.back()->has_direct_io();
// Disable mmap in case Direct I/O is enabled and available
if (use_direct_io && use_mmap) {
use_mmap = false;
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
}
// Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights.
@@ -604,7 +572,7 @@ llama_model_loader::llama_model_loader(
}
}
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
files.emplace_back(new llama_file(fname_split, "rb"));
contexts.emplace_back(ctx);
// Save tensors data offset info of the shard.
@@ -748,7 +716,6 @@ llama_model_loader::llama_model_loader(
}
this->use_mmap = use_mmap;
this->use_direct_io = use_direct_io;
this->check_tensors = check_tensors;
this->no_alloc = no_alloc;
}
@@ -968,15 +935,7 @@ bool llama_model_loader::load_all_data(
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
// NVMe raid configurations might require more / larger buffers.
constexpr size_t n_buffers = 4;
size_t alignment = 1;
for (const auto & file : files) {
alignment = std::max(file->read_alignment(), alignment);
}
// Buffer size: balance between memory usage and I/O efficiency
// 64MB works well for NVMe drives
const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
std::vector<ggml_backend_buffer_t> host_buffers;
std::vector<ggml_backend_event_t> events;
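The buffer sizing above hinges on the largest read alignment across the input files: with direct I/O in play the staging buffers grow to 64 MiB plus two alignments of slack (so an aligned destination pointer and an aligned read both fit), otherwise a plain 1 MiB pinned buffer is used. A small sketch of that selection, assuming alignment 1 means direct I/O is unavailable:

#include <algorithm>
#include <cstdio>
#include <vector>

// pick the staging buffer size from the largest per-file read alignment (sketch)
static size_t pick_buffer_size(const std::vector<size_t> & file_alignments) {
    size_t alignment = 1;
    for (size_t a : file_alignments) {
        alignment = std::max(alignment, a);
    }
    // direct I/O: 64 MiB payload plus room for head/tail alignment padding
    // otherwise: a plain 1 MiB pinned buffer is enough
    return alignment != 1 ? 64u * 1024 * 1024 + 2 * alignment : 1u * 1024 * 1024;
}

int main() {
    std::printf("no direct I/O : %zu bytes\n", pick_buffer_size({1, 1}));
    std::printf("4 KiB aligned : %zu bytes\n", pick_buffer_size({4096, 512}));
    return 0;
}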
@@ -1026,7 +985,6 @@ bool llama_model_loader::load_all_data(
// If the backend is supported, create pinned memory buffers and events for synchronisation.
for (size_t idx = 0; idx < n_buffers; ++idx) {
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
if (!buf) {
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
ggml_backend_dev_name(dev));
@@ -1108,7 +1066,6 @@ bool llama_model_loader::load_all_data(
}
} else {
const auto & file = files.at(weight->idx);
if (ggml_backend_buffer_is_host(cur->buffer)) {
file->seek(weight->offs, SEEK_SET);
file->read_raw(cur->data, n_size);
@@ -1120,54 +1077,19 @@ bool llama_model_loader::load_all_data(
} else {
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
if (upload_backend) {
size_t offset = weight->offs;
alignment = file->read_alignment();
size_t aligned_offset = offset & ~(alignment - 1);
size_t offset_from_alignment = offset - aligned_offset;
file->seek(aligned_offset, SEEK_SET);
// Calculate aligned read boundaries
size_t read_start = aligned_offset;
size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
file->seek(weight->offs, SEEK_SET);
size_t bytes_read = 0;
size_t data_read = 0; // Actual tensor data copied (excluding padding)
while (bytes_read < read_end - read_start) {
size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
while (bytes_read < n_size) {
size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
// Align the destination pointer within the pinned buffer
uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
// Wait for previous upload to complete before reusing buffer
ggml_backend_event_synchronize(events[buffer_idx]);
// Read aligned chunk from file
file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
// Calculate actual data portion (excluding alignment padding)
uintptr_t ptr_data = ptr_dest_aligned;
size_t data_to_copy = read_size;
// Skip alignment padding at start of first chunk
if (bytes_read == 0) {
ptr_data += offset_from_alignment;
data_to_copy -= offset_from_alignment;
}
// Trim alignment padding at end of last chunk
if (aligned_offset + bytes_read + read_size > offset + n_size) {
data_to_copy -= (read_end - (offset + n_size));
}
// Async upload actual data to GPU
ggml_backend_tensor_set_async(upload_backend, cur,
reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
file->read_raw(host_ptrs[buffer_idx], read_iteration);
ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
ggml_backend_event_record(events[buffer_idx], upload_backend);
data_read += data_to_copy;
bytes_read += read_size;
bytes_read += read_iteration;
++buffer_idx;
buffer_idx %= n_buffers;
}
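The chunked upload loop above does three things worth spelling out: it rounds the tensor's file offset down to the alignment boundary, rounds the end of the read up, and then skips the head padding of the first chunk and trims the tail padding of the last one before handing the data to ggml_backend_tensor_set_async. A self-contained sketch of just the alignment arithmetic (the offset, size and alignment values are made up, and no real file is read):

#include <cstdio>

int main() {
    const size_t alignment = 4096;    // typical O_DIRECT block size (assumption)
    const size_t offset    = 10'000;  // tensor offset in the file
    const size_t n_size    = 20'000;  // tensor size in bytes

    // round the start down and the end up to the alignment boundary
    const size_t aligned_offset        = offset & ~(alignment - 1);
    const size_t offset_from_alignment = offset - aligned_offset;
    const size_t read_end              = (offset + n_size + alignment - 1) & ~(alignment - 1);

    std::printf("read window : [%zu, %zu) (%zu bytes)\n", aligned_offset, read_end, read_end - aligned_offset);
    std::printf("skip at head: %zu bytes\n", offset_from_alignment);
    std::printf("trim at tail: %zu bytes\n", read_end - (offset + n_size));
    return 0;
}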

View File

@@ -70,7 +70,6 @@ struct llama_model_loader {
size_t n_bytes = 0;
bool use_mmap = false;
bool use_direct_io = false;
bool check_tensors;
bool no_alloc;
@@ -98,7 +97,6 @@ struct llama_model_loader {
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
@@ -133,8 +131,6 @@ struct llama_model_loader {
template<typename T>
bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);
bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
std::string get_arch_name() const;
enum llm_arch get_arch() const;

View File

@@ -146,9 +146,6 @@ void llama_model_saver::add_kv_from_model() {
add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
if (hparams.n_embd_out > 0) {
add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out);
}
add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);

View File

@@ -31,14 +31,12 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_17M: return "17M";
case LLM_TYPE_22M: return "22M";
case LLM_TYPE_33M: return "33M";
case LLM_TYPE_47M: return "47M";
case LLM_TYPE_60M: return "60M";
case LLM_TYPE_70M: return "70M";
case LLM_TYPE_80M: return "80M";
case LLM_TYPE_109M: return "109M";
case LLM_TYPE_137M: return "137M";
case LLM_TYPE_140M: return "140M";
case LLM_TYPE_149M: return "149M";
case LLM_TYPE_160M: return "160M";
case LLM_TYPE_190M: return "190M";
case LLM_TYPE_220M: return "220M";
@@ -48,7 +46,6 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_335M: return "335M";
case LLM_TYPE_350M: return "350M";
case LLM_TYPE_360M: return "360M";
case LLM_TYPE_395M: return "395M";
case LLM_TYPE_410M: return "410M";
case LLM_TYPE_450M: return "450M";
case LLM_TYPE_475M: return "475M";
@@ -126,12 +123,10 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
case LLM_TYPE_80B_A3B: return "80B.A3B";
case LLM_TYPE_100B_A6B: return "100B.A6B";
case LLM_TYPE_102B_A12B: return "102B.A12B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
case LLM_TYPE_230B_A10B: return "230B.A10B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
case LLM_TYPE_300B_A47B: return "300B.A47B";
case LLM_TYPE_310B_A15B: return "310B.A15B";
case LLM_TYPE_355B_A32B: return "355B.A32B";
case LLM_TYPE_E2B: return "E2B";
case LLM_TYPE_E4B: return "E4B";
@@ -507,7 +502,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -579,7 +573,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
// TODO: Handle SWA metadata similarly when models start implementing it
// rope_freq_scale (inverse of the kv) is optional
float ropescale = 0.0f;
if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
@@ -588,6 +581,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
// by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
// non-transformer models do not have attention heads
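As the comment says, the GGUF key stores the scaling factor while hparams keeps its inverse, with 0 (or a missing key) meaning no scaling; the SWA fields then default to the non-SWA values unless a dedicated SWA key overrides them. A worked sketch of that arithmetic with illustrative values:

#include <cstdio>

int main() {
    // value read from the (optional) rope scaling factor key; 0.0f means "not present"
    float ropescale = 8.0f;

    const float rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f / ropescale;

    // sliding-window layers default to the same frequency settings...
    const float rope_freq_base_train = 500000.0f;
    float rope_freq_base_train_swa   = rope_freq_base_train;
    float rope_freq_scale_train_swa  = rope_freq_scale_train;

    // ...unless the model supplies a dedicated SWA base (hypothetical override):
    // rope_freq_base_train_swa = 10000.0f;

    std::printf("freq_scale_train = %g (from factor %g)\n", rope_freq_scale_train, ropescale);
    std::printf("freq_base_swa    = %.1f, freq_scale_swa = %g\n", rope_freq_base_train_swa, rope_freq_scale_train_swa);
    return 0;
}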
@@ -606,7 +603,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -630,7 +627,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// arch-specific KVs
switch (arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA_EMBED:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -675,10 +671,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.f_attn_temp_scale = 0.1f;
hparams.f_attn_temp_offset = 1.0f;
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
}
switch (hparams.n_expert) {
@@ -724,10 +716,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
if (hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
@@ -887,34 +875,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MODERN_BERT:
{
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
uint32_t swa_period = 3;
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case 12:
type = LLM_TYPE_47M; break; // granite-embedding-small
case 22:
type = LLM_TYPE_149M; break; // modern-bert-base
case 28:
type = LLM_TYPE_395M; break; // modern-bert-large
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_JINA_BERT_V2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1116,14 +1076,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MAINCODER:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 32: type = LLM_TYPE_1B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_QWEN3VL:
{
ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
@@ -1242,25 +1194,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
} break;
case LLM_ARCH_PLAMO3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
uint32_t swa_period = 8;
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
switch (hparams.n_layer) {
case 24: type = LLM_TYPE_2B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_GPT2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1314,10 +1247,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.n_swa = 4096; // default value of gemma 2
hparams.set_swa_pattern(2);
hparams.attn_soft_cap = true;
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
@@ -1342,7 +1272,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(6);
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
@@ -1372,9 +1303,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.set_swa_pattern(5);
hparams.n_layer_kv_from_start = 20;
hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;
hparams.f_attention_scale = 1.0f;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1390,8 +1322,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.set_swa_pattern(6);
hparams.causal_attn = false; // embeddings do not use causal attention
hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
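Several of the hunks above set a sliding-window pattern period plus a separate SWA rope base. Assuming set_swa_pattern(p) follows the "3 chunked - 1 full" comment earlier — every p-th layer keeps full attention and the rest use the sliding window — the layer classification can be sketched as:

#include <cstdint>
#include <cstdio>

// assumption: with pattern period p, the first p-1 layers of every group use SWA
// and the p-th layer uses full attention
static bool layer_is_swa(uint32_t il, uint32_t period) {
    if (period == 0) {
        return false; // LLAMA_SWA_TYPE_NONE
    }
    return il % period < period - 1;
}

int main() {
    const uint32_t period = 6; // e.g. five SWA layers per full-attention layer
    for (uint32_t il = 0; il < 12; ++il) {
        std::printf("layer %2u: %s\n", il, layer_is_swa(il, period) ? "sliding window" : "full attention");
    }
    return 0;
}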
@@ -1530,10 +1463,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1572,10 +1502,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
if (found_swa && hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
@@ -1703,7 +1629,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
@@ -1799,7 +1725,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
switch (hparams.n_layer) {
case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
default: type = LLM_TYPE_UNKNOWN;
}
@@ -1918,10 +1843,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096;
hparams.set_swa_pattern(4);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
}
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -2239,10 +2160,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(2);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
switch (hparams.n_layer) {
case 24: type = LLM_TYPE_20B; break;
case 36: type = LLM_TYPE_120B; break;
@@ -2287,10 +2204,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096;
hparams.set_swa_pattern(4, true);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
hparams.n_no_rope_layer_step = hparams.n_layer;
@@ -2409,22 +2322,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MIMO2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
switch (hparams.n_layer) {
case 48: type = LLM_TYPE_310B_A15B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
default: throw std::runtime_error("unsupported model architecture");
}
@@ -2447,16 +2344,15 @@ void llama_model::load_vocab(llama_model_loader & ml) {
bool llama_model::load_tensors(llama_model_loader & ml) {
const auto & split_mode = params.split_mode;
const auto & n_gpu_layers = params.n_gpu_layers;
const auto & use_mlock = params.use_mlock;
const auto & tensor_split = params.tensor_split;
const int n_layer = hparams.n_layer;
const int n_gpu_layers = this->n_gpu_layers();
const int n_layer = hparams.n_layer;
const bool use_mmap_buffer = true;
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
__func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
// build a list of buffer types for the CPU and GPU devices
pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
@@ -2467,11 +2363,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
}
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
// calculate the split points
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
std::vector<float> splits(n_devices());
@@ -2482,13 +2373,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
size_t total;
size_t free;
ggml_backend_dev_memory(dev, &free, &total);
// devices can return 0 bytes for free and total memory if they do not
// have any to report. in this case, we will use the host memory as a fallback
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
if (free == 0 && total == 0) {
ggml_backend_dev_memory(cpu_dev, &free, &total);
}
splits[i] = free;
}
} else {
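The removed fallback above re-queries the host's memory whenever a device reports nothing, and the free amounts are then normalized into per-device split fractions. The real check tests free == 0 && total == 0; the sketch below collapses that to a single value and uses made-up MiB figures:

#include <cstdio>
#include <vector>

int main() {
    // free memory per device in MiB; a device that reports nothing falls back
    // to the host's free memory (assumption: 32768 MiB of host RAM)
    const float host_free_mib = 32768.0f;
    std::vector<float> dev_free_mib = {24576.0f, 0.0f /* device reported nothing */};

    std::vector<float> splits(dev_free_mib.size());
    float split_sum = 0.0f;
    for (size_t i = 0; i < dev_free_mib.size(); ++i) {
        splits[i] = dev_free_mib[i] > 0.0f ? dev_free_mib[i] : host_free_mib;
        split_sum += splits[i];
    }
    for (size_t i = 0; i < splits.size(); ++i) {
        splits[i] /= split_sum;
        std::printf("device %zu gets %.3f of the layer budget\n", i, splits[i]);
    }
    return 0;
}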
@@ -2505,10 +2389,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
splits[i] /= split_sum;
}
const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
return {cpu_dev, &pimpl->cpu_buft_list};
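The two forms of i_gpu_start/act_gpu_layers above differ only in whether the output layer counts as an extra offloadable "layer". A small sketch of the n_layer + 1 form, showing which repeating layers land on the CPU versus a GPU for a hypothetical 32-layer model:

#include <algorithm>
#include <cstdio>

int main() {
    const int n_layer      = 32;
    const int n_gpu_layers = 10; // hypothetical request
    const int n_devices    = 1;

    // count the output layer as one extra offloadable layer
    const int i_gpu_start    = std::max(n_layer + 1 - n_gpu_layers, 0);
    const int act_gpu_layers = n_devices == 0 ? 0 : std::min(n_gpu_layers, n_layer + 1);

    for (int il = 0; il < n_layer; ++il) {
        const bool on_gpu = il >= i_gpu_start && (il - i_gpu_start) < act_gpu_layers;
        if (il == 0 || il == i_gpu_start - 1 || il == i_gpu_start || il == n_layer - 1) {
            std::printf("layer %2d -> %s\n", il, on_gpu ? "GPU" : "CPU");
        }
    }
    return 0;
}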
@@ -2748,7 +2636,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_MISTRAL3:
case LLM_ARCH_LLAMA_EMBED:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3283,37 +3170,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
}
} break;
case LLM_ARCH_MODERN_BERT:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
if (i != 0) {
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
} else {
// layer 0 uses identity
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
}
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
}
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
} break;
case LLM_ARCH_NEO_BERT:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3378,14 +3234,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
layer.ffn_up = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
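The replaced lines above choose the FFN up-projection width either by inspecting the stored tensor's second dimension or, in the other variant, by whether a separate gate tensor exists (a fused gate+up projection doubles the width). A tiny sketch of that decision with an illustrative n_ff:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_ff = 11008;

    // infer the up-projection width from whether a separate gate tensor exists
    for (bool has_separate_gate : {true, false}) {
        const int64_t n_ffn_up = has_separate_gate ? n_ff : n_ff * 2; // fused gate+up doubles the width
        std::printf("separate gate: %s -> ffn_up width %lld\n",
                    has_separate_gate ? "yes" : "no", (long long) n_ffn_up);
    }
    return 0;
}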
@@ -3913,44 +3762,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
}
} break;
case LLM_ARCH_PLAMO3:
{
const int64_t head_dim_q = hparams.n_embd_head_k;
const int64_t head_dim_v = hparams.n_embd_head_v;
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
const int64_t num_attention_heads = hparams.n_head(i);
const int64_t num_key_value_heads = hparams.n_head_kv(i);
const int64_t q_proj_dim = num_attention_heads * head_dim_q;
const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
const int64_t n_ff_cur = hparams.n_ff(i);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
{n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur * 2}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
}
} break;
case LLM_ARCH_GPT2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4841,11 +4652,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
// try to load output.weight, if not found, use token_embd (tied embeddings)
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (!output) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
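The removed lines above are the usual tied-embeddings fallback: output.weight is requested as optional and, when it is absent, token_embd.weight is duplicated to serve as the output projection. A sketch of that fallback with hypothetical stand-in tensors (not the real create_tensor API):

#include <cstdio>

struct fake_tensor { const char * name; };

// stand-in for create_tensor(..., TENSOR_NOT_REQUIRED): may return nullptr
static fake_tensor * find_tensor(const char * name, bool present) {
    static fake_tensor out { "output.weight" };
    static fake_tensor tok { "token_embd.weight" };
    if (!present) return nullptr;
    return name[0] == 'o' ? &out : &tok;
}

int main() {
    for (bool has_output : {true, false}) {
        fake_tensor * output = find_tensor("output.weight", has_output);
        if (!output) {
            // tied embeddings: reuse the token embedding matrix as the output projection
            output = find_tensor("token_embd.weight", true);
        }
        std::printf("model %s output.weight -> using '%s'\n",
                    has_output ? "with" : "without", output->name);
    }
    return 0;
}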
@@ -4908,11 +4715,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
// try to load output.weight, if not found, use token_embd (tied embeddings)
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (!output) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
@@ -5279,9 +5082,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
@@ -5393,6 +5196,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_group = hparams.ssm_n_group;
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
const int64_t n_ff_shexp = hparams.n_ff_shexp;
// embeddings
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5444,9 +5250,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
} else {
if (n_expert != 0) {
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
const int64_t n_ff_shexp = hparams.n_ff_shexp;
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
@@ -6476,8 +6279,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -6522,9 +6325,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
}
}
// for LFM2-ColBert-350M
dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
} break;
case LLM_ARCH_SMALLTHINKER:
{
@@ -6827,75 +6627,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
}
} break;
case LLM_ARCH_MIMO2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
uint32_t n_head = hparams.n_head(i);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
// non-MoE branch
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
// MoE branch
int64_t n_ff_exp = hparams.n_ff_exp;
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
}
} break;
case LLM_ARCH_MAINCODER:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
default:
throw std::runtime_error("unknown architecture");
}
@@ -7005,12 +6736,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (llama_supports_gpu_offload()) {
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
int n_repeating = n_gpu;
if (n_repeating > 0) {
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
if (n_gpu_layers > (int) hparams.n_layer) {
LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
n_repeating--;
}
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
const int max_backend_supported_layers = hparams.n_layer + 1;
const int max_offloadable_layers = hparams.n_layer + 1;
@@ -7077,14 +6806,6 @@ size_t llama_model::n_devices() const {
return devices.size();
}
uint32_t llama_model::n_gpu_layers() const {
return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
}
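The removed accessor resolves the configured layer count: a non-negative request is taken literally, while a negative one (the -1 default further down) means every repeating layer plus the output layer. A short sketch of that resolution:

#include <cstdint>
#include <cstdio>

// a negative request means "offload everything": all repeating layers plus the output layer
static uint32_t resolve_n_gpu_layers(int32_t requested, uint32_t n_layer) {
    return requested >= 0 ? (uint32_t) requested : n_layer + 1;
}

int main() {
    const uint32_t n_layer = 32;
    std::printf("requested -1 -> %u layers offloaded\n", resolve_n_gpu_layers(-1, n_layer));
    std::printf("requested 10 -> %u layers offloaded\n", resolve_n_gpu_layers(10, n_layer));
    return 0;
}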
llama_split_mode llama_model::split_mode() const {
return params.split_mode;
}
std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
@@ -7177,10 +6898,6 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
}
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
@@ -7413,7 +7130,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_GEMMA_EMBEDDING:
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
@@ -7531,24 +7247,16 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
switch (arch) {
case LLM_ARCH_LLAMA:
{
llm = std::make_unique<llm_build_llama<false>>(*this, params);
llm = std::make_unique<llm_build_llama>(*this, params);
} break;
case LLM_ARCH_LLAMA4:
{
if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
llm = std::make_unique<llm_build_llama<false>>(*this, params);
llm = std::make_unique<llm_build_llama>(*this, params);
} else {
llm = std::make_unique<llm_build_llama_iswa>(*this, params);
}
} break;
case LLM_ARCH_LLAMA_EMBED:
{
llm = std::make_unique<llm_build_llama<true>>(*this, params);
} break;
case LLM_ARCH_MAINCODER:
{
llm = std::make_unique<llm_build_maincoder>(*this, params);
} break;
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params);
@@ -7581,10 +7289,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_bert>(*this, params);
} break;
case LLM_ARCH_MODERN_BERT:
{
llm = std::make_unique<llm_build_modern_bert>(*this, params);
} break;
case LLM_ARCH_NEO_BERT:
{
llm = std::make_unique<llm_build_neo_bert>(*this, params);
@@ -7674,14 +7378,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_plamo2>(*this, params);
} break;
case LLM_ARCH_PLAMO3:
{
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
} else {
llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
}
} break;
case LLM_ARCH_GPT2:
{
llm = std::make_unique<llm_build_gpt2>(*this, params);
@@ -7986,10 +7682,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_mistral3>(*this, params);
} break;
case LLM_ARCH_MIMO2:
{
llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
} break;
default:
GGML_ABORT("fatal error");
}
@@ -7997,17 +7689,12 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
// add on pooling layer
llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
// add backend sampling layers (if any)
llm->build_sampling();
// if the gguf model was converted with --sentence-transformers-dense-modules
// there will be two additional dense projection layers
// dense linear projections are applied after pooling
// TODO: move reranking logic here and generalize
llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
llm->res->set_outputs();
return llm->res->get_gf();
}
@@ -8020,7 +7707,7 @@ llama_model_params llama_model_default_params() {
llama_model_params result = {
/*.devices =*/ nullptr,
/*.tensor_buft_overrides =*/ nullptr,
/*.n_gpu_layers =*/ -1,
/*.n_gpu_layers =*/ 999,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
@@ -8029,7 +7716,6 @@ llama_model_params llama_model_default_params() {
/*.kv_overrides =*/ nullptr,
/*.vocab_only =*/ false,
/*.use_mmap =*/ true,
/*.use_direct_io =*/ true,
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,
@@ -8064,10 +7750,6 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
return model->hparams.n_embd_inp();
}
int32_t llama_model_n_embd_out(const llama_model * model) {
return model->hparams.get_n_embd_out();
}
int32_t llama_model_n_layer(const llama_model * model) {
return model->hparams.n_layer;
}
@@ -8171,8 +7853,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_ERNIE4_5_MOE:
case LLM_ARCH_MISTRAL3:
case LLM_ARCH_LLAMA_EMBED:
case LLM_ARCH_MAINCODER:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
@@ -8182,7 +7862,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_DBRX:
case LLM_ARCH_BERT:
case LLM_ARCH_JINA_BERT_V3:
case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_STABLELM:
@@ -8202,7 +7881,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_PHIMOE:
case LLM_ARCH_PLAMO:
case LLM_ARCH_PLAMO2:
case LLM_ARCH_PLAMO3:
case LLM_ARCH_GEMMA:
case LLM_ARCH_GEMMA2:
case LLM_ARCH_GEMMA3:
@@ -8233,7 +7911,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_PANGU_EMBED:
case LLM_ARCH_AFMOE:
case LLM_ARCH_QWEN3NEXT:
case LLM_ARCH_MIMO2:
return LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_QWEN2VL:

View File

@@ -24,14 +24,12 @@ enum llm_type {
LLM_TYPE_17M,
LLM_TYPE_22M,
LLM_TYPE_33M,
LLM_TYPE_47M,
LLM_TYPE_60M,
LLM_TYPE_70M,
LLM_TYPE_80M,
LLM_TYPE_109M,
LLM_TYPE_137M,
LLM_TYPE_140M,
LLM_TYPE_149M,
LLM_TYPE_160M,
LLM_TYPE_190M,
LLM_TYPE_220M,
@@ -41,7 +39,6 @@ enum llm_type {
LLM_TYPE_335M,
LLM_TYPE_350M,
LLM_TYPE_360M,
LLM_TYPE_395M,
LLM_TYPE_410M,
LLM_TYPE_450M,
LLM_TYPE_475M,
@@ -120,12 +117,10 @@ enum llm_type {
LLM_TYPE_31B_A3_5B,
LLM_TYPE_80B_A3B, // Qwen3 Next
LLM_TYPE_100B_A6B,
LLM_TYPE_102B_A12B, // Solar-Open
LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_230B_A10B, // Minimax M2
LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big
LLM_TYPE_310B_A15B, // MiMo-V2-Flash
LLM_TYPE_355B_A32B, // GLM-4.5
LLM_TYPE_E2B,
LLM_TYPE_E4B,
@@ -470,6 +465,8 @@ struct llama_model {
struct ggml_tensor * dense_2_out_layers = nullptr;
struct ggml_tensor * dense_3_out_layers = nullptr;
llama_model_params params;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
@@ -479,9 +476,6 @@ struct llama_model {
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
// for keeping track of extra nodes used by lora adapters
uint32_t n_lora_nodes = 0;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
@@ -503,9 +497,6 @@ struct llama_model {
size_t n_tensors() const;
size_t n_devices() const;
uint32_t n_gpu_layers() const;
llama_split_mode split_mode() const;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
// total number of parameters in the model
@@ -534,8 +525,6 @@ struct llama_model {
ggml_cgraph * build_graph(const llm_graph_params & params) const;
private:
llama_model_params params;
struct impl;
std::unique_ptr<impl> pimpl;
};

View File

@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
std::vector<std::string> splits = {};
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params());

View File

File diff suppressed because it is too large

View File

@@ -14,19 +14,7 @@ struct llama_grammar;
struct llama_sampler_chain {
llama_sampler_chain_params params;
// has .backend_init() been called?
bool is_init = false;
struct info {
bool is_backend;
llama_sampler * ptr;
};
std::vector<info> samplers;
// pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
std::vector<llama_token_data> cur;
std::vector<struct llama_sampler *> samplers;
// timing
@@ -36,9 +24,9 @@ struct llama_sampler_chain {
};
struct llama_sampler * llama_sampler_init_dry_testing(
int32_t context_size,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const std::vector<std::vector<llama_token>> & seq_breakers);
int32_t context_size,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const std::vector<std::vector<llama_token>>& seq_breakers);

View File

@@ -314,12 +314,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_YOUTU:
regex_exprs = {
"[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+",
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
regex_exprs = {
"[\r\n]",
@@ -361,7 +355,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
regex_exprs = {
// original regex from tokenizer.json
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -1856,11 +1849,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "deepseek-v3") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
clean_spaces = false;
} else if (
tokenizer_pre == "youtu") {
pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
clean_spaces = false;
ignore_merges = true;
} else if (
tokenizer_pre == "falcon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -1879,8 +1867,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" ||
tokenizer_pre == "a.x-4.0" ||
tokenizer_pre == "mellum" ||
tokenizer_pre == "modern-bert" ) {
tokenizer_pre == "mellum") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "jina-v1-en" ||
@@ -2016,10 +2003,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "minimax-m2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false;
} else if (
tokenizer_pre == "solar-open") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
clean_spaces = false;
} else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -2193,8 +2176,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// for now, we apply this workaround to find the tokens based on their text
for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
if (special_eot_id == LLAMA_TOKEN_NULL) {
if (false
@@ -2210,10 +2191,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<end_of_utterance>" // smoldocling
) {
special_eot_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
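The recurring change in these vocab hunks swaps a plain assignment of the attribute for an OR of the control bit, so flags the token already carries (LSTRIP, RSTRIP, ...) survive the fix-up. A tiny illustration with stand-in flag values:

#include <cstdio>

enum token_attr : int {
    ATTR_NONE    = 0,
    ATTR_CONTROL = 1 << 0,
    ATTR_LSTRIP  = 1 << 1,
};

int main() {
    int attr = ATTR_LSTRIP;                 // token already carries an LSTRIP flag

    int overwritten = ATTR_CONTROL;         // '=' drops LSTRIP
    int ored        = attr | ATTR_CONTROL;  // '|=' keeps LSTRIP

    std::printf("overwrite: control=%d lstrip=%d\n", !!(overwritten & ATTR_CONTROL), !!(overwritten & ATTR_LSTRIP));
    std::printf("or-in    : control=%d lstrip=%d\n", !!(ored & ATTR_CONTROL), !!(ored & ATTR_LSTRIP));
    return 0;
}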
@@ -2224,10 +2205,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|eom_id|>"
) {
special_eom_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
@@ -2244,10 +2225,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_prefix|>" // GLM-4.5
) {
special_fim_pre_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
@@ -2264,10 +2245,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_suffix|>" // GLM-4.5
) {
special_fim_suf_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
@@ -2284,10 +2265,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_middle|>" // GLM-4.5
) {
special_fim_mid_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
@@ -2301,10 +2282,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<PAD>"
) {
special_fim_pad_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
@@ -2319,10 +2300,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<reponame>" // Granite
) {
special_fim_rep_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
@@ -2333,41 +2314,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|file_sep|>" // Qwen
) {
special_fim_sep_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
}
// auto-detect unused tokens: e.g. control tokens with the word "unused"
// ideally, these tokens should be marked as unused during conversion
{
uint32_t n_unused = 0;
for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
continue;
}
if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
if (strstr(t.first.c_str(), "unused") != NULL) {
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
}
}
if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
n_unused++;
}
}
LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
}
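The block above walks the vocabulary, marks control tokens whose text contains "unused" with the UNUSED attribute, and logs how many it found. A compact sketch of the same pass over a toy vocabulary:

#include <cstdio>
#include <cstring>
#include <map>
#include <string>

enum token_attr : int {
    ATTR_CONTROL = 1 << 0,
    ATTR_UNUSED  = 1 << 1,
};

int main() {
    std::map<std::string, int> vocab = {
        {"<s>",        ATTR_CONTROL},
        {"<unused_7>", ATTR_CONTROL},
        {"hello",      0},
    };

    int n_unused = 0;
    for (auto & [text, attr] : vocab) {
        if ((attr & ATTR_CONTROL) == 0) {
            continue; // only control tokens are considered
        }
        if ((attr & ATTR_UNUSED) == 0 && std::strstr(text.c_str(), "unused") != nullptr) {
            attr |= ATTR_UNUSED; // ideally already set during conversion
        }
        if (attr & ATTR_UNUSED) {
            n_unused++;
        }
    }
    std::printf("%d unused token(s)\n", n_unused);
    return 0;
}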
// maintain a list of tokens that cause end-of-generation
// this is currently determined based on the token text, which is obviously not ideal
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
@@ -2386,16 +2341,12 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
}
for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
if (false
|| t.first == "<|eot_id|>"
|| t.first == "<|im_end|>"
|| t.first == "<|end|>"
|| t.first == "<|return|>" // o200k_harmony
|| t.first == "<|call|>" // o200k_harmony
|| t.first == "<|flush|>" // solar-open
|| t.first == "<|calls|>" // solar-open
|| t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>"
|| t.first == "<|eom_id|>"
@@ -2405,28 +2356,24 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<end_of_utterance>" // smoldocling
) {
special_eog_ids.insert(t.second);
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
} else {
if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
// token is control, but not marked as EOG -> print a debug log
if (special_eog_ids.count(t.second) == 0) {
LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
__func__, t.second, t.first.c_str());
}
// token is control, but not marked as EOG -> print a debug log
if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
__func__, t.second, t.first.c_str());
}
}
}
// @ngxson : quick hack for gpt-oss, always render these tokens
for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
}
}
@@ -2446,42 +2393,34 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
// TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
// we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
// TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
// we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
// we remove the "<|end|>" token from the EOG list
{
bool has_return = false;
bool has_call = false;
bool has_end = false;
bool has_flush = false;
llama_token end_id = LLAMA_TOKEN_NULL;
LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
for (auto tid : special_eog_ids) {
auto & text = id_to_token[tid].text;
LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str());
if (text == "<|return|>") {
if (id_to_token[tid].text == "<|return|>") {
has_return = true;
} else if (text == "<|call|>" || text == "<|calls|>") {
} else if (id_to_token[tid].text == "<|call|>") {
has_call = true;
} else if (text == "<|flush|>") {
has_flush = true;
} else if (text == "<|end|>") {
} else if (id_to_token[tid].text == "<|end|>") {
has_end = true;
end_id = tid;
}
}
if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
if (has_return && has_call && has_end) {
special_eog_ids.erase(end_id);
auto & attr = id_to_token[end_id].attr;
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
}
}
}
@@ -2579,13 +2518,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
}
} else if (_contains_any(model_name, {"modern-bert"})) {
if (token_to_id.count("[MASK]") == 0 ) {
LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
}
else {
_set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
}
}
}
}

View File

@@ -51,8 +51,6 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
};
struct LLM_KV;

View File

@@ -71,9 +71,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
}, &ud);
llama_model_params mparams_copy = *mparams;
mparams_copy.no_alloc = true;
mparams_copy.use_mmap = false;
mparams_copy.use_mlock = false;
mparams_copy.no_alloc = true;
mparams_copy.use_mmap = false;
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
if (model == nullptr) {
@@ -111,20 +110,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
}
}
for (size_t i = 0; i < ret.size(); i++) {
size_t free;
size_t total;
size_t free, total;
ggml_backend_dev_memory(model->devices[i], &free, &total);
// devices can return 0 bytes for free and total memory if they do not
// have any to report. in this case, we will use the host memory as a fallback
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
if (free == 0 && total == 0) {
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
ggml_backend_dev_memory(cpu_dev, &free, &total);
}
ret[i].free = free;
ret[i].total = total;
}
@@ -152,15 +139,12 @@ enum layer_fraction_t {
};
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
class llama_params_fit_exception : public std::runtime_error {
using std::runtime_error::runtime_error;
};
static void llama_params_fit_impl(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
constexpr int64_t MiB = 1024*1024;
const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
typedef std::vector<llama_device_memory_data> dmds_t;
const llama_model_params default_mparams = llama_model_default_params();
@@ -179,12 +163,6 @@ static void llama_params_fit_impl(
return;
}
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
margins.reserve(nd);
for (size_t id = 0; id < nd; id++) {
margins.push_back(margins_s[id]);
}
std::vector<std::string> dev_names;
{
dev_names.reserve(nd);
@@ -202,12 +180,11 @@ static void llama_params_fit_impl(
}
}
int64_t sum_free = 0;
int64_t sum_projected_free = 0;
int64_t sum_projected_used = 0;
int64_t sum_projected_model = 0;
std::vector<int64_t> projected_free_per_device;
projected_free_per_device.reserve(nd);
int64_t sum_total = 0;
int64_t sum_projected_free = 0;
int64_t min_projected_free = INT64_MAX;
int64_t sum_projected_used = 0;
int64_t sum_projected_ctx = 0;
if (nd > 1) {
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -217,98 +194,59 @@ static void llama_params_fit_impl(
const int64_t projected_used = dmd.mb.total();
const int64_t projected_free = dmd.free - projected_used;
projected_free_per_device.push_back(projected_free);
sum_free += dmd.free;
sum_projected_used += projected_used;
sum_projected_free += projected_free;
sum_projected_model += dmd.mb.model;
sum_total += dmd.total;
sum_projected_used += projected_used;
sum_projected_free += projected_free;
min_projected_free = std::min(min_projected_free, projected_free);
sum_projected_ctx += dmd.mb.context;
if (nd > 1) {
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
projected_free >= 0 ? "surplus" : "deficit");
}
}
assert(sum_free >= 0 && sum_projected_used >= 0);
assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
assert(sum_projected_used >= sum_projected_ctx);
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB);
if (nd == 1) {
if (projected_free_per_device[0] >= margins[0]) {
__func__, sum_projected_used/MiB, sum_total/MiB);
if (min_projected_free >= margin) {
if (nd == 1) {
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
return;
}
} else {
bool changes_needed = false;
for (size_t id = 0; id < nd; id++) {
if (projected_free_per_device[id] < margins[id]) {
changes_needed = true;
break;
}
}
if (!changes_needed) {
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
__func__, min_projected_free/MiB, margin/MiB);
return;
}
LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
__func__, min_projected_free/MiB, margin/MiB);
return;
}
// step 2: try reducing memory use by reducing the context size
{
int64_t global_surplus = sum_projected_free;
for (size_t id = 0; id < nd; id++) {
global_surplus -= margins[id];
}
int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
if (global_surplus < 0) {
if (nd == 1) {
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
__func__, margins[0]/MiB, -global_surplus/MiB);
} else {
LLAMA_LOG_INFO(
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
__func__, -global_surplus/MiB);
}
LLAMA_LOG_INFO(nd == 1 ?
"%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
"%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
__func__, margin/MiB, -global_surplus/MiB);
if (cparams->n_ctx == 0) {
if (hp_nct > n_ctx_min) {
int64_t sum_used_target = sum_free;
for (size_t id = 0; id < nd; id++) {
sum_used_target -= margins[id];
}
if (nd > 1) {
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
// - for dense models only whole layers can be assigned to devices
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
// - on average we expect a waste of 0.5 layers/tensors per device
// - use slightly more than the expected average for nd devices to be safe
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
}
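
Worked example for the reservation above (all numbers hypothetical): with nd = 2 devices and model_per_layer = 600 MiB, a dense model (hp_nex == 0) reserves (2 + 1) * 600 / 2 = 900 MiB, i.e. 1.5 layers' worth, slightly more than the expected average waste of 0.5 layers per device. For an MoE model the divisor becomes 6 because an individual tensor is estimated at no more than a third of a layer, so the reservation drops to (2 + 1) * 600 / 6 = 300 MiB.
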
int64_t sum_projected_used_min_ctx = 0;
cparams->n_ctx = n_ctx_min;
const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (const auto & dmd : dmds_min_ctx) {
sum_projected_used_min_ctx += dmd.mb.total();
}
if (sum_used_target > sum_projected_used_min_ctx) {
// linear interpolation between minimum and maximum context size:
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
/ (sum_projected_used - sum_projected_used_min_ctx);
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
const uint32_t ctx_reduction = std::min(
uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
cparams->n_ctx = hp_nct - ctx_reduction;
const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
global_surplus += memory_reduction;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (global_surplus >= 0) {
if (nd == 1) {
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
return;
}
LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
} else {
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
}
} else {
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
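
Worked example for the ceiling-division variant above (hypothetical numbers): with hp_nct = 65536 and sum_projected_ctx = 8192 MiB, bytes_per_ctx = 8192 MiB / 65536 = 128 KiB per context token. A deficit of -global_surplus = 2048 MiB then gives ctx_reduction = ceil(2048 MiB / 128 KiB) = 16384, clamped to hp_nct - n_ctx_min, so n_ctx becomes 65536 - 16384 = 49152 and memory_reduction = 16384 * 128 KiB = 2048 MiB.
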
@@ -321,28 +259,32 @@ static void llama_params_fit_impl(
}
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
}
if (nd > 1) {
if (!tensor_split) {
throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
}
if (mparams->tensor_split) {
for (size_t id = 0; id < nd; id++) {
if (mparams->tensor_split[id] != 0.0f) {
throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
throw std::runtime_error("model_params::tensor_split already set by user, abort");
}
}
}
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
}
if (hp_ngl < 2*nd) {
throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
+ std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
}
}
if (!tensor_buft_overrides) {
throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
}
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
}
// step 3: iteratively fill the back to front with "dense" layers
@@ -395,11 +337,6 @@ static void llama_params_fit_impl(
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
uint32_t n_full() const {
assert(n_layer >= n_part);
return n_layer - n_part;
}
};
const size_t ntbo = llama_max_tensor_buft_overrides();
@@ -408,7 +345,8 @@ static void llama_params_fit_impl(
auto set_ngl_tensor_split_tbo = [&](
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
llama_model_params & mparams) {
llama_model_params & mparams,
const bool add_nonrepeating) {
mparams.n_gpu_layers = 0;
for (size_t id = 0; id < nd; id++) {
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
@@ -416,25 +354,29 @@ static void llama_params_fit_impl(
tensor_split[id] = ngl_per_device[id].n_layer;
}
}
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
if (add_nonrepeating) {
mparams.n_gpu_layers += 1;
tensor_split[nd - 1] += 1;
}
mparams.tensor_split = tensor_split;
size_t itbo = 0;
for (size_t id = 0; id < nd; id++) {
il0 += ngl_per_device[id].n_full();
il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
if (itbo + 1 >= ntbo) {
tensor_buft_overrides[itbo].pattern = nullptr;
tensor_buft_overrides[itbo].buft = nullptr;
itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides;
throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
+ std::to_string(ntbo) + " is insufficient for model");
throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
+ std::to_string(ntbo) + " is insufficient for model\n");
}
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
tensor_buft_overrides[itbo].buft = overflow_bufts[id];
itbo++;
}
il0 += ngl_per_device[id].n_part;
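
Rough example of what the lambda above produces (hypothetical values): for nd = 2 with ngl_per_device = {(n_layer = 10, n_part = 0), (n_layer = 22, n_part = 3)}, it sets mparams.n_gpu_layers = 32 (one more in the variant where add_nonrepeating puts the non-repeating tensors on the last device), tensor_split = {10, 22} (interpreted as relative proportions), and emits tensor buffer-type overrides for the 3 partial layers at the end of device 1's range so that their overflowing tensors land in the configured overflow buffer type.
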
@@ -449,9 +391,10 @@ static void llama_params_fit_impl(
auto get_memory_for_layers = [&](
const char * func_name,
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
const bool add_nonrepeating) -> std::vector<int64_t> {
llama_model_params mparams_copy = *mparams;
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
const dmds_t dmd_nl = llama_get_device_memory_data(
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
@@ -484,9 +427,9 @@ static void llama_params_fit_impl(
const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (size_t id = 0; id < nd; id++) {
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
global_surplus_cpu_moe += dmd.free;
global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
}
if (global_surplus_cpu_moe > 0) {
@@ -505,18 +448,27 @@ static void llama_params_fit_impl(
std::vector<int64_t> targets; // maximum acceptable memory use per device
targets.reserve(nd);
for (size_t id = 0; id < nd; id++) {
targets.push_back(dmds_full[id].free - margins[id]);
targets.push_back(dmds_full[id].free - margin);
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
}
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
// whether for the optimal memory use we expect to load at least some MoE tensors:
const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
overflow_bufts.reserve(nd);
for (size_t id = 0; id < nd; id++) {
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
for (size_t id = 0; id < nd - 1; ++id) {
overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
}
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
std::vector<ngl_t> ngl_per_device(nd);
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
if (hp_nex > 0) {
for (size_t id = 0; id < nd; id++) {
ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
}
}
// optimize the number of layers per device using the method of false position:
// - ngl_per_device has 0 layers for each device, lower bound
@@ -524,30 +476,22 @@ static void llama_params_fit_impl(
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
// - check memory use of our guess, replace either the low or high bound
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
// - the last device has the output layer, which cannot be a partial layer
if (hp_nex == 0) {
LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
} else {
LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
}
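
A compact sketch of the false-position search described in the comments above, under the simplifying assumptions that memory use grows monotonically with the layer count and that zero layers always fits; mem_for and fit_layers are illustrative names, not the vendored API:

#include <algorithm>
#include <cstdint>
#include <functional>

uint32_t fit_layers(uint32_t n_max, int64_t target, const std::function<int64_t(uint32_t)> & mem_for) {
    uint32_t lo = 0;                       // lower bound, known to fit
    uint32_t hi = n_max;                   // upper bound, may not fit
    int64_t mem_lo = mem_for(lo);
    int64_t mem_hi = mem_for(hi);
    if (mem_hi <= target) {
        return hi;                         // everything fits, nothing to search
    }
    while (hi - lo > 1) {
        // interpolate where memory use would meet the target, stepping at least one layer
        uint32_t step = std::max<uint32_t>(1, uint32_t(int64_t(hi - lo) * (target - mem_lo) / (mem_hi - mem_lo)));
        const uint32_t guess = lo + std::min(step, hi - lo - 1);
        const int64_t mem_guess = mem_for(guess);
        if (mem_guess <= target) {
            lo = guess; mem_lo = mem_guess; // still fits -> raise the lower bound
        } else {
            hi = guess; mem_hi = mem_guess; // too big -> lower the upper bound
        }
    }
    return lo;                             // largest layer count that still fits
}

The vendored loop does the same interpolation per device, but tracks n_layer/n_part pairs and re-measures with get_memory_for_layers instead of a single mem_for callback.
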
uint32_t n_unassigned = hp_ngl;
for (int id = nd - 1; id >= 0; id--) {
uint32_t n_unassigned = hp_ngl + 1;
for (size_t jd = id + 1; jd < nd; ++jd) {
assert(n_unassigned >= ngl_per_device[jd].n_layer);
n_unassigned -= ngl_per_device[jd].n_layer;
}
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
ngl_per_device_high[id].n_layer = n_unassigned;
if (hp_nex > 0) {
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
}
if (ngl_per_device_high[id].n_layer > 0) {
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
@@ -556,26 +500,25 @@ static void llama_params_fit_impl(
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
ngl_per_device_test[id].n_layer += step_size;
if (hp_nex) {
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
step_size - 1 : step_size; // the first layer is the output layer which must always be full
ngl_per_device_test[id].n_part += step_size;
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
ngl_per_device = ngl_per_device_test;
mem = mem_test;
n_unassigned -= ngl_per_device[id].n_layer;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
} else {
ngl_per_device_high = ngl_per_device_test;
mem_high = mem_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
}
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
}
} else {
assert(ngl_per_device_high[id].n_layer == n_unassigned);
ngl_per_device = ngl_per_device_high;
mem = mem_high;
ngl_per_device = ngl_per_device_high;
n_unassigned -= ngl_per_device[id].n_layer;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
}
}
@@ -586,7 +529,7 @@ static void llama_params_fit_impl(
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
}
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
return;
}
@@ -606,20 +549,24 @@ static void llama_params_fit_impl(
assert(id_dense_start < nd);
LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
for (size_t id = 0; id <= id_dense_start; id++) {
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
for (size_t jd = id_dense_start; jd < nd; jd++) {
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
ngl_per_device_high[id].n_layer += n_layer_move;
ngl_per_device_high[jd].n_layer -= n_layer_move;
ngl_per_device_high[jd].n_part = 0;
}
size_t id_dense_start_high = nd - 1;
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
>= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
@@ -635,11 +582,11 @@ static void llama_params_fit_impl(
ngl_per_device_test[id].n_layer += n_convert_jd;
n_converted_test += n_convert_jd;
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
break;
}
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
@@ -654,38 +601,32 @@ static void llama_params_fit_impl(
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
}
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
}
} else {
ngl_per_device = ngl_per_device_high;
mem = mem_high;
id_dense_start = id_dense_start_high;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
// try to fit at least part of one more layer
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
if (ngl_per_device[id_dense_start].n_layer > 0) {
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start;
ngl_per_device_test[id_dense_start_test].n_layer--;
ngl_per_device_test[id_dense_start_test].n_part--;
ngl_per_device_test[id].n_layer++;
ngl_per_device_test[id].n_part++;
if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
id_dense_start_test++;
}
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
if (id < nd - 1) {
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
}
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
@@ -693,10 +634,9 @@ static void llama_params_fit_impl(
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
@@ -705,10 +645,9 @@ static void llama_params_fit_impl(
} else {
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
@@ -723,41 +662,30 @@ static void llama_params_fit_impl(
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
// print info for devices that were not changed during the conversion from dense only to full layers:
for (size_t id = id_dense_start + 1; id < nd; id++) {
const int64_t projected_margin = dmds_full[id].free - mem[id];
LLAMA_LOG_INFO(
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
}
enum llama_params_fit_status llama_params_fit(
bool llama_params_fit(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
const int64_t t0_us = llama_time_us();
llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
bool ok = true;
try {
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
} catch (const llama_params_fit_exception & e) {
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
} catch (const std::runtime_error & e) {
LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
status = LLAMA_PARAMS_FIT_STATUS_ERROR;
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
ok = false;
}
const int64_t t1_us = llama_time_us();
LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
return status;
return ok;
}
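
A hypothetical call site for the single-margin, bool-returning variant of llama_params_fit shown in this hunk; the file name, buffer sizes, and the 1 GiB margin are illustrative assumptions, not values taken from this diff:

#include <vector>
#include "llama.h"

int main() {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    std::vector<float> tensor_split(16, 0.0f);              // one slot per device, sized generously here
    std::vector<llama_model_tensor_buft_override> tbo(256); // must hold at least the ntbo entries checked above

    const size_t   margin    = 1024ull * 1024 * 1024;       // try to leave ~1 GiB free on each device
    const uint32_t n_ctx_min = 4096;

    if (llama_params_fit("model.gguf", &mparams, &cparams,
                         tensor_split.data(), tbo.data(),
                         margin, n_ctx_min, GGML_LOG_LEVEL_INFO)) {
        // mparams/cparams now describe an n_gpu_layers / tensor_split / n_ctx combination
        // projected to fit into free device memory; the vectors must stay alive while mparams is used
    }
    return 0;
}
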
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
struct llama_sampler_chain_params result = {
/*.no_perf =*/ true,
/*.no_perf =*/ true,
};
return result;
@@ -830,7 +758,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
model.t_start_us = tm.t_start_us;
try {
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
ml.print_info();

View File

@@ -22,15 +22,8 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL;
// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
(il + 1) % hparams.n_no_rope_layer_step != 0;
// dual attention normalization (pre)
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
@@ -63,16 +56,19 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
cb(Qcur, "Qcur_normed", il);
cb(Kcur, "Kcur_normed", il);
// RoPE only for sliding_attention layers
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
((il + 1) % hparams.n_no_rope_layer_step) != 0;
if (use_rope) {
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur_rope", il);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur_rope", il);
}

View File

@@ -142,13 +142,11 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
LLM_FFN_GELU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
type_op, LLM_FFN_PAR, il);
model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
cur = build_ffn(cur,
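
For context, the shape-based heuristic used on one side of this hunk can be stated on its own. A minimal sketch, assuming the usual ggml convention that a weight's ne[1] is its output width (pick_ffn_op and ffn_op_kind are illustrative names):

#include <cstdint>
#include "ggml.h"

enum ffn_op_kind { FFN_OP_GELU, FFN_OP_GEGLU };

// if there is no separate gate tensor and the up projection is wider than n_ff,
// assume the gate is fused into ffn_up and use a gated GELU (GEGLU)
static ffn_op_kind pick_ffn_op(const ggml_tensor * ffn_gate, const ggml_tensor * ffn_up, int64_t n_ff) {
    const bool up_contains_gate = ffn_gate == nullptr && ffn_up->ne[1] != n_ff;
    return up_contains_gate ? FFN_OP_GEGLU : FFN_OP_GELU;
}
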

View File

@@ -3,14 +3,12 @@
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
float kq_scale = 1.0f / sqrtf(float(n_embd_head));
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * inpL;
ggml_tensor * cur;
ggml_tensor *inpL, *cur;
inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos();
@@ -46,7 +44,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
}
ggml_tensor * inpSA = inpL;
cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
// build self attention
{

View File

@@ -21,9 +21,6 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
for (int il = 0; il < n_layer; ++il) {
const bool is_swa = hparams.is_swa(il);
// UNUSED:
// const float freq_base_l = model.get_rope_freq_base (cparams, il);
// const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
// norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);

View File

@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
hparams.expert_weights_scale, hparams.expert_weights_scale,
true, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);

View File

@@ -1,5 +1,7 @@
#include "models.h"
llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -10,8 +12,10 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);
if (ubatch.token) {
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

View File

@@ -19,9 +19,6 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
@@ -46,12 +43,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il);

View File

@@ -10,9 +10,10 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);
if (ubatch.token) {
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

View File

@@ -1,5 +1,7 @@
#include "models.h"
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params),
model(model),
@@ -13,9 +15,10 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);
if (ubatch.token) {
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
@@ -245,7 +248,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
// equivalent to get_per_layer_inputs() in python code
// output shape: [n_embd_altup, n_layer, n_tokens]
ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
auto inp = std::make_unique<llm_graph_input_embd>();
auto inp = std::make_unique<llm_graph_input_embd>();
ggml_tensor * inp_per_layer;
if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
@@ -255,20 +258,10 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
cb(inp_per_layer, "inp_per_layer_selected", -1);
res->add_input(std::move(inp));
} else {
// Vision embedding path: use padding token (ID=0) embedding
const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
// Extract and dequantize padding token embedding (column 0)
ggml_tensor * padding_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
ggml_tensor * padding_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size);
inp_per_layer = ggml_cpy(ctx0, padding_q, padding_f32);
// Reshape to [n_embd_altup, n_layer, 1]
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
cb(inp_per_layer, "inp_per_layer_vision", -1);
GGML_ABORT("TODO: support embd input");
}
res->add_input(std::move(inp));
return inp_per_layer;
}
@@ -286,7 +279,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
-1); // [n_embd_altup, n_layer, n_tokens]
cb(per_layer_proj, "per_layer_proj", -1);
inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
cb(inp_per_layer, "inp_per_layer", -1);

View File

@@ -25,12 +25,8 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL;
// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
(il + 1) % hparams.n_no_rope_layer_step != 0;
@@ -71,13 +67,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
if (use_rope) {
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
} else if (inp_attn_scale) {

View File

@@ -1,7 +1,6 @@
#include "models.h"
template <bool embed>
llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -15,14 +14,7 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;
if constexpr (embed) {
inp_attn = build_attn_inp_no_cache();
} else {
inp_attn = build_attn_inp_kv();
}
auto * inp_attn = build_attn_inp_kv();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@@ -153,16 +145,11 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
cb(cur, "result_norm", -1);
res->t_embd = cur;
if constexpr (!embed) {
// lm_head
cur = build_lora_mm(model.output, cur);
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
}
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
template struct llm_build_llama<false>;
template struct llm_build_llama<true>;

View File

@@ -1,117 +0,0 @@
#include "models.h"
llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@@ -1,123 +0,0 @@
#include "models.h"
llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_iswa();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
uint32_t n_head_l = hparams.n_head(il);
uint32_t n_head_kv_l = hparams.n_head_kv(il);
const float freq_base_l = model.get_rope_freq_base(cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
cur = inpL;
// self_attention
{
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
ggml_tensor * sinks = model.layers[il].attn_sinks;
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, sinks, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {
// dense branch
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// MoE branch
cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
cb(cur, "ffn_moe_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@@ -303,7 +303,6 @@ struct llm_build_llada_moe : public llm_graph_context {
llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
};
template <bool embed>
struct llm_build_llama : public llm_graph_context {
llm_build_llama(const llama_model & model, const llm_graph_params & params);
};
@@ -312,18 +311,10 @@ struct llm_build_llama_iswa : public llm_graph_context {
llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_maincoder : public llm_graph_context {
llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mamba : public llm_graph_context_mamba {
llm_build_mamba(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mimo2_iswa : public llm_graph_context {
llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_minicpm3 : public llm_graph_context {
llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
};
@@ -336,10 +327,6 @@ struct llm_build_mistral3 : public llm_graph_context {
llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_modern_bert : public llm_graph_context {
llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mpt : public llm_graph_context {
llm_build_mpt(const llama_model & model, const llm_graph_params & params);
};
@@ -409,11 +396,6 @@ struct llm_build_plamo : public llm_graph_context {
llm_build_plamo(const llama_model & model, const llm_graph_params & params);
};
template <bool iswa>
struct llm_build_plamo3 : public llm_graph_context {
llm_build_plamo3(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_plm : public llm_graph_context {
llm_build_plm(const llama_model & model, const llm_graph_params & params);
};

View File

@@ -1,116 +0,0 @@
#include "models.h"
llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * inp_pos = build_inp_pos();
// construct input embeddings (token, type, position)
inpL = build_inp_embd(model.tok_embd);
cb(inpL, "inp_embd", -1);
// embed layer norm
inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1);
cb(inpL, "inp_norm", -1);
ggml_tensor * inp_out_ids = build_inp_out_ids();
auto * inp_attn = build_attn_inp_no_cache();
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base(cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
cur = inpL;
// attention layer norm
if (model.layers[il].attn_norm) {
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM, il);
cb(cur, "attn_norm", il);
}
// self attention
cur = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
const size_t type_size = ggml_type_size(cur->type);
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd));
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa));
// RoPE
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
// re-add the layer input
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il);
// attention layer norm
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
// attentions bypass the intermediate layer
cur = ggml_add(ctx0, cur, ffn_inp);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM, -1);
cb(cur, "final_norm_out", -1);
if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
// extracting cls token
cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
cb(cur, "cls_pooled_embd", -1);
}
cb(cur, "res_embd", -1);
res->t_embd = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@@ -14,9 +14,6 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL;
// norm
@@ -52,13 +49,13 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);

View File

@@ -1,128 +0,0 @@
#include "models.h"
template <bool iswa>
llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t head_dim_q = hparams.n_embd_head_k;
const int64_t head_dim_v = hparams.n_embd_head_v;
ggml_tensor * cur;
ggml_tensor * inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos();
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;
if constexpr (iswa) {
inp_attn = build_attn_inp_kv_iswa();
} else {
inp_attn = build_attn_inp_kv();
}
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * residual = inpL;
float freq_base_l = 0.0f;
float freq_scale_l = 0.0f;
if constexpr (iswa) {
freq_base_l = model.get_rope_freq_base (cparams, il);
freq_scale_l = model.get_rope_freq_scale(cparams, il);
} else {
freq_base_l = freq_base;
freq_scale_l = freq_scale;
}
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
const int32_t n_head = hparams.n_head(il);
const int32_t n_head_kv = hparams.n_head_kv(il);
const int64_t q_offset = 0;
const int64_t k_offset = head_dim_q * n_head;
const int64_t v_offset = k_offset + head_dim_q * n_head_kv;
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head, n_tokens,
head_dim_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head_kv, n_tokens,
head_dim_q * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim_v, n_head_kv, n_tokens,
head_dim_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "attn_q_norm", il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "attn_k_norm", il);
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
const float attn_scale = 1.0f / sqrtf(float(head_dim_q));
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il);
cb(cur, "attn_out", il);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
}
cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_post_norm", il);
cur = ggml_add(ctx0, cur, residual);
cb(cur, "attn_residual", il);
residual = cur;
cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il);
cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_post_norm", il);
cur = ggml_add(ctx0, cur, residual);
cb(cur, "ffn_residual", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
inpL = cur;
}
cur = inpL;
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
// Explicit template instantiations
template struct llm_build_plamo3<false>;
template struct llm_build_plamo3<true>;

View File

@@ -26,16 +26,10 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL;
ggml_tensor * probs = nullptr;
// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
const bool use_rope = hparams.n_no_rope_layer_step == n_layer ||
il % hparams.n_no_rope_layer_step != 0;
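// e.g. (illustrative): with n_no_rope_layer_step = 4, layers 0, 4, 8, ... skip RoPE;
// when n_no_rope_layer_step == n_layer, every layer applies RoPE.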
ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
cb(probs, "ffn_moe_logits", il);
// norm
@@ -58,11 +52,11 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
if (use_rope) {
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
}
cb(Qcur, "Qcur", il);

View File

@@ -985,11 +985,6 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
{ "\\p{P}", unicode_cpt_flags::PUNCTUATION },
{ "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
{ "\\p{S}", unicode_cpt_flags::SYMBOL },
{ "\\p{Lu}", unicode_cpt_flags::LETTER }, // Uppercase letter
{ "\\p{Ll}", unicode_cpt_flags::LETTER }, // Lowercase letter
{ "\\p{Lt}", unicode_cpt_flags::LETTER }, // Titlecase letter
{ "\\p{Lm}", unicode_cpt_flags::LETTER }, // Modifier letter
{ "\\p{Lo}", unicode_cpt_flags::LETTER }, // Other letter
};
static const std::map<int, int> k_ucat_cpt = {
@@ -1100,26 +1095,22 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
continue;
}
// Match \p{...} Unicode properties of varying lengths
if (regex_expr[i + 0] == '\\' && i + 3 < regex_expr.size() &&
if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
regex_expr[i + 1] == 'p' &&
regex_expr[i + 2] == '{') {
// Find the closing brace
size_t closing_brace = regex_expr.find('}', i + 3);
if (closing_brace != std::string::npos && closing_brace <= i + 10) { // reasonable limit
const std::string pat = regex_expr.substr(i, closing_brace - i + 1);
if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
if (!inside) {
regex_expr_collapsed += '[';
}
regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
if (!inside) {
regex_expr_collapsed += ']';
}
i = closing_brace;
continue;
regex_expr[i + 2] == '{' &&
regex_expr[i + 4] == '}') {
const std::string pat = regex_expr.substr(i, 5);
if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
if (!inside) {
regex_expr_collapsed += '[';
}
regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
if (!inside) {
regex_expr_collapsed += ']';
}
i += 4;
continue;
}
}

View File

@@ -45,14 +45,13 @@
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
// audio-specific
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
@@ -139,62 +138,6 @@
#define TN_TOK_BOI "v.boi"
#define TN_TOK_EOI "v.eoi"
// (conformer) lfm2
#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s"
#define TN_FFN_UP_1 "%s.blk.%d.ffn_up_1.%s"
#define TN_FFN_DOWN_1 "%s.blk.%d.ffn_down_1.%s"
#define TN_POS_BIAS_U "%s.blk.%d.pos_bias_u"
#define TN_POS_BIAS_V "%s.blk.%d.pos_bias_v"
#define TN_NORM_CONV "%s.blk.%d.norm_conv.%s"
#define TN_LINEAR_POS "%s.blk.%d.linear_pos.%s"
#define TN_CONV_DW "%s.blk.%d.conv_dw.%s"
#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s"
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
// mobilenetv5 (gemma3n) definitions
#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight"
#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias"
#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight"
// Stage 0 Block (Edge Residual)
#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight"
#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight"
#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight"
#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight"
// Stage 1+ Block (Universal Inverted Residual)
#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight"
#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight"
#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight"
#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight"
#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight"
#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight"
#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight"
#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight"
#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma"
// Attention Components
#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight"
#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight"
#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight"
#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight"
#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight"
#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight"
#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight"
#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight"
#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
// MSFA
#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight"
#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight"
#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight"
#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
// align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
@@ -212,8 +155,6 @@ enum projector_type {
PROJECTOR_TYPE_QWEN2VL,
PROJECTOR_TYPE_QWEN3VL,
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_GEMMA3NV,
PROJECTOR_TYPE_GEMMA3NA,
PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL,
PROJECTOR_TYPE_QWEN25VL,
@@ -224,15 +165,12 @@ enum projector_type {
PROJECTOR_TYPE_GLMA,
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
PROJECTOR_TYPE_VOXTRAL,
PROJECTOR_TYPE_MUSIC_FLAMINGO,
PROJECTOR_TYPE_LFM2,
PROJECTOR_TYPE_KIMIVL,
PROJECTOR_TYPE_LIGHTONOCR,
PROJECTOR_TYPE_COGVLM,
PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_LFM2A,
PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -246,8 +184,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
@@ -257,15 +193,12 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_GLMA, "glma"},
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
{ PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
{ PROJECTOR_TYPE_LFM2, "lfm2"},
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {

View File

@@ -4,7 +4,6 @@
#include "clip.h"
#include "clip-impl.h"
#include <array>
#include <vector>
#include <unordered_set>
#include <cstdint>
@@ -61,7 +60,6 @@ struct clip_hparams {
std::unordered_set<int32_t> vision_feature_layer;
int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0;
std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
// audio
int32_t n_mel_bins = 0; // whisper preprocessor
@@ -144,74 +142,11 @@ struct clip_layer {
ggml_tensor * deepstack_fc2_w = nullptr;
ggml_tensor * deepstack_fc2_b = nullptr;
// lfm2
ggml_tensor * ff_norm_w = nullptr;
ggml_tensor * ff_norm_b = nullptr;
ggml_tensor * ff_norm_1_w = nullptr;
ggml_tensor * ff_norm_1_b = nullptr;
ggml_tensor * ff_up_1_w = nullptr;
ggml_tensor * ff_up_1_b = nullptr;
ggml_tensor * ff_down_1_w = nullptr;
ggml_tensor * ff_down_1_b = nullptr;
ggml_tensor * pos_bias_u = nullptr;
ggml_tensor * pos_bias_v = nullptr;
ggml_tensor * norm_conv_w = nullptr;
ggml_tensor * norm_conv_b = nullptr;
ggml_tensor * linear_pos_w = nullptr;
ggml_tensor * conv_norm_w = nullptr;
ggml_tensor * conv_norm_b = nullptr;
ggml_tensor * conv_dw_w = nullptr;
ggml_tensor * conv_dw_b = nullptr;
ggml_tensor * conv_pw1_w = nullptr;
ggml_tensor * conv_pw1_b = nullptr;
ggml_tensor * conv_pw2_w = nullptr;
ggml_tensor * conv_pw2_b = nullptr;
bool has_deepstack() const {
return deepstack_fc1_w != nullptr;
}
};
// Expanded MobileNetV5 block structure for Gemma3n vision encoder
struct mobilenetv5_block {
// Stage 0 (Edge Residual)
ggml_tensor * s0_conv_exp_w = nullptr;
ggml_tensor * s0_bn1_w = nullptr;
ggml_tensor * s0_conv_pwl_w = nullptr;
ggml_tensor * s0_bn2_w = nullptr;
// Stage 1+ (Universal Inverted Residual)
ggml_tensor * dw_start_w = nullptr;
ggml_tensor * dw_start_bn_w = nullptr;
ggml_tensor * pw_exp_w = nullptr;
ggml_tensor * pw_exp_bn_w = nullptr;
ggml_tensor * dw_mid_w = nullptr;
ggml_tensor * dw_mid_bn_w = nullptr;
ggml_tensor * pw_proj_w = nullptr;
ggml_tensor * pw_proj_bn_w = nullptr;
ggml_tensor * layer_scale_w = nullptr;
// Attention (MQA) components
ggml_tensor * attn_q_w = nullptr;
ggml_tensor * attn_k_w = nullptr;
ggml_tensor * attn_v_w = nullptr;
ggml_tensor * attn_o_w = nullptr;
// Optional downsampling/norm in attention
ggml_tensor * attn_k_dw_w = nullptr;
ggml_tensor * attn_k_norm_w = nullptr;
ggml_tensor * attn_v_dw_w = nullptr;
ggml_tensor * attn_v_norm_w = nullptr;
// Block norm (often present in attention blocks)
ggml_tensor * attn_norm_w = nullptr;
};
struct clip_model {
clip_modality modality = CLIP_MODALITY_VISION;
projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -328,23 +263,6 @@ struct clip_model {
ggml_tensor * mm_input_proj_w = nullptr;
ggml_tensor * mm_soft_emb_norm_w = nullptr;
// mobilenetv5 for gemma3n
std::vector<mobilenetv5_block> mobilenet_blocks;
std::vector<int> mobilenet_stage_ends;
ggml_tensor * mobilenet_stem_conv_w = nullptr;
ggml_tensor * mobilenet_stem_conv_b = nullptr;
ggml_tensor * mobilenet_stem_norm_w = nullptr;
ggml_tensor * mm_post_proj_norm_w = nullptr;
// Multi-Scale Fusion Adapter (MSFA) components
ggml_tensor * msfa_concat_conv_w = nullptr;
ggml_tensor * msfa_concat_norm_w = nullptr;
ggml_tensor * msfa_ffn_expand_w = nullptr;
ggml_tensor * msfa_ffn_project_w = nullptr;
ggml_tensor * msfa_ffn_expand_bn = nullptr;
ggml_tensor * msfa_ffn_project_bn = nullptr;
// pixtral, glm4v
ggml_tensor * token_embd_img_break = nullptr;
ggml_tensor * mm_patch_merger_w = nullptr;
@@ -368,16 +286,9 @@ struct clip_model {
ggml_tensor * mm_boi = nullptr;
ggml_tensor * mm_eoi = nullptr;
// lfm2 audio
std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
ggml_tensor * pre_encode_out_w = nullptr;
ggml_tensor * pre_encode_out_b = nullptr;
bool audio_has_avgpool() const {
return proj_type == PROJECTOR_TYPE_QWEN2A
|| proj_type == PROJECTOR_TYPE_VOXTRAL
|| proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
}
bool audio_has_stack_frames() const {

View File

@@ -801,10 +801,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_siglip>(ctx, img);
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
@@ -835,7 +831,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
} break;
@@ -855,18 +850,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_llava>(ctx, img);
} break;
case PROJECTOR_TYPE_LFM2A:
{
builder = std::make_unique<clip_graph_conformer>(ctx, img);
} break;
case PROJECTOR_TYPE_GLM4V:
{
builder = std::make_unique<clip_graph_glm4v>(ctx, img);
} break;
case PROJECTOR_TYPE_YOUTUVL:
{
builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
} break;
default:
GGML_ABORT("missing cgraph builder");
}
@@ -1167,14 +1154,6 @@ struct clip_model_loader {
// test model (tinygemma3) has a different value, we optionally read it
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
// Similar configuration to Gemma3
hparams.n_merge = 1; // MobileNetV5 handles resizing internally
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
@@ -1192,20 +1171,6 @@ struct clip_model_loader {
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
}
} break;
case PROJECTOR_TYPE_YOUTUVL:
{
hparams.n_merge = 2;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
std::vector<int> wa_layer_indexes_vec;
get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
for (auto & layer : wa_layer_indexes_vec) {
hparams.wa_layer_indexes.insert(layer);
}
// support up to max_height * max_width = 8000 * 8000; 8000/16/2 = 250 image tokens per side (250 * 250 = 62500 total)
hparams.set_limit_image_tokens(1, 62500);
hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_GLM4V:
{
hparams.rope_theta = 10000.0f;
@@ -1224,7 +1189,6 @@ struct clip_model_loader {
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
@@ -1240,15 +1204,6 @@ struct clip_model_loader {
hparams.audio_window_len = 400;
hparams.audio_hop_len = 160;
} break;
case PROJECTOR_TYPE_LFM2A:
{
// audio preprocessing params
hparams.audio_chunk_len = 1; // in seconds
hparams.audio_sample_rate = 16000;
hparams.audio_n_fft = 512;
hparams.audio_window_len = 400;
hparams.audio_hop_len = 160;
} break;
default:
break;
}
@@ -1274,14 +1229,7 @@ struct clip_model_loader {
LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
if (!hparams.wa_layer_indexes.empty()) {
LOG_INF("%s: wa_layer_indexes: ", __func__);
for (auto & layer : hparams.wa_layer_indexes) {
LOG_INF("%d ", layer);
}
LOG_INF("\n");
}
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
if (hparams.image_min_pixels > 0) {
LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
}
@@ -1363,10 +1311,6 @@ struct clip_model_loader {
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
hparams.n_layer = 0; // gemma3n does not use normal layer structure
}
// layers
model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) {
@@ -1441,7 +1385,6 @@ struct clip_model_loader {
}
}
switch (model.proj_type) {
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
@@ -1554,14 +1497,6 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_YOUTUVL:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_GLM4V:
{
model.projection = get_tensor(TN_MM_PROJECTOR);
@@ -1581,112 +1516,11 @@ struct clip_model_loader {
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // consume the BN tensor if present (likely already folded into the conv)
model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
// Dynamically load blocks stage by stage
for (int stage = 0; stage < 4; ++stage) {
int blocks_found_in_stage = 0;
for (int blk_idx = 0; ; ++blk_idx) {
bool found_block = false;
mobilenetv5_block block;
// 1. Check for Edge Residual (S0)
block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
if (block.s0_conv_exp_w) {
found_block = true;
block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
}
// 2. Check for UIR (Universal Inverted Residual)
else {
// Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
if (block.dw_start_w || block.pw_exp_w) {
found_block = true;
if (block.dw_start_w) {
block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
}
if (block.pw_exp_w) {
block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
}
block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
if (block.dw_mid_w) {
block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
}
block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
if (block.pw_proj_w) {
block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
}
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
}
}
// 3. Check for Attention (MQA)
// Even if UIR/Edge check failed, this might be a pure attention block
ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
if (attn_q_check) {
found_block = true;
block.attn_q_w = attn_q_check;
block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
// Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
if (!block.layer_scale_w) {
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
}
}
if (found_block) {
model.mobilenet_blocks.push_back(block);
blocks_found_in_stage++;
} else {
// End of blocks for this stage
break;
}
}
// Track where this stage ends in the flat vector
if (blocks_found_in_stage > 0) {
model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
}
}
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break;
case PROJECTOR_TYPE_IDEFICS3:
{
model.projection = get_tensor(TN_MM_PROJECTOR);
} break;
case PROJECTOR_TYPE_LFM2:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_KIMIVL:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
@@ -1746,17 +1580,6 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
} break;
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
} break;
case PROJECTOR_TYPE_INTERNVL:
{
model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -1805,52 +1628,6 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
} break;
case PROJECTOR_TYPE_LFM2A:
{
for (int i : {0, 2, 3, 5, 6}) {
model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight"));
model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias"));
}
model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight"));
model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias"));
model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight"));
model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias"));
for (int il = 0; il < hparams.n_layer; ++il) {
auto & layer = model.layers[il];
layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"));
layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"));
layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"));
layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight"));
layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"));
layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"));
layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il));
layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il));
layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"));
layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"));
layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight"));
layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"));
layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"));
layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight"));
layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias"));
layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight"));
layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"));
layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight"));
layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
}
} break;
default:
GGML_ASSERT(false && "unknown projector type");
}
@@ -2155,7 +1932,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
try {
clip_model_loader loader(fname);
bool skip_audio = false;
if (loader.has_vision) {
ctx_vision = new clip_ctx(ctx_params);
@@ -2165,14 +1941,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
loader.warmup(*ctx_vision);
}
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
// we can remove this check when we implement audio support for Gemma 3N
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
}
if (loader.has_audio && !skip_audio) {
if (loader.has_audio) {
ctx_audio = new clip_ctx(ctx_params);
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
loader.load_tensors(*ctx_audio);
@@ -2896,57 +2668,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
// res_imgs->data[0] = *res;
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_YOUTUVL:
{
const int patch_size = params.patch_size; // typically 16
const int merge_size = params.n_merge; // typically 2
const int align_size = patch_size * merge_size; // 32
const int max_num_patches = params.image_max_pixels > 0 ?
params.image_max_pixels / (patch_size * patch_size) : 256;
// Linear search for optimal scale to fit within max_num_patches
float scale = 1.0f;
int target_height = original_size.height;
int target_width = original_size.width;
auto get_scaled_image_size = [align_size](float scale, int size) -> int {
float scaled_size = size * scale;
// Round up to nearest multiple of align_size
int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
// Ensure at least one patch
return std::max(align_size, aligned);
};
// Linear search with 0.02 step size
while (scale > 0.0f) {
target_height = get_scaled_image_size(scale, original_size.height);
target_width = get_scaled_image_size(scale, original_size.width);
int num_patches_h = target_height / patch_size;
int num_patches_w = target_width / patch_size;
int num_patches = num_patches_h * num_patches_w;
if (num_patches > max_num_patches) {
scale -= 0.02f;
} else {
break;
}
}
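// e.g. (illustrative): an 800x600 image with the default max_num_patches = 256 aligns to
// 800x608 at scale 1.0 (50*38 = 1900 patches) and keeps stepping the scale down by 0.02
// until the 32-aligned size yields at most 256 patches (roughly a 288x224 resize, 252 patches).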
clip_image_size new_size = {target_width, target_height};
// Resize the image
clip_image_u8 resized;
img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
// Normalize to float32
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
// Add to results
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_IDEFICS3:
{
@@ -3010,16 +2731,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
clip_image_u8 resized_image;
int sz = params.image_size;
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_JANUS_PRO:
{
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
@@ -3189,7 +2900,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2;
default:
break;
@@ -3205,7 +2915,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
return (img->ny / params.patch_size) / 2;
default:
break;
@@ -3266,7 +2975,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
{
// dynamic size (2 conv, so double patch size)
int x_patch = img->nx / (params.patch_size * 2);
@@ -3282,12 +2990,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
int scale_factor = ctx->model.hparams.n_merge;
n_patches /= (scale_factor * scale_factor);
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
// MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
// regardless of input size (see architecture description)
n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
} break;
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
{
@@ -3313,7 +3015,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
n_patches = img->nx;
@@ -3346,10 +3047,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
{
n_patches += 2; // for BOI and EOI token embeddings
} break;
case PROJECTOR_TYPE_LFM2A:
{
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
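// i.e. three successive ceil-divisions by 2, which appears to match the three stride-2
// convs in the conformer pre-encoder; e.g. (illustrative) nx = 3000 frames -> 375 output tokens.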
} break;
default:
GGML_ABORT("unsupported projector type");
}
@@ -3400,6 +3097,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int pos_w = image_size_width / patch_size;
const int pos_h = image_size_height / patch_size;
const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
auto get_inp_tensor = [&gf](const char * name) {
ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@@ -3548,11 +3246,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_YOUTUVL:
{
// pw * ph = number of tokens output by the ViT after applying the patch merger
// ipw * ipw = number of vision tokens processed inside the ViT
const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
const int merge_ratio = 2;
const int pw = image_size_width / patch_size / merge_ratio;
const int ph = image_size_height / patch_size / merge_ratio;
@@ -3563,7 +3259,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
std::vector<int> inv_idx(ph * pw);
if (use_window_attn) {
const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
const int attn_window_size = 112;
const int grid_window = attn_window_size / patch_size / merge_ratio;
int dst = 0;
// [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3680,7 +3376,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("patches", patches);
} break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_QWEN2A:
@@ -3688,7 +3383,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_COGVLM:
{
@@ -3711,27 +3405,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
set_input_i32("pos_w", pos_data);
} break;
case PROJECTOR_TYPE_LFM2A:
{
GGML_ASSERT(imgs.entries.size() == 1);
const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());
auto d_model = 512;
auto seq_len = n_frames * 2 - 1;
std::vector<float> pos_emb(d_model*seq_len);
std::vector<double> inv_freq(d_model / 2);
for (size_t i = 0; i < inv_freq.size(); ++i) {
inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i)));
}
for (int64_t pos = 0; pos < seq_len; ++pos) {
for (size_t i = 0; i < inv_freq.size(); ++i) {
const float ang = (n_frames - pos - 1) * inv_freq[i];
pos_emb[pos*d_model + 2*i + 0] = sinf(ang); // even
pos_emb[pos*d_model + 2*i + 1] = cosf(ang); // odd
}
}
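// note: the relative offset (n_frames - pos - 1) sweeps from +(n_frames - 1) down to
// -(n_frames - 1) across the 2*n_frames - 1 rows, so every query/key offset gets one
// sin/cos pair per frequency for the conformer's relative-position attention.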
set_input_f32("pos_emb", pos_emb);
} break;
default:
GGML_ABORT("Unknown projector type");
}
@@ -3802,19 +3475,16 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_YOUTUVL:
return ctx->model.mm_1_b->ne[0];
case PROJECTOR_TYPE_QWEN3VL:
// main path + deepstack paths
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
return ctx->model.mm_input_proj_w->ne[0];
case PROJECTOR_TYPE_IDEFICS3:
return ctx->model.projection->ne[1];
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_INTERNVL:
return ctx->model.mm_3_w->ne[1];
@@ -3829,8 +3499,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1];
case PROJECTOR_TYPE_LFM2A:
return ctx->model.position_embeddings->ne[0];
case PROJECTOR_TYPE_GLM4V:
return ctx->model.mm_ffn_down_w->ne[1];
default:
@@ -3839,7 +3507,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
}
int clip_is_minicpmv(const struct clip_ctx * ctx) {
// TODO: remove this function
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
return ctx->model.hparams.minicpmv_version;
}
@@ -3847,26 +3514,24 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
}
bool clip_is_glm(const struct clip_ctx * ctx) {
// TODO: remove this function
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
}
bool clip_is_mrope(const struct clip_ctx * ctx) {
switch (ctx->proj_type()) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
return true;
default:
return false;
}
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
|| ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
}
bool clip_is_llava(const struct clip_ctx * ctx) {
return ctx->model.hparams.has_llava_projector;
}
bool clip_is_gemma3(const struct clip_ctx * ctx) {
return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
}
bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
return ctx->model.modality == CLIP_MODALITY_VISION;
}
@@ -3876,16 +3541,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
}
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
switch (ctx->proj_type()) {
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return true;
default:
return false;
}
return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
|| ctx->proj_type() == PROJECTOR_TYPE_GLMA
|| ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
}
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {

View File

@@ -106,8 +106,7 @@ int clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm(const struct clip_ctx * ctx);
bool clip_is_mrope(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx);
// note for contributors: this clip_is_(model) pattern is deprecated
// do NOT add new functions like this
bool clip_is_gemma3(const struct clip_ctx * ctx);
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

View File

@@ -1,217 +0,0 @@
#include "models.h"
ggml_cgraph * clip_graph_conformer::build() {
const int n_frames = img.nx;
const int n_pos = n_frames / 2;
const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
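// note: n_pos_embd works out to 2*n_out - 1, where n_out is the token count after the
// three stride-2 subsampling convs below -- one row per relative offset.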
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd);
ggml_set_name(pos_emb, "pos_emb");
ggml_set_input(pos_emb);
ggml_build_forward_expand(gf, pos_emb);
ggml_tensor * inp = build_inp_raw(1);
cb(inp, "input", -1);
auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
// pre encode, conv subsampling
{
// layer.0 - conv2d
cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]);
cb(cur, "conformer.pre_encode.conv.{}", 0);
// layer.1 - relu
cur = ggml_relu_inplace(ctx0, cur);
// layer.2 conv2d dw
cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]);
cb(cur, "conformer.pre_encode.conv.{}", 2);
// layer.3 conv2d
cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]);
cb(cur, "conformer.pre_encode.conv.{}", 3);
// layer.4 - relu
cur = ggml_relu_inplace(ctx0, cur);
// layer.5 conv2d dw
cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]);
cb(cur, "conformer.pre_encode.conv.{}", 5);
// layer.6 conv2d
cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]);
cb(cur, "conformer.pre_encode.conv.{}", 6);
// layer.7 - relu
cur = ggml_relu_inplace(ctx0, cur);
// flatten channel and frequency axis
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
// calculate out
cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
cb(cur, "conformer.pre_encode.out", -1);
}
// pos_emb
cb(pos_emb, "pos_emb", -1);
for (int il = 0; il < hparams.n_layer; il++) {
const auto & layer = model.layers[il];
auto * residual = cur;
cb(cur, "layer.in", il);
// feed_forward1
cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_feed_forward1", il);
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU,
il);
cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);
const auto fc_factor = 0.5f;
residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
// self-attention
{
cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_self_att", il);
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
Q_bias_u = ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3);
ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);
// TODO @ngxson : some cont can/should be removed when ggml_mul_mat supports these cases
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));
// build_attn won't fit due to matrix_ac and matrix_bd separation
ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
p = ggml_permute(ctx0, p, 0, 2, 1, 3);
auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
// rel shift
{
const auto pos_len = matrix_bd->ne[0];
const auto q_len = matrix_bd->ne[1];
const auto h = matrix_bd->ne[2];
matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
matrix_bd = ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
matrix_bd->nb[2], matrix_bd->nb[0] * q_len);
matrix_bd = ggml_cont_3d(ctx0, matrix_bd, pos_len, q_len, h);
}
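// note (Transformer-XL style relative attention): matrix_ac is conceptually the content
// term (Q + pos_bias_u) * K^T and matrix_bd the position term (Q + pos_bias_v) * P^T,
// where P is the projected pos_emb; the pad/roll/reshape/view sequence above is the usual
// "rel-shift" trick that realigns matrix_bd so each score is indexed by relative offset
// rather than by absolute position in pos_emb.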
matrix_bd = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0);
auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
cb(scores, "conformer.layers.{}.self_attn.id0", il);
ggml_tensor * attn = ggml_soft_max(ctx0, scores);
ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
x = ggml_permute(ctx0, x, 2, 0, 1, 3);
x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
out = ggml_add(ctx0, out, layer.o_b);
cb(out, "conformer.layers.{}.self_attn.linear_out", il);
cur = out;
}
residual = ggml_add(ctx0, residual, cur);
cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_conv", il);
// conv
{
auto * x = cur;
x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x);
x = ggml_add(ctx0, x, layer.conv_pw1_b);
cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
// ggml_glu doesn't support sigmoid
// TODO @ngxson : support this op in ggml
{
int64_t d = x->ne[0] / 2;
ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
}
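// i.e. out = x[:d] * sigmoid(x[d:]) along the channel dim -- a manual sigmoid-GLU built
// from two views, since ggml_glu has no sigmoid variant here.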
// use ggml_ssm_conv for f32 precision
x = ggml_pad(ctx0, x, 4, 0, 0, 0);
x = ggml_roll(ctx0, x, 4, 0, 0, 0);
x = ggml_pad(ctx0, x, 4, 0, 0, 0);
x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
x = ggml_add(ctx0, x, layer.conv_dw_b);
x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
x = ggml_silu(ctx0, x);
// pointwise_conv2
x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x);
x = ggml_add(ctx0, x, layer.conv_pw2_b);
cur = x;
}
residual = ggml_add(ctx0, residual, cur);
cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_feed_forward2", il);
cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b,
FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);
residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
cb(residual, "conformer.layers.{}.conv.id", il);
cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_out", il);
}
// audio adapter
cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
cb(cur, "audio_adapter.model.{}", 0);
cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);
cb(cur, "projected", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}

View File

@@ -1,451 +0,0 @@
#include "models.h"
// Helpers for MobileNetV5 Blocks
// RMS Norm 2D - normalizes over channels for each spatial position
ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
// inp: [W, H, C, B]
ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
cur = ggml_cont(ctx0, cur);
cur = ggml_rms_norm(ctx0, cur, eps);
if (weight) {
cur = ggml_mul(ctx0, cur, weight);
}
cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
cur = ggml_cont(ctx0, cur);
return cur;
}
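// note: ggml_rms_norm normalizes along dim 0, so the permute moves the channel axis into
// dim 0 ([W, H, C, B] -> [C, H, W, B]), each spatial position is normalized over its
// channels, and the second permute restores the original layout; 'weight' acts as a
// per-channel scale.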
// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
const int64_t ih = inp->ne[1]; // height
const int64_t iw = inp->ne[0]; // width
// Calculate output size (ceil division)
const int64_t oh = (ih + stride_h - 1) / stride_h;
const int64_t ow = (iw + stride_w - 1) / stride_w;
// Calculate padding needed
const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);
// Split padding asymmetrically
const int pad_h_top = pad_h / 2;
const int pad_h_bottom = pad_h - pad_h_top;
const int pad_w_left = pad_w / 2;
const int pad_w_right = pad_w - pad_w_left;
// Apply padding if needed
// ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
// For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
if (pad_h > 0 || pad_w > 0) {
inp = ggml_pad_ext(ctx0, inp,
pad_w_left, pad_w_right, // width padding (dim 0)
pad_h_top, pad_h_bottom, // height padding (dim 1)
0, 0, // no channel padding (dim 2)
0, 0); // no batch padding (dim 3)
}
return inp;
}
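// e.g. (illustrative): iw = 7, kernel_w = 3, stride_w = 2, dilation 1 gives ow = 4 and
// pad_w = max(0, 3*2 + 2 + 1 - 7) = 2, split as 1 left / 1 right; any odd remainder goes
// to the bottom/right, matching PyTorch/TF "SAME" behaviour.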
// Edge Residual Block (Stage 0)
ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
ggml_tensor * cur = inp;
// 1. Expansion Conv (3x3)
if (stride == 2) {
// Case: Downsampling (Block 0)
// Replicates Conv2dSame(kernel=3, stride=2)
cur = pad_same_2d(cur, 3, 3, stride, stride);
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
} else {
// Case: Normal 3x3 Block (Block 1, 2)
// Replicates Conv2d(kernel=3, stride=1, padding=1)
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
}
// BN + Activation
if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
cur = ggml_gelu(ctx0, cur);
// 2. Pointwise Linear Conv (1x1)
// 1x1 Convs usually have padding=0 and stride=1
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);
// 3. Residual Connection
// Only apply residual if spatial dimensions and channels match (stride 1)
if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
cur = ggml_add(ctx0, cur, inp);
}
return cur;
}
// Universal Inverted Residual Block (Stage 1+)
ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
ggml_tensor * cur = inp;
// 1. Depthwise Start (Optional)
// NOTE: dw_start always has stride=1 (no downsampling here)
if (block.dw_start_w) {
int k = block.dw_start_w->ne[0]; // 3 or 5
int p = k / 2;
cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
}
// 2. Pointwise Expansion (1x1)
if (block.pw_exp_w) {
// Standard 1x1 conv, pad=0, stride=1
cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
cur = ggml_gelu(ctx0, cur);
}
// 3. Depthwise Mid (Optional)
// NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
if (block.dw_mid_w) {
int k = block.dw_mid_w->ne[0]; // 3 or 5
if (stride > 1) {
// Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
cur = pad_same_2d(cur, k, k, stride, stride);
cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
} else {
// Case: Stride 1 -> Use Standard Symmetric Padding
int p = k / 2;
cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
}
if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
cur = ggml_gelu(ctx0, cur);
}
// 4. Pointwise Projection (1x1)
if (block.pw_proj_w) {
cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
}
// Apply Layer Scaling if present
if (block.layer_scale_w) {
cur = ggml_mul(ctx0, cur, block.layer_scale_w);
}
// 5. Residual Connection
bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
bool same_channel = (inp->ne[2] == cur->ne[2]);
if (same_spatial && same_channel) {
cur = ggml_add(ctx0, cur, inp);
}
return cur;
}
// Attention Block (MQA)
ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
ggml_tensor * cur = inp;
// Norm
if (block.attn_norm_w) {
cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
}
// 1. Q Calculation
ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);
// 2. K Calculation (Downsampled)
// Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
ggml_tensor * k_inp = cur;
if (block.attn_k_dw_w) {
int k_size = block.attn_k_dw_w->ne[0]; // Usually 3
k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding
k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0
if (block.attn_k_norm_w) {
k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
}
}
ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);
// 3. V Calculation (Downsampled)
// Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
ggml_tensor * v_inp = cur;
if (block.attn_v_dw_w) {
int v_size = block.attn_v_dw_w->ne[0]; // Usually 3
v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding
v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0
if (block.attn_v_norm_w) {
v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
}
}
ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);
const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
const int D = k->ne[2]; // Head dimension
const int n_head = q->ne[2] / D;
const int N = W * H;
// Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
q = ggml_cont(ctx0, q);
const int Wk = k->ne[0]; const int Hk = k->ne[1];
const int M = Wk * Hk;
// Process K: [Wk, Hk, D, B] -> [D, M, 1, B]
k = ggml_reshape_3d(ctx0, k, M, D, B);
k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
k = ggml_cont(ctx0, k);
// Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
v = ggml_reshape_3d(ctx0, v, M, D, B);
v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
v = ggml_cont(ctx0, v); // [M, D, 1, B]
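// e.g. (illustrative shapes): for a 16x16 feature map, N = 256 query positions attend to
// M = 64 key/value positions (K and V were spatially halved by the stride-2 depthwise
// convs above); a single shared K/V head serves all n_head query heads (multi-query attention).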
// Multi-Query Attention
float scale = 1.0f / sqrtf((float)D);
// Step 1: Compute Q @ K.T
ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
scores = ggml_scale(ctx0, scores, scale);
scores = ggml_soft_max(ctx0, scores);
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
kqv = ggml_cont(ctx0, kqv);
kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
kqv = ggml_cont(ctx0, kqv);
// Output projection
cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);
// Residual & Layer Scale
if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
if (block.layer_scale_w) {
cur = ggml_mul(ctx0, cur, block.layer_scale_w);
}
cur = ggml_add(ctx0, cur, inp);
}
return cur;
}
ggml_cgraph * clip_graph_mobilenetv5::build() {
ggml_tensor * inp = build_inp_raw();
// 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding
cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0
if (model.mobilenet_stem_conv_b) {
cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
}
if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
cur = ggml_gelu(ctx0, cur);
// 2. Blocks
std::vector<ggml_tensor*> intermediate_features;
const int total_blocks = model.mobilenet_blocks.size();
auto is_stage_start = [&](int i) {
if (i == 0) return true;
for (int end_idx : model.mobilenet_stage_ends) {
if (i == end_idx + 1) return true;
}
return false;
};
auto is_fusion_point = [&](int i) {
if (model.mobilenet_stage_ends.size() >= 4) {
if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
} else {
if (i == total_blocks - 1) return true;
}
return false;
};
for (int i = 0; i < total_blocks; i++) {
const auto & block = model.mobilenet_blocks[i];
int stride = is_stage_start(i) ? 2 : 1;
if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride);
else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block);
else cur = build_inverted_residual(cur, block, stride);
if (is_fusion_point(i)) {
intermediate_features.push_back(cur);
}
}
// 3. Multi-Scale Fusion Adapter (MSFA)
if (!intermediate_features.empty()) {
// A. Reference resolution: the PyTorch implementation uses inputs[0].
// We assume intermediate_features[0] is the "high resolution" target.
// In MobileNet designs this is typically the feature map with the smallest stride (e.g. 32x32).
ggml_tensor* target_feat = intermediate_features[0];
int high_res_w = target_feat->ne[0];
int high_res_h = target_feat->ne[1];
std::vector<ggml_tensor*> resized_feats;
// B. Resize inputs to match inputs[0] (High Resolution)
for (auto feat : intermediate_features) {
int feat_w = feat->ne[0];
int feat_h = feat->ne[1];
// PyTorch: if feat_size < high_resolution: interpolate
if (feat_w < high_res_w || feat_h < high_res_h) {
// Calculate the integer scale factor.
// Note: PyTorch 'nearest' interpolation works with arbitrary float scales,
// while ggml_upscale takes an integer factor (or a target size, depending on the helper used).
// Assuming standard power-of-2 scaling here (e.g. 16 -> 32 means scale=2).
int scale_w = high_res_w / feat_w;
// int scale_h = high_res_h / feat_h;
// Safety check for non-integer scaling if strictly replicating
GGML_ASSERT(high_res_w % feat_w == 0);
// Upsample (Nearest Neighbor)
// scale_w (typically 2) is the integer upscale factor
feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
}
resized_feats.push_back(feat);
}
// C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
cur = resized_feats[0];
for (size_t k = 1; k < resized_feats.size(); ++k) {
cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
}
// D. FFN (UniversalInvertedResidual)
// Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm
// 1. Expansion
if (model.msfa_ffn_expand_w) {
// 1x1 Conv
cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);
if (model.msfa_ffn_expand_bn) {
cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
}
cur = ggml_gelu(ctx0, cur);
}
// 2. Projection (No DW because kernel_size=0)
if (model.msfa_ffn_project_w) {
// 1x1 Conv
cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);
// UniversalInvertedResidual typically has a norm after projection
if (model.msfa_ffn_project_bn) {
cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
}
}
// E. Final Downsample to Target Resolution (Output Resolution)
// PyTorch: matches self.output_resolution (e.g. 16x16)
const int target_out_res = 16;
int current_w = cur->ne[0];
if (current_w > target_out_res) {
int s = current_w / target_out_res;
GGML_ASSERT(current_w % target_out_res == 0);
// Avg Pool: Kernel=s, Stride=s
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
}
// F. Final Norm
if (model.msfa_concat_norm_w) {
cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
}
}
// 4. Gemma 3n Multimodal Projection (Embedder)
// Input: 'cur' is [Width, Height, Channels, Batch]
int W = cur->ne[0];
int H = cur->ne[1];
int C = cur->ne[2];
int B = cur->ne[3];
GGML_ASSERT(C == hparams.n_embd);
// 1. Permute and Flatten to [Channels, Tokens, Batch]
// PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
cur = ggml_cont(ctx0, cur);
cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
cur = ggml_cont(ctx0, cur);
// 2. FEATURE SCALING
// PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
const float scale_factor = sqrtf((float)C);
cur = ggml_scale(ctx0, cur, scale_factor);
// 3. SOFT EMBEDDING NORM
// PyTorch: self._norm(x) * self.weight
// We must normalize regardless, then multiply if weight exists.
{
const float eps = 1e-6f; // Gemma3n uses 1e-6
cur = ggml_rms_norm(ctx0, cur, eps);
if (model.mm_soft_emb_norm_w) {
// Weight shape is (2048,) -> Element-wise broadcast multiply
cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
}
}
// 4. PROJECTION
// PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
// Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
if (model.mm_input_proj_w) {
cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
}
// 5. POST PROJECTION NORM
// PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
// with_scale=False means weight is registered as buffer with value 1.0
// So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
{
const float eps = 1e-6f;
cur = ggml_rms_norm(ctx0, cur, eps);
if (model.mm_post_proj_norm_w) {
// If weight is loaded, multiply (should be ~1.0 anyway)
cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
}
}
ggml_build_forward_expand(gf, cur);
return gf;
}
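For orientation, a rough worked example of the embedder shapes, using only numbers that appear above: with target_out_res = 16 the MSFA hands the embedder a 16x16 map, i.e. W*H = 256 visual tokens, and with the 2048-wide soft-embedding weight (hparams.n_embd is asserted to equal C) each token is scaled by sqrt(2048) ≈ 45.25 before the soft-embedding RMS norm; mm_input_proj_w then maps each 2048-d token into the text model's hidden size.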

View File

@@ -2,11 +2,6 @@
#include "../clip-graph.h"
/*
* IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
* We encourage human contributors to ensure the quality and reliability of the codebase.
*/
struct clip_graph_siglip : clip_graph {
clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
@@ -27,11 +22,6 @@ struct clip_graph_qwen3vl : clip_graph {
ggml_cgraph * build() override;
};
struct clip_graph_youtuvl : clip_graph {
clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_minicpmv : clip_graph {
clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
@@ -67,45 +57,7 @@ struct clip_graph_whisper_enc : clip_graph {
ggml_cgraph * build() override;
};
struct clip_graph_conformer : clip_graph {
clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_glm4v : clip_graph {
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_mobilenetv5 : clip_graph {
clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * rms_norm_2d(
ggml_tensor * inp,
ggml_tensor * weight,
float eps = 1e-6f);
ggml_tensor* pad_same_2d(
ggml_tensor* inp,
int kernel_h,
int kernel_w,
int stride_h,
int stride_w,
int dilation_h = 1,
int dilation_w = 1);
ggml_tensor * build_edge_residual(
ggml_tensor * inp,
const mobilenetv5_block & block,
int stride);
ggml_tensor * build_inverted_residual(
ggml_tensor * inp,
const mobilenetv5_block & block,
int stride);
ggml_tensor * build_mobilenet_attn(
ggml_tensor * inp,
const mobilenetv5_block & block);
};
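rms_norm_2d is declared above, but its definition is not part of this hunk. A minimal sketch of one way it could be realized with the ggml ops already used in the graph (normalize over the channel dimension of a [W, H, C, B] tensor, then apply the per-channel weight); the actual helper in the patch may differ:
#include "ggml.h"
static ggml_tensor * rms_norm_2d_sketch(ggml_context * ctx0, ggml_tensor * inp,
                                        ggml_tensor * weight, float eps = 1e-6f) {
    // inp is [W, H, C, B]; move channels to dim 0 so ggml_rms_norm normalizes over them
    ggml_tensor * cur = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [C, W, H, B]
    cur = ggml_rms_norm(ctx0, cur, eps);
    cur = ggml_mul(ctx0, cur, weight);   // per-channel scale, broadcast over W, H, B
    return ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // back to [W, H, C, B]
}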

View File

@@ -50,15 +50,10 @@ ggml_cgraph * clip_graph_siglip::build() {
const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor);
// projection, in LFM2-VL input norm is optional
if (model.mm_input_norm_w) {
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
}
if (model.mm_input_norm_b) {
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
}
// projection
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,

View File

@@ -86,15 +86,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
FFN_GELU_ERF,
-1);
} else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
// projector
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU_ERF,
-1);
} else if (proj_type == PROJECTOR_TYPE_GLMA) {
cur = ggml_norm(ctx0, cur, hparams.eps);
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);

View File

@@ -1,179 +0,0 @@
#include "models.h"
ggml_cgraph * clip_graph_youtuvl::build() {
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
const bool use_window_attn = !hparams.wa_layer_indexes.empty();
const int n_pos = n_patches;
const int num_position_ids = n_pos * 4;
const int m = 2;
const int Wp = n_patches_x;
const int Hp = n_patches_y;
const int Hm = Hp / m;
const int Wm = Wp / m;
norm_type norm_t = NORM_TYPE_NORMAL;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp = build_inp_raw();
// the conv3d patch embedding is implemented as a linear projection (matmul) over flattened patches
// reshape and permute to get patches: from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
{
inp = ggml_reshape_4d(
ctx0, inp,
Wm * m * patch_size, m * patch_size, Hm, 3);
inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
inp = ggml_cont_4d(
ctx0, inp,
m * patch_size * 3, Wm, m * patch_size, Hm);
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_4d(
ctx0, inp,
m * patch_size * 3, patch_size, m, Hm * Wm);
inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
inp = ggml_cont_4d(
ctx0, inp,
patch_size, 3, patch_size, Hm * Wm * m * m);
inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
3*patch_size* patch_size, Hm * Wm * m * m, 1);
}
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
if (model.patch_bias) {
inp = ggml_add(ctx0, inp, model.patch_bias);
}
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
ggml_tensor * inpL = inp;
ggml_tensor * window_mask = nullptr;
ggml_tensor * window_idx = nullptr;
ggml_tensor * inv_window_idx = nullptr;
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// pre-layernorm
if (model.pre_ln_w) {
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
}
if (use_window_attn) {
inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
ggml_set_name(inv_window_idx, "inv_window_idx");
ggml_set_input(inv_window_idx);
// mask for window attention
window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
ggml_set_name(window_mask, "window_mask");
ggml_set_input(window_mask);
// if flash attn is used, we need to pad the mask and cast to f16
if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
}
// inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
GGML_ASSERT(batch_size == 1);
inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
}
// loop over layers
for (int il = 0; il < n_layer; il++) {
const auto & layer = model.layers[il];
const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
// self-attention
{
ggml_tensor * Qcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
ggml_tensor * Kcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
ggml_tensor * Vcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
Qcur = ggml_rope_multi(
ctx0, Qcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
Kcur = ggml_rope_multi(
ctx0, Kcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
inpL = cur; // inpL = residual, cur = hidden_states
// layernorm2
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
// ffn
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
nullptr, nullptr,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
// residual 2
cur = ggml_add(ctx0, inpL, cur);
inpL = cur;
}
ggml_tensor * embeddings = inpL;
if (use_window_attn) {
const int spatial_merge_unit = 4;
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
ggml_set_name(window_idx, "window_idx");
ggml_set_input(window_idx);
GGML_ASSERT(batch_size == 1);
embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
cb(embeddings, "window_order_restored", -1);
}
// post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
if (model.post_ln_w) {
embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
}
// Now apply merger (VLPatchMerger):
// 1. Apply RMS norm (ln_q in VLPatchMerger)
embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
cb(embeddings, "merger_normed", -1);
// 2. First reshape for spatial merge (merge 2x2 patches)
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
cb(embeddings, "merger_reshaped", -1);
embeddings = build_ffn(embeddings,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
FFN_GELU,
-1);
ggml_build_forward_expand(gf, embeddings);
return gf;
}
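window_idx and inv_window_idx are graph inputs here, so their construction happens on the host side; the expected relationship is that one is the inverse permutation of the other, so gathering rows by inv_window_idx before the layers and by window_idx afterwards restores the original patch order. A small illustrative helper (not part of the patch):
#include <cstdint>
#include <vector>
// Given a permutation idx (output row i is taken from source row idx[i], as in ggml_get_rows),
// return the inverse permutation.
static std::vector<int32_t> invert_permutation(const std::vector<int32_t> & idx) {
    std::vector<int32_t> inv(idx.size());
    for (int32_t i = 0; i < (int32_t) idx.size(); ++i) {
        inv[idx[i]] = i;
    }
    return inv;
}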

View File

@@ -9,250 +9,207 @@
#include <fstream>
#include <algorithm>
// some of the code here is copied from whisper.cpp
// most of the code here is copied from whisper.cpp
constexpr bool DEBUG = false;
void mtmd_audio_cache::fill_sin_cos_table(int n) {
sin_vals.resize(n);
cos_vals.resize(n);
for (int i = 0; i < n; i++) {
double theta = (2 * M_PI * i) / n;
sin_vals[i] = sinf(theta);
cos_vals[i] = cosf(theta);
}
}
struct mtmd_audio_mel_filters {
int32_t n_mel;
int32_t n_fft;
void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
hann_window.resize(length);
int offset = -1;
if (periodic) {
offset = 0;
}
for (int i = 0; i < length; i++) {
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
}
}
std::vector<float> data;
};
void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel,
int n_fft,
int sample_rate,
float fmin,
float fmax,
bool slaney_area_norm,
float scale) {
GGML_ASSERT(n_mel > 0 && n_fft > 1);
if (fmax <= 0.0f) {
fmax = 0.5f * sample_rate;
}
// note: this global cache is shared among all preprocessors
// if we want to use multiple preprocessors at the same time,
// we will need to enclose it in the preprocessor class in the future
static struct mtmd_audio_global_cache {
// precomputed sin/cos table for FFT
std::vector<float> sin_vals;
std::vector<float> cos_vals;
// Slaney scale (matches librosa default)
const double min_log_hz = 1000.0;
const double lin_slope = 3 / 200.;
const double min_log_mel = min_log_hz * lin_slope;
const double log_step = log(6.4) / 27.0;
auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
};
auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
};
// hann window
std::vector<float> hann_window;
// infer N_fft from n_fft_bins
const double bin_hz_step = double(sample_rate) / double(n_fft);
// mel filter bank
mtmd_audio_mel_filters filters;
// mel grid: n_mel + 2 edges
const double m_lo = hz_to_mel(fmin);
const double m_hi = hz_to_mel(fmax);
std::vector<double> mel_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
}
// convert to Hz
std::vector<double> hz_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
hz_pts[i] = mel_to_hz(mel_pts[i]);
}
const int n_fft_bins = n_fft / 2 + 1;
// filterbank
std::vector<float> out(n_mel * n_fft_bins, 0);
for (int m = 0; m < n_mel; ++m) {
const double f_left = hz_pts[m];
const double f_center = hz_pts[m + 1];
const double f_right = hz_pts[m + 2];
const double denom_l = std::max(1e-30, f_center - f_left);
const double denom_r = std::max(1e-30, f_right - f_center);
const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
for (int k = 0; k < n_fft_bins; ++k) {
const double f = k * bin_hz_step;
double w = 0.0;
if (f >= f_left && f <= f_center) {
w = (f - f_left) / denom_l;
} else if (f > f_center && f <= f_right) {
w = (f_right - f) / denom_r;
}
out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
void fill_sin_cos_table(int n) {
sin_vals.resize(n);
cos_vals.resize(n);
for (int i = 0; i < n; i++) {
double theta = (2 * M_PI * i) / n;
sin_vals[i] = sinf(theta);
cos_vals[i] = cosf(theta);
}
}
filters.n_mel = n_mel;
filters.n_fft = n_fft;
filters.data = std::move(out);
void fill_hann_window(int length, bool periodic) {
hann_window.resize(length);
int offset = -1;
if (periodic) {
offset = 0;
}
for (int i = 0; i < length; i++) {
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
}
}
if (DEBUG) { // debug
for (size_t i = 0; i < filters.data.size(); ++i) {
if (filters.data[i] != 0.0f) {
printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
void fill_mel_filterbank_matrix(
int n_mel,
int n_fft,
int sample_rate, // e.g. 16000
float fmin = 0.0f, // e.g. 0.0
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
bool slaney_area_norm = true,
float scale = 1.0f // optional extra scaling (e.g. 1.0f/1000.0f)
) {
GGML_ASSERT(n_mel > 0 && n_fft > 1);
if (fmax <= 0.0f) {
fmax = 0.5f * sample_rate;
}
// Slaney scale (matches librosa default)
const double min_log_hz = 1000.0;
const double lin_slope = 3 / 200.;
const double min_log_mel = min_log_hz * lin_slope;
const double log_step = log(6.4) / 27.0;
auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
};
auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
};
// infer N_fft from n_fft_bins
const double bin_hz_step = double(sample_rate) / double(n_fft);
// mel grid: n_mel + 2 edges
const double m_lo = hz_to_mel(fmin);
const double m_hi = hz_to_mel(fmax);
std::vector<double> mel_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
}
// convert to Hz
std::vector<double> hz_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
hz_pts[i] = mel_to_hz(mel_pts[i]);
}
const int n_fft_bins = n_fft / 2 + 1;
// filterbank
std::vector<float> out(n_mel * n_fft_bins, 0);
for (int m = 0; m < n_mel; ++m) {
const double f_left = hz_pts[m];
const double f_center = hz_pts[m + 1];
const double f_right = hz_pts[m + 2];
const double denom_l = std::max(1e-30, f_center - f_left);
const double denom_r = std::max(1e-30, f_right - f_center);
const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
for (int k = 0; k < n_fft_bins; ++k) {
const double f = k * bin_hz_step;
double w = 0.0;
if (f >= f_left && f <= f_center) {
w = (f - f_left) / denom_l;
} else if (f > f_center && f <= f_right) {
w = (f_right - f) / denom_r;
}
out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
}
}
filters.n_mel = n_mel;
filters.n_fft = n_fft;
filters.data = std::move(out);
if (DEBUG) { // debug
for (size_t i = 0; i < filters.data.size(); ++i) {
if (filters.data[i] != 0.0f) {
printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
}
}
}
}
}
} g_cache;
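A minimal initialization sketch with typical Whisper-style values; the concrete numbers are illustrative only, the real values come from hparams in initialize() below:
// 80 mel bins, 400-sample FFT window, 16 kHz audio -> a filterbank of 80 x (400/2 + 1) = 80 x 201 weights
g_cache.fill_sin_cos_table(400);
g_cache.fill_hann_window(400, /*periodic=*/true);
g_cache.fill_mel_filterbank_matrix(/*n_mel=*/80, /*n_fft=*/400, /*sample_rate=*/16000);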
// Unified DFT implementation for both forward and inverse transforms
// Template parameters:
// Inverse: false = DFT with exp(-2πi·k·n/N), no scaling
// true = IDFT with exp(+2πi·k·n/N), scales by 1/N
// RealInput: true = input is real-valued (stride 1), avoids imaginary computations
// false = input is complex-valued (interleaved real/imag, stride 2)
template <bool Inverse, bool RealInput>
static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) {
const int n_sin_cos_vals = cache.sin_vals.size();
const int sin_cos_step = n_sin_cos_vals / N;
constexpr float sign = Inverse ? 1.0f : -1.0f;
const float scale = Inverse ? (1.0f / N) : 1.0f;
// naive Discrete Fourier Transform
// input is real-valued
// output is complex-valued
static void dft(const float * in, int N, float * out) {
const int n_sin_cos_vals = g_cache.sin_vals.size();
const int sin_cos_step = n_sin_cos_vals / N;
for (int k = 0; k < N; k++) {
float re = 0;
float im = 0;
for (int n = 0; n < N; n++) {
int idx = (k * n * sin_cos_step) % n_sin_cos_vals;
float cos_val = cache.cos_vals[idx];
float sin_val = cache.sin_vals[idx];
if constexpr (RealInput) {
// Real input: in_im = 0, simplifies to:
// re += in_re * cos_val
// im += sign * in_re * sin_val
float in_re = in[n];
re += in_re * cos_val;
im += sign * in_re * sin_val;
} else {
float in_re = in[n * 2 + 0];
float in_im = in[n * 2 + 1];
// (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i
re += in_re * cos_val - sign * in_im * sin_val;
im += sign * in_re * sin_val + in_im * cos_val;
}
int idx = (k * n * sin_cos_step) % (n_sin_cos_vals); // t = 2*M_PI*k*n/N
re += in[n] * g_cache.cos_vals[idx]; // cos(t)
im -= in[n] * g_cache.sin_vals[idx]; // sin(t)
}
out[k * 2 + 0] = re * scale;
out[k * 2 + 1] = im * scale;
out[k*2 + 0] = re;
out[k*2 + 1] = im;
}
}
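For reference, an untabulated version of the forward, real-input transform that dft_impl<false, true> (and the older dft above) computes; this is a sketch for clarity only, not a replacement for the cached implementation:
#include <cmath>
#include <vector>
// Naive forward DFT of a real signal: out holds N interleaved (re, im) pairs.
static std::vector<float> dft_reference(const std::vector<float> & in) {
    const int N = (int) in.size();
    std::vector<float> out(2 * N, 0.0f);
    for (int k = 0; k < N; k++) {
        double re = 0.0, im = 0.0;
        for (int n = 0; n < N; n++) {
            const double t = 2.0 * M_PI * k * n / N;   // exp(-i*t)
            re += in[n] * cos(t);
            im -= in[n] * sin(t);
        }
        out[2 * k + 0] = (float) re;
        out[2 * k + 1] = (float) im;
    }
    return out;
}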
// Cooley-Tukey FFT/IFFT unified implementation
// Template parameters:
// Inverse: false = FFT with exp(-2πi·k/N), no scaling
// true = IFFT with exp(+2πi·k/N), scales by 0.5 at each level
// RealInput: true = input is real-valued (stride 1)
// false = input is complex-valued (interleaved real/imag, stride 2)
template <bool Inverse, bool RealInput>
static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
const int n_sin_cos_vals = cache.sin_vals.size();
// Cooley-Tukey FFT
// poor man's implementation - use something better
// input is real-valued
// output is complex-valued
static void fft(float * in, int N, float * out) {
const int n_sin_cos_vals = g_cache.sin_vals.size();
if (N == 1) {
out[0] = in[0];
if constexpr (RealInput) {
out[1] = 0.0f;
} else {
out[1] = in[1];
}
out[1] = 0;
return;
}
const int half_N = N / 2;
if (N - half_N * 2 == 1) {
// Odd N: fall back to DFT
dft_impl<Inverse, RealInput>(cache, in, N, out);
if (N - half_N*2 == 1) {
dft(in, N, out);
return;
}
// Split into even and odd
if constexpr (RealInput) {
// Real input: stride is 1, copy only real values
float * even = in + N;
for (int i = 0; i < half_N; ++i) {
even[i] = in[2 * i];
}
float * even_fft = out + 2 * N;
fft_impl<Inverse, true>(cache, even, half_N, even_fft);
float * odd = even;
for (int i = 0; i < half_N; ++i) {
odd[i] = in[2 * i + 1];
}
float * odd_fft = even_fft + N;
fft_impl<Inverse, true>(cache, odd, half_N, odd_fft);
} else {
// Complex input: stride is 2, copy complex pairs
float * even = in + N * 2;
for (int i = 0; i < half_N; ++i) {
even[i * 2 + 0] = in[2 * i * 2 + 0];
even[i * 2 + 1] = in[2 * i * 2 + 1];
}
float * even_fft = out + 2 * N;
fft_impl<Inverse, false>(cache, even, half_N, even_fft);
float * odd = even;
for (int i = 0; i < half_N; ++i) {
odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0];
odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1];
}
float * odd_fft = even_fft + N;
fft_impl<Inverse, false>(cache, odd, half_N, odd_fft);
float* even = in + N;
for (int i = 0; i < half_N; ++i) {
even[i]= in[2*i];
}
float* even_fft = out + 2 * N;
fft(even, half_N, even_fft);
float * even_fft = out + 2 * N;
float * odd_fft = even_fft + N;
float* odd = even;
for (int i = 0; i < half_N; ++i) {
odd[i] = in[2*i + 1];
}
float* odd_fft = even_fft + N;
fft(odd, half_N, odd_fft);
const int sin_cos_step = n_sin_cos_vals / N;
constexpr float sign = Inverse ? 1.0f : -1.0f;
constexpr float scale = Inverse ? 0.5f : 1.0f;
for (int k = 0; k < half_N; k++) {
int idx = k * sin_cos_step; // t = 2*M_PI*k/N
float re = cache.cos_vals[idx];
float im = sign * cache.sin_vals[idx];
int idx = k * sin_cos_step; // t = 2*M_PI*k/N
float re = g_cache.cos_vals[idx]; // cos(t)
float im = -g_cache.sin_vals[idx]; // sin(t)
float re_odd = odd_fft[2 * k + 0];
float im_odd = odd_fft[2 * k + 1];
float re_odd = odd_fft[2*k + 0];
float im_odd = odd_fft[2*k + 1];
out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd);
out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd);
out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;
out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd);
out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd);
out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
}
}
// Forward FFT for real input (used by mel spectrogram)
static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
fft_impl<false, true>(cache, in, N, out);
}
// Inverse FFT for complex input
static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
fft_impl<true, false>(cache, in, N, out);
}
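Note on scaling: the forward path applies no normalization, while the inverse path scales by 0.5 at each recursion level (and by 1/N in the odd-length DFT fallback), so for a power-of-two frame size ifft(fft(x)) reproduces the input up to floating-point error.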
struct filter_params {
int32_t n_mel;
int32_t n_fft_bins;
@@ -265,27 +222,20 @@ struct filter_params {
bool norm_per_feature = false;
};
static void log_mel_spectrogram_worker_thread(int ith,
const float * hann,
const std::vector<float> & samples,
int n_samples,
int frame_size,
int frame_step,
int n_threads,
const filter_params & params,
const mtmd_audio_cache & cache,
mtmd_audio_mel & out) {
static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
int n_samples, int frame_size, int frame_step, int n_threads,
const filter_params & params, mtmd_audio_mel & out) {
std::vector<float> fft_in(frame_size * 2, 0.0);
std::vector<float> fft_out(frame_size * 2 * 2 * 2);
int n_fft_bins = params.n_fft_bins;
int i = ith;
const auto & filters = cache.filters;
const auto & filters = g_cache.filters;
// make sure n_fft_bins == 1 + (frame_size / 2), i.e. bin_0 .. bin_nyquist
GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size());
GGML_ASSERT(g_cache.sin_vals.size() == g_cache.cos_vals.size());
// calculate FFT only when fft_in are not all zero
for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
const int offset = i * frame_step;
@@ -301,7 +251,7 @@ static void log_mel_spectrogram_worker_thread(int ith,
}
// FFT
fft(cache, fft_in.data(), frame_size, fft_out.data());
fft(fft_in.data(), frame_size, fft_out.data());
// Calculate modulus^2 of complex numbers
// Using pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) instead causes inference quality problems? Interesting.
@@ -348,7 +298,6 @@ static bool log_mel_spectrogram(
const int n_samples_in,
const int n_threads,
const filter_params & params,
const mtmd_audio_cache & cache,
mtmd_audio_mel & out) {
//const int64_t t_start_us = ggml_time_us();
@@ -356,9 +305,9 @@ static bool log_mel_spectrogram(
int n_samples = n_samples_in;
// Hann window
const float * hann = cache.hann_window.data();
const int frame_size = (params.n_fft_bins - 1) * 2;
const int frame_step = params.hop_length;
const float * hann = g_cache.hann_window.data();
const int frame_size = (params.n_fft_bins - 1) * 2;
const int frame_step = params.hop_length;
// Padding
std::vector<float> samples_padded;
@@ -386,9 +335,9 @@ static bool log_mel_spectrogram(
// preemphasis
if (params.preemph) {
const int pad_amount = frame_size / 2;
const int pad_amount = frame_size / 2;
const float preemph = 0.97f;
float prev = samples_padded[pad_amount];
float prev = samples_padded[pad_amount];
for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) {
float cur = samples_padded[i];
samples_padded[i] = cur - preemph * prev;
@@ -423,14 +372,14 @@ static bool log_mel_spectrogram(
{
std::vector<std::thread> workers(n_threads - 1);
for (int iw = 0; iw < n_threads - 1; ++iw) {
workers[iw] =
std::thread(log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), n_samples,
frame_size, frame_step, n_threads, std::cref(params), std::cref(cache), std::ref(out));
workers[iw] = std::thread(
log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded),
n_samples, frame_size, frame_step, n_threads,
std::cref(params), std::ref(out));
}
// main thread
log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params,
cache, out);
log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params, out);
for (int iw = 0; iw < n_threads - 1; ++iw) {
workers[iw].join();
}
@@ -455,7 +404,7 @@ static bool log_mel_spectrogram(
for (int j = 0; j < effective_n_len; ++j) {
auto &value = out.data[i * out.n_len + j];
value = (value - mean) / mstd;
value = (value - mean) / mstd;
}
// pad the rest with zeros
@@ -501,14 +450,18 @@ static bool log_mel_spectrogram(
//
void mtmd_audio_preprocessor_whisper::initialize() {
cache.fill_sin_cos_table(hparams.audio_n_fft);
cache.fill_hann_window(hparams.audio_window_len, true);
cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
g_cache.fill_sin_cos_table(hparams.audio_n_fft);
g_cache.fill_hann_window(hparams.audio_window_len, true);
g_cache.fill_mel_filterbank_matrix(
hparams.n_mel_bins,
hparams.audio_n_fft,
hparams.audio_sample_rate);
}
bool mtmd_audio_preprocessor_whisper::preprocess(const float * samples,
size_t n_samples,
std::vector<mtmd_audio_mel> & output) {
bool mtmd_audio_preprocessor_whisper::preprocess(
const float * samples,
size_t n_samples,
std::vector<mtmd_audio_mel> & output) {
if (n_samples == 0) {
// empty audio
return false;
@@ -518,7 +471,7 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
// if input is too short, pad with zeros
// this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram
// TODO: maybe handle this better
size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
size_t min_samples = (size_t)hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
if (n_samples < min_samples) {
smpl.resize(min_samples, 0.0f);
std::memcpy(smpl.data(), samples, n_samples * sizeof(float));
@@ -533,19 +486,22 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
params.hop_length = hparams.audio_hop_len;
params.sample_rate = hparams.audio_sample_rate;
params.center_padding = false;
params.preemph = 0.0f; // disabled
params.preemph = 0.0f; // disabled
params.use_natural_log = false;
params.norm_per_feature = false;
// make sure the cache is initialized
GGML_ASSERT(!cache.sin_vals.empty());
GGML_ASSERT(!cache.cos_vals.empty());
GGML_ASSERT(!cache.filters.data.empty());
// make sure the global cache is initialized
GGML_ASSERT(!g_cache.sin_vals.empty());
GGML_ASSERT(!g_cache.cos_vals.empty());
GGML_ASSERT(!g_cache.filters.data.empty());
mtmd_audio_mel out_full;
bool ok = log_mel_spectrogram(samples, n_samples,
4, // n_threads
params, cache, out_full);
bool ok = log_mel_spectrogram(
samples,
n_samples,
4, // n_threads
params,
out_full);
if (!ok) {
return false;
}
@@ -556,21 +512,21 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
}
const size_t frames_per_chunk = 3000;
GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk);
for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
if ((size_t) n_len < frames_per_chunk) {
break; // the last incomplete chunk will always be a padded chunk, safe to ignore
GGML_ASSERT((size_t)out_full.n_len > frames_per_chunk);
for (size_t off = 0; off < (size_t)out_full.n_len; off += frames_per_chunk) {
int n_len = std::min(frames_per_chunk, (size_t)out_full.n_len - off);
if ((size_t)n_len < frames_per_chunk) {
break; // the last incomplete chunk will always be a padded chunk, safe to ignore
}
mtmd_audio_mel out_chunk;
out_chunk.n_len = n_len;
out_chunk.n_mel = out_full.n_mel;
out_chunk.n_len_org = out_full.n_mel; // unused
out_chunk.n_len_org = out_full.n_mel; // unused
out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
for (int i = 0; i < out_full.n_mel; i++) {
auto src = out_full.data.begin() + i * out_full.n_len + off;
auto src = out_full.data.begin() + i*out_full.n_len + off;
out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
}
@@ -579,152 +535,3 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
return true;
}
//
// mtmd_audio_preprocessor_conformer
//
void mtmd_audio_preprocessor_conformer::initialize() {
cache.fill_sin_cos_table(hparams.audio_n_fft);
cache.fill_hann_window(hparams.audio_window_len, true);
cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
}
bool mtmd_audio_preprocessor_conformer::preprocess(const float * samples,
size_t n_samples,
std::vector<mtmd_audio_mel> & output) {
// empty audio
if (n_samples == 0) {
return false;
}
filter_params params;
params.n_mel = hparams.n_mel_bins;
params.n_fft_bins = 1 + (hparams.audio_n_fft / 2);
params.hann_window_size = hparams.audio_window_len;
params.hop_length = hparams.audio_hop_len;
params.sample_rate = hparams.audio_sample_rate;
params.center_padding = true;
params.preemph = 0.97f;
params.use_natural_log = true;
params.norm_per_feature = true;
// make sure the cache is initialized
GGML_ASSERT(!cache.sin_vals.empty());
GGML_ASSERT(!cache.cos_vals.empty());
GGML_ASSERT(!cache.filters.data.empty());
mtmd_audio_mel out_full;
bool ok = log_mel_spectrogram(samples, n_samples,
4, // n_threads
params, cache, out_full);
if (!ok) {
return false;
}
output.push_back(std::move(out_full));
return true;
}
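Compared with the whisper preprocessor above, the conformer path enables center padding, a 0.97 preemphasis filter (y[n] = x[n] - 0.97 * x[n-1] inside log_mel_spectrogram), natural-log mel energies and per-feature normalization, and it emits the full spectrogram as a single chunk instead of splitting it into 3000-frame windows.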
//
// mtmd_audio_streaming_istft implementation
//
mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length) :
n_fft(n_fft),
hop_length(hop_length),
n_fft_bins(n_fft / 2 + 1),
overlap_buffer(n_fft, 0.0f),
window_sum_buffer(n_fft, 0.0f),
padding_to_remove((n_fft - hop_length) / 2),
ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT
ifft_out(n_fft * 2 * 4, 0.0f) {
cache.fill_sin_cos_table(n_fft);
cache.fill_hann_window(n_fft, true);
}
void mtmd_audio_streaming_istft::reset() {
std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f);
std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f);
padding_to_remove = (n_fft - hop_length) / 2;
}
std::vector<float> mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) {
std::vector<float> output(hop_length);
// copy frequencies
for (int j = 0; j < n_fft_bins; j++) {
ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0];
ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1];
}
// mirror negative frequencies
for (int j = 1; j < n_fft_bins - 1; j++) {
int mirror_idx = n_fft - j;
ifft_in[mirror_idx * 2 + 0] = ifft_in[j * 2 + 0];
ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1]; // conjugate
}
ifft(cache, ifft_in.data(), n_fft, ifft_out.data());
// update window sum and overlap buffer
for (int j = 0; j < n_fft; j++) {
window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j];
overlap_buffer[j] += ifft_out[j * 2] * cache.hann_window[j];
}
// extract hop_length samples with normalization
for (int i = 0; i < hop_length; i++) {
if (window_sum_buffer[i] > 1e-8f) {
output[i] = overlap_buffer[i] / window_sum_buffer[i];
} else {
output[i] = overlap_buffer[i];
}
}
// shift buffers left by hop_length
std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin());
std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f);
std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin());
std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f);
// Remove padding if needed
int to_remove = std::min(padding_to_remove, (int) output.size());
padding_to_remove -= to_remove;
output.erase(output.begin(), output.begin() + to_remove);
return output;
}
std::vector<float> mtmd_audio_streaming_istft::flush() {
std::vector<float> output;
// Extract remaining samples from overlap buffer
// Continue until we've extracted all meaningful samples
int remaining = n_fft - hop_length;
while (remaining > 0) {
int chunk_size = std::min(remaining, hop_length);
for (int i = 0; i < chunk_size; i++) {
float sample;
if (window_sum_buffer[i] > 1e-8f) {
sample = overlap_buffer[i] / window_sum_buffer[i];
} else {
sample = overlap_buffer[i];
}
output.push_back(sample);
}
// Shift buffers
std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin());
std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f);
std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin());
std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f);
remaining -= chunk_size;
}
return output;
}
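A minimal usage sketch of the streaming ISTFT declared above; the constructor values and the frame source (get_spectrogram_frames) are illustrative placeholders, not part of the patch:
// each frame holds (n_fft/2 + 1) interleaved (re, im) pairs, as expected by process_frame()
mtmd_audio_streaming_istft istft(/*n_fft=*/1024, /*hop_length=*/256);
std::vector<std::vector<float>> frames = get_spectrogram_frames(); // hypothetical source
std::vector<float> pcm;
for (const auto & frame : frames) {
    std::vector<float> chunk = istft.process_frame(frame.data());
    pcm.insert(pcm.end(), chunk.begin(), chunk.end());
}
std::vector<float> tail = istft.flush();
pcm.insert(pcm.end(), tail.begin(), tail.end());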

View File

@@ -17,38 +17,6 @@ struct mtmd_audio_mel {
std::vector<float> data;
};
struct mtmd_audio_mel_filters {
int32_t n_mel;
int32_t n_fft;
std::vector<float> data;
};
// cache for audio processing, each processor instance owns its own cache
struct mtmd_audio_cache {
std::vector<float> sin_vals;
std::vector<float> cos_vals;
std::vector<float> hann_window;
mtmd_audio_mel_filters filters;
void fill_sin_cos_table(int n);
void fill_hann_window(int length, bool periodic);
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
void fill_mel_filterbank_matrix(int n_mel,
int n_fft,
int sample_rate, // e.g. 16000
float fmin = 0.0f, // e.g. 0.0
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
bool slaney_area_norm = true,
float scale = 1.0f // optional extra scaling
);
};
struct mtmd_audio_preprocessor {
const clip_hparams & hparams;
@@ -63,51 +31,4 @@ struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
private:
mtmd_audio_cache cache;
};
struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
private:
mtmd_audio_cache cache;
};
//
// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
//
struct mtmd_audio_streaming_istft {
mtmd_audio_streaming_istft(int n_fft, int hop_length);
// reset streaming state
void reset();
// process a single STFT frame (streaming)
// frame_spectrum: [n_fft_bins x 2] interleaved real/imag
// returns: up to hop_length samples
std::vector<float> process_frame(const float * frame_spectrum);
// flush remaining samples at end of stream
std::vector<float> flush();
private:
int n_fft;
int hop_length;
int n_fft_bins;
// Own cache for output processing
mtmd_audio_cache cache;
// Streaming state
std::vector<float> overlap_buffer;
std::vector<float> window_sum_buffer;
int padding_to_remove;
// Working buffers for IFFT
std::vector<float> ifft_in;
std::vector<float> ifft_out;
};

View File

@@ -276,7 +276,7 @@ struct mtmd_context {
}
// set boi/eoi
if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
if (proj == PROJECTOR_TYPE_GEMMA3) {
// <start_of_image> ... (image embeddings) ... <end_of_image>
img_beg = "<start_of_image>";
img_end = "<end_of_image>";
@@ -293,7 +293,7 @@ struct mtmd_context {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
img_end = "[IMG_END]";
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
img_beg = "<|vision_start|>";
img_end = "<|vision_end|>";
@@ -339,13 +339,8 @@ struct mtmd_context {
case PROJECTOR_TYPE_QWEN25O:
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
break;
case PROJECTOR_TYPE_LFM2A:
audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
break;
default:
GGML_ABORT("unsupported audio projector type");
}
@@ -363,9 +358,6 @@ struct mtmd_context {
// [BEGIN_AUDIO] ... (embeddings) ...
aud_beg = "[BEGIN_AUDIO]";
} else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
// <sound> ... (embeddings) ...
aud_beg = "<sound>";
}
}
@@ -872,15 +864,10 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
}
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
switch (ctx->proj_type_v()) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_YOUTUVL:
return true;
default:
return false;
if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
return true;
}
return false;
}
bool mtmd_decode_use_mrope(mtmd_context * ctx) {

View File

@@ -27,9 +27,6 @@
* - Make sure the C API is aligned with the libllama C API (as in llama.h)
* - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
* - Keep the API minimal, do not expose internal details unless necessary
*
* IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
* We encourage human contributors to ensure the quality and reliability of the codebase.
*/
#ifdef LLAMA_SHARED

View File

@@ -23,7 +23,7 @@ problem.
8 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 1b59924b8..a8a61b1e2 100644
index 8547ecc84..9f37ca70c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -112,7 +112,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -64,7 +64,7 @@ index 1b59924b8..a8a61b1e2 100644
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index d7a93848d..51bf8bc55 100644
index da624c587..efc63e092 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -831,6 +831,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
@@ -84,7 +84,7 @@ index d7a93848d..51bf8bc55 100644
/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index f021de1d7..9f3cb06ad 100644
index ab0f6fe9c..6519af435 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -583,6 +583,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -112,7 +112,7 @@ index f021de1d7..9f3cb06ad 100644
static void * ggml_cuda_host_malloc(size_t size) {
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 56b59f0af..790cabca0 100644
index 70bf6f3d9..f2b7fe692 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
@@ -132,10 +132,10 @@ index 56b59f0af..790cabca0 100644
static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 608ba1e55..d010ca00e 100644
index 0d37587f6..ff373d413 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -3499,6 +3499,7 @@ struct ggml_backend_opencl_buffer_context {
@@ -3417,6 +3417,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
@@ -144,10 +144,10 @@ index 608ba1e55..d010ca00e 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index d7c8ad8c1..281fa1bdb 100644
index 18a45d2d9..89041805e 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -557,6 +557,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -556,6 +556,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
RPC_STATUS_ASSERT(status);
delete ctx;
@@ -156,7 +156,7 @@ index d7c8ad8c1..281fa1bdb 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 8f8176b67..5f8f3c210 100644
index e996d98be..84b679315 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -356,6 +356,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -184,10 +184,10 @@ index 8f8176b67..5f8f3c210 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index b1a51a436..885864111 100644
index 34ec09d40..120191ca0 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -12751,6 +12751,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -12365,6 +12365,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
@@ -195,7 +195,7 @@ index b1a51a436..885864111 100644
}
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -12894,6 +12895,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -12508,6 +12509,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);

View File

@@ -10,10 +10,10 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a20c6525e..09874b67a 100644
index 7b01a2edf..63250cdf1 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1832,16 +1832,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
@@ -31,8 +31,8 @@ index a20c6525e..09874b67a 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -2032,7 +2023,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
@@ -2015,7 +2006,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));

View File

@@ -10,7 +10,7 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 97c83de5f..26710548e 100644
index 35e3aef0a..84a3796b5 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -24,6 +24,19 @@
@@ -33,7 +33,7 @@ index 97c83de5f..26710548e 100644
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS
@@ -1842,7 +1855,29 @@ struct clip_model_loader {
@@ -1619,7 +1632,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
@@ -63,7 +63,7 @@ index 97c83de5f..26710548e 100644
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
@@ -1869,7 +1904,11 @@ struct clip_model_loader {
@@ -1646,7 +1681,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}

View File

@@ -19,10 +19,10 @@ adds support for the Solar Pro architecture
create mode 100644 src/models/solar.cpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b0932794d..12e14f55c 100644
index 4192af7c0..bd44d73e7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -129,6 +129,7 @@ add_library(llama
@@ -125,6 +125,7 @@ add_library(llama
models/seed-oss.cpp
models/smallthinker.cpp
models/smollm3.cpp
@@ -31,10 +31,10 @@ index b0932794d..12e14f55c 100644
models/starcoder.cpp
models/starcoder2.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 2ead96546..36855e408 100644
index 8caf80afc..2ce8ffec0 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -89,6 +89,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -87,6 +87,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -42,7 +42,7 @@ index 2ead96546..36855e408 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -215,6 +216,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -208,6 +209,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
@@ -50,7 +50,7 @@ index 2ead96546..36855e408 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -347,6 +349,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
@@ -339,6 +341,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
@@ -58,9 +58,9 @@ index 2ead96546..36855e408 100644
{ LLM_TENSOR_POS_EMBD, "position_embd" },
{ LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
@@ -2254,6 +2257,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
@@ -2176,6 +2179,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
return {
LLM_TENSOR_TOKEN_EMBD,
};
+ case LLM_ARCH_SOLAR:
+ return {
@@ -81,7 +81,7 @@ index 2ead96546..36855e408 100644
default:
GGML_ABORT("unknown architecture for tensor mapping");
}
@@ -2422,6 +2441,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -2344,6 +2363,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -90,10 +90,10 @@ index 2ead96546..36855e408 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 68ec6a18b..7fab2fa93 100644
index 6cbf9b1f8..14d461c76 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -93,6 +93,7 @@ enum llm_arch {
@@ -91,6 +91,7 @@ enum llm_arch {
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_GRANITE_HYBRID,
LLM_ARCH_CHAMELEON,
@@ -101,7 +101,7 @@ index 68ec6a18b..7fab2fa93 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
@@ -219,6 +220,7 @@ enum llm_kv {
@@ -212,6 +213,7 @@ enum llm_kv {
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
@@ -109,7 +109,7 @@ index 68ec6a18b..7fab2fa93 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -473,6 +475,7 @@ enum llm_tensor {
@@ -465,6 +467,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -118,10 +118,10 @@ index 68ec6a18b..7fab2fa93 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index c847ef91b..700e4bcf5 100644
index fe1fa4341..aabff2f06 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -167,6 +167,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
@@ -163,6 +163,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
}
@@ -137,7 +137,7 @@ index c847ef91b..700e4bcf5 100644
if (il < n_layer) {
return swa_layers[il];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 7ae3ec292..cc2ce7f15 100644
index f6e95b5d2..c6e673276 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -65,6 +65,8 @@ struct llama_hparams {
@@ -149,7 +149,7 @@ index 7ae3ec292..cc2ce7f15 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -266,6 +268,9 @@ struct llama_hparams {
@@ -259,6 +261,9 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const;
@@ -160,10 +160,10 @@ index 7ae3ec292..cc2ce7f15 100644
bool has_kv(uint32_t il) const;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index e66febaa0..cf200c61f 100644
index ca2ea2461..8916a6242 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -489,7 +489,7 @@ namespace GGUFMeta {
@@ -466,7 +466,7 @@ namespace GGUFMeta {
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
@@ -173,10 +173,10 @@ index e66febaa0..cf200c61f 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 5de6493b9..d3c60d418 100644
index ae8207ee1..00cd579e0 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2074,6 +2074,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1995,6 +1995,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -198,7 +198,7 @@ index 5de6493b9..d3c60d418 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5626,6 +5641,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -5429,6 +5444,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -233,7 +233,7 @@ index 5de6493b9..d3c60d418 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -7838,6 +7881,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
@@ -7534,6 +7577,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_chameleon>(*this, params);
} break;
@@ -244,7 +244,7 @@ index 5de6493b9..d3c60d418 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -8116,6 +8163,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -7798,6 +7845,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON:
@@ -253,10 +253,10 @@ index 5de6493b9..d3c60d418 100644
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h
index 79200a0d9..740cb7094 100644
index c6eb95318..b378b23ec 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -79,6 +79,7 @@ enum llm_type {
@@ -76,6 +76,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
@@ -264,7 +264,7 @@ index 79200a0d9..740cb7094 100644
LLM_TYPE_26B,
LLM_TYPE_27B,
LLM_TYPE_30B,
@@ -410,6 +411,8 @@ struct llama_layer {
@@ -405,6 +406,8 @@ struct llama_layer {
struct ggml_tensor * ffn_act_beta = nullptr;
struct ggml_tensor * ffn_act_eps = nullptr;
@@ -274,10 +274,10 @@ index 79200a0d9..740cb7094 100644
struct llama_layer_convnext convnext;
diff --git a/src/models/models.h b/src/models/models.h
index 72b2b760c..4e2162c77 100644
index ffb36acc6..6d84a185d 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -533,6 +533,11 @@ struct llm_build_smollm3 : public llm_graph_context {
@@ -515,6 +515,11 @@ struct llm_build_smollm3 : public llm_graph_context {
llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
};

View File

@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 09874b67a..0049d59bf 100644
index 63250cdf1..dd86a1745 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -25,7 +25,7 @@ index 09874b67a..0049d59bf 100644
"\\s+$",
"[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index b47dcbe61..6d1084f26 100644
index bb44edfad..13ced055f 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 6192a8704..993ec027f 100644
index 4c04c3300..f4747f262 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -345,6 +345,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -26,4 +26,4 @@ index 6192a8704..993ec027f 100644
+ add_custom_target(ggml-cpu)
if (GGML_SYSTEM_ARCH STREQUAL "x86")
ggml_add_cpu_backend_variant(x64)
ggml_add_cpu_backend_variant(sse42 SSE42)
ggml_add_cpu_backend_variant(sse42 SSE42)

View File

@@ -5,22 +5,21 @@ Subject: [PATCH] remove amx
disable amx as it reduces performance on some systems
---
ggml/src/CMakeLists.txt | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
ggml/src/CMakeLists.txt | 4 ----
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 993ec027f..cbda1380c 100644
index f4747f262..d55aed348 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -379,10 +379,7 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(zen4 SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16)
endif()
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C FMA AVX2 BMI2 AVX_VNNI)
@@ -365,10 +365,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
- if (NOT MSVC)
- # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif()
+ # AMX variants removed by ollama - sapphirerapids with AMX_TILE AMX_INT8 not included
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
# Many of these features are optional so we build versions with popular

View File

@@ -53,10 +53,10 @@ index b165d8bdc..f91d4faba 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 0049d59bf..fefa6b478 100644
index dd86a1745..d63ce9c84 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1788,9 +1788,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index f7ba1fe31..f700f74db 100644
index a59b51893..53891a91f 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@

View File

@@ -10,10 +10,10 @@ Subject: [PATCH] add ollama vocab for grammar support
3 files changed, 58 insertions(+), 10 deletions(-)
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index 64ea2fd00..d87e52ded 100644
index 75d5d750c..a0299d181 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -1079,6 +1079,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
@@ -1041,6 +1041,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
@@ -21,7 +21,7 @@ index 64ea2fd00..d87e52ded 100644
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index) {
@@ -1134,6 +1135,7 @@ struct llama_grammar * llama_grammar_init_impl(
@@ -1096,6 +1097,7 @@ struct llama_grammar * llama_grammar_init_impl(
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar {
vocab,
@@ -29,7 +29,7 @@ index 64ea2fd00..d87e52ded 100644
std::move(vec_rules),
std::move(stacks),
/* .partial_utf8 = */ {},
@@ -1148,6 +1150,7 @@ struct llama_grammar * llama_grammar_init_impl(
@@ -1110,6 +1112,7 @@ struct llama_grammar * llama_grammar_init_impl(
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
@@ -37,7 +37,7 @@ index 64ea2fd00..d87e52ded 100644
const char * grammar_str,
const char * grammar_root,
bool lazy,
@@ -1240,6 +1243,7 @@ struct llama_grammar * llama_grammar_init_impl(
@@ -1202,6 +1205,7 @@ struct llama_grammar * llama_grammar_init_impl(
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar {
vocab,
@@ -45,7 +45,7 @@ index 64ea2fd00..d87e52ded 100644
std::move(vec_rules),
std::move(stacks),
/* .partial_utf8 = */ {},
@@ -1263,6 +1267,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
@@ -1225,6 +1229,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
auto * result = new llama_grammar {
grammar.vocab,
@@ -53,7 +53,7 @@ index 64ea2fd00..d87e52ded 100644
grammar.rules,
grammar.stacks,
grammar.partial_utf8,
@@ -1291,7 +1296,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
@@ -1253,7 +1258,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
}
void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
@@ -61,7 +61,7 @@ index 64ea2fd00..d87e52ded 100644
if (grammar.awaiting_trigger) {
return;
@@ -1313,9 +1317,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
@@ -1275,9 +1279,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
for (size_t i = 0; i < cur_p->size; ++i) {
const llama_token id = cur_p->data[i].id;
@@ -77,7 +77,7 @@ index 64ea2fd00..d87e52ded 100644
if (!allow_eog) {
cur_p->data[i].logit = -INFINITY;
}
@@ -1334,9 +1342,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
@@ -1296,9 +1304,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
}
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
@@ -90,7 +90,7 @@ index 64ea2fd00..d87e52ded 100644
if (grammar.awaiting_trigger) {
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
@@ -1380,13 +1389,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
@@ -1353,13 +1362,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
}
}
@@ -107,7 +107,7 @@ index 64ea2fd00..d87e52ded 100644
}
llama_grammar_accept_token(grammar, token, piece);
@@ -1462,3 +1472,27 @@ void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token toke
@@ -1435,3 +1445,27 @@ void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token toke
}
}
@@ -136,7 +136,7 @@ index 64ea2fd00..d87e52ded 100644
+ }
+}
diff --git a/src/llama-grammar.h b/src/llama-grammar.h
index b5a0e588e..57847583a 100644
index a4c978ac1..5c0da4049 100644
--- a/src/llama-grammar.h
+++ b/src/llama-grammar.h
@@ -6,8 +6,19 @@
@@ -159,7 +159,7 @@ index b5a0e588e..57847583a 100644
// grammar element type
enum llama_gretype {
@@ -129,6 +140,7 @@ struct llama_grammar {
@@ -127,6 +138,7 @@ struct llama_grammar {
// note: allow null vocab for testing (not great)
const llama_vocab * vocab;
@@ -167,7 +167,7 @@ index b5a0e588e..57847583a 100644
const llama_grammar_rules rules; // TODO: shared ptr
llama_grammar_stacks stacks;
@@ -157,12 +169,14 @@ struct llama_grammar {
@@ -155,12 +167,14 @@ struct llama_grammar {
// note: needed for tests (not great)
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
@@ -183,10 +183,10 @@ index b5a0e588e..57847583a 100644
const char * grammar_root,
bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 11f0394c4..bbc5dc591 100644
index 3f4a729bc..38a30ea05 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -2511,7 +2511,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
@@ -1561,7 +1561,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
}
@@ -195,7 +195,7 @@ index 11f0394c4..bbc5dc591 100644
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
@@ -2593,9 +2593,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
@@ -1639,9 +1639,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
trigger_pattern += ")[\\s\\S]*";
std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() };

View File

@@ -5,11 +5,11 @@ Subject: [PATCH] add argsort and cuda copy for i32
---
ggml/src/ggml-cpu/ops.cpp | 43 ++++++
ggml/src/ggml-cuda/argsort.cu | 120 +++++++++++++--
ggml/src/ggml-cuda/argsort.cu | 122 +++++++++++++--
ggml/src/ggml-cuda/cpy-utils.cuh | 6 +
ggml/src/ggml-cuda/cpy.cu | 40 +++++
ggml/src/ggml-metal/ggml-metal.metal | 215 +++++++++++++++++++++++++++
5 files changed, 413 insertions(+), 11 deletions(-)
5 files changed, 414 insertions(+), 12 deletions(-)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 303278397..7d1733adb 100644
@@ -73,10 +73,10 @@ index 303278397..7d1733adb 100644
{
GGML_ABORT("fatal error");
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index 57c8a99a2..d982d279b 100644
index da9652c3b..b82be371c 100644
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -189,13 +189,107 @@ void argsort_f32_i32_cuda_bitonic(const float * x,
@@ -168,13 +168,107 @@ static void argsort_f32_i32_cuda_bitonic(const float * x,
}
}
@@ -185,27 +185,28 @@ index 57c8a99a2..d982d279b 100644
GGML_ASSERT( dst->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_contiguous(src0));
@@ -204,18 +298,22 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -183,18 +277,22 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+ if (src0->type == GGML_TYPE_I32) {
+ argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
+ } else {
#ifdef GGML_CUDA_USE_CUB
-#ifdef GGML_CUDA_USE_CUB
- const int ncols_pad = next_power_of_2(ncols);
- const size_t shared_mem = ncols_pad * sizeof(int);
- const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+ const int ncols_pad = next_power_of_2(ncols);
+ const size_t shared_mem = ncols_pad * sizeof(int);
+ const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
-
- if (shared_mem > max_shared_mem || ncols > 1024) {
- ggml_cuda_pool & pool = ctx.pool();
- argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
- } else {
+ if (src0->type == GGML_TYPE_I32) {
+ argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
} else {
- argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
- }
+#ifdef GGML_CUDA_USE_CUB
+ const int ncols_pad = next_power_of_2(ncols);
+ const size_t shared_mem = ncols_pad * sizeof(int);
+ const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+
+ if (shared_mem > max_shared_mem || ncols > 1024) {
+ ggml_cuda_pool & pool = ctx.pool();
+ argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
@@ -233,10 +234,10 @@ index 7697c292d..00d773dd3 100644
+ *dst = *src;
+}
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index ee84303ef..178e82d76 100644
index c4ceb4fc5..0e53ecc39 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -369,6 +369,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
@@ -352,6 +352,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
@@ -280,7 +281,7 @@ index ee84303ef..178e82d76 100644
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));
@@ -495,6 +532,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
@@ -481,6 +518,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_scalar_cuda<half, float>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
@@ -291,10 +292,10 @@ index ee84303ef..178e82d76 100644
if (can_be_transposed) {
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true>
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 16d17d26a..ff1afa291 100644
index 51bcbae30..236838e9e 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4955,8 +4955,77 @@ kernel void kernel_argsort_f32_i32(
@@ -4954,8 +4954,77 @@ kernel void kernel_argsort_f32_i32(
}
}
@@ -372,7 +373,7 @@ index 16d17d26a..ff1afa291 100644
typedef void (argsort_merge_t)(
constant ggml_metal_kargs_argsort_merge & args,
@@ -5111,8 +5180,154 @@ kernel void kernel_argsort_merge_f32_i32(
@@ -5110,8 +5179,154 @@ kernel void kernel_argsort_merge_f32_i32(
}
}
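
Note: a minimal sketch of what the patch above enables at the graph level — GGML_OP_ARGSORT and the CUDA copy path now accept GGML_TYPE_I32 tensors, not just F32. The context ctx and length n are assumed to exist in the caller.

    // requires "ggml.h"; ctx and n are assumed
    struct ggml_tensor * vals = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n);
    // argsort over an I32 input (previously the op only accepted F32); dst holds I32 indices
    struct ggml_tensor * perm = ggml_argsort(ctx, vals, GGML_SORT_ORDER_ASC);
    // I32 -> I32 copy, now handled by the CUDA backend as well
    struct ggml_tensor * copy = ggml_cpy(ctx, vals, ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n));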

View File

@@ -23,7 +23,7 @@ index 78aa059dd..7fa8403b3 100644
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index a9d177864..393c329be 100644
index 4ed5f3577..a7ebe5dcd 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -319,6 +319,7 @@ extern "C" {
@@ -121,7 +121,7 @@ index 41419b617..73b39bfea 100644
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index a8a61b1e2..259e10257 100644
index 9f37ca70c..1459d16dd 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1859,6 +1859,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe

View File

@@ -1,32 +1,31 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: nobody <>
Date: Sat, 10 Jan 2026 15:27:57 -0800
From: Daniel Hiltgen <daniel@ollama.com>
Date: Sun, 30 Nov 2025 11:05:56 -0800
Subject: [PATCH] ggml: Export GPU UUIDs
---
ggml/include/ggml-backend.h | 2 ++
ggml/src/ggml-cuda/ggml-cuda.cu | 53 ++++++++++++++++++++++++++++++
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++---
ggml/src/ggml-metal/ggml-metal.cpp | 1 +
3 files changed, 56 insertions(+)
3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 393c329be..99412fe56 100644
index a7ebe5dcd..03557bb31 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,8 @@ extern "C" {
@@ -158,6 +158,7 @@ extern "C" {
const char * description;
// device free memory in bytes
size_t memory_free;
+ // device UUID
+ const char * id;
// device total memory in bytes
size_t memory_total;
// device type
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 9f3cb06ad..a4dce694b 100644
index 6519af435..c9d3a2b03 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -191,6 +191,51 @@ static int ggml_cuda_parse_id(char devName[]) {
@@ -189,6 +189,51 @@ static int ggml_cuda_parse_id(char devName[]) {
}
#endif // defined(GGML_USE_HIP)
@@ -78,15 +77,46 @@ index 9f3cb06ad..a4dce694b 100644
static ggml_cuda_device_info ggml_cuda_init() {
ggml_cuda_device_info info = {};
@@ -4125,6 +4170,7 @@ struct ggml_backend_cuda_device_context {
@@ -255,22 +300,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10;
}
}
- GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
+ GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, ID: %s\n",
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
- device_vmm ? "yes" : "no", prop.warpSize);
+ device_vmm ? "yes" : "no", prop.warpSize, ggml_cuda_parse_uuid(prop, id).c_str());
#elif defined(GGML_USE_MUSA)
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
info.devices[id].warp_size = 32;
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
info.devices[id].cc += prop.minor * 0x10;
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+ ggml_cuda_parse_uuid(prop, id).c_str());
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+ ggml_cuda_parse_uuid(prop, id).c_str());
std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });
@@ -4110,6 +4157,7 @@ struct ggml_backend_cuda_device_context {
std::string name;
std::string description;
std::string pci_bus_id;
+ std::string id;
int op_offload_min_batch_size;
};
@@ -4214,6 +4260,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4198,6 +4246,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
}
#endif // defined(__linux__)
@@ -98,7 +128,7 @@ index 9f3cb06ad..a4dce694b 100644
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -4254,6 +4305,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
@@ -4238,6 +4291,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -106,7 +136,7 @@ index 9f3cb06ad..a4dce694b 100644
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -4860,6 +4912,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4834,6 +4888,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
@@ -115,7 +145,7 @@ index 9f3cb06ad..a4dce694b 100644
char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 790cabca0..516d74064 100644
index f2b7fe692..8fc1c2fb5 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -547,6 +547,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
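
Note: a minimal sketch of reading the UUID exported by the patch above; only the id member of ggml_backend_dev_props is new here, the enumeration calls are existing ggml API, and id may be NULL on backends that do not populate it.

    #include <cstdio>
    #include "ggml-backend.h"

    static void print_device_ids(void) {
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            struct ggml_backend_dev_props props;
            ggml_backend_dev_get_props(dev, &props);
            // props.id carries the UUID string added by this patch (NULL if unset)
            printf("device %zu: %s id=%s\n", i, props.name, props.id ? props.id : "(none)");
        }
    }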

View File

@@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2 files changed, 13 insertions(+)
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index b68de7429..743f892a1 100644
index 2638fe4fc..c4e905a4e 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -87,6 +87,16 @@ enum mtmd_slice_tmpl {
@@ -31,10 +31,10 @@ index b68de7429..743f892a1 100644
return "<__media__>";
}
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 44d05ceae..5f2e579e1 100644
index 9f7e861e9..72cec1937 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -83,6 +83,9 @@ typedef struct mtmd_input_chunk mtmd_input_chunk;
@@ -80,6 +80,9 @@ typedef struct mtmd_input_chunk mtmd_input_chunk;
typedef struct mtmd_input_chunks mtmd_input_chunks;
typedef struct mtmd_input_text mtmd_input_text;

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] no power throttling win32 with gnuc
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index f700f74db..5581dd0ae 100644
index 53891a91f..8d4851312 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2479,7 +2479,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {

View File

@@ -1,362 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 10 Jan 2026 15:36:34 -0800
Subject: [PATCH] ggml: Add batch size hint to graph_compute
This adds a batch_size parameter to backend graph_compute functions
to provide optimization hints for processing.
---
ggml/include/ggml-backend.h | 5 ++++-
ggml/src/ggml-backend-impl.h | 4 ++--
ggml/src/ggml-backend.cpp | 19 +++++++++++++------
ggml/src/ggml-blas/ggml-blas.cpp | 3 ++-
ggml/src/ggml-cann/ggml-cann.cpp | 4 +++-
ggml/src/ggml-cpu/ggml-cpu.cpp | 4 +++-
ggml/src/ggml-cuda/ggml-cuda.cu | 4 +++-
ggml/src/ggml-hexagon/ggml-hexagon.cpp | 4 +++-
ggml/src/ggml-metal/ggml-metal.cpp | 4 +++-
ggml/src/ggml-opencl/ggml-opencl.cpp | 4 +++-
ggml/src/ggml-rpc/ggml-rpc.cpp | 4 +++-
ggml/src/ggml-sycl/ggml-sycl.cpp | 4 +++-
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 3 ++-
ggml/src/ggml-webgpu/ggml-webgpu.cpp | 3 ++-
ggml/src/ggml-zdnn/ggml-zdnn.cpp | 3 ++-
ggml/src/ggml-zendnn/ggml-zendnn.cpp | 4 +++-
16 files changed, 54 insertions(+), 22 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 99412fe56..97f630faa 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -98,7 +98,7 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
- GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size);
// NOTE: will be removed, use device version instead
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
@@ -308,6 +308,9 @@ extern "C" {
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+ // Provide a hint on the batch size to optimize processing (uses heuristics if unset)
+ GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size);
+
// Initialize backend buffers from a measure graph
GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 59190b7c4..a833756f9 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -106,8 +106,8 @@ extern "C" {
// compute the graph with the plan
enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- // compute graph (always async if supported by the backend)
- enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ // compute graph (always async if supported by the backend). batch_size may be -1 if unknown
+ enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size);
// (optional) event synchronization
// record an event on this stream
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 259e10257..096f52790 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -353,14 +353,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba
}
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
- enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+ enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, -1);
ggml_backend_synchronize(backend);
return err;
}
-enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
GGML_ASSERT(backend);
- return backend->iface.graph_compute(backend, cgraph);
+ return backend->iface.graph_compute(backend, cgraph, batch_size);
}
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -727,6 +727,8 @@ struct ggml_backend_sched {
bool op_offload;
+ int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics
+
int debug;
// used for debugging graph reallocations [GGML_SCHED_DEBUG_REALLOC]
@@ -825,7 +827,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op
- if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
+ if (sched->op_offload && (sched->batch_size < 0 || sched->batch_size >= 32) && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off");
@@ -1577,7 +1579,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
if (!sched->callback_eval) {
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, sched->batch_size);
if (ec != GGML_STATUS_SUCCESS) {
return ec;
}
@@ -1599,7 +1601,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, sched->batch_size);
if (ec != GGML_STATUS_SUCCESS) {
return ec;
}
@@ -1689,12 +1691,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
+ sched->batch_size = -1;
ggml_backend_sched_reset(sched);
return sched;
}
+void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size) {
+ sched->batch_size = batch_size;
+}
+
void ggml_backend_sched_free(ggml_backend_sched_t sched) {
if (sched == NULL) {
return;
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index 84956cbb9..39920ec19 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -220,7 +220,7 @@ static void ggml_backend_blas_free(ggml_backend_t backend) {
delete backend;
}
-static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -250,6 +250,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS;
GGML_UNUSED(backend);
+ GGML_UNUSED(batch_size);
}
static struct ggml_backend_i blas_backend_i = {
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 51bf8bc55..60e955640 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -2191,8 +2191,10 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
* @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
* completes successfully, otherwise an appropriate error status.
*/
-static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+
+ GGML_UNUSED(batch_size);
ggml_cann_set_device(cann_ctx->device);
g_nz_workspaces[cann_ctx->device].clear();
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index f4713a421..92ba577a5 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe
GGML_UNUSED(backend);
}
-static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
@@ -184,6 +184,8 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
return ggml_graph_compute(cgraph, &cplan);
+
+ GGML_UNUSED(batch_size);
}
static const struct ggml_backend_i ggml_backend_cpu_i = {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index a4dce694b..49f9c2b56 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3789,9 +3789,11 @@ static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx) {
#endif // USE_CUDA_GRAPH
}
-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
+ GGML_UNUSED(batch_size);
+
ggml_cuda_set_device(cuda_ctx->device);
bool use_cuda_graph = false;
diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 365a24b49..fc9b6a7f5 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2468,9 +2468,11 @@ static inline int last_compute_op(ggml_cgraph * graph) {
return last;
}
-static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
+static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph, int batch_size) {
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
+ GGML_UNUSED(batch_size);
+
HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes);
const int last = last_compute_op(graph);
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 516d74064..5f8d6c175 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -419,9 +419,11 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml
GGML_UNUSED(dst);
}
-static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_metal_t ctx = (ggml_metal_t)backend->context;
+ GGML_UNUSED(batch_size);
+
return ggml_metal_graph_compute(ctx, cgraph);
}
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index d010ca00e..1f0bc8e85 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -2998,9 +2998,11 @@ static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor *
static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor);
static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor);
-static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ GGML_UNUSED(batch_size);
+
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 281fa1bdb..b5f7adf89 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -866,9 +866,11 @@ static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::ve
memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
}
-static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
+ GGML_UNUSED(batch_size);
+
GGML_ASSERT(cgraph->n_nodes > 0);
bool reuse = rpc_ctx->gc.is_cached(cgraph);
if (reuse) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 5f8f3c210..d0e3d5412 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4171,9 +4171,11 @@ static bool check_graph_compatibility(ggml_cgraph * cgraph) {
}
#endif
-static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);
+ GGML_UNUSED(batch_size);
+
#ifdef GGML_SYCL_GRAPH
bool use_sycl_graph = !g_ggml_sycl_disable_graph && check_graph_compatibility(cgraph);
if (use_sycl_graph) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 885864111..58de820f2 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -13540,8 +13540,9 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
return num_adds;
}
-static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
+ GGML_UNUSED(batch_size);
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
if (vk_instance.debug_utils_support) {
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 5b8f7f72d..7b928fb5f 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -1616,8 +1616,9 @@ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx,
}
}
-static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
WEBGPU_LOG_DEBUG("ggml_backend_webgpu_graph_compute(" << cgraph->n_nodes << " nodes)");
+ GGML_UNUSED(batch_size);
ggml_backend_webgpu_context * backend_ctx = static_cast<ggml_backend_webgpu_context *>(backend->context);
webgpu_context ctx = backend_ctx->webgpu_ctx;
diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
index edbeb8eef..a90066365 100644
--- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp
+++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
@@ -407,7 +407,8 @@ static void ggml_backend_zdnn_free(ggml_backend_t backend) {
free(backend);
}
-static enum ggml_status ggml_backend_zdnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_zdnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
+ GGML_UNUSED(batch_size);
return ggml_zdnn_graph_compute(backend, cgraph);
}
diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
index fd07f983d..954a64e37 100644
--- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp
+++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
@@ -205,9 +205,11 @@ static void ggml_backend_zendnn_free(ggml_backend_t backend) {
delete backend;
}
-static ggml_status ggml_backend_zendnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_zendnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_zendnn_context * ctx = (ggml_backend_zendnn_context *)backend->context;
+ GGML_UNUSED(batch_size);
+
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
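
Note: a minimal sketch of how a caller could use the batch-size hint added above; sched, gf and n_tokens are assumed to exist in the caller, and ggml_backend_sched_graph_compute is existing ggml API.

    // hint the scheduler with the current ubatch size; per the patch, offloading ops with
    // host-resident weights to a faster backend is skipped when the hinted batch is below 32
    ggml_backend_sched_set_batch_size(sched, n_tokens);
    enum ggml_status status = ggml_backend_sched_graph_compute(sched, gf);

    // -1 (the default) falls back to the previous heuristics
    ggml_backend_sched_set_batch_size(sched, -1);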

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] fix mtmd-audio.cpp build on windows
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
index e8eef035f..a208c7789 100644
index f68829a61..2024d3d37 100644
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -1,6 +1,6 @@

View File

@@ -1,321 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 10 Jan 2026 15:49:32 -0800
Subject: [PATCH] ggml: No-alloc mode for memory sizing
Adds infrastructure for scheduler no-alloc mode that enables
fast memory sizing calculations without actual allocations.
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-backend-impl.h | 16 ++++++++
ggml/src/ggml-backend.cpp | 75 +++++++++++++++++++++++++++++++++--
ggml/src/ggml-cuda/common.cuh | 62 ++++++++++++++++++++++++++++-
4 files changed, 149 insertions(+), 5 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 97f630faa..cc4f0e5af 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -306,6 +306,7 @@ extern "C" {
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
+ GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Provide a hint on the batch size to optimize processing (uses heuristics if unset)
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index a833756f9..93eb8e511 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -26,12 +26,17 @@ extern "C" {
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
bool (*is_host) (ggml_backend_buffer_type_t buft);
+
+ // (optional) returns a dummy buffer that is equivalent to one created by alloc_buffer but without actually being backed
+ // by memory
+ ggml_backend_buffer_t (*noalloc_buffer)(ggml_backend_buffer_type_t buft, size_t size);
};
struct ggml_backend_buffer_type {
struct ggml_backend_buffer_type_i iface;
ggml_backend_dev_t device;
void * context;
+ bool no_alloc;
};
//
@@ -63,6 +68,7 @@ extern "C" {
void * context;
size_t size;
enum ggml_backend_buffer_usage usage;
+ bool no_alloc;
};
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -117,6 +123,16 @@ extern "C" {
// (optional) sort/optimize the nodes in the graph
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+ // (optional) reserves intermediate buffers needed for the compution
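+ // (note: "compution" here is a typo for "computation" carried in the patch text)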
+ // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
+ enum ggml_status (*graph_reserve) (ggml_backend_t backend, struct ggml_cgraph * cgraph, bool alloc);
+
+ // (optional) returns the memory needed after calling graph_reserve
+ size_t (*buffer_size) (ggml_backend_t backend);
+
+ // (optional) frees memory from intermediate buffers that was allocated either by graph_compute or graph_reserve
+ void (*reset) (ggml_backend_t backend);
};
struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 096f52790..c12e83cc0 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -36,11 +36,25 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
}
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- GGML_ASSERT(buft);
if (size == 0) {
// return a dummy buffer for zero-sized allocations
return ggml_backend_buffer_init(buft, {}, NULL, 0);
}
+
+ if (buft->no_alloc) {
+ ggml_backend_buffer_t buf;
+
+ if (buft->iface.noalloc_buffer != NULL) {
+ buf = buft->iface.noalloc_buffer(buft, size);
+ } else {
+ buf = ggml_backend_buffer_init(buft, {}, NULL, size);
+ }
+
+ buf->no_alloc = true;
+ return buf;
+ }
+
+ GGML_ASSERT(buft);
return buft->iface.alloc_buffer(buft, size);
}
@@ -94,7 +108,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
/* .buft = */ buft,
/* .context = */ context,
/* .size = */ size,
- /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
+ /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY,
+ /* .no_alloc = */ false
};
return buffer;
@@ -126,6 +141,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
return NULL;
}
+ // If we aren't allocating memory, return a placeholder non-NULL pointer
+ // that meets alignment requirements
+ if (buffer->no_alloc) {
+ return (void *)ggml_backend_buffer_get_alignment(buffer);
+ }
+
// FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
// I don't know whether the above comment is correct
if (!buffer->iface.get_base) {
@@ -736,6 +757,12 @@ struct ggml_backend_sched {
int debug_realloc;
int debug_graph_size;
int debug_prev_graph_size;
+
+ // allocate buffers on attached ggml_backend_buffer_type_t's and during reservation
+ // if false, dummy buffers are used for faster memory sizing calculations
+ // the scheduler needs to be recreated with allocated buffers before it can be used
+ // for computation
+ bool alloc_buffers;
};
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1635,6 +1662,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
size_t graph_size,
bool parallel,
bool op_offload) {
+ return ggml_backend_sched_new_ext(backends, bufts, n_backends, graph_size, parallel, op_offload, true);
+ }
+
+ggml_backend_sched_t ggml_backend_sched_new_ext(
+ ggml_backend_t * backends,
+ ggml_backend_buffer_type_t * bufts,
+ int n_backends,
+ size_t graph_size,
+ bool parallel,
+ bool op_offload,
+ bool alloc_buffers) {
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1687,11 +1725,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
}
}
+
+ sched->bufts[b]->no_alloc = !alloc_buffers;
}
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
sched->batch_size = -1;
+ sched->alloc_buffers = alloc_buffers;
ggml_backend_sched_reset(sched);
@@ -1710,6 +1751,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]);
}
+
+ if (sched->backends[b]->iface.reset != NULL) {
+ sched->backends[b]->iface.reset(sched->backends[b]);
+ }
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
@@ -1765,6 +1810,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return false;
}
+ if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
+ return false;
+ }
+
+ struct ggml_backend_sched_split * splits = sched->splits;
+ for (int i = 0; i < sched->n_splits; i++) {
+ struct ggml_backend_sched_split * split = &splits[i];
+ int split_backend_id = split->backend_id;
+ ggml_backend_t split_backend = sched->backends[split_backend_id];
+
+ if (split_backend->iface.graph_reserve != NULL) {
+ enum ggml_status ec = split_backend->iface.graph_reserve(split_backend, &split->graph, sched->alloc_buffers);
+ if (ec != GGML_STATUS_SUCCESS) {
+ return false;
+ }
+ }
+ }
+
ggml_backend_sched_reset(sched);
return true;
@@ -1870,7 +1933,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
- return ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+ size_t size = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+
+ if (backend->iface.buffer_size != NULL) {
+ size += backend->iface.buffer_size(backend);
+ }
+
+ return size;
}
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 9516d8ec8..e37e6d8fb 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -37,6 +37,41 @@
#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIP)
+extern bool reserving_graph;
+
+// If we are reserving the graph, pointers might be invalid and will fail if cudaMemcpyAsync tries to validate them.
+// However, since we don't actually expect a result, we don't need to actually do the memcpy.
+static cudaError_t cudaMemcpyAsyncReserve ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0 ) {
+ if (!reserving_graph) {
+ return cudaMemcpyAsync(dst, src, count, kind, stream);
+ } else {
+ return cudaSuccess;
+ }
+}
+
+static cudaError_t cudaMemcpy2DAsyncReserve ( void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, cudaStream_t stream = 0 ) {
+ if (!reserving_graph) {
+ return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
+ } else {
+ return cudaSuccess;
+ }
+}
+
+static cudaError_t cudaMemsetAsyncReserve ( void* devPtr, int value, size_t count, cudaStream_t stream = 0 ) {
+ if (!reserving_graph) {
+ return cudaMemsetAsync(devPtr, value, count, stream);
+ } else {
+ return cudaSuccess;
+ }
+}
+
+#undef cudaMemcpyAsync
+#define cudaMemcpyAsync cudaMemcpyAsyncReserve
+#undef cudaMemcpy2DAsync
+#define cudaMemcpy2DAsync cudaMemcpy2DAsyncReserve
+#undef cudaMemsetAsync
+#define cudaMemsetAsync cudaMemsetAsyncReserve
+
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@@ -977,6 +1012,9 @@ struct ggml_cuda_pool {
virtual void * alloc(size_t size, size_t * actual_size) = 0;
virtual void free(void * ptr, size_t size) = 0;
+
+ virtual bool alloc_memory() = 0;
+ virtual size_t alloc_size() = 0;
};
template<typename T>
@@ -1283,11 +1321,15 @@ struct ggml_backend_cuda_context {
// pool
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];
- static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, int stream_no);
+ static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, int stream_no, bool alloc);
ggml_cuda_pool & pool(int device) {
if (pools[device][curr_stream_no] == nullptr) {
- pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no);
+ bool alloc = true;
+ if (pools[device][0] != nullptr) {
+ alloc = pools[device][0]->alloc_memory();
+ }
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
}
return *pools[device][curr_stream_no];
}
@@ -1295,6 +1337,22 @@ struct ggml_backend_cuda_context {
ggml_cuda_pool & pool() {
return pool(device);
}
+
+ void pool_set_alloc(bool alloc) {
+ GGML_ASSERT(pools[device][curr_stream_no] == nullptr || pools[device][curr_stream_no]->alloc_memory() == alloc);
+
+ if (pools[device][curr_stream_no] == nullptr) {
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
+ }
+ }
+
+ size_t pool_get_alloc_size(int stream_no) {
+ if (pools[device][stream_no] == nullptr) {
+ return 0;
+ }
+
+ return pools[device][stream_no]->alloc_size();
+ }
};
struct ggml_cuda_mm_fusion_args_host {
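
Note: a minimal sketch of the intended flow for the no-alloc mode added above, using the new ggml_backend_sched_new_ext entry point; backends, bufts, n_backends, graph_size and measure_graph are assumed to exist in the caller, and the other scheduler calls are existing ggml API.

    // requires <cstdio> and "ggml-backend.h"
    // 1) size compute buffers with a dummy (no-alloc) scheduler: no VRAM is committed
    ggml_backend_sched_t measure = ggml_backend_sched_new_ext(
        backends, bufts, n_backends, graph_size,
        /*parallel=*/false, /*op_offload=*/true, /*alloc_buffers=*/false);
    ggml_backend_sched_reserve(measure, measure_graph);
    for (int i = 0; i < n_backends; i++) {
        // query how much each backend would need before committing any memory
        size_t needed = ggml_backend_sched_get_buffer_size(measure, backends[i]);
        fprintf(stderr, "backend %d would need %zu bytes\n", i, needed);
    }
    ggml_backend_sched_free(measure);

    // 2) a no-alloc scheduler cannot be used to compute; recreate it with real buffers
    ggml_backend_sched_t sched = ggml_backend_sched_new(
        backends, bufts, n_backends, graph_size, /*parallel=*/false, /*op_offload=*/true);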

View File

@@ -8,16 +8,16 @@ Subject: [PATCH] decode: disable output_all
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index f220010a1..ba0ff9df9 100644
index 8786d4ee3..9e6998272 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1391,8 +1391,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
@@ -1051,8 +1051,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd_inp();
- // when computing embeddings, all tokens are output
- const bool output_all = cparams.embeddings;
- const bool output_all = cparams.embeddings;
+ const bool output_all = false;
const bool has_samplers = !sampling.samplers.empty();
const uint32_t n_seq_max = cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max;
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);

View File

@@ -1,24 +1,25 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 10 Jan 2026 15:56:42 -0800
From: Jesse Gross <jesse@ollama.com>
Date: Wed, 27 Aug 2025 14:39:48 -0700
Subject: [PATCH] ggml: Enable resetting backend devices
Allows resetting CUDA devices to free primary context allocations
(~300 MB of VRAM per device) when a device is unused.
Touching a CUDA device causes the allocation of a primary context
with CUDA data structures (~300 MB of VRAM). If a device is
unused then it can be reset to free these data structures.
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-backend-impl.h | 4 ++++
ggml/src/ggml-backend.cpp | 8 ++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 11 +++++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 16 +++++++++++++++-
ggml/src/ggml-cuda/vendors/hip.h | 1 +
src/llama.cpp | 4 +++-
6 files changed, 28 insertions(+), 1 deletion(-)
6 files changed, 32 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index cc4f0e5af..006867064 100644
index dbbb61d9c..92ca32a4b 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -179,6 +179,7 @@ extern "C" {
@@ -178,6 +178,7 @@ extern "C" {
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
@@ -27,7 +28,7 @@ index cc4f0e5af..006867064 100644
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 93eb8e511..c815c2eed 100644
index 7bdf9d81f..21b35ac5c 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -195,6 +195,10 @@ extern "C" {
@@ -42,7 +43,7 @@ index 93eb8e511..c815c2eed 100644
struct ggml_backend_device {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index c12e83cc0..4ce406c39 100644
index 7746e8b92..189e97170 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -532,6 +532,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
@@ -61,10 +62,10 @@ index c12e83cc0..4ce406c39 100644
GGML_ASSERT(device);
return device->iface.get_buffer_type(device);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 49f9c2b56..e3fbd2dd4 100644
index eeaae3fe4..6852d2e20 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -115,6 +115,11 @@ int ggml_cuda_get_device() {
@@ -113,6 +113,11 @@ int ggml_cuda_get_device() {
return id;
}
@@ -76,7 +77,19 @@ index 49f9c2b56..e3fbd2dd4 100644
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
cudaError_t err;
@@ -4769,6 +4774,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
@@ -4448,7 +4453,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
- ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+ // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
+ // If you need the memory data, call ggml_backend_dev_memory() explicitly.
+ props->memory_total = props->memory_free = 0;
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
@@ -4908,6 +4916,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
}
@@ -88,7 +101,7 @@ index 49f9c2b56..e3fbd2dd4 100644
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_name = */ ggml_backend_cuda_device_get_name,
/* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -4785,6 +4795,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
@@ -4924,6 +4937,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .event_new = */ ggml_backend_cuda_device_event_new,
/* .event_free = */ ggml_backend_cuda_device_event_free,
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
@@ -97,22 +110,22 @@ index 49f9c2b56..e3fbd2dd4 100644
// backend reg
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 016b04e5a..164d367d0 100644
index 951a88d56..4e162258d 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -51,6 +51,7 @@
@@ -49,6 +49,7 @@
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceGetAttribute hipDeviceGetAttribute
#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceReset hipDeviceReset
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
diff --git a/src/llama.cpp b/src/llama.cpp
index f1096d960..da72d8e77 100644
index f69964b6d..759152b76 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -993,10 +993,12 @@ static struct llama_model * llama_model_load_from_file_impl(
@@ -921,10 +921,12 @@ static struct llama_model * llama_model_load_from_file_impl(
for (auto * dev : model->devices) {
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
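
To make the mechanism in the updated commit message concrete, here is a minimal standalone sketch (C++ against the stock CUDA runtime, not part of the patch): the first runtime call on a device lazily creates its primary context, and cudaDeviceReset() destroys it again, returning that VRAM to the driver. The reset hook the patch wires into ggml-backend is assumed to reduce to the same per-device call; its exact name is not visible in this excerpt.

// Standalone illustration only; build with: nvcc -x cu reset_demo.cpp -o reset_demo
#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        std::printf("no CUDA devices available\n");
        return 0;
    }

    // Any runtime call on device 0 (here: a memory query) creates its
    // primary context, which by itself holds a few hundred MB of VRAM.
    cudaSetDevice(0);
    size_t free_bytes = 0, total_bytes = 0;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::printf("device 0 with primary context: %zu MiB free of %zu MiB\n",
                free_bytes >> 20, total_bytes >> 20);

    // Destroying the primary context releases those data structures again,
    // which is the saving the patch exposes for devices that end up unused.
    cudaDeviceReset();
    return 0;
}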

View File

File diff suppressed because it is too large

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] NVML fallback for unified memory GPUs
1 file changed, 68 insertions(+), 3 deletions(-)
diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
index b5d46cbe7..f8a4ac7b5 100644
index c9073cef0..f473a2a2c 100644
--- a/ggml/src/mem_nvml.cpp
+++ b/ggml/src/mem_nvml.cpp
@@ -13,6 +13,7 @@

View File

@@ -59,10 +59,10 @@ index 88ed79111..71ca60214 100644
} else {
if (sector < sections.v[0]) {
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index ff1afa291..21b51cc8d 100644
index 236838e9e..c98d269d1 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4243,14 +4243,14 @@ kernel void kernel_rope_multi(
@@ -4242,14 +4242,14 @@ kernel void kernel_rope_multi(
float theta_base;
if (FC_rope_is_imrope) {
@@ -82,10 +82,10 @@ index ff1afa291..21b51cc8d 100644
} else {
if (sector < args.sect_0) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl
index aacec9846..0163d8bbc 100644
index 9726b722d..1c8c69422 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl
@@ -155,14 +155,14 @@ void rope_multi(const uint i0, const uint i1, rope_params p) {
@@ -148,14 +148,14 @@ void rope_multi(const uint i0, const uint i1, rope_params p) {
float theta_base = 0.0;
if (p.is_imrope != 0) {

View File

@@ -1,401 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 10 Jan 2026 16:17:12 -0800
Subject: [PATCH] ollama: Add memory detection using DXGI + PDH
Add Windows-specific VRAM detection using DXGI and PDH performance counters.
This provides accurate memory reporting for both integrated and discrete GPUs.
Add luid field to Vulkan device context for device matching.
---
ggml/src/CMakeLists.txt | 1 +
ggml/src/ggml-impl.h | 3 +
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 24 +++
ggml/src/mem_dxgi_pdh.cpp | 297 +++++++++++++++++++++++++++
4 files changed, 325 insertions(+)
create mode 100644 ggml/src/mem_dxgi_pdh.cpp
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 47d7ea9ce..755ea86a7 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -207,6 +207,7 @@ add_library(ggml-base
ggml-quants.h
mem_hip.cpp
mem_nvml.cpp
+ mem_dxgi_pdh.cpp
gguf.cpp)
set_target_properties(ggml-base PROPERTIES
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 9549d0495..eacabb191 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -680,6 +680,9 @@ GGML_API void ggml_nvml_release();
GGML_API int ggml_hip_mgmt_init();
GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total, bool is_integrated_gpu);
GGML_API void ggml_hip_mgmt_release();
+GGML_API int ggml_dxgi_pdh_init();
+GGML_API int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu);
+GGML_API void ggml_dxgi_pdh_release();
#ifdef __cplusplus
}
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index b3eea31af..5dbe20e21 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -74,6 +74,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
#define VK_KHR_SHADER_BFLOAT16_EXTENSION_NAME "VK_KHR_shader_bfloat16"
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR ((VkStructureType)1000141000)
#define VK_COMPONENT_TYPE_BFLOAT16_KHR ((VkComponentTypeKHR)1000141000)
+#define VK_LUID_SIZE_KHR VK_LUID_SIZE
typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR {
VkStructureType sType;
@@ -14294,6 +14295,7 @@ struct ggml_backend_vk_device_context {
std::string pci_id;
std::string id;
std::string uuid;
+ std::string luid;
int major;
int minor;
int driver_major;
@@ -14318,6 +14320,20 @@ static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
+ GGML_LOG_DEBUG("ggml_backend_vk_device_get_memory called: uuid %s\n", ctx->uuid.c_str());
+ GGML_LOG_DEBUG("ggml_backend_vk_device_get_memory called: luid %s\n", ctx->luid.c_str());
+
+ // Check VRAM reporting for Windows IGPU/DGPU using DXGI + PDH (vendor agnostic)
+ if (ggml_dxgi_pdh_init() == 0) {
+ GGML_LOG_DEBUG("DXGI + PDH Initialized. Getting GPU free memory info\n");
+ int status = ggml_dxgi_pdh_get_device_memory(ctx->luid.c_str(), free, total, ctx->is_integrated_gpu);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing DXGI + PDH memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_dxgi_pdh_release();
+ return;
+ }
+ ggml_dxgi_pdh_release();
+ }
// Use vendor specific management libraries for best VRAM reporting if available
if (!ctx->is_integrated_gpu) {
@@ -15090,6 +15106,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
}
}
ctx->uuid = oss.str();
+ const auto& luid = device_id_props.deviceLUID;
+ char luid_str[32]; // "0x" + 16 hex digits + null terminator = 19 chars
+ snprintf(luid_str, sizeof(luid_str), // high part + low part
+ "0x%02x%02x%02x%02x%02x%02x%02x%02x",
+ luid[7], luid[6], luid[5], luid[4],
+ luid[3], luid[2], luid[1], luid[0]
+ );
+ ctx->luid = std::string(luid_str);
ctx->major = 0;
ctx->minor = 0;
// TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
diff --git a/ggml/src/mem_dxgi_pdh.cpp b/ggml/src/mem_dxgi_pdh.cpp
new file mode 100644
index 000000000..4dd66c25f
--- /dev/null
+++ b/ggml/src/mem_dxgi_pdh.cpp
@@ -0,0 +1,297 @@
+// DXGI and PDH Performance Counters Library
+// This Windows-only (10/11) library provides accurate VRAM reporting
+#include "ggml.h"
+#include "ggml-impl.h"
+
+#ifdef _WIN32
+# define WIN32_LEAN_AND_MEAN
+# ifndef NOMINMAX
+# define NOMINMAX
+# endif
+#include <windows.h>
+#include <pdh.h>
+#include <dxgi1_2.h>
+#include <sstream>
+#include <thread>
+#include <filesystem>
+#include <mutex>
+
+namespace fs = std::filesystem;
+
+static std::mutex ggml_dxgi_pdh_lock;
+
+/*
+Struct to keep track of GPU adapter information at runtime
+*/
+struct GpuInfo {
+ std::wstring description; // debug field
+ LUID luid;
+ std::wstring pdhInstance;
+ double dedicatedTotal = 0.0;
+ double sharedTotal = 0.0;
+ double dedicatedUsage = 0.0;
+ double sharedUsage = 0.0;
+};
+
+/*
+DLL Function Pointers
+*/
+struct {
+ void *dxgi_dll_handle;
+ void *pdh_dll_handle;
+ // DXGI Functions
+ HRESULT (*CreateDXGIFactory1)(REFIID riid, void **ppFactory);
+ // PDH functions
+ PDH_STATUS (*PdhOpenQueryW)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery);
+ PDH_STATUS (*PdhAddCounterW)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter);
+ PDH_STATUS (*PdhCollectQueryData)(PDH_HQUERY hQuery);
+ PDH_STATUS (*PdhGetFormattedCounterValue)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue);
+ PDH_STATUS (*PdhCloseQuery)(PDH_HQUERY hQuery);
+} dll_functions {
+ nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr
+};
+
+/*
+Create a PDH Instance name
+*/
+static std::wstring generate_pdh_instance_name_from_luid(const LUID& luid) {
+ std::wstringstream ss;
+ ss << L"luid_0x" << std::hex << std::setw(8) << std::setfill(L'0') << std::uppercase << luid.HighPart
+ << L"_0x" << std::setw(8) << std::setfill(L'0') << luid.LowPart;
+ return ss.str();
+}
+
+/*
+Conversion from Bytes to GigaBytes
+*/
+template <typename T>
+static inline double b_to_gb(T n)
+{
+ return (double(n) / (1024.0 * 1024 * 1024));
+}
+
+/*
+Fetch the GPU adapter 'dedicated memory' and 'shared memory' using DXGI
+*/
+static void fetch_dxgi_adapter_desc1(const DXGI_ADAPTER_DESC1& desc, GpuInfo* info) {
+ auto dedicatedVideoMemory = desc.DedicatedVideoMemory;
+ auto sharedSystemMemory = desc.SharedSystemMemory;
+ GGML_LOG_DEBUG("[DXGI] Adapter Description: %ls, LUID: 0x%08X%08X, Dedicated: %.2f GB, Shared: %.2f GB\n", desc.Description, desc.AdapterLuid.HighPart, desc.AdapterLuid.LowPart, b_to_gb(dedicatedVideoMemory), b_to_gb(sharedSystemMemory));
+ if (info) {
+ info->dedicatedTotal = dedicatedVideoMemory; // values in bytes
+ info->sharedTotal = sharedSystemMemory;
+ }
+}
+
+/*
+Enumerate over the GPU adapters detected using DXGI and return their information
+*/
+static std::vector<GpuInfo> get_dxgi_gpu_infos() {
+ std::vector<GpuInfo> infos;
+ IDXGIFactory1* pFactory = nullptr;
+
+ if (SUCCEEDED(dll_functions.CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&pFactory))) {
+ UINT i = 0;
+ IDXGIAdapter1* pAdapter = nullptr;
+ while (pFactory->EnumAdapters1(i, &pAdapter) != DXGI_ERROR_NOT_FOUND) {
+ DXGI_ADAPTER_DESC1 desc;
+ pAdapter->GetDesc1(&desc);
+
+ // Get all the GPU adapter info
+ GpuInfo info;
+ fetch_dxgi_adapter_desc1(desc, &info);
+ info.description = std::wstring(desc.Description);
+ info.luid = desc.AdapterLuid;
+ info.pdhInstance = generate_pdh_instance_name_from_luid(desc.AdapterLuid);
+ infos.push_back(info);
+
+ pAdapter->Release();
+ ++i;
+ }
+ pFactory->Release();
+ }
+ return infos;
+}
+
+static bool get_gpu_memory_usage(GpuInfo& gpu) {
+ PDH_HQUERY query;
+ if (dll_functions.PdhOpenQueryW(NULL, 0, &query) != ERROR_SUCCESS) {
+ return false;
+ }
+
+ struct GpuCounters {
+ PDH_HCOUNTER dedicated;
+ PDH_HCOUNTER shared;
+ };
+
+ GpuCounters gpuCounter{};
+
+ std::wstring dedicatedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Dedicated Usage";
+ std::wstring sharedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Shared Usage";
+
+ if (dll_functions.PdhAddCounterW(query, dedicatedPath.c_str(), 0, &gpuCounter.dedicated) != ERROR_SUCCESS ||
+ dll_functions.PdhAddCounterW(query, sharedPath.c_str(), 0, &gpuCounter.shared) != ERROR_SUCCESS) {
+ GGML_LOG_ERROR("Failed to add PDH counters for GPU %s\n", std::string(gpu.pdhInstance.begin(), gpu.pdhInstance.end()).c_str());
+ dll_functions.PdhCloseQuery(query);
+ return false;
+ }
+
+ // Sample the data
+ if (dll_functions.PdhCollectQueryData(query) != ERROR_SUCCESS) {
+ dll_functions.PdhCloseQuery(query);
+ return false;
+ }
+
+ // Read final values
+ PDH_FMT_COUNTERVALUE val;
+
+ if (dll_functions.PdhGetFormattedCounterValue(gpuCounter.dedicated, PDH_FMT_DOUBLE, NULL, &val) == ERROR_SUCCESS)
+ gpu.dedicatedUsage = val.doubleValue;
+
+ if (dll_functions.PdhGetFormattedCounterValue(gpuCounter.shared, PDH_FMT_DOUBLE, NULL, &val) == ERROR_SUCCESS)
+ gpu.sharedUsage = val.doubleValue;
+
+ dll_functions.PdhCloseQuery(query);
+ return true;
+}
+
+
+extern "C" {
+
+ int ggml_dxgi_pdh_init() {
+ GGML_LOG_DEBUG("%s called\n", __func__);
+ std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
+ if (dll_functions.dxgi_dll_handle != NULL && dll_functions.pdh_dll_handle != NULL) {
+ // Already initialized as we have both DLL handles
+ return ERROR_SUCCESS;
+ }
+
+ DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+ SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+ fs::path libPath_dxgi = fs::path("\\Windows") / fs::path("System32") / fs::path("dxgi.dll");
+ fs::path libPath_pdh = fs::path("\\Windows") / fs::path("System32") / fs::path("pdh.dll");
+
+ // Call LoadLibraryW on both DLLs to ensure they are loaded
+ void *dxgi = (void*)LoadLibraryW(libPath_dxgi.wstring().c_str());
+ void *pdh = (void*)LoadLibraryW(libPath_pdh.wstring().c_str());
+ if(dxgi == NULL || pdh == NULL) {
+ if (dxgi != NULL) {
+ FreeLibrary((HMODULE)(dxgi));
+ }
+ if (pdh != NULL) {
+ FreeLibrary((HMODULE)(pdh));
+ }
+ SetErrorMode(old_mode);
+ return ERROR_DLL_NOT_FOUND;
+ }
+ else {
+ // save the dll handles
+ dll_functions.dxgi_dll_handle = dxgi;
+ dll_functions.pdh_dll_handle = pdh;
+ }
+
+ // Get pointers to the library functions loaded by the DLLs
+ dll_functions.CreateDXGIFactory1 = (HRESULT (*)(REFIID riid, void **ppFactory)) GetProcAddress((HMODULE)(dll_functions.dxgi_dll_handle), "CreateDXGIFactory1");
+ dll_functions.PdhOpenQueryW = (PDH_STATUS (*)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhOpenQueryW");
+ dll_functions.PdhAddCounterW = (PDH_STATUS (*)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhAddCounterW");
+ dll_functions.PdhCollectQueryData = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCollectQueryData");
+ dll_functions.PdhGetFormattedCounterValue = (PDH_STATUS (*)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhGetFormattedCounterValue");
+ dll_functions.PdhCloseQuery = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCloseQuery");
+
+ SetErrorMode(old_mode); // set old mode before any return
+
+ // Check if any function pointers are NULL (not found)
+ if (dll_functions.CreateDXGIFactory1 == NULL || dll_functions.PdhOpenQueryW == NULL || dll_functions.PdhAddCounterW == NULL || dll_functions.PdhCollectQueryData == NULL || dll_functions.PdhGetFormattedCounterValue == NULL || dll_functions.PdhCloseQuery == NULL) {
+ GGML_LOG_INFO("%s unable to locate required symbols in either dxgi.dll or pdh.dll", __func__);
+ FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle));
+ FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle));
+ dll_functions.dxgi_dll_handle = NULL;
+ dll_functions.pdh_dll_handle = NULL;
+ return ERROR_PROC_NOT_FOUND;
+ }
+
+ // No other initializations needed, successfully loaded the libraries and functions!
+ return ERROR_SUCCESS;
+ }
+
+ void ggml_dxgi_pdh_release() {
+ std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
+ if (dll_functions.dxgi_dll_handle == NULL && dll_functions.pdh_dll_handle == NULL) {
+ // Already freed the DLLs
+ return;
+ }
+
+ // Call FreeLibrary on both DLLs
+ FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle));
+ FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle));
+
+ dll_functions.dxgi_dll_handle = NULL;
+ dll_functions.pdh_dll_handle = NULL;
+
+ return; // successfully released
+ }
+
+ int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) {
+
+ std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
+
+ // Enumerate GPUs using DXGI and find the matching LUID
+ // This also fetches the total memory info for each of the enumerated GPUs
+ std::vector<GpuInfo> gpus = get_dxgi_gpu_infos();
+ GpuInfo *targetGpu = nullptr;
+ for (auto& gpu : gpus) {
+ char luid_buffer[32]; // "0x" + 16 hex digits + null terminator
+ snprintf(luid_buffer, sizeof(luid_buffer), "0x%08x%08x", gpu.luid.HighPart, gpu.luid.LowPart);
+ std::string gpu_luid_str(luid_buffer);
+ if (gpu_luid_str == std::string(luid)) {
+ targetGpu = &gpu;
+ break;
+ }
+ }
+ if (!targetGpu) {
+ GGML_LOG_ERROR("GPU with LUID %s not found.\n", luid);
+ return ERROR_NOT_FOUND;
+ }
+
+ // Get the current memory usage for the target GPU
+ int status = get_gpu_memory_usage(*targetGpu);
+ if (!status) {
+ GGML_LOG_ERROR("Failed to get GPU memory usage.\n");
+ return ERROR_DEVICE_NOT_AVAILABLE;
+ }
+
+ // Calculate the free memory based on whether it's an integrated or discrete GPU
+ if (is_integrated_gpu) {
+ // IGPU free = SharedTotal - SharedUsage
+ GGML_LOG_DEBUG("Integrated GPU (%ls) with LUID %s detected. Shared Total: %.2f bytes (%.2f GB), Shared Usage: %.2f bytes (%.2f GB), Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->sharedTotal, b_to_gb(targetGpu->sharedTotal), targetGpu->sharedUsage, b_to_gb(targetGpu->sharedUsage), targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage));
+ *free = (targetGpu->sharedTotal - targetGpu->sharedUsage) + (targetGpu->dedicatedTotal - targetGpu->dedicatedUsage); // Some IGPUs also have dedicated memory, which can be used along with the IGPU's shared memory
+ *total = targetGpu->sharedTotal + targetGpu->dedicatedTotal;
+ }
+ else {
+ // DGPU free = DedicatedTotal - DedicatedUsage
+ GGML_LOG_DEBUG("Discrete GPU (%ls) with LUID %s detected. Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage));
+ *free = targetGpu->dedicatedTotal - targetGpu->dedicatedUsage;
+ *total = targetGpu->dedicatedTotal;
+ }
+
+ return ERROR_SUCCESS;
+ }
+
+} // extern "C"
+
+#else // #ifdef _WIN32
+
+extern "C" {
+
+ // DXGI + PDH not available for Linux implementation
+ int ggml_dxgi_pdh_init() {
+ return -1;
+ }
+ void ggml_dxgi_pdh_release() {}
+ int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) {
+ return -1;
+ }
+
+} // extern "C"
+
+#endif // #ifdef _WIN32
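
As orientation for the new entry points, a hedged usage sketch follows (standalone, not part of the patch). The prototypes are copied from the ggml-impl.h hunk above; the LUID string passed in main() is a made-up placeholder, since real callers build it as "0x" plus the zero-padded hex HighPart and LowPart of the adapter LUID, which is the format the matching loop in ggml_dxgi_pdh_get_device_memory compares against.

// Hedged usage sketch of the API added by mem_dxgi_pdh.cpp; link against ggml-base.
#include <cstddef>
#include <cstdio>

extern "C" {
int  ggml_dxgi_pdh_init();
int  ggml_dxgi_pdh_get_device_memory(const char * luid, size_t * free, size_t * total, bool is_integrated_gpu);
void ggml_dxgi_pdh_release();
}

static void report_vram(const char * luid, bool is_integrated_gpu) {
    if (ggml_dxgi_pdh_init() != 0) {
        return; // DLLs unavailable or non-Windows build: fall back to other reporting paths
    }
    size_t free_bytes = 0, total_bytes = 0;
    if (ggml_dxgi_pdh_get_device_memory(luid, &free_bytes, &total_bytes, is_integrated_gpu) == 0) {
        std::printf("GPU %s: %zu of %zu bytes free\n", luid, free_bytes, total_bytes);
    }
    ggml_dxgi_pdh_release();
}

int main() {
    report_vram("0x0000000000012345", /*is_integrated_gpu=*/false); // hypothetical discrete GPU LUID
    return 0;
}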

View File

@@ -10,10 +10,10 @@ fallback to cpu
1 file changed, 3 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 4dbf7097a..311691206 100644
index 334a30135..5c9dfd032 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -4494,6 +4494,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
@@ -4633,6 +4633,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
return false;
}

View File

@@ -9,10 +9,10 @@ Revert to prior logic of assuming an empty projector type is mlp
1 file changed, 4 insertions(+)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 26710548e..a8c170cd7 100644
index 84a3796b5..d3a37842d 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -973,6 +973,10 @@ struct clip_model_loader {
@@ -960,6 +960,10 @@ struct clip_model_loader {
if (proj_type.empty()) {
if (modality == CLIP_MODALITY_VISION) {
get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
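
The four inserted lines themselves are not shown above. As a hedged sketch of the behavior the subject line describes, the fallback amounts to treating an empty projector type as the legacy "mlp" default; the helper below is illustrative only and is not the literal patch body.

#include <cstdio>
#include <string>

// Hedged sketch: an empty projector type read from GGUF metadata falls back
// to the legacy default "mlp".
static std::string resolve_projector_type(std::string proj_type) {
    if (proj_type.empty()) {
        proj_type = "mlp";
    }
    return proj_type;
}

int main() {
    std::printf("%s\n", resolve_projector_type("").c_str()); // prints "mlp"
    return 0;
}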

View File

@@ -8,10 +8,10 @@ Subject: [PATCH] win: exit instead of abort
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 09b8eb466..659c7f4ed 100644
index eb3ae72ea..c9242a15a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -252,8 +252,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
@@ -250,8 +250,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
fprintf(stderr, "%s\n", message);
ggml_print_backtrace();
}
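
The hunk body is truncated above. As a hedged illustration of the pattern named in the subject line, the shape is roughly the following; fatal_exit is a hypothetical name, not the code the patch actually adds.

#include <cstdlib>

// Hedged sketch of "exit instead of abort"; not the literal patch body.
[[noreturn]] static void fatal_exit(void) {
#if defined(_WIN32)
    // On Windows, abort() can raise a crash dialog or invoke Windows Error
    // Reporting; exiting with a nonzero status terminates quietly after the
    // message and backtrace have already been printed.
    exit(1);
#else
    abort();
#endif
}

int main(void) {
    fatal_exit(); // never returns
}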

View File

@@ -1,47 +0,0 @@
# Patch Notes
Notes on patches that had conflicts or could be simplified during the Jan 2026 update to `b1377188`.
## Patches with Conflicts
### 0004-solar-pro.patch
- **Conflict**: `src/llama-arch.cpp` - upstream added `LLM_ARCH_MAINCODER` in same location
- **Resolution**: Keep both architectures (MAINCODER from upstream, SOLAR from patch)
- **Simplification**: Consider upstreaming Solar Pro support
### 0009-remove-amx.patch
- **Conflict**: `ggml/src/CMakeLists.txt` - upstream restructured CPU variants significantly
- **Resolution**: Kept upstream's expanded variant list but removed sapphirerapids with AMX
- **Simplification**: Upstream now conditionalizes AMX behind `if (NOT MSVC)`, consider if full removal is still needed
### 0015-ggml-Export-GPU-UUIDs.patch
- **Conflict**: Could not build fake ancestor due to significant file changes
- **Resolution**: Applied manually - added `id` field to props struct, UUID parsing function, etc.
- **Note**: Upstream added `device_id` for PCI bus IDs; our `id` is for GPU UUIDs (different purposes)
### 0018-ggml-Add-batch-size-hint.patch
- **Status**: Large API change affecting 8 files
- **Conflict**: Function signature changes across multiple backends
- **Simplification**: Consider upstreaming this - it's a clean optimization
## Patches That Applied Cleanly
- 0001-0003, 0005-0008, 0010-0014, 0016-0017
## Patches Not Yet Attempted
- 0019-0031
## Potential Upstream Candidates
These patches fix general issues that could benefit upstream:
- 0002-pretokenizer - graceful handling of unknown pretokenizers
- 0003-clip-unicode - Windows unicode path support
- 0004-solar-pro - Solar Pro architecture
- 0005-fix-deepseek-deseret-regex - Windows regex fix
- 0010-fix-string-arr-kv-loading - gguf array loading
## Ollama-Specific (Won't Upstream)
- 0011-ollama-debug-tensor
- 0012-add-ollama-vocab-for-grammar-support
- 0015-ggml-Export-GPU-UUIDs (custom UUID field)
- 0024-GPU-discovery-enhancements
- 0025-NVML-fallback-for-unified-memory-GPUs
- 0028-Add-memory-detection-using-DXGI-PDH

View File

@@ -72,7 +72,7 @@ struct llama_vocab * llama_load_vocab_from_file(const char * fname) {
try {
const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
std::vector<std::string> splits = {};
llama_model_loader ml(std::string(fname), splits, false, false, false, false, nullptr, nullptr);
llama_model_loader ml(std::string(fname), splits, false, false, false, nullptr, nullptr);
vocab->load(ml, kv);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());

Some files were not shown because too many files have changed in this diff