Compare commits


1 commit

Author: jmorganca
SHA1: e23ddd84b8
Message: x/grammar: add experimental GPU accelerated constrained decoding package
Date: 2026-01-11 00:50:11 -08:00
224 changed files with 8677 additions and 14671 deletions


@@ -1,6 +1,6 @@
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=b1377188784f9aea26b8abde56d4aee8c733eec7
FETCH_HEAD=ec98e2002
.PHONY: help
help:

go.mod

@@ -15,8 +15,8 @@ require (
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.9.0
github.com/x448/float16 v0.8.4
golang.org/x/sync v0.17.0
golang.org/x/sys v0.37.0
golang.org/x/sync v0.19.0
golang.org/x/sys v0.39.0
)
require (
@@ -30,8 +30,8 @@ require (
github.com/tkrajina/typescriptify-golang-structs v0.2.0
github.com/wk8/go-ordered-map/v2 v2.1.8
golang.org/x/image v0.22.0
golang.org/x/mod v0.30.0
golang.org/x/tools v0.38.0
golang.org/x/mod v0.31.0
golang.org/x/tools v0.40.0
gonum.org/v1/gonum v0.15.0
)
@@ -81,11 +81,11 @@ require (
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
golang.org/x/arch v0.8.0 // indirect
golang.org/x/crypto v0.43.0
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect
golang.org/x/net v0.46.0 // indirect
golang.org/x/term v0.36.0
golang.org/x/text v0.30.0
golang.org/x/crypto v0.46.0
golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93
golang.org/x/net v0.48.0 // indirect
golang.org/x/term v0.38.0
golang.org/x/text v0.32.0
google.golang.org/protobuf v1.34.1
gopkg.in/yaml.v3 v3.0.1 // indirect
)

go.sum

@@ -233,16 +233,16 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04=
golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0=
golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU=
golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3/go.mod h1:NOZ3BPKG0ec/BKJQgnvsSFpcKLM5xXVWnvZS97DWHgE=
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa h1:t2QcU6V556bFjYgu4L6C+6VrCPyJZ+eyRsABUPs1mz4=
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa/go.mod h1:BHOTPb3L19zxehTsLoJXVaTktb06DFgmdW6Wb9s8jqk=
golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 h1:fQsdNF2N+/YewlRZiricy4P1iimyPKZ/xwniHj8Q2a0=
golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93/go.mod h1:EPRbTFwzwjXj9NpYyyrvenVh9Y+GFeEvMNh7Xuz7xgU=
golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
@@ -264,8 +264,8 @@ golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzB
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk=
golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc=
golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI=
golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@@ -278,8 +278,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU=
golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -289,8 +289,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -306,17 +306,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk=
golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q=
golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss=
golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q=
golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k=
golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM=
golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU=
golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -330,8 +330,8 @@ golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapK
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ=
golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs=
golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA=
golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=


@@ -14,28 +14,25 @@ make -f Makefile.sync apply-patches
### Updating Base Commit
To update to a new base commit:
**Pin to new base commit**
1. **Update FETCH_HEAD** in `Makefile.sync` to the new commit hash.
To change the base commit, update `FETCH_HEAD` in Makefile.sync.
2. **Check for upstreamed patches**: Before applying, review if any patches have been merged upstream. Remove those patches from `./patches/` to avoid conflicts.
When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.
3. **Apply patches**:
```shell
make -f Makefile.sync apply-patches
```
Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
4. **Resolve conflicts** (if any): When `git am` fails on a patch:
- Fix conflicts in `./vendor/`
- Stage the resolved files: `git -C llama/vendor add <file>`
- Continue: `git -C llama/vendor am --continue`
- Re-run: `make -f Makefile.sync apply-patches`
- Repeat until all patches are applied.
```shell
make -f Makefile.sync apply-patches
```
5. **Regenerate patches and sync**:
```shell
make -f Makefile.sync format-patches sync
```
If there are conflicts, you will see an error message. Resolve the conflicts in `./vendor/`, and continue the patch series with `git am --continue` and rerun `make -f Makefile.sync apply-patches`. Repeat until all patches are successfully applied.
Once all patches are applied, commit the changes to the tracking repository.
```shell
make -f Makefile.sync format-patches sync
```
### Generating Patches

llama/build-info.cpp (generated, vendored)

@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "b1377188784f9aea26b8abde56d4aee8c733eec7";
char const *LLAMA_COMMIT = "ec98e2002";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";


@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
case GGML_SCHED_PRIO_REALTIME: p = -20; break;
}
if (setpriority(PRIO_PROCESS, 0, p) != 0) {
if (!setpriority(PRIO_PROCESS, 0, p)) {
LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
return false;
}
@@ -1078,15 +1078,12 @@ struct common_init_result::impl {
impl() = default;
~impl() = default;
// note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
llama_model_ptr model;
llama_context_ptr context;
std::vector<llama_adapter_lora_ptr> lora;
std::vector<common_sampler_ptr> samplers;
std::vector<llama_sampler_seq_config> samplers_seq_config;
};
common_init_result::common_init_result(common_params & params) :
@@ -1095,9 +1092,9 @@ common_init_result::common_init_result(common_params & params) :
auto cparams = common_context_params_to_llama(params);
if (params.fit_params) {
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
}
@@ -1110,25 +1107,6 @@ common_init_result::common_init_result(common_params & params) :
const llama_vocab * vocab = llama_model_get_vocab(model);
// load and optionally apply lora adapters (must be loaded before context creation)
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
pimpl->model.reset(model);
return;
}
char buf[1024];
la.ptr = lora.get();
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf;
pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
}
// updates params.sampling
// TODO: fix naming
common_init_sampler_from_model(model, params.sampling);
@@ -1163,19 +1141,10 @@ common_init_result::common_init_result(common_params & params) :
// params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
//}
// init the backend samplers as part of the context creation
pimpl->samplers.resize(cparams.n_seq_max);
pimpl->samplers_seq_config.resize(cparams.n_seq_max);
for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
}
// TODO: temporarily gated behind a flag
if (params.sampling.backend_sampling) {
cparams.samplers = pimpl->samplers_seq_config.data();
cparams.n_samplers = pimpl->samplers_seq_config.size();
}
llama_context * lctx = llama_init_from_model(model, cparams);
@@ -1199,12 +1168,6 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
return pimpl->samplers[seq_id].get();
}
void common_init_result::reset_samplers() {
for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
}
}
std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
return pimpl->lora;
}
@@ -1280,6 +1243,24 @@ common_init_result_ptr common_init_from_params(common_params & params) {
}
}
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
return res;
}
char buf[1024];
la.ptr = lora.get();
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf;
res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
}
if (!params.lora_init_without_apply) {
common_set_adapter_lora(lctx, params.lora_adapters);
}
@@ -1320,9 +1301,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
llama_synchronize(lctx);
llama_perf_context_reset(lctx);
llama_set_warmup(lctx, false);
// reset samplers to reset RNG state after warmup to the seeded state
res->reset_samplers();
}
return res;
@@ -1361,12 +1339,14 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.devices = params.devices.data();
}
mparams.n_gpu_layers = params.n_gpu_layers;
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
mparams.use_direct_io = params.use_direct_io;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;


@@ -80,7 +80,6 @@ int32_t cpu_get_num_math();
//
enum llama_example {
LLAMA_EXAMPLE_DEBUG,
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_COMPLETION,
@@ -217,8 +216,6 @@ struct common_params_sampling {
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
bool backend_sampling = false;
bool has_logit_bias() const {
return !logit_bias.empty();
}
@@ -332,14 +329,12 @@ struct common_params {
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
// margin per device in bytes for fitting parameters to free memory:
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
@@ -375,11 +370,6 @@ struct common_params {
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
// llama-debug specific options
std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
bool save_logits = false; // whether to save logits to files // NOLINT
std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
@@ -430,8 +420,7 @@ struct common_params {
bool kv_unified = false; // enable unified KV cache
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // enable mmap to use filesystem cache
bool use_direct_io = true; // read from disk without buffering for faster model loading
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
@@ -486,8 +475,7 @@ struct common_params {
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
std::vector<std::string> api_keys;
@@ -496,11 +484,8 @@ struct common_params {
std::map<std::string, std::string> default_template_kwargs;
// webui configs
bool webui = true;
std::string webui_config_json;
// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = true;
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;
@@ -700,9 +685,7 @@ struct common_init_result {
llama_model * model();
llama_context * context();
common_sampler * sampler(llama_seq_id seq_id);
void reset_samplers();
std::vector<llama_adapter_lora_ptr> & lora();


@@ -104,9 +104,10 @@ struct ring_buffer {
struct common_sampler {
common_params_sampling params;
struct llama_sampler * grmr;
struct llama_sampler * chain;
bool grammar;
ring_buffer<llama_token> prev;
std::vector<llama_token_data> cur;
@@ -120,34 +121,17 @@ struct common_sampler {
}
void set_logits(struct llama_context * ctx, int idx) {
const float * sampled_probs = llama_get_sampled_probs_ith (ctx, idx);
const float * sampled_logits = llama_get_sampled_logits_ith (ctx, idx);
const llama_token * sampled_ids = llama_get_sampled_candidates_ith(ctx, idx);
const auto * logits = llama_get_logits_ith(ctx, idx);
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
if (sampled_probs) {
const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
cur.resize(sampled_probs_count);
for (uint32_t i = 0; i < sampled_probs_count; ++i) {
cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
}
} else if (sampled_logits) {
const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
cur.resize(sampled_logits_count);
for (uint32_t i = 0; i < sampled_logits_count; i++) {
cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
}
} else {
const auto * logits = llama_get_logits_ith(ctx, idx);
GGML_ASSERT(logits != nullptr);
cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}
cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}
cur_p = { cur.data(), cur.size(), -1, false };
@@ -176,50 +160,45 @@ std::string common_params_sampling::print() const {
return std::string(result);
}
struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) {
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = params.no_perf;
llama_sampler * grmr = nullptr;
llama_sampler * chain = llama_sampler_chain_init(lparams);
bool grammar = false;
std::vector<llama_sampler *> samplers;
if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
grammar = true;
#else
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
} else {
std::vector<std::string> trigger_patterns;
std::vector<std::string> patterns_anywhere;
std::vector<llama_token> trigger_tokens;
for (const auto & trigger : params.grammar_triggers) {
switch (trigger.type) {
case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
{
const auto & word = trigger.value;
trigger_patterns.push_back(regex_escape(word));
patterns_anywhere.push_back(regex_escape(word));
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
{
trigger_patterns.push_back(trigger.value);
patterns_anywhere.push_back(trigger.value);
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
{
const auto & pattern = trigger.value;
std::string anchored = "^$";
if (!pattern.empty()) {
anchored = (pattern.front() != '^' ? "^" : "")
+ pattern
+ (pattern.back() != '$' ? "$" : "");
}
trigger_patterns.push_back(anchored);
trigger_patterns.push_back(trigger.value);
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -233,6 +212,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
}
}
if (!patterns_anywhere.empty()) {
trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
}
std::vector<const char *> trigger_patterns_c;
trigger_patterns_c.reserve(trigger_patterns.size());
for (const auto & regex : trigger_patterns) {
@@ -241,12 +224,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
if (!params.grammar.empty()) {
if (params.grammar_lazy) {
grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size());
samplers.push_back(
llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size()));
} else {
grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
}
grammar = true;
}
}
@@ -315,16 +301,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
llama_sampler_chain_add(chain, smpl);
}
if (grmr && params.backend_sampling) {
LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);
params.backend_sampling = false;
}
auto * result = new common_sampler {
/* .params = */ params,
/* .grmr = */ grmr,
/* .chain = */ chain,
/* .grammar = */ grammar,
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
/* .cur_p = */ {},
@@ -335,7 +315,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
void common_sampler_free(struct common_sampler * gsmpl) {
if (gsmpl) {
llama_sampler_free(gsmpl->grmr);
llama_sampler_free(gsmpl->chain);
delete gsmpl;
@@ -345,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
const auto tm = gsmpl->tm();
if (gsmpl->grmr && accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
}
if (gsmpl->grammar) {
const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
llama_sampler_accept(gsmpl->chain, token);
for (int i = 0; i < n_smpl; i++) {
auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
// the grammar sampler is always the first one
if (i == 0) {
if (accept_grammar) {
llama_sampler_accept(smpl, token);
}
} else {
llama_sampler_accept(smpl, token);
}
}
} else {
llama_sampler_accept(gsmpl->chain, token);
}
gsmpl->prev.push_back(token);
}
@@ -361,8 +353,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
return new common_sampler {
/* .params = */ gsmpl->params,
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
/* .chain = */ llama_sampler_clone(gsmpl->chain),
/* .grammar = */ gsmpl->grammar,
/* .prev = */ gsmpl->prev,
/* .cur = */ gsmpl->cur,
/* .cur_p = */ gsmpl->cur_p,
@@ -418,7 +410,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
return gsmpl->chain;
}
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
llama_synchronize(ctx);
// start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
@@ -426,61 +418,11 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
llama_token id = LLAMA_TOKEN_NULL;
auto & grmr = gsmpl->grmr;
auto & chain = gsmpl->chain;
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
// Check if a backend sampler has already sampled a token in which case we
// return that token id directly.
{
id = llama_get_sampled_token_ith(ctx, idx);
if (id != LLAMA_TOKEN_NULL) {
LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
// TODO: simplify
gsmpl->cur.resize(1);
gsmpl->cur[0] = { id, 0.0f, 1.0f };
cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
return id;
}
}
gsmpl->set_logits(ctx, idx);
if (grammar_first) {
llama_sampler_apply(grmr, &cur_p);
}
llama_sampler_apply(chain, &cur_p);
id = cur_p.data[cur_p.selected].id;
if (grammar_first) {
return id;
}
// check if it the sampled token fits the grammar (grammar-based rejection sampling)
{
llama_token_data single_token_data = { id, 1.0f, 0.0f };
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
llama_sampler_apply(grmr, &single_token_data_array);
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
if (is_valid) {
return id;
}
}
// resampling:
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
gsmpl->set_logits(ctx, idx);
llama_sampler_apply(grmr, &cur_p);
llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
@@ -490,7 +432,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
return id;
}
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
std::vector<llama_token> result;
@@ -498,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
size_t i = 0;
for (; i < draft.size(); i++) {
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
common_sampler_accept(gsmpl, id, true);
@@ -510,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
}
if (i == draft.size()) {
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
common_sampler_accept(gsmpl, id, true);
@@ -520,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
return result;
}
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
std::vector<int> idxs(draft.size() + 1);
for (size_t i = 0; i < idxs.size(); ++i) {
idxs[i] = i;
}
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
}
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {


@@ -36,8 +36,7 @@ struct common_sampler;
// llama_sampler API overloads
// note: can mutate params in some cases
struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params);
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
void common_sampler_free(struct common_sampler * gsmpl);
@@ -49,7 +48,6 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
// arguments can be nullptr to skip printing
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
// get the underlying llama_sampler_chain
struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
// extended sampling implementation:
@@ -59,10 +57,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
// generalized version of common_sampler_sample
//
@@ -80,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
//
// returns at least 1 token, up to idxs.size()
//
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
// assume idxs == [ 0, 1, 2, ..., draft.size() ]
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
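The comments in this header describe the extended sampling flow: sample with the chain, check the token against the grammar, and resample through the grammar on rejection. As a hedged illustration of how the `grammar_first`-free signatures above might be driven from a plain generation loop, the sketch below uses only calls visible in this diff plus a few standard llama.cpp functions (`llama_decode`, `llama_batch_get_one`, `llama_vocab_is_eog`) that are assumed to be present in the vendored headers; it is not taken from the ollama integration.

```cpp
// Hedged sketch: a plain generation loop driving the grammar_first-free variant of
// common_sampler_sample shown above. Assumes a loaded model/context; llama_decode,
// llama_batch_get_one and llama_vocab_is_eog are standard llama.cpp calls assumed
// to be available in the vendored headers.
#include "common.h"
#include "sampling.h"

static void generate_n(llama_model * model, llama_context * ctx,
                       const common_params_sampling & sparams,
                       llama_token first, int n_predict) {
    common_sampler * smpl = common_sampler_init(model, sparams);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_token cur = first;
    for (int i = 0; i < n_predict; ++i) {
        llama_batch batch = llama_batch_get_one(&cur, 1);
        if (llama_decode(ctx, batch) != 0) {
            break;
        }
        // grammar checking and resampling happen inside common_sampler_sample
        cur = common_sampler_sample(smpl, ctx, /* idx = */ -1);
        common_sampler_accept(smpl, cur, /* accept_grammar = */ true);
        if (llama_vocab_is_eog(vocab, cur)) {
            break;
        }
    }
    common_sampler_free(smpl);
}
```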


@@ -286,7 +286,7 @@ extern "C" {
// NULL-terminated list of buffer types to use for tensors that match a pattern
const struct llama_model_tensor_buft_override * tensor_buft_overrides;
int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
@@ -309,7 +309,6 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
bool use_direct_io; // use direct io, takes precedence over use_mmap
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
@@ -317,11 +316,6 @@ extern "C" {
bool no_alloc; // only load metadata and simulate memory allocations
};
struct llama_sampler_seq_config {
llama_seq_id seq_id;
struct llama_sampler * sampler;
};
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
// https://github.com/ggml-org/llama.cpp/pull/7544
struct llama_context_params {
@@ -370,12 +364,6 @@ extern "C" {
bool kv_unified; // use a unified buffer across the input sequences when computing the attention
// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
// ref: https://github.com/ggml-org/llama.cpp/pull/14363
// [EXPERIMENTAL]
// backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
// note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
struct llama_sampler_seq_config * samplers;
size_t n_samplers;
};
// model quantization parameters
@@ -479,23 +467,16 @@ extern "C" {
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
enum llama_params_fit_status {
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occured, e.g. because no model could be found at the specified path
};
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
// - returns true if the parameters could be successfully modified to fit device memory
// - this function is NOT thread safe because it modifies the global llama logger state
// - only parameters that have the same value as in llama_default_model_params are modified
LLAMA_API enum llama_params_fit_status llama_params_fit(
// returns true if the parameters could be successfully modified to fit device memory
// this function is NOT thread safe because it modifies the global llama logger state
LLAMA_API bool llama_params_fit(
const char * path_model,
struct llama_model_params * mparams,
struct llama_context_params * cparams,
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
size_t * margins, // margins of memory to leave per device in bytes
size_t margin, // margin of memory to leave per device in bytes
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
@@ -536,7 +517,6 @@ extern "C" {
LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
@@ -620,8 +600,6 @@ extern "C" {
//
// Load a LoRA adapter from file
// The adapter is valid as long as the associated model is not freed
// All adapters must be loaded before context creation
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
struct llama_model * model,
const char * path_lora);
@@ -1005,32 +983,6 @@ extern "C" {
// otherwise: float[n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
//
// backend sampling API [EXPERIMENTAL]
// note: use only if the llama_context was created with at least one llama_sampler_seq_config
//
// Get the backend sampled token for the ith token.
// Returns LLAMA_TOKEN_NULL if no token was sampled.
LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled probabilites for the ith token
// The index matches llama_get_sampled_token_ith().
// Returns NULL if no probabilites were generated.
LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled logits for the ith token
// Returns NULL if no logits were sampled.
LLAMA_API float * llama_get_sampled_logits_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled candidates (token ids) for the ith token
// These are needed to map probability/logit indices to vocab token ids.
// Returns NULL if no candidates were sampled.
LLAMA_API llama_token * llama_get_sampled_candidates_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
//
// Vocab
//
@@ -1202,16 +1154,11 @@ extern "C" {
//
// llama_sampler_free(smpl);
//
// TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
//
typedef void * llama_sampler_context_t;
struct llama_sampler_data {
struct ggml_tensor * logits;
struct ggml_tensor * probs;
struct ggml_tensor * sampled;
struct ggml_tensor * candidates;
};
// user code can implement the interface below in order to create custom llama_sampler
struct llama_sampler_i {
const char * (*name) (const struct llama_sampler * smpl); // can be NULL
@@ -1221,45 +1168,17 @@ extern "C" {
struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL
// [EXPERIMENTAL]
// backend sampling interface:
// return true if the backend supports all ops needed by the sampler
// note: call once per sampler
bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
// call after .backend_apply()
void (*backend_accept)(
struct llama_sampler * smpl,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_tensor * selected_token);
// call after .backend_init()
void (*backend_apply)(
struct llama_sampler * smpl,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct llama_sampler_data * data);
// called before graph execution to set inputs for the current ubatch
void (*backend_set_input)(struct llama_sampler * smpl);
// TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
//void (*apply_ggml) (struct llama_sampler * smpl, ...);
};
struct llama_sampler {
struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
const struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
};
// [EXPERIMENTAL]
// attach a sampler to the context
// note: prefer initializing the context with llama_context_params.samplers when possible
// note: changing the samplers of a context can cause graph reallocations and degraded performance
LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
// mirror of llama_sampler_i:
LLAMA_API struct llama_sampler * llama_sampler_init ( struct llama_sampler_i * iface, llama_sampler_context_t ctx);
LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
@@ -1275,15 +1194,7 @@ extern "C" {
// important: takes ownership of the sampler object and will free it when llama_sampler_free is called
LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl);
// return NULL if:
// - the sampler is NULL
// - the sampler is not a llama_sampler_chain
// - the index is out of bounds, unless i == -1
// - if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
LLAMA_API struct llama_sampler * llama_sampler_chain_get( struct llama_sampler * chain, int32_t i);
// the total number of samplers in the chain
LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
// after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
@@ -1292,9 +1203,7 @@ extern "C" {
// available samplers:
LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
/// seed == LLAMA_DEFAULT_SEED to use a random seed.
LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
/// Setting k <= 0 makes this a noop
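For context on the `llama_params_fit` change earlier in this section: one side of that hunk takes a per-device `margins` array and returns a `llama_params_fit_status`, the other takes a single `margin` and returns `bool`. Below is a hedged sketch of calling the single-margin, bool-returning variant, with buffers sized per the comments in the header; `llama_max_tensor_buft_overrides` is assumed to be a sizing function analogous to `llama_max_devices`, and the model path and 1 GiB margin are placeholder values.

```cpp
// Hedged sketch of the single-margin llama_params_fit variant shown in this section.
// Buffer sizing follows the header comments (at least llama_max_devices and
// llama_max_tensor_buft_overrides elements); path and margin are placeholders.
#include <vector>
#include "llama.h"

static bool fit_to_free_memory(const char * model_path) {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

    const size_t margin = 1024ull * 1024 * 1024; // leave roughly 1 GiB free per device

    return llama_params_fit(model_path, &mparams, &cparams,
                            tensor_split.data(), overrides.data(),
                            margin, /* n_ctx_min = */ 4096,
                            GGML_LOG_LEVEL_ERROR);
}
```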


@@ -146,11 +146,9 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
return nullptr;
}
static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
llama_model & model = adapter.model;
ggml_context * ctx_init;
gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
@@ -413,17 +411,14 @@ static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_l
}
}
// update number of nodes used
model.n_lora_nodes += adapter.get_n_nodes();
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
llama_adapter_lora * adapter = new llama_adapter_lora(*model);
llama_adapter_lora * adapter = new llama_adapter_lora();
try {
llama_adapter_lora_init_impl(path_lora, *adapter);
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
return adapter;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -474,10 +469,6 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
}
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
// update number of nodes used
GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
adapter->model.n_lora_nodes -= adapter->get_n_nodes();
delete adapter;
}


@@ -59,8 +59,6 @@ struct llama_adapter_lora_weight {
};
struct llama_adapter_lora {
llama_model & model;
// map tensor name to lora_a_b
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
@@ -75,14 +73,10 @@ struct llama_adapter_lora {
// activated lora (aLoRA)
std::vector<llama_token> alora_invocation_tokens;
llama_adapter_lora(llama_model & model) : model(model) {}
llama_adapter_lora() = default;
~llama_adapter_lora() = default;
llama_adapter_lora_weight * get_weight(ggml_tensor * w);
uint32_t get_n_nodes() const {
return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
}
};
using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
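The adapter hunks in this section drop the back-reference from `llama_adapter_lora` to its model and leave metadata reading to the caller, as the `common_init_from_params` hunk earlier does. A hedged sketch of that pattern follows, using only calls visible in this diff (`llama_adapter_lora_init`, `llama_adapter_meta_val_str`, `llama_adapter_lora_free`); `llama_set_adapter_lora` is assumed to be the context-level apply call and does not appear in this diff.

```cpp
// Hedged sketch: load a LoRA adapter, read its metadata (as the common code in
// this diff does), and attach it to a context. llama_set_adapter_lora is assumed
// to exist with this signature; the adapter stays valid as long as the model does.
#include <cstdio>
#include "llama.h"

static llama_adapter_lora * attach_lora(llama_model * model, llama_context * ctx,
                                        const char * path, float scale) {
    llama_adapter_lora * adapter = llama_adapter_lora_init(model, path);
    if (adapter == nullptr) {
        fprintf(stderr, "failed to load lora adapter '%s'\n", path);
        return nullptr;
    }

    char buf[1024];
    if (llama_adapter_meta_val_str(adapter, "adapter.lora.task_name", buf, sizeof(buf)) >= 0) {
        printf("lora task name: %s\n", buf);
    }

    llama_set_adapter_lora(ctx, adapter, scale); // assumed apply call, not shown in this diff

    // release later with llama_adapter_lora_free(adapter) once it is no longer needed
    return adapter;
}
```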


@@ -20,7 +20,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_STARCODER, "starcoder" },
{ LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_MODERN_BERT, "modern-bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_NEO_BERT, "neo-bert" },
@@ -42,7 +41,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_PHIMOE, "phimoe" },
{ LLM_ARCH_PLAMO, "plamo" },
{ LLM_ARCH_PLAMO2, "plamo2" },
{ LLM_ARCH_PLAMO3, "plamo3" },
{ LLM_ARCH_CODESHELL, "codeshell" },
{ LLM_ARCH_ORION, "orion" },
{ LLM_ARCH_INTERNLM2, "internlm2" },
@@ -117,9 +115,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_RND1, "rnd1" },
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
{ LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_MIMO2, "mimo2" },
{ LLM_ARCH_LLAMA_EMBED, "llama-embed" },
{ LLM_ARCH_MAINCODER, "maincoder" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -153,7 +148,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
{ LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" },
{ LLM_KV_FEATURES_LENGTH, "%s.features_length" },
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
@@ -211,7 +205,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
@@ -223,7 +216,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
@@ -508,7 +500,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_MISTRAL3:
case LLM_ARCH_LLAMA_EMBED:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
@@ -790,20 +781,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
};
case LLM_ARCH_MODERN_BERT:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_TOKEN_EMBD_NORM,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
};
case LLM_ARCH_JINA_BERT_V2:
return {
LLM_TENSOR_TOKEN_EMBD,
@@ -1083,22 +1060,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_FFN_POST_NORM,
};
case LLM_ARCH_PLAMO3:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
};
case LLM_ARCH_CODESHELL:
return {
LLM_TENSOR_TOKEN_EMBD,
@@ -2079,7 +2040,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM_LFM2,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_DENSE_2_OUT,
};
case LLM_ARCH_LFM2MOE:
return {
@@ -2098,7 +2058,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_SHORTCONV_INPROJ,
LLM_TENSOR_SHORTCONV_OUTPROJ,
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM_LFM2,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
@@ -2214,49 +2174,11 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_VISEXP_FFN_DOWN,
LLM_TENSOR_VISEXP_FFN_UP,
};
case LLM_ARCH_MIMO2:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_SINKS,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
};
case LLM_ARCH_GPTJ:
case LLM_ARCH_UNKNOWN:
return {
LLM_TENSOR_TOKEN_EMBD,
};
case LLM_ARCH_MAINCODER:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
};
case LLM_ARCH_SOLAR:
return {
LLM_TENSOR_TOKEN_EMBD,


@@ -24,7 +24,6 @@ enum llm_arch {
LLM_ARCH_STARCODER,
LLM_ARCH_REFACT,
LLM_ARCH_BERT,
LLM_ARCH_MODERN_BERT,
LLM_ARCH_NOMIC_BERT,
LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_NEO_BERT,
@@ -46,7 +45,6 @@ enum llm_arch {
LLM_ARCH_PHIMOE,
LLM_ARCH_PLAMO,
LLM_ARCH_PLAMO2,
LLM_ARCH_PLAMO3,
LLM_ARCH_CODESHELL,
LLM_ARCH_ORION,
LLM_ARCH_INTERNLM2,
@@ -121,9 +119,6 @@ enum llm_arch {
LLM_ARCH_RND1,
LLM_ARCH_PANGU_EMBED,
LLM_ARCH_MISTRAL3,
LLM_ARCH_MIMO2,
LLM_ARCH_LLAMA_EMBED,
LLM_ARCH_MAINCODER,
LLM_ARCH_UNKNOWN,
};
@@ -157,7 +152,6 @@ enum llm_kv {
LLM_KV_VOCAB_SIZE,
LLM_KV_CONTEXT_LENGTH,
LLM_KV_EMBEDDING_LENGTH,
LLM_KV_EMBEDDING_LENGTH_OUT,
LLM_KV_FEATURES_LENGTH,
LLM_KV_BLOCK_COUNT,
LLM_KV_LEADING_DENSE_BLOCK_COUNT,
@@ -215,7 +209,6 @@ enum llm_kv {
LLM_KV_ATTENTION_GATE_LORA_RANK,
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -227,7 +220,6 @@ enum llm_kv {
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_FREQ_BASE_SWA,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR,


@@ -74,7 +74,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
{ "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
{ "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
};
llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -217,8 +216,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_GROK_2;
} else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
return LLM_CHAT_TEMPLATE_PANGU_EMBED;
} else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
}
return LLM_CHAT_TEMPLATE_UNKNOWN;
}
@@ -848,14 +845,6 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "[unused9]助手:";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
for (auto message : chat) {
std::string role(message->role);
ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
}
if (add_ass) {
ss << "<|begin|>assistant";
}
} else {
// template not supported
return -1;


@@ -54,7 +54,6 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2,
LLM_CHAT_TEMPLATE_PANGU_EMBED,
LLM_CHAT_TEMPLATE_SOLAR_OPEN,
LLM_CHAT_TEMPLATE_UNKNOWN,
};


@@ -60,25 +60,6 @@ llama_context::llama_context(
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
// Initialize backend samplers here so they are part of the sampling graph
// before the reserve passes run later in this function. This avoids a later
// re-reserve when graph nodes change.
if (params.samplers != nullptr && params.n_samplers > 0) {
for (size_t i = 0; i < params.n_samplers; ++i) {
const auto & config = params.samplers[i];
if (llama_sampler_chain_get(config.sampler, -1) == nullptr) {
throw std::runtime_error("the backend samplers must be of type llama_sampler_chain");
}
if (set_sampler(config.seq_id, config.sampler)) {
const int n_samplers = llama_sampler_chain_n(config.sampler);
LLAMA_LOG_INFO("%s: setting backend sampler for seq_id %d (n = %d)\n", __func__, config.seq_id, n_samplers);
}
}
}
auto rope_scaling_type = params.rope_scaling_type;
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
rope_scaling_type = hparams.rope_scaling_type_train;
@@ -250,10 +231,7 @@ llama_context::llama_context(
// graph outputs buffer
{
// resized during inference when a batch uses more outputs
// Create a dummy batch for initialization.
llama_batch dummy_batch = {};
dummy_batch.n_tokens = 0;
if (output_reserve(params.n_seq_max, dummy_batch) < params.n_seq_max) {
if (output_reserve(params.n_seq_max) < params.n_seq_max) {
throw std::runtime_error("failed to reserve initial output buffer");
}
@@ -316,8 +294,8 @@ llama_context::llama_context(
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
model.n_devices() > 1 &&
model.n_gpu_layers() > model.hparams.n_layer &&
model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
model.params.n_gpu_layers > (int) model.hparams.n_layer &&
model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
cparams.offload_kqv &&
!model.has_tensor_overrides();
@@ -478,35 +456,26 @@ llama_context::llama_context(
LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
}
}
// Initialize the full vocabulary token ids for backend samplers.
{
const int n_vocab = model.vocab.n_tokens();
sampling.token_ids_full_vocab.resize(n_vocab);
for (int i = 0; i < n_vocab; ++i) {
sampling.token_ids_full_vocab[i] = i;
}
}
}
llama_context::~llama_context() {
if (!model.hparams.no_alloc) {
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
ggml_backend_t backend = backend_ptrs[i];
ggml_backend_buffer_type_t buft = backend_buft[i];
// FIXME this currently results in a use-after-free bug if the model is freed before the context
// if (!model.hparams.no_alloc) {
// for (size_t i = 0; i < backend_ptrs.size(); ++i) {
// ggml_backend_t backend = backend_ptrs[i];
// ggml_backend_buffer_type_t buft = backend_buft[i];
const size_t size_exp = backend_buf_exp_size[i];
const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
if (size_exp == size_act) {
LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
__func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
} else {
LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
__func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
}
}
}
// const size_t size_exp = backend_buf_exp_size[i];
// const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
// if (size_exp == size_act) {
// LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
// } else {
// LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
// }
// }
// }
ggml_opt_free(opt_ctx);
}
@@ -648,35 +617,6 @@ float * llama_context::get_logits() {
return logits;
}
int64_t llama_context::output_resolve_row(int32_t i) const {
int64_t j = -1;
// support negative indices (last output row)
if (i < 0) {
j = n_outputs + i;
if (j < 0) {
throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
}
} else if ((size_t) i >= output_ids.size()) {
throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
} else {
// use output_ids to translate the batch token index into a row number
// that holds this token's data.
j = output_ids[i];
}
if (j < 0) {
// the batch token was not configured to output anything
throw std::runtime_error(format("batch.logits[%d] != true", i));
}
if (j >= n_outputs) {
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
}
return j;
}
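// A minimal standalone sketch (hypothetical data, not the real llama_context
// state) of what the removed output_resolve_row() does: a negative index counts
// back from the last output row, a non-negative index is translated through
// output_ids, and tokens that never requested output resolve to -1 and throw.
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

static int64_t resolve_row_sketch(int32_t i, const std::vector<int32_t> & output_ids, int32_t n_outputs) {
    if (i < 0) {
        const int64_t j = n_outputs + i; // e.g. i == -1 selects the last output row
        if (j < 0) {
            throw std::out_of_range("negative index out of range");
        }
        return j;
    }
    if ((size_t) i >= output_ids.size()) {
        throw std::out_of_range("index out of range");
    }
    const int64_t j = output_ids[i]; // batch token index -> output row
    if (j < 0) {
        throw std::out_of_range("token did not request output");
    }
    if (j >= n_outputs) {
        throw std::out_of_range("corrupt output buffer");
    }
    return j;
}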
float * llama_context::get_logits_ith(int32_t i) {
int64_t j = -1;
@@ -687,7 +627,6 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error("no logits");
}
// TODO: use output_resolve_row()
if (i < 0) {
j = n_outputs + i;
if (j < 0) {
@@ -724,10 +663,6 @@ float * llama_context::get_embeddings() {
return embd;
}
llama_token * llama_context::get_sampled_tokens() const{
return sampling.sampled;
}
float * llama_context::get_embeddings_ith(int32_t i) {
int64_t j = -1;
@@ -738,7 +673,6 @@ float * llama_context::get_embeddings_ith(int32_t i) {
throw std::runtime_error("no embeddings");
}
// TODO: use output_resolve_row()
if (i < 0) {
j = n_outputs + i;
if (j < 0) {
@@ -758,8 +692,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
}
const uint32_t n_embd_out = model.hparams.get_n_embd_out();
return embd + j*n_embd_out;
return embd + j*model.hparams.n_embd;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -779,136 +712,6 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) {
return it->second.data();
}
llama_token llama_context::get_sampled_token_ith(int32_t idx) {
output_reorder();
if (sampling.sampled == nullptr) {
return LLAMA_TOKEN_NULL;
}
try {
const int64_t row = output_resolve_row(idx);
GGML_ASSERT(row < (int64_t) sampling.sampled_size);
return sampling.sampled[row];
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled token id %d, reason: %s\n", __func__, idx, err.what());
return LLAMA_TOKEN_NULL;
}
}
float * llama_context::get_sampled_probs_ith(int32_t idx) {
output_reorder();
if (sampling.probs == nullptr) {
return nullptr;
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.probs_count.size() || sampling.probs_count[row] == 0) {
return nullptr;
}
return sampling.probs + row*model.vocab.n_tokens();
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled probs id %d, reason: %s\n", __func__, idx, err.what());
return nullptr;
}
}
float * llama_context::get_sampled_logits_ith(int32_t idx) {
output_reorder();
if (sampling.logits == nullptr) {
return nullptr;
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.logits_count.size() || sampling.logits_count[row] == 0) {
return nullptr;
}
return sampling.logits + row*model.vocab.n_tokens();
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled logits id %d, reason: %s\n", __func__, idx, err.what());
return nullptr;
}
}
const llama_token * llama_context::get_sampled_candidates_ith(int32_t idx) {
output_reorder();
try {
const int64_t row = output_resolve_row(idx);
if (sampling.candidates != nullptr &&
(size_t) row < sampling.candidates_count.size() &&
sampling.candidates_count[row] > 0) {
return sampling.candidates + row*model.vocab.n_tokens();
}
} catch (const std::exception & err) {
// fallback to full vocab list
}
return sampling.token_ids_full_vocab.data();
}
size_t llama_context::get_sampled_candidates_count(int32_t idx) {
output_reorder();
if (sampling.candidates == nullptr) {
return 0;
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.candidates_count.size()) {
return 0;
}
return sampling.candidates_count[row];
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled candidates count id %d, reason: %s\n", __func__, idx, err.what());
return 0;
}
}
size_t llama_context::get_sampled_logits_count(int32_t idx) {
output_reorder();
if (sampling.logits == nullptr) {
return model.vocab.n_tokens();
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.logits_count.size()) {
return 0;
}
return sampling.logits_count[row];
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled logits count id %d, reason: %s\n", __func__, idx, err.what());
return 0;
}
}
size_t llama_context::get_sampled_probs_count(int32_t idx) {
output_reorder();
if (sampling.probs == nullptr) {
return 0;
}
try {
const int64_t row = output_resolve_row(idx);
if ((size_t) row >= sampling.probs_count.size()) {
return 0;
}
return sampling.probs_count[row];
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid backend sampled probs count id %d, reason: %s\n", __func__, idx, err.what());
return 0;
}
}
void llama_context::attach_threadpool(
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch) {
@@ -965,42 +768,6 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
}
bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
const bool can_offload =
sampler &&
sampler->iface->backend_init &&
sampler->iface->backend_apply &&
llama_sampler_chain_n(sampler) > 0;
if (sampler && can_offload) {
ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(model.dev_output());
auto * host_buft = ggml_backend_dev_host_buffer_type(model.dev_output());
if (host_buft) {
buft = host_buft;
}
sampler->iface->backend_init(sampler, buft);
sampling.samplers[seq_id] = sampler;
return true;
}
if (sampler && !can_offload) {
LLAMA_LOG_WARN("%s: sampler '%s' for seq_id = %d, cannot be offloaded to the backend\n", __func__, llama_sampler_name(sampler), seq_id);
sampling.samplers.erase(seq_id);
return false;
}
sampling.samplers.erase(seq_id);
return true;
}
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@@ -1141,7 +908,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
n_queued_tokens += n_tokens;
// reserve output buffer
if (output_reserve(n_tokens, batch_inp) < n_tokens) {
if (output_reserve(n_tokens) < n_tokens) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
return -2;
};
@@ -1195,10 +962,9 @@ int llama_context::encode(const llama_batch & batch_inp) {
{
// extract token embeddings
GGML_ASSERT(embd != nullptr);
const uint32_t n_embd_out = hparams.get_n_embd_out();
GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float));
GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
} break;
case LLAMA_POOLING_TYPE_MEAN:
case LLAMA_POOLING_TYPE_CLS:
@@ -1266,112 +1032,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
return 0;
}
static std::map<llama_seq_id, uint32_t> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) {
std::map<llama_seq_id, uint32_t> seq_to_row;
// how many output tokens we have seen so far for this ubatch.
uint32_t local = 0;
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
// skip tokens that are not output.
if (!ubatch.output[i]) {
continue;
}
const llama_seq_id seq_id = ubatch.seq_id[i][0];
// row_offset is the number of output tokens before this ubatch.
seq_to_row[seq_id] = row_offset + local;
++local;
}
return seq_to_row;
}
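// Worked example (hypothetical ubatch) for the helper above: with n_tokens = 4,
// output = {0, 1, 0, 1}, seq_id[1][0] = 0, seq_id[3][0] = 2 and row_offset = 5,
// the resulting map is { 0 -> 5, 2 -> 6 }, i.e. each sequence that produced an
// output token in this ubatch gets the global output row that token occupies.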
static void copy_tensor_async_ints(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
llama_token * sampled,
size_t sampled_size,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
ggml_backend_sched_t sched) {
if (sampled == nullptr) {
return;
}
for (const auto & [seq_id, tensor] : tensor_map) {
auto it = seq_to_row.find(seq_id);
if (it == seq_to_row.end()) {
continue;
}
const uint32_t row = it->second;
GGML_ASSERT(row < sampled_size);
GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
ggml_backend_tensor_get_async(backend, tensor, sampled + row, 0, sizeof(sampled[row]));
}
}
static void copy_tensor_async_floats(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
float * dst,
size_t stride,
std::vector<uint32_t> & counts,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
ggml_backend_sched_t sched) {
if (dst == nullptr) {
return;
}
for (const auto & [seq_id, tensor] : tensor_map) {
auto it = seq_to_row.find(seq_id);
if (it == seq_to_row.end()) {
continue;
}
const uint32_t row = it->second;
GGML_ASSERT(row < counts.size());
GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
float * row_ptr = dst + (size_t) row * stride;
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
// Update the actual number of logits/probabilities that were written for this row.
counts[row] = ggml_nelements(tensor);
}
}
static void copy_tensor_async_candidates(
const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
llama_token * dst,
size_t stride,
std::vector<uint32_t> & counts,
const std::map<llama_seq_id, uint32_t> & seq_to_row,
ggml_backend_sched_t sched) {
if (dst == nullptr) {
return;
}
for (const auto & [seq_id, tensor] : tensor_map) {
auto it = seq_to_row.find(seq_id);
if (it == seq_to_row.end()) {
continue;
}
const uint32_t row = it->second;
GGML_ASSERT(row < counts.size());
GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
llama_token * row_ptr = dst + (size_t) row * stride;
ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
// Update the actual number of candidates that were written.
counts[row] = ggml_nelements(tensor);
}
}
int llama_context::decode(const llama_batch & batch_inp) {
GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
@@ -1392,35 +1052,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_embd = hparams.n_embd_inp();
const bool output_all = false;
const bool has_samplers = !sampling.samplers.empty();
const uint32_t n_seq_max = cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max;
// TODO: avoid this workaround in the future
if (has_samplers && batch_inp.logits) {
std::vector<int32_t> seq_output_count(n_seq_max, 0);
for (int32_t i = 0; i < batch_inp.n_tokens; ++i) {
if (batch_inp.logits[i] == 0) {
continue;
}
const int ns = batch_inp.n_seq_id ? batch_inp.n_seq_id[i] : 1;
for (int32_t s = 0; s < ns; ++s) {
const llama_seq_id seq_id = batch_inp.seq_id ? batch_inp.seq_id[i][s] : 0;
seq_output_count[seq_id]++;
if (seq_output_count[seq_id] > 1) {
LLAMA_LOG_ERROR("%s: backend sampling requires at most one output token per sequence (seq_id %d had %d)\n",
__func__, seq_id, seq_output_count[seq_id]);
return -1;
}
}
}
}
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, n_seq_max, output_all)) {
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
return -1;
}
@@ -1501,7 +1134,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
// reserve output buffer
if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
if (output_reserve(n_outputs_all) < n_outputs_all) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
return -2;
};
@@ -1574,10 +1207,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
// extract logits
// For multi-sequence batches that mix backend samplers and CPU sampler
// this is currently inefficient as we copy all logits even for the
// backend sampled tokens.
if (logits && t_logits && n_outputs > 0) {
if (t_logits && n_outputs > 0) {
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
GGML_ASSERT(backend_res != nullptr);
GGML_ASSERT(logits != nullptr);
@@ -1592,7 +1222,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
// extract embeddings
if (embd && t_embd && n_outputs > 0) {
if (t_embd && n_outputs > 0) {
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
GGML_ASSERT(backend_embd != nullptr);
@@ -1601,13 +1231,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
{
// extract token embeddings
GGML_ASSERT(embd != nullptr);
const uint32_t n_embd_out = hparams.get_n_embd_out();
float * embd_out = embd + n_outputs_prev*n_embd_out;
float * embd_out = embd + n_outputs_prev*n_embd;
if (n_outputs) {
GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd_out <= (int64_t) embd_size);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd_out*sizeof(float));
GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float));
}
} break;
case LLAMA_POOLING_TYPE_MEAN:
@@ -1647,22 +1276,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
}
// This flag indicates whether a backend sampler has actually sampled a specific
// token, or if it has produced probabilities. If true, we can skip the normal copying of logits and embeddings.
const bool has_sampled = !res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty();
if (has_samplers && has_sampled) {
const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev);
const auto stride = n_vocab;
// async copy the sampling data from the backend to the host
copy_tensor_async_ints(res->t_sampled, sampling.sampled, sampling.sampled_size, seq_to_output_row, sched.get());
copy_tensor_async_floats (res->t_sampled_logits, sampling.logits, stride, sampling.logits_count, seq_to_output_row, sched.get());
copy_tensor_async_floats (res->t_sampled_probs, sampling.probs, stride, sampling.probs_count, seq_to_output_row, sched.get());
copy_tensor_async_candidates(res->t_candidates, sampling.candidates, stride, sampling.candidates_count, seq_to_output_row, sched.get());
}
n_outputs_prev += n_outputs;
} while (mctx->next());
@@ -1726,15 +1339,15 @@ int llama_context::decode(const llama_batch & batch_inp) {
// output
//
uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & batch) {
uint32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
const auto & vocab = model.vocab;
const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
const auto n_batch = cparams.n_batch;
const auto n_vocab = vocab.n_tokens();
const auto n_embd_out = hparams.get_n_embd_out();
const auto n_batch = cparams.n_batch;
const auto n_vocab = vocab.n_tokens();
const auto n_embd = hparams.n_embd;
bool has_logits = true;
bool has_embd = cparams.embeddings;
@@ -1745,53 +1358,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
has_embd = true;
}
// Check which sampling modes are needed for the current batch.
// TODO: avoid this branching by working with the worst-case
bool has_sampling = false;
bool cpu_logits = false;
if (batch.logits) {
for (int32_t i = 0; i < batch.n_tokens; i++) {
if (!batch.logits[i]) {
continue;
}
for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
llama_seq_id seq_id = batch.seq_id[i][j];
if (sampling.samplers.find(seq_id) != sampling.samplers.end()) {
has_sampling = true;
} else {
cpu_logits = true;
}
}
}
} else {
// When batch.logits is nullptr (when loading state with a dummy batch),
// allocate CPU logits.
cpu_logits = true;
}
size_t backend_float_count = 0;
size_t backend_token_count = 0;
// Allocate CPU logits buffer only if needed by sequences in this batch
logits_size = (has_logits && cpu_logits) ? n_vocab*n_outputs_max : 0;
embd_size = has_embd ? n_embd_out*n_outputs_max : 0;
// TODO: avoid this branching by working with the worst-case
if (!has_sampling) {
sampling.logits_size = 0;
sampling.probs_size = 0;
sampling.sampled_size = 0;
sampling.candidates_size = 0;
} else {
sampling.logits_size = n_vocab*n_outputs_max;
sampling.probs_size = n_vocab*n_outputs_max;
sampling.sampled_size = n_outputs_max;
sampling.candidates_size = n_vocab*n_outputs_max;
backend_float_count = sampling.logits_size + sampling.probs_size;
backend_token_count = sampling.sampled_size + sampling.candidates_size;
}
logits_size = has_logits ? n_vocab*n_outputs_max : 0;
embd_size = has_embd ? n_embd*n_outputs_max : 0;
if (output_ids.empty()) {
// init, never resized afterwards
@@ -1799,9 +1367,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
}
const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
const size_t new_size =
(logits_size + embd_size + backend_float_count) * sizeof(float) +
( backend_token_count) * sizeof(llama_token);
const size_t new_size = (logits_size + embd_size) * sizeof(float);
// alloc only when more than the current capacity is required
// TODO: also consider shrinking the buffer
@@ -1809,11 +1375,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
if (buf_output) {
#ifndef NDEBUG
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
LLAMA_LOG_DEBUG("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
synchronize();
// TODO: not needed?
buf_output = nullptr;
logits = nullptr;
embd = nullptr;
@@ -1835,49 +1399,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get());
logits = nullptr;
embd = nullptr;
size_t offset = 0;
uint8_t * base = (uint8_t *) output_base;
logits = (has_logits && cpu_logits) ? output_base : nullptr;
offset += logits_size * sizeof(float);
embd = has_embd ? (float *) (base + offset) : nullptr;
offset += embd_size * sizeof(float);
sampling.logits = nullptr;
sampling.probs = nullptr;
sampling.sampled = nullptr;
sampling.candidates = nullptr;
if (has_sampling) {
sampling.logits = (float *) (base + offset);
offset += sampling.logits_size * sizeof(float);
sampling.probs = (float *) (base + offset);
offset += sampling.probs_size * sizeof(float);
sampling.sampled = (llama_token *) (base + offset);
offset += sampling.sampled_size * sizeof(llama_token);
sampling.candidates = (llama_token *) (base + offset);
offset += sampling.candidates_size * sizeof(llama_token);
// The count vectors keep track of the actual number of logits/probs/candidates
// copied from the backend for each output row.
sampling.logits_count.resize(n_outputs_max);
sampling.probs_count.resize(n_outputs_max);
sampling.candidates_count.resize(n_outputs_max);
std::fill(sampling.logits_count.begin(), sampling.logits_count.end(), 0);
std::fill(sampling.probs_count.begin(), sampling.probs_count.end(), 0);
std::fill(sampling.candidates_count.begin(), sampling.candidates_count.end(), 0);
std::fill_n(sampling.sampled, sampling.sampled_size, LLAMA_TOKEN_NULL);
}
logits = has_logits ? output_base : nullptr;
embd = has_embd ? output_base + logits_size : nullptr;
// set all ids as invalid (negative)
std::fill(output_ids.begin(), output_ids.end(), -1);
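// A minimal standalone sketch (hypothetical sizes, plain malloc instead of a
// ggml backend buffer, int32_t standing in for llama_token) of the offset
// arithmetic used above to carve the CPU logits, embedding and backend-sampling
// regions out of one contiguous allocation: float regions first, token ids last.
#include <cstddef>
#include <cstdint>
#include <cstdlib>

static void carve_output_buffers_sketch() {
    const size_t n_vocab = 8, n_embd = 4, n_outputs_max = 2; // hypothetical sizes

    const size_t logits_size  = n_vocab * n_outputs_max; // floats
    const size_t embd_size    = n_embd  * n_outputs_max; // floats
    const size_t sampled_size = n_outputs_max;           // token ids

    const size_t new_size = (logits_size + embd_size) * sizeof(float) + sampled_size * sizeof(int32_t);
    uint8_t * base = (uint8_t *) std::malloc(new_size);

    size_t offset = 0;
    float   * logits  = (float   *) (base + offset); offset += logits_size  * sizeof(float);
    float   * embd    = (float   *) (base + offset); offset += embd_size    * sizeof(float);
    int32_t * sampled = (int32_t *) (base + offset); offset += sampled_size * sizeof(int32_t);

    (void) logits; (void) embd; (void) sampled;
    std::free(base);
}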
@@ -1906,40 +1429,6 @@ void llama_context::output_reorder() {
std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
}
}
if (sampling.logits && sampling.logits_size > 0) {
for (uint64_t k = 0; k < n_vocab; ++k) {
std::swap(sampling.logits[i0*n_vocab + k], sampling.logits[i1*n_vocab + k]);
}
}
if (sampling.probs && sampling.probs_size > 0) {
for (uint64_t k = 0; k < n_vocab; ++k) {
std::swap(sampling.probs[i0*n_vocab + k], sampling.probs[i1*n_vocab + k]);
}
}
if (sampling.candidates && sampling.candidates_size > 0) {
for (uint64_t k = 0; k < n_vocab; ++k) {
std::swap(sampling.candidates[i0*n_vocab + k], sampling.candidates[i1*n_vocab + k]);
}
}
if (sampling.sampled && sampling.sampled_size > 0) {
std::swap(sampling.sampled[i0], sampling.sampled[i1]);
}
if (!sampling.logits_count.empty()) {
std::swap(sampling.logits_count[i0], sampling.logits_count[i1]);
}
if (!sampling.probs_count.empty()) {
std::swap(sampling.probs_count[i0], sampling.probs_count[i1]);
}
if (!sampling.candidates_count.empty()) {
std::swap(sampling.candidates_count[i0], sampling.candidates_count[i1]);
}
}
output_swaps.clear();
@@ -1953,9 +1442,7 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
if (model.arch == LLM_ARCH_QWEN3NEXT) {
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
}
uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
res += model.n_lora_nodes;
return res;
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
}
llm_graph_result * llama_context::get_gf_res_reserve() const {
@@ -1969,7 +1456,7 @@ ggml_cgraph * llama_context::graph_reserve(
if (n_tokens % n_seqs != 0) {
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
n_outputs = std::max(n_outputs, n_tokens);
n_outputs = std::min(n_outputs, n_tokens);
LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
}
@@ -1988,15 +1475,6 @@ ggml_cgraph * llama_context::graph_reserve(
llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
// set one output token per sequence in order to activate all backend samplers
std::vector<llama_seq_id> seq_ids(n_seqs);
for (uint32_t i = 0; i < n_seqs; ++i) {
seq_ids[i] = i;
ubatch.n_seq_id[i] = 1;
ubatch.seq_id[i] = &seq_ids[i];
ubatch.output[i] = true;
}
auto * res = gf_res_reserve.get();
const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
@@ -2027,7 +1505,7 @@ llm_graph_params llama_context::graph_params(
llm_graph_result * res,
const llama_ubatch & ubatch,
const llama_memory_context_i * mctx,
llm_graph_type gtype) const {
llm_graph_type gtype) const {
return {
/*.arch =*/ model.arch,
/*.hparams =*/ model.hparams,
@@ -2040,7 +1518,6 @@ llm_graph_params llama_context::graph_params(
/*.loras =*/ &loras,
/*.mctx =*/ mctx,
/*.cross =*/ &cross,
/*.samplers =*/ sampling.samplers,
/*.n_outputs =*/ n_outputs,
/*.cb =*/ graph_get_cb(),
/*.res =*/ res,
@@ -2093,7 +1570,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
if (ubatch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
const auto & dev_layer = model.dev_layer(il);
@@ -2496,9 +1973,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
}
}
// TODO: handle sampling buffers and samplers state ?
// https://github.com/ggml-org/llama.cpp/pull/17004
if (memory != nullptr) {
LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
memory->state_write(io);
@@ -2531,10 +2005,7 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
auto n_outputs = this->n_outputs;
io.read_to(&n_outputs, sizeof(n_outputs));
// Create a dummy batch for state loading.
llama_batch dummy_batch = {};
dummy_batch.n_tokens = 0;
if (n_outputs > output_reserve(n_outputs, dummy_batch)) {
if (n_outputs > output_reserve(n_outputs)) {
throw std::runtime_error("could not reserve outputs");
}
@@ -2588,9 +2059,6 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
}
}
// TODO: handle sampling buffers and samplers state ?
// https://github.com/ggml-org/llama.cpp/pull/17004
if (memory) {
LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);
@@ -2779,7 +2247,7 @@ void llama_context::opt_epoch_iter(
}
// reserve output buffer
if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
if (output_reserve(n_outputs_all) < n_outputs_all) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
GGML_ABORT("TODO: handle this error");
};
@@ -2924,8 +2392,6 @@ llama_context_params llama_context_default_params() {
/*.op_offload =*/ true,
/*.swa_full =*/ true,
/*.kv_unified =*/ false,
/*.sampler =*/ nullptr,
/*.n_sampler =*/ 0,
};
return result;
@@ -3085,15 +2551,7 @@ float * llama_get_logits(llama_context * ctx) {
float * llama_get_logits_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
float * res = nullptr;
res = ctx->get_sampled_logits_ith(i);
if (!res) {
res = ctx->get_logits_ith(i);
}
return res;
return ctx->get_logits_ith(i);
}
float * llama_get_embeddings(llama_context * ctx) {
@@ -3114,52 +2572,6 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
return ctx->get_embeddings_seq(seq_id);
}
bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
return ctx->set_sampler(seq_id, smpl);
}
llama_token llama_get_sampled_token_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return ctx->get_sampled_token_ith(i);
}
float * llama_get_sampled_probs_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return ctx->get_sampled_probs_ith(i);
}
float * llama_get_sampled_logits_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return ctx->get_sampled_logits_ith(i);
}
llama_token * llama_get_sampled_candidates_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return const_cast<llama_token *>(ctx->get_sampled_candidates_ith(i));
}
uint32_t llama_get_sampled_candidates_count_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return static_cast<uint32_t>(ctx->get_sampled_candidates_count(i));
}
uint32_t llama_get_sampled_logits_count_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return static_cast<uint32_t>(ctx->get_sampled_logits_count(i));
}
uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
}
// llama adapter API
int32_t llama_set_adapter_lora(

View File

@@ -70,18 +70,6 @@ struct llama_context {
float * get_embeddings_ith(int32_t i);
float * get_embeddings_seq(llama_seq_id seq_id);
llama_token * get_sampled_tokens() const;
llama_token get_sampled_token_ith(int32_t idx);
float * get_sampled_logits_ith(int32_t idx);
size_t get_sampled_logits_count(int32_t idx);
float * get_sampled_probs_ith(int32_t idx);
size_t get_sampled_probs_count(int32_t idx);
const llama_token * get_sampled_candidates_ith(int32_t idx);
size_t get_sampled_candidates_count(int32_t idx);
void attach_threadpool(
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch);
@@ -204,13 +192,10 @@ private:
// Make sure enough space is available for outputs.
// Returns max number of outputs for which space was reserved.
uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch);
uint32_t output_reserve(int32_t n_outputs);
void output_reorder();
// map the batch token index `i` to its row in the output buffer
int64_t output_resolve_row(int32_t i) const;
//
// graph
//
@@ -228,8 +213,6 @@ public:
ggml_cgraph * graph_reserve(
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
private:
llm_graph_params graph_params(
llm_graph_result * res,
@@ -269,31 +252,6 @@ private:
size_t embd_size = 0; // capacity (of floats) for embeddings
float * embd = nullptr;
// TODO: simplify
struct sampling_info {
std::map<llama_seq_id, llama_sampler *> samplers;
float * logits = nullptr;
size_t logits_size = 0;
llama_token * sampled = nullptr;
size_t sampled_size = 0;
float * probs = nullptr;
size_t probs_size = 0;
llama_token * candidates = nullptr;
size_t candidates_size = 0;
std::vector<uint32_t> logits_count;
std::vector<uint32_t> probs_count;
std::vector<uint32_t> candidates_count;
std::vector<llama_token> token_ids_full_vocab;
};
sampling_info sampling;
// sequence embeddings output (map of [n_embd] vectors)
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
std::map<llama_seq_id, std::vector<float>> embd_seq;

View File

@@ -369,44 +369,6 @@ static void print_rule(
fprintf(file, "\n");
}
//
// Regex utilities
//
size_t llama_grammar_trigger_pattern::find(const std::string & input) const {
auto find_start_pos = [](const std::smatch & match) {
// get from the first matched capturing group to the end of the string
size_t start = std::string::npos;
for (auto i = 1u; i < match.size(); i++) {
if (match.length(i) > 0) {
start = match.position(i);
break;
}
}
if (start == std::string::npos) {
start = match.position(0);
}
return start;
};
if (!pattern.empty() && pattern.front() == '^' && pattern.back() == '$') {
// match against the entire input
std::smatch match;
if (std::regex_match(input, match, regex)) {
return find_start_pos(match);
}
}
// search anywhere
std::smatch match;
if (std::regex_search(input, match, regex)) {
return find_start_pos(match);
}
return std::string::npos;
}
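// A small standalone illustration (hypothetical pattern and input) of the
// behaviour encoded above: fully anchored patterns ("^...$") are tested against
// the whole buffered text with std::regex_match, anything else may fire
// anywhere via std::regex_search, and the reported start position is taken from
// the first non-empty capturing group when one exists (falling back to the whole match).
#include <cstdio>
#include <regex>
#include <string>

int main() {
    const std::string input = "some text <tool_call>{\"a\":1}";
    const std::regex  re("(<tool_call>)"); // unanchored -> regex_search path
    std::smatch m;
    if (std::regex_search(input, m, re)) {
        std::printf("trigger starts at %ld\n", (long) m.position(1));
    }
    return 0;
}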
//
// implementation
//
@@ -1359,10 +1321,21 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
grammar.trigger_buffer += piece;
std::smatch match;
for (const auto & trigger_pattern : grammar.trigger_patterns) {
auto start = trigger_pattern.find(grammar.trigger_buffer);
if (start != std::string::npos) {
if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
grammar.awaiting_trigger = false;
// get from the first matched capturing group to the end of the string
size_t start = std::string::npos;
for (auto i = 1u; i < match.size(); i++) {
if (match.length(i) > 0) {
start = match.position(i);
break;
}
}
if (start == std::string::npos) {
start = match.position(0);
}
// replay tokens that overlap with [start, end)
for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {

View File

@@ -130,8 +130,6 @@ struct llama_grammar_parser {
struct llama_grammar_trigger_pattern {
std::string pattern;
std::regex regex;
size_t find(const std::string & input) const;
};
struct llama_grammar {

View File

@@ -12,7 +12,6 @@
#include <cassert>
#include <cmath>
#include <cstring>
#include <unordered_set>
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
if (ubatch->token) {
@@ -33,7 +32,7 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
bool res = true;
res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens);
res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens);
return res;
}
@@ -63,7 +62,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
bool res = true;
res &= pos->ne[0] == params.ubatch.n_tokens*n_pos_per_embd;
res &= pos->ne[0] == params.ubatch.n_tokens;
return res;
}
@@ -522,43 +521,6 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
return res;
}
void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) {
// set the inputs only for the active samplers in the current ubatch
std::unordered_set<llama_seq_id> active_samplers;
for (uint32_t i = 0; i < ubatch->n_tokens; i++) {
if (ubatch->output[i]) {
llama_seq_id seq_id = ubatch->seq_id[i][0];
active_samplers.insert(seq_id);
}
}
for (auto seq_id : active_samplers) {
if (samplers.find(seq_id) == samplers.end()) {
continue;
}
auto & sampler = samplers[seq_id];
if (sampler->iface->backend_set_input) {
sampler->iface->backend_set_input(sampler);
}
}
}
bool llm_graph_input_sampling::can_reuse(const llm_graph_params & params) {
if (samplers.size() != params.samplers.size()) {
return false;
}
for (const auto & [seq_id, sampler] : params.samplers) {
if (samplers[seq_id] != sampler) {
return false;
}
}
return true;
}
//
// llm_graph_result
//
@@ -579,10 +541,6 @@ void llm_graph_result::reset() {
t_logits = nullptr;
t_embd = nullptr;
t_embd_pooled = nullptr;
t_sampled.clear();
t_sampled_probs.clear();
t_sampled_logits.clear();
t_candidates.clear();
params = {};
@@ -607,38 +565,6 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
}
}
void llm_graph_result::set_outputs() {
if (t_logits != nullptr) {
ggml_set_output(t_logits);
}
if (t_embd != nullptr) {
ggml_set_output(t_embd);
}
if (t_embd_pooled != nullptr) {
ggml_set_output(t_embd_pooled);
}
for (auto & [seq_id, t] : t_sampled) {
if (t != nullptr) {
ggml_set_output(t);
}
}
for (auto & [seq_id, t] : t_sampled_probs) {
if (t != nullptr) {
ggml_set_output(t);
}
}
for (auto & [seq_id, t] : t_sampled_logits) {
if (t != nullptr) {
ggml_set_output(t);
}
}
for (auto & [seq_id, t] : t_candidates) {
if (t != nullptr) {
ggml_set_output(t);
}
}
}
bool llm_graph_result::can_reuse(const llm_graph_params & params) {
if (!this->params.allow_reuse(params)) {
if (debug > 1) {
@@ -720,7 +646,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
loras (params.loras),
mctx (params.mctx),
cross (params.cross),
samplers (params.samplers),
cb_func (params.cb),
res (params.res),
ctx0 (res->get_ctx()),
@@ -1326,10 +1251,6 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
res->add_input(std::move(inp));
// make sure the produced embeddings are immediately materialized in the ggml graph
// ref: https://github.com/ggml-org/llama.cpp/pull/18599
ggml_build_forward_expand(gf, cur);
return cur;
}
@@ -1913,10 +1834,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask);
ggml_set_name(inp->self_kq_mask, "self_kq_mask");
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
}
{
@@ -1929,10 +1848,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask_swa);
ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
}
return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
@@ -2071,18 +1988,14 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
void llm_graph_context::build_dense_out(
ggml_tensor * dense_2,
ggml_tensor * dense_3) const {
if (!cparams.embeddings || !(dense_2 || dense_3)) {
if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
return;
}
ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
if (dense_2) {
cur = ggml_mul_mat(ctx0, dense_2, cur);
}
if (dense_3) {
cur = ggml_mul_mat(ctx0, dense_3, cur);
}
cur = ggml_mul_mat(ctx0, dense_2, cur);
cur = ggml_mul_mat(ctx0, dense_3, cur);
cb(cur, "result_embd_pooled", -1);
res->t_embd_pooled = cur;
ggml_build_forward_expand(gf, cur);
@@ -2173,87 +2086,6 @@ void llm_graph_context::build_pooling(
ggml_build_forward_expand(gf, cur);
}
void llm_graph_context::build_sampling() const {
if (samplers.empty() || !res->t_logits) {
return;
}
auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
res->add_input(std::move(inp_sampling));
std::map<llama_seq_id, int32_t> seq_to_logit_row;
int32_t logit_row_idx = 0;
for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
if (ubatch.output[i]) {
llama_seq_id seq_id = ubatch.seq_id[i][0];
seq_to_logit_row[seq_id] = logit_row_idx;
logit_row_idx++;
}
}
// res->t_logits will contain logits for all tokens that want the logits calculated (logits=1 or output=1)
GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
// add a dummy row of logits
// this trick makes the graph static, regardless of which samplers are activated
// this is important in order to minimize graph reallocations
// TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550)
ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
for (const auto & [seq_id, sampler] : samplers) {
const auto it = seq_to_logit_row.find(seq_id);
// inactive samplers always work on the first row
const auto row_idx = seq_to_logit_row.find(seq_id) != seq_to_logit_row.end() ? it->second : 0;
ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
struct llama_sampler_data data = {
/*.logits =*/ logits_seq,
/*.probs =*/ nullptr,
/*.sampled =*/ nullptr,
/*.candidates =*/ nullptr,
};
assert(sampler->iface->backend_apply);
sampler->iface->backend_apply(sampler, ctx0, gf, &data);
if (data.sampled != nullptr) {
res->t_sampled[seq_id] = data.sampled;
ggml_build_forward_expand(gf, data.sampled);
}
if (data.probs != nullptr) {
res->t_sampled_probs[seq_id] = data.probs;
ggml_build_forward_expand(gf, data.probs);
}
if (data.logits != nullptr) {
res->t_sampled_logits[seq_id] = data.logits;
ggml_build_forward_expand(gf, data.logits);
}
if (data.candidates != nullptr) {
res->t_candidates[seq_id] = data.candidates;
ggml_build_forward_expand(gf, data.candidates);
}
}
// TODO: Call llama_sampler_accept_ggml after all samplers have been applied.
/*
for (const auto & [seq_id, sampler] : samplers) {
if (auto it = res->t_sampled.find(seq_id); it != res->t_sampled.end()) {
ggml_tensor * selected_token = it->second;
if (selected_token != nullptr) {
llama_sampler_accept_ggml(sampler, ctx0, gf, selected_token);
}
}
}
*/
}
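// A minimal standalone sketch (hypothetical sizes, assuming the vendored ggml.h
// is on the include path) of the "dummy row" trick used above: padding the
// logits tensor with one extra row keeps the sampling graph topology identical
// whether or not a sequence produced an output row in this batch, so inactive
// samplers can always be pointed at a valid row and the graph does not need to
// be re-reserved.
#include "ggml.h"

static void dummy_row_sketch(int64_t n_vocab, int64_t n_outputs) {
    ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // only build tensor metadata, no data buffers
    };
    ggml_context * ctx = ggml_init(ip);

    ggml_tensor * logits = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_vocab, n_outputs);
    // always n_outputs + 1 rows, even when n_outputs == 0
    ggml_tensor * padded = ggml_pad(ctx, logits, 0, 1, 0, 0);
    // a sampler with no output row in this batch can still view row 0 safely
    ggml_tensor * row0 = ggml_view_1d(ctx, padded, padded->ne[0], 0*padded->nb[1]);
    (void) row0;

    ggml_free(ctx);
}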
int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
// TODO move to hparams if a T5 variant appears that uses a different value
const int64_t max_distance = 128;

View File

@@ -10,7 +10,6 @@
#include <memory>
#include <set>
#include <functional>
#include <map>
struct ggml_cgraph;
struct ggml_context;
@@ -397,18 +396,6 @@ public:
const llama_memory_hybrid_context * mctx;
};
class llm_graph_input_sampling : public llm_graph_input_i {
public:
llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
samplers(std::move(samplers)) { }
virtual ~llm_graph_input_sampling() = default;
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
std::map<llama_seq_id, llama_sampler *> samplers;
};
//
// llm_graph_result
//
@@ -442,23 +429,6 @@ struct llm_graph_params {
const llama_memory_context_i * mctx;
const llama_cross * cross;
std::map<llama_seq_id, llama_sampler *> samplers;
static bool samplers_equal(
const std::map<llama_seq_id, llama_sampler *> & lhs,
const std::map<llama_seq_id, llama_sampler *> & rhs) {
if (lhs.size() != rhs.size()) {
return false;
}
for (const auto & [seq_id, sampler] : lhs) {
auto it = rhs.find(seq_id);
if (it == rhs.end() || it->second != sampler) {
return false;
}
}
return true;
}
uint32_t n_outputs;
llm_graph_cb cb;
@@ -498,36 +468,15 @@ struct llm_graph_params {
return false;
}
if (n_outputs != other.n_outputs) {
return false;
}
if (!samplers_equal(samplers, other.samplers)) {
return false;
}
if (samplers.size() > 0) {
if (!ubatch.data || !other.ubatch.data) {
return false;
}
// check that the outputs are the same for all samplers
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
if (ubatch.output[i] != other.ubatch.output[i] ||
ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
return false;
}
}
}
return
cparams.embeddings == other.cparams.embeddings &&
cparams.causal_attn == other.cparams.causal_attn &&
arch == other.arch &&
gtype == other.gtype &&
cvec == other.cvec &&
loras == other.loras &&
cross == other.cross;
arch == other.arch &&
gtype == other.gtype &&
cvec == other.cvec &&
loras == other.loras &&
cross == other.cross &&
n_outputs == other.n_outputs;
}
};
@@ -550,7 +499,6 @@ public:
void reset();
void set_inputs(const llama_ubatch * ubatch);
void set_outputs();
// try to update the existing graph result using the new graph parameters in order to reuse it
// this can only be done if we determine that the resulting graph using the new graph parameters
@@ -569,11 +517,6 @@ public:
ggml_tensor * t_embd = nullptr;
ggml_tensor * t_embd_pooled = nullptr;
std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
std::map<llama_seq_id, ggml_tensor*> t_candidates;
std::map<llama_seq_id, ggml_tensor*> t_sampled;
std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
std::vector<llm_graph_input_ptr> inputs;
ggml_context_ptr ctx_compute;
@@ -649,8 +592,6 @@ struct llm_graph_context {
const llama_memory_context_i * mctx;
const llama_cross * cross;
std::map<llama_seq_id, llama_sampler *> samplers;
const llm_graph_cb & cb_func;
llm_graph_result * res;
@@ -891,12 +832,6 @@ struct llm_graph_context {
ggml_tensor * cls_out,
ggml_tensor * cls_out_b) const;
//
// sampling (backend sampling)
//
void build_sampling() const;
//
// dense (out)
//

View File

@@ -72,10 +72,6 @@ uint32_t llama_hparams::n_embd_inp() const {
return n_embd_inp;
}
uint32_t llama_hparams::get_n_embd_out() const {
return n_embd_out > 0 ? n_embd_out : n_embd;
}
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il);

View File

@@ -107,9 +107,9 @@ struct llama_hparams {
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
float rope_freq_base_train_swa = 10000.0f;
float rope_freq_base_train_swa;
float rope_freq_scale_train;
float rope_freq_scale_train_swa = 1.0f;
float rope_freq_scale_train_swa;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f;
@@ -125,11 +125,10 @@ struct llama_hparams {
llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
// the size of the sliding window (0 - no SWA)
uint32_t n_swa = 0;
// if swa_layers[il] == 1, then layer il is SWA
// if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
// if swa_layers[il] == true, then layer il is SWA
// if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
// by default, all layers are dense
// note: using uint32_t type for compatibility reason
std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
// for State Space Models
uint32_t ssm_d_conv = 0;
@@ -164,9 +163,6 @@ struct llama_hparams {
// for Classifiers
uint32_t n_cls_out = 1;
// output embedding dimension (0 = use n_embd)
uint32_t n_embd_out = 0;
// llama4 smallthinker
uint32_t n_moe_layer_step = 0;
uint32_t n_no_rope_layer_step = 4;
@@ -239,9 +235,6 @@ struct llama_hparams {
// dimension of main + auxiliary input embeddings
uint32_t n_embd_inp() const;
// dimension of output embeddings
uint32_t get_n_embd_out() const;
// dimension of key embeddings across all k-v heads
uint32_t n_embd_k_gqa(uint32_t il = 0) const;

View File

@@ -305,7 +305,7 @@ public:
bool do_shift,
stream_copy_info sc_info);
// used to create a batch processing context from a batch
// used to create a batch procesing context from a batch
llama_kv_cache_context(
llama_kv_cache * kv,
slot_info_vec_t sinfos,

View File

@@ -13,10 +13,9 @@
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#include <fcntl.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
@@ -75,7 +74,7 @@ struct llama_file::impl {
return ret;
}
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
impl(const char * fname, const char * mode) {
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -110,7 +109,7 @@ struct llama_file::impl {
}
}
void read_raw(void * ptr, size_t len) {
void read_raw(void * ptr, size_t len) const {
size_t bytes_read = 0;
while (bytes_read < len) {
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
@@ -127,7 +126,7 @@ struct llama_file::impl {
}
}
uint32_t read_u32() {
uint32_t read_u32() const {
uint32_t val;
read_raw(&val, sizeof(val));
return val;
@@ -154,55 +153,16 @@ struct llama_file::impl {
write_raw(&val, sizeof(val));
}
bool has_direct_io() const {
return true;
}
~impl() {
if (fp) {
std::fclose(fp);
}
}
#else
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
#ifdef __linux__
// Try unbuffered I/O for read only
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
if (init_fd()) {
return;
}
LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
fname, strerror(errno));
}
#endif
init_fp(mode);
}
#ifdef __linux__
bool init_fd() {
fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
if (fd != -1) {
struct stat file_stats{};
fstat(fd, &file_stats);
size = file_stats.st_size;
alignment = file_stats.st_blksize;
off_t ret = lseek(fd, 0, SEEK_SET);
if (ret == -1) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
return true;
}
return false;
}
#endif
void init_fp(const char * mode) {
fp = ggml_fopen(fname.c_str(), mode);
impl(const char * fname, const char * mode) {
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}
seek(0, SEEK_END);
size = tell();
@@ -210,118 +170,46 @@ struct llama_file::impl {
}
size_t tell() const {
if (fd == -1) {
long ret = std::ftell(fp);
if (ret == -1) {
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
}
return (size_t) ret;
// TODO: this ifdef is never true?
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
if (ret == -1) {
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
}
off_t pos = lseek(fd, 0, SEEK_CUR);
if (pos == -1) {
throw std::runtime_error(format("lseek error: %s", strerror(errno)));
}
return (size_t) pos;
return (size_t) ret;
}
void seek(size_t offset, int whence) const {
off_t ret = 0;
if (fd == -1) {
ret = std::fseek(fp, (long) offset, whence);
} else {
ret = lseek(fd, offset, whence);
}
if (ret == -1) {
// TODO: this ifdef is never true?
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
if (ret != 0) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
}
void read_raw_unsafe(void * ptr, size_t len) {
void read_raw(void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
if (fd == -1) {
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error("unexpectedly reached end of file");
}
} else {
size_t bytes_read = 0;
while (bytes_read < len) {
const size_t to_read = len - bytes_read;
ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);
if (ret == -1) {
if (errno == EINTR) {
continue; // Interrupted by signal, retry
}
// Fallback to std::fread in case the DMA controller cannot access the buffer
if (errno == EFAULT) {
auto curr_off = tell();
close(fd);
fd = -1;
alignment = 1;
init_fp("rb");
seek(curr_off, SEEK_SET);
read_raw_unsafe(ptr, len);
return;
}
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret == 0) {
// EOF: allow if this read was only pulling alignment padding past file end
off_t pos = lseek(fd, 0, SEEK_CUR);
if (pos != -1 && (size_t) pos == size) {
std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
return;
}
throw std::runtime_error("unexpectedly reached end of file");
}
bytes_read += (size_t) ret;
}
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error("unexpectedly reached end of file");
}
}
void read_aligned_chunk(void * dest, size_t size) {
size_t offset = tell();
off_t aligned_offset = offset & ~(alignment - 1);
off_t offset_from_alignment = offset - aligned_offset;
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
void * raw_buffer = nullptr;
int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
if (ret != 0) {
throw std::runtime_error(format("posix_memalign failed with error %d", ret));
}
struct aligned_buffer_deleter {
void operator()(void * p) const { free(p); }
};
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
seek(aligned_offset, SEEK_SET);
read_raw_unsafe(buffer.get(), bytes_to_read);
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
}
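// A minimal sketch of the alignment arithmetic used by read_aligned_chunk()
// above: O_DIRECT reads must start and end on multiples of the device block
// size, so the requested [offset, offset + size) range is widened to aligned
// boundaries and the leading `skip` bytes are discarded after the read
// (alignment is assumed to be a power of two, e.g. st_blksize == 4096).
#include <cstddef>

struct aligned_range {
    size_t aligned_offset; // requested offset rounded down to the alignment
    size_t skip;           // bytes between aligned_offset and the requested offset
    size_t bytes_to_read;  // widened length, ends on an aligned boundary
};

static aligned_range widen_for_direct_io(size_t offset, size_t size, size_t alignment) {
    aligned_range r;
    r.aligned_offset = offset & ~(alignment - 1);
    r.skip           = offset - r.aligned_offset;
    r.bytes_to_read  = (r.skip + size + alignment - 1) & ~(alignment - 1);
    return r;
}

// e.g. widen_for_direct_io(5000, 100, 4096) -> { 4096, 904, 4096 }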
void read_raw(void * ptr, size_t len) {
if (has_direct_io()) {
read_aligned_chunk(ptr, len);
} else {
read_raw_unsafe(ptr, len);
}
}
uint32_t read_u32() {
uint32_t read_u32() const {
uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
@@ -342,41 +230,23 @@ struct llama_file::impl {
write_raw(&val, sizeof(val));
}
bool has_direct_io() const {
return fd != -1 && alignment > 1;
}
~impl() {
if (fd != -1) {
close(fd);
} else {
if (fp) {
std::fclose(fp);
}
}
int fd = -1;
std::string fname;
#endif
size_t read_alignment() const {
return alignment;
}
size_t alignment = 1;
FILE * fp{};
size_t size{};
FILE * fp;
size_t size;
};
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
llama_file::~llama_file() = default;
size_t llama_file::tell() const { return pimpl->tell(); }
size_t llama_file::size() const { return pimpl->size; }
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
int llama_file::file_id() const {
#ifdef _WIN32
return _fileno(pimpl->fp);
@@ -390,14 +260,9 @@ int llama_file::file_id() const {
}
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
#ifdef _WIN32
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
#else
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
#endif
void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }

View File

@@ -3,7 +3,6 @@
#include <cstdint>
#include <memory>
#include <vector>
#include <cstdio>
struct llama_file;
struct llama_mmap;
@@ -14,7 +13,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
struct llama_file {
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
llama_file(const char * fname, const char * mode);
~llama_file();
size_t tell() const;
@@ -24,16 +23,12 @@ struct llama_file {
void seek(size_t offset, int whence) const;
void read_raw(void * ptr, size_t len);
void read_raw_unsafe(void * ptr, size_t len);
void read_aligned_chunk(void * dest, size_t size);
uint32_t read_u32();
void read_raw(void * ptr, size_t len) const;
uint32_t read_u32() const;
void write_raw(const void * ptr, size_t len) const;
void write_u32(uint32_t val) const;
size_t read_alignment() const;
bool has_direct_io() const;
private:
struct impl;
std::unique_ptr<impl> pimpl;

View File

@@ -462,29 +462,6 @@ namespace GGUFMeta {
return get_key_or_arr(llm_kv(kid), result, n, required);
}
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
const std::string key = llm_kv(kid);
const int id = gguf_find_key(meta.get(), key.c_str());
if (id < 0) {
if (required) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
}
return false;
}
// throw an error if the type is an array
if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
if (required) {
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
}
return false;
}
return get_key(key, result, required);
}
// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
@@ -495,7 +472,6 @@ llama_model_loader::llama_model_loader(
const std::string & fname,
std::vector<std::string> & splits,
bool use_mmap,
bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
@@ -528,17 +504,9 @@ llama_model_loader::llama_model_loader(
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
files.emplace_back(new llama_file(fname.c_str(), "rb"));
contexts.emplace_back(ctx);
use_direct_io = use_direct_io && files.back()->has_direct_io();
// Disable mmap in case Direct I/O is enabled and available
if (use_direct_io && use_mmap) {
use_mmap = false;
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
}
// Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights.
@@ -604,7 +572,7 @@ llama_model_loader::llama_model_loader(
}
}
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
files.emplace_back(new llama_file(fname_split, "rb"));
contexts.emplace_back(ctx);
// Save tensors data offset info of the shard.
@@ -748,7 +716,6 @@ llama_model_loader::llama_model_loader(
}
this->use_mmap = use_mmap;
this->use_direct_io = use_direct_io;
this->check_tensors = check_tensors;
this->no_alloc = no_alloc;
}
@@ -968,15 +935,7 @@ bool llama_model_loader::load_all_data(
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
// NVMe raid configurations might require more / larger buffers.
constexpr size_t n_buffers = 4;
size_t alignment = 1;
for (const auto & file : files) {
alignment = std::max(file->read_alignment(), alignment);
}
// Buffer size: balance between memory usage and I/O efficiency
// 64MB works well for NVMe drives
const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
std::vector<ggml_backend_buffer_t> host_buffers;
std::vector<ggml_backend_event_t> events;
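The buffer sizing above hinges on the largest read alignment across the input files: with direct I/O in play the staging buffers grow to 64 MiB plus two alignments of slack (so an aligned destination pointer and an aligned read both fit), otherwise a plain 1 MiB pinned buffer is used. A small sketch of that selection, assuming alignment 1 means direct I/O is unavailable:

#include <algorithm>
#include <cstdio>
#include <vector>

// pick the staging buffer size from the largest per-file read alignment (sketch)
static size_t pick_buffer_size(const std::vector<size_t> & file_alignments) {
    size_t alignment = 1;
    for (size_t a : file_alignments) {
        alignment = std::max(alignment, a);
    }
    // direct I/O: 64 MiB payload plus room for head/tail alignment padding
    // otherwise: a plain 1 MiB pinned buffer is enough
    return alignment != 1 ? 64u * 1024 * 1024 + 2 * alignment : 1u * 1024 * 1024;
}

int main() {
    std::printf("no direct I/O : %zu bytes\n", pick_buffer_size({1, 1}));
    std::printf("4 KiB aligned : %zu bytes\n", pick_buffer_size({4096, 512}));
    return 0;
}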
@@ -1026,7 +985,6 @@ bool llama_model_loader::load_all_data(
// If the backend is supported, create pinned memory buffers and events for synchronisation.
for (size_t idx = 0; idx < n_buffers; ++idx) {
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
if (!buf) {
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
ggml_backend_dev_name(dev));
@@ -1108,7 +1066,6 @@ bool llama_model_loader::load_all_data(
}
} else {
const auto & file = files.at(weight->idx);
if (ggml_backend_buffer_is_host(cur->buffer)) {
file->seek(weight->offs, SEEK_SET);
file->read_raw(cur->data, n_size);
@@ -1120,54 +1077,19 @@ bool llama_model_loader::load_all_data(
} else {
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
if (upload_backend) {
size_t offset = weight->offs;
alignment = file->read_alignment();
size_t aligned_offset = offset & ~(alignment - 1);
size_t offset_from_alignment = offset - aligned_offset;
file->seek(aligned_offset, SEEK_SET);
// Calculate aligned read boundaries
size_t read_start = aligned_offset;
size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
file->seek(weight->offs, SEEK_SET);
size_t bytes_read = 0;
size_t data_read = 0; // Actual tensor data copied (excluding padding)
while (bytes_read < read_end - read_start) {
size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
while (bytes_read < n_size) {
size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
// Align the destination pointer within the pinned buffer
uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
// Wait for previous upload to complete before reusing buffer
ggml_backend_event_synchronize(events[buffer_idx]);
// Read aligned chunk from file
file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
// Calculate actual data portion (excluding alignment padding)
uintptr_t ptr_data = ptr_dest_aligned;
size_t data_to_copy = read_size;
// Skip alignment padding at start of first chunk
if (bytes_read == 0) {
ptr_data += offset_from_alignment;
data_to_copy -= offset_from_alignment;
}
// Trim alignment padding at end of last chunk
if (aligned_offset + bytes_read + read_size > offset + n_size) {
data_to_copy -= (read_end - (offset + n_size));
}
// Async upload actual data to GPU
ggml_backend_tensor_set_async(upload_backend, cur,
reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
file->read_raw(host_ptrs[buffer_idx], read_iteration);
ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
ggml_backend_event_record(events[buffer_idx], upload_backend);
data_read += data_to_copy;
bytes_read += read_size;
bytes_read += read_iteration;
++buffer_idx;
buffer_idx %= n_buffers;
}
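The chunked upload loop above does three things worth spelling out: it rounds the tensor's file offset down to the alignment boundary, rounds the end of the read up, and then skips the head padding of the first chunk and trims the tail padding of the last one before handing the data to ggml_backend_tensor_set_async. A self-contained sketch of just the alignment arithmetic (the offset, size and alignment values are made up, and no real file is read):

#include <cstdio>

int main() {
    const size_t alignment = 4096;    // typical O_DIRECT block size (assumption)
    const size_t offset    = 10'000;  // tensor offset in the file
    const size_t n_size    = 20'000;  // tensor size in bytes

    // round the start down and the end up to the alignment boundary
    const size_t aligned_offset        = offset & ~(alignment - 1);
    const size_t offset_from_alignment = offset - aligned_offset;
    const size_t read_end              = (offset + n_size + alignment - 1) & ~(alignment - 1);

    std::printf("read window : [%zu, %zu) (%zu bytes)\n", aligned_offset, read_end, read_end - aligned_offset);
    std::printf("skip at head: %zu bytes\n", offset_from_alignment);
    std::printf("trim at tail: %zu bytes\n", read_end - (offset + n_size));
    return 0;
}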

View File

@@ -70,7 +70,6 @@ struct llama_model_loader {
size_t n_bytes = 0;
bool use_mmap = false;
bool use_direct_io = false;
bool check_tensors;
bool no_alloc;
@@ -98,7 +97,6 @@ struct llama_model_loader {
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
@@ -133,8 +131,6 @@ struct llama_model_loader {
template<typename T>
bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);
bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
std::string get_arch_name() const;
enum llm_arch get_arch() const;

View File

@@ -146,9 +146,6 @@ void llama_model_saver::add_kv_from_model() {
add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
if (hparams.n_embd_out > 0) {
add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out);
}
add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);

View File

@@ -31,14 +31,12 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_17M: return "17M";
case LLM_TYPE_22M: return "22M";
case LLM_TYPE_33M: return "33M";
case LLM_TYPE_47M: return "47M";
case LLM_TYPE_60M: return "60M";
case LLM_TYPE_70M: return "70M";
case LLM_TYPE_80M: return "80M";
case LLM_TYPE_109M: return "109M";
case LLM_TYPE_137M: return "137M";
case LLM_TYPE_140M: return "140M";
case LLM_TYPE_149M: return "149M";
case LLM_TYPE_160M: return "160M";
case LLM_TYPE_190M: return "190M";
case LLM_TYPE_220M: return "220M";
@@ -48,7 +46,6 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_335M: return "335M";
case LLM_TYPE_350M: return "350M";
case LLM_TYPE_360M: return "360M";
case LLM_TYPE_395M: return "395M";
case LLM_TYPE_410M: return "410M";
case LLM_TYPE_450M: return "450M";
case LLM_TYPE_475M: return "475M";
@@ -126,12 +123,10 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
case LLM_TYPE_80B_A3B: return "80B.A3B";
case LLM_TYPE_100B_A6B: return "100B.A6B";
case LLM_TYPE_102B_A12B: return "102B.A12B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
case LLM_TYPE_230B_A10B: return "230B.A10B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
case LLM_TYPE_300B_A47B: return "300B.A47B";
case LLM_TYPE_310B_A15B: return "310B.A15B";
case LLM_TYPE_355B_A32B: return "355B.A32B";
case LLM_TYPE_E2B: return "E2B";
case LLM_TYPE_E4B: return "E4B";
@@ -507,7 +502,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -579,7 +573,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
// TODO: Handle SWA metadata similarly when models start implementing it
// rope_freq_scale (inverse of the kv) is optional
float ropescale = 0.0f;
if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
@@ -588,6 +581,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
// by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
// non-transformer models do not have attention heads
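As the comment says, the GGUF key stores the scaling factor while hparams keeps its inverse, with 0 (or a missing key) meaning no scaling; the SWA fields then default to the non-SWA values unless a dedicated SWA key overrides them. A worked sketch of that arithmetic with illustrative values:

#include <cstdio>

int main() {
    // value read from the (optional) rope scaling factor key; 0.0f means "not present"
    float ropescale = 8.0f;

    const float rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f / ropescale;

    // sliding-window layers default to the same frequency settings...
    const float rope_freq_base_train = 500000.0f;
    float rope_freq_base_train_swa   = rope_freq_base_train;
    float rope_freq_scale_train_swa  = rope_freq_scale_train;

    // ...unless the model supplies a dedicated SWA base (hypothetical override):
    // rope_freq_base_train_swa = 10000.0f;

    std::printf("freq_scale_train = %g (from factor %g)\n", rope_freq_scale_train, ropescale);
    std::printf("freq_base_swa    = %.1f, freq_scale_swa = %g\n", rope_freq_base_train_swa, rope_freq_scale_train_swa);
    return 0;
}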
@@ -606,7 +603,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -630,7 +627,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// arch-specific KVs
switch (arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA_EMBED:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -675,10 +671,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.f_attn_temp_scale = 0.1f;
hparams.f_attn_temp_offset = 1.0f;
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
}
switch (hparams.n_expert) {
@@ -724,10 +716,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
if (hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
@@ -887,34 +875,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MODERN_BERT:
{
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
uint32_t swa_period = 3;
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case 12:
type = LLM_TYPE_47M; break; // granite-embedding-small
case 22:
type = LLM_TYPE_149M; break; // modern-bert-base
case 28:
type = LLM_TYPE_395M; break; // modern-bert-large
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_JINA_BERT_V2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1116,14 +1076,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MAINCODER:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 32: type = LLM_TYPE_1B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_QWEN3VL:
{
ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
@@ -1242,25 +1194,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
} break;
case LLM_ARCH_PLAMO3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
uint32_t swa_period = 8;
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
switch (hparams.n_layer) {
case 24: type = LLM_TYPE_2B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_GPT2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1314,10 +1247,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.n_swa = 4096; // default value of gemma 2
hparams.set_swa_pattern(2);
hparams.attn_soft_cap = true;
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
@@ -1342,7 +1272,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(6);
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
@@ -1372,9 +1303,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.set_swa_pattern(5);
hparams.n_layer_kv_from_start = 20;
hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;
hparams.f_attention_scale = 1.0f;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1390,8 +1322,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.set_swa_pattern(6);
hparams.causal_attn = false; // embeddings do not use causal attention
hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
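Several of the hunks above set a sliding-window pattern period plus a separate SWA rope base. Assuming set_swa_pattern(p) follows the "3 chunked - 1 full" comment earlier — every p-th layer keeps full attention and the rest use the sliding window — the layer classification can be sketched as:

#include <cstdint>
#include <cstdio>

// assumption: with pattern period p, the first p-1 layers of every group use SWA
// and the p-th layer uses full attention
static bool layer_is_swa(uint32_t il, uint32_t period) {
    if (period == 0) {
        return false; // LLAMA_SWA_TYPE_NONE
    }
    return il % period < period - 1;
}

int main() {
    const uint32_t period = 6; // e.g. five SWA layers per full-attention layer
    for (uint32_t il = 0; il < 12; ++il) {
        std::printf("layer %2u: %s\n", il, layer_is_swa(il, period) ? "sliding window" : "full attention");
    }
    return 0;
}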
@@ -1530,10 +1463,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1572,10 +1502,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
if (found_swa && hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
@@ -1703,7 +1629,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
@@ -1799,7 +1725,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
switch (hparams.n_layer) {
case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
default: type = LLM_TYPE_UNKNOWN;
}
@@ -1918,10 +1843,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096;
hparams.set_swa_pattern(4);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
}
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -2239,10 +2160,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(2);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
switch (hparams.n_layer) {
case 24: type = LLM_TYPE_20B; break;
case 36: type = LLM_TYPE_120B; break;
@@ -2287,10 +2204,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096;
hparams.set_swa_pattern(4, true);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
hparams.n_no_rope_layer_step = hparams.n_layer;
@@ -2409,22 +2322,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MIMO2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
switch (hparams.n_layer) {
case 48: type = LLM_TYPE_310B_A15B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
default: throw std::runtime_error("unsupported model architecture");
}
@@ -2447,16 +2344,15 @@ void llama_model::load_vocab(llama_model_loader & ml) {
bool llama_model::load_tensors(llama_model_loader & ml) {
const auto & split_mode = params.split_mode;
const auto & n_gpu_layers = params.n_gpu_layers;
const auto & use_mlock = params.use_mlock;
const auto & tensor_split = params.tensor_split;
const int n_layer = hparams.n_layer;
const int n_gpu_layers = this->n_gpu_layers();
const int n_layer = hparams.n_layer;
const bool use_mmap_buffer = true;
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
__func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
// build a list of buffer types for the CPU and GPU devices
pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
@@ -2467,11 +2363,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
}
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
// calculate the split points
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
std::vector<float> splits(n_devices());
@@ -2482,13 +2373,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
size_t total;
size_t free;
ggml_backend_dev_memory(dev, &free, &total);
// devices can return 0 bytes for free and total memory if they do not
// have any to report. in this case, we will use the host memory as a fallback
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
if (free == 0 && total == 0) {
ggml_backend_dev_memory(cpu_dev, &free, &total);
}
splits[i] = free;
}
} else {
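The removed fallback above re-queries the host's memory whenever a device reports nothing, and the free amounts are then normalized into per-device split fractions. The real check tests free == 0 && total == 0; the sketch below collapses that to a single value and uses made-up MiB figures:

#include <cstdio>
#include <vector>

int main() {
    // free memory per device in MiB; a device that reports nothing falls back
    // to the host's free memory (assumption: 32768 MiB of host RAM)
    const float host_free_mib = 32768.0f;
    std::vector<float> dev_free_mib = {24576.0f, 0.0f /* device reported nothing */};

    std::vector<float> splits(dev_free_mib.size());
    float split_sum = 0.0f;
    for (size_t i = 0; i < dev_free_mib.size(); ++i) {
        splits[i] = dev_free_mib[i] > 0.0f ? dev_free_mib[i] : host_free_mib;
        split_sum += splits[i];
    }
    for (size_t i = 0; i < splits.size(); ++i) {
        splits[i] /= split_sum;
        std::printf("device %zu gets %.3f of the layer budget\n", i, splits[i]);
    }
    return 0;
}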
@@ -2505,10 +2389,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
splits[i] /= split_sum;
}
const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
return {cpu_dev, &pimpl->cpu_buft_list};
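The two forms of i_gpu_start/act_gpu_layers above differ only in whether the output layer counts as an extra offloadable "layer". A small sketch of the n_layer + 1 form, showing which repeating layers land on the CPU versus a GPU for a hypothetical 32-layer model:

#include <algorithm>
#include <cstdio>

int main() {
    const int n_layer      = 32;
    const int n_gpu_layers = 10; // hypothetical request
    const int n_devices    = 1;

    // count the output layer as one extra offloadable layer
    const int i_gpu_start    = std::max(n_layer + 1 - n_gpu_layers, 0);
    const int act_gpu_layers = n_devices == 0 ? 0 : std::min(n_gpu_layers, n_layer + 1);

    for (int il = 0; il < n_layer; ++il) {
        const bool on_gpu = il >= i_gpu_start && (il - i_gpu_start) < act_gpu_layers;
        if (il == 0 || il == i_gpu_start - 1 || il == i_gpu_start || il == n_layer - 1) {
            std::printf("layer %2d -> %s\n", il, on_gpu ? "GPU" : "CPU");
        }
    }
    return 0;
}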
@@ -2748,7 +2636,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_MISTRAL3:
case LLM_ARCH_LLAMA_EMBED:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3283,37 +3170,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
}
} break;
case LLM_ARCH_MODERN_BERT:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
if (i != 0) {
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
} else {
// layer 0 uses identity
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
}
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
}
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
} break;
case LLM_ARCH_NEO_BERT:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3378,14 +3234,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
layer.ffn_up = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
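The replaced lines above choose the FFN up-projection width either by inspecting the stored tensor's second dimension or, in the other variant, by whether a separate gate tensor exists (a fused gate+up projection doubles the width). A tiny sketch of that decision with an illustrative n_ff:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_ff = 11008;

    // infer the up-projection width from whether a separate gate tensor exists
    for (bool has_separate_gate : {true, false}) {
        const int64_t n_ffn_up = has_separate_gate ? n_ff : n_ff * 2; // fused gate+up doubles the width
        std::printf("separate gate: %s -> ffn_up width %lld\n",
                    has_separate_gate ? "yes" : "no", (long long) n_ffn_up);
    }
    return 0;
}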
@@ -3913,44 +3762,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
}
} break;
case LLM_ARCH_PLAMO3:
{
const int64_t head_dim_q = hparams.n_embd_head_k;
const int64_t head_dim_v = hparams.n_embd_head_v;
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
const int64_t num_attention_heads = hparams.n_head(i);
const int64_t num_key_value_heads = hparams.n_head_kv(i);
const int64_t q_proj_dim = num_attention_heads * head_dim_q;
const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
const int64_t n_ff_cur = hparams.n_ff(i);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
{n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur * 2}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
}
} break;
case LLM_ARCH_GPT2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4841,11 +4652,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
// try to load output.weight, if not found, use token_embd (tied embeddings)
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (!output) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
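The removed lines above are the usual tied-embeddings fallback: output.weight is requested as optional and, when it is absent, token_embd.weight is duplicated to serve as the output projection. A sketch of that fallback with hypothetical stand-in tensors (not the real create_tensor API):

#include <cstdio>

struct fake_tensor { const char * name; };

// stand-in for create_tensor(..., TENSOR_NOT_REQUIRED): may return nullptr
static fake_tensor * find_tensor(const char * name, bool present) {
    static fake_tensor out { "output.weight" };
    static fake_tensor tok { "token_embd.weight" };
    if (!present) return nullptr;
    return name[0] == 'o' ? &out : &tok;
}

int main() {
    for (bool has_output : {true, false}) {
        fake_tensor * output = find_tensor("output.weight", has_output);
        if (!output) {
            // tied embeddings: reuse the token embedding matrix as the output projection
            output = find_tensor("token_embd.weight", true);
        }
        std::printf("model %s output.weight -> using '%s'\n",
                    has_output ? "with" : "without", output->name);
    }
    return 0;
}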
@@ -4908,11 +4715,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
// try to load output.weight, if not found, use token_embd (tied embeddings)
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (!output) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
@@ -5279,9 +5082,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
@@ -5393,6 +5196,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_group = hparams.ssm_n_group;
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
const int64_t n_ff_shexp = hparams.n_ff_shexp;
// embeddings
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5444,9 +5250,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
} else {
if (n_expert != 0) {
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
const int64_t n_ff_shexp = hparams.n_ff_shexp;
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
@@ -6476,8 +6279,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -6522,9 +6325,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
}
}
// for LFM2-ColBert-350M
dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
} break;
case LLM_ARCH_SMALLTHINKER:
{
@@ -6827,75 +6627,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
}
} break;
case LLM_ARCH_MIMO2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
uint32_t n_head = hparams.n_head(i);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
// non-MoE branch
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
// MoE branch
int64_t n_ff_exp = hparams.n_ff_exp;
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
}
} break;
case LLM_ARCH_MAINCODER:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
default:
throw std::runtime_error("unknown architecture");
}
@@ -7005,12 +6736,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (llama_supports_gpu_offload()) {
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
int n_repeating = n_gpu;
if (n_repeating > 0) {
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
if (n_gpu_layers > (int) hparams.n_layer) {
LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
n_repeating--;
}
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
const int max_backend_supported_layers = hparams.n_layer + 1;
const int max_offloadable_layers = hparams.n_layer + 1;
@@ -7077,14 +6806,6 @@ size_t llama_model::n_devices() const {
return devices.size();
}
uint32_t llama_model::n_gpu_layers() const {
return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
}
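The removed accessor resolves the configured layer count: a non-negative request is taken literally, while a negative one (the -1 default further down) means every repeating layer plus the output layer. A short sketch of that resolution:

#include <cstdint>
#include <cstdio>

// a negative request means "offload everything": all repeating layers plus the output layer
static uint32_t resolve_n_gpu_layers(int32_t requested, uint32_t n_layer) {
    return requested >= 0 ? (uint32_t) requested : n_layer + 1;
}

int main() {
    const uint32_t n_layer = 32;
    std::printf("requested -1 -> %u layers offloaded\n", resolve_n_gpu_layers(-1, n_layer));
    std::printf("requested 10 -> %u layers offloaded\n", resolve_n_gpu_layers(10, n_layer));
    return 0;
}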
llama_split_mode llama_model::split_mode() const {
return params.split_mode;
}
std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
@@ -7177,10 +6898,6 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
}
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
@@ -7413,7 +7130,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_GEMMA_EMBEDDING:
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
@@ -7531,24 +7247,16 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
switch (arch) {
case LLM_ARCH_LLAMA:
{
llm = std::make_unique<llm_build_llama<false>>(*this, params);
llm = std::make_unique<llm_build_llama>(*this, params);
} break;
case LLM_ARCH_LLAMA4:
{
if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
llm = std::make_unique<llm_build_llama<false>>(*this, params);
llm = std::make_unique<llm_build_llama>(*this, params);
} else {
llm = std::make_unique<llm_build_llama_iswa>(*this, params);
}
} break;
case LLM_ARCH_LLAMA_EMBED:
{
llm = std::make_unique<llm_build_llama<true>>(*this, params);
} break;
case LLM_ARCH_MAINCODER:
{
llm = std::make_unique<llm_build_maincoder>(*this, params);
} break;
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params);
@@ -7581,10 +7289,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_bert>(*this, params);
} break;
case LLM_ARCH_MODERN_BERT:
{
llm = std::make_unique<llm_build_modern_bert>(*this, params);
} break;
case LLM_ARCH_NEO_BERT:
{
llm = std::make_unique<llm_build_neo_bert>(*this, params);
@@ -7674,14 +7378,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_plamo2>(*this, params);
} break;
case LLM_ARCH_PLAMO3:
{
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
} else {
llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
}
} break;
case LLM_ARCH_GPT2:
{
llm = std::make_unique<llm_build_gpt2>(*this, params);
@@ -7986,10 +7682,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_mistral3>(*this, params);
} break;
case LLM_ARCH_MIMO2:
{
llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
} break;
default:
GGML_ABORT("fatal error");
}
@@ -7997,17 +7689,12 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
// add on pooling layer
llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
// add backend sampling layers (if any)
llm->build_sampling();
// if the gguf model was converted with --sentence-transformers-dense-modules
// there will be two additional dense projection layers
// dense linear projections are applied after pooling
// TODO: move reranking logic here and generalize
llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
llm->res->set_outputs();
return llm->res->get_gf();
}
@@ -8020,7 +7707,7 @@ llama_model_params llama_model_default_params() {
llama_model_params result = {
/*.devices =*/ nullptr,
/*.tensor_buft_overrides =*/ nullptr,
/*.n_gpu_layers =*/ -1,
/*.n_gpu_layers =*/ 999,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
@@ -8029,7 +7716,6 @@ llama_model_params llama_model_default_params() {
/*.kv_overrides =*/ nullptr,
/*.vocab_only =*/ false,
/*.use_mmap =*/ true,
/*.use_direct_io =*/ true,
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,
@@ -8064,10 +7750,6 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
return model->hparams.n_embd_inp();
}
int32_t llama_model_n_embd_out(const llama_model * model) {
return model->hparams.get_n_embd_out();
}
int32_t llama_model_n_layer(const llama_model * model) {
return model->hparams.n_layer;
}
@@ -8171,8 +7853,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_ERNIE4_5_MOE:
case LLM_ARCH_MISTRAL3:
case LLM_ARCH_LLAMA_EMBED:
case LLM_ARCH_MAINCODER:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
@@ -8182,7 +7862,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_DBRX:
case LLM_ARCH_BERT:
case LLM_ARCH_JINA_BERT_V3:
case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_STABLELM:
@@ -8202,7 +7881,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_PHIMOE:
case LLM_ARCH_PLAMO:
case LLM_ARCH_PLAMO2:
case LLM_ARCH_PLAMO3:
case LLM_ARCH_GEMMA:
case LLM_ARCH_GEMMA2:
case LLM_ARCH_GEMMA3:
@@ -8233,7 +7911,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_PANGU_EMBED:
case LLM_ARCH_AFMOE:
case LLM_ARCH_QWEN3NEXT:
case LLM_ARCH_MIMO2:
return LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_QWEN2VL:

View File

@@ -24,14 +24,12 @@ enum llm_type {
LLM_TYPE_17M,
LLM_TYPE_22M,
LLM_TYPE_33M,
LLM_TYPE_47M,
LLM_TYPE_60M,
LLM_TYPE_70M,
LLM_TYPE_80M,
LLM_TYPE_109M,
LLM_TYPE_137M,
LLM_TYPE_140M,
LLM_TYPE_149M,
LLM_TYPE_160M,
LLM_TYPE_190M,
LLM_TYPE_220M,
@@ -41,7 +39,6 @@ enum llm_type {
LLM_TYPE_335M,
LLM_TYPE_350M,
LLM_TYPE_360M,
LLM_TYPE_395M,
LLM_TYPE_410M,
LLM_TYPE_450M,
LLM_TYPE_475M,
@@ -120,12 +117,10 @@ enum llm_type {
LLM_TYPE_31B_A3_5B,
LLM_TYPE_80B_A3B, // Qwen3 Next
LLM_TYPE_100B_A6B,
LLM_TYPE_102B_A12B, // Solar-Open
LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_230B_A10B, // Minimax M2
LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big
LLM_TYPE_310B_A15B, // MiMo-V2-Flash
LLM_TYPE_355B_A32B, // GLM-4.5
LLM_TYPE_E2B,
LLM_TYPE_E4B,
@@ -470,6 +465,8 @@ struct llama_model {
struct ggml_tensor * dense_2_out_layers = nullptr;
struct ggml_tensor * dense_3_out_layers = nullptr;
llama_model_params params;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
@@ -479,9 +476,6 @@ struct llama_model {
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
// for keeping track of extra nodes used by lora adapters
uint32_t n_lora_nodes = 0;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
@@ -503,9 +497,6 @@ struct llama_model {
size_t n_tensors() const;
size_t n_devices() const;
uint32_t n_gpu_layers() const;
llama_split_mode split_mode() const;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
// total number of parameters in the model
@@ -534,8 +525,6 @@ struct llama_model {
ggml_cgraph * build_graph(const llm_graph_params & params) const;
private:
llama_model_params params;
struct impl;
std::unique_ptr<impl> pimpl;
};

View File

@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
std::vector<std::string> splits = {};
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params());

View File

File diff suppressed because it is too large

View File

@@ -14,19 +14,7 @@ struct llama_grammar;
struct llama_sampler_chain {
llama_sampler_chain_params params;
// has .backend_init() been called?
bool is_init = false;
struct info {
bool is_backend;
llama_sampler * ptr;
};
std::vector<info> samplers;
// pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
std::vector<llama_token_data> cur;
std::vector<struct llama_sampler *> samplers;
// timing
@@ -36,9 +24,9 @@ struct llama_sampler_chain {
};
struct llama_sampler * llama_sampler_init_dry_testing(
int32_t context_size,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const std::vector<std::vector<llama_token>> & seq_breakers);
int32_t context_size,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const std::vector<std::vector<llama_token>>& seq_breakers);

View File

@@ -314,12 +314,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_YOUTU:
regex_exprs = {
"[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+",
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
regex_exprs = {
"[\r\n]",
@@ -361,7 +355,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
regex_exprs = {
// original regex from tokenizer.json
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -1856,11 +1849,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "deepseek-v3") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
clean_spaces = false;
} else if (
tokenizer_pre == "youtu") {
pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
clean_spaces = false;
ignore_merges = true;
} else if (
tokenizer_pre == "falcon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -1879,8 +1867,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" ||
tokenizer_pre == "a.x-4.0" ||
tokenizer_pre == "mellum" ||
tokenizer_pre == "modern-bert" ) {
tokenizer_pre == "mellum") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "jina-v1-en" ||
@@ -2016,10 +2003,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "minimax-m2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false;
} else if (
tokenizer_pre == "solar-open") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
clean_spaces = false;
} else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -2193,8 +2176,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// for now, we apply this workaround to find the tokens based on their text
for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
if (special_eot_id == LLAMA_TOKEN_NULL) {
if (false
@@ -2210,10 +2191,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<end_of_utterance>" // smoldocling
) {
special_eot_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
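The recurring change in these vocab hunks swaps a plain assignment of the attribute for an OR of the control bit, so flags the token already carries (LSTRIP, RSTRIP, ...) survive the fix-up. A tiny illustration with stand-in flag values:

#include <cstdio>

enum token_attr : int {
    ATTR_NONE    = 0,
    ATTR_CONTROL = 1 << 0,
    ATTR_LSTRIP  = 1 << 1,
};

int main() {
    int attr = ATTR_LSTRIP;                 // token already carries an LSTRIP flag

    int overwritten = ATTR_CONTROL;         // '=' drops LSTRIP
    int ored        = attr | ATTR_CONTROL;  // '|=' keeps LSTRIP

    std::printf("overwrite: control=%d lstrip=%d\n", !!(overwritten & ATTR_CONTROL), !!(overwritten & ATTR_LSTRIP));
    std::printf("or-in    : control=%d lstrip=%d\n", !!(ored & ATTR_CONTROL), !!(ored & ATTR_LSTRIP));
    return 0;
}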
@@ -2224,10 +2205,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|eom_id|>"
) {
special_eom_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
@@ -2244,10 +2225,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_prefix|>" // GLM-4.5
) {
special_fim_pre_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
@@ -2264,10 +2245,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_suffix|>" // GLM-4.5
) {
special_fim_suf_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
@@ -2284,10 +2265,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|code_middle|>" // GLM-4.5
) {
special_fim_mid_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
@@ -2301,10 +2282,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<PAD>"
) {
special_fim_pad_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
@@ -2319,10 +2300,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<reponame>" // Granite
) {
special_fim_rep_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
@@ -2333,41 +2314,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|file_sep|>" // Qwen
) {
special_fim_sep_id = t.second;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
}
// auto-detect unused tokens: e.g. control tokens with the word "unused"
// ideally, these tokens should be marked as unused during conversion
{
uint32_t n_unused = 0;
for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
continue;
}
if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
if (strstr(t.first.c_str(), "unused") != NULL) {
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
}
}
if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
n_unused++;
}
}
LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
}
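The block above walks the vocabulary, marks control tokens whose text contains "unused" with the UNUSED attribute, and logs how many it found. A compact sketch of the same pass over a toy vocabulary:

#include <cstdio>
#include <cstring>
#include <map>
#include <string>

enum token_attr : int {
    ATTR_CONTROL = 1 << 0,
    ATTR_UNUSED  = 1 << 1,
};

int main() {
    std::map<std::string, int> vocab = {
        {"<s>",        ATTR_CONTROL},
        {"<unused_7>", ATTR_CONTROL},
        {"hello",      0},
    };

    int n_unused = 0;
    for (auto & [text, attr] : vocab) {
        if ((attr & ATTR_CONTROL) == 0) {
            continue; // only control tokens are considered
        }
        if ((attr & ATTR_UNUSED) == 0 && std::strstr(text.c_str(), "unused") != nullptr) {
            attr |= ATTR_UNUSED; // ideally already set during conversion
        }
        if (attr & ATTR_UNUSED) {
            n_unused++;
        }
    }
    std::printf("%d unused token(s)\n", n_unused);
    return 0;
}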
// maintain a list of tokens that cause end-of-generation
// this is currently determined based on the token text, which is obviously not ideal
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
@@ -2386,16 +2341,12 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
}
for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
if (false
|| t.first == "<|eot_id|>"
|| t.first == "<|im_end|>"
|| t.first == "<|end|>"
|| t.first == "<|return|>" // o200k_harmony
|| t.first == "<|call|>" // o200k_harmony
|| t.first == "<|flush|>" // solar-open
|| t.first == "<|calls|>" // solar-open
|| t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>"
|| t.first == "<|eom_id|>"
@@ -2405,28 +2356,24 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<end_of_utterance>" // smoldocling
) {
special_eog_ids.insert(t.second);
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
} else {
if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
// token is control, but not marked as EOG -> print a debug log
if (special_eog_ids.count(t.second) == 0) {
LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
__func__, t.second, t.first.c_str());
}
// token is control, but not marked as EOG -> print a debug log
if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
__func__, t.second, t.first.c_str());
}
}
}
// @ngxson : quick hack for gpt-oss, always render these tokens
for (const auto & t : token_to_id) {
auto & attr = id_to_token[t.second].attr;
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
}
}
@@ -2446,42 +2393,34 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
// TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
// we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
// TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
// we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
// we remove the "<|end|>" token from the EOG list
{
bool has_return = false;
bool has_call = false;
bool has_end = false;
bool has_flush = false;
llama_token end_id = LLAMA_TOKEN_NULL;
LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
for (auto tid : special_eog_ids) {
auto & text = id_to_token[tid].text;
LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str());
if (text == "<|return|>") {
if (id_to_token[tid].text == "<|return|>") {
has_return = true;
} else if (text == "<|call|>" || text == "<|calls|>") {
} else if (id_to_token[tid].text == "<|call|>") {
has_call = true;
} else if (text == "<|flush|>") {
has_flush = true;
} else if (text == "<|end|>") {
} else if (id_to_token[tid].text == "<|end|>") {
has_end = true;
end_id = tid;
}
}
if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
if (has_return && has_call && has_end) {
special_eog_ids.erase(end_id);
auto & attr = id_to_token[end_id].attr;
attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
}
}
}
@@ -2579,13 +2518,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
}
} else if (_contains_any(model_name, {"modern-bert"})) {
if (token_to_id.count("[MASK]") == 0 ) {
LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
}
else {
_set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
}
}
}
}

View File

@@ -51,8 +51,6 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
};
struct LLM_KV;

View File

@@ -71,9 +71,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
}, &ud);
llama_model_params mparams_copy = *mparams;
mparams_copy.no_alloc = true;
mparams_copy.use_mmap = false;
mparams_copy.use_mlock = false;
mparams_copy.no_alloc = true;
mparams_copy.use_mmap = false;
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
if (model == nullptr) {
@@ -111,20 +110,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
}
}
for (size_t i = 0; i < ret.size(); i++) {
size_t free;
size_t total;
size_t free, total;
ggml_backend_dev_memory(model->devices[i], &free, &total);
// devices can return 0 bytes for free and total memory if they do not
// have any to report. in this case, we will use the host memory as a fallback
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
if (free == 0 && total == 0) {
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
ggml_backend_dev_memory(cpu_dev, &free, &total);
}
ret[i].free = free;
ret[i].total = total;
}
@@ -152,15 +139,12 @@ enum layer_fraction_t {
};
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
class llama_params_fit_exception : public std::runtime_error {
using std::runtime_error::runtime_error;
};
static void llama_params_fit_impl(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
constexpr int64_t MiB = 1024*1024;
const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
typedef std::vector<llama_device_memory_data> dmds_t;
const llama_model_params default_mparams = llama_model_default_params();
@@ -179,12 +163,6 @@ static void llama_params_fit_impl(
return;
}
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
margins.reserve(nd);
for (size_t id = 0; id < nd; id++) {
margins.push_back(margins_s[id]);
}
std::vector<std::string> dev_names;
{
dev_names.reserve(nd);
@@ -202,12 +180,11 @@ static void llama_params_fit_impl(
}
}
int64_t sum_free = 0;
int64_t sum_projected_free = 0;
int64_t sum_projected_used = 0;
int64_t sum_projected_model = 0;
std::vector<int64_t> projected_free_per_device;
projected_free_per_device.reserve(nd);
int64_t sum_total = 0;
int64_t sum_projected_free = 0;
int64_t min_projected_free = INT64_MAX;
int64_t sum_projected_used = 0;
int64_t sum_projected_ctx = 0;
if (nd > 1) {
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -217,98 +194,59 @@ static void llama_params_fit_impl(
const int64_t projected_used = dmd.mb.total();
const int64_t projected_free = dmd.free - projected_used;
projected_free_per_device.push_back(projected_free);
sum_free += dmd.free;
sum_projected_used += projected_used;
sum_projected_free += projected_free;
sum_projected_model += dmd.mb.model;
sum_total += dmd.total;
sum_projected_used += projected_used;
sum_projected_free += projected_free;
min_projected_free = std::min(min_projected_free, projected_free);
sum_projected_ctx += dmd.mb.context;
if (nd > 1) {
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
projected_free >= 0 ? "surplus" : "deficit");
}
}
assert(sum_free >= 0 && sum_projected_used >= 0);
assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
assert(sum_projected_used >= sum_projected_ctx);
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB);
if (nd == 1) {
if (projected_free_per_device[0] >= margins[0]) {
__func__, sum_projected_used/MiB, sum_total/MiB);
if (min_projected_free >= margin) {
if (nd == 1) {
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
return;
}
} else {
bool changes_needed = false;
for (size_t id = 0; id < nd; id++) {
if (projected_free_per_device[id] < margins[id]) {
changes_needed = true;
break;
}
}
if (!changes_needed) {
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
__func__, min_projected_free/MiB, margin/MiB);
return;
}
LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
__func__, min_projected_free/MiB, margin/MiB);
return;
}
// step 2: try reducing memory use by reducing the context size
{
int64_t global_surplus = sum_projected_free;
for (size_t id = 0; id < nd; id++) {
global_surplus -= margins[id];
}
int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
if (global_surplus < 0) {
if (nd == 1) {
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
__func__, margins[0]/MiB, -global_surplus/MiB);
} else {
LLAMA_LOG_INFO(
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
__func__, -global_surplus/MiB);
}
LLAMA_LOG_INFO(nd == 1 ?
"%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
"%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
__func__, margin/MiB, -global_surplus/MiB);
if (cparams->n_ctx == 0) {
if (hp_nct > n_ctx_min) {
int64_t sum_used_target = sum_free;
for (size_t id = 0; id < nd; id++) {
sum_used_target -= margins[id];
}
if (nd > 1) {
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
// - for dense models only whole layers can be assigned to devices
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
// - on average we expect a waste of 0.5 layers/tensors per device
// - use slightly more than the expected average for nd devices to be safe
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
}
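
Worked example for the reservation above (all numbers hypothetical): with nd = 2 devices and model_per_layer = 600 MiB, a dense model (hp_nex == 0) reserves (2 + 1) * 600 / 2 = 900 MiB, i.e. 1.5 layers' worth, slightly more than the expected average waste of 0.5 layers per device. For an MoE model the divisor becomes 6 because an individual tensor is estimated at no more than a third of a layer, so the reservation drops to (2 + 1) * 600 / 6 = 300 MiB.
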
int64_t sum_projected_used_min_ctx = 0;
cparams->n_ctx = n_ctx_min;
const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (const auto & dmd : dmds_min_ctx) {
sum_projected_used_min_ctx += dmd.mb.total();
}
if (sum_used_target > sum_projected_used_min_ctx) {
// linear interpolation between minimum and maximum context size:
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
/ (sum_projected_used - sum_projected_used_min_ctx);
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
const uint32_t ctx_reduction = std::min(
uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
cparams->n_ctx = hp_nct - ctx_reduction;
const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
global_surplus += memory_reduction;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (global_surplus >= 0) {
if (nd == 1) {
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
return;
}
LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
} else {
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
}
} else {
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
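
Worked example for the ceiling-division variant above (hypothetical numbers): with hp_nct = 65536 and sum_projected_ctx = 8192 MiB, bytes_per_ctx = 8192 MiB / 65536 = 128 KiB per context token. A deficit of -global_surplus = 2048 MiB then gives ctx_reduction = ceil(2048 MiB / 128 KiB) = 16384, clamped to hp_nct - n_ctx_min, so n_ctx becomes 65536 - 16384 = 49152 and memory_reduction = 16384 * 128 KiB = 2048 MiB.
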
@@ -321,28 +259,32 @@ static void llama_params_fit_impl(
}
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
}
if (nd > 1) {
if (!tensor_split) {
throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
}
if (mparams->tensor_split) {
for (size_t id = 0; id < nd; id++) {
if (mparams->tensor_split[id] != 0.0f) {
throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
throw std::runtime_error("model_params::tensor_split already set by user, abort");
}
}
}
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
}
if (hp_ngl < 2*nd) {
throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
+ std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
}
}
if (!tensor_buft_overrides) {
throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
}
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
}
// step 3: iteratively fill the back to front with "dense" layers
@@ -395,11 +337,6 @@ static void llama_params_fit_impl(
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
uint32_t n_full() const {
assert(n_layer >= n_part);
return n_layer - n_part;
}
};
const size_t ntbo = llama_max_tensor_buft_overrides();
@@ -408,7 +345,8 @@ static void llama_params_fit_impl(
auto set_ngl_tensor_split_tbo = [&](
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
llama_model_params & mparams) {
llama_model_params & mparams,
const bool add_nonrepeating) {
mparams.n_gpu_layers = 0;
for (size_t id = 0; id < nd; id++) {
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
@@ -416,25 +354,29 @@ static void llama_params_fit_impl(
tensor_split[id] = ngl_per_device[id].n_layer;
}
}
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
if (add_nonrepeating) {
mparams.n_gpu_layers += 1;
tensor_split[nd - 1] += 1;
}
mparams.tensor_split = tensor_split;
size_t itbo = 0;
for (size_t id = 0; id < nd; id++) {
il0 += ngl_per_device[id].n_full();
il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
if (itbo + 1 >= ntbo) {
tensor_buft_overrides[itbo].pattern = nullptr;
tensor_buft_overrides[itbo].buft = nullptr;
itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides;
throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
+ std::to_string(ntbo) + " is insufficient for model");
throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
+ std::to_string(ntbo) + " is insufficient for model\n");
}
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
tensor_buft_overrides[itbo].buft = overflow_bufts[id];
itbo++;
}
il0 += ngl_per_device[id].n_part;
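
Rough example of what the lambda above produces (hypothetical values): for nd = 2 with ngl_per_device = {(n_layer = 10, n_part = 0), (n_layer = 22, n_part = 3)}, it sets mparams.n_gpu_layers = 32 (one more in the variant where add_nonrepeating puts the non-repeating tensors on the last device), tensor_split = {10, 22} (interpreted as relative proportions), and emits tensor buffer-type overrides for the 3 partial layers at the end of device 1's range so that their overflowing tensors land in the configured overflow buffer type.
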
@@ -449,9 +391,10 @@ static void llama_params_fit_impl(
auto get_memory_for_layers = [&](
const char * func_name,
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
const bool add_nonrepeating) -> std::vector<int64_t> {
llama_model_params mparams_copy = *mparams;
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
const dmds_t dmd_nl = llama_get_device_memory_data(
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
@@ -484,9 +427,9 @@ static void llama_params_fit_impl(
const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (size_t id = 0; id < nd; id++) {
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
global_surplus_cpu_moe += dmd.free;
global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
}
if (global_surplus_cpu_moe > 0) {
@@ -505,18 +448,27 @@ static void llama_params_fit_impl(
std::vector<int64_t> targets; // maximum acceptable memory use per device
targets.reserve(nd);
for (size_t id = 0; id < nd; id++) {
targets.push_back(dmds_full[id].free - margins[id]);
targets.push_back(dmds_full[id].free - margin);
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
}
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
// whether for the optimal memory use we expect to load at least some MoE tensors:
const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
overflow_bufts.reserve(nd);
for (size_t id = 0; id < nd; id++) {
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
for (size_t id = 0; id < nd - 1; ++id) {
overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
}
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
std::vector<ngl_t> ngl_per_device(nd);
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
if (hp_nex > 0) {
for (size_t id = 0; id < nd; id++) {
ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
}
}
// optimize the number of layers per device using the method of false position:
// - ngl_per_device has 0 layers for each device, lower bound
@@ -524,30 +476,22 @@ static void llama_params_fit_impl(
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
// - check memory use of our guess, replace either the low or high bound
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
// - the last device has the output layer, which cannot be a partial layer
if (hp_nex == 0) {
LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
} else {
LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
}
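
A compact sketch of the false-position search described in the comments above, under the simplifying assumptions that memory use grows monotonically with the layer count and that zero layers always fits; mem_for and fit_layers are illustrative names, not the vendored API:

#include <algorithm>
#include <cstdint>
#include <functional>

uint32_t fit_layers(uint32_t n_max, int64_t target, const std::function<int64_t(uint32_t)> & mem_for) {
    uint32_t lo = 0;                       // lower bound, known to fit
    uint32_t hi = n_max;                   // upper bound, may not fit
    int64_t mem_lo = mem_for(lo);
    int64_t mem_hi = mem_for(hi);
    if (mem_hi <= target) {
        return hi;                         // everything fits, nothing to search
    }
    while (hi - lo > 1) {
        // interpolate where memory use would meet the target, stepping at least one layer
        uint32_t step = std::max<uint32_t>(1, uint32_t(int64_t(hi - lo) * (target - mem_lo) / (mem_hi - mem_lo)));
        const uint32_t guess = lo + std::min(step, hi - lo - 1);
        const int64_t mem_guess = mem_for(guess);
        if (mem_guess <= target) {
            lo = guess; mem_lo = mem_guess; // still fits -> raise the lower bound
        } else {
            hi = guess; mem_hi = mem_guess; // too big -> lower the upper bound
        }
    }
    return lo;                             // largest layer count that still fits
}

The vendored loop does the same interpolation per device, but tracks n_layer/n_part pairs and re-measures with get_memory_for_layers instead of a single mem_for callback.
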
uint32_t n_unassigned = hp_ngl;
for (int id = nd - 1; id >= 0; id--) {
uint32_t n_unassigned = hp_ngl + 1;
for (size_t jd = id + 1; jd < nd; ++jd) {
assert(n_unassigned >= ngl_per_device[jd].n_layer);
n_unassigned -= ngl_per_device[jd].n_layer;
}
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
ngl_per_device_high[id].n_layer = n_unassigned;
if (hp_nex > 0) {
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
}
if (ngl_per_device_high[id].n_layer > 0) {
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
@@ -556,26 +500,25 @@ static void llama_params_fit_impl(
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
ngl_per_device_test[id].n_layer += step_size;
if (hp_nex) {
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
step_size - 1 : step_size; // the first layer is the output layer which must always be full
ngl_per_device_test[id].n_part += step_size;
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
ngl_per_device = ngl_per_device_test;
mem = mem_test;
n_unassigned -= ngl_per_device[id].n_layer;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
} else {
ngl_per_device_high = ngl_per_device_test;
mem_high = mem_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
}
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
}
} else {
assert(ngl_per_device_high[id].n_layer == n_unassigned);
ngl_per_device = ngl_per_device_high;
mem = mem_high;
ngl_per_device = ngl_per_device_high;
n_unassigned -= ngl_per_device[id].n_layer;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
}
}
@@ -586,7 +529,7 @@ static void llama_params_fit_impl(
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
}
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
return;
}
@@ -606,20 +549,24 @@ static void llama_params_fit_impl(
assert(id_dense_start < nd);
LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
for (size_t id = 0; id <= id_dense_start; id++) {
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
for (size_t jd = id_dense_start; jd < nd; jd++) {
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
ngl_per_device_high[id].n_layer += n_layer_move;
ngl_per_device_high[jd].n_layer -= n_layer_move;
ngl_per_device_high[jd].n_part = 0;
}
size_t id_dense_start_high = nd - 1;
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
>= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
@@ -635,11 +582,11 @@ static void llama_params_fit_impl(
ngl_per_device_test[id].n_layer += n_convert_jd;
n_converted_test += n_convert_jd;
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
break;
}
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
@@ -654,38 +601,32 @@ static void llama_params_fit_impl(
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
}
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
}
} else {
ngl_per_device = ngl_per_device_high;
mem = mem_high;
id_dense_start = id_dense_start_high;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
// try to fit at least part of one more layer
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
if (ngl_per_device[id_dense_start].n_layer > 0) {
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start;
ngl_per_device_test[id_dense_start_test].n_layer--;
ngl_per_device_test[id_dense_start_test].n_part--;
ngl_per_device_test[id].n_layer++;
ngl_per_device_test[id].n_part++;
if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
id_dense_start_test++;
}
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
if (id < nd - 1) {
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
}
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
@@ -693,10 +634,9 @@ static void llama_params_fit_impl(
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
@@ -705,10 +645,9 @@ static void llama_params_fit_impl(
} else {
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
@@ -723,41 +662,30 @@ static void llama_params_fit_impl(
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
// print info for devices that were not changed during the conversion from dense only to full layers:
for (size_t id = id_dense_start + 1; id < nd; id++) {
const int64_t projected_margin = dmds_full[id].free - mem[id];
LLAMA_LOG_INFO(
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
}
enum llama_params_fit_status llama_params_fit(
bool llama_params_fit(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
const int64_t t0_us = llama_time_us();
llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
bool ok = true;
try {
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
} catch (const llama_params_fit_exception & e) {
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
} catch (const std::runtime_error & e) {
LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
status = LLAMA_PARAMS_FIT_STATUS_ERROR;
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
ok = false;
}
const int64_t t1_us = llama_time_us();
LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
return status;
return ok;
}
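
A hypothetical call site for the single-margin, bool-returning variant of llama_params_fit shown in this hunk; the file name, buffer sizes, and the 1 GiB margin are illustrative assumptions, not values taken from this diff:

#include <vector>
#include "llama.h"

int main() {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    std::vector<float> tensor_split(16, 0.0f);              // one slot per device, sized generously here
    std::vector<llama_model_tensor_buft_override> tbo(256); // must hold at least the ntbo entries checked above

    const size_t   margin    = 1024ull * 1024 * 1024;       // try to leave ~1 GiB free on each device
    const uint32_t n_ctx_min = 4096;

    if (llama_params_fit("model.gguf", &mparams, &cparams,
                         tensor_split.data(), tbo.data(),
                         margin, n_ctx_min, GGML_LOG_LEVEL_INFO)) {
        // mparams/cparams now describe an n_gpu_layers / tensor_split / n_ctx combination
        // projected to fit into free device memory; the vectors must stay alive while mparams is used
    }
    return 0;
}
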
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
struct llama_sampler_chain_params result = {
/*.no_perf =*/ true,
/*.no_perf =*/ true,
};
return result;
@@ -830,7 +758,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
model.t_start_us = tm.t_start_us;
try {
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
ml.print_info();

View File

@@ -22,15 +22,8 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL;
// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
(il + 1) % hparams.n_no_rope_layer_step != 0;
// dual attention normalization (pre)
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
@@ -63,16 +56,19 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
cb(Qcur, "Qcur_normed", il);
cb(Kcur, "Kcur_normed", il);
// RoPE only for sliding_attention layers
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
((il + 1) % hparams.n_no_rope_layer_step) != 0;
if (use_rope) {
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur_rope", il);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur_rope", il);
}

View File

@@ -142,13 +142,11 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
LLM_FFN_GELU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
type_op, LLM_FFN_PAR, il);
model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
cur = build_ffn(cur,
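
For context, the shape-based heuristic used on one side of this hunk can be stated on its own. A minimal sketch, assuming the usual ggml convention that a weight's ne[1] is its output width (pick_ffn_op and ffn_op_kind are illustrative names):

#include <cstdint>
#include "ggml.h"

enum ffn_op_kind { FFN_OP_GELU, FFN_OP_GEGLU };

// if there is no separate gate tensor and the up projection is wider than n_ff,
// assume the gate is fused into ffn_up and use a gated GELU (GEGLU)
static ffn_op_kind pick_ffn_op(const ggml_tensor * ffn_gate, const ggml_tensor * ffn_up, int64_t n_ff) {
    const bool up_contains_gate = ffn_gate == nullptr && ffn_up->ne[1] != n_ff;
    return up_contains_gate ? FFN_OP_GEGLU : FFN_OP_GELU;
}
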

View File

@@ -3,14 +3,12 @@
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
float kq_scale = 1.0f / sqrtf(float(n_embd_head));
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * inpL;
ggml_tensor * cur;
ggml_tensor *inpL, *cur;
inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos();
@@ -46,7 +44,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
}
ggml_tensor * inpSA = inpL;
cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
// build self attention
{

View File

@@ -21,9 +21,6 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
for (int il = 0; il < n_layer; ++il) {
const bool is_swa = hparams.is_swa(il);
// UNUSED:
// const float freq_base_l = model.get_rope_freq_base (cparams, il);
// const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
// norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);

View File

@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
hparams.expert_weights_scale, hparams.expert_weights_scale,
true, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);

View File

@@ -1,5 +1,7 @@
#include "models.h"
llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -10,8 +12,10 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);
if (ubatch.token) {
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

View File

@@ -19,9 +19,6 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
@@ -46,12 +43,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il);

View File

@@ -10,9 +10,10 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);
if (ubatch.token) {
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

View File

@@ -1,5 +1,7 @@
#include "models.h"
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params),
model(model),
@@ -13,9 +15,10 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);
if (ubatch.token) {
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
@@ -245,7 +248,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
// equivalent to get_per_layer_inputs() in python code
// output shape: [n_embd_altup, n_layer, n_tokens]
ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
auto inp = std::make_unique<llm_graph_input_embd>();
auto inp = std::make_unique<llm_graph_input_embd>();
ggml_tensor * inp_per_layer;
if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
@@ -255,20 +258,10 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
cb(inp_per_layer, "inp_per_layer_selected", -1);
res->add_input(std::move(inp));
} else {
// Vision embedding path: use padding token (ID=0) embedding
const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
// Extract and dequantize padding token embedding (column 0)
ggml_tensor * padding_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
ggml_tensor * padding_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size);
inp_per_layer = ggml_cpy(ctx0, padding_q, padding_f32);
// Reshape to [n_embd_altup, n_layer, 1]
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
cb(inp_per_layer, "inp_per_layer_vision", -1);
GGML_ABORT("TODO: support embd input");
}
res->add_input(std::move(inp));
return inp_per_layer;
}
@@ -286,7 +279,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
-1); // [n_embd_altup, n_layer, n_tokens]
cb(per_layer_proj, "per_layer_proj", -1);
inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
cb(inp_per_layer, "inp_per_layer", -1);

View File

@@ -25,12 +25,8 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL;
// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
(il + 1) % hparams.n_no_rope_layer_step != 0;
@@ -71,13 +67,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
if (use_rope) {
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
} else if (inp_attn_scale) {

View File

@@ -1,7 +1,6 @@
#include "models.h"
template <bool embed>
llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -15,14 +14,7 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;
if constexpr (embed) {
inp_attn = build_attn_inp_no_cache();
} else {
inp_attn = build_attn_inp_kv();
}
auto * inp_attn = build_attn_inp_kv();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@@ -153,16 +145,11 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
cb(cur, "result_norm", -1);
res->t_embd = cur;
if constexpr (!embed) {
// lm_head
cur = build_lora_mm(model.output, cur);
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
}
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
template struct llm_build_llama<false>;
template struct llm_build_llama<true>;

View File

@@ -1,117 +0,0 @@
#include "models.h"
llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@@ -1,123 +0,0 @@
#include "models.h"
llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_iswa();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
uint32_t n_head_l = hparams.n_head(il);
uint32_t n_head_kv_l = hparams.n_head_kv(il);
const float freq_base_l = model.get_rope_freq_base(cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
cur = inpL;
// self_attention
{
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
ggml_tensor * sinks = model.layers[il].attn_sinks;
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, sinks, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {
// dense branch
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// MoE branch
cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
cb(cur, "ffn_moe_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@@ -303,7 +303,6 @@ struct llm_build_llada_moe : public llm_graph_context {
llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
};
template <bool embed>
struct llm_build_llama : public llm_graph_context {
llm_build_llama(const llama_model & model, const llm_graph_params & params);
};
@@ -312,18 +311,10 @@ struct llm_build_llama_iswa : public llm_graph_context {
llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_maincoder : public llm_graph_context {
llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mamba : public llm_graph_context_mamba {
llm_build_mamba(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mimo2_iswa : public llm_graph_context {
llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_minicpm3 : public llm_graph_context {
llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
};
@@ -336,10 +327,6 @@ struct llm_build_mistral3 : public llm_graph_context {
llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_modern_bert : public llm_graph_context {
llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mpt : public llm_graph_context {
llm_build_mpt(const llama_model & model, const llm_graph_params & params);
};
@@ -409,11 +396,6 @@ struct llm_build_plamo : public llm_graph_context {
llm_build_plamo(const llama_model & model, const llm_graph_params & params);
};
template <bool iswa>
struct llm_build_plamo3 : public llm_graph_context {
llm_build_plamo3(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_plm : public llm_graph_context {
llm_build_plm(const llama_model & model, const llm_graph_params & params);
};

View File

@@ -1,116 +0,0 @@
#include "models.h"
llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * inp_pos = build_inp_pos();
// construct input embeddings (token, type, position)
inpL = build_inp_embd(model.tok_embd);
cb(inpL, "inp_embd", -1);
// embed layer norm
inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1);
cb(inpL, "inp_norm", -1);
ggml_tensor * inp_out_ids = build_inp_out_ids();
auto * inp_attn = build_attn_inp_no_cache();
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base(cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
cur = inpL;
// attention layer norm
if (model.layers[il].attn_norm) {
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM, il);
cb(cur, "attn_norm", il);
}
// self attention
cur = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
const size_t type_size = ggml_type_size(cur->type);
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd));
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa));
// RoPE
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
// re-add the layer input
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il);
// attention layer norm
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
// attentions bypass the intermediate layer
cur = ggml_add(ctx0, cur, ffn_inp);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM, -1);
cb(cur, "final_norm_out", -1);
if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
// extracting cls token
cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
cb(cur, "cls_pooled_embd", -1);
}
cb(cur, "res_embd", -1);
res->t_embd = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@@ -14,9 +14,6 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL;
// norm
@@ -52,13 +49,13 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);

View File

@@ -1,128 +0,0 @@
#include "models.h"
template <bool iswa>
llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t head_dim_q = hparams.n_embd_head_k;
const int64_t head_dim_v = hparams.n_embd_head_v;
ggml_tensor * cur;
ggml_tensor * inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos();
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;
if constexpr (iswa) {
inp_attn = build_attn_inp_kv_iswa();
} else {
inp_attn = build_attn_inp_kv();
}
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * residual = inpL;
float freq_base_l = 0.0f;
float freq_scale_l = 0.0f;
if constexpr (iswa) {
freq_base_l = model.get_rope_freq_base (cparams, il);
freq_scale_l = model.get_rope_freq_scale(cparams, il);
} else {
freq_base_l = freq_base;
freq_scale_l = freq_scale;
}
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
const int32_t n_head = hparams.n_head(il);
const int32_t n_head_kv = hparams.n_head_kv(il);
const int64_t q_offset = 0;
const int64_t k_offset = head_dim_q * n_head;
const int64_t v_offset = k_offset + head_dim_q * n_head_kv;
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head, n_tokens,
head_dim_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head_kv, n_tokens,
head_dim_q * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim_v, n_head_kv, n_tokens,
head_dim_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "attn_q_norm", il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "attn_k_norm", il);
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
ext_factor, attn_factor, beta_fast, beta_slow);
const float attn_scale = 1.0f / sqrtf(float(head_dim_q));
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il);
cb(cur, "attn_out", il);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
}
cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_post_norm", il);
cur = ggml_add(ctx0, cur, residual);
cb(cur, "attn_residual", il);
residual = cur;
cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il);
cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_post_norm", il);
cur = ggml_add(ctx0, cur, residual);
cb(cur, "ffn_residual", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
inpL = cur;
}
cur = inpL;
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
// Explicit template instantiations
template struct llm_build_plamo3<false>;
template struct llm_build_plamo3<true>;

View File

@@ -26,16 +26,10 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * inpSA = inpL;
ggml_tensor * probs = nullptr;
// This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
const bool use_rope = hparams.n_no_rope_layer_step == n_layer ||
il % hparams.n_no_rope_layer_step != 0;
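// e.g. (illustrative): with n_no_rope_layer_step = 4, layers 0, 4, 8, ... skip RoPE;
// when n_no_rope_layer_step == n_layer, every layer applies RoPE.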
ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
cb(probs, "ffn_moe_logits", il);
// norm
@@ -58,11 +52,11 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
if (use_rope) {
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
}
cb(Qcur, "Qcur", il);

View File

@@ -985,11 +985,6 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
{ "\\p{P}", unicode_cpt_flags::PUNCTUATION },
{ "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
{ "\\p{S}", unicode_cpt_flags::SYMBOL },
{ "\\p{Lu}", unicode_cpt_flags::LETTER }, // Uppercase letter
{ "\\p{Ll}", unicode_cpt_flags::LETTER }, // Lowercase letter
{ "\\p{Lt}", unicode_cpt_flags::LETTER }, // Titlecase letter
{ "\\p{Lm}", unicode_cpt_flags::LETTER }, // Modifier letter
{ "\\p{Lo}", unicode_cpt_flags::LETTER }, // Other letter
};
static const std::map<int, int> k_ucat_cpt = {
@@ -1100,26 +1095,22 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
continue;
}
// Match \p{...} Unicode properties of varying lengths
if (regex_expr[i + 0] == '\\' && i + 3 < regex_expr.size() &&
if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
regex_expr[i + 1] == 'p' &&
regex_expr[i + 2] == '{') {
// Find the closing brace
size_t closing_brace = regex_expr.find('}', i + 3);
if (closing_brace != std::string::npos && closing_brace <= i + 10) { // reasonable limit
const std::string pat = regex_expr.substr(i, closing_brace - i + 1);
if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
if (!inside) {
regex_expr_collapsed += '[';
}
regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
if (!inside) {
regex_expr_collapsed += ']';
}
i = closing_brace;
continue;
regex_expr[i + 2] == '{' &&
regex_expr[i + 4] == '}') {
const std::string pat = regex_expr.substr(i, 5);
if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
if (!inside) {
regex_expr_collapsed += '[';
}
regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
if (!inside) {
regex_expr_collapsed += ']';
}
i += 4;
continue;
}
}

View File

@@ -45,14 +45,13 @@
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
// audio-specific
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
@@ -139,62 +138,6 @@
#define TN_TOK_BOI "v.boi"
#define TN_TOK_EOI "v.eoi"
// (conformer) lfm2
#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s"
#define TN_FFN_UP_1 "%s.blk.%d.ffn_up_1.%s"
#define TN_FFN_DOWN_1 "%s.blk.%d.ffn_down_1.%s"
#define TN_POS_BIAS_U "%s.blk.%d.pos_bias_u"
#define TN_POS_BIAS_V "%s.blk.%d.pos_bias_v"
#define TN_NORM_CONV "%s.blk.%d.norm_conv.%s"
#define TN_LINEAR_POS "%s.blk.%d.linear_pos.%s"
#define TN_CONV_DW "%s.blk.%d.conv_dw.%s"
#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s"
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
// mobilenetv5 (gemma3n) definitions
#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight"
#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias"
#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight"
// Stage 0 Block (Edge Residual)
#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight"
#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight"
#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight"
#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight"
// Stage 1+ Block (Universal Inverted Residual)
#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight"
#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight"
#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight"
#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight"
#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight"
#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight"
#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight"
#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight"
#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma"
// Attention Components
#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight"
#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight"
#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight"
#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight"
#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight"
#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight"
#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight"
#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight"
#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
// MSFA
#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight"
#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight"
#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight"
#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
// align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
@@ -212,8 +155,6 @@ enum projector_type {
PROJECTOR_TYPE_QWEN2VL,
PROJECTOR_TYPE_QWEN3VL,
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_GEMMA3NV,
PROJECTOR_TYPE_GEMMA3NA,
PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL,
PROJECTOR_TYPE_QWEN25VL,
@@ -224,15 +165,12 @@ enum projector_type {
PROJECTOR_TYPE_GLMA,
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
PROJECTOR_TYPE_VOXTRAL,
PROJECTOR_TYPE_MUSIC_FLAMINGO,
PROJECTOR_TYPE_LFM2,
PROJECTOR_TYPE_KIMIVL,
PROJECTOR_TYPE_LIGHTONOCR,
PROJECTOR_TYPE_COGVLM,
PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_LFM2A,
PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -246,8 +184,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
@@ -257,15 +193,12 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_GLMA, "glma"},
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
{ PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
{ PROJECTOR_TYPE_LFM2, "lfm2"},
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {

View File

@@ -4,7 +4,6 @@
#include "clip.h"
#include "clip-impl.h"
#include <array>
#include <vector>
#include <unordered_set>
#include <cstdint>
@@ -61,7 +60,6 @@ struct clip_hparams {
std::unordered_set<int32_t> vision_feature_layer;
int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0;
std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
// audio
int32_t n_mel_bins = 0; // whisper preprocessor
@@ -144,74 +142,11 @@ struct clip_layer {
ggml_tensor * deepstack_fc2_w = nullptr;
ggml_tensor * deepstack_fc2_b = nullptr;
// lfm2
ggml_tensor * ff_norm_w = nullptr;
ggml_tensor * ff_norm_b = nullptr;
ggml_tensor * ff_norm_1_w = nullptr;
ggml_tensor * ff_norm_1_b = nullptr;
ggml_tensor * ff_up_1_w = nullptr;
ggml_tensor * ff_up_1_b = nullptr;
ggml_tensor * ff_down_1_w = nullptr;
ggml_tensor * ff_down_1_b = nullptr;
ggml_tensor * pos_bias_u = nullptr;
ggml_tensor * pos_bias_v = nullptr;
ggml_tensor * norm_conv_w = nullptr;
ggml_tensor * norm_conv_b = nullptr;
ggml_tensor * linear_pos_w = nullptr;
ggml_tensor * conv_norm_w = nullptr;
ggml_tensor * conv_norm_b = nullptr;
ggml_tensor * conv_dw_w = nullptr;
ggml_tensor * conv_dw_b = nullptr;
ggml_tensor * conv_pw1_w = nullptr;
ggml_tensor * conv_pw1_b = nullptr;
ggml_tensor * conv_pw2_w = nullptr;
ggml_tensor * conv_pw2_b = nullptr;
bool has_deepstack() const {
return deepstack_fc1_w != nullptr;
}
};
// Expanded MobileNetV5 block structure for Gemma3n vision encoder
struct mobilenetv5_block {
// Stage 0 (Edge Residual)
ggml_tensor * s0_conv_exp_w = nullptr;
ggml_tensor * s0_bn1_w = nullptr;
ggml_tensor * s0_conv_pwl_w = nullptr;
ggml_tensor * s0_bn2_w = nullptr;
// Stage 1+ (Universal Inverted Residual)
ggml_tensor * dw_start_w = nullptr;
ggml_tensor * dw_start_bn_w = nullptr;
ggml_tensor * pw_exp_w = nullptr;
ggml_tensor * pw_exp_bn_w = nullptr;
ggml_tensor * dw_mid_w = nullptr;
ggml_tensor * dw_mid_bn_w = nullptr;
ggml_tensor * pw_proj_w = nullptr;
ggml_tensor * pw_proj_bn_w = nullptr;
ggml_tensor * layer_scale_w = nullptr;
// Attention (MQA) components
ggml_tensor * attn_q_w = nullptr;
ggml_tensor * attn_k_w = nullptr;
ggml_tensor * attn_v_w = nullptr;
ggml_tensor * attn_o_w = nullptr;
// Optional downsampling/norm in attention
ggml_tensor * attn_k_dw_w = nullptr;
ggml_tensor * attn_k_norm_w = nullptr;
ggml_tensor * attn_v_dw_w = nullptr;
ggml_tensor * attn_v_norm_w = nullptr;
// Block norm (often present in attention blocks)
ggml_tensor * attn_norm_w = nullptr;
};
struct clip_model {
clip_modality modality = CLIP_MODALITY_VISION;
projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -328,23 +263,6 @@ struct clip_model {
ggml_tensor * mm_input_proj_w = nullptr;
ggml_tensor * mm_soft_emb_norm_w = nullptr;
// mobilenetv5 for gemma3n
std::vector<mobilenetv5_block> mobilenet_blocks;
std::vector<int> mobilenet_stage_ends;
ggml_tensor * mobilenet_stem_conv_w = nullptr;
ggml_tensor * mobilenet_stem_conv_b = nullptr;
ggml_tensor * mobilenet_stem_norm_w = nullptr;
ggml_tensor * mm_post_proj_norm_w = nullptr;
// Multi-Scale Fusion Adapter (MSFA) components
ggml_tensor * msfa_concat_conv_w = nullptr;
ggml_tensor * msfa_concat_norm_w = nullptr;
ggml_tensor * msfa_ffn_expand_w = nullptr;
ggml_tensor * msfa_ffn_project_w = nullptr;
ggml_tensor * msfa_ffn_expand_bn = nullptr;
ggml_tensor * msfa_ffn_project_bn = nullptr;
// pixtral, glm4v
ggml_tensor * token_embd_img_break = nullptr;
ggml_tensor * mm_patch_merger_w = nullptr;
@@ -368,16 +286,9 @@ struct clip_model {
ggml_tensor * mm_boi = nullptr;
ggml_tensor * mm_eoi = nullptr;
// lfm2 audio
std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
ggml_tensor * pre_encode_out_w = nullptr;
ggml_tensor * pre_encode_out_b = nullptr;
bool audio_has_avgpool() const {
return proj_type == PROJECTOR_TYPE_QWEN2A
|| proj_type == PROJECTOR_TYPE_VOXTRAL
|| proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
}
bool audio_has_stack_frames() const {

View File

@@ -801,10 +801,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_siglip>(ctx, img);
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
@@ -835,7 +831,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
} break;
@@ -855,18 +850,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_llava>(ctx, img);
} break;
case PROJECTOR_TYPE_LFM2A:
{
builder = std::make_unique<clip_graph_conformer>(ctx, img);
} break;
case PROJECTOR_TYPE_GLM4V:
{
builder = std::make_unique<clip_graph_glm4v>(ctx, img);
} break;
case PROJECTOR_TYPE_YOUTUVL:
{
builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
} break;
default:
GGML_ABORT("missing cgraph builder");
}
@@ -1167,14 +1154,6 @@ struct clip_model_loader {
// test model (tinygemma3) has a different value, we optionally read it
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
// Similar configuration to Gemma3
hparams.n_merge = 1; // MobileNetV5 handles resizing internally
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
@@ -1192,20 +1171,6 @@ struct clip_model_loader {
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
}
} break;
case PROJECTOR_TYPE_YOUTUVL:
{
hparams.n_merge = 2;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
std::vector<int> wa_layer_indexes_vec;
get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
for (auto & layer : wa_layer_indexes_vec) {
hparams.wa_layer_indexes.insert(layer);
}
// support up to max_height * max_width = 8000 * 8000; 8000/16/2 = 250 image tokens per side (250 * 250 = 62500 total)
hparams.set_limit_image_tokens(1, 62500);
hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_GLM4V:
{
hparams.rope_theta = 10000.0f;
@@ -1224,7 +1189,6 @@ struct clip_model_loader {
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
@@ -1240,15 +1204,6 @@ struct clip_model_loader {
hparams.audio_window_len = 400;
hparams.audio_hop_len = 160;
} break;
case PROJECTOR_TYPE_LFM2A:
{
// audio preprocessing params
hparams.audio_chunk_len = 1; // in seconds
hparams.audio_sample_rate = 16000;
hparams.audio_n_fft = 512;
hparams.audio_window_len = 400;
hparams.audio_hop_len = 160;
} break;
default:
break;
}
@@ -1274,14 +1229,7 @@ struct clip_model_loader {
LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
if (!hparams.wa_layer_indexes.empty()) {
LOG_INF("%s: wa_layer_indexes: ", __func__);
for (auto & layer : hparams.wa_layer_indexes) {
LOG_INF("%d ", layer);
}
LOG_INF("\n");
}
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
if (hparams.image_min_pixels > 0) {
LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
}
@@ -1363,10 +1311,6 @@ struct clip_model_loader {
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
hparams.n_layer = 0; // gemma3n does not use normal layer structure
}
// layers
model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) {
@@ -1441,7 +1385,6 @@ struct clip_model_loader {
}
}
switch (model.proj_type) {
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
@@ -1554,14 +1497,6 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_YOUTUVL:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_GLM4V:
{
model.projection = get_tensor(TN_MM_PROJECTOR);
@@ -1581,112 +1516,11 @@ struct clip_model_loader {
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // consume the BN tensor if present (likely already folded into the conv)
model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
// Dynamically load blocks stage by stage
for (int stage = 0; stage < 4; ++stage) {
int blocks_found_in_stage = 0;
for (int blk_idx = 0; ; ++blk_idx) {
bool found_block = false;
mobilenetv5_block block;
// 1. Check for Edge Residual (S0)
block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
if (block.s0_conv_exp_w) {
found_block = true;
block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
}
// 2. Check for UIR (Universal Inverted Residual)
else {
// Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
if (block.dw_start_w || block.pw_exp_w) {
found_block = true;
if (block.dw_start_w) {
block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
}
if (block.pw_exp_w) {
block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
}
block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
if (block.dw_mid_w) {
block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
}
block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
if (block.pw_proj_w) {
block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
}
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
}
}
// 3. Check for Attention (MQA)
// Even if UIR/Edge check failed, this might be a pure attention block
ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
if (attn_q_check) {
found_block = true;
block.attn_q_w = attn_q_check;
block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
// Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
if (!block.layer_scale_w) {
block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
}
}
if (found_block) {
model.mobilenet_blocks.push_back(block);
blocks_found_in_stage++;
} else {
// End of blocks for this stage
break;
}
}
// Track where this stage ends in the flat vector
if (blocks_found_in_stage > 0) {
model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
}
}
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break;
case PROJECTOR_TYPE_IDEFICS3:
{
model.projection = get_tensor(TN_MM_PROJECTOR);
} break;
case PROJECTOR_TYPE_LFM2:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_KIMIVL:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
@@ -1746,17 +1580,6 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
} break;
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
} break;
case PROJECTOR_TYPE_INTERNVL:
{
model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -1805,52 +1628,6 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
} break;
case PROJECTOR_TYPE_LFM2A:
{
for (int i : {0, 2, 3, 5, 6}) {
model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight"));
model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias"));
}
model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight"));
model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias"));
model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight"));
model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias"));
for (int il = 0; il < hparams.n_layer; ++il) {
auto & layer = model.layers[il];
layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"));
layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"));
layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"));
layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight"));
layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"));
layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"));
layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il));
layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il));
layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"));
layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"));
layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight"));
layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"));
layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"));
layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight"));
layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias"));
layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight"));
layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"));
layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight"));
layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
}
} break;
default:
GGML_ASSERT(false && "unknown projector type");
}
@@ -2155,7 +1932,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
try {
clip_model_loader loader(fname);
bool skip_audio = false;
if (loader.has_vision) {
ctx_vision = new clip_ctx(ctx_params);
@@ -2165,14 +1941,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
loader.warmup(*ctx_vision);
}
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
// we can remove this check when we implement audio support for Gemma 3N
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
}
if (loader.has_audio && !skip_audio) {
if (loader.has_audio) {
ctx_audio = new clip_ctx(ctx_params);
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
loader.load_tensors(*ctx_audio);
@@ -2896,57 +2668,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
// res_imgs->data[0] = *res;
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_YOUTUVL:
{
const int patch_size = params.patch_size; // typically 16
const int merge_size = params.n_merge; // typically 2
const int align_size = patch_size * merge_size; // 32
const int max_num_patches = params.image_max_pixels > 0 ?
params.image_max_pixels / (patch_size * patch_size) : 256;
// Linear search for optimal scale to fit within max_num_patches
float scale = 1.0f;
int target_height = original_size.height;
int target_width = original_size.width;
auto get_scaled_image_size = [align_size](float scale, int size) -> int {
float scaled_size = size * scale;
// Round up to nearest multiple of align_size
int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
// Ensure at least one patch
return std::max(align_size, aligned);
};
// Linear search with 0.02 step size
while (scale > 0.0f) {
target_height = get_scaled_image_size(scale, original_size.height);
target_width = get_scaled_image_size(scale, original_size.width);
int num_patches_h = target_height / patch_size;
int num_patches_w = target_width / patch_size;
int num_patches = num_patches_h * num_patches_w;
if (num_patches > max_num_patches) {
scale -= 0.02f;
} else {
break;
}
}
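// e.g. (illustrative): an 800x600 image with the default max_num_patches = 256 aligns to
// 800x608 at scale 1.0 (50*38 = 1900 patches) and keeps stepping the scale down by 0.02
// until the 32-aligned size yields at most 256 patches (roughly a 288x224 resize, 252 patches).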
clip_image_size new_size = {target_width, target_height};
// Resize the image
clip_image_u8 resized;
img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
// Normalize to float32
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
// Add to results
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_IDEFICS3:
{
@@ -3010,16 +2731,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
clip_image_u8 resized_image;
int sz = params.image_size;
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(img_f32));
} break;
case PROJECTOR_TYPE_JANUS_PRO:
{
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
@@ -3189,7 +2900,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2;
default:
break;
@@ -3205,7 +2915,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
return (img->ny / params.patch_size) / 2;
default:
break;
@@ -3266,7 +2975,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_YOUTUVL:
{
// dynamic size (2 conv, so double patch size)
int x_patch = img->nx / (params.patch_size * 2);
@@ -3282,12 +2990,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
int scale_factor = ctx->model.hparams.n_merge;
n_patches /= (scale_factor * scale_factor);
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
// MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
// regardless of input size (see architecture description)
n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
} break;
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
{
@@ -3313,7 +3015,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
n_patches = img->nx;
@@ -3346,10 +3047,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
{
n_patches += 2; // for BOI and EOI token embeddings
} break;
case PROJECTOR_TYPE_LFM2A:
{
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
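// i.e. three successive ceil-divisions by 2, which appears to match the three stride-2
// convs in the conformer pre-encoder; e.g. (illustrative) nx = 3000 frames -> 375 output tokens.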
} break;
default:
GGML_ABORT("unsupported projector type");
}
@@ -3400,6 +3097,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int pos_w = image_size_width / patch_size;
const int pos_h = image_size_height / patch_size;
const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
auto get_inp_tensor = [&gf](const char * name) {
ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@@ -3548,11 +3246,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_YOUTUVL:
{
// pw * ph = number of tokens output by the ViT after applying the patch merger
// ipw * ipw = number of vision tokens processed inside the ViT
const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
const int merge_ratio = 2;
const int pw = image_size_width / patch_size / merge_ratio;
const int ph = image_size_height / patch_size / merge_ratio;
@@ -3563,7 +3259,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
std::vector<int> inv_idx(ph * pw);
if (use_window_attn) {
const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
const int attn_window_size = 112;
const int grid_window = attn_window_size / patch_size / merge_ratio;
int dst = 0;
// [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3680,7 +3376,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("patches", patches);
} break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_QWEN2A:
@@ -3688,7 +3383,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_COGVLM:
{
@@ -3711,27 +3405,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
set_input_i32("pos_w", pos_data);
} break;
case PROJECTOR_TYPE_LFM2A:
{
GGML_ASSERT(imgs.entries.size() == 1);
const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());
auto d_model = 512;
auto seq_len = n_frames * 2 - 1;
std::vector<float> pos_emb(d_model*seq_len);
std::vector<double> inv_freq(d_model / 2);
for (size_t i = 0; i < inv_freq.size(); ++i) {
inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i)));
}
for (int64_t pos = 0; pos < seq_len; ++pos) {
for (size_t i = 0; i < inv_freq.size(); ++i) {
const float ang = (n_frames - pos - 1) * inv_freq[i];
pos_emb[pos*d_model + 2*i + 0] = sinf(ang); // even
pos_emb[pos*d_model + 2*i + 1] = cosf(ang); // odd
}
}
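// note: the relative offset (n_frames - pos - 1) sweeps from +(n_frames - 1) down to
// -(n_frames - 1) across the 2*n_frames - 1 rows, so every query/key offset gets one
// sin/cos pair per frequency for the conformer's relative-position attention.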
set_input_f32("pos_emb", pos_emb);
} break;
default:
GGML_ABORT("Unknown projector type");
}
@@ -3802,19 +3475,16 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_YOUTUVL:
return ctx->model.mm_1_b->ne[0];
case PROJECTOR_TYPE_QWEN3VL:
// main path + deepstack paths
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
return ctx->model.mm_input_proj_w->ne[0];
case PROJECTOR_TYPE_IDEFICS3:
return ctx->model.projection->ne[1];
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_INTERNVL:
return ctx->model.mm_3_w->ne[1];
@@ -3829,8 +3499,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1];
case PROJECTOR_TYPE_LFM2A:
return ctx->model.position_embeddings->ne[0];
case PROJECTOR_TYPE_GLM4V:
return ctx->model.mm_ffn_down_w->ne[1];
default:
@@ -3839,7 +3507,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
}
int clip_is_minicpmv(const struct clip_ctx * ctx) {
// TODO: remove this function
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
return ctx->model.hparams.minicpmv_version;
}
@@ -3847,26 +3514,24 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
}
bool clip_is_glm(const struct clip_ctx * ctx) {
// TODO: remove this function
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
}
bool clip_is_mrope(const struct clip_ctx * ctx) {
switch (ctx->proj_type()) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
return true;
default:
return false;
}
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
|| ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
}
bool clip_is_llava(const struct clip_ctx * ctx) {
return ctx->model.hparams.has_llava_projector;
}
bool clip_is_gemma3(const struct clip_ctx * ctx) {
return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
}
bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
return ctx->model.modality == CLIP_MODALITY_VISION;
}
@@ -3876,16 +3541,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
}
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
switch (ctx->proj_type()) {
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return true;
default:
return false;
}
return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
|| ctx->proj_type() == PROJECTOR_TYPE_GLMA
|| ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
}
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {

View File

@@ -106,8 +106,7 @@ int clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm(const struct clip_ctx * ctx);
bool clip_is_mrope(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx);
// note for contributors: this clip_is_(model) pattern is deprecated
// do NOT add new functions like this
bool clip_is_gemma3(const struct clip_ctx * ctx);
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

View File

@@ -1,217 +0,0 @@
#include "models.h"
ggml_cgraph * clip_graph_conformer::build() {
const int n_frames = img.nx;
const int n_pos = n_frames / 2;
const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
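// note: n_pos_embd works out to 2*n_out - 1, where n_out is the token count after the
// three stride-2 subsampling convs below -- one row per relative offset.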
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd);
ggml_set_name(pos_emb, "pos_emb");
ggml_set_input(pos_emb);
ggml_build_forward_expand(gf, pos_emb);
ggml_tensor * inp = build_inp_raw(1);
cb(inp, "input", -1);
auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
// pre encode, conv subsampling
{
// layer.0 - conv2d
cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]);
cb(cur, "conformer.pre_encode.conv.{}", 0);
// layer.1 - relu
cur = ggml_relu_inplace(ctx0, cur);
// layer.2 conv2d dw
cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]);
cb(cur, "conformer.pre_encode.conv.{}", 2);
// layer.3 conv2d
cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]);
cb(cur, "conformer.pre_encode.conv.{}", 3);
// layer.4 - relu
cur = ggml_relu_inplace(ctx0, cur);
// layer.5 conv2d dw
cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]);
cb(cur, "conformer.pre_encode.conv.{}", 5);
// layer.6 conv2d
cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]);
cb(cur, "conformer.pre_encode.conv.{}", 6);
// layer.7 - relu
cur = ggml_relu_inplace(ctx0, cur);
// flatten channel and frequency axis
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
// calculate out
cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
cb(cur, "conformer.pre_encode.out", -1);
}
// pos_emb
cb(pos_emb, "pos_emb", -1);
for (int il = 0; il < hparams.n_layer; il++) {
const auto & layer = model.layers[il];
auto * residual = cur;
cb(cur, "layer.in", il);
// feed_forward1
cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_feed_forward1", il);
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU,
il);
cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);
const auto fc_factor = 0.5f;
residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
// self-attention
{
cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_self_att", il);
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
Q_bias_u = ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3);
ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);
// TODO @ngxson : some cont can/should be removed when ggml_mul_mat supports these cases
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));
// build_attn won't fit due to matrix_ac and matrix_bd separation
ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
p = ggml_permute(ctx0, p, 0, 2, 1, 3);
auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
// rel shift
{
const auto pos_len = matrix_bd->ne[0];
const auto q_len = matrix_bd->ne[1];
const auto h = matrix_bd->ne[2];
matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
matrix_bd = ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
matrix_bd->nb[2], matrix_bd->nb[0] * q_len);
matrix_bd = ggml_cont_3d(ctx0, matrix_bd, pos_len, q_len, h);
}
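// note (Transformer-XL style relative attention): matrix_ac is conceptually the content
// term (Q + pos_bias_u) * K^T and matrix_bd the position term (Q + pos_bias_v) * P^T,
// where P is the projected pos_emb; the pad/roll/reshape/view sequence above is the usual
// "rel-shift" trick that realigns matrix_bd so each score is indexed by relative offset
// rather than by absolute position in pos_emb.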
matrix_bd = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0);
auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
cb(scores, "conformer.layers.{}.self_attn.id0", il);
ggml_tensor * attn = ggml_soft_max(ctx0, scores);
ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
x = ggml_permute(ctx0, x, 2, 0, 1, 3);
x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
out = ggml_add(ctx0, out, layer.o_b);
cb(out, "conformer.layers.{}.self_attn.linear_out", il);
cur = out;
}
residual = ggml_add(ctx0, residual, cur);
cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_conv", il);
// conv
{
auto * x = cur;
x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x);
x = ggml_add(ctx0, x, layer.conv_pw1_b);
cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
// ggml_glu doesn't support sigmoid
// TODO @ngxson : support this op in ggml
{
int64_t d = x->ne[0] / 2;
ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
}
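// i.e. out = x[:d] * sigmoid(x[d:]) along the channel dim -- a manual sigmoid-GLU built
// from two views, since ggml_glu has no sigmoid variant here.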
// use ggml_ssm_conv for f32 precision
x = ggml_pad(ctx0, x, 4, 0, 0, 0);
x = ggml_roll(ctx0, x, 4, 0, 0, 0);
x = ggml_pad(ctx0, x, 4, 0, 0, 0);
x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
x = ggml_add(ctx0, x, layer.conv_dw_b);
x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
x = ggml_silu(ctx0, x);
// pointwise_conv2
x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x);
x = ggml_add(ctx0, x, layer.conv_pw2_b);
cur = x;
}
residual = ggml_add(ctx0, residual, cur);
cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_feed_forward2", il);
cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b,
FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);
residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
cb(residual, "conformer.layers.{}.conv.id", il);
cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_out", il);
}
// audio adapter
cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
cb(cur, "audio_adapter.model.{}", 0);
cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);
cb(cur, "projected", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}

View File

@@ -1,451 +0,0 @@
#include "models.h"
// Helpers for MobileNetV5 Blocks
// RMS Norm 2D - normalizes over channels for each spatial position
ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
// inp: [W, H, C, B]
ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
cur = ggml_cont(ctx0, cur);
cur = ggml_rms_norm(ctx0, cur, eps);
if (weight) {
cur = ggml_mul(ctx0, cur, weight);
}
cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
cur = ggml_cont(ctx0, cur);
return cur;
}
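// note: ggml_rms_norm normalizes along dim 0, so the permute moves the channel axis into
// dim 0 ([W, H, C, B] -> [C, H, W, B]), each spatial position is normalized over its
// channels, and the second permute restores the original layout; 'weight' acts as a
// per-channel scale.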
// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
const int64_t ih = inp->ne[1]; // height
const int64_t iw = inp->ne[0]; // width
// Calculate output size (ceil division)
const int64_t oh = (ih + stride_h - 1) / stride_h;
const int64_t ow = (iw + stride_w - 1) / stride_w;
// Calculate padding needed
const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);
// Split padding asymmetrically
const int pad_h_top = pad_h / 2;
const int pad_h_bottom = pad_h - pad_h_top;
const int pad_w_left = pad_w / 2;
const int pad_w_right = pad_w - pad_w_left;
// Apply padding if needed
// ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
// For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
if (pad_h > 0 || pad_w > 0) {
inp = ggml_pad_ext(ctx0, inp,
pad_w_left, pad_w_right, // width padding (dim 0)
pad_h_top, pad_h_bottom, // height padding (dim 1)
0, 0, // no channel padding (dim 2)
0, 0); // no batch padding (dim 3)
}
return inp;
}
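// e.g. (illustrative): iw = 7, kernel_w = 3, stride_w = 2, dilation 1 gives ow = 4 and
// pad_w = max(0, 3*2 + 2 + 1 - 7) = 2, split as 1 left / 1 right; any odd remainder goes
// to the bottom/right, matching PyTorch/TF "SAME" behaviour.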
// Edge Residual Block (Stage 0)
ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
ggml_tensor * cur = inp;
// 1. Expansion Conv (3x3)
if (stride == 2) {
// Case: Downsampling (Block 0)
// Replicates Conv2dSame(kernel=3, stride=2)
cur = pad_same_2d(cur, 3, 3, stride, stride);
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
} else {
// Case: Normal 3x3 Block (Block 1, 2)
// Replicates Conv2d(kernel=3, stride=1, padding=1)
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
}
// BN + Activation
if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
cur = ggml_gelu(ctx0, cur);
// 2. Pointwise Linear Conv (1x1)
// 1x1 Convs usually have padding=0 and stride=1
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);
// 3. Residual Connection
// Only apply residual if spatial dimensions and channels match (stride 1)
if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
cur = ggml_add(ctx0, cur, inp);
}
return cur;
}
// Universal Inverted Residual Block (Stage 1+)
ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
ggml_tensor * cur = inp;
// 1. Depthwise Start (Optional)
// NOTE: dw_start always has stride=1 (no downsampling here)
if (block.dw_start_w) {
int k = block.dw_start_w->ne[0]; // 3 or 5
int p = k / 2;
cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
}
// 2. Pointwise Expansion (1x1)
if (block.pw_exp_w) {
// Standard 1x1 conv, pad=0, stride=1
cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
cur = ggml_gelu(ctx0, cur);
}
// 3. Depthwise Mid (Optional)
// NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
if (block.dw_mid_w) {
int k = block.dw_mid_w->ne[0]; // 3 or 5
if (stride > 1) {
// Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
cur = pad_same_2d(cur, k, k, stride, stride);
cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
} else {
// Case: Stride 1 -> Use Standard Symmetric Padding
int p = k / 2;
cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
}
if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
cur = ggml_gelu(ctx0, cur);
}
// 4. Pointwise Projection (1x1)
if (block.pw_proj_w) {
cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
}
// Apply Layer Scaling if present
if (block.layer_scale_w) {
cur = ggml_mul(ctx0, cur, block.layer_scale_w);
}
// 5. Residual Connection
bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
bool same_channel = (inp->ne[2] == cur->ne[2]);
if (same_spatial && same_channel) {
cur = ggml_add(ctx0, cur, inp);
}
return cur;
}
// Attention Block (MQA)
ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
ggml_tensor * cur = inp;
// Norm
if (block.attn_norm_w) {
cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
}
// 1. Q Calculation
ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);
// 2. K Calculation (Downsampled)
// Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
ggml_tensor * k_inp = cur;
if (block.attn_k_dw_w) {
int k_size = block.attn_k_dw_w->ne[0]; // Usually 3
k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding
k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0
if (block.attn_k_norm_w) {
k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
}
}
ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);
// 3. V Calculation (Downsampled)
// Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
ggml_tensor * v_inp = cur;
if (block.attn_v_dw_w) {
int v_size = block.attn_v_dw_w->ne[0]; // Usually 3
v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding
v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0
if (block.attn_v_norm_w) {
v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
}
}
ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);
const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
const int D = k->ne[2]; // Head dimension
const int n_head = q->ne[2] / D;
const int N = W * H;
// Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
q = ggml_cont(ctx0, q);
const int Wk = k->ne[0]; const int Hk = k->ne[1];
const int M = Wk * Hk;
// Process K: [Wk, Hk, D, B] -> [D, M, 1, B]
k = ggml_reshape_3d(ctx0, k, M, D, B);
k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
k = ggml_cont(ctx0, k);
// Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
v = ggml_reshape_3d(ctx0, v, M, D, B);
v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
v = ggml_cont(ctx0, v); // [M, D, 1, B]
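// e.g. (illustrative shapes): for a 16x16 feature map, N = 256 query positions attend to
// M = 64 key/value positions (K and V were spatially halved by the stride-2 depthwise
// convs above); a single shared K/V head serves all n_head query heads (multi-query attention).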
// Multi-Query Attention
float scale = 1.0f / sqrtf((float)D);
// Step 1: Compute Q @ K.T
ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
scores = ggml_scale(ctx0, scores, scale);
scores = ggml_soft_max(ctx0, scores);
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
kqv = ggml_cont(ctx0, kqv);
kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
kqv = ggml_cont(ctx0, kqv);
// Output projection
cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);
// Residual & Layer Scale
if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
if (block.layer_scale_w) {
cur = ggml_mul(ctx0, cur, block.layer_scale_w);
}
cur = ggml_add(ctx0, cur, inp);
}
return cur;
}
ggml_cgraph * clip_graph_mobilenetv5::build() {
ggml_tensor * inp = build_inp_raw();
// 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding
cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0
if (model.mobilenet_stem_conv_b) {
cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
}
if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
cur = ggml_gelu(ctx0, cur);
// 2. Blocks
std::vector<ggml_tensor*> intermediate_features;
const int total_blocks = model.mobilenet_blocks.size();
auto is_stage_start = [&](int i) {
if (i == 0) return true;
for (int end_idx : model.mobilenet_stage_ends) {
if (i == end_idx + 1) return true;
}
return false;
};
auto is_fusion_point = [&](int i) {
if (model.mobilenet_stage_ends.size() >= 4) {
if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
} else {
if (i == total_blocks - 1) return true;
}
return false;
};
for (int i = 0; i < total_blocks; i++) {
const auto & block = model.mobilenet_blocks[i];
int stride = is_stage_start(i) ? 2 : 1;
if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride);
else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block);
else cur = build_inverted_residual(cur, block, stride);
if (is_fusion_point(i)) {
intermediate_features.push_back(cur);
}
}
// 3. Multi-Scale Fusion Adapter (MSFA)
if (!intermediate_features.empty()) {
// A. Reference resolution: the PyTorch implementation uses inputs[0].
// We assume intermediate_features[0] is the "high resolution" target.
// In MobileNet designs this is typically the feature map with the smallest stride (e.g. 32x32).
ggml_tensor* target_feat = intermediate_features[0];
int high_res_w = target_feat->ne[0];
int high_res_h = target_feat->ne[1];
std::vector<ggml_tensor*> resized_feats;
// B. Resize inputs to match inputs[0] (High Resolution)
for (auto feat : intermediate_features) {
int feat_w = feat->ne[0];
int feat_h = feat->ne[1];
// PyTorch: if feat_size < high_resolution: interpolate
if (feat_w < high_res_w || feat_h < high_res_h) {
// Calculate the integer scale factor.
// Note: PyTorch 'nearest' interpolation works with arbitrary float scales,
// while ggml_upscale takes an integer factor (or a target size, depending on the helper used).
// Assuming standard power-of-2 scaling here (e.g. 16 -> 32 means scale=2).
int scale_w = high_res_w / feat_w;
// int scale_h = high_res_h / feat_h;
// Safety check for non-integer scaling if strictly replicating
GGML_ASSERT(high_res_w % feat_w == 0);
// Upsample (Nearest Neighbor)
// scale_w (typically 2) is the integer upscale factor
feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
}
resized_feats.push_back(feat);
}
// C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
cur = resized_feats[0];
for (size_t k = 1; k < resized_feats.size(); ++k) {
cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
}
// D. FFN (UniversalInvertedResidual)
// Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm
// 1. Expansion
if (model.msfa_ffn_expand_w) {
// 1x1 Conv
cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);
if (model.msfa_ffn_expand_bn) {
cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
}
cur = ggml_gelu(ctx0, cur);
}
// 2. Projection (No DW because kernel_size=0)
if (model.msfa_ffn_project_w) {
// 1x1 Conv
cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);
// UniversalInvertedResidual typically has a norm after projection
if (model.msfa_ffn_project_bn) {
cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
}
}
// E. Final Downsample to Target Resolution (Output Resolution)
// PyTorch: matches self.output_resolution (e.g. 16x16)
const int target_out_res = 16;
int current_w = cur->ne[0];
if (current_w > target_out_res) {
int s = current_w / target_out_res;
GGML_ASSERT(current_w % target_out_res == 0);
// Avg Pool: Kernel=s, Stride=s
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
}
// F. Final Norm
if (model.msfa_concat_norm_w) {
cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
}
}
// 4. Gemma 3n Multimodal Projection (Embedder)
// Input: 'cur' is [Width, Height, Channels, Batch]
int W = cur->ne[0];
int H = cur->ne[1];
int C = cur->ne[2];
int B = cur->ne[3];
GGML_ASSERT(C == hparams.n_embd);
// 1. Permute and Flatten to [Channels, Tokens, Batch]
// PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
cur = ggml_cont(ctx0, cur);
cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
cur = ggml_cont(ctx0, cur);
// 2. FEATURE SCALING
// PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
const float scale_factor = sqrtf((float)C);
cur = ggml_scale(ctx0, cur, scale_factor);
// 3. SOFT EMBEDDING NORM
// PyTorch: self._norm(x) * self.weight
// We must normalize regardless, then multiply if weight exists.
{
const float eps = 1e-6f; // Gemma3n uses 1e-6
cur = ggml_rms_norm(ctx0, cur, eps);
if (model.mm_soft_emb_norm_w) {
// Weight shape is (2048,) -> Element-wise broadcast multiply
cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
}
}
// 4. PROJECTION
// PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
// Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
if (model.mm_input_proj_w) {
cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
}
// 5. POST PROJECTION NORM
// PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
// with_scale=False means weight is registered as buffer with value 1.0
// So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
{
const float eps = 1e-6f;
cur = ggml_rms_norm(ctx0, cur, eps);
if (model.mm_post_proj_norm_w) {
// If weight is loaded, multiply (should be ~1.0 anyway)
cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
}
}
ggml_build_forward_expand(gf, cur);
return gf;
}
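For orientation, a rough worked example of the embedder shapes, using only numbers that appear above: with target_out_res = 16 the MSFA hands the embedder a 16x16 map, i.e. W*H = 256 visual tokens, and with the 2048-wide soft-embedding weight (hparams.n_embd is asserted to equal C) each token is scaled by sqrt(2048) ≈ 45.25 before the soft-embedding RMS norm; mm_input_proj_w then maps each 2048-d token into the text model's hidden size.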

View File

@@ -2,11 +2,6 @@
#include "../clip-graph.h"
/*
* IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
* We encourage human contributors to ensure the quality and reliability of the codebase.
*/
struct clip_graph_siglip : clip_graph {
clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
@@ -27,11 +22,6 @@ struct clip_graph_qwen3vl : clip_graph {
ggml_cgraph * build() override;
};
struct clip_graph_youtuvl : clip_graph {
clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_minicpmv : clip_graph {
clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
@@ -67,45 +57,7 @@ struct clip_graph_whisper_enc : clip_graph {
ggml_cgraph * build() override;
};
struct clip_graph_conformer : clip_graph {
clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_glm4v : clip_graph {
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_mobilenetv5 : clip_graph {
clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * rms_norm_2d(
ggml_tensor * inp,
ggml_tensor * weight,
float eps = 1e-6f);
ggml_tensor* pad_same_2d(
ggml_tensor* inp,
int kernel_h,
int kernel_w,
int stride_h,
int stride_w,
int dilation_h = 1,
int dilation_w = 1);
ggml_tensor * build_edge_residual(
ggml_tensor * inp,
const mobilenetv5_block & block,
int stride);
ggml_tensor * build_inverted_residual(
ggml_tensor * inp,
const mobilenetv5_block & block,
int stride);
ggml_tensor * build_mobilenet_attn(
ggml_tensor * inp,
const mobilenetv5_block & block);
};
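rms_norm_2d is declared above, but its definition is not part of this hunk. A minimal sketch of one way it could be realized with the ggml ops already used in the graph (normalize over the channel dimension of a [W, H, C, B] tensor, then apply the per-channel weight); the actual helper in the patch may differ:
#include "ggml.h"
static ggml_tensor * rms_norm_2d_sketch(ggml_context * ctx0, ggml_tensor * inp,
                                        ggml_tensor * weight, float eps = 1e-6f) {
    // inp is [W, H, C, B]; move channels to dim 0 so ggml_rms_norm normalizes over them
    ggml_tensor * cur = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [C, W, H, B]
    cur = ggml_rms_norm(ctx0, cur, eps);
    cur = ggml_mul(ctx0, cur, weight);   // per-channel scale, broadcast over W, H, B
    return ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // back to [W, H, C, B]
}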

View File

@@ -50,15 +50,10 @@ ggml_cgraph * clip_graph_siglip::build() {
const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor);
// projection, in LFM2-VL input norm is optional
if (model.mm_input_norm_w) {
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
}
if (model.mm_input_norm_b) {
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
}
// projection
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,

View File

@@ -86,15 +86,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
FFN_GELU_ERF,
-1);
} else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
// projector
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU_ERF,
-1);
} else if (proj_type == PROJECTOR_TYPE_GLMA) {
cur = ggml_norm(ctx0, cur, hparams.eps);
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);

View File

@@ -1,179 +0,0 @@
#include "models.h"
ggml_cgraph * clip_graph_youtuvl::build() {
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
const bool use_window_attn = !hparams.wa_layer_indexes.empty();
const int n_pos = n_patches;
const int num_position_ids = n_pos * 4;
const int m = 2;
const int Wp = n_patches_x;
const int Hp = n_patches_y;
const int Hm = Hp / m;
const int Wm = Wp / m;
norm_type norm_t = NORM_TYPE_NORMAL;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp = build_inp_raw();
// the conv3d patch embedding is implemented as a linear projection (matmul) over flattened patches
// reshape and permute to get patches: from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
{
inp = ggml_reshape_4d(
ctx0, inp,
Wm * m * patch_size, m * patch_size, Hm, 3);
inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
inp = ggml_cont_4d(
ctx0, inp,
m * patch_size * 3, Wm, m * patch_size, Hm);
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_4d(
ctx0, inp,
m * patch_size * 3, patch_size, m, Hm * Wm);
inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
inp = ggml_cont_4d(
ctx0, inp,
patch_size, 3, patch_size, Hm * Wm * m * m);
inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
3*patch_size* patch_size, Hm * Wm * m * m, 1);
}
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
if (model.patch_bias) {
inp = ggml_add(ctx0, inp, model.patch_bias);
}
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
ggml_tensor * inpL = inp;
ggml_tensor * window_mask = nullptr;
ggml_tensor * window_idx = nullptr;
ggml_tensor * inv_window_idx = nullptr;
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// pre-layernorm
if (model.pre_ln_w) {
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
}
if (use_window_attn) {
inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
ggml_set_name(inv_window_idx, "inv_window_idx");
ggml_set_input(inv_window_idx);
// mask for window attention
window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
ggml_set_name(window_mask, "window_mask");
ggml_set_input(window_mask);
// if flash attn is used, we need to pad the mask and cast to f16
if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
}
// inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
GGML_ASSERT(batch_size == 1);
inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
}
// loop over layers
for (int il = 0; il < n_layer; il++) {
const auto & layer = model.layers[il];
const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
// self-attention
{
ggml_tensor * Qcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
ggml_tensor * Kcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
ggml_tensor * Vcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
Qcur = ggml_rope_multi(
ctx0, Qcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
Kcur = ggml_rope_multi(
ctx0, Kcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
inpL = cur; // inpL = residual, cur = hidden_states
// layernorm2
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
// ffn
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
nullptr, nullptr,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
// residual 2
cur = ggml_add(ctx0, inpL, cur);
inpL = cur;
}
ggml_tensor * embeddings = inpL;
if (use_window_attn) {
const int spatial_merge_unit = 4;
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
ggml_set_name(window_idx, "window_idx");
ggml_set_input(window_idx);
GGML_ASSERT(batch_size == 1);
embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
cb(embeddings, "window_order_restored", -1);
}
// post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
if (model.post_ln_w) {
embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
}
// Now apply merger (VLPatchMerger):
// 1. Apply RMS norm (ln_q in VLPatchMerger)
embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
cb(embeddings, "merger_normed", -1);
// 2. First reshape for spatial merge (merge 2x2 patches)
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
cb(embeddings, "merger_reshaped", -1);
embeddings = build_ffn(embeddings,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
FFN_GELU,
-1);
ggml_build_forward_expand(gf, embeddings);
return gf;
}
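window_idx and inv_window_idx are graph inputs here, so their construction happens on the host side; the expected relationship is that one is the inverse permutation of the other, so gathering rows by inv_window_idx before the layers and by window_idx afterwards restores the original patch order. A small illustrative helper (not part of the patch):
#include <cstdint>
#include <vector>
// Given a permutation idx (output row i is taken from source row idx[i], as in ggml_get_rows),
// return the inverse permutation.
static std::vector<int32_t> invert_permutation(const std::vector<int32_t> & idx) {
    std::vector<int32_t> inv(idx.size());
    for (int32_t i = 0; i < (int32_t) idx.size(); ++i) {
        inv[idx[i]] = i;
    }
    return inv;
}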

View File

@@ -9,250 +9,207 @@
#include <fstream>
#include <algorithm>
// some of the code here is copied from whisper.cpp
// most of the code here is copied from whisper.cpp
constexpr bool DEBUG = false;
void mtmd_audio_cache::fill_sin_cos_table(int n) {
sin_vals.resize(n);
cos_vals.resize(n);
for (int i = 0; i < n; i++) {
double theta = (2 * M_PI * i) / n;
sin_vals[i] = sinf(theta);
cos_vals[i] = cosf(theta);
}
}
struct mtmd_audio_mel_filters {
int32_t n_mel;
int32_t n_fft;
void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
hann_window.resize(length);
int offset = -1;
if (periodic) {
offset = 0;
}
for (int i = 0; i < length; i++) {
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
}
}
std::vector<float> data;
};
void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel,
int n_fft,
int sample_rate,
float fmin,
float fmax,
bool slaney_area_norm,
float scale) {
GGML_ASSERT(n_mel > 0 && n_fft > 1);
if (fmax <= 0.0f) {
fmax = 0.5f * sample_rate;
}
// note: this global cache is shared among all preprocessors
// if we want to use multiple preprocessors at the same time,
// we will need to enclose it in the preprocessor class in the future
static struct mtmd_audio_global_cache {
// precomputed sin/cos table for FFT
std::vector<float> sin_vals;
std::vector<float> cos_vals;
// Slaney scale (matches librosa default)
const double min_log_hz = 1000.0;
const double lin_slope = 3 / 200.;
const double min_log_mel = min_log_hz * lin_slope;
const double log_step = log(6.4) / 27.0;
auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
};
auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
};
// hann window
std::vector<float> hann_window;
// infer N_fft from n_fft_bins
const double bin_hz_step = double(sample_rate) / double(n_fft);
// mel filter bank
mtmd_audio_mel_filters filters;
// mel grid: n_mel + 2 edges
const double m_lo = hz_to_mel(fmin);
const double m_hi = hz_to_mel(fmax);
std::vector<double> mel_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
}
// convert to Hz
std::vector<double> hz_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
hz_pts[i] = mel_to_hz(mel_pts[i]);
}
const int n_fft_bins = n_fft / 2 + 1;
// filterbank
std::vector<float> out(n_mel * n_fft_bins, 0);
for (int m = 0; m < n_mel; ++m) {
const double f_left = hz_pts[m];
const double f_center = hz_pts[m + 1];
const double f_right = hz_pts[m + 2];
const double denom_l = std::max(1e-30, f_center - f_left);
const double denom_r = std::max(1e-30, f_right - f_center);
const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
for (int k = 0; k < n_fft_bins; ++k) {
const double f = k * bin_hz_step;
double w = 0.0;
if (f >= f_left && f <= f_center) {
w = (f - f_left) / denom_l;
} else if (f > f_center && f <= f_right) {
w = (f_right - f) / denom_r;
}
out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
void fill_sin_cos_table(int n) {
sin_vals.resize(n);
cos_vals.resize(n);
for (int i = 0; i < n; i++) {
double theta = (2 * M_PI * i) / n;
sin_vals[i] = sinf(theta);
cos_vals[i] = cosf(theta);
}
}
filters.n_mel = n_mel;
filters.n_fft = n_fft;
filters.data = std::move(out);
void fill_hann_window(int length, bool periodic) {
hann_window.resize(length);
int offset = -1;
if (periodic) {
offset = 0;
}
for (int i = 0; i < length; i++) {
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
}
}
if (DEBUG) { // debug
for (size_t i = 0; i < filters.data.size(); ++i) {
if (filters.data[i] != 0.0f) {
printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
void fill_mel_filterbank_matrix(
int n_mel,
int n_fft,
int sample_rate, // e.g. 16000
float fmin = 0.0f, // e.g. 0.0
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
bool slaney_area_norm = true,
float scale = 1.0f // optional extra scaling (e.g. 1.0f/1000.0f)
) {
GGML_ASSERT(n_mel > 0 && n_fft > 1);
if (fmax <= 0.0f) {
fmax = 0.5f * sample_rate;
}
// Slaney scale (matches librosa default)
const double min_log_hz = 1000.0;
const double lin_slope = 3 / 200.;
const double min_log_mel = min_log_hz * lin_slope;
const double log_step = log(6.4) / 27.0;
auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
};
auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
};
// infer N_fft from n_fft_bins
const double bin_hz_step = double(sample_rate) / double(n_fft);
// mel grid: n_mel + 2 edges
const double m_lo = hz_to_mel(fmin);
const double m_hi = hz_to_mel(fmax);
std::vector<double> mel_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
}
// convert to Hz
std::vector<double> hz_pts(n_mel + 2);
for (int i = 0; i < n_mel + 2; ++i) {
hz_pts[i] = mel_to_hz(mel_pts[i]);
}
const int n_fft_bins = n_fft / 2 + 1;
// filterbank
std::vector<float> out(n_mel * n_fft_bins, 0);
for (int m = 0; m < n_mel; ++m) {
const double f_left = hz_pts[m];
const double f_center = hz_pts[m + 1];
const double f_right = hz_pts[m + 2];
const double denom_l = std::max(1e-30, f_center - f_left);
const double denom_r = std::max(1e-30, f_right - f_center);
const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
for (int k = 0; k < n_fft_bins; ++k) {
const double f = k * bin_hz_step;
double w = 0.0;
if (f >= f_left && f <= f_center) {
w = (f - f_left) / denom_l;
} else if (f > f_center && f <= f_right) {
w = (f_right - f) / denom_r;
}
out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
}
}
filters.n_mel = n_mel;
filters.n_fft = n_fft;
filters.data = std::move(out);
if (DEBUG) { // debug
for (size_t i = 0; i < filters.data.size(); ++i) {
if (filters.data[i] != 0.0f) {
printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
}
}
}
}
}
} g_cache;
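A minimal initialization sketch with typical Whisper-style values; the concrete numbers are illustrative only, the real values come from hparams in initialize() below:
// 80 mel bins, 400-sample FFT window, 16 kHz audio -> a filterbank of 80 x (400/2 + 1) = 80 x 201 weights
g_cache.fill_sin_cos_table(400);
g_cache.fill_hann_window(400, /*periodic=*/true);
g_cache.fill_mel_filterbank_matrix(/*n_mel=*/80, /*n_fft=*/400, /*sample_rate=*/16000);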
// Unified DFT implementation for both forward and inverse transforms
// Template parameters:
// Inverse: false = DFT with exp(-2πi·k·n/N), no scaling
// true = IDFT with exp(+2πi·k·n/N), scales by 1/N
// RealInput: true = input is real-valued (stride 1), avoids imaginary computations
// false = input is complex-valued (interleaved real/imag, stride 2)
template <bool Inverse, bool RealInput>
static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) {
const int n_sin_cos_vals = cache.sin_vals.size();
const int sin_cos_step = n_sin_cos_vals / N;
constexpr float sign = Inverse ? 1.0f : -1.0f;
const float scale = Inverse ? (1.0f / N) : 1.0f;
// naive Discrete Fourier Transform
// input is real-valued
// output is complex-valued
static void dft(const float * in, int N, float * out) {
const int n_sin_cos_vals = g_cache.sin_vals.size();
const int sin_cos_step = n_sin_cos_vals / N;
for (int k = 0; k < N; k++) {
float re = 0;
float im = 0;
for (int n = 0; n < N; n++) {
int idx = (k * n * sin_cos_step) % n_sin_cos_vals;
float cos_val = cache.cos_vals[idx];
float sin_val = cache.sin_vals[idx];
if constexpr (RealInput) {
// Real input: in_im = 0, simplifies to:
// re += in_re * cos_val
// im += sign * in_re * sin_val
float in_re = in[n];
re += in_re * cos_val;
im += sign * in_re * sin_val;
} else {
float in_re = in[n * 2 + 0];
float in_im = in[n * 2 + 1];
// (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i
re += in_re * cos_val - sign * in_im * sin_val;
im += sign * in_re * sin_val + in_im * cos_val;
}
int idx = (k * n * sin_cos_step) % (n_sin_cos_vals); // t = 2*M_PI*k*n/N
re += in[n] * g_cache.cos_vals[idx]; // cos(t)
im -= in[n] * g_cache.sin_vals[idx]; // sin(t)
}
out[k * 2 + 0] = re * scale;
out[k * 2 + 1] = im * scale;
out[k*2 + 0] = re;
out[k*2 + 1] = im;
}
}
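For reference, an untabulated version of the forward, real-input transform that dft_impl<false, true> (and the older dft above) computes; this is a sketch for clarity only, not a replacement for the cached implementation:
#include <cmath>
#include <vector>
// Naive forward DFT of a real signal: out holds N interleaved (re, im) pairs.
static std::vector<float> dft_reference(const std::vector<float> & in) {
    const int N = (int) in.size();
    std::vector<float> out(2 * N, 0.0f);
    for (int k = 0; k < N; k++) {
        double re = 0.0, im = 0.0;
        for (int n = 0; n < N; n++) {
            const double t = 2.0 * M_PI * k * n / N;   // exp(-i*t)
            re += in[n] * cos(t);
            im -= in[n] * sin(t);
        }
        out[2 * k + 0] = (float) re;
        out[2 * k + 1] = (float) im;
    }
    return out;
}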
// Cooley-Tukey FFT/IFFT unified implementation
// Template parameters:
// Inverse: false = FFT with exp(-2πi·k/N), no scaling
// true = IFFT with exp(+2πi·k/N), scales by 0.5 at each level
// RealInput: true = input is real-valued (stride 1)
// false = input is complex-valued (interleaved real/imag, stride 2)
template <bool Inverse, bool RealInput>
static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
const int n_sin_cos_vals = cache.sin_vals.size();
// Cooley-Tukey FFT
// poor man's implementation - use something better
// input is real-valued
// output is complex-valued
static void fft(float * in, int N, float * out) {
const int n_sin_cos_vals = g_cache.sin_vals.size();
if (N == 1) {
out[0] = in[0];
if constexpr (RealInput) {
out[1] = 0.0f;
} else {
out[1] = in[1];
}
out[1] = 0;
return;
}
const int half_N = N / 2;
if (N - half_N * 2 == 1) {
// Odd N: fall back to DFT
dft_impl<Inverse, RealInput>(cache, in, N, out);
if (N - half_N*2 == 1) {
dft(in, N, out);
return;
}
// Split into even and odd
if constexpr (RealInput) {
// Real input: stride is 1, copy only real values
float * even = in + N;
for (int i = 0; i < half_N; ++i) {
even[i] = in[2 * i];
}
float * even_fft = out + 2 * N;
fft_impl<Inverse, true>(cache, even, half_N, even_fft);
float * odd = even;
for (int i = 0; i < half_N; ++i) {
odd[i] = in[2 * i + 1];
}
float * odd_fft = even_fft + N;
fft_impl<Inverse, true>(cache, odd, half_N, odd_fft);
} else {
// Complex input: stride is 2, copy complex pairs
float * even = in + N * 2;
for (int i = 0; i < half_N; ++i) {
even[i * 2 + 0] = in[2 * i * 2 + 0];
even[i * 2 + 1] = in[2 * i * 2 + 1];
}
float * even_fft = out + 2 * N;
fft_impl<Inverse, false>(cache, even, half_N, even_fft);
float * odd = even;
for (int i = 0; i < half_N; ++i) {
odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0];
odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1];
}
float * odd_fft = even_fft + N;
fft_impl<Inverse, false>(cache, odd, half_N, odd_fft);
float* even = in + N;
for (int i = 0; i < half_N; ++i) {
even[i]= in[2*i];
}
float* even_fft = out + 2 * N;
fft(even, half_N, even_fft);
float * even_fft = out + 2 * N;
float * odd_fft = even_fft + N;
float* odd = even;
for (int i = 0; i < half_N; ++i) {
odd[i] = in[2*i + 1];
}
float* odd_fft = even_fft + N;
fft(odd, half_N, odd_fft);
const int sin_cos_step = n_sin_cos_vals / N;
constexpr float sign = Inverse ? 1.0f : -1.0f;
constexpr float scale = Inverse ? 0.5f : 1.0f;
for (int k = 0; k < half_N; k++) {
int idx = k * sin_cos_step; // t = 2*M_PI*k/N
float re = cache.cos_vals[idx];
float im = sign * cache.sin_vals[idx];
int idx = k * sin_cos_step; // t = 2*M_PI*k/N
float re = g_cache.cos_vals[idx]; // cos(t)
float im = -g_cache.sin_vals[idx]; // sin(t)
float re_odd = odd_fft[2 * k + 0];
float im_odd = odd_fft[2 * k + 1];
float re_odd = odd_fft[2*k + 0];
float im_odd = odd_fft[2*k + 1];
out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd);
out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd);
out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;
out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd);
out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd);
out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
}
}
// Forward FFT for real input (used by mel spectrogram)
static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
fft_impl<false, true>(cache, in, N, out);
}
// Inverse FFT for complex input
static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
fft_impl<true, false>(cache, in, N, out);
}
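Note on scaling: the forward path applies no normalization, while the inverse path scales by 0.5 at each recursion level (and by 1/N in the odd-length DFT fallback), so for a power-of-two frame size ifft(fft(x)) reproduces the input up to floating-point error.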
struct filter_params {
int32_t n_mel;
int32_t n_fft_bins;
@@ -265,27 +222,20 @@ struct filter_params {
bool norm_per_feature = false;
};
static void log_mel_spectrogram_worker_thread(int ith,
const float * hann,
const std::vector<float> & samples,
int n_samples,
int frame_size,
int frame_step,
int n_threads,
const filter_params & params,
const mtmd_audio_cache & cache,
mtmd_audio_mel & out) {
static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
int n_samples, int frame_size, int frame_step, int n_threads,
const filter_params & params, mtmd_audio_mel & out) {
std::vector<float> fft_in(frame_size * 2, 0.0);
std::vector<float> fft_out(frame_size * 2 * 2 * 2);
int n_fft_bins = params.n_fft_bins;
int i = ith;
const auto & filters = cache.filters;
const auto & filters = g_cache.filters;
// make sure n_fft_bins == 1 + (frame_size / 2), i.e. bin_0 .. bin_nyquist
GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size());
GGML_ASSERT(g_cache.sin_vals.size() == g_cache.cos_vals.size());
// calculate FFT only when fft_in are not all zero
for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
const int offset = i * frame_step;
@@ -301,7 +251,7 @@ static void log_mel_spectrogram_worker_thread(int ith,
}
// FFT
fft(cache, fft_in.data(), frame_size, fft_out.data());
fft(fft_in.data(), frame_size, fft_out.data());
// Calculate modulus^2 of complex numbers
// Using pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) instead causes inference quality problems? Interesting.
@@ -348,7 +298,6 @@ static bool log_mel_spectrogram(
const int n_samples_in,
const int n_threads,
const filter_params & params,
const mtmd_audio_cache & cache,
mtmd_audio_mel & out) {
//const int64_t t_start_us = ggml_time_us();
@@ -356,9 +305,9 @@ static bool log_mel_spectrogram(
int n_samples = n_samples_in;
// Hann window
const float * hann = cache.hann_window.data();
const int frame_size = (params.n_fft_bins - 1) * 2;
const int frame_step = params.hop_length;
const float * hann = g_cache.hann_window.data();
const int frame_size = (params.n_fft_bins - 1) * 2;
const int frame_step = params.hop_length;
// Padding
std::vector<float> samples_padded;
@@ -386,9 +335,9 @@ static bool log_mel_spectrogram(
// preemphasis
if (params.preemph) {
const int pad_amount = frame_size / 2;
const int pad_amount = frame_size / 2;
const float preemph = 0.97f;
float prev = samples_padded[pad_amount];
float prev = samples_padded[pad_amount];
for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) {
float cur = samples_padded[i];
samples_padded[i] = cur - preemph * prev;
@@ -423,14 +372,14 @@ static bool log_mel_spectrogram(
{
std::vector<std::thread> workers(n_threads - 1);
for (int iw = 0; iw < n_threads - 1; ++iw) {
workers[iw] =
std::thread(log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), n_samples,
frame_size, frame_step, n_threads, std::cref(params), std::cref(cache), std::ref(out));
workers[iw] = std::thread(
log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded),
n_samples, frame_size, frame_step, n_threads,
std::cref(params), std::ref(out));
}
// main thread
log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params,
cache, out);
log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params, out);
for (int iw = 0; iw < n_threads - 1; ++iw) {
workers[iw].join();
}
@@ -455,7 +404,7 @@ static bool log_mel_spectrogram(
for (int j = 0; j < effective_n_len; ++j) {
auto &value = out.data[i * out.n_len + j];
value = (value - mean) / mstd;
value = (value - mean) / mstd;
}
// pad the rest with zeros
@@ -501,14 +450,18 @@ static bool log_mel_spectrogram(
//
void mtmd_audio_preprocessor_whisper::initialize() {
cache.fill_sin_cos_table(hparams.audio_n_fft);
cache.fill_hann_window(hparams.audio_window_len, true);
cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
g_cache.fill_sin_cos_table(hparams.audio_n_fft);
g_cache.fill_hann_window(hparams.audio_window_len, true);
g_cache.fill_mel_filterbank_matrix(
hparams.n_mel_bins,
hparams.audio_n_fft,
hparams.audio_sample_rate);
}
bool mtmd_audio_preprocessor_whisper::preprocess(const float * samples,
size_t n_samples,
std::vector<mtmd_audio_mel> & output) {
bool mtmd_audio_preprocessor_whisper::preprocess(
const float * samples,
size_t n_samples,
std::vector<mtmd_audio_mel> & output) {
if (n_samples == 0) {
// empty audio
return false;
@@ -518,7 +471,7 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
// if input is too short, pad with zeros
// this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram
// TODO: maybe handle this better
size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
size_t min_samples = (size_t)hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
if (n_samples < min_samples) {
smpl.resize(min_samples, 0.0f);
std::memcpy(smpl.data(), samples, n_samples * sizeof(float));
@@ -533,19 +486,22 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
params.hop_length = hparams.audio_hop_len;
params.sample_rate = hparams.audio_sample_rate;
params.center_padding = false;
params.preemph = 0.0f; // disabled
params.preemph = 0.0f; // disabled
params.use_natural_log = false;
params.norm_per_feature = false;
// make sure the cache is initialized
GGML_ASSERT(!cache.sin_vals.empty());
GGML_ASSERT(!cache.cos_vals.empty());
GGML_ASSERT(!cache.filters.data.empty());
// make sure the global cache is initialized
GGML_ASSERT(!g_cache.sin_vals.empty());
GGML_ASSERT(!g_cache.cos_vals.empty());
GGML_ASSERT(!g_cache.filters.data.empty());
mtmd_audio_mel out_full;
bool ok = log_mel_spectrogram(samples, n_samples,
4, // n_threads
params, cache, out_full);
bool ok = log_mel_spectrogram(
samples,
n_samples,
4, // n_threads
params,
out_full);
if (!ok) {
return false;
}
@@ -556,21 +512,21 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
}
const size_t frames_per_chunk = 3000;
GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk);
for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
if ((size_t) n_len < frames_per_chunk) {
break; // the last incomplete chunk will always be a padded chunk, safe to ignore
GGML_ASSERT((size_t)out_full.n_len > frames_per_chunk);
for (size_t off = 0; off < (size_t)out_full.n_len; off += frames_per_chunk) {
int n_len = std::min(frames_per_chunk, (size_t)out_full.n_len - off);
if ((size_t)n_len < frames_per_chunk) {
break; // the last incomplete chunk will always be a padded chunk, safe to ignore
}
mtmd_audio_mel out_chunk;
out_chunk.n_len = n_len;
out_chunk.n_mel = out_full.n_mel;
out_chunk.n_len_org = out_full.n_mel; // unused
out_chunk.n_len_org = out_full.n_mel; // unused
out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
for (int i = 0; i < out_full.n_mel; i++) {
auto src = out_full.data.begin() + i * out_full.n_len + off;
auto src = out_full.data.begin() + i*out_full.n_len + off;
out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
}
@@ -579,152 +535,3 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
return true;
}
//
// mtmd_audio_preprocessor_conformer
//
void mtmd_audio_preprocessor_conformer::initialize() {
cache.fill_sin_cos_table(hparams.audio_n_fft);
cache.fill_hann_window(hparams.audio_window_len, true);
cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
}
bool mtmd_audio_preprocessor_conformer::preprocess(const float * samples,
size_t n_samples,
std::vector<mtmd_audio_mel> & output) {
// empty audio
if (n_samples == 0) {
return false;
}
filter_params params;
params.n_mel = hparams.n_mel_bins;
params.n_fft_bins = 1 + (hparams.audio_n_fft / 2);
params.hann_window_size = hparams.audio_window_len;
params.hop_length = hparams.audio_hop_len;
params.sample_rate = hparams.audio_sample_rate;
params.center_padding = true;
params.preemph = 0.97f;
params.use_natural_log = true;
params.norm_per_feature = true;
// make sure the cache is initialized
GGML_ASSERT(!cache.sin_vals.empty());
GGML_ASSERT(!cache.cos_vals.empty());
GGML_ASSERT(!cache.filters.data.empty());
mtmd_audio_mel out_full;
bool ok = log_mel_spectrogram(samples, n_samples,
4, // n_threads
params, cache, out_full);
if (!ok) {
return false;
}
output.push_back(std::move(out_full));
return true;
}
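Compared with the whisper preprocessor above, the conformer path enables center padding, a 0.97 preemphasis filter (y[n] = x[n] - 0.97 * x[n-1] inside log_mel_spectrogram), natural-log mel energies and per-feature normalization, and it emits the full spectrogram as a single chunk instead of splitting it into 3000-frame windows.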
//
// mtmd_audio_streaming_istft implementation
//
mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length) :
n_fft(n_fft),
hop_length(hop_length),
n_fft_bins(n_fft / 2 + 1),
overlap_buffer(n_fft, 0.0f),
window_sum_buffer(n_fft, 0.0f),
padding_to_remove((n_fft - hop_length) / 2),
ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT
ifft_out(n_fft * 2 * 4, 0.0f) {
cache.fill_sin_cos_table(n_fft);
cache.fill_hann_window(n_fft, true);
}
void mtmd_audio_streaming_istft::reset() {
std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f);
std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f);
padding_to_remove = (n_fft - hop_length) / 2;
}
std::vector<float> mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) {
std::vector<float> output(hop_length);
// copy frequencies
for (int j = 0; j < n_fft_bins; j++) {
ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0];
ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1];
}
// mirror negative frequencies
for (int j = 1; j < n_fft_bins - 1; j++) {
int mirror_idx = n_fft - j;
ifft_in[mirror_idx * 2 + 0] = ifft_in[j * 2 + 0];
ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1]; // conjugate
}
ifft(cache, ifft_in.data(), n_fft, ifft_out.data());
// update window sum and overlap buffer
for (int j = 0; j < n_fft; j++) {
window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j];
overlap_buffer[j] += ifft_out[j * 2] * cache.hann_window[j];
}
// extract hop_length samples with normalization
for (int i = 0; i < hop_length; i++) {
if (window_sum_buffer[i] > 1e-8f) {
output[i] = overlap_buffer[i] / window_sum_buffer[i];
} else {
output[i] = overlap_buffer[i];
}
}
// shift buffers left by hop_length
std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin());
std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f);
std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin());
std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f);
// Remove padding if needed
int to_remove = std::min(padding_to_remove, (int) output.size());
padding_to_remove -= to_remove;
output.erase(output.begin(), output.begin() + to_remove);
return output;
}
std::vector<float> mtmd_audio_streaming_istft::flush() {
std::vector<float> output;
// Extract remaining samples from overlap buffer
// Continue until we've extracted all meaningful samples
int remaining = n_fft - hop_length;
while (remaining > 0) {
int chunk_size = std::min(remaining, hop_length);
for (int i = 0; i < chunk_size; i++) {
float sample;
if (window_sum_buffer[i] > 1e-8f) {
sample = overlap_buffer[i] / window_sum_buffer[i];
} else {
sample = overlap_buffer[i];
}
output.push_back(sample);
}
// Shift buffers
std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin());
std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f);
std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin());
std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f);
remaining -= chunk_size;
}
return output;
}
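A minimal usage sketch of the streaming ISTFT declared above; the constructor values and the frame source (get_spectrogram_frames) are illustrative placeholders, not part of the patch:
// each frame holds (n_fft/2 + 1) interleaved (re, im) pairs, as expected by process_frame()
mtmd_audio_streaming_istft istft(/*n_fft=*/1024, /*hop_length=*/256);
std::vector<std::vector<float>> frames = get_spectrogram_frames(); // hypothetical source
std::vector<float> pcm;
for (const auto & frame : frames) {
    std::vector<float> chunk = istft.process_frame(frame.data());
    pcm.insert(pcm.end(), chunk.begin(), chunk.end());
}
std::vector<float> tail = istft.flush();
pcm.insert(pcm.end(), tail.begin(), tail.end());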

View File

@@ -17,38 +17,6 @@ struct mtmd_audio_mel {
std::vector<float> data;
};
struct mtmd_audio_mel_filters {
int32_t n_mel;
int32_t n_fft;
std::vector<float> data;
};
// cache for audio processing, each processor instance owns its own cache
struct mtmd_audio_cache {
std::vector<float> sin_vals;
std::vector<float> cos_vals;
std::vector<float> hann_window;
mtmd_audio_mel_filters filters;
void fill_sin_cos_table(int n);
void fill_hann_window(int length, bool periodic);
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
void fill_mel_filterbank_matrix(int n_mel,
int n_fft,
int sample_rate, // e.g. 16000
float fmin = 0.0f, // e.g. 0.0
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
bool slaney_area_norm = true,
float scale = 1.0f // optional extra scaling
);
};
struct mtmd_audio_preprocessor {
const clip_hparams & hparams;
@@ -63,51 +31,4 @@ struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
private:
mtmd_audio_cache cache;
};
struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
private:
mtmd_audio_cache cache;
};
//
// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
//
struct mtmd_audio_streaming_istft {
mtmd_audio_streaming_istft(int n_fft, int hop_length);
// reset streaming state
void reset();
// process a single STFT frame (streaming)
// frame_spectrum: [n_fft_bins x 2] interleaved real/imag
// returns: up to hop_length samples
std::vector<float> process_frame(const float * frame_spectrum);
// flush remaining samples at end of stream
std::vector<float> flush();
private:
int n_fft;
int hop_length;
int n_fft_bins;
// Own cache for output processing
mtmd_audio_cache cache;
// Streaming state
std::vector<float> overlap_buffer;
std::vector<float> window_sum_buffer;
int padding_to_remove;
// Working buffers for IFFT
std::vector<float> ifft_in;
std::vector<float> ifft_out;
};

View File

@@ -276,7 +276,7 @@ struct mtmd_context {
}
// set boi/eoi
if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
if (proj == PROJECTOR_TYPE_GEMMA3) {
// <start_of_image> ... (image embeddings) ... <end_of_image>
img_beg = "<start_of_image>";
img_end = "<end_of_image>";
@@ -293,7 +293,7 @@ struct mtmd_context {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
img_end = "[IMG_END]";
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
img_beg = "<|vision_start|>";
img_end = "<|vision_end|>";
@@ -339,13 +339,8 @@ struct mtmd_context {
case PROJECTOR_TYPE_QWEN25O:
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
break;
case PROJECTOR_TYPE_LFM2A:
audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
break;
default:
GGML_ABORT("unsupported audio projector type");
}
@@ -363,9 +358,6 @@ struct mtmd_context {
// [BEGIN_AUDIO] ... (embeddings) ...
aud_beg = "[BEGIN_AUDIO]";
} else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
// <sound> ... (embeddings) ...
aud_beg = "<sound>";
}
}
@@ -872,15 +864,10 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
}
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
switch (ctx->proj_type_v()) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_YOUTUVL:
return true;
default:
return false;
if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
return true;
}
return false;
}
bool mtmd_decode_use_mrope(mtmd_context * ctx) {

View File

@@ -27,9 +27,6 @@
* - Make sure the C API is aligned with the libllama C API (as in llama.h)
* - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
* - Keep the API minimal, do not expose internal details unless necessary
*
* IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
* We encourage human contributors to ensure the quality and reliability of the codebase.
*/
#ifdef LLAMA_SHARED

View File

@@ -23,7 +23,7 @@ problem.
8 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 1b59924b8..a8a61b1e2 100644
index 8547ecc84..9f37ca70c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -112,7 +112,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -64,7 +64,7 @@ index 1b59924b8..a8a61b1e2 100644
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index d7a93848d..51bf8bc55 100644
index da624c587..efc63e092 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -831,6 +831,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
@@ -84,7 +84,7 @@ index d7a93848d..51bf8bc55 100644
/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index f021de1d7..9f3cb06ad 100644
index ab0f6fe9c..6519af435 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -583,6 +583,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -112,7 +112,7 @@ index f021de1d7..9f3cb06ad 100644
static void * ggml_cuda_host_malloc(size_t size) {
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 56b59f0af..790cabca0 100644
index 70bf6f3d9..f2b7fe692 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
@@ -132,10 +132,10 @@ index 56b59f0af..790cabca0 100644
static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 608ba1e55..d010ca00e 100644
index 0d37587f6..ff373d413 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -3499,6 +3499,7 @@ struct ggml_backend_opencl_buffer_context {
@@ -3417,6 +3417,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
@@ -144,10 +144,10 @@ index 608ba1e55..d010ca00e 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index d7c8ad8c1..281fa1bdb 100644
index 18a45d2d9..89041805e 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -557,6 +557,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -556,6 +556,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
RPC_STATUS_ASSERT(status);
delete ctx;
@@ -156,7 +156,7 @@ index d7c8ad8c1..281fa1bdb 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 8f8176b67..5f8f3c210 100644
index e996d98be..84b679315 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -356,6 +356,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -184,10 +184,10 @@ index 8f8176b67..5f8f3c210 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index b1a51a436..885864111 100644
index 34ec09d40..120191ca0 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -12751,6 +12751,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -12365,6 +12365,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
@@ -195,7 +195,7 @@ index b1a51a436..885864111 100644
}
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -12894,6 +12895,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -12508,6 +12509,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);

View File

@@ -10,10 +10,10 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a20c6525e..09874b67a 100644
index 7b01a2edf..63250cdf1 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1832,16 +1832,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
@@ -31,8 +31,8 @@ index a20c6525e..09874b67a 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -2032,7 +2023,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
@@ -2015,7 +2006,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));

View File

@@ -10,7 +10,7 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 97c83de5f..26710548e 100644
index 35e3aef0a..84a3796b5 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -24,6 +24,19 @@
@@ -33,7 +33,7 @@ index 97c83de5f..26710548e 100644
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS
@@ -1842,7 +1855,29 @@ struct clip_model_loader {
@@ -1619,7 +1632,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
@@ -63,7 +63,7 @@ index 97c83de5f..26710548e 100644
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
@@ -1869,7 +1904,11 @@ struct clip_model_loader {
@@ -1646,7 +1681,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}

View File

@@ -19,10 +19,10 @@ adds support for the Solar Pro architecture
create mode 100644 src/models/solar.cpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b0932794d..12e14f55c 100644
index 4192af7c0..bd44d73e7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -129,6 +129,7 @@ add_library(llama
@@ -125,6 +125,7 @@ add_library(llama
models/seed-oss.cpp
models/smallthinker.cpp
models/smollm3.cpp
@@ -31,10 +31,10 @@ index b0932794d..12e14f55c 100644
models/starcoder.cpp
models/starcoder2.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 2ead96546..36855e408 100644
index 8caf80afc..2ce8ffec0 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -89,6 +89,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -87,6 +87,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -42,7 +42,7 @@ index 2ead96546..36855e408 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -215,6 +216,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -208,6 +209,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
@@ -50,7 +50,7 @@ index 2ead96546..36855e408 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -347,6 +349,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
@@ -339,6 +341,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
@@ -58,9 +58,9 @@ index 2ead96546..36855e408 100644
{ LLM_TENSOR_POS_EMBD, "position_embd" },
{ LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
@@ -2254,6 +2257,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
@@ -2176,6 +2179,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
return {
LLM_TENSOR_TOKEN_EMBD,
};
+ case LLM_ARCH_SOLAR:
+ return {
@@ -81,7 +81,7 @@ index 2ead96546..36855e408 100644
default:
GGML_ABORT("unknown architecture for tensor mapping");
}
@@ -2422,6 +2441,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -2344,6 +2363,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -90,10 +90,10 @@ index 2ead96546..36855e408 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 68ec6a18b..7fab2fa93 100644
index 6cbf9b1f8..14d461c76 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -93,6 +93,7 @@ enum llm_arch {
@@ -91,6 +91,7 @@ enum llm_arch {
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_GRANITE_HYBRID,
LLM_ARCH_CHAMELEON,
@@ -101,7 +101,7 @@ index 68ec6a18b..7fab2fa93 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
@@ -219,6 +220,7 @@ enum llm_kv {
@@ -212,6 +213,7 @@ enum llm_kv {
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
@@ -109,7 +109,7 @@ index 68ec6a18b..7fab2fa93 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -473,6 +475,7 @@ enum llm_tensor {
@@ -465,6 +467,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -118,10 +118,10 @@ index 68ec6a18b..7fab2fa93 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index c847ef91b..700e4bcf5 100644
index fe1fa4341..aabff2f06 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -167,6 +167,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
@@ -163,6 +163,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
}
@@ -137,7 +137,7 @@ index c847ef91b..700e4bcf5 100644
if (il < n_layer) {
return swa_layers[il];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 7ae3ec292..cc2ce7f15 100644
index f6e95b5d2..c6e673276 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -65,6 +65,8 @@ struct llama_hparams {
@@ -149,7 +149,7 @@ index 7ae3ec292..cc2ce7f15 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -266,6 +268,9 @@ struct llama_hparams {
@@ -259,6 +261,9 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const;
@@ -160,10 +160,10 @@ index 7ae3ec292..cc2ce7f15 100644
bool has_kv(uint32_t il) const;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index e66febaa0..cf200c61f 100644
index ca2ea2461..8916a6242 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -489,7 +489,7 @@ namespace GGUFMeta {
@@ -466,7 +466,7 @@ namespace GGUFMeta {
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
@@ -173,10 +173,10 @@ index e66febaa0..cf200c61f 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 5de6493b9..d3c60d418 100644
index ae8207ee1..00cd579e0 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2074,6 +2074,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1995,6 +1995,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -198,7 +198,7 @@ index 5de6493b9..d3c60d418 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5626,6 +5641,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -5429,6 +5444,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -233,7 +233,7 @@ index 5de6493b9..d3c60d418 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -7838,6 +7881,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
@@ -7534,6 +7577,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_chameleon>(*this, params);
} break;
@@ -244,7 +244,7 @@ index 5de6493b9..d3c60d418 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -8116,6 +8163,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -7798,6 +7845,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON:
@@ -253,10 +253,10 @@ index 5de6493b9..d3c60d418 100644
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h
index 79200a0d9..740cb7094 100644
index c6eb95318..b378b23ec 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -79,6 +79,7 @@ enum llm_type {
@@ -76,6 +76,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
@@ -264,7 +264,7 @@ index 79200a0d9..740cb7094 100644
LLM_TYPE_26B,
LLM_TYPE_27B,
LLM_TYPE_30B,
@@ -410,6 +411,8 @@ struct llama_layer {
@@ -405,6 +406,8 @@ struct llama_layer {
struct ggml_tensor * ffn_act_beta = nullptr;
struct ggml_tensor * ffn_act_eps = nullptr;
@@ -274,10 +274,10 @@ index 79200a0d9..740cb7094 100644
struct llama_layer_convnext convnext;
diff --git a/src/models/models.h b/src/models/models.h
index 72b2b760c..4e2162c77 100644
index ffb36acc6..6d84a185d 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -533,6 +533,11 @@ struct llm_build_smollm3 : public llm_graph_context {
@@ -515,6 +515,11 @@ struct llm_build_smollm3 : public llm_graph_context {
llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
};

View File

@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 09874b67a..0049d59bf 100644
index 63250cdf1..dd86a1745 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -25,7 +25,7 @@ index 09874b67a..0049d59bf 100644
"\\s+$",
"[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index b47dcbe61..6d1084f26 100644
index bb44edfad..13ced055f 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 6192a8704..993ec027f 100644
index 4c04c3300..f4747f262 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -345,6 +345,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -26,4 +26,4 @@ index 6192a8704..993ec027f 100644
+ add_custom_target(ggml-cpu)
if (GGML_SYSTEM_ARCH STREQUAL "x86")
ggml_add_cpu_backend_variant(x64)
ggml_add_cpu_backend_variant(sse42 SSE42)
ggml_add_cpu_backend_variant(sse42 SSE42)

View File

@@ -5,22 +5,21 @@ Subject: [PATCH] remove amx
disable amx as it reduces performance on some systems
---
ggml/src/CMakeLists.txt | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
ggml/src/CMakeLists.txt | 4 ----
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 993ec027f..cbda1380c 100644
index f4747f262..d55aed348 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -379,10 +379,7 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(zen4 SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16)
endif()
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C FMA AVX2 BMI2 AVX_VNNI)
@@ -365,10 +365,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
- if (NOT MSVC)
- # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif()
+ # AMX variants removed by ollama - sapphirerapids with AMX_TILE AMX_INT8 not included
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
# Many of these features are optional so we build versions with popular

View File

@@ -53,10 +53,10 @@ index b165d8bdc..f91d4faba 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 0049d59bf..fefa6b478 100644
index dd86a1745..d63ce9c84 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1788,9 +1788,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index f7ba1fe31..f700f74db 100644
index a59b51893..53891a91f 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@

View File

@@ -10,10 +10,10 @@ Subject: [PATCH] add ollama vocab for grammar support
3 files changed, 58 insertions(+), 10 deletions(-)
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index 64ea2fd00..d87e52ded 100644
index 75d5d750c..a0299d181 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -1079,6 +1079,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
@@ -1041,6 +1041,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
@@ -21,7 +21,7 @@ index 64ea2fd00..d87e52ded 100644
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index) {
@@ -1134,6 +1135,7 @@ struct llama_grammar * llama_grammar_init_impl(
@@ -1096,6 +1097,7 @@ struct llama_grammar * llama_grammar_init_impl(
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar {
vocab,
@@ -29,7 +29,7 @@ index 64ea2fd00..d87e52ded 100644
std::move(vec_rules),
std::move(stacks),
/* .partial_utf8 = */ {},
@@ -1148,6 +1150,7 @@ struct llama_grammar * llama_grammar_init_impl(
@@ -1110,6 +1112,7 @@ struct llama_grammar * llama_grammar_init_impl(
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
@@ -37,7 +37,7 @@ index 64ea2fd00..d87e52ded 100644
const char * grammar_str,
const char * grammar_root,
bool lazy,
@@ -1240,6 +1243,7 @@ struct llama_grammar * llama_grammar_init_impl(
@@ -1202,6 +1205,7 @@ struct llama_grammar * llama_grammar_init_impl(
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar {
vocab,
@@ -45,7 +45,7 @@ index 64ea2fd00..d87e52ded 100644
std::move(vec_rules),
std::move(stacks),
/* .partial_utf8 = */ {},
@@ -1263,6 +1267,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
@@ -1225,6 +1229,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
auto * result = new llama_grammar {
grammar.vocab,
@@ -53,7 +53,7 @@ index 64ea2fd00..d87e52ded 100644
grammar.rules,
grammar.stacks,
grammar.partial_utf8,
@@ -1291,7 +1296,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
@@ -1253,7 +1258,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
}
void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
@@ -61,7 +61,7 @@ index 64ea2fd00..d87e52ded 100644
if (grammar.awaiting_trigger) {
return;
@@ -1313,9 +1317,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
@@ -1275,9 +1279,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
for (size_t i = 0; i < cur_p->size; ++i) {
const llama_token id = cur_p->data[i].id;
@@ -77,7 +77,7 @@ index 64ea2fd00..d87e52ded 100644
if (!allow_eog) {
cur_p->data[i].logit = -INFINITY;
}
@@ -1334,9 +1342,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
@@ -1296,9 +1304,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
}
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
@@ -90,7 +90,7 @@ index 64ea2fd00..d87e52ded 100644
if (grammar.awaiting_trigger) {
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
@@ -1380,13 +1389,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
@@ -1353,13 +1362,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
}
}
@@ -107,7 +107,7 @@ index 64ea2fd00..d87e52ded 100644
}
llama_grammar_accept_token(grammar, token, piece);
@@ -1462,3 +1472,27 @@ void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token toke
@@ -1435,3 +1445,27 @@ void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token toke
}
}
@@ -136,7 +136,7 @@ index 64ea2fd00..d87e52ded 100644
+ }
+}
diff --git a/src/llama-grammar.h b/src/llama-grammar.h
index b5a0e588e..57847583a 100644
index a4c978ac1..5c0da4049 100644
--- a/src/llama-grammar.h
+++ b/src/llama-grammar.h
@@ -6,8 +6,19 @@
@@ -159,7 +159,7 @@ index b5a0e588e..57847583a 100644
// grammar element type
enum llama_gretype {
@@ -129,6 +140,7 @@ struct llama_grammar {
@@ -127,6 +138,7 @@ struct llama_grammar {
// note: allow null vocab for testing (not great)
const llama_vocab * vocab;
@@ -167,7 +167,7 @@ index b5a0e588e..57847583a 100644
const llama_grammar_rules rules; // TODO: shared ptr
llama_grammar_stacks stacks;
@@ -157,12 +169,14 @@ struct llama_grammar {
@@ -155,12 +167,14 @@ struct llama_grammar {
// note: needed for tests (not great)
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
@@ -183,10 +183,10 @@ index b5a0e588e..57847583a 100644
const char * grammar_root,
bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 11f0394c4..bbc5dc591 100644
index 3f4a729bc..38a30ea05 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -2511,7 +2511,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
@@ -1561,7 +1561,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
}
@@ -195,7 +195,7 @@ index 11f0394c4..bbc5dc591 100644
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
@@ -2593,9 +2593,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
@@ -1639,9 +1639,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
trigger_pattern += ")[\\s\\S]*";
std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() };

View File

@@ -5,11 +5,11 @@ Subject: [PATCH] add argsort and cuda copy for i32
---
ggml/src/ggml-cpu/ops.cpp | 43 ++++++
ggml/src/ggml-cuda/argsort.cu | 120 +++++++++++++--
ggml/src/ggml-cuda/argsort.cu | 122 +++++++++++++--
ggml/src/ggml-cuda/cpy-utils.cuh | 6 +
ggml/src/ggml-cuda/cpy.cu | 40 +++++
ggml/src/ggml-metal/ggml-metal.metal | 215 +++++++++++++++++++++++++++
5 files changed, 413 insertions(+), 11 deletions(-)
5 files changed, 414 insertions(+), 12 deletions(-)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 303278397..7d1733adb 100644
@@ -73,10 +73,10 @@ index 303278397..7d1733adb 100644
{
GGML_ABORT("fatal error");
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index 57c8a99a2..d982d279b 100644
index da9652c3b..b82be371c 100644
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -189,13 +189,107 @@ void argsort_f32_i32_cuda_bitonic(const float * x,
@@ -168,13 +168,107 @@ static void argsort_f32_i32_cuda_bitonic(const float * x,
}
}
@@ -185,27 +185,28 @@ index 57c8a99a2..d982d279b 100644
GGML_ASSERT( dst->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_contiguous(src0));
@@ -204,18 +298,22 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -183,18 +277,22 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+ if (src0->type == GGML_TYPE_I32) {
+ argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
+ } else {
#ifdef GGML_CUDA_USE_CUB
-#ifdef GGML_CUDA_USE_CUB
- const int ncols_pad = next_power_of_2(ncols);
- const size_t shared_mem = ncols_pad * sizeof(int);
- const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+ const int ncols_pad = next_power_of_2(ncols);
+ const size_t shared_mem = ncols_pad * sizeof(int);
+ const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
-
- if (shared_mem > max_shared_mem || ncols > 1024) {
- ggml_cuda_pool & pool = ctx.pool();
- argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
- } else {
+ if (src0->type == GGML_TYPE_I32) {
+ argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
} else {
- argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
- }
+#ifdef GGML_CUDA_USE_CUB
+ const int ncols_pad = next_power_of_2(ncols);
+ const size_t shared_mem = ncols_pad * sizeof(int);
+ const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+
+ if (shared_mem > max_shared_mem || ncols > 1024) {
+ ggml_cuda_pool & pool = ctx.pool();
+ argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
@@ -233,10 +234,10 @@ index 7697c292d..00d773dd3 100644
+ *dst = *src;
+}
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index ee84303ef..178e82d76 100644
index c4ceb4fc5..0e53ecc39 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -369,6 +369,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
@@ -352,6 +352,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
@@ -280,7 +281,7 @@ index ee84303ef..178e82d76 100644
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));
@@ -495,6 +532,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
@@ -481,6 +518,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_scalar_cuda<half, float>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
@@ -291,10 +292,10 @@ index ee84303ef..178e82d76 100644
if (can_be_transposed) {
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true>
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 16d17d26a..ff1afa291 100644
index 51bcbae30..236838e9e 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4955,8 +4955,77 @@ kernel void kernel_argsort_f32_i32(
@@ -4954,8 +4954,77 @@ kernel void kernel_argsort_f32_i32(
}
}
@@ -372,7 +373,7 @@ index 16d17d26a..ff1afa291 100644
typedef void (argsort_merge_t)(
constant ggml_metal_kargs_argsort_merge & args,
@@ -5111,8 +5180,154 @@ kernel void kernel_argsort_merge_f32_i32(
@@ -5110,8 +5179,154 @@ kernel void kernel_argsort_merge_f32_i32(
}
}
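
Note: a minimal sketch of what the patch above enables at the graph level — GGML_OP_ARGSORT and the CUDA copy path now accept GGML_TYPE_I32 tensors, not just F32. The context ctx and length n are assumed to exist in the caller.

    // requires "ggml.h"; ctx and n are assumed
    struct ggml_tensor * vals = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n);
    // argsort over an I32 input (previously the op only accepted F32); dst holds I32 indices
    struct ggml_tensor * perm = ggml_argsort(ctx, vals, GGML_SORT_ORDER_ASC);
    // I32 -> I32 copy, now handled by the CUDA backend as well
    struct ggml_tensor * copy = ggml_cpy(ctx, vals, ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n));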

View File

@@ -23,7 +23,7 @@ index 78aa059dd..7fa8403b3 100644
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index a9d177864..393c329be 100644
index 4ed5f3577..a7ebe5dcd 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -319,6 +319,7 @@ extern "C" {
@@ -121,7 +121,7 @@ index 41419b617..73b39bfea 100644
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index a8a61b1e2..259e10257 100644
index 9f37ca70c..1459d16dd 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1859,6 +1859,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe

View File

@@ -1,32 +1,31 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: nobody <>
Date: Sat, 10 Jan 2026 15:27:57 -0800
From: Daniel Hiltgen <daniel@ollama.com>
Date: Sun, 30 Nov 2025 11:05:56 -0800
Subject: [PATCH] ggml: Export GPU UUIDs
---
ggml/include/ggml-backend.h | 2 ++
ggml/src/ggml-cuda/ggml-cuda.cu | 53 ++++++++++++++++++++++++++++++
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++---
ggml/src/ggml-metal/ggml-metal.cpp | 1 +
3 files changed, 56 insertions(+)
3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 393c329be..99412fe56 100644
index a7ebe5dcd..03557bb31 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,8 @@ extern "C" {
@@ -158,6 +158,7 @@ extern "C" {
const char * description;
// device free memory in bytes
size_t memory_free;
+ // device UUID
+ const char * id;
// device total memory in bytes
size_t memory_total;
// device type
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 9f3cb06ad..a4dce694b 100644
index 6519af435..c9d3a2b03 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -191,6 +191,51 @@ static int ggml_cuda_parse_id(char devName[]) {
@@ -189,6 +189,51 @@ static int ggml_cuda_parse_id(char devName[]) {
}
#endif // defined(GGML_USE_HIP)
@@ -78,15 +77,46 @@ index 9f3cb06ad..a4dce694b 100644
static ggml_cuda_device_info ggml_cuda_init() {
ggml_cuda_device_info info = {};
@@ -4125,6 +4170,7 @@ struct ggml_backend_cuda_device_context {
@@ -255,22 +300,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10;
}
}
- GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
+ GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, ID: %s\n",
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
- device_vmm ? "yes" : "no", prop.warpSize);
+ device_vmm ? "yes" : "no", prop.warpSize, ggml_cuda_parse_uuid(prop, id).c_str());
#elif defined(GGML_USE_MUSA)
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
info.devices[id].warp_size = 32;
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
info.devices[id].cc += prop.minor * 0x10;
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+ ggml_cuda_parse_uuid(prop, id).c_str());
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+ ggml_cuda_parse_uuid(prop, id).c_str());
std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });
@@ -4110,6 +4157,7 @@ struct ggml_backend_cuda_device_context {
std::string name;
std::string description;
std::string pci_bus_id;
+ std::string id;
int op_offload_min_batch_size;
};
@@ -4214,6 +4260,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4198,6 +4246,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
}
#endif // defined(__linux__)
@@ -98,7 +128,7 @@ index 9f3cb06ad..a4dce694b 100644
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -4254,6 +4305,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
@@ -4238,6 +4291,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -106,7 +136,7 @@ index 9f3cb06ad..a4dce694b 100644
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -4860,6 +4912,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4834,6 +4888,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
@@ -115,7 +145,7 @@ index 9f3cb06ad..a4dce694b 100644
char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 790cabca0..516d74064 100644
index f2b7fe692..8fc1c2fb5 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -547,6 +547,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
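
Note: a minimal sketch of reading the UUID exported by the patch above; only the id member of ggml_backend_dev_props is new here, the enumeration calls are existing ggml API, and id may be NULL on backends that do not populate it.

    #include <cstdio>
    #include "ggml-backend.h"

    static void print_device_ids(void) {
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            struct ggml_backend_dev_props props;
            ggml_backend_dev_get_props(dev, &props);
            // props.id carries the UUID string added by this patch (NULL if unset)
            printf("device %zu: %s id=%s\n", i, props.name, props.id ? props.id : "(none)");
        }
    }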

View File

@@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2 files changed, 13 insertions(+)
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index b68de7429..743f892a1 100644
index 2638fe4fc..c4e905a4e 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -87,6 +87,16 @@ enum mtmd_slice_tmpl {
@@ -31,10 +31,10 @@ index b68de7429..743f892a1 100644
return "<__media__>";
}
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 44d05ceae..5f2e579e1 100644
index 9f7e861e9..72cec1937 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -83,6 +83,9 @@ typedef struct mtmd_input_chunk mtmd_input_chunk;
@@ -80,6 +80,9 @@ typedef struct mtmd_input_chunk mtmd_input_chunk;
typedef struct mtmd_input_chunks mtmd_input_chunks;
typedef struct mtmd_input_text mtmd_input_text;

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] no power throttling win32 with gnuc
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index f700f74db..5581dd0ae 100644
index 53891a91f..8d4851312 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2479,7 +2479,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {

View File

@@ -1,362 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 10 Jan 2026 15:36:34 -0800
Subject: [PATCH] ggml: Add batch size hint to graph_compute
This adds a batch_size parameter to backend graph_compute functions
to provide optimization hints for processing.
---
ggml/include/ggml-backend.h | 5 ++++-
ggml/src/ggml-backend-impl.h | 4 ++--
ggml/src/ggml-backend.cpp | 19 +++++++++++++------
ggml/src/ggml-blas/ggml-blas.cpp | 3 ++-
ggml/src/ggml-cann/ggml-cann.cpp | 4 +++-
ggml/src/ggml-cpu/ggml-cpu.cpp | 4 +++-
ggml/src/ggml-cuda/ggml-cuda.cu | 4 +++-
ggml/src/ggml-hexagon/ggml-hexagon.cpp | 4 +++-
ggml/src/ggml-metal/ggml-metal.cpp | 4 +++-
ggml/src/ggml-opencl/ggml-opencl.cpp | 4 +++-
ggml/src/ggml-rpc/ggml-rpc.cpp | 4 +++-
ggml/src/ggml-sycl/ggml-sycl.cpp | 4 +++-
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 3 ++-
ggml/src/ggml-webgpu/ggml-webgpu.cpp | 3 ++-
ggml/src/ggml-zdnn/ggml-zdnn.cpp | 3 ++-
ggml/src/ggml-zendnn/ggml-zendnn.cpp | 4 +++-
16 files changed, 54 insertions(+), 22 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 99412fe56..97f630faa 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -98,7 +98,7 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
- GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size);
// NOTE: will be removed, use device version instead
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
@@ -308,6 +308,9 @@ extern "C" {
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+ // Provide a hint on the batch size to optimize processing (uses heuristics if unset)
+ GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size);
+
// Initialize backend buffers from a measure graph
GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 59190b7c4..a833756f9 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -106,8 +106,8 @@ extern "C" {
// compute the graph with the plan
enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- // compute graph (always async if supported by the backend)
- enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ // compute graph (always async if supported by the backend). batch_size may be -1 if unknown
+ enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size);
// (optional) event synchronization
// record an event on this stream
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 259e10257..096f52790 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -353,14 +353,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba
}
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
- enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+ enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, -1);
ggml_backend_synchronize(backend);
return err;
}
-enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
GGML_ASSERT(backend);
- return backend->iface.graph_compute(backend, cgraph);
+ return backend->iface.graph_compute(backend, cgraph, batch_size);
}
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -727,6 +727,8 @@ struct ggml_backend_sched {
bool op_offload;
+ int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics
+
int debug;
// used for debugging graph reallocations [GGML_SCHED_DEBUG_REALLOC]
@@ -825,7 +827,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op
- if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
+ if (sched->op_offload && (sched->batch_size < 0 || sched->batch_size >= 32) && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off");
@@ -1577,7 +1579,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
if (!sched->callback_eval) {
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, sched->batch_size);
if (ec != GGML_STATUS_SUCCESS) {
return ec;
}
@@ -1599,7 +1601,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, sched->batch_size);
if (ec != GGML_STATUS_SUCCESS) {
return ec;
}
@@ -1689,12 +1691,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
+ sched->batch_size = -1;
ggml_backend_sched_reset(sched);
return sched;
}
+void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size) {
+ sched->batch_size = batch_size;
+}
+
void ggml_backend_sched_free(ggml_backend_sched_t sched) {
if (sched == NULL) {
return;
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index 84956cbb9..39920ec19 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -220,7 +220,7 @@ static void ggml_backend_blas_free(ggml_backend_t backend) {
delete backend;
}
-static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -250,6 +250,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS;
GGML_UNUSED(backend);
+ GGML_UNUSED(batch_size);
}
static struct ggml_backend_i blas_backend_i = {
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 51bf8bc55..60e955640 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -2191,8 +2191,10 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
* @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
* completes successfully, otherwise an appropriate error status.
*/
-static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+
+ GGML_UNUSED(batch_size);
ggml_cann_set_device(cann_ctx->device);
g_nz_workspaces[cann_ctx->device].clear();
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index f4713a421..92ba577a5 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe
GGML_UNUSED(backend);
}
-static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
@@ -184,6 +184,8 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
return ggml_graph_compute(cgraph, &cplan);
+
+ GGML_UNUSED(batch_size);
}
static const struct ggml_backend_i ggml_backend_cpu_i = {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index a4dce694b..49f9c2b56 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3789,9 +3789,11 @@ static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx) {
#endif // USE_CUDA_GRAPH
}
-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
+ GGML_UNUSED(batch_size);
+
ggml_cuda_set_device(cuda_ctx->device);
bool use_cuda_graph = false;
diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 365a24b49..fc9b6a7f5 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2468,9 +2468,11 @@ static inline int last_compute_op(ggml_cgraph * graph) {
return last;
}
-static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
+static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph, int batch_size) {
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
+ GGML_UNUSED(batch_size);
+
HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes);
const int last = last_compute_op(graph);
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 516d74064..5f8d6c175 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -419,9 +419,11 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml
GGML_UNUSED(dst);
}
-static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_metal_t ctx = (ggml_metal_t)backend->context;
+ GGML_UNUSED(batch_size);
+
return ggml_metal_graph_compute(ctx, cgraph);
}
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index d010ca00e..1f0bc8e85 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -2998,9 +2998,11 @@ static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor *
static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor);
static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor);
-static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+ GGML_UNUSED(batch_size);
+
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 281fa1bdb..b5f7adf89 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -866,9 +866,11 @@ static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::ve
memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
}
-static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
+ GGML_UNUSED(batch_size);
+
GGML_ASSERT(cgraph->n_nodes > 0);
bool reuse = rpc_ctx->gc.is_cached(cgraph);
if (reuse) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 5f8f3c210..d0e3d5412 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4171,9 +4171,11 @@ static bool check_graph_compatibility(ggml_cgraph * cgraph) {
}
#endif
-static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);
+ GGML_UNUSED(batch_size);
+
#ifdef GGML_SYCL_GRAPH
bool use_sycl_graph = !g_ggml_sycl_disable_graph && check_graph_compatibility(cgraph);
if (use_sycl_graph) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 885864111..58de820f2 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -13540,8 +13540,9 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
return num_adds;
}
-static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
+ GGML_UNUSED(batch_size);
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
if (vk_instance.debug_utils_support) {
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 5b8f7f72d..7b928fb5f 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -1616,8 +1616,9 @@ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx,
}
}
-static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
WEBGPU_LOG_DEBUG("ggml_backend_webgpu_graph_compute(" << cgraph->n_nodes << " nodes)");
+ GGML_UNUSED(batch_size);
ggml_backend_webgpu_context * backend_ctx = static_cast<ggml_backend_webgpu_context *>(backend->context);
webgpu_context ctx = backend_ctx->webgpu_ctx;
diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
index edbeb8eef..a90066365 100644
--- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp
+++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
@@ -407,7 +407,8 @@ static void ggml_backend_zdnn_free(ggml_backend_t backend) {
free(backend);
}
-static enum ggml_status ggml_backend_zdnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_zdnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
+ GGML_UNUSED(batch_size);
return ggml_zdnn_graph_compute(backend, cgraph);
}
diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
index fd07f983d..954a64e37 100644
--- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp
+++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
@@ -205,9 +205,11 @@ static void ggml_backend_zendnn_free(ggml_backend_t backend) {
delete backend;
}
-static ggml_status ggml_backend_zendnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_zendnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_zendnn_context * ctx = (ggml_backend_zendnn_context *)backend->context;
+ GGML_UNUSED(batch_size);
+
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
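
Note: a minimal sketch of how a caller could use the batch-size hint added above; sched, gf and n_tokens are assumed to exist in the caller, and ggml_backend_sched_graph_compute is existing ggml API.

    // hint the scheduler with the current ubatch size; per the patch, offloading ops with
    // host-resident weights to a faster backend is skipped when the hinted batch is below 32
    ggml_backend_sched_set_batch_size(sched, n_tokens);
    enum ggml_status status = ggml_backend_sched_graph_compute(sched, gf);

    // -1 (the default) falls back to the previous heuristics
    ggml_backend_sched_set_batch_size(sched, -1);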

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] fix mtmd-audio.cpp build on windows
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
index e8eef035f..a208c7789 100644
index f68829a61..2024d3d37 100644
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -1,6 +1,6 @@

View File

@@ -1,321 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 10 Jan 2026 15:49:32 -0800
Subject: [PATCH] ggml: No-alloc mode for memory sizing
Adds infrastructure for scheduler no-alloc mode that enables
fast memory sizing calculations without actual allocations.
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-backend-impl.h | 16 ++++++++
ggml/src/ggml-backend.cpp | 75 +++++++++++++++++++++++++++++++++--
ggml/src/ggml-cuda/common.cuh | 62 ++++++++++++++++++++++++++++-
4 files changed, 149 insertions(+), 5 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 97f630faa..cc4f0e5af 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -306,6 +306,7 @@ extern "C" {
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
+ GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Provide a hint on the batch size to optimize processing (uses heuristics if unset)
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index a833756f9..93eb8e511 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -26,12 +26,17 @@ extern "C" {
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
bool (*is_host) (ggml_backend_buffer_type_t buft);
+
+ // (optional) returns a dummy buffer that is equivalent to one created by alloc_buffer but without actually being backed
+ // by memory
+ ggml_backend_buffer_t (*noalloc_buffer)(ggml_backend_buffer_type_t buft, size_t size);
};
struct ggml_backend_buffer_type {
struct ggml_backend_buffer_type_i iface;
ggml_backend_dev_t device;
void * context;
+ bool no_alloc;
};
//
@@ -63,6 +68,7 @@ extern "C" {
void * context;
size_t size;
enum ggml_backend_buffer_usage usage;
+ bool no_alloc;
};
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -117,6 +123,16 @@ extern "C" {
// (optional) sort/optimize the nodes in the graph
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+ // (optional) reserves intermediate buffers needed for the compution
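+ // (note: "compution" here is a typo for "computation" carried in the patch text)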
+ // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
+ enum ggml_status (*graph_reserve) (ggml_backend_t backend, struct ggml_cgraph * cgraph, bool alloc);
+
+ // (optional) returns the memory needed after calling graph_reserve
+ size_t (*buffer_size) (ggml_backend_t backend);
+
+ // (optional) frees memory from intermediate buffers that was allocated either by graph_compute or graph_reserve
+ void (*reset) (ggml_backend_t backend);
};
struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 096f52790..c12e83cc0 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -36,11 +36,25 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
}
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- GGML_ASSERT(buft);
if (size == 0) {
// return a dummy buffer for zero-sized allocations
return ggml_backend_buffer_init(buft, {}, NULL, 0);
}
+
+ if (buft->no_alloc) {
+ ggml_backend_buffer_t buf;
+
+ if (buft->iface.noalloc_buffer != NULL) {
+ buf = buft->iface.noalloc_buffer(buft, size);
+ } else {
+ buf = ggml_backend_buffer_init(buft, {}, NULL, size);
+ }
+
+ buf->no_alloc = true;
+ return buf;
+ }
+
+ GGML_ASSERT(buft);
return buft->iface.alloc_buffer(buft, size);
}
@@ -94,7 +108,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
/* .buft = */ buft,
/* .context = */ context,
/* .size = */ size,
- /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
+ /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY,
+ /* .no_alloc = */ false
};
return buffer;
@@ -126,6 +141,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
return NULL;
}
+ // If we aren't allocating memory, return a placeholder non-NULL pointer
+ // that meets alignment requirements
+ if (buffer->no_alloc) {
+ return (void *)ggml_backend_buffer_get_alignment(buffer);
+ }
+
// FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
// I don't know whether the above comment is correct
if (!buffer->iface.get_base) {
@@ -736,6 +757,12 @@ struct ggml_backend_sched {
int debug_realloc;
int debug_graph_size;
int debug_prev_graph_size;
+
+ // allocate buffers on attached ggml_backend_buffer_type_t's and during reservation
+ // if false, dummy buffers are used for faster memory sizing calculations
+ // the scheduler needs to be recreated with allocated buffers before it can be used
+ // for computation
+ bool alloc_buffers;
};
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1635,6 +1662,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
size_t graph_size,
bool parallel,
bool op_offload) {
+ return ggml_backend_sched_new_ext(backends, bufts, n_backends, graph_size, parallel, op_offload, true);
+ }
+
+ggml_backend_sched_t ggml_backend_sched_new_ext(
+ ggml_backend_t * backends,
+ ggml_backend_buffer_type_t * bufts,
+ int n_backends,
+ size_t graph_size,
+ bool parallel,
+ bool op_offload,
+ bool alloc_buffers) {
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1687,11 +1725,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
}
}
+
+ sched->bufts[b]->no_alloc = !alloc_buffers;
}
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
sched->batch_size = -1;
+ sched->alloc_buffers = alloc_buffers;
ggml_backend_sched_reset(sched);
@@ -1710,6 +1751,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]);
}
+
+ if (sched->backends[b]->iface.reset != NULL) {
+ sched->backends[b]->iface.reset(sched->backends[b]);
+ }
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
@@ -1765,6 +1810,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return false;
}
+ if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
+ return false;
+ }
+
+ struct ggml_backend_sched_split * splits = sched->splits;
+ for (int i = 0; i < sched->n_splits; i++) {
+ struct ggml_backend_sched_split * split = &splits[i];
+ int split_backend_id = split->backend_id;
+ ggml_backend_t split_backend = sched->backends[split_backend_id];
+
+ if (split_backend->iface.graph_reserve != NULL) {
+ enum ggml_status ec = split_backend->iface.graph_reserve(split_backend, &split->graph, sched->alloc_buffers);
+ if (ec != GGML_STATUS_SUCCESS) {
+ return false;
+ }
+ }
+ }
+
ggml_backend_sched_reset(sched);
return true;
@@ -1870,7 +1933,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
- return ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+ size_t size = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+
+ if (backend->iface.buffer_size != NULL) {
+ size += backend->iface.buffer_size(backend);
+ }
+
+ return size;
}
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 9516d8ec8..e37e6d8fb 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -37,6 +37,41 @@
#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIP)
+extern bool reserving_graph;
+
+// If we are reserving the graph, pointers might be invalid and will fail if cudaMemcpyAsync tries to validate them.
+// However, since we don't actually expect a result, we don't need to actually do the memcpy.
+static cudaError_t cudaMemcpyAsyncReserve ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0 ) {
+ if (!reserving_graph) {
+ return cudaMemcpyAsync(dst, src, count, kind, stream);
+ } else {
+ return cudaSuccess;
+ }
+}
+
+static cudaError_t cudaMemcpy2DAsyncReserve ( void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, cudaStream_t stream = 0 ) {
+ if (!reserving_graph) {
+ return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
+ } else {
+ return cudaSuccess;
+ }
+}
+
+static cudaError_t cudaMemsetAsyncReserve ( void* devPtr, int value, size_t count, cudaStream_t stream = 0 ) {
+ if (!reserving_graph) {
+ return cudaMemsetAsync(devPtr, value, count, stream);
+ } else {
+ return cudaSuccess;
+ }
+}
+
+#undef cudaMemcpyAsync
+#define cudaMemcpyAsync cudaMemcpyAsyncReserve
+#undef cudaMemcpy2DAsync
+#define cudaMemcpy2DAsync cudaMemcpy2DAsyncReserve
+#undef cudaMemsetAsync
+#define cudaMemsetAsync cudaMemsetAsyncReserve
+
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@@ -977,6 +1012,9 @@ struct ggml_cuda_pool {
virtual void * alloc(size_t size, size_t * actual_size) = 0;
virtual void free(void * ptr, size_t size) = 0;
+
+ virtual bool alloc_memory() = 0;
+ virtual size_t alloc_size() = 0;
};
template<typename T>
@@ -1283,11 +1321,15 @@ struct ggml_backend_cuda_context {
// pool
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];
- static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, int stream_no);
+ static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, int stream_no, bool alloc);
ggml_cuda_pool & pool(int device) {
if (pools[device][curr_stream_no] == nullptr) {
- pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no);
+ bool alloc = true;
+ if (pools[device][0] != nullptr) {
+ alloc = pools[device][0]->alloc_memory();
+ }
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
}
return *pools[device][curr_stream_no];
}
@@ -1295,6 +1337,22 @@ struct ggml_backend_cuda_context {
ggml_cuda_pool & pool() {
return pool(device);
}
+
+ void pool_set_alloc(bool alloc) {
+ GGML_ASSERT(pools[device][curr_stream_no] == nullptr || pools[device][curr_stream_no]->alloc_memory() == alloc);
+
+ if (pools[device][curr_stream_no] == nullptr) {
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
+ }
+ }
+
+ size_t pool_get_alloc_size(int stream_no) {
+ if (pools[device][stream_no] == nullptr) {
+ return 0;
+ }
+
+ return pools[device][stream_no]->alloc_size();
+ }
};
struct ggml_cuda_mm_fusion_args_host {
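
Note: a minimal sketch of the intended flow for the no-alloc mode added above, using the new ggml_backend_sched_new_ext entry point; backends, bufts, n_backends, graph_size and measure_graph are assumed to exist in the caller, and the other scheduler calls are existing ggml API.

    // requires <cstdio> and "ggml-backend.h"
    // 1) size compute buffers with a dummy (no-alloc) scheduler: no VRAM is committed
    ggml_backend_sched_t measure = ggml_backend_sched_new_ext(
        backends, bufts, n_backends, graph_size,
        /*parallel=*/false, /*op_offload=*/true, /*alloc_buffers=*/false);
    ggml_backend_sched_reserve(measure, measure_graph);
    for (int i = 0; i < n_backends; i++) {
        // query how much each backend would need before committing any memory
        size_t needed = ggml_backend_sched_get_buffer_size(measure, backends[i]);
        fprintf(stderr, "backend %d would need %zu bytes\n", i, needed);
    }
    ggml_backend_sched_free(measure);

    // 2) a no-alloc scheduler cannot be used to compute; recreate it with real buffers
    ggml_backend_sched_t sched = ggml_backend_sched_new(
        backends, bufts, n_backends, graph_size, /*parallel=*/false, /*op_offload=*/true);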

View File

@@ -8,16 +8,16 @@ Subject: [PATCH] decode: disable output_all
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index f220010a1..ba0ff9df9 100644
index 8786d4ee3..9e6998272 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1391,8 +1391,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
@@ -1051,8 +1051,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd_inp();
- // when computing embeddings, all tokens are output
- const bool output_all = cparams.embeddings;
- const bool output_all = cparams.embeddings;
+ const bool output_all = false;
const bool has_samplers = !sampling.samplers.empty();
const uint32_t n_seq_max = cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max;
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);

View File

@@ -1,24 +1,25 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 10 Jan 2026 15:56:42 -0800
From: Jesse Gross <jesse@ollama.com>
Date: Wed, 27 Aug 2025 14:39:48 -0700
Subject: [PATCH] ggml: Enable resetting backend devices
Allows resetting CUDA devices to free primary context allocations
(~300 MB of VRAM per device) when a device is unused.
Touching a CUDA device causes the allocation of a primary context
with CUDA data structures (~300 MB of VRAM). If a device is
unused then it can be reset to free these data structures.
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-backend-impl.h | 4 ++++
ggml/src/ggml-backend.cpp | 8 ++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 11 +++++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 16 +++++++++++++++-
ggml/src/ggml-cuda/vendors/hip.h | 1 +
src/llama.cpp | 4 +++-
6 files changed, 28 insertions(+), 1 deletion(-)
6 files changed, 32 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index cc4f0e5af..006867064 100644
index dbbb61d9c..92ca32a4b 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -179,6 +179,7 @@ extern "C" {
@@ -178,6 +178,7 @@ extern "C" {
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
@@ -27,7 +28,7 @@ index cc4f0e5af..006867064 100644
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 93eb8e511..c815c2eed 100644
index 7bdf9d81f..21b35ac5c 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -195,6 +195,10 @@ extern "C" {
@@ -42,7 +43,7 @@ index 93eb8e511..c815c2eed 100644
struct ggml_backend_device {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index c12e83cc0..4ce406c39 100644
index 7746e8b92..189e97170 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -532,6 +532,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
@@ -61,10 +62,10 @@ index c12e83cc0..4ce406c39 100644
GGML_ASSERT(device);
return device->iface.get_buffer_type(device);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 49f9c2b56..e3fbd2dd4 100644
index eeaae3fe4..6852d2e20 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -115,6 +115,11 @@ int ggml_cuda_get_device() {
@@ -113,6 +113,11 @@ int ggml_cuda_get_device() {
return id;
}
@@ -76,7 +77,19 @@ index 49f9c2b56..e3fbd2dd4 100644
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
cudaError_t err;
@@ -4769,6 +4774,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
@@ -4448,7 +4453,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
- ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+ // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
+ // If you need the memory data, call ggml_backend_dev_memory() explicitly.
+ props->memory_total = props->memory_free = 0;
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
@@ -4908,6 +4916,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
}
@@ -88,7 +101,7 @@ index 49f9c2b56..e3fbd2dd4 100644
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_name = */ ggml_backend_cuda_device_get_name,
/* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -4785,6 +4795,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
@@ -4924,6 +4937,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .event_new = */ ggml_backend_cuda_device_event_new,
/* .event_free = */ ggml_backend_cuda_device_event_free,
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
@@ -97,22 +110,22 @@ index 49f9c2b56..e3fbd2dd4 100644
// backend reg
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 016b04e5a..164d367d0 100644
index 951a88d56..4e162258d 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -51,6 +51,7 @@
@@ -49,6 +49,7 @@
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceGetAttribute hipDeviceGetAttribute
#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceReset hipDeviceReset
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
diff --git a/src/llama.cpp b/src/llama.cpp
index f1096d960..da72d8e77 100644
index f69964b6d..759152b76 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -993,10 +993,12 @@ static struct llama_model * llama_model_load_from_file_impl(
@@ -921,10 +921,12 @@ static struct llama_model * llama_model_load_from_file_impl(
for (auto * dev : model->devices) {
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
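
To make the mechanism in the updated commit message concrete, here is a minimal standalone sketch (C++ against the stock CUDA runtime, not part of the patch): the first runtime call on a device lazily creates its primary context, and cudaDeviceReset() destroys it again, returning that VRAM to the driver. The reset hook the patch wires into ggml-backend is assumed to reduce to the same per-device call; its exact name is not visible in this excerpt.

// Standalone illustration only; build with: nvcc -x cu reset_demo.cpp -o reset_demo
#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        std::printf("no CUDA devices available\n");
        return 0;
    }

    // Any runtime call on device 0 (here: a memory query) creates its
    // primary context, which by itself holds a few hundred MB of VRAM.
    cudaSetDevice(0);
    size_t free_bytes = 0, total_bytes = 0;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::printf("device 0 with primary context: %zu MiB free of %zu MiB\n",
                free_bytes >> 20, total_bytes >> 20);

    // Destroying the primary context releases those data structures again,
    // which is the saving the patch exposes for devices that end up unused.
    cudaDeviceReset();
    return 0;
}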

View File

File diff suppressed because it is too large

View File

@@ -8,7 +8,7 @@ Subject: [PATCH] NVML fallback for unified memory GPUs
1 file changed, 68 insertions(+), 3 deletions(-)
diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
index b5d46cbe7..f8a4ac7b5 100644
index c9073cef0..f473a2a2c 100644
--- a/ggml/src/mem_nvml.cpp
+++ b/ggml/src/mem_nvml.cpp
@@ -13,6 +13,7 @@

View File

@@ -59,10 +59,10 @@ index 88ed79111..71ca60214 100644
} else {
if (sector < sections.v[0]) {
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index ff1afa291..21b51cc8d 100644
index 236838e9e..c98d269d1 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4243,14 +4243,14 @@ kernel void kernel_rope_multi(
@@ -4242,14 +4242,14 @@ kernel void kernel_rope_multi(
float theta_base;
if (FC_rope_is_imrope) {
@@ -82,10 +82,10 @@ index ff1afa291..21b51cc8d 100644
} else {
if (sector < args.sect_0) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl
index aacec9846..0163d8bbc 100644
index 9726b722d..1c8c69422 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl
@@ -155,14 +155,14 @@ void rope_multi(const uint i0, const uint i1, rope_params p) {
@@ -148,14 +148,14 @@ void rope_multi(const uint i0, const uint i1, rope_params p) {
float theta_base = 0.0;
if (p.is_imrope != 0) {

View File

@@ -1,401 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 10 Jan 2026 16:17:12 -0800
Subject: [PATCH] ollama: Add memory detection using DXGI + PDH
Add Windows-specific VRAM detection using DXGI and PDH performance counters.
This provides accurate memory reporting for both integrated and discrete GPUs.
Add luid field to Vulkan device context for device matching.
---
ggml/src/CMakeLists.txt | 1 +
ggml/src/ggml-impl.h | 3 +
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 24 +++
ggml/src/mem_dxgi_pdh.cpp | 297 +++++++++++++++++++++++++++
4 files changed, 325 insertions(+)
create mode 100644 ggml/src/mem_dxgi_pdh.cpp
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 47d7ea9ce..755ea86a7 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -207,6 +207,7 @@ add_library(ggml-base
ggml-quants.h
mem_hip.cpp
mem_nvml.cpp
+ mem_dxgi_pdh.cpp
gguf.cpp)
set_target_properties(ggml-base PROPERTIES
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 9549d0495..eacabb191 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -680,6 +680,9 @@ GGML_API void ggml_nvml_release();
GGML_API int ggml_hip_mgmt_init();
GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total, bool is_integrated_gpu);
GGML_API void ggml_hip_mgmt_release();
+GGML_API int ggml_dxgi_pdh_init();
+GGML_API int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu);
+GGML_API void ggml_dxgi_pdh_release();
#ifdef __cplusplus
}
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index b3eea31af..5dbe20e21 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -74,6 +74,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
#define VK_KHR_SHADER_BFLOAT16_EXTENSION_NAME "VK_KHR_shader_bfloat16"
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR ((VkStructureType)1000141000)
#define VK_COMPONENT_TYPE_BFLOAT16_KHR ((VkComponentTypeKHR)1000141000)
+#define VK_LUID_SIZE_KHR VK_LUID_SIZE
typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR {
VkStructureType sType;
@@ -14294,6 +14295,7 @@ struct ggml_backend_vk_device_context {
std::string pci_id;
std::string id;
std::string uuid;
+ std::string luid;
int major;
int minor;
int driver_major;
@@ -14318,6 +14320,20 @@ static const char * ggml_backend_vk_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
+ GGML_LOG_DEBUG("ggml_backend_vk_device_get_memory called: uuid %s\n", ctx->uuid.c_str());
+ GGML_LOG_DEBUG("ggml_backend_vk_device_get_memory called: luid %s\n", ctx->luid.c_str());
+
+ // Check VRAM reporting for Windows IGPU/DGPU using DXGI + PDH (vendor agnostic)
+ if (ggml_dxgi_pdh_init() == 0) {
+ GGML_LOG_DEBUG("DXGI + PDH Initialized. Getting GPU free memory info\n");
+ int status = ggml_dxgi_pdh_get_device_memory(ctx->luid.c_str(), free, total, ctx->is_integrated_gpu);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing DXGI + PDH memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_dxgi_pdh_release();
+ return;
+ }
+ ggml_dxgi_pdh_release();
+ }
// Use vendor specific management libraries for best VRAM reporting if available
if (!ctx->is_integrated_gpu) {
@@ -15090,6 +15106,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
}
}
ctx->uuid = oss.str();
+ const auto& luid = device_id_props.deviceLUID;
+ char luid_str[32]; // "0x" + 16 hex digits + null terminator = 19 chars
+ snprintf(luid_str, sizeof(luid_str), // high part + low part
+ "0x%02x%02x%02x%02x%02x%02x%02x%02x",
+ luid[7], luid[6], luid[5], luid[4],
+ luid[3], luid[2], luid[1], luid[0]
+ );
+ ctx->luid = std::string(luid_str);
ctx->major = 0;
ctx->minor = 0;
// TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
diff --git a/ggml/src/mem_dxgi_pdh.cpp b/ggml/src/mem_dxgi_pdh.cpp
new file mode 100644
index 000000000..4dd66c25f
--- /dev/null
+++ b/ggml/src/mem_dxgi_pdh.cpp
@@ -0,0 +1,297 @@
+// DXGI and PDH Performance Counters Library
+// This Windows-only (10/11) library provides accurate VRAM reporting
+#include "ggml.h"
+#include "ggml-impl.h"
+
+#ifdef _WIN32
+# define WIN32_LEAN_AND_MEAN
+# ifndef NOMINMAX
+# define NOMINMAX
+# endif
+#include <windows.h>
+#include <pdh.h>
+#include <dxgi1_2.h>
+#include <sstream>
+#include <thread>
+#include <filesystem>
+#include <mutex>
+
+namespace fs = std::filesystem;
+
+static std::mutex ggml_dxgi_pdh_lock;
+
+/*
+Struct to keep track of GPU adapter information at runtime
+*/
+struct GpuInfo {
+ std::wstring description; // debug field
+ LUID luid;
+ std::wstring pdhInstance;
+ double dedicatedTotal = 0.0;
+ double sharedTotal = 0.0;
+ double dedicatedUsage = 0.0;
+ double sharedUsage = 0.0;
+};
+
+/*
+DLL Function Pointers
+*/
+struct {
+ void *dxgi_dll_handle;
+ void *pdh_dll_handle;
+ // DXGI Functions
+ HRESULT (*CreateDXGIFactory1)(REFIID riid, void **ppFactory);
+ // PDH functions
+ PDH_STATUS (*PdhOpenQueryW)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery);
+ PDH_STATUS (*PdhAddCounterW)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter);
+ PDH_STATUS (*PdhCollectQueryData)(PDH_HQUERY hQuery);
+ PDH_STATUS (*PdhGetFormattedCounterValue)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue);
+ PDH_STATUS (*PdhCloseQuery)(PDH_HQUERY hQuery);
+} dll_functions {
+ nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr
+};
+
+/*
+Create a PDH Instance name
+*/
+static std::wstring generate_pdh_instance_name_from_luid(const LUID& luid) {
+ std::wstringstream ss;
+ ss << L"luid_0x" << std::hex << std::setw(8) << std::setfill(L'0') << std::uppercase << luid.HighPart
+ << L"_0x" << std::setw(8) << std::setfill(L'0') << luid.LowPart;
+ return ss.str();
+}
+
+/*
+Conversion from Bytes to GigaBytes
+*/
+template <typename T>
+static inline double b_to_gb(T n)
+{
+ return (double(n) / (1024.0 * 1024 * 1024));
+}
+
+/*
+Fetch the GPU adapter 'dedicated memory' and 'shared memory' using DXGI
+*/
+static void fetch_dxgi_adapter_desc1(const DXGI_ADAPTER_DESC1& desc, GpuInfo* info) {
+ auto dedicatedVideoMemory = desc.DedicatedVideoMemory;
+ auto sharedSystemMemory = desc.SharedSystemMemory;
+ GGML_LOG_DEBUG("[DXGI] Adapter Description: %ls, LUID: 0x%08X%08X, Dedicated: %.2f GB, Shared: %.2f GB\n", desc.Description, desc.AdapterLuid.HighPart, desc.AdapterLuid.LowPart, b_to_gb(dedicatedVideoMemory), b_to_gb(sharedSystemMemory));
+ if (info) {
+ info->dedicatedTotal = dedicatedVideoMemory; // values in bytes
+ info->sharedTotal = sharedSystemMemory;
+ }
+}
+
+/*
+Enumerate over the GPU adapters detected using DXGI and return their information
+*/
+static std::vector<GpuInfo> get_dxgi_gpu_infos() {
+ std::vector<GpuInfo> infos;
+ IDXGIFactory1* pFactory = nullptr;
+
+ if (SUCCEEDED(dll_functions.CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&pFactory))) {
+ UINT i = 0;
+ IDXGIAdapter1* pAdapter = nullptr;
+ while (pFactory->EnumAdapters1(i, &pAdapter) != DXGI_ERROR_NOT_FOUND) {
+ DXGI_ADAPTER_DESC1 desc;
+ pAdapter->GetDesc1(&desc);
+
+ // Get all the GPU adapter info
+ GpuInfo info;
+ fetch_dxgi_adapter_desc1(desc, &info);
+ info.description = std::wstring(desc.Description);
+ info.luid = desc.AdapterLuid;
+ info.pdhInstance = generate_pdh_instance_name_from_luid(desc.AdapterLuid);
+ infos.push_back(info);
+
+ pAdapter->Release();
+ ++i;
+ }
+ pFactory->Release();
+ }
+ return infos;
+}
+
+static bool get_gpu_memory_usage(GpuInfo& gpu) {
+ PDH_HQUERY query;
+ if (dll_functions.PdhOpenQueryW(NULL, 0, &query) != ERROR_SUCCESS) {
+ return false;
+ }
+
+ struct GpuCounters {
+ PDH_HCOUNTER dedicated;
+ PDH_HCOUNTER shared;
+ };
+
+ GpuCounters gpuCounter{};
+
+ std::wstring dedicatedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Dedicated Usage";
+ std::wstring sharedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Shared Usage";
+
+ if (dll_functions.PdhAddCounterW(query, dedicatedPath.c_str(), 0, &gpuCounter.dedicated) != ERROR_SUCCESS ||
+ dll_functions.PdhAddCounterW(query, sharedPath.c_str(), 0, &gpuCounter.shared) != ERROR_SUCCESS) {
+ GGML_LOG_ERROR("Failed to add PDH counters for GPU %s\n", std::string(gpu.pdhInstance.begin(), gpu.pdhInstance.end()).c_str());
+ dll_functions.PdhCloseQuery(query);
+ return false;
+ }
+
+ // Sample the data
+ if (dll_functions.PdhCollectQueryData(query) != ERROR_SUCCESS) {
+ dll_functions.PdhCloseQuery(query);
+ return false;
+ }
+
+ // Read final values
+ PDH_FMT_COUNTERVALUE val;
+
+ if (dll_functions.PdhGetFormattedCounterValue(gpuCounter.dedicated, PDH_FMT_DOUBLE, NULL, &val) == ERROR_SUCCESS)
+ gpu.dedicatedUsage = val.doubleValue;
+
+ if (dll_functions.PdhGetFormattedCounterValue(gpuCounter.shared, PDH_FMT_DOUBLE, NULL, &val) == ERROR_SUCCESS)
+ gpu.sharedUsage = val.doubleValue;
+
+ dll_functions.PdhCloseQuery(query);
+ return true;
+}
+
+
+extern "C" {
+
+ int ggml_dxgi_pdh_init() {
+ GGML_LOG_DEBUG("%s called\n", __func__);
+ std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
+ if (dll_functions.dxgi_dll_handle != NULL && dll_functions.pdh_dll_handle != NULL) {
+ // Already initialized as we have both DLL handles
+ return ERROR_SUCCESS;
+ }
+
+ DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+ SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+ fs::path libPath_dxgi = fs::path("\\Windows") / fs::path("System32") / fs::path("dxgi.dll");
+ fs::path libPath_pdh = fs::path("\\Windows") / fs::path("System32") / fs::path("pdh.dll");
+
+ // Call LoadLibraryW on both DLLs to ensure they are loaded
+ void *dxgi = (void*)LoadLibraryW(libPath_dxgi.wstring().c_str());
+ void *pdh = (void*)LoadLibraryW(libPath_pdh.wstring().c_str());
+ if(dxgi == NULL || pdh == NULL) {
+ if (dxgi != NULL) {
+ FreeLibrary((HMODULE)(dxgi));
+ }
+ if (pdh != NULL) {
+ FreeLibrary((HMODULE)(pdh));
+ }
+ SetErrorMode(old_mode);
+ return ERROR_DLL_NOT_FOUND;
+ }
+ else {
+ // save the dll handles
+ dll_functions.dxgi_dll_handle = dxgi;
+ dll_functions.pdh_dll_handle = pdh;
+ }
+
+ // Get pointers to the library functions loaded by the DLLs
+ dll_functions.CreateDXGIFactory1 = (HRESULT (*)(REFIID riid, void **ppFactory)) GetProcAddress((HMODULE)(dll_functions.dxgi_dll_handle), "CreateDXGIFactory1");
+ dll_functions.PdhOpenQueryW = (PDH_STATUS (*)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhOpenQueryW");
+ dll_functions.PdhAddCounterW = (PDH_STATUS (*)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhAddCounterW");
+ dll_functions.PdhCollectQueryData = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCollectQueryData");
+ dll_functions.PdhGetFormattedCounterValue = (PDH_STATUS (*)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhGetFormattedCounterValue");
+ dll_functions.PdhCloseQuery = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCloseQuery");
+
+ SetErrorMode(old_mode); // set old mode before any return
+
+ // Check if any function pointers are NULL (not found)
+ if (dll_functions.CreateDXGIFactory1 == NULL || dll_functions.PdhOpenQueryW == NULL || dll_functions.PdhAddCounterW == NULL || dll_functions.PdhCollectQueryData == NULL || dll_functions.PdhGetFormattedCounterValue == NULL || dll_functions.PdhCloseQuery == NULL) {
+ GGML_LOG_INFO("%s unable to locate required symbols in either dxgi.dll or pdh.dll", __func__);
+ FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle));
+ FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle));
+ dll_functions.dxgi_dll_handle = NULL;
+ dll_functions.pdh_dll_handle = NULL;
+ return ERROR_PROC_NOT_FOUND;
+ }
+
+ // No other initializations needed, successfully loaded the libraries and functions!
+ return ERROR_SUCCESS;
+ }
+
+ void ggml_dxgi_pdh_release() {
+ std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
+ if (dll_functions.dxgi_dll_handle == NULL && dll_functions.pdh_dll_handle == NULL) {
+ // Already freed the DLLs
+ return;
+ }
+
+ // Call FreeLibrary on both DLLs
+ FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle));
+ FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle));
+
+ dll_functions.dxgi_dll_handle = NULL;
+ dll_functions.pdh_dll_handle = NULL;
+
+ return; // successfully released
+ }
+
+ int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) {
+
+ std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
+
+ // Enumerate GPUs using DXGI and find the matching LUID
+ // This also fetches the total memory info for each of the enumerated GPUs
+ std::vector<GpuInfo> gpus = get_dxgi_gpu_infos();
+ GpuInfo *targetGpu = nullptr;
+ for (auto& gpu : gpus) {
+ char luid_buffer[32]; // "0x" + 16 hex digits + null terminator
+ snprintf(luid_buffer, sizeof(luid_buffer), "0x%08x%08x", gpu.luid.HighPart, gpu.luid.LowPart);
+ std::string gpu_luid_str(luid_buffer);
+ if (gpu_luid_str == std::string(luid)) {
+ targetGpu = &gpu;
+ break;
+ }
+ }
+ if (!targetGpu) {
+ GGML_LOG_ERROR("GPU with LUID %s not found.\n", luid);
+ return ERROR_NOT_FOUND;
+ }
+
+ // Get the current memory usage for the target GPU
+ int status = get_gpu_memory_usage(*targetGpu);
+ if (!status) {
+ GGML_LOG_ERROR("Failed to get GPU memory usage.\n");
+ return ERROR_DEVICE_NOT_AVAILABLE;
+ }
+
+ // Calculate the free memory based on whether it's an integrated or discrete GPU
+ if (is_integrated_gpu) {
+ // IGPU free = SharedTotal - SharedUsage
+ GGML_LOG_DEBUG("Integrated GPU (%ls) with LUID %s detected. Shared Total: %.2f bytes (%.2f GB), Shared Usage: %.2f bytes (%.2f GB), Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->sharedTotal, b_to_gb(targetGpu->sharedTotal), targetGpu->sharedUsage, b_to_gb(targetGpu->sharedUsage), targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage));
+ *free = (targetGpu->sharedTotal - targetGpu->sharedUsage) + (targetGpu->dedicatedTotal - targetGpu->dedicatedUsage); // Some IGPUs also have dedicated memory, which can be used along with the IGPU's shared memory
+ *total = targetGpu->sharedTotal + targetGpu->dedicatedTotal;
+ }
+ else {
+ // DGPU free = DedicatedTotal - DedicatedUsage
+ GGML_LOG_DEBUG("Discrete GPU (%ls) with LUID %s detected. Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage));
+ *free = targetGpu->dedicatedTotal - targetGpu->dedicatedUsage;
+ *total = targetGpu->dedicatedTotal;
+ }
+
+ return ERROR_SUCCESS;
+ }
+
+} // extern "C"
+
+#else // #ifdef _WIN32
+
+extern "C" {
+
+ // DXGI + PDH not available for Linux implementation
+ int ggml_dxgi_pdh_init() {
+ return -1;
+ }
+ void ggml_dxgi_pdh_release() {}
+ int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) {
+ return -1;
+ }
+
+} // extern "C"
+
+#endif // #ifdef _WIN32
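
As orientation for the new entry points, a hedged usage sketch follows (standalone, not part of the patch). The prototypes are copied from the ggml-impl.h hunk above; the LUID string passed in main() is a made-up placeholder, since real callers build it as "0x" plus the zero-padded hex HighPart and LowPart of the adapter LUID, which is the format the matching loop in ggml_dxgi_pdh_get_device_memory compares against.

// Hedged usage sketch of the API added by mem_dxgi_pdh.cpp; link against ggml-base.
#include <cstddef>
#include <cstdio>

extern "C" {
int  ggml_dxgi_pdh_init();
int  ggml_dxgi_pdh_get_device_memory(const char * luid, size_t * free, size_t * total, bool is_integrated_gpu);
void ggml_dxgi_pdh_release();
}

static void report_vram(const char * luid, bool is_integrated_gpu) {
    if (ggml_dxgi_pdh_init() != 0) {
        return; // DLLs unavailable or non-Windows build: fall back to other reporting paths
    }
    size_t free_bytes = 0, total_bytes = 0;
    if (ggml_dxgi_pdh_get_device_memory(luid, &free_bytes, &total_bytes, is_integrated_gpu) == 0) {
        std::printf("GPU %s: %zu of %zu bytes free\n", luid, free_bytes, total_bytes);
    }
    ggml_dxgi_pdh_release();
}

int main() {
    report_vram("0x0000000000012345", /*is_integrated_gpu=*/false); // hypothetical discrete GPU LUID
    return 0;
}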

View File

@@ -10,10 +10,10 @@ fallback to cpu
1 file changed, 3 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 4dbf7097a..311691206 100644
index 334a30135..5c9dfd032 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -4494,6 +4494,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
@@ -4633,6 +4633,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
return false;
}

View File

@@ -9,10 +9,10 @@ Revert to prior logic of assuming an empty projector type is mlp
1 file changed, 4 insertions(+)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 26710548e..a8c170cd7 100644
index 84a3796b5..d3a37842d 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -973,6 +973,10 @@ struct clip_model_loader {
@@ -960,6 +960,10 @@ struct clip_model_loader {
if (proj_type.empty()) {
if (modality == CLIP_MODALITY_VISION) {
get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
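
The four inserted lines themselves are not shown above. As a hedged sketch of the behavior the subject line describes, the fallback amounts to treating an empty projector type as the legacy "mlp" default; the helper below is illustrative only and is not the literal patch body.

#include <cstdio>
#include <string>

// Hedged sketch: an empty projector type read from GGUF metadata falls back
// to the legacy default "mlp".
static std::string resolve_projector_type(std::string proj_type) {
    if (proj_type.empty()) {
        proj_type = "mlp";
    }
    return proj_type;
}

int main() {
    std::printf("%s\n", resolve_projector_type("").c_str()); // prints "mlp"
    return 0;
}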

View File

@@ -8,10 +8,10 @@ Subject: [PATCH] win: exit instead of abort
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 09b8eb466..659c7f4ed 100644
index eb3ae72ea..c9242a15a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -252,8 +252,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
@@ -250,8 +250,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
fprintf(stderr, "%s\n", message);
ggml_print_backtrace();
}
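
The hunk body is truncated above. As a hedged illustration of the pattern named in the subject line, the shape is roughly the following; fatal_exit is a hypothetical name, not the code the patch actually adds.

#include <cstdlib>

// Hedged sketch of "exit instead of abort"; not the literal patch body.
[[noreturn]] static void fatal_exit(void) {
#if defined(_WIN32)
    // On Windows, abort() can raise a crash dialog or invoke Windows Error
    // Reporting; exiting with a nonzero status terminates quietly after the
    // message and backtrace have already been printed.
    exit(1);
#else
    abort();
#endif
}

int main(void) {
    fatal_exit(); // never returns
}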

View File

@@ -1,47 +0,0 @@
# Patch Notes
Notes on patches that had conflicts or could be simplified during the Jan 2026 update to `b1377188`.
## Patches with Conflicts
### 0004-solar-pro.patch
- **Conflict**: `src/llama-arch.cpp` - upstream added `LLM_ARCH_MAINCODER` in same location
- **Resolution**: Keep both architectures (MAINCODER from upstream, SOLAR from patch)
- **Simplification**: Consider upstreaming Solar Pro support
### 0009-remove-amx.patch
- **Conflict**: `ggml/src/CMakeLists.txt` - upstream restructured CPU variants significantly
- **Resolution**: Kept upstream's expanded variant list but removed sapphirerapids with AMX
- **Simplification**: Upstream now conditionalizes AMX behind `if (NOT MSVC)`, consider if full removal is still needed
### 0015-ggml-Export-GPU-UUIDs.patch
- **Conflict**: Could not build fake ancestor due to significant file changes
- **Resolution**: Applied manually - added `id` field to props struct, UUID parsing function, etc.
- **Note**: Upstream added `device_id` for PCI bus IDs; our `id` is for GPU UUIDs (different purposes)
### 0018-ggml-Add-batch-size-hint.patch
- **Status**: Large API change affecting 8 files
- **Conflict**: Function signature changes across multiple backends
- **Simplification**: Consider upstreaming this - it's a clean optimization
## Patches That Applied Cleanly
- 0001-0003, 0005-0008, 0010-0014, 0016-0017
## Patches Not Yet Attempted
- 0019-0031
## Potential Upstream Candidates
These patches fix general issues that could benefit upstream:
- 0002-pretokenizer - graceful handling of unknown pretokenizers
- 0003-clip-unicode - Windows unicode path support
- 0004-solar-pro - Solar Pro architecture
- 0005-fix-deepseek-deseret-regex - Windows regex fix
- 0010-fix-string-arr-kv-loading - gguf array loading
## Ollama-Specific (Won't Upstream)
- 0011-ollama-debug-tensor
- 0012-add-ollama-vocab-for-grammar-support
- 0015-ggml-Export-GPU-UUIDs (custom UUID field)
- 0024-GPU-discovery-enhancements
- 0025-NVML-fallback-for-unified-memory-GPUs
- 0028-Add-memory-detection-using-DXGI-PDH

View File

@@ -72,7 +72,7 @@ struct llama_vocab * llama_load_vocab_from_file(const char * fname) {
try {
const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
std::vector<std::string> splits = {};
llama_model_loader ml(std::string(fname), splits, false, false, false, false, nullptr, nullptr);
llama_model_loader ml(std::string(fname), splits, false, false, false, nullptr, nullptr);
vocab->load(ml, kv);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());

Some files were not shown because too many files have changed in this diff