refactor: consolidate imagegen default constants

Add defaults.go with DefaultWidth and DefaultHeight constants. Steps are model-dependent and now handled by each model: - Z-Image uses 9 steps (turbo model) - Other models define their own defaults Updated all files using hardcoded 1024x1024 values to use constants.
fix: capture multiple stderr lines for better error reporting
2026-01-15 10:58:36 -05:00 · 2026-01-15 00:42:53 -08:00 · 2026-01-14 22:27:37 -08:00 · 2026-01-14 22:25:24 -08:00 · 2026-01-14 22:21:49 -08:00 · 2026-01-14 22:21:00 -08:00
253 changed files with 5245 additions and 15148 deletions
--- a/.github/ISSUE_TEMPLATE/10_bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/10_bug_report.yml
@@ -13,7 +13,7 @@ body:
    id: logs
    attributes:
      label: Relevant log output
-      description: Please copy and paste any relevant log output. See [Troubleshooting Guide](https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md#how-to-troubleshoot-issues) for details.
+      description: Please copy and paste any relevant log output. See [Troubleshooting Guide](https://github.com/ollama/ollama/blob/main/docs/troubleshooting.mdx#how-to-troubleshoot-issues) for details.
      render: shell
    validations:
      required: false
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -372,13 +372,17 @@ jobs:
          outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
          cache-to: type=inline
+      - name: Deduplicate CUDA libraries
+        run: |
+          ./scripts/deduplicate_cuda_libs.sh dist/${{ matrix.os }}-${{ matrix.arch }}
      - run: |
          for COMPONENT in bin/* lib/ollama/*; do
            case "$COMPONENT" in
-              bin/ollama)                echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              bin/ollama*)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/*.so*)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_v*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/vulkan*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/mlx*)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_jetpack5)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
              lib/ollama/cuda_jetpack6)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
              lib/ollama/rocm)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,9 +48,10 @@ if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
    set(GGML_CPU_ALL_VARIANTS ON)
 endif()

-if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
+if(APPLE)
    set(CMAKE_BUILD_RPATH "@loader_path")
    set(CMAKE_INSTALL_RPATH "@loader_path")
+    set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
 endif()

 set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
@@ -189,13 +190,21 @@ if(MLX_ENGINE)
    install(TARGETS mlx mlxc
        RUNTIME_DEPENDENCIES
            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
-            PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc cudnn nccl
+            PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc nvrtc-builtins cudnn nccl openblas gfortran
            PRE_EXCLUDE_REGEXES ".*"
        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
        FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
    )

+    # Install the Metal library for macOS arm64 (must be colocated with the binary)
+    # Metal backend is only built for arm64, not x86_64
+    # NOTE(review): the other Apple check in this file tests CMAKE_OSX_ARCHITECTURES,
+    # while this guard tests CMAKE_SYSTEM_PROCESSOR; confirm the two agree when
+    # cross-building, because install(FILES) errors out if mlx.metallib is missing.
+    # NOTE(review): the path assumes MLX is built under _deps/mlx-build - TODO confirm.
+    if(APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
+        install(FILES ${CMAKE_BINARY_DIR}/_deps/mlx-build/mlx/backend/metal/kernels/mlx.metallib
+            DESTINATION ${OLLAMA_INSTALL_DIR}
+            COMPONENT MLX)
+    endif()
+
    # Manually install cudart and cublas since they might not be picked up as direct dependencies
    if(CUDAToolkit_FOUND)
        file(GLOB CUDART_LIBS
--- a/4
+++ b/4
@@ -161,6 +161,9 @@ ARG GOFLAGS="'-ldflags=-w -s'"
 ENV CGO_ENABLED=1
 ARG CGO_CFLAGS
 ARG CGO_CXXFLAGS
+RUN mkdir -p dist/bin
+RUN --mount=type=cache,target=/root/.cache/go-build \
+    go build -tags mlx -trimpath -buildmode=pie -o dist/bin/ollama-mlx .

 FROM base AS build
 WORKDIR /go/src/github.com/ollama/ollama
@@ -182,6 +185,7 @@ COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
 COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
 COPY --from=vulkan  dist/lib/ollama  /lib/ollama/
 COPY --from=mlx     /go/src/github.com/ollama/ollama/dist/lib/ollama /lib/ollama/
+COPY --from=mlx     /go/src/github.com/ollama/ollama/dist/bin/ /bin/

 FROM --platform=linux/arm64 scratch AS arm64
 # COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
--- a/Makefile.sync
+++ b/Makefile.sync
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=b1377188784f9aea26b8abde56d4aee8c733eec7
+FETCH_HEAD=ec98e2002

 .PHONY: help
 help:
--- a/README.md
+++ b/README.md
@@ -290,6 +290,7 @@ See the [API documentation](./docs/api.md) for all endpoints.

 ### Web & Desktop

+- [Onyx](https://github.com/onyx-dot-app/onyx)
 - [Open WebUI](https://github.com/open-webui/open-webui)
 - [SwiftChat (macOS with ReactNative)](https://github.com/aws-samples/swift-chat)
 - [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
--- a/api/client.go
+++ b/api/client.go
@@ -165,7 +165,7 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 	return nil
 }

-const maxBufferSize = 512 * format.KiloByte
+const maxBufferSize = 8 * format.MegaByte

 func (c *Client) stream(ctx context.Context, method, path string, data any, fn func([]byte) error) error {
 	var buf io.Reader
--- a/api/types.go
+++ b/api/types.go
@@ -127,6 +127,10 @@ type GenerateRequest struct {
 	// each with an associated log probability. Only applies when Logprobs is true.
 	// Valid values are 0-20. Default is 0 (only return the selected token's logprob).
 	TopLogprobs int `json:"top_logprobs,omitempty"`
+
+	// Size specifies the image dimensions for image generation models.
+	// Format: "WxH" (e.g., "1024x1024"). OpenAI-compatible.
+	Size string `json:"size,omitempty"`
 }

 // ChatRequest describes a request sent by [Client.Chat].
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -100,7 +100,8 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		if filename == "" {
 			// No Modelfile found - check if current directory is an image gen model
 			if imagegen.IsTensorModelDir(".") {
-				return imagegenclient.CreateModel(args[0], ".", p)
+				quantize, _ := cmd.Flags().GetString("quantize")
+				return imagegenclient.CreateModel(args[0], ".", quantize, p)
 			}
 			reader = strings.NewReader("FROM .\n")
 		} else {
@@ -464,14 +465,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {

 	name := args[0]

-	// Check if this is a known image generation model (skip Show/Pull)
-	if imagegen.HasTensorLayers(name) {
-		if opts.Prompt == "" && !interactive {
-			return errors.New("image generation models require a prompt. Usage: ollama run " + name + " \"your prompt here\"")
-		}
-		return imagegen.RunCLI(cmd, name, opts.Prompt, interactive, opts.KeepAlive)
-	}
-
 	info, err := func() (*api.ShowResponse, error) {
 		showReq := &api.ShowRequest{Name: name}
 		info, err := client.Show(cmd.Context(), showReq)
@@ -533,9 +526,18 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		return generateEmbedding(cmd, name, opts.Prompt, opts.KeepAlive, truncate, dimensions)
 	}

+	// Check if this is an image generation model
+	if slices.Contains(info.Capabilities, model.CapabilityImageGeneration) {
+		if opts.Prompt == "" && !interactive {
+			return errors.New("image generation models require a prompt. Usage: ollama run " + name + " \"your prompt here\"")
+		}
+		return imagegen.RunCLI(cmd, name, opts.Prompt, interactive, opts.KeepAlive)
+	}
+
 	// Check for experimental flag
 	isExperimental, _ := cmd.Flags().GetBool("experimental")
 	yoloMode, _ := cmd.Flags().GetBool("experimental-yolo")
+	enableWebsearch, _ := cmd.Flags().GetBool("experimental-websearch")

 	if interactive {
 		if err := loadOrUnloadModel(cmd, &opts); err != nil {
@@ -565,7 +567,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {

 		// Use experimental agent loop with tools
 		if isExperimental {
-			return xcmd.GenerateInteractive(cmd, opts.Model, opts.WordWrap, opts.Options, opts.Think, opts.HideThinking, opts.KeepAlive, yoloMode)
+			return xcmd.GenerateInteractive(cmd, opts.Model, opts.WordWrap, opts.Options, opts.Think, opts.HideThinking, opts.KeepAlive, yoloMode, enableWebsearch)
 		}

 		return generateInteractive(cmd, opts)
@@ -671,7 +673,11 @@ func PushHandler(cmd *cobra.Command, args []string) error {

 			bar, ok := bars[resp.Digest]
 			if !ok {
-				bar = progress.NewBar(fmt.Sprintf("pushing %s...", resp.Digest[7:19]), resp.Total, resp.Completed)
+				// Label the new bar with the server-provided status when there is
+				// one; otherwise fall back to a shortened digest. A digest shorter
+				// than 19 bytes is shown whole, because slicing [7:19] on it would
+				// panic with an out-of-range error.
+				msg := resp.Status
+				if msg == "" {
+					if len(resp.Digest) >= 19 {
+						msg = fmt.Sprintf("pushing %s...", resp.Digest[7:19])
+					} else {
+						msg = fmt.Sprintf("pushing %s...", resp.Digest)
+					}
+				}
+				bar = progress.NewBar(msg, resp.Total, resp.Completed)
 				bars[resp.Digest] = bar
 				p.Add(resp.Digest, bar)
 			}
@@ -837,11 +843,6 @@ func DeleteHandler(cmd *cobra.Command, args []string) error {
 }

 func ShowHandler(cmd *cobra.Command, args []string) error {
-	// Check if this is an image generation model
-	if imagegen.HasTensorLayers(args[0]) {
-		return imagegen.Show(args[0], os.Stdout)
-	}
-
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		return err
@@ -1786,6 +1787,7 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Int("dimensions", 0, "Truncate output embeddings to specified dimension (embedding models only)")
 	runCmd.Flags().Bool("experimental", false, "Enable experimental agent loop with tools")
 	runCmd.Flags().Bool("experimental-yolo", false, "Skip all tool approval prompts (use with caution)")
+	runCmd.Flags().Bool("experimental-websearch", false, "Enable web search tool in experimental mode")

 	// Image generation flags (width, height, steps, seed, etc.)
 	imagegen.RegisterFlags(runCmd)
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -1547,6 +1547,79 @@ func TestRunOptions_Copy_ThinkValueVariants(t *testing.T) {
 	}
 }

+// TestShowInfoImageGen verifies the text that showInfo writes for a response
+// whose only capability is image generation: a "Model" section with the
+// architecture, parameters, quantization and required version, followed by a
+// "Capabilities" section listing "image".
+func TestShowInfoImageGen(t *testing.T) {
+	resp := &api.ShowResponse{
+		Details: api.ModelDetails{
+			Family:            "ZImagePipeline",
+			ParameterSize:     "10.3B",
+			QuantizationLevel: "FP8",
+		},
+		Capabilities: []model.Capability{model.CapabilityImageGeneration},
+		Requires:     "0.14.0",
+	}
+
+	var out bytes.Buffer
+	if err := showInfo(resp, false, &out); err != nil {
+		t.Fatal(err)
+	}
+
+	// Trailing spaces are part of the expected column padding.
+	want := "  Model\n" +
+		"    architecture    ZImagePipeline    \n" +
+		"    parameters      10.3B             \n" +
+		"    quantization    FP8               \n" +
+		"    requires        0.14.0            \n" +
+		"\n" +
+		"  Capabilities\n" +
+		"    image    \n" +
+		"\n"
+	if diff := cmp.Diff(want, out.String()); diff != "" {
+		t.Errorf("unexpected output (-want +got):\n%s", diff)
+	}
+}
+
+// TestPushProgressMessage checks how the label of a push progress bar is
+// chosen from a status string and a digest: a non-empty status is used as-is,
+// otherwise the label is "pushing <digest[7:19]>...", and a digest shorter
+// than 19 bytes is used whole.
+//
+// NOTE(review): the selection logic is re-implemented inside the subtest
+// instead of being called from the push handler, so this test cannot detect a
+// regression in the production code path. Consider extracting a shared helper
+// that both the handler and this test call.
+func TestPushProgressMessage(t *testing.T) {
+	tests := []struct {
+		name    string
+		status  string
+		digest  string
+		wantMsg string
+	}{
+		{
+			name:    "uses status when provided",
+			status:  "uploading model",
+			digest:  "sha256:abc123456789def",
+			wantMsg: "uploading model",
+		},
+		{
+			name:    "falls back to digest when status empty",
+			status:  "",
+			digest:  "sha256:abc123456789def",
+			wantMsg: "pushing abc123456789...",
+		},
+		{
+			name:    "handles short digest gracefully",
+			status:  "",
+			digest:  "sha256:abc",
+			wantMsg: "pushing sha256:abc...",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			msg := tt.status
+			if msg == "" {
+				// [7:19] skips the 7-byte "sha256:" prefix and keeps the next 12 characters.
+				if len(tt.digest) >= 19 {
+					msg = fmt.Sprintf("pushing %s...", tt.digest[7:19])
+				} else {
+					// Too short to slice safely; show the digest unchanged.
+					msg = fmt.Sprintf("pushing %s...", tt.digest)
+				}
+			}
+			if msg != tt.wantMsg {
+				t.Errorf("got %q, want %q", msg, tt.wantMsg)
+			}
+		})
+	}
+}
+
 func TestRunOptions_Copy_Independence(t *testing.T) {
 	// Test that modifications to original don't affect copy
 	originalThink := &api.ThinkValue{Value: "original"}
--- a/docs/capabilities/web-search.mdx
+++ b/docs/capabilities/web-search.mdx
@@ -110,7 +110,7 @@ More Ollama [Python example](https://github.com/ollama/ollama-python/blob/main/e
 import { Ollama } from "ollama";

 const client = new Ollama();
-const results = await client.webSearch({ query: "what is ollama?" });
+const results = await client.webSearch("what is ollama?");
 console.log(JSON.stringify(results, null, 2));
 ```

@@ -213,7 +213,7 @@ models](https://ollama.com/models)\n\nAvailable for macOS, Windows, and Linux',
 import { Ollama } from "ollama";

 const client = new Ollama();
-const fetchResult = await client.webFetch({ url: "https://ollama.com" });
+const fetchResult = await client.webFetch("https://ollama.com");
 console.log(JSON.stringify(fetchResult, null, 2));
 ```

--- a/docs/docs.json
+++ b/docs/docs.json
@@ -111,7 +111,9 @@
              "/integrations/zed",
              "/integrations/roo-code",
              "/integrations/n8n",
-              "/integrations/xcode"
+              "/integrations/xcode",
+              "/integrations/onyx",
+              "/integrations/marimo"
            ]
          },
          {
--- a/docs/faq.mdx
+++ b/docs/faq.mdx
@@ -22,7 +22,7 @@ Please refer to the [GPU docs](./gpu).

 ## How can I specify the context window size?

-By default, Ollama uses a context window size of 2048 tokens.
+By default, Ollama uses a context window size of 4096 tokens.

 This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:

--- a/docs/images/marimo-add-model.png
+++ b/docs/images/marimo-add-model.png
--- a/docs/images/marimo-chat.png
+++ b/docs/images/marimo-chat.png
--- a/docs/images/marimo-code-completion.png
+++ b/docs/images/marimo-code-completion.png
--- a/docs/images/marimo-models.png
+++ b/docs/images/marimo-models.png
--- a/docs/images/marimo-settings.png
+++ b/docs/images/marimo-settings.png
--- a/docs/images/onyx-login.png
+++ b/docs/images/onyx-login.png
--- a/docs/images/onyx-ollama-form.png
+++ b/docs/images/onyx-ollama-form.png
--- a/docs/images/onyx-ollama-llm.png
+++ b/docs/images/onyx-ollama-llm.png
--- a/docs/images/onyx-query.png
+++ b/docs/images/onyx-query.png
--- a/docs/integrations/marimo.mdx
+++ b/docs/integrations/marimo.mdx
@@ -0,0 +1,73 @@
+---
+title: marimo
+---
+
+## Install
+
+Install [marimo](https://marimo.io). You can use `pip` or `uv` for this. You 
+can also use `uv` to create a sandboxed environment for marimo by running:
+
+```
+uvx marimo edit --sandbox notebook.py
+```
+
+## Usage with Ollama
+
+1. In marimo, go to the user settings and go to the AI tab. From here
+you can find and configure Ollama as an AI provider. For local use you
+would typically point the base url to `http://localhost:11434/v1`.
+
+<div style={{ display: 'flex', justifyContent: 'center' }}>
+  <img 
+    src="/images/marimo-settings.png" 
+    alt="Ollama settings in marimo"
+    width="50%"
+  />
+</div>
+
+2. Once the AI provider is set up, you can turn on/off specific AI models you'd like to access. 
+
+<div style={{ display: 'flex', justifyContent: 'center' }}>
+  <img 
+    src="/images/marimo-models.png" 
+    alt="Selecting an Ollama model"
+    width="50%"
+  />
+</div>
+
+3. You can also add a model to the list of available models by scrolling to the bottom and using the UI there. 
+
+<div style={{ display: 'flex', justifyContent: 'center' }}>
+  <img 
+    src="/images/marimo-add-model.png" 
+    alt="Adding a new Ollama model"
+    width="50%"
+  />
+</div>
+
+4. Once configured, you can now use Ollama for AI chats in marimo.
+
+<div style={{ display: 'flex', justifyContent: 'center' }}>
+  <img 
+    src="/images/marimo-chat.png" 
+    alt="Using Ollama for AI chat in marimo"
+    width="50%"
+  />
+</div>
+
+5. Alternatively, you can now use Ollama for **inline code completion** in marimo. This can be configured in the "AI Features" tab.
+
+<div style={{ display: 'flex', justifyContent: 'center' }}>
+  <img 
+    src="/images/marimo-code-completion.png" 
+    alt="Configure code completion"
+    width="50%"
+  />
+</div>
+
+
+## Connecting to ollama.com
+
+1. Sign in to ollama cloud via `ollama signin` 
+2. In the ollama model settings add a model that ollama hosts, like `gpt-oss:120b`.
+3. You can now refer to this model in marimo!
--- a/docs/integrations/onyx.mdx
+++ b/docs/integrations/onyx.mdx
@@ -0,0 +1,63 @@
+---
+title: Onyx
+---
+
+## Overview
+[Onyx](http://onyx.app/) is a self-hostable Chat UI that integrates with all Ollama models. Features include:
+- Creating custom Agents
+- Web search
+- Deep Research
+- RAG over uploaded documents and connected apps
+- Connectors to applications like Google Drive, Email, Slack, etc.
+- MCP and OpenAPI Actions support
+- Image generation
+- User/Groups management, RBAC, SSO, etc.
+
+Onyx can be deployed for single users or large organizations.
+
+## Install Onyx
+
+Deploy Onyx with the [quickstart guide](https://docs.onyx.app/deployment/getting_started/quickstart).
+
+<Info>
+Resourcing/scaling docs [here](https://docs.onyx.app/deployment/getting_started/resourcing).
+</Info>
+
+## Usage with Ollama 
+
+1. Log in to your Onyx deployment (create an account first).
+<div style={{ display: 'flex', justifyContent: 'center' }}>
+  <img 
+    src="/images/onyx-login.png" 
+    alt="Onyx Login Page"
+    width="75%"
+  />
+</div>
+2. In the set-up process select `Ollama` as the LLM provider.
+<div style={{ display: 'flex', justifyContent: 'center' }}>
+  <img 
+    src="/images/onyx-ollama-llm.png" 
+    alt="Onyx Set Up Form"
+    width="75%"
+  />
+</div>
+3. Provide your **Ollama API URL** and select your models.
+<Note>If you're running Onyx in Docker, to access your computer's local network use `http://host.docker.internal` instead of `http://127.0.0.1`.</Note>
+<div style={{ display: 'flex', justifyContent: 'center' }}>
+  <img 
+    src="/images/onyx-ollama-form.png" 
+    alt="Selecting Ollama Models"
+    width="75%"
+  />
+</div>
+
+You can also easily connect up Onyx Cloud with the `Ollama Cloud` tab of the setup.
+
+## Send your first query
+<div style={{ display: 'flex', justifyContent: 'center' }}>
+  <img 
+    src="/images/onyx-query.png" 
+    alt="Onyx Query Example"
+    width="75%"
+  />
+</div>
--- a/docs/linux.mdx
+++ b/docs/linux.mdx
@@ -1,5 +1,5 @@
 ---
-title: "Linux"
+title: Linux
 ---

 ## Install
@@ -13,14 +13,15 @@ curl -fsSL https://ollama.com/install.sh | sh
 ## Manual install

 <Note>
-  If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
+  If you are upgrading from a prior version, you should remove the old libraries
+  with `sudo rm -rf /usr/lib/ollama` first.
 </Note>

 Download and extract the package:

 ```shell
-curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz \
-    | sudo tar zx -C /usr
+curl -fsSL https://ollama.com/download/ollama-linux-amd64.tar.zst \
+    | sudo tar x -C /usr
 ```

 Start Ollama:
@@ -40,8 +41,8 @@ ollama -v
 If you have an AMD GPU, also download and extract the additional ROCm package:

 ```shell
-curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tgz \
-    | sudo tar zx -C /usr
+curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tar.zst \
+    | sudo tar x -C /usr
 ```

 ### ARM64 install
@@ -49,8 +50,8 @@ curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tgz \
 Download and extract the ARM64-specific package:

 ```shell
-curl -fsSL https://ollama.com/download/ollama-linux-arm64.tgz \
-    | sudo tar zx -C /usr
+curl -fsSL https://ollama.com/download/ollama-linux-arm64.tar.zst \
+    | sudo tar x -C /usr
 ```

 ### Adding Ollama as a startup service (recommended)
@@ -112,7 +113,11 @@ sudo systemctl status ollama
 ```

 <Note>
-  While AMD has contributed the `amdgpu` driver upstream to the official linux kernel source, the version is older and may not support all ROCm features. We recommend you install the latest driver from https://www.amd.com/en/support/linux-drivers for best support of your Radeon GPU.
+  While AMD has contributed the `amdgpu` driver upstream to the official linux
+  kernel source, the version is older and may not support all ROCm features. We
+  recommend you install the latest driver from
+  https://www.amd.com/en/support/linux-drivers for best support of your Radeon
+  GPU.
 </Note>

 ## Customizing
@@ -141,8 +146,8 @@ curl -fsSL https://ollama.com/install.sh | sh
 Or by re-downloading Ollama:

 ```shell
-curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz \
-    | sudo tar zx -C /usr
+curl -fsSL https://ollama.com/download/ollama-linux-amd64.tar.zst \
+    | sudo tar x -C /usr
 ```

 ## Installing specific versions
@@ -191,4 +196,4 @@ Remove the downloaded models and Ollama service user and group:
 sudo userdel ollama
 sudo groupdel ollama
 sudo rm -r /usr/share/ollama
-```
+```
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -1,3 +0,0 @@
-# Troubleshooting
-
-For troubleshooting, see [https://docs.ollama.com/troubleshooting](https://docs.ollama.com/troubleshooting)
--- a/llama/README.md
+++ b/llama/README.md
@@ -14,28 +14,25 @@ make -f Makefile.sync apply-patches

 ### Updating Base Commit

-To update to a new base commit:
+**Pin to new base commit**

-1. **Update FETCH_HEAD** in `Makefile.sync` to the new commit hash.
+To change the base commit, update `FETCH_HEAD` in Makefile.sync.

-2. **Check for upstreamed patches**: Before applying, review if any patches have been merged upstream. Remove those patches from `./patches/` to avoid conflicts.
+When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.

-3. **Apply patches**:
-   ```shell
-   make -f Makefile.sync apply-patches
-   ```
+Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.

-4. **Resolve conflicts** (if any): When `git am` fails on a patch:
-   - Fix conflicts in `./vendor/`
-   - Stage the resolved files: `git -C llama/vendor add <file>`
-   - Continue: `git -C llama/vendor am --continue`
-   - Re-run: `make -f Makefile.sync apply-patches`
-   - Repeat until all patches are applied.
+```shell
+make -f Makefile.sync apply-patches
+```

-5. **Regenerate patches and sync**:
-   ```shell
-   make -f Makefile.sync format-patches sync
-   ```
+If there are conflicts, you will see an error message. Resolve the conflicts in `./vendor/`, and continue the patch series with `git am --continue` and rerun `make -f Makefile.sync apply-patches`. Repeat until all patches are successfully applied.
+
+Once all patches are applied, commit the changes to the tracking repository.
+
+```shell
+make -f Makefile.sync format-patches sync
+```

 ### Generating Patches

--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "b1377188784f9aea26b8abde56d4aee8c733eec7";
+char const *LLAMA_COMMIT = "ec98e2002";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
--- a/llama/llama.cpp/common/common.cpp
+++ b/llama/llama.cpp/common/common.cpp
@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
        case GGML_SCHED_PRIO_REALTIME: p = -20; break;
    }

-    if (setpriority(PRIO_PROCESS, 0, p) != 0) {
+    if (!setpriority(PRIO_PROCESS, 0, p)) {
        LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
        return false;
    }
@@ -1078,15 +1078,12 @@ struct common_init_result::impl {
    impl() = default;
    ~impl() = default;

-    // note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
-
    llama_model_ptr   model;
    llama_context_ptr context;

    std::vector<llama_adapter_lora_ptr> lora;

    std::vector<common_sampler_ptr> samplers;
-    std::vector<llama_sampler_seq_config> samplers_seq_config;
 };

 common_init_result::common_init_result(common_params & params) :
@@ -1095,9 +1092,9 @@ common_init_result::common_init_result(common_params & params) :
    auto cparams = common_context_params_to_llama(params);

    if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
+        LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
    }

@@ -1110,25 +1107,6 @@ common_init_result::common_init_result(common_params & params) :

    const llama_vocab * vocab = llama_model_get_vocab(model);

-    // load and optionally apply lora adapters (must be loaded before context creation)
-    for (auto & la : params.lora_adapters) {
-        llama_adapter_lora_ptr lora;
-        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
-        if (lora == nullptr) {
-            LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
-            pimpl->model.reset(model);
-            return;
-        }
-
-        char buf[1024];
-        la.ptr = lora.get();
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
-        la.task_name = buf;
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
-        la.prompt_prefix = buf;
-        pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
-    }
-
    // updates params.sampling
    // TODO: fix naming
    common_init_sampler_from_model(model, params.sampling);
@@ -1163,19 +1141,10 @@ common_init_result::common_init_result(common_params & params) :
    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
    //}

-    // init the backend samplers as part of the context creation
    pimpl->samplers.resize(cparams.n_seq_max);
-    pimpl->samplers_seq_config.resize(cparams.n_seq_max);

    for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
        pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
-        pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
-    }
-
-    // TODO: temporarily gated behind a flag
-    if (params.sampling.backend_sampling) {
-        cparams.samplers   = pimpl->samplers_seq_config.data();
-        cparams.n_samplers = pimpl->samplers_seq_config.size();
    }

    llama_context * lctx = llama_init_from_model(model, cparams);
@@ -1199,12 +1168,6 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
    return pimpl->samplers[seq_id].get();
 }

-void common_init_result::reset_samplers() {
-    for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
-        llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
-    }
-}
-
 std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
    return pimpl->lora;
 }
@@ -1280,6 +1243,24 @@ common_init_result_ptr common_init_from_params(common_params & params) {
        }
    }

+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+        if (lora == nullptr) {
+            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
+            return res;
+        }
+
+        char buf[1024];
+        la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
+        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
+    }
+
    if (!params.lora_init_without_apply) {
        common_set_adapter_lora(lctx, params.lora_adapters);
    }
@@ -1320,9 +1301,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
        llama_synchronize(lctx);
        llama_perf_context_reset(lctx);
        llama_set_warmup(lctx, false);
-
-        // reset samplers to reset RNG state after warmup to the seeded state
-        res->reset_samplers();
    }

    return res;
@@ -1361,12 +1339,14 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
        mparams.devices = params.devices.data();
    }

-    mparams.n_gpu_layers    = params.n_gpu_layers;
+    if (params.n_gpu_layers != -1) {
+        mparams.n_gpu_layers = params.n_gpu_layers;
+    }
+
    mparams.main_gpu        = params.main_gpu;
    mparams.split_mode      = params.split_mode;
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
-    mparams.use_direct_io   = params.use_direct_io;
    mparams.use_mlock       = params.use_mlock;
    mparams.check_tensors   = params.check_tensors;
    mparams.use_extra_bufts = !params.no_extra_bufts;
--- a/llama/llama.cpp/common/common.h
+++ b/llama/llama.cpp/common/common.h
@@ -80,7 +80,6 @@ int32_t cpu_get_num_math();
 //

 enum llama_example {
-    LLAMA_EXAMPLE_DEBUG,
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_COMPLETION,
@@ -217,8 +216,6 @@ struct common_params_sampling {
    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

-    bool backend_sampling = false;
-
    bool has_logit_bias() const {
        return !logit_bias.empty();
    }
@@ -332,14 +329,12 @@ struct common_params {
    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM, -1 is auto, <= -2 is all
-    int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
-    bool    fit_params         = true; // whether to fit unset model/context parameters to free device memory
-    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
-
-    // margin per device in bytes for fitting parameters to free memory:
-    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
+    int32_t n_gpu_layers       = -1;               // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu           = 0;                // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]  = {0};              // how split tensors should be distributed across GPUs
+    bool    fit_params         = true;             // whether to fit unset model/context parameters to free device memory
+    size_t  fit_params_target  = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
+    int32_t fit_params_min_ctx = 4096;             // minimum context size to set when trying to reduce memory use

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

@@ -375,11 +370,6 @@ struct common_params {
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT

-    // llama-debug specific options
-    std::string logits_output_dir = "data"; // directory for saving logits output files                     // NOLINT
-    bool        save_logits       = false;  // whether to save logits to files                              // NOLINT
-    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex)                 // NOLINT
-
    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
@@ -430,8 +420,7 @@ struct common_params {
    bool kv_unified        = false; // enable unified KV cache

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
-    bool use_mmap          = true;  // enable mmap to use filesystem cache
-    bool use_direct_io     = true;  // read from disk without buffering for faster model loading
+    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
@@ -486,8 +475,7 @@ struct common_params {
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
-    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
-    int sleep_idle_seconds = -1;   // if >0, server will sleep after this many seconds of idle time
+    bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response

    std::vector<std::string> api_keys;

@@ -496,11 +484,8 @@ struct common_params {

    std::map<std::string, std::string> default_template_kwargs;

-    // webui configs
-    bool webui = true;
-    std::string webui_config_json;
-
    // "advanced" endpoints are disabled by default for better security
+    bool webui            = true;
    bool endpoint_slots   = true;
    bool endpoint_props   = false; // only control POST requests, not GET
    bool endpoint_metrics = false;
@@ -700,9 +685,7 @@ struct common_init_result {

    llama_model * model();
    llama_context * context();
-
    common_sampler * sampler(llama_seq_id seq_id);
-    void reset_samplers();

    std::vector<llama_adapter_lora_ptr> & lora();

--- a/llama/llama.cpp/common/sampling.cpp
+++ b/llama/llama.cpp/common/sampling.cpp
@@ -104,9 +104,10 @@ struct ring_buffer {
 struct common_sampler {
    common_params_sampling params;

-    struct llama_sampler * grmr;
    struct llama_sampler * chain;

+    bool grammar;
+
    ring_buffer<llama_token> prev;

    std::vector<llama_token_data> cur;
@@ -120,34 +121,17 @@ struct common_sampler {
    }

    void set_logits(struct llama_context * ctx, int idx) {
-        const float *       sampled_probs  = llama_get_sampled_probs_ith     (ctx, idx);
-        const float *       sampled_logits = llama_get_sampled_logits_ith    (ctx, idx);
-        const llama_token * sampled_ids    = llama_get_sampled_candidates_ith(ctx, idx);
+        const auto * logits = llama_get_logits_ith(ctx, idx);

        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);

        const int n_vocab = llama_vocab_n_tokens(vocab);

-        if (sampled_probs) {
-            const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
-            cur.resize(sampled_probs_count);
-            for (uint32_t i = 0; i < sampled_probs_count; ++i) {
-                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
-            }
-        } else if (sampled_logits) {
-            const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
-            cur.resize(sampled_logits_count);
-            for (uint32_t i = 0; i < sampled_logits_count; i++) {
-                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
-            }
-        } else {
-            const auto * logits = llama_get_logits_ith(ctx, idx);
-            GGML_ASSERT(logits != nullptr);
-            cur.resize(n_vocab);
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
-            }
+        cur.resize(n_vocab);
+
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
        }

        cur_p = { cur.data(), cur.size(), -1, false };
@@ -176,50 +160,45 @@ std::string common_params_sampling::print() const {
    return std::string(result);
 }

-struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

    lparams.no_perf = params.no_perf;

-    llama_sampler * grmr = nullptr;
    llama_sampler * chain = llama_sampler_chain_init(lparams);

+    bool grammar = false;
    std::vector<llama_sampler *> samplers;

    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
-        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+        samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
+        grammar = true;
 #else
        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
    } else {
        std::vector<std::string> trigger_patterns;
+        std::vector<std::string> patterns_anywhere;
        std::vector<llama_token> trigger_tokens;
        for (const auto & trigger : params.grammar_triggers) {
            switch (trigger.type) {
                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
                {
                    const auto & word = trigger.value;
-                    trigger_patterns.push_back(regex_escape(word));
+                    patterns_anywhere.push_back(regex_escape(word));
                    break;
                }
                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
                {
-                    trigger_patterns.push_back(trigger.value);
+                    patterns_anywhere.push_back(trigger.value);
                    break;
                }
                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
                {
-                    const auto & pattern = trigger.value;
-                    std::string anchored = "^$";
-                    if (!pattern.empty()) {
-                        anchored = (pattern.front() != '^' ? "^" : "")
-                            + pattern
-                            + (pattern.back() != '$' ? "$" : "");
-                    }
-                    trigger_patterns.push_back(anchored);
+                    trigger_patterns.push_back(trigger.value);
                    break;
                }
                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -233,6 +212,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
            }
        }

+        if (!patterns_anywhere.empty()) {
+            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+        }
+
        std::vector<const char *> trigger_patterns_c;
        trigger_patterns_c.reserve(trigger_patterns.size());
        for (const auto & regex : trigger_patterns) {
@@ -241,12 +224,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st

        if (!params.grammar.empty()) {
             if (params.grammar_lazy) {
-                 grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                         trigger_patterns_c.data(), trigger_patterns_c.size(),
-                         trigger_tokens.data(), trigger_tokens.size());
+                 samplers.push_back(
+                         llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                             trigger_patterns_c.data(), trigger_patterns_c.size(),
+                             trigger_tokens.data(),     trigger_tokens.size()));
             } else {
-                 grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+                 samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
             }
+
+             grammar = true;
        }
    }

@@ -315,16 +301,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
        llama_sampler_chain_add(chain, smpl);
    }

-    if (grmr && params.backend_sampling) {
-        LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);
-
-        params.backend_sampling = false;
-    }
-
    auto * result = new common_sampler {
        /* .params  = */ params,
-        /* .grmr    = */ grmr,
        /* .chain   = */ chain,
+        /* .grammar = */ grammar,
        /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
        /* .cur     = */ {},
        /* .cur_p   = */ {},
@@ -335,7 +315,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st

 void common_sampler_free(struct common_sampler * gsmpl) {
    if (gsmpl) {
-        llama_sampler_free(gsmpl->grmr);
        llama_sampler_free(gsmpl->chain);

        delete gsmpl;
@@ -345,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
    const auto tm = gsmpl->tm();

-    if (gsmpl->grmr && accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
-    }
+    if (gsmpl->grammar) {
+        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);

-    llama_sampler_accept(gsmpl->chain, token);
+        for (int i = 0; i < n_smpl; i++) {
+            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+
+            // the grammar sampler is always the first one
+            if (i == 0) {
+                if (accept_grammar) {
+                    llama_sampler_accept(smpl, token);
+                }
+            } else {
+                llama_sampler_accept(smpl, token);
+            }
+        }
+    } else {
+        llama_sampler_accept(gsmpl->chain, token);
+    }

    gsmpl->prev.push_back(token);
 }
@@ -361,8 +353,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
    return new common_sampler {
        /* .params  = */ gsmpl->params,
-        /* .grmr    = */ llama_sampler_clone(gsmpl->grmr),
        /* .chain   = */ llama_sampler_clone(gsmpl->chain),
+        /* .grammar = */ gsmpl->grammar,
        /* .prev    = */ gsmpl->prev,
        /* .cur     = */ gsmpl->cur,
        /* .cur_p   = */ gsmpl->cur_p,
@@ -418,7 +410,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
    return gsmpl->chain;
 }

-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
    llama_synchronize(ctx);

    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
@@ -426,61 +418,11 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co

    llama_token id = LLAMA_TOKEN_NULL;

-    auto & grmr  = gsmpl->grmr;
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

-    // Check if a backend sampler has already sampled a token in which case we
-    // return that token id directly.
-    {
-        id = llama_get_sampled_token_ith(ctx, idx);
-
-        if (id != LLAMA_TOKEN_NULL) {
-            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
-
-            GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
-
-            // TODO: simplify
-            gsmpl->cur.resize(1);
-            gsmpl->cur[0] = { id, 0.0f, 1.0f };
-            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
-
-            return id;
-        }
-    }
-
    gsmpl->set_logits(ctx, idx);

-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
-    }
-
-    llama_sampler_apply(chain, &cur_p);
-
-    id = cur_p.data[cur_p.selected].id;
-
-    if (grammar_first) {
-        return id;
-    }
-
-    // check if it the sampled token fits the grammar (grammar-based rejection sampling)
-    {
-        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-
-        llama_sampler_apply(grmr, &single_token_data_array);
-
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-
-    llama_sampler_apply(grmr,  &cur_p);
    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
@@ -490,7 +432,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    return id;
 }

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

    std::vector<llama_token> result;
@@ -498,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample

    size_t i = 0;
    for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

        common_sampler_accept(gsmpl, id, true);

@@ -510,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    }

    if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

        common_sampler_accept(gsmpl, id, true);

@@ -520,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    return result;
 }

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
    std::vector<int> idxs(draft.size() + 1);
    for (size_t i = 0; i < idxs.size(); ++i) {
        idxs[i] = i;
    }

-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
 }

 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
--- a/llama/llama.cpp/common/sampling.h
+++ b/llama/llama.cpp/common/sampling.h
@@ -36,8 +36,7 @@ struct common_sampler;

 // llama_sampler API overloads

-// note: can mutate params in some cases
-struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);

 void common_sampler_free(struct common_sampler * gsmpl);

@@ -49,7 +48,6 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

-// get the underlying llama_sampler_chain
 struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);

 // extended sampling implementation:
@@ -59,10 +57,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-// if grammar_first is true, the grammar is applied before the samplers (slower)
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);

 // generalized version of common_sampler_sample
 //
@@ -80,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 //
 // returns at least 1 token, up to idxs.size()
 //
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);

 // assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);

 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

--- a/llama/llama.cpp/include/llama.h
+++ b/llama/llama.cpp/include/llama.h
@@ -286,7 +286,7 @@ extern "C" {
        // NULL-terminated list of buffer types to use for tensors that match a pattern
        const struct llama_model_tensor_buft_override * tensor_buft_overrides;

-        int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
+        int32_t n_gpu_layers; // number of layers to store in VRAM
        enum llama_split_mode split_mode; // how to split the model across multiple GPUs

        // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
@@ -309,7 +309,6 @@ extern "C" {
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only;      // only load the vocabulary, no weights
        bool use_mmap;        // use mmap if possible
-        bool use_direct_io;   // use direct io, takes precedence over use_mmap
        bool use_mlock;       // force system to keep model in RAM
        bool check_tensors;   // validate model tensor data
        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
@@ -317,11 +316,6 @@ extern "C" {
        bool no_alloc;        // only load metadata and simulate memory allocations
    };

-    struct llama_sampler_seq_config {
-        llama_seq_id           seq_id;
-        struct llama_sampler * sampler;
-    };
-
    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
    //       https://github.com/ggml-org/llama.cpp/pull/7544
    struct llama_context_params {
@@ -370,12 +364,6 @@ extern "C" {
        bool kv_unified;  // use a unified buffer across the input sequences when computing the attention
                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
-
-        // [EXPERIMENTAL]
-        // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
-        // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
-        struct llama_sampler_seq_config * samplers;
-        size_t                            n_samplers;
    };

    // model quantization parameters
@@ -479,23 +467,16 @@ extern "C" {
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);

-    enum llama_params_fit_status {
-        LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
-        LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
-        LLAMA_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occured, e.g. because no model could be found at the specified path
-    };
-
    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
-    //   - returns true if the parameters could be successfully modified to fit device memory
-    //   - this function is NOT thread safe because it modifies the global llama logger state
-    //   - only parameters that have the same value as in llama_default_model_params are modified
-    LLAMA_API enum llama_params_fit_status llama_params_fit(
+    // returns true if the parameters could be successfully modified to fit device memory
+    // this function is NOT thread safe because it modifies the global llama logger state
+    LLAMA_API bool llama_params_fit(
                                   const char   * path_model,
                    struct llama_model_params   * mparams,
                    struct llama_context_params * cparams,
                                          float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
        struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-                                         size_t * margins,               // margins of memory to leave per device in bytes
+                                         size_t   margin,                // margin of memory to leave per device in bytes
                                       uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
                            enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log

@@ -536,7 +517,6 @@ extern "C" {
    LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
-    LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
@@ -620,8 +600,6 @@ extern "C" {
    //

    // Load a LoRA adapter from file
-    // The adapter is valid as long as the associated model is not freed
-    // All adapters must be loaded before context creation
    LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
            struct llama_model * model,
            const char * path_lora);
@@ -1005,32 +983,6 @@ extern "C" {
    // otherwise: float[n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);

-    //
-    // backend sampling API [EXPERIMENTAL]
-    // note: use only if the llama_context was created with at least one llama_sampler_seq_config
-    //
-
-    // Get the backend sampled token for the ith token.
-    // Returns LLAMA_TOKEN_NULL if no token was sampled.
-    LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
-
-    // Get the backend sampled probabilites for the ith token
-    // The index matches llama_get_sampled_token_ith().
-    // Returns NULL if no probabilites were generated.
-    LLAMA_API float *  llama_get_sampled_probs_ith      (struct llama_context * ctx, int32_t i);
-    LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
-
-    // Get the backend sampled logits for the ith token
-    // Returns NULL if no logits were sampled.
-    LLAMA_API float *  llama_get_sampled_logits_ith      (struct llama_context * ctx, int32_t i);
-    LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
-
-    // Get the backend sampled candidates (token ids) for the ith token
-    // These are needed to map probability/logit indices to vocab token ids.
-    // Returns NULL if no candidates were sampled.
-    LLAMA_API llama_token * llama_get_sampled_candidates_ith      (struct llama_context * ctx, int32_t i);
-    LLAMA_API uint32_t      llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
-
    //
    // Vocab
    //
@@ -1202,16 +1154,11 @@ extern "C" {
    //
    //    llama_sampler_free(smpl);
    //
+    // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
+    //

    typedef void * llama_sampler_context_t;

-    struct llama_sampler_data {
-        struct ggml_tensor * logits;
-        struct ggml_tensor * probs;
-        struct ggml_tensor * sampled;
-        struct ggml_tensor * candidates;
-    };
-
    // user code can implement the interface below in order to create custom llama_sampler
    struct llama_sampler_i {
        const char *           (*name)  (const struct llama_sampler * smpl);                                 // can be NULL
@@ -1221,45 +1168,17 @@ extern "C" {
        struct llama_sampler * (*clone) (const struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL
        void                   (*free)  (      struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL

-        // [EXPERIMENTAL]
-        // backend sampling interface:
-
-        // return true if the backend supports all ops needed by the sampler
-        // note: call once per sampler
-        bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
-
-        // call after .backend_apply()
-        void (*backend_accept)(
-                struct llama_sampler * smpl,
-                struct ggml_context  * ctx,
-                struct ggml_cgraph   * gf,
-                struct ggml_tensor   * selected_token);
-
-        // call after .backend_init()
-        void (*backend_apply)(
-                struct llama_sampler      * smpl,
-                struct ggml_context       * ctx,
-                struct ggml_cgraph        * gf,
-                struct llama_sampler_data * data);
-
-        // called before graph execution to set inputs for the current ubatch
-        void (*backend_set_input)(struct llama_sampler * smpl);
+        // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
+        //void (*apply_ggml) (struct llama_sampler * smpl, ...);
    };

    struct llama_sampler {
-        struct llama_sampler_i * iface;
-
-        llama_sampler_context_t ctx;
+        const struct llama_sampler_i * iface;
+        llama_sampler_context_t        ctx;
    };

-    // [EXPERIMENTAL]
-    // attach a sampler to the context
-    // note: prefer initializing the context with llama_context_params.samplers when possible
-    // note: changing the samplers of a context can cause graph reallocations and degraded performance
-    LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
-
    // mirror of llama_sampler_i:
-    LLAMA_API struct llama_sampler * llama_sampler_init  (      struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
    LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
    LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
    LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
@@ -1275,15 +1194,7 @@ extern "C" {

    // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
    LLAMA_API void                   llama_sampler_chain_add(      struct llama_sampler * chain, struct llama_sampler * smpl);
-
-    // return NULL if:
-    //   - the sampler is NULL
-    //   - the sampler is not a llama_sampler_chain
-    //   - the index is out of bounds, unless i == -1
-    //   - if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
-    LLAMA_API struct llama_sampler * llama_sampler_chain_get(      struct llama_sampler * chain, int32_t i);
-
-    // the total number of samplers in the chain
+    LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
    LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);

    // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
@@ -1292,9 +1203,7 @@ extern "C" {
    // available samplers:

    LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
-
-    /// seed == LLAMA_DEFAULT_SEED to use a random seed.
-    LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);

    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    /// Setting k <= 0 makes this a noop
--- a/llama/llama.cpp/src/llama-adapter.cpp
+++ b/llama/llama.cpp/src/llama-adapter.cpp
@@ -146,11 +146,9 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
    return nullptr;
 }

-static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

-    llama_model & model = adapter.model;
-
    ggml_context * ctx_init;
    gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
@@ -413,17 +411,14 @@ static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_l
        }
    }

-    // update number of nodes used
-    model.n_lora_nodes += adapter.get_n_nodes();
-
    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }

 llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
-    llama_adapter_lora * adapter = new llama_adapter_lora(*model);
+    llama_adapter_lora * adapter = new llama_adapter_lora();

    try {
-        llama_adapter_lora_init_impl(path_lora, *adapter);
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -474,10 +469,6 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
 }

 void llama_adapter_lora_free(llama_adapter_lora * adapter) {
-    // update number of nodes used
-    GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
-    adapter->model.n_lora_nodes -= adapter->get_n_nodes();
-
    delete adapter;
 }

--- a/llama/llama.cpp/src/llama-adapter.h
+++ b/llama/llama.cpp/src/llama-adapter.h
@@ -59,8 +59,6 @@ struct llama_adapter_lora_weight {
 };

 struct llama_adapter_lora {
-    llama_model & model;
-
    // map tensor name to lora_a_b
    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

@@ -75,14 +73,10 @@ struct llama_adapter_lora {
    // activated lora (aLoRA)
    std::vector<llama_token> alora_invocation_tokens;

-    llama_adapter_lora(llama_model & model) : model(model) {}
+    llama_adapter_lora() = default;
    ~llama_adapter_lora() = default;

    llama_adapter_lora_weight * get_weight(ggml_tensor * w);
-
-    uint32_t get_n_nodes() const {
-        return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
-    }
 };

 using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
--- a/llama/llama.cpp/src/llama-arch.cpp
+++ b/llama/llama.cpp/src/llama-arch.cpp
@@ -20,7 +20,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_STARCODER,        "starcoder"        },
    { LLM_ARCH_REFACT,           "refact"           },
    { LLM_ARCH_BERT,             "bert"             },
-    { LLM_ARCH_MODERN_BERT,      "modern-bert"      },
    { LLM_ARCH_NOMIC_BERT,       "nomic-bert"       },
    { LLM_ARCH_NOMIC_BERT_MOE,   "nomic-bert-moe"   },
    { LLM_ARCH_NEO_BERT,         "neo-bert"         },
@@ -42,7 +41,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_PHIMOE,           "phimoe"           },
    { LLM_ARCH_PLAMO,            "plamo"            },
    { LLM_ARCH_PLAMO2,           "plamo2"           },
-    { LLM_ARCH_PLAMO3,           "plamo3"           },
    { LLM_ARCH_CODESHELL,        "codeshell"        },
    { LLM_ARCH_ORION,            "orion"            },
    { LLM_ARCH_INTERNLM2,        "internlm2"        },
@@ -117,9 +115,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_RND1,             "rnd1"             },
    { LLM_ARCH_PANGU_EMBED,      "pangu-embedded"   },
    { LLM_ARCH_MISTRAL3,         "mistral3"         },
-    { LLM_ARCH_MIMO2,            "mimo2"           },
-    { LLM_ARCH_LLAMA_EMBED,      "llama-embed"      },
-    { LLM_ARCH_MAINCODER,        "maincoder"        },
    { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };

@@ -153,7 +148,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size"                        },
    { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length"                    },
    { LLM_KV_EMBEDDING_LENGTH,                  "%s.embedding_length"                  },
-    { LLM_KV_EMBEDDING_LENGTH_OUT,              "%s.embedding_length_out"              },
    { LLM_KV_FEATURES_LENGTH,                   "%s.features_length"                   },
    { LLM_KV_BLOCK_COUNT,                       "%s.block_count"                       },
    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,         "%s.leading_dense_block_count"         },
@@ -211,7 +205,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_GATE_LORA_RANK,               "%s.attention.gate_lora_rank"               },
    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,       "%s.attention.relative_buckets_count"       },
    { LLM_KV_ATTENTION_SLIDING_WINDOW,               "%s.attention.sliding_window"               },
-    { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,       "%s.attention.sliding_window_pattern"       },
    { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
    { LLM_KV_ATTENTION_OUTPUT_SCALE,                 "%s.attention.output_scale"                 },
    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,           "%s.attention.temperature_length"           },
@@ -223,7 +216,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
    { LLM_KV_ROPE_DIMENSION_SECTIONS,       "%s.rope.dimension_sections"              },
    { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
-    { LLM_KV_ROPE_FREQ_BASE_SWA,            "%s.rope.freq_base_swa"                   },
    { LLM_KV_ROPE_SCALE_LINEAR,             "%s.rope.scale_linear"                    },
    { LLM_KV_ROPE_SCALING_TYPE,             "%s.rope.scaling.type"                    },
    { LLM_KV_ROPE_SCALING_FACTOR,           "%s.rope.scaling.factor"                  },
@@ -508,7 +500,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_DECI:
        case LLM_ARCH_MISTRAL3:
-        case LLM_ARCH_LLAMA_EMBED:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
@@ -790,20 +781,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_CLS,
                LLM_TENSOR_CLS_OUT,
            };
-        case LLM_ARCH_MODERN_BERT:
-            return {
-                LLM_TENSOR_TOKEN_EMBD,
-                LLM_TENSOR_TOKEN_EMBD_NORM,
-                LLM_TENSOR_OUTPUT_NORM,
-                LLM_TENSOR_ATTN_NORM,
-                LLM_TENSOR_ATTN_OUT,
-                LLM_TENSOR_ATTN_QKV,
-                LLM_TENSOR_FFN_DOWN,
-                LLM_TENSOR_FFN_UP,
-                LLM_TENSOR_FFN_NORM,
-                LLM_TENSOR_CLS,
-                LLM_TENSOR_CLS_OUT,
-            };
        case LLM_ARCH_JINA_BERT_V2:
            return {
                LLM_TENSOR_TOKEN_EMBD,
@@ -1083,22 +1060,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_ATTN_POST_NORM,
                LLM_TENSOR_FFN_POST_NORM,
            };
-        case LLM_ARCH_PLAMO3:
-            return {
-                LLM_TENSOR_TOKEN_EMBD,
-                LLM_TENSOR_OUTPUT_NORM,
-                LLM_TENSOR_OUTPUT,
-                LLM_TENSOR_ATTN_NORM,
-                LLM_TENSOR_ATTN_QKV,
-                LLM_TENSOR_ATTN_Q_NORM,
-                LLM_TENSOR_ATTN_K_NORM,
-                LLM_TENSOR_ATTN_OUT,
-                LLM_TENSOR_ATTN_POST_NORM,
-                LLM_TENSOR_FFN_NORM,
-                LLM_TENSOR_FFN_POST_NORM,
-                LLM_TENSOR_FFN_DOWN,
-                LLM_TENSOR_FFN_UP,
-            };
        case LLM_ARCH_CODESHELL:
            return {
                LLM_TENSOR_TOKEN_EMBD,
@@ -2079,7 +2040,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM_LFM2,
                LLM_TENSOR_OUTPUT,
-                LLM_TENSOR_DENSE_2_OUT,
            };
        case LLM_ARCH_LFM2MOE:
            return {
@@ -2098,7 +2058,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_SHORTCONV_INPROJ,
                LLM_TENSOR_SHORTCONV_OUTPROJ,
                LLM_TENSOR_TOKEN_EMBD,
-                LLM_TENSOR_OUTPUT_NORM_LFM2,
+                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_FFN_GATE_INP,
                LLM_TENSOR_FFN_GATE_EXPS,
                LLM_TENSOR_FFN_DOWN_EXPS,
@@ -2214,49 +2174,11 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_VISEXP_FFN_DOWN,
                LLM_TENSOR_VISEXP_FFN_UP,
            };
-        case LLM_ARCH_MIMO2:
-            return {
-                LLM_TENSOR_TOKEN_EMBD,
-                LLM_TENSOR_OUTPUT_NORM,
-                LLM_TENSOR_OUTPUT,
-                LLM_TENSOR_ATTN_NORM,
-                LLM_TENSOR_ATTN_Q,
-                LLM_TENSOR_ATTN_K,
-                LLM_TENSOR_ATTN_V,
-                LLM_TENSOR_ATTN_SINKS,
-                LLM_TENSOR_ATTN_OUT,
-                LLM_TENSOR_FFN_NORM,
-                LLM_TENSOR_FFN_GATE,
-                LLM_TENSOR_FFN_DOWN,
-                LLM_TENSOR_FFN_UP,
-                LLM_TENSOR_FFN_GATE_INP,
-                LLM_TENSOR_FFN_GATE_EXPS,
-                LLM_TENSOR_FFN_DOWN_EXPS,
-                LLM_TENSOR_FFN_UP_EXPS,
-                LLM_TENSOR_FFN_EXP_PROBS_B,
-            };
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_UNKNOWN:
            return {
                LLM_TENSOR_TOKEN_EMBD,
            };
-        case LLM_ARCH_MAINCODER:
-            return {
-                LLM_TENSOR_TOKEN_EMBD,
-                LLM_TENSOR_OUTPUT_NORM,
-                LLM_TENSOR_OUTPUT,
-                LLM_TENSOR_ATTN_NORM,
-                LLM_TENSOR_ATTN_Q,
-                LLM_TENSOR_ATTN_Q_NORM,
-                LLM_TENSOR_ATTN_K,
-                LLM_TENSOR_ATTN_K_NORM,
-                LLM_TENSOR_ATTN_V,
-                LLM_TENSOR_ATTN_OUT,
-                LLM_TENSOR_FFN_NORM,
-                LLM_TENSOR_FFN_GATE,
-                LLM_TENSOR_FFN_DOWN,
-                LLM_TENSOR_FFN_UP,
-            };
        case LLM_ARCH_SOLAR:
            return {
                LLM_TENSOR_TOKEN_EMBD,
--- a/llama/llama.cpp/src/llama-arch.h
+++ b/llama/llama.cpp/src/llama-arch.h
@@ -24,7 +24,6 @@ enum llm_arch {
    LLM_ARCH_STARCODER,
    LLM_ARCH_REFACT,
    LLM_ARCH_BERT,
-    LLM_ARCH_MODERN_BERT,
    LLM_ARCH_NOMIC_BERT,
    LLM_ARCH_NOMIC_BERT_MOE,
    LLM_ARCH_NEO_BERT,
@@ -46,7 +45,6 @@ enum llm_arch {
    LLM_ARCH_PHIMOE,
    LLM_ARCH_PLAMO,
    LLM_ARCH_PLAMO2,
-    LLM_ARCH_PLAMO3,
    LLM_ARCH_CODESHELL,
    LLM_ARCH_ORION,
    LLM_ARCH_INTERNLM2,
@@ -121,9 +119,6 @@ enum llm_arch {
    LLM_ARCH_RND1,
    LLM_ARCH_PANGU_EMBED,
    LLM_ARCH_MISTRAL3,
-    LLM_ARCH_MIMO2,
-    LLM_ARCH_LLAMA_EMBED,
-    LLM_ARCH_MAINCODER,
    LLM_ARCH_UNKNOWN,
 };

@@ -157,7 +152,6 @@ enum llm_kv {
    LLM_KV_VOCAB_SIZE,
    LLM_KV_CONTEXT_LENGTH,
    LLM_KV_EMBEDDING_LENGTH,
-    LLM_KV_EMBEDDING_LENGTH_OUT,
    LLM_KV_FEATURES_LENGTH,
    LLM_KV_BLOCK_COUNT,
    LLM_KV_LEADING_DENSE_BLOCK_COUNT,
@@ -215,7 +209,6 @@ enum llm_kv {
    LLM_KV_ATTENTION_GATE_LORA_RANK,
    LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
    LLM_KV_ATTENTION_SLIDING_WINDOW,
-    LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_OUTPUT_SCALE,
    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -227,7 +220,6 @@ enum llm_kv {
    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_DIMENSION_SECTIONS,
    LLM_KV_ROPE_FREQ_BASE,
-    LLM_KV_ROPE_FREQ_BASE_SWA,
    LLM_KV_ROPE_SCALE_LINEAR,
    LLM_KV_ROPE_SCALING_TYPE,
    LLM_KV_ROPE_SCALING_FACTOR,
--- a/llama/llama.cpp/src/llama-chat.cpp
+++ b/llama/llama.cpp/src/llama-chat.cpp
@@ -74,7 +74,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "seed_oss",          LLM_CHAT_TEMPLATE_SEED_OSS          },
    { "grok-2",            LLM_CHAT_TEMPLATE_GROK_2            },
    { "pangu-embedded",    LLM_CHAT_TEMPLATE_PANGU_EMBED       },
-    { "solar-open",        LLM_CHAT_TEMPLATE_SOLAR_OPEN        },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -217,8 +216,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_GROK_2;
    } else if (tmpl_contains(LU8("[unused9]系统：[unused10]"))) {
        return LLM_CHAT_TEMPLATE_PANGU_EMBED;
-    } else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
-        return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
    }
    return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -848,14 +845,6 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "[unused9]助手：";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
-        }
-        if (add_ass) {
-            ss << "<|begin|>assistant";
-        }
    } else {
        // template not supported
        return -1;
--- a/llama/llama.cpp/src/llama-chat.h
+++ b/llama/llama.cpp/src/llama-chat.h
@@ -54,7 +54,6 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_SEED_OSS,
    LLM_CHAT_TEMPLATE_GROK_2,
    LLM_CHAT_TEMPLATE_PANGU_EMBED,
-    LLM_CHAT_TEMPLATE_SOLAR_OPEN,
    LLM_CHAT_TEMPLATE_UNKNOWN,
 };

--- a/llama/llama.cpp/src/llama-context.cpp
+++ b/llama/llama.cpp/src/llama-context.cpp
@@ -60,25 +60,6 @@ llama_context::llama_context(
    cparams.cb_eval           = params.cb_eval;
    cparams.cb_eval_user_data = params.cb_eval_user_data;

-    // Initialize backend samplers here so they are part of the sampling graph
-    // before the reserve passes run later in this function. This avoids a later
-    // re-reserve when graph nodes change.
-    if (params.samplers != nullptr && params.n_samplers > 0) {
-        for (size_t i = 0; i < params.n_samplers; ++i) {
-            const auto & config = params.samplers[i];
-
-            if (llama_sampler_chain_get(config.sampler, -1) == nullptr) {
-                throw std::runtime_error("the backend samplers must be of type llama_sampler_chain");
-            }
-
-            if (set_sampler(config.seq_id, config.sampler)) {
-                const int n_samplers = llama_sampler_chain_n(config.sampler);
-
-                LLAMA_LOG_INFO("%s: setting backend sampler for seq_id %d (n = %d)\n", __func__, config.seq_id, n_samplers);
-            }
-        }
-    }
-
    auto rope_scaling_type = params.rope_scaling_type;
    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
        rope_scaling_type = hparams.rope_scaling_type_train;
@@ -250,10 +231,7 @@ llama_context::llama_context(
        // graph outputs buffer
        {
            // resized during inference when a batch uses more outputs
-            // Create a dummy batch for initialization.
-            llama_batch dummy_batch = {};
-            dummy_batch.n_tokens = 0;
-            if (output_reserve(params.n_seq_max, dummy_batch) < params.n_seq_max) {
+            if (output_reserve(params.n_seq_max) < params.n_seq_max) {
                throw std::runtime_error("failed to reserve initial output buffer");
            }

@@ -316,8 +294,8 @@ llama_context::llama_context(
        // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
        bool pipeline_parallel =
            model.n_devices() > 1 &&
-            model.n_gpu_layers() > model.hparams.n_layer &&
-            model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
+            model.params.n_gpu_layers > (int) model.hparams.n_layer &&
+            model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
            cparams.offload_kqv &&
            !model.has_tensor_overrides();

@@ -478,35 +456,26 @@ llama_context::llama_context(
            LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
        }
    }
-
-    // Initialize the full vocabulary token ids for backend samplers.
-    {
-        const int n_vocab = model.vocab.n_tokens();
-
-        sampling.token_ids_full_vocab.resize(n_vocab);
-        for (int i = 0; i < n_vocab; ++i) {
-            sampling.token_ids_full_vocab[i] = i;
-        }
-    }
 }

 llama_context::~llama_context() {
-    if (!model.hparams.no_alloc) {
-        for (size_t i = 0; i < backend_ptrs.size(); ++i) {
-            ggml_backend_t             backend = backend_ptrs[i];
-            ggml_backend_buffer_type_t buft    = backend_buft[i];
+    // FIXME this currently results in a use-after-free bug if the model is freed before the context
+    // if (!model.hparams.no_alloc) {
+    //     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+    //         ggml_backend_t             backend = backend_ptrs[i];
+    //         ggml_backend_buffer_type_t buft    = backend_buft[i];

-            const size_t size_exp = backend_buf_exp_size[i];
-            const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
-            if (size_exp == size_act) {
-                LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
-                    __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
-            } else {
-                LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
-                    __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
-            }
-        }
-    }
+    //         const size_t size_exp = backend_buf_exp_size[i];
+    //         const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    //         if (size_exp == size_act) {
+    //             LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+    //         } else {
+    //             LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+    //         }
+    //     }
+    // }
    ggml_opt_free(opt_ctx);
 }

@@ -648,35 +617,6 @@ float * llama_context::get_logits() {
    return logits;
 }

-int64_t llama_context::output_resolve_row(int32_t i) const {
-    int64_t j = -1;
-
-    // support negative indices (last output row)
-    if (i < 0) {
-        j = n_outputs + i;
-        if (j < 0) {
-            throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
-        }
-    } else if ((size_t) i >= output_ids.size()) {
-        throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
-    } else {
-        // use output_ids to translate the batch token index into a row number
-        // that holds this token's data.
-        j = output_ids[i];
-    }
-
-    if (j < 0) {
-        // the batch token was not configured to output anything
-        throw std::runtime_error(format("batch.logits[%d] != true", i));
-    }
-
-    if (j >= n_outputs) {
-        throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
-    }
-
-    return j;
-}
-
 float * llama_context::get_logits_ith(int32_t i) {
    int64_t j = -1;

@@ -687,7 +627,6 @@ float * llama_context::get_logits_ith(int32_t i) {
            throw std::runtime_error("no logits");
        }

-        // TODO: use output_resolve_row()
        if (i < 0) {
            j = n_outputs + i;
            if (j < 0) {
@@ -724,10 +663,6 @@ float * llama_context::get_embeddings() {
    return embd;
 }

-llama_token * llama_context::get_sampled_tokens()  const{
-    return sampling.sampled;
-}
-
 float * llama_context::get_embeddings_ith(int32_t i) {
    int64_t j = -1;

@@ -738,7 +673,6 @@ float * llama_context::get_embeddings_ith(int32_t i) {
            throw std::runtime_error("no embeddings");
        }

-        // TODO: use output_resolve_row()
        if (i < 0) {
            j = n_outputs + i;
            if (j < 0) {
@@ -758,8 +692,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
            throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
        }

-        const uint32_t n_embd_out = model.hparams.get_n_embd_out();
-        return embd + j*n_embd_out;
+        return embd + j*model.hparams.n_embd;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
@@ -779,136 +712,6 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) {
    return it->second.data();
 }

-llama_token llama_context::get_sampled_token_ith(int32_t idx) {
-    output_reorder();
-
-    if (sampling.sampled == nullptr) {
-        return LLAMA_TOKEN_NULL;
-    }
-
-    try {
-        const int64_t row = output_resolve_row(idx);
-        GGML_ASSERT(row < (int64_t) sampling.sampled_size);
-        return sampling.sampled[row];
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: invalid backend sampled token id %d, reason: %s\n", __func__, idx, err.what());
-        return LLAMA_TOKEN_NULL;
-    }
-}
-
-float * llama_context::get_sampled_probs_ith(int32_t idx) {
-    output_reorder();
-
-    if (sampling.probs == nullptr) {
-        return nullptr;
-    }
-
-    try {
-        const int64_t row = output_resolve_row(idx);
-        if ((size_t) row >= sampling.probs_count.size() || sampling.probs_count[row] == 0) {
-            return nullptr;
-        }
-        return sampling.probs + row*model.vocab.n_tokens();
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: invalid backend sampled probs id %d, reason: %s\n", __func__, idx, err.what());
-        return nullptr;
-    }
-}
-
-float * llama_context::get_sampled_logits_ith(int32_t idx) {
-    output_reorder();
-
-    if (sampling.logits == nullptr) {
-        return nullptr;
-    }
-
-    try {
-        const int64_t row = output_resolve_row(idx);
-        if ((size_t) row >= sampling.logits_count.size() || sampling.logits_count[row] == 0) {
-            return nullptr;
-        }
-        return sampling.logits + row*model.vocab.n_tokens();
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: invalid backend sampled logits id %d, reason: %s\n", __func__, idx, err.what());
-        return nullptr;
-    }
-}
-
-const llama_token * llama_context::get_sampled_candidates_ith(int32_t idx) {
-    output_reorder();
-
-    try {
-        const int64_t row = output_resolve_row(idx);
-        if (sampling.candidates != nullptr &&
-            (size_t) row < sampling.candidates_count.size() &&
-            sampling.candidates_count[row] > 0) {
-            return sampling.candidates + row*model.vocab.n_tokens();
-        }
-    } catch (const std::exception & err) {
-        // fallback to full vocab list
-    }
-
-    return sampling.token_ids_full_vocab.data();
-}
-
-size_t llama_context::get_sampled_candidates_count(int32_t idx) {
-    output_reorder();
-
-    if (sampling.candidates == nullptr) {
-        return 0;
-    }
-
-    try {
-        const int64_t row = output_resolve_row(idx);
-        if ((size_t) row >= sampling.candidates_count.size()) {
-            return 0;
-        }
-        return sampling.candidates_count[row];
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: invalid backend sampled candidates count id %d, reason: %s\n", __func__, idx, err.what());
-        return 0;
-    }
-}
-
-size_t llama_context::get_sampled_logits_count(int32_t idx) {
-    output_reorder();
-
-    if (sampling.logits == nullptr) {
-        return model.vocab.n_tokens();
-    }
-
-    try {
-        const int64_t row = output_resolve_row(idx);
-        if ((size_t) row >= sampling.logits_count.size()) {
-            return 0;
-        }
-        return sampling.logits_count[row];
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: invalid backend sampled logits count id %d, reason: %s\n", __func__, idx, err.what());
-        return 0;
-    }
-}
-
-size_t llama_context::get_sampled_probs_count(int32_t idx) {
-    output_reorder();
-
-    if (sampling.probs == nullptr) {
-        return 0;
-    }
-
-    try {
-        const int64_t row = output_resolve_row(idx);
-        if ((size_t) row >= sampling.probs_count.size()) {
-            return 0;
-        }
-        return sampling.probs_count[row];
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: invalid backend sampled probs count id %d, reason: %s\n", __func__, idx, err.what());
-        return 0;
-    }
-}
-
-
 void llama_context::attach_threadpool(
           ggml_threadpool_t threadpool,
           ggml_threadpool_t threadpool_batch) {
@@ -965,42 +768,6 @@ void llama_context::set_warmup(bool value) {
    cparams.warmup = value;
 }

-bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
-    LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
-
-    const bool can_offload =
-        sampler &&
-        sampler->iface->backend_init &&
-        sampler->iface->backend_apply &&
-        llama_sampler_chain_n(sampler) > 0;
-
-    if (sampler && can_offload) {
-        ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(model.dev_output());
-        auto * host_buft = ggml_backend_dev_host_buffer_type(model.dev_output());
-        if (host_buft) {
-            buft = host_buft;
-        }
-
-        sampler->iface->backend_init(sampler, buft);
-
-        sampling.samplers[seq_id] = sampler;
-
-        return true;
-    }
-
-    if (sampler && !can_offload) {
-        LLAMA_LOG_WARN("%s: sampler '%s' for seq_id = %d, cannot be offloaded to the backend\n", __func__, llama_sampler_name(sampler), seq_id);
-
-        sampling.samplers.erase(seq_id);
-
-        return false;
-    }
-
-    sampling.samplers.erase(seq_id);
-
-    return true;
-}
-
 void llama_context::set_adapter_lora(
            llama_adapter_lora * adapter,
            float scale) {
@@ -1141,7 +908,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
    n_queued_tokens += n_tokens;

    // reserve output buffer
-    if (output_reserve(n_tokens, batch_inp) < n_tokens) {
+    if (output_reserve(n_tokens) < n_tokens) {
        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
        return -2;
    };
@@ -1195,10 +962,9 @@ int llama_context::encode(const llama_batch & batch_inp) {
                {
                    // extract token embeddings
                    GGML_ASSERT(embd != nullptr);
-                    const uint32_t n_embd_out = hparams.get_n_embd_out();

-                    GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size);
-                    ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float));
+                    GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
+                    ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
                } break;
            case LLAMA_POOLING_TYPE_MEAN:
            case LLAMA_POOLING_TYPE_CLS:
@@ -1266,112 +1032,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
    return 0;
 }

-static std::map<llama_seq_id, uint32_t> build_seq_to_output_row(const llama_ubatch & ubatch, uint32_t row_offset) {
-    std::map<llama_seq_id, uint32_t> seq_to_row;
-    // how many output tokens we have seen so far for this ubatch.
-    uint32_t local = 0;
-    for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
-        // skip tokens that are not output.
-        if (!ubatch.output[i]) {
-            continue;
-        }
-
-        const llama_seq_id seq_id = ubatch.seq_id[i][0];
-        // row_offset is the number of output tokens before this ubatch.
-        seq_to_row[seq_id] = row_offset + local;
-        ++local;
-    }
-    return seq_to_row;
-}
-
-static void copy_tensor_async_ints(
-    const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
-    llama_token * sampled,
-    size_t sampled_size,
-    const std::map<llama_seq_id, uint32_t> & seq_to_row,
-    ggml_backend_sched_t sched) {
-    if (sampled == nullptr) {
-        return;
-    }
-
-    for (const auto & [seq_id, tensor] : tensor_map) {
-        auto it = seq_to_row.find(seq_id);
-        if (it == seq_to_row.end()) {
-            continue;
-        }
-
-        const uint32_t row = it->second;
-        GGML_ASSERT(row < sampled_size);
-
-        GGML_ASSERT(ggml_is_contiguous(tensor) && "sampled tokens tensor must be contiguous for async copy");
-
-        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
-        ggml_backend_tensor_get_async(backend, tensor, sampled + row, 0, sizeof(sampled[row]));
-    }
-}
-
-static void copy_tensor_async_floats(
-    const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
-    float * dst,
-    size_t stride,
-    std::vector<uint32_t> & counts,
-    const std::map<llama_seq_id, uint32_t> & seq_to_row,
-    ggml_backend_sched_t sched) {
-    if (dst == nullptr) {
-        return;
-    }
-
-    for (const auto & [seq_id, tensor] : tensor_map) {
-        auto it = seq_to_row.find(seq_id);
-        if (it == seq_to_row.end()) {
-            continue;
-        }
-
-        const uint32_t row = it->second;
-        GGML_ASSERT(row < counts.size());
-
-        GGML_ASSERT(ggml_is_contiguous(tensor) && "logits/probs tensor must be contiguous for async copy");
-
-        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
-        float * row_ptr = dst + (size_t) row * stride;
-        ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
-
-        // Update the actual number of logits/probabilities that were written for this row.
-        counts[row] = ggml_nelements(tensor);
-    }
-}
-
-static void copy_tensor_async_candidates(
-    const std::map<llama_seq_id, ggml_tensor*> & tensor_map,
-    llama_token * dst,
-    size_t stride,
-    std::vector<uint32_t> & counts,
-    const std::map<llama_seq_id, uint32_t> & seq_to_row,
-    ggml_backend_sched_t sched) {
-    if (dst == nullptr) {
-        return;
-    }
-
-    for (const auto & [seq_id, tensor] : tensor_map) {
-        auto it = seq_to_row.find(seq_id);
-        if (it == seq_to_row.end()) {
-            continue;
-        }
-
-        const uint32_t row = it->second;
-        GGML_ASSERT(row < counts.size());
-
-        GGML_ASSERT(ggml_is_contiguous(tensor) && "candidates tensor must be contiguous for async copy");
-
-        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, tensor);
-        llama_token * row_ptr = dst + (size_t) row * stride;
-        ggml_backend_tensor_get_async(backend, tensor, row_ptr, 0, ggml_nbytes(tensor));
-
-        // Update the actual number of candidates that were written.
-        counts[row] = ggml_nelements(tensor);
-    }
-}
-
 int llama_context::decode(const llama_batch & batch_inp) {
    GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT

@@ -1392,35 +1052,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
    const int64_t n_embd  = hparams.n_embd_inp();

    const bool output_all = false;
-    const bool has_samplers = !sampling.samplers.empty();

-    const uint32_t n_seq_max = cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max;
-
-    // TODO: avoid this workaround in the future
-    if (has_samplers && batch_inp.logits) {
-        std::vector<int32_t> seq_output_count(n_seq_max, 0);
-
-        for (int32_t i = 0; i < batch_inp.n_tokens; ++i) {
-            if (batch_inp.logits[i] == 0) {
-                continue;
-            }
-
-            const int ns = batch_inp.n_seq_id ? batch_inp.n_seq_id[i] : 1;
-
-            for (int32_t s = 0; s < ns; ++s) {
-                const llama_seq_id seq_id = batch_inp.seq_id ? batch_inp.seq_id[i][s] : 0;
-
-                seq_output_count[seq_id]++;
-                if (seq_output_count[seq_id] > 1) {
-                    LLAMA_LOG_ERROR("%s: backend sampling requires at most one output token per sequence (seq_id %d had %d)\n",
-                            __func__, seq_id, seq_output_count[seq_id]);
-                    return -1;
-                }
-            }
-        }
-    }
-
-    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, n_seq_max, output_all)) {
+    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
        return -1;
    }
@@ -1501,7 +1134,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
    }

    // reserve output buffer
-    if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
+    if (output_reserve(n_outputs_all) < n_outputs_all) {
        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
        return -2;
    };
@@ -1574,10 +1207,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
        }

        // extract logits
-        // For multi-sequence batches that mix backend samplers and CPU sampler
-        // this is currently inefficient as we copy all logits even for the
-        // backend sampled tokens.
-        if (logits && t_logits && n_outputs > 0) {
+        if (t_logits && n_outputs > 0) {
            ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
            GGML_ASSERT(backend_res != nullptr);
            GGML_ASSERT(logits != nullptr);
@@ -1592,7 +1222,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
        }

        // extract embeddings
-        if (embd && t_embd && n_outputs > 0) {
+        if (t_embd && n_outputs > 0) {
            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
            GGML_ASSERT(backend_embd != nullptr);

@@ -1601,13 +1231,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
                    {
                        // extract token embeddings
                        GGML_ASSERT(embd != nullptr);
-                        const uint32_t n_embd_out = hparams.get_n_embd_out();
-                        float * embd_out = embd + n_outputs_prev*n_embd_out;
+                        float * embd_out = embd + n_outputs_prev*n_embd;

                        if (n_outputs) {
                            GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
-                            GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd_out <= (int64_t) embd_size);
-                            ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd_out*sizeof(float));
+                            GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size);
+                            ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float));
                        }
                    } break;
                case LLAMA_POOLING_TYPE_MEAN:
@@ -1647,22 +1276,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
            }
        }

-        // This flag indicates whether a backend sampler has actually sampled a specific
-        // token, or if it has produced probabilites. If true, we can skip the normal copying of logits and embeddings.
-        const bool has_sampled = !res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty();
-
-        if (has_samplers && has_sampled) {
-            const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev);
-            const auto stride = n_vocab;
-
-            // async copy the sampling data from the backend to the host
-            copy_tensor_async_ints(res->t_sampled, sampling.sampled, sampling.sampled_size, seq_to_output_row, sched.get());
-
-            copy_tensor_async_floats    (res->t_sampled_logits, sampling.logits,     stride, sampling.logits_count,     seq_to_output_row, sched.get());
-            copy_tensor_async_floats    (res->t_sampled_probs,  sampling.probs,      stride, sampling.probs_count,      seq_to_output_row, sched.get());
-            copy_tensor_async_candidates(res->t_candidates,     sampling.candidates, stride, sampling.candidates_count, seq_to_output_row, sched.get());
-        }
-
        n_outputs_prev += n_outputs;
    } while (mctx->next());

@@ -1726,15 +1339,15 @@ int llama_context::decode(const llama_batch & batch_inp) {
 // output
 //

-uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & batch) {
+uint32_t llama_context::output_reserve(int32_t n_outputs) {
    const auto & hparams = model.hparams;
    const auto & vocab   = model.vocab;

    const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());

-    const auto n_batch    = cparams.n_batch;
-    const auto n_vocab    = vocab.n_tokens();
-    const auto n_embd_out = hparams.get_n_embd_out();
+    const auto n_batch = cparams.n_batch;
+    const auto n_vocab = vocab.n_tokens();
+    const auto n_embd  = hparams.n_embd;

    bool has_logits = true;
    bool has_embd   = cparams.embeddings;
@@ -1745,53 +1358,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
        has_embd   = true;
    }

-    // Check which sampling modes are needed for the current batch.
-    // TODO: avoid this branching by working with the worst-case
-    bool has_sampling = false;
-    bool cpu_logits   = false;
-
-    if (batch.logits) {
-        for (int32_t i = 0; i < batch.n_tokens; i++) {
-            if (!batch.logits[i]) {
-                continue;
-            }
-            for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
-                llama_seq_id seq_id = batch.seq_id[i][j];
-                if (sampling.samplers.find(seq_id) != sampling.samplers.end()) {
-                    has_sampling = true;
-                } else {
-                    cpu_logits = true;
-                }
-            }
-        }
-    } else {
-        // When batch.logits is nullptr (when loading state with a dummy batch),
-        // allocate CPU logits.
-        cpu_logits = true;
-    }
-
-    size_t backend_float_count = 0;
-    size_t backend_token_count = 0;
-
-    // Allocate CPU logits buffer only if needed by sequences in this batch
-    logits_size = (has_logits && cpu_logits) ? n_vocab*n_outputs_max : 0;
-    embd_size   = has_embd ? n_embd_out*n_outputs_max : 0;
-
-    // TODO: avoid this branching by working with the worst-case
-    if (!has_sampling) {
-        sampling.logits_size     = 0;
-        sampling.probs_size      = 0;
-        sampling.sampled_size    = 0;
-        sampling.candidates_size = 0;
-    } else {
-        sampling.logits_size     = n_vocab*n_outputs_max;
-        sampling.probs_size      = n_vocab*n_outputs_max;
-        sampling.sampled_size    =         n_outputs_max;
-        sampling.candidates_size = n_vocab*n_outputs_max;
-
-        backend_float_count = sampling.logits_size  + sampling.probs_size;
-        backend_token_count = sampling.sampled_size + sampling.candidates_size;
-    }
+    logits_size = has_logits ? n_vocab*n_outputs_max : 0;
+    embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;

    if (output_ids.empty()) {
        // init, never resized afterwards
@@ -1799,9 +1367,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
    }

    const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
-    const size_t new_size  =
-        (logits_size + embd_size + backend_float_count) * sizeof(float) +
-        (                          backend_token_count) * sizeof(llama_token);
+    const size_t new_size  = (logits_size + embd_size) * sizeof(float);

    // alloc only when more than the current capacity is required
    // TODO: also consider shrinking the buffer
@@ -1809,11 +1375,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
        if (buf_output) {
 #ifndef NDEBUG
            // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
-            LLAMA_LOG_DEBUG("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
            synchronize();
-
-            // TODO: not needed?
            buf_output = nullptr;
            logits = nullptr;
            embd = nullptr;
@@ -1835,49 +1399,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba

    float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get());

-    logits = nullptr;
-    embd   = nullptr;
-
-    size_t offset = 0;
-    uint8_t * base = (uint8_t *) output_base;
-
-    logits = (has_logits && cpu_logits) ? output_base : nullptr;
-    offset += logits_size * sizeof(float);
-
-    embd = has_embd ? (float *) (base + offset) : nullptr;
-    offset += embd_size * sizeof(float);
-
-    sampling.logits     = nullptr;
-    sampling.probs      = nullptr;
-    sampling.sampled    = nullptr;
-    sampling.candidates = nullptr;
-
-    if (has_sampling) {
-        sampling.logits = (float *) (base + offset);
-        offset += sampling.logits_size * sizeof(float);
-
-        sampling.probs = (float *) (base + offset);
-        offset += sampling.probs_size * sizeof(float);
-
-        sampling.sampled = (llama_token *) (base + offset);
-        offset += sampling.sampled_size * sizeof(llama_token);
-
-        sampling.candidates = (llama_token *) (base + offset);
-        offset += sampling.candidates_size * sizeof(llama_token);
-
-        // The count vectors keep track of the actual number of logits/probs/candidates
-        // copied from the backend for each output row.
-
-        sampling.logits_count.resize(n_outputs_max);
-        sampling.probs_count.resize(n_outputs_max);
-        sampling.candidates_count.resize(n_outputs_max);
-
-        std::fill(sampling.logits_count.begin(),     sampling.logits_count.end(),     0);
-        std::fill(sampling.probs_count.begin(),      sampling.probs_count.end(),      0);
-        std::fill(sampling.candidates_count.begin(), sampling.candidates_count.end(), 0);
-
-        std::fill_n(sampling.sampled, sampling.sampled_size, LLAMA_TOKEN_NULL);
-    }
+    logits = has_logits ? output_base               : nullptr;
+    embd   = has_embd   ? output_base + logits_size : nullptr;

    // set all ids as invalid (negative)
    std::fill(output_ids.begin(), output_ids.end(), -1);
@@ -1906,40 +1429,6 @@ void llama_context::output_reorder() {
                std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
            }
        }
-
-        if (sampling.logits && sampling.logits_size > 0) {
-            for (uint64_t k = 0; k < n_vocab; ++k) {
-                std::swap(sampling.logits[i0*n_vocab + k], sampling.logits[i1*n_vocab + k]);
-            }
-        }
-
-        if (sampling.probs && sampling.probs_size > 0) {
-            for (uint64_t k = 0; k < n_vocab; ++k) {
-                std::swap(sampling.probs[i0*n_vocab + k], sampling.probs[i1*n_vocab + k]);
-            }
-        }
-
-        if (sampling.candidates && sampling.candidates_size > 0) {
-            for (uint64_t k = 0; k < n_vocab; ++k) {
-                std::swap(sampling.candidates[i0*n_vocab + k], sampling.candidates[i1*n_vocab + k]);
-            }
-        }
-
-        if (sampling.sampled && sampling.sampled_size > 0) {
-            std::swap(sampling.sampled[i0], sampling.sampled[i1]);
-        }
-
-        if (!sampling.logits_count.empty()) {
-            std::swap(sampling.logits_count[i0], sampling.logits_count[i1]);
-        }
-
-        if (!sampling.probs_count.empty()) {
-            std::swap(sampling.probs_count[i0], sampling.probs_count[i1]);
-        }
-
-        if (!sampling.candidates_count.empty()) {
-            std::swap(sampling.candidates_count[i0], sampling.candidates_count[i1]);
-        }
    }

    output_swaps.clear();
@@ -1953,9 +1442,7 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
    if (model.arch == LLM_ARCH_QWEN3NEXT) {
        return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
    }
-    uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
-    res += model.n_lora_nodes;
-    return res;
+    return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }

 llm_graph_result * llama_context::get_gf_res_reserve() const {
@@ -1969,7 +1456,7 @@ ggml_cgraph * llama_context::graph_reserve(

    if (n_tokens % n_seqs != 0) {
        n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
-        n_outputs = std::max(n_outputs, n_tokens);
+        n_outputs = std::min(n_outputs, n_tokens);

        LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
    }
@@ -1988,15 +1475,6 @@ ggml_cgraph * llama_context::graph_reserve(
    llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
    llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);

-    // set one output token per sequence in order to activate all backend samplers
-    std::vector<llama_seq_id> seq_ids(n_seqs);
-    for (uint32_t i = 0; i < n_seqs; ++i) {
-        seq_ids[i] = i;
-        ubatch.n_seq_id[i] = 1;
-        ubatch.seq_id[i] = &seq_ids[i];
-        ubatch.output[i] = true;
-    }
-
    auto * res = gf_res_reserve.get();

    const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
@@ -2027,7 +1505,7 @@ llm_graph_params llama_context::graph_params(
                        llm_graph_result * res,
                      const llama_ubatch & ubatch,
            const llama_memory_context_i * mctx,
-                          llm_graph_type   gtype) const {
+            llm_graph_type   gtype) const {
    return {
        /*.arch        =*/ model.arch,
        /*.hparams     =*/ model.hparams,
@@ -2040,7 +1518,6 @@ llm_graph_params llama_context::graph_params(
        /*.loras       =*/ &loras,
        /*.mctx        =*/ mctx,
        /*.cross       =*/ &cross,
-        /*.samplers    =*/ sampling.samplers,
        /*.n_outputs   =*/ n_outputs,
        /*.cb          =*/ graph_get_cb(),
        /*.res         =*/ res,
@@ -2093,7 +1570,7 @@ llm_graph_cb llama_context::graph_get_cb() const {

        // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
        // FIXME: fix in ggml_backend_sched
-        const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
+        const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
        if (ubatch.n_tokens < 32 || full_offload) {
            if (il != -1 && strcmp(name, "norm") == 0) {
                const auto & dev_layer = model.dev_layer(il);
@@ -2496,9 +1973,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
        }
    }

-    // TODO: handle sampling buffers and samplers state ?
-    //       https://github.com/ggml-org/llama.cpp/pull/17004
-
    if (memory != nullptr) {
        LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
        memory->state_write(io);
@@ -2531,10 +2005,7 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
        auto n_outputs = this->n_outputs;
        io.read_to(&n_outputs, sizeof(n_outputs));

-        // Create a dummy batch for state loading.
-        llama_batch dummy_batch = {};
-        dummy_batch.n_tokens = 0;
-        if (n_outputs > output_reserve(n_outputs, dummy_batch)) {
+        if (n_outputs > output_reserve(n_outputs)) {
            throw std::runtime_error("could not reserve outputs");
        }

@@ -2588,9 +2059,6 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
        }
    }

-    // TODO: handle sampling buffers and samplers state ?
-    //       https://github.com/ggml-org/llama.cpp/pull/17004
-
    if (memory) {
        LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);

@@ -2779,7 +2247,7 @@ void llama_context::opt_epoch_iter(
        }

        // reserve output buffer
-        if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
+        if (output_reserve(n_outputs_all) < n_outputs_all) {
            LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
            GGML_ABORT("TODO: handle this error");
        };
@@ -2924,8 +2392,6 @@ llama_context_params llama_context_default_params() {
        /*.op_offload                  =*/ true,
        /*.swa_full                    =*/ true,
        /*.kv_unified                  =*/ false,
-        /*.sampler                     =*/ nullptr,
-        /*.n_sampler                   =*/ 0,
    };

    return result;
@@ -3085,15 +2551,7 @@ float * llama_get_logits(llama_context * ctx) {
 float * llama_get_logits_ith(llama_context * ctx, int32_t i) {
    ctx->synchronize();

-    float * res = nullptr;
-
-    res = ctx->get_sampled_logits_ith(i);
-
-    if (!res) {
-        res = ctx->get_logits_ith(i);
-    }
-
-    return res;
+    return ctx->get_logits_ith(i);
 }

 float * llama_get_embeddings(llama_context * ctx) {
@@ -3114,52 +2572,6 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
    return ctx->get_embeddings_seq(seq_id);
 }

-bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
-    return ctx->set_sampler(seq_id, smpl);
-}
-
-llama_token llama_get_sampled_token_ith(llama_context * ctx, int32_t i) {
-    ctx->synchronize();
-
-    return ctx->get_sampled_token_ith(i);
-}
-
-float * llama_get_sampled_probs_ith(llama_context * ctx, int32_t i) {
-    ctx->synchronize();
-
-    return ctx->get_sampled_probs_ith(i);
-}
-
-float * llama_get_sampled_logits_ith(llama_context * ctx, int32_t i) {
-    ctx->synchronize();
-
-    return ctx->get_sampled_logits_ith(i);
-}
-
-llama_token * llama_get_sampled_candidates_ith(llama_context * ctx, int32_t i) {
-    ctx->synchronize();
-
-    return const_cast<llama_token *>(ctx->get_sampled_candidates_ith(i));
-}
-
-uint32_t llama_get_sampled_candidates_count_ith(llama_context * ctx, int32_t i) {
-    ctx->synchronize();
-
-    return static_cast<uint32_t>(ctx->get_sampled_candidates_count(i));
-}
-
-uint32_t llama_get_sampled_logits_count_ith(llama_context * ctx, int32_t i) {
-    ctx->synchronize();
-
-    return static_cast<uint32_t>(ctx->get_sampled_logits_count(i));
-}
-
-uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
-    ctx->synchronize();
-
-    return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
-}
-
 // llama adapter API

 int32_t llama_set_adapter_lora(
--- a/llama/llama.cpp/src/llama-context.h
+++ b/llama/llama.cpp/src/llama-context.h
@@ -70,18 +70,6 @@ struct llama_context {
    float * get_embeddings_ith(int32_t i);
    float * get_embeddings_seq(llama_seq_id seq_id);

-    llama_token * get_sampled_tokens() const;
-    llama_token   get_sampled_token_ith(int32_t idx);
-
-    float * get_sampled_logits_ith(int32_t idx);
-    size_t  get_sampled_logits_count(int32_t idx);
-
-    float * get_sampled_probs_ith(int32_t idx);
-    size_t  get_sampled_probs_count(int32_t idx);
-
-    const llama_token * get_sampled_candidates_ith(int32_t idx);
-    size_t get_sampled_candidates_count(int32_t idx);
-
    void attach_threadpool(
            ggml_threadpool_t threadpool,
            ggml_threadpool_t threadpool_batch);
@@ -204,13 +192,10 @@ private:

    // Make sure enough space is available for outputs.
    // Returns max number of outputs for which space was reserved.
-    uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch);
+    uint32_t output_reserve(int32_t n_outputs);

    void output_reorder();

-    // map the output row index `i` to batch index
-    int64_t output_resolve_row(int32_t i) const;
-
    //
    // graph
    //
@@ -228,8 +213,6 @@ public:
    ggml_cgraph * graph_reserve(
        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);

-    bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
-
 private:
    llm_graph_params graph_params(
                        llm_graph_result * res,
@@ -269,31 +252,6 @@ private:
    size_t  embd_size = 0; // capacity (of floats) for embeddings
    float * embd      = nullptr;

-    // TODO: simplify
-    struct sampling_info {
-        std::map<llama_seq_id, llama_sampler *> samplers;
-
-        float       * logits      = nullptr;
-        size_t        logits_size = 0;
-
-        llama_token * sampled      = nullptr;
-        size_t        sampled_size = 0;
-
-        float       * probs        = nullptr;
-        size_t        probs_size   = 0;
-
-        llama_token * candidates   = nullptr;
-        size_t        candidates_size = 0;
-
-        std::vector<uint32_t> logits_count;
-        std::vector<uint32_t> probs_count;
-        std::vector<uint32_t> candidates_count;
-
-        std::vector<llama_token> token_ids_full_vocab;
-    };
-
-    sampling_info sampling;
-
    // sequence embeddings output (map of [n_embd] vectors)
    // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
    std::map<llama_seq_id, std::vector<float>> embd_seq;
--- a/llama/llama.cpp/src/llama-grammar.cpp
+++ b/llama/llama.cpp/src/llama-grammar.cpp
@@ -369,44 +369,6 @@ static void print_rule(
    fprintf(file, "\n");
 }

-//
-// Regex utilities
-//
-
-size_t llama_grammar_trigger_pattern::find(const std::string & input) const {
-    auto find_start_pos = [](const std::smatch & match) {
-        // get from the first matched capturing group to the end of the string
-        size_t start = std::string::npos;
-        for (auto i = 1u; i < match.size(); i++) {
-            if (match.length(i) > 0) {
-                start = match.position(i);
-                break;
-            }
-        }
-        if (start == std::string::npos) {
-            start = match.position(0);
-        }
-        return start;
-    };
-
-    if (!pattern.empty() && pattern.front() == '^' && pattern.back() == '$') {
-        // match against the entire input
-        std::smatch match;
-        if (std::regex_match(input, match, regex)) {
-            return find_start_pos(match);
-        }
-    }
-
-    // search anywhere
-    std::smatch match;
-    if (std::regex_search(input, match, regex)) {
-        return find_start_pos(match);
-    }
-
-    return std::string::npos;
-}
-
-
 //
 // implementation
 //
@@ -1359,10 +1321,21 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
            grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
            grammar.trigger_buffer += piece;

+            std::smatch match;
            for (const auto & trigger_pattern : grammar.trigger_patterns) {
-                auto start = trigger_pattern.find(grammar.trigger_buffer);
-                if (start != std::string::npos) {
+                if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
                    grammar.awaiting_trigger = false;
+                    // get from the first matched capturing group to the end of the string
+                    size_t start = std::string::npos;
+                    for (auto i = 1u; i < match.size(); i++) {
+                        if (match.length(i) > 0) {
+                            start = match.position(i);
+                            break;
+                        }
+                    }
+                    if (start == std::string::npos) {
+                        start = match.position(0);
+                    }

                    // replay tokens that overlap with [start, end)
                    for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
--- a/llama/llama.cpp/src/llama-grammar.h
+++ b/llama/llama.cpp/src/llama-grammar.h
@@ -130,8 +130,6 @@ struct llama_grammar_parser {
 struct llama_grammar_trigger_pattern {
    std::string pattern;
    std::regex  regex;
-
-    size_t find(const std::string & input) const;
 };

 struct llama_grammar {
--- a/llama/llama.cpp/src/llama-graph.cpp
+++ b/llama/llama.cpp/src/llama-graph.cpp
@@ -12,7 +12,6 @@
 #include <cassert>
 #include <cmath>
 #include <cstring>
-#include <unordered_set>

 void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
    if (ubatch->token) {
@@ -33,7 +32,7 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
    bool res = true;

    res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
-    res &= (!embd   && !params.ubatch.embd)  || (embd   &&   embd->ne[1] == params.ubatch.n_tokens);
+    res &= (!embd   && !params.ubatch.embd)  || (embd   &&   embd->ne[0] == params.ubatch.n_tokens);

    return res;
 }
@@ -63,7 +62,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
 bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
    bool res = true;

-    res &= pos->ne[0] == params.ubatch.n_tokens*n_pos_per_embd;
+    res &= pos->ne[0] == params.ubatch.n_tokens;

    return res;
 }
@@ -522,43 +521,6 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
    return res;
 }

-void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) {
-    // set the inputs only for the active samplers in the current ubatch
-    std::unordered_set<llama_seq_id> active_samplers;
-    for (uint32_t i = 0; i < ubatch->n_tokens; i++) {
-        if (ubatch->output[i]) {
-            llama_seq_id seq_id = ubatch->seq_id[i][0];
-            active_samplers.insert(seq_id);
-        }
-    }
-
-    for (auto seq_id : active_samplers) {
-        if (samplers.find(seq_id) == samplers.end()) {
-            continue;
-        }
-
-        auto & sampler = samplers[seq_id];
-
-        if (sampler->iface->backend_set_input) {
-            sampler->iface->backend_set_input(sampler);
-        }
-    }
-}
-
-bool llm_graph_input_sampling::can_reuse(const llm_graph_params & params) {
-    if (samplers.size() != params.samplers.size()) {
-        return false;
-    }
-
-    for (const auto & [seq_id, sampler] : params.samplers) {
-        if (samplers[seq_id] != sampler) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
 //
 // llm_graph_result
 //
@@ -579,10 +541,6 @@ void llm_graph_result::reset() {
    t_logits      = nullptr;
    t_embd        = nullptr;
    t_embd_pooled = nullptr;
-    t_sampled.clear();
-    t_sampled_probs.clear();
-    t_sampled_logits.clear();
-    t_candidates.clear();

    params = {};

@@ -607,38 +565,6 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
    }
 }

-void llm_graph_result::set_outputs() {
-    if (t_logits != nullptr) {
-        ggml_set_output(t_logits);
-    }
-    if (t_embd != nullptr) {
-        ggml_set_output(t_embd);
-    }
-    if (t_embd_pooled != nullptr) {
-        ggml_set_output(t_embd_pooled);
-    }
-    for (auto & [seq_id, t] : t_sampled) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-    for (auto & [seq_id, t] : t_sampled_probs) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-    for (auto & [seq_id, t] : t_sampled_logits) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-    for (auto & [seq_id, t] : t_candidates) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-}
-
 bool llm_graph_result::can_reuse(const llm_graph_params & params) {
    if (!this->params.allow_reuse(params)) {
        if (debug > 1) {
@@ -720,7 +646,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
    loras            (params.loras),
    mctx             (params.mctx),
    cross            (params.cross),
-    samplers         (params.samplers),
    cb_func          (params.cb),
    res              (params.res),
    ctx0             (res->get_ctx()),
@@ -1326,10 +1251,6 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {

    res->add_input(std::move(inp));

-    // make sure the produced embeddings are immediately materialized in the ggml graph
-    // ref: https://github.com/ggml-org/llama.cpp/pull/18599
-    ggml_build_forward_expand(gf, cur);
-
    return cur;
 }

@@ -1913,10 +1834,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const

        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
        ggml_set_input(inp->self_kq_mask);
-        ggml_set_name(inp->self_kq_mask, "self_kq_mask");

        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
-        ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
    }

    {
@@ -1929,10 +1848,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const

        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
        ggml_set_input(inp->self_kq_mask_swa);
-        ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");

        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
-        ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
    }

    return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
@@ -2071,18 +1988,14 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
 void llm_graph_context::build_dense_out(
    ggml_tensor * dense_2,
    ggml_tensor * dense_3) const {
-    if (!cparams.embeddings || !(dense_2 || dense_3)) {
+    if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
        return;
    }
    ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
    GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");

-    if (dense_2) {
-        cur = ggml_mul_mat(ctx0, dense_2, cur);
-    }
-    if (dense_3) {
-        cur = ggml_mul_mat(ctx0, dense_3, cur);
-    }
+    cur = ggml_mul_mat(ctx0, dense_2, cur);
+    cur = ggml_mul_mat(ctx0, dense_3, cur);
    cb(cur, "result_embd_pooled", -1);
    res->t_embd_pooled = cur;
    ggml_build_forward_expand(gf, cur);
@@ -2173,87 +2086,6 @@ void llm_graph_context::build_pooling(
    ggml_build_forward_expand(gf, cur);
 }

-void llm_graph_context::build_sampling() const {
-    if (samplers.empty() || !res->t_logits) {
-        return;
-    }
-
-    auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
-    res->add_input(std::move(inp_sampling));
-
-    std::map<llama_seq_id, int32_t> seq_to_logit_row;
-    int32_t logit_row_idx = 0;
-
-    for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
-        if (ubatch.output[i]) {
-            llama_seq_id seq_id = ubatch.seq_id[i][0];
-            seq_to_logit_row[seq_id] = logit_row_idx;
-            logit_row_idx++;
-        }
-    }
-
-    // res->t_logits will contain logits for all tokens that want the logits calculated (logits=1 or output=1)
-    GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
-
-    // add a dummy row of logits
-    // this trick makes the graph static, regardless of which samplers are activated
-    // this is important in order to minimize graph reallocations
-    // TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550)
-    ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
-
-    for (const auto & [seq_id, sampler] : samplers) {
-        const auto it = seq_to_logit_row.find(seq_id);
-
-        // inactive samplers always work on the first row
-        const auto row_idx = seq_to_logit_row.find(seq_id) != seq_to_logit_row.end() ? it->second : 0;
-
-        ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
-        ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
-
-        struct llama_sampler_data data = {
-            /*.logits      =*/ logits_seq,
-            /*.probs       =*/ nullptr,
-            /*.sampled     =*/ nullptr,
-            /*.candidates  =*/ nullptr,
-        };
-
-        assert(sampler->iface->backend_apply);
-        sampler->iface->backend_apply(sampler, ctx0, gf, &data);
-
-        if (data.sampled != nullptr) {
-            res->t_sampled[seq_id] = data.sampled;
-            ggml_build_forward_expand(gf, data.sampled);
-        }
-
-        if (data.probs != nullptr) {
-            res->t_sampled_probs[seq_id] = data.probs;
-            ggml_build_forward_expand(gf, data.probs);
-        }
-
-        if (data.logits != nullptr) {
-            res->t_sampled_logits[seq_id] = data.logits;
-            ggml_build_forward_expand(gf, data.logits);
-        }
-
-        if (data.candidates != nullptr) {
-            res->t_candidates[seq_id] = data.candidates;
-            ggml_build_forward_expand(gf, data.candidates);
-        }
-    }
-
-    // TODO: Call llama_sampler_accept_ggml after all samplers have been applied.
-    /*
-    for (const auto & [seq_id, sampler] : samplers) {
-        if (auto it = res->t_sampled.find(seq_id); it != res->t_sampled.end()) {
-            ggml_tensor * selected_token = it->second;
-            if (selected_token != nullptr) {
-                llama_sampler_accept_ggml(sampler, ctx0, gf, selected_token);
-            }
-        }
-    }
-    */
-}
-
 int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
    // TODO move to hparams if a T5 variant appears that uses a different value
    const int64_t max_distance = 128;
--- a/llama/llama.cpp/src/llama-graph.h
+++ b/llama/llama.cpp/src/llama-graph.h
@@ -10,7 +10,6 @@
 #include <memory>
 #include <set>
 #include <functional>
-#include <map>

 struct ggml_cgraph;
 struct ggml_context;
@@ -397,18 +396,6 @@ public:
    const llama_memory_hybrid_context * mctx;
 };

-class llm_graph_input_sampling : public llm_graph_input_i {
-public:
-    llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
-        samplers(std::move(samplers)) { }
-    virtual ~llm_graph_input_sampling() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-    bool can_reuse(const llm_graph_params & params) override;
-
-    std::map<llama_seq_id, llama_sampler *> samplers;
-};
-
 //
 // llm_graph_result
 //
@@ -442,23 +429,6 @@ struct llm_graph_params {
    const llama_memory_context_i * mctx;
    const llama_cross            * cross;

-    std::map<llama_seq_id, llama_sampler *> samplers;
-
-    static bool samplers_equal(
-          const std::map<llama_seq_id, llama_sampler *> & lhs,
-          const std::map<llama_seq_id, llama_sampler *> & rhs) {
-        if (lhs.size() != rhs.size()) {
-            return false;
-        }
-        for (const auto & [seq_id, sampler] : lhs) {
-            auto it = rhs.find(seq_id);
-            if (it == rhs.end() || it->second != sampler) {
-                return false;
-            }
-        }
-        return true;
-    }
-
    uint32_t n_outputs;

    llm_graph_cb cb;
@@ -498,36 +468,15 @@ struct llm_graph_params {
            return false;
        }

-        if (n_outputs != other.n_outputs) {
-            return false;
-        }
-
-        if (!samplers_equal(samplers, other.samplers)) {
-            return false;
-        }
-
-        if (samplers.size() > 0) {
-            if (!ubatch.data || !other.ubatch.data) {
-                return false;
-            }
-
-            // check that the outputs are the same for all samplers
-            for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
-                if (ubatch.output[i]    != other.ubatch.output[i] ||
-                    ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
-                    return false;
-                }
-            }
-        }
-
        return
            cparams.embeddings  == other.cparams.embeddings  &&
            cparams.causal_attn == other.cparams.causal_attn &&
-            arch  == other.arch  &&
-            gtype == other.gtype &&
-            cvec  == other.cvec  &&
-            loras == other.loras &&
-            cross == other.cross;
+            arch      == other.arch  &&
+            gtype     == other.gtype &&
+            cvec      == other.cvec  &&
+            loras     == other.loras &&
+            cross     == other.cross &&
+            n_outputs == other.n_outputs;
    }
 };

@@ -550,7 +499,6 @@ public:
    void reset();

    void set_inputs(const llama_ubatch * ubatch);
-    void set_outputs();

    // try to update the existing graph result using the new graph parameters in order to reuse it
    // this can only be done if we determine that the resulting graph using the new graph parameters
@@ -569,11 +517,6 @@ public:
    ggml_tensor * t_embd        = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;

-    std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
-    std::map<llama_seq_id, ggml_tensor*> t_candidates;
-    std::map<llama_seq_id, ggml_tensor*> t_sampled;
-    std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
-
    std::vector<llm_graph_input_ptr> inputs;

    ggml_context_ptr ctx_compute;
@@ -649,8 +592,6 @@ struct llm_graph_context {
    const llama_memory_context_i * mctx;
    const llama_cross            * cross;

-    std::map<llama_seq_id, llama_sampler *> samplers;
-
    const llm_graph_cb & cb_func;

    llm_graph_result * res;
@@ -891,12 +832,6 @@ struct llm_graph_context {
            ggml_tensor * cls_out,
            ggml_tensor * cls_out_b) const;

-    //
-    // sampling (backend sampling)
-    //
-
-    void build_sampling() const;
-
    //
    // dense (out)
    //
--- a/llama/llama.cpp/src/llama-hparams.cpp
+++ b/llama/llama.cpp/src/llama-hparams.cpp
@@ -72,10 +72,6 @@ uint32_t llama_hparams::n_embd_inp() const {
    return n_embd_inp;
 }

-uint32_t llama_hparams::get_n_embd_out() const {
-    return n_embd_out > 0 ? n_embd_out : n_embd;
-}
-
 uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
    const uint32_t n_head_kv = this->n_head_kv(il);

--- a/llama/llama.cpp/src/llama-hparams.h
+++ b/llama/llama.cpp/src/llama-hparams.h
@@ -107,9 +107,9 @@ struct llama_hparams {

    float    rope_attn_factor = 1.0f;
    float    rope_freq_base_train;
-    float    rope_freq_base_train_swa  = 10000.0f;
+    float    rope_freq_base_train_swa;
    float    rope_freq_scale_train;
-    float    rope_freq_scale_train_swa = 1.0f;
+    float    rope_freq_scale_train_swa;

    uint32_t n_ctx_orig_yarn;
    float    rope_yarn_log_mul = 0.0f;
@@ -125,11 +125,10 @@ struct llama_hparams {
    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
    // the size of the sliding window (0 - no SWA)
    uint32_t n_swa = 0;
-    // if swa_layers[il] == 1, then layer il is SWA
-    // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
+    // if swa_layers[il] == true, then layer il is SWA
+    // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
    // by default, all layers are dense
-    // note: using uint32_t type for compatibility reason
-    std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
+    std::array<bool, LLAMA_MAX_LAYERS> swa_layers;

    // for State Space Models
    uint32_t ssm_d_conv  = 0;
@@ -164,9 +163,6 @@ struct llama_hparams {
    // for Classifiers
    uint32_t n_cls_out = 1;

-    // output embedding dimension (0 = use n_embd)
-    uint32_t n_embd_out = 0;
-
    // llama4 smallthinker
    uint32_t n_moe_layer_step        = 0;
    uint32_t n_no_rope_layer_step    = 4;
@@ -239,9 +235,6 @@ struct llama_hparams {
    // dimension of main + auxiliary input embeddings
    uint32_t n_embd_inp() const;

-    // dimension of output embeddings
-    uint32_t get_n_embd_out() const;
-
    // dimension of key embeddings across all k-v heads
    uint32_t n_embd_k_gqa(uint32_t il = 0) const;

--- a/llama/llama.cpp/src/llama-kv-cache.h
+++ b/llama/llama.cpp/src/llama-kv-cache.h
@@ -305,7 +305,7 @@ public:
            bool do_shift,
            stream_copy_info sc_info);

-    // used to create a batch processing context from a batch
+    // used to create a batch processing context from a batch
    llama_kv_cache_context(
            llama_kv_cache * kv,
            slot_info_vec_t sinfos,
--- a/llama/llama.cpp/src/llama-mmap.cpp
+++ b/llama/llama.cpp/src/llama-mmap.cpp
@@ -13,10 +13,9 @@
 #ifdef __has_include
    #if __has_include(<unistd.h>)
        #include <unistd.h>
-        #include <fcntl.h>
-        #include <sys/stat.h>
        #if defined(_POSIX_MAPPED_FILES)
            #include <sys/mman.h>
+            #include <fcntl.h>
        #endif
        #if defined(_POSIX_MEMLOCK_RANGE)
            #include <sys/resource.h>
@@ -75,7 +74,7 @@ struct llama_file::impl {
        return ret;
    }

-    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+    impl(const char * fname, const char * mode) {
        fp = ggml_fopen(fname, mode);
        if (fp == NULL) {
            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -110,7 +109,7 @@ struct llama_file::impl {
        }
    }

-    void read_raw(void * ptr, size_t len) {
+    void read_raw(void * ptr, size_t len) const {
        size_t bytes_read = 0;
        while (bytes_read < len) {
            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
@@ -127,7 +126,7 @@ struct llama_file::impl {
        }
    }

-    uint32_t read_u32() {
+    uint32_t read_u32() const {
        uint32_t val;
        read_raw(&val, sizeof(val));
        return val;
@@ -154,55 +153,16 @@ struct llama_file::impl {
        write_raw(&val, sizeof(val));
    }

-    bool has_direct_io() const {
-        return true;
-    }
-
    ~impl() {
        if (fp) {
            std::fclose(fp);
        }
    }
 #else
-    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
-#ifdef __linux__
-        // Try unbuffered I/O for read only
-        if (use_direct_io && std::strcmp(mode, "rb") == 0) {
-            if (init_fd()) {
-                return;
-            }
-            LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
-                           fname, strerror(errno));
-        }
-#endif
-        init_fp(mode);
-    }
-
-#ifdef __linux__
-    bool init_fd() {
-        fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
-
-        if (fd != -1) {
-            struct stat file_stats{};
-            fstat(fd, &file_stats);
-
-            size = file_stats.st_size;
-            alignment = file_stats.st_blksize;
-
-            off_t ret = lseek(fd, 0, SEEK_SET);
-            if (ret == -1) {
-                throw std::runtime_error(format("seek error: %s", strerror(errno)));
-            }
-            return true;
-        }
-        return false;
-    }
-#endif
-
-    void init_fp(const char * mode) {
-        fp = ggml_fopen(fname.c_str(), mode);
+    impl(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
        if (fp == NULL) {
-            throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
        }
        seek(0, SEEK_END);
        size = tell();
@@ -210,118 +170,46 @@ struct llama_file::impl {
    }

    size_t tell() const {
-        if (fd == -1) {
-            long ret = std::ftell(fp);
-            if (ret == -1) {
-                throw std::runtime_error(format("ftell error: %s", strerror(errno)));
-            }
-
-            return (size_t) ret;
+// TODO: this ifdef is never true?
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        if (ret == -1) {
+            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
        }

-        off_t pos = lseek(fd, 0, SEEK_CUR);
-        if (pos == -1) {
-            throw std::runtime_error(format("lseek error: %s", strerror(errno)));
-        }
-        return (size_t) pos;
+        return (size_t) ret;
    }

    void seek(size_t offset, int whence) const {
-        off_t ret = 0;
-        if (fd == -1) {
-            ret = std::fseek(fp, (long) offset, whence);
-        } else {
-            ret = lseek(fd, offset, whence);
-        }
-        if (ret == -1) {
+// TODO: this ifdef is never true?
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        if (ret != 0) {
            throw std::runtime_error(format("seek error: %s", strerror(errno)));
        }
    }

-    void read_raw_unsafe(void * ptr, size_t len) {
+    void read_raw(void * ptr, size_t len) const {
        if (len == 0) {
            return;
        }
        errno = 0;
-        if (fd == -1) {
-            std::size_t ret = std::fread(ptr, len, 1, fp);
-            if (ferror(fp)) {
-                throw std::runtime_error(format("read error: %s", strerror(errno)));
-            }
-            if (ret != 1) {
-                throw std::runtime_error("unexpectedly reached end of file");
-            }
-        } else {
-            size_t bytes_read = 0;
-            while (bytes_read < len) {
-                const size_t to_read = len - bytes_read;
-                ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);
-
-                if (ret == -1) {
-                    if (errno == EINTR) {
-                        continue;  // Interrupted by signal, retry
-                    }
-                    // Fallback to std::fread in case the DMA controller cannot access the buffer
-                    if (errno == EFAULT) {
-                        auto curr_off = tell();
-                        close(fd);
-                        fd = -1;
-                        alignment = 1;
-                        init_fp("rb");
-                        seek(curr_off, SEEK_SET);
-                        read_raw_unsafe(ptr, len);
-                        return;
-                    }
-                    throw std::runtime_error(format("read error: %s", strerror(errno)));
-                }
-                if (ret == 0) {
-                    // EOF: allow if this read was only pulling alignment padding past file end
-                    off_t pos = lseek(fd, 0, SEEK_CUR);
-                    if (pos != -1 && (size_t) pos == size) {
-                        std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
-                        return;
-                    }
-                    throw std::runtime_error("unexpectedly reached end of file");
-                }
-
-                bytes_read += (size_t) ret;
-            }
+        std::size_t ret = std::fread(ptr, len, 1, fp);
+        if (ferror(fp)) {
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
+        }
+        if (ret != 1) {
+            throw std::runtime_error("unexpectedly reached end of file");
        }
    }

-    void read_aligned_chunk(void * dest, size_t size) {
-        size_t offset = tell();
-        off_t aligned_offset = offset & ~(alignment - 1);
-        off_t offset_from_alignment = offset - aligned_offset;
-        size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
-
-        void * raw_buffer = nullptr;
-        int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
-        if (ret != 0) {
-            throw std::runtime_error(format("posix_memalign failed with error %d", ret));
-        }
-
-        struct aligned_buffer_deleter {
-            void operator()(void * p) const { free(p); }
-        };
-        std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
-
-        seek(aligned_offset, SEEK_SET);
-        read_raw_unsafe(buffer.get(), bytes_to_read);
-
-        uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
-        memcpy(dest, reinterpret_cast<void *>(actual_data), size);
-    }
-
-    void read_raw(void * ptr, size_t len) {
-        if (has_direct_io()) {
-            read_aligned_chunk(ptr, len);
-        } else {
-            read_raw_unsafe(ptr, len);
-        }
-    }
-
-    uint32_t read_u32() {
+    uint32_t read_u32() const {
        uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
@@ -342,41 +230,23 @@ struct llama_file::impl {
        write_raw(&val, sizeof(val));
    }

-    bool has_direct_io() const {
-        return fd != -1 && alignment > 1;
-    }
-
    ~impl() {
-        if (fd != -1) {
-            close(fd);
-        } else {
+        if (fp) {
            std::fclose(fp);
        }
    }
-    int fd = -1;
-    std::string fname;
 #endif

-    size_t read_alignment() const {
-        return alignment;
-    }
-
-    size_t alignment = 1;
-
-    FILE * fp{};
-    size_t size{};
+    FILE * fp;
+    size_t size;
 };

-llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
-    pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
+llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
 llama_file::~llama_file() = default;

 size_t llama_file::tell() const { return pimpl->tell(); }
 size_t llama_file::size() const { return pimpl->size; }

-size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
-bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
-
 int llama_file::file_id() const {
 #ifdef _WIN32
    return _fileno(pimpl->fp);
@@ -390,14 +260,9 @@ int llama_file::file_id() const {
 }

 void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
-void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
-#ifdef _WIN32
-void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
-#else
-void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
-#endif
+void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }

-uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
+uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }

 void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
 void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
--- a/llama/llama.cpp/src/llama-mmap.h
+++ b/llama/llama.cpp/src/llama-mmap.h
@@ -3,7 +3,6 @@
 #include <cstdint>
 #include <memory>
 #include <vector>
-#include <cstdio>

 struct llama_file;
 struct llama_mmap;
@@ -14,7 +13,7 @@ using llama_mmaps  = std::vector<std::unique_ptr<llama_mmap>>;
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

 struct llama_file {
-    llama_file(const char * fname, const char * mode, bool use_direct_io = false);
+    llama_file(const char * fname, const char * mode);
    ~llama_file();

    size_t tell() const;
@@ -24,16 +23,12 @@ struct llama_file {

    void seek(size_t offset, int whence) const;

-    void read_raw(void * ptr, size_t len);
-    void read_raw_unsafe(void * ptr, size_t len);
-    void read_aligned_chunk(void * dest, size_t size);
-    uint32_t read_u32();
+    void read_raw(void * ptr, size_t len) const;
+    uint32_t read_u32() const;

    void write_raw(const void * ptr, size_t len) const;
    void write_u32(uint32_t val) const;

-    size_t read_alignment() const;
-    bool has_direct_io() const;
 private:
    struct impl;
    std::unique_ptr<impl> pimpl;
--- a/llama/llama.cpp/src/llama-model-loader.cpp
+++ b/llama/llama.cpp/src/llama-model-loader.cpp
@@ -462,29 +462,6 @@ namespace GGUFMeta {
        return get_key_or_arr(llm_kv(kid), result, n, required);
    }

-    bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
-        const std::string key = llm_kv(kid);
-
-        const int id = gguf_find_key(meta.get(), key.c_str());
-
-        if (id < 0) {
-            if (required) {
-                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
-            }
-            return false;
-        }
-
-        // throw and error if type is an array
-        if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
-            if (required) {
-                throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
-            }
-            return false;
-        }
-
-        return get_key(key, result, required);
-    }
-
    // TODO: this is not very clever - figure out something better
    template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
    template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
@@ -495,7 +472,6 @@ llama_model_loader::llama_model_loader(
        const std::string & fname,
        std::vector<std::string> & splits,
        bool use_mmap,
-        bool use_direct_io,
        bool check_tensors,
        bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
@@ -528,17 +504,9 @@ llama_model_loader::llama_model_loader(
    get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
    llm_kv = LLM_KV(llm_arch_from_string(arch_name));

-    files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
+    files.emplace_back(new llama_file(fname.c_str(), "rb"));
    contexts.emplace_back(ctx);

-    use_direct_io = use_direct_io && files.back()->has_direct_io();
-
-    // Disable mmap in case Direct I/O is enabled and available
-    if (use_direct_io && use_mmap) {
-        use_mmap = false;
-        LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
-    }
-
    // Save tensors data offset of the main file.
    // For subsidiary files, `meta` tensor data offset must not be used,
    // so we build a unified tensors index for weights.
@@ -604,7 +572,7 @@ llama_model_loader::llama_model_loader(
                }
            }

-            files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
+            files.emplace_back(new llama_file(fname_split, "rb"));
            contexts.emplace_back(ctx);

            // Save tensors data offset info of the shard.
@@ -748,7 +716,6 @@ llama_model_loader::llama_model_loader(
    }

    this->use_mmap = use_mmap;
-    this->use_direct_io = use_direct_io;
    this->check_tensors = check_tensors;
    this->no_alloc = no_alloc;
 }
@@ -968,15 +935,7 @@ bool llama_model_loader::load_all_data(
    // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
    // NVMe raid configurations might require more / larger buffers.
    constexpr size_t n_buffers = 4;
-
-    size_t alignment = 1;
-    for (const auto & file : files) {
-        alignment = std::max(file->read_alignment(), alignment);
-    }
-
-    // Buffer size: balance between memory usage and I/O efficiency
-    // 64MB works well for NVMe drives
-    const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
+    constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB

    std::vector<ggml_backend_buffer_t> host_buffers;
    std::vector<ggml_backend_event_t> events;
@@ -1026,7 +985,6 @@ bool llama_model_loader::load_all_data(
        // If the backend is supported, create pinned memory buffers and events for synchronisation.
        for (size_t idx = 0; idx < n_buffers; ++idx) {
            auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
-
            if (!buf) {
                LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
                    ggml_backend_dev_name(dev));
@@ -1108,7 +1066,6 @@ bool llama_model_loader::load_all_data(
            }
        } else {
            const auto & file = files.at(weight->idx);
-
            if (ggml_backend_buffer_is_host(cur->buffer)) {
                file->seek(weight->offs, SEEK_SET);
                file->read_raw(cur->data, n_size);
@@ -1120,54 +1077,19 @@ bool llama_model_loader::load_all_data(
            } else {
                // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
                if (upload_backend) {
-                    size_t offset = weight->offs;
-                    alignment = file->read_alignment();
-                    size_t aligned_offset = offset & ~(alignment - 1);
-                    size_t offset_from_alignment = offset - aligned_offset;
-                    file->seek(aligned_offset, SEEK_SET);
-
-                    // Calculate aligned read boundaries
-                    size_t read_start = aligned_offset;
-                    size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
+                    file->seek(weight->offs, SEEK_SET);

                    size_t bytes_read = 0;
-                    size_t data_read = 0;  // Actual tensor data copied (excluding padding)

-                    while (bytes_read < read_end - read_start) {
-                        size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
+                    while (bytes_read < n_size) {
+                        size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);

-                        // Align the destination pointer within the pinned buffer
-                        uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
-
-                        // Wait for previous upload to complete before reusing buffer
                        ggml_backend_event_synchronize(events[buffer_idx]);
-
-                        // Read aligned chunk from file
-                        file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
-
-                        // Calculate actual data portion (excluding alignment padding)
-                        uintptr_t ptr_data = ptr_dest_aligned;
-                        size_t data_to_copy = read_size;
-
-                        // Skip alignment padding at start of first chunk
-                        if (bytes_read == 0) {
-                            ptr_data += offset_from_alignment;
-                            data_to_copy -= offset_from_alignment;
-                        }
-
-                        // Trim alignment padding at end of last chunk
-                        if (aligned_offset + bytes_read + read_size > offset + n_size) {
-                            data_to_copy -= (read_end - (offset + n_size));
-                        }
-
-                        // Async upload actual data to GPU
-                        ggml_backend_tensor_set_async(upload_backend, cur,
-                                                      reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
+                        file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                        ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
                        ggml_backend_event_record(events[buffer_idx], upload_backend);

-                        data_read += data_to_copy;
-                        bytes_read += read_size;
-
+                        bytes_read += read_iteration;
                        ++buffer_idx;
                        buffer_idx %= n_buffers;
                    }
--- a/llama/llama.cpp/src/llama-model-loader.h
+++ b/llama/llama.cpp/src/llama-model-loader.h
@@ -70,7 +70,6 @@ struct llama_model_loader {
    size_t   n_bytes    = 0;

    bool use_mmap = false;
-    bool use_direct_io = false;
    bool check_tensors;
    bool no_alloc;

@@ -98,7 +97,6 @@ struct llama_model_loader {
        const std::string & fname,
        std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
        bool use_mmap,
-        bool use_direct_io,
        bool check_tensors,
        bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
@@ -133,8 +131,6 @@ struct llama_model_loader {
    template<typename T>
    bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);

-    bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
-
    std::string get_arch_name() const;

    enum llm_arch get_arch() const;
--- a/llama/llama.cpp/src/llama-model-saver.cpp
+++ b/llama/llama.cpp/src/llama-model-saver.cpp
@@ -146,9 +146,6 @@ void llama_model_saver::add_kv_from_model() {
    add_kv(LLM_KV_VOCAB_SIZE,                        vocab.n_tokens());
    add_kv(LLM_KV_CONTEXT_LENGTH,                    hparams.n_ctx_train);
    add_kv(LLM_KV_EMBEDDING_LENGTH,                  hparams.n_embd);
-    if (hparams.n_embd_out > 0) {
-        add_kv(LLM_KV_EMBEDDING_LENGTH_OUT,          hparams.n_embd_out);
-    }
    add_kv(LLM_KV_BLOCK_COUNT,                       hparams.n_layer);
    add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
    add_kv(LLM_KV_FEED_FORWARD_LENGTH,               hparams.n_ff_arr, true);
--- a/llama/llama.cpp/src/llama-model.cpp
+++ b/llama/llama.cpp/src/llama-model.cpp
@@ -31,14 +31,12 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_17M:           return "17M";
        case LLM_TYPE_22M:           return "22M";
        case LLM_TYPE_33M:           return "33M";
-        case LLM_TYPE_47M:           return "47M";
        case LLM_TYPE_60M:           return "60M";
        case LLM_TYPE_70M:           return "70M";
        case LLM_TYPE_80M:           return "80M";
        case LLM_TYPE_109M:          return "109M";
        case LLM_TYPE_137M:          return "137M";
        case LLM_TYPE_140M:          return "140M";
-        case LLM_TYPE_149M:          return "149M";
        case LLM_TYPE_160M:          return "160M";
        case LLM_TYPE_190M:          return "190M";
        case LLM_TYPE_220M:          return "220M";
@@ -48,7 +46,6 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_335M:          return "335M";
        case LLM_TYPE_350M:          return "350M";
        case LLM_TYPE_360M:          return "360M";
-        case LLM_TYPE_395M:          return "395M";
        case LLM_TYPE_410M:          return "410M";
        case LLM_TYPE_450M:          return "450M";
        case LLM_TYPE_475M:          return "475M";
@@ -126,12 +123,10 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_31B_A3_5B:     return "31B.A3.5B";
        case LLM_TYPE_80B_A3B:       return "80B.A3B";
        case LLM_TYPE_100B_A6B:      return "100B.A6B";
-        case LLM_TYPE_102B_A12B:     return "102B.A12B";
        case LLM_TYPE_106B_A12B:     return "106B.A12B";
        case LLM_TYPE_230B_A10B:     return "230B.A10B";
        case LLM_TYPE_235B_A22B:     return "235B.A22B";
        case LLM_TYPE_300B_A47B:     return "300B.A47B";
-        case LLM_TYPE_310B_A15B:     return "310B.A15B";
        case LLM_TYPE_355B_A32B:     return "355B.A32B";
        case LLM_TYPE_E2B:           return "E2B";
        case LLM_TYPE_E4B:           return "E4B";
@@ -507,7 +502,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {

    ml.get_key(LLM_KV_CONTEXT_LENGTH,          hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH,        hparams.n_embd);
-    ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT,    hparams.n_embd_out, false);
    ml.get_key(LLM_KV_BLOCK_COUNT,             hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT,            hparams.n_expert,        false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT,       hparams.n_expert_used,   false);
@@ -579,7 +573,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

-    // TODO: Handle SWA metadata similarly when models start implementing it
    // rope_freq_scale (inverse of the kv) is optional
    float ropescale = 0.0f;
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
@@ -588,6 +581,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    }
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

+    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
+    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+
    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

    // non-transformer models do not have attention heads
@@ -606,7 +603,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

-        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
+        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
            if (hparams.n_rot != hparams.n_embd_head_k) {
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
            }
@@ -630,7 +627,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    // arch-specific KVs
    switch (arch) {
        case LLM_ARCH_LLAMA:
-        case LLM_ARCH_LLAMA_EMBED:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -675,10 +671,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    hparams.f_attn_temp_scale       = 0.1f;
                    hparams.f_attn_temp_offset      = 1.0f;
                    hparams.set_swa_pattern(4);   // pattern: 3 chunked - 1 full
-
-                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                }

                switch (hparams.n_expert) {
@@ -724,10 +716,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                if (hparams.n_swa > 0) {
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                    hparams.set_swa_pattern(4);
-
-                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                } else {
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                }
@@ -887,34 +875,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
-        case LLM_ARCH_MODERN_BERT:
-            {
-                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                if (found_swa && hparams.n_swa > 0) {
-                    uint32_t swa_period = 3;
-                    hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
-
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
-                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                    hparams.set_swa_pattern(swa_period);
-                } else {
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-                }
-
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                ml.get_key(LLM_KV_ATTENTION_CAUSAL,        hparams.causal_attn);
-                ml.get_key(LLM_KV_POOLING_TYPE,            hparams.pooling_type, false);
-
-                switch (hparams.n_layer) {
-                    case 12:
-                        type = LLM_TYPE_47M; break; // granite-embedding-small
-                    case 22:
-                        type = LLM_TYPE_149M; break; // modern-bert-base
-                    case 28:
-                        type = LLM_TYPE_395M; break; // modern-bert-large
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
        case LLM_ARCH_JINA_BERT_V2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
@@ -1116,14 +1076,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
-        case LLM_ARCH_MAINCODER:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_1B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
        case LLM_ARCH_QWEN3VL:
            {
                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
@@ -1242,25 +1194,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH,   hparams.n_embd_head_k, false);
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
            } break;
-        case LLM_ARCH_PLAMO3:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                if (found_swa && hparams.n_swa > 0) {
-                    uint32_t swa_period = 8;
-                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
-                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                    hparams.set_swa_pattern(swa_period);
-                } else {
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-                }
-
-                switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_2B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
        case LLM_ARCH_GPT2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1314,10 +1247,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                hparams.n_swa = 4096; // default value of gemma 2
                hparams.set_swa_pattern(2);
                hparams.attn_soft_cap = true;
-                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,      hparams.f_attn_logit_softcapping, false);
@@ -1342,7 +1272,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                    hparams.set_swa_pattern(6);

-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+                    hparams.rope_freq_base_train_swa  = 10000.0f;
+                    hparams.rope_freq_scale_train_swa = 1.0f;
                } else {
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                }
@@ -1372,9 +1303,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                hparams.set_swa_pattern(5);

                hparams.n_layer_kv_from_start     = 20;
+                hparams.rope_freq_base_train_swa  = 10000.0f;
+                hparams.rope_freq_scale_train_swa = 1.0f;
                hparams.f_attention_scale         = 1.0f;

-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -1390,8 +1322,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                hparams.set_swa_pattern(6);

                hparams.causal_attn = false; // embeddings do not use causal attention
+                hparams.rope_freq_base_train_swa = 10000.0f;
+                hparams.rope_freq_scale_train_swa = 1.0f;

-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
@@ -1530,10 +1463,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            {
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                hparams.set_swa_pattern(4);
-                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,       hparams.rope_freq_base_train_swa, false);
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_LOGIT_SCALE,              hparams.f_logit_scale);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
@@ -1572,10 +1502,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                if (found_swa && hparams.n_swa > 0) {
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                    hparams.set_swa_pattern(4);
-
-                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                    hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                } else {
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                }
@@ -1703,7 +1629,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,       hparams.expert_weights_scale, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,       hparams.expert_weights_scale);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,        hparams.expert_weights_norm, false);
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,         hparams.expert_gating_func, false);
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
@@ -1799,7 +1725,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                switch (hparams.n_layer) {
                    case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
-                    case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
                    case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
                    default: type = LLM_TYPE_UNKNOWN;
                }
@@ -1918,10 +1843,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                    hparams.n_swa = 4096;
                    hparams.set_swa_pattern(4);
-
-                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                }

                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
@@ -2239,10 +2160,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                hparams.set_swa_pattern(2);

-                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
-
                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_20B; break;
                    case 36: type = LLM_TYPE_120B; break;
@@ -2287,10 +2204,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    hparams.swa_type      = LLAMA_SWA_TYPE_STANDARD;
                    hparams.n_swa         = 4096;
                    hparams.set_swa_pattern(4, true);
-
-                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                } else {
                    hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
                    hparams.n_no_rope_layer_step = hparams.n_layer;
@@ -2409,22 +2322,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
-        case LLM_ARCH_MIMO2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,   hparams.n_swa);
-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,         hparams.rope_freq_base_train_swa);
-                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
-
-                switch (hparams.n_layer) {
-                    case 48: type = LLM_TYPE_310B_A15B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
        default: throw std::runtime_error("unsupported model architecture");
    }

@@ -2447,16 +2344,15 @@ void llama_model::load_vocab(llama_model_loader & ml) {

 bool llama_model::load_tensors(llama_model_loader & ml) {
    const auto & split_mode   = params.split_mode;
+    const auto & n_gpu_layers = params.n_gpu_layers;
    const auto & use_mlock    = params.use_mlock;
    const auto & tensor_split = params.tensor_split;

-    const int n_layer      = hparams.n_layer;
-    const int n_gpu_layers = this->n_gpu_layers();
+    const int n_layer = hparams.n_layer;

    const bool use_mmap_buffer = true;

-    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
-        __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
+    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");

    // build a list of buffer types for the CPU and GPU devices
    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
@@ -2467,11 +2363,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
    }

-    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
-
    // calculate the split points
    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
    std::vector<float> splits(n_devices());
@@ -2482,13 +2373,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            size_t total;
            size_t free;
            ggml_backend_dev_memory(dev, &free, &total);
-
-            // devices can return 0 bytes for free and total memory if they do not
-            // have any to report. in this case, we will use the host memory as a fallback
-            // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
-            if (free == 0 && total == 0) {
-                ggml_backend_dev_memory(cpu_dev, &free, &total);
-            }
            splits[i] = free;
        }
    } else {
@@ -2505,10 +2389,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        splits[i] /= split_sum;
    }

-    const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
-    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
+    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
+    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
+    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
-        const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
+        const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
            return {cpu_dev, &pimpl->cpu_buft_list};
@@ -2748,7 +2636,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            case LLM_ARCH_GRANITE:
            case LLM_ARCH_GRANITE_MOE:
            case LLM_ARCH_MISTRAL3:
-            case LLM_ARCH_LLAMA_EMBED:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -3283,37 +3170,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd}, 0);
                    }
                } break;
-            case LLM_ARCH_MODERN_BERT:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
-
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
-                    for(int i = 0; i < n_layer; ++i) {
-                        auto& layer = layers[i];
-
-                        if ( i != 0 ) {
-                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        } else{
-                            // layer 0 uses identity
-                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        }
-
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,   "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, 2 * n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                    }
-
-                    cls       = create_tensor(tn(LLM_TENSOR_CLS,     "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
-                    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
-                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
-
-                } break;
            case LLM_ARCH_NEO_BERT:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
@@ -3378,14 +3234,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);

                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
-
-                        const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
-                        ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
-                        const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
-
-                        GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
-                        layer.ffn_up   = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
-                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);

                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
@@ -3913,44 +3762,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
                    }
                } break;
-            case LLM_ARCH_PLAMO3:
-                {
-                    const int64_t head_dim_q = hparams.n_embd_head_k;
-                    const int64_t head_dim_v = hparams.n_embd_head_v;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        const int64_t num_attention_heads = hparams.n_head(i);
-                        const int64_t num_key_value_heads = hparams.n_head_kv(i);
-                        const int64_t q_proj_dim = num_attention_heads * head_dim_q;
-                        const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
-                        const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
-                        const int64_t n_ff_cur   = hparams.n_ff(i);
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
-                                {n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
-
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff_cur * 2}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
-                    }
-                } break;
            case LLM_ARCH_GPT2:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4841,11 +4652,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    // try to load output.weight, if not found, use token_embd (tied embeddings)
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    if (!output) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
@@ -4908,11 +4715,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    // try to load output.weight, if not found, use token_embd (tied embeddings)
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    if (!output) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
@@ -5279,9 +5082,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
-                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
-                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
-                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);

                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);

@@ -5393,6 +5196,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    const int64_t n_group    = hparams.ssm_n_group;
                    const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;

+                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+                    const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
                    // embeddings
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -5444,9 +5250,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd},         TENSOR_NOT_REQUIRED);
                        }  else {
                            if (n_expert != 0) {
-                                const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
-                                const int64_t n_ff_shexp = hparams.n_ff_shexp;
-
                                layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert}, 0);
                                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert         }, 0);

@@ -6476,8 +6279,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,           "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -6522,9 +6325,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                            layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
                        }
                    }
-
-                    // for LFM2-ColBert-350M
-                    dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
                } break;
            case LLM_ARCH_SMALLTHINKER:
                {
@@ -6827,75 +6627,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP,     "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
                    }
                } break;
-            case LLM_ARCH_MIMO2:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-                        uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
-                        uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
-                        uint32_t n_head = hparams.n_head(i);
-
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
-
-                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_ATTN_NORM,  "weight", i), {n_embd}, 0);
-                        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        // non-MoE branch
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
-
-                        // MoE branch
-                        int64_t n_ff_exp = hparams.n_ff_exp;
-                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
-                    }
-                } break;
-            case LLM_ARCH_MAINCODER:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
            default:
                throw std::runtime_error("unknown architecture");
        }
@@ -7005,12 +6736,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    if (llama_supports_gpu_offload()) {
        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

-        int n_repeating = n_gpu;
-        if (n_repeating > 0) {
+        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
-            n_repeating--;
        }
-        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);

        const int max_backend_supported_layers = hparams.n_layer + 1;
        const int max_offloadable_layers       = hparams.n_layer + 1;
@@ -7077,14 +6806,6 @@ size_t llama_model::n_devices() const {
    return devices.size();
 }

-uint32_t llama_model::n_gpu_layers() const {
-    return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
-}
-
-llama_split_mode llama_model::split_mode() const {
-    return params.split_mode;
-}
-
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> ret;
    for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
@@ -7177,10 +6898,6 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type.c_str());
        LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
-        if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-            LLAMA_LOG_INFO("%s: freq_base_swa    = %.1f\n",   __func__, hparams.rope_freq_base_train_swa);
-            LLAMA_LOG_INFO("%s: freq_scale_swa   = %g\n",     __func__, hparams.rope_freq_scale_train_swa);
-        }
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",     __func__, hparams.n_ctx_orig_yarn);
        LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n",   __func__, hparams.rope_yarn_log_mul);
        LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
@@ -7413,7 +7130,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
-        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_LLADA:
@@ -7531,24 +7247,16 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
    switch (arch) {
        case LLM_ARCH_LLAMA:
            {
-                llm = std::make_unique<llm_build_llama<false>>(*this, params);
+                llm = std::make_unique<llm_build_llama>(*this, params);
            } break;
        case LLM_ARCH_LLAMA4:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
-                    llm = std::make_unique<llm_build_llama<false>>(*this, params);
+                    llm = std::make_unique<llm_build_llama>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_llama_iswa>(*this, params);
                }
            } break;
-        case LLM_ARCH_LLAMA_EMBED:
-            {
-                llm = std::make_unique<llm_build_llama<true>>(*this, params);
-            } break;
-        case LLM_ARCH_MAINCODER:
-            {
-                llm = std::make_unique<llm_build_maincoder>(*this, params);
-            } break;
        case LLM_ARCH_DECI:
            {
                llm = std::make_unique<llm_build_deci>(*this, params);
@@ -7581,10 +7289,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_bert>(*this, params);
            } break;
-        case LLM_ARCH_MODERN_BERT:
-            {
-                llm = std::make_unique<llm_build_modern_bert>(*this, params);
-            } break;
        case LLM_ARCH_NEO_BERT:
            {
                llm = std::make_unique<llm_build_neo_bert>(*this, params);
@@ -7674,14 +7378,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_plamo2>(*this, params);
            } break;
-        case LLM_ARCH_PLAMO3:
-            {
-                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-                    llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
-                } else {
-                    llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
-                }
-            } break;
        case LLM_ARCH_GPT2:
            {
                llm = std::make_unique<llm_build_gpt2>(*this, params);
@@ -7986,10 +7682,6 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_mistral3>(*this, params);
            } break;
-        case LLM_ARCH_MIMO2:
-            {
-                llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
-            } break;
        default:
            GGML_ABORT("fatal error");
    }
@@ -7997,17 +7689,12 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
    // add on pooling layer
    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

-    // add backend sampling layers (if any)
-    llm->build_sampling();
-
    // if the gguf model was converted with --sentence-transformers-dense-modules
    // there will be two additional dense projection layers
    // dense linear projections are applied after pooling
    // TODO: move reranking logic here and generalize
    llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);

-    llm->res->set_outputs();
-
    return llm->res->get_gf();
 }

@@ -8020,7 +7707,7 @@ llama_model_params llama_model_default_params() {
    llama_model_params result = {
        /*.devices                     =*/ nullptr,
        /*.tensor_buft_overrides       =*/ nullptr,
-        /*.n_gpu_layers                =*/ -1,
+        /*.n_gpu_layers                =*/ 999,
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
@@ -8029,7 +7716,6 @@ llama_model_params llama_model_default_params() {
        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
-        /*.use_direct_io               =*/ true,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
        /*.use_extra_bufts             =*/ true,
@@ -8064,10 +7750,6 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
    return model->hparams.n_embd_inp();
 }

-int32_t llama_model_n_embd_out(const llama_model * model) {
-    return model->hparams.get_n_embd_out();
-}
-
 int32_t llama_model_n_layer(const llama_model * model) {
    return model->hparams.n_layer;
 }
@@ -8171,8 +7853,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
        case LLM_ARCH_MISTRAL3:
-        case LLM_ARCH_LLAMA_EMBED:
-        case LLM_ARCH_MAINCODER:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
@@ -8182,7 +7862,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V3:
-        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
@@ -8202,7 +7881,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_PLAMO2:
-        case LLM_ARCH_PLAMO3:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
@@ -8233,7 +7911,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_PANGU_EMBED:
        case LLM_ARCH_AFMOE:
        case LLM_ARCH_QWEN3NEXT:
-        case LLM_ARCH_MIMO2:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
--- a/llama/llama.cpp/src/llama-model.h
+++ b/llama/llama.cpp/src/llama-model.h
@@ -24,14 +24,12 @@ enum llm_type {
    LLM_TYPE_17M,
    LLM_TYPE_22M,
    LLM_TYPE_33M,
-    LLM_TYPE_47M,
    LLM_TYPE_60M,
    LLM_TYPE_70M,
    LLM_TYPE_80M,
    LLM_TYPE_109M,
    LLM_TYPE_137M,
    LLM_TYPE_140M,
-    LLM_TYPE_149M,
    LLM_TYPE_160M,
    LLM_TYPE_190M,
    LLM_TYPE_220M,
@@ -41,7 +39,6 @@ enum llm_type {
    LLM_TYPE_335M,
    LLM_TYPE_350M,
    LLM_TYPE_360M,
-    LLM_TYPE_395M,
    LLM_TYPE_410M,
    LLM_TYPE_450M,
    LLM_TYPE_475M,
@@ -120,12 +117,10 @@ enum llm_type {
    LLM_TYPE_31B_A3_5B,
    LLM_TYPE_80B_A3B, // Qwen3 Next
    LLM_TYPE_100B_A6B,
-    LLM_TYPE_102B_A12B, // Solar-Open
    LLM_TYPE_106B_A12B, // GLM-4.5-Air
    LLM_TYPE_230B_A10B, // Minimax M2
    LLM_TYPE_235B_A22B,
    LLM_TYPE_300B_A47B, // Ernie MoE big
-    LLM_TYPE_310B_A15B, // /MiMo-V2-Flash
    LLM_TYPE_355B_A32B, // GLM-4.5
    LLM_TYPE_E2B,
    LLM_TYPE_E4B,
@@ -470,6 +465,8 @@ struct llama_model {
    struct ggml_tensor * dense_2_out_layers = nullptr;
    struct ggml_tensor * dense_3_out_layers = nullptr;

+    llama_model_params params;
+
    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;

@@ -479,9 +476,6 @@ struct llama_model {
    // for quantize-stats only
    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

-    // for keeping track of extra nodes used by lora adapters
-    uint32_t n_lora_nodes = 0;
-
    int64_t t_load_us  = 0;
    int64_t t_start_us = 0;

@@ -503,9 +497,6 @@ struct llama_model {
    size_t n_tensors() const;
    size_t n_devices() const;

-    uint32_t n_gpu_layers() const;
-    llama_split_mode split_mode() const;
-
    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;

    // total number of parameters in the model
@@ -534,8 +525,6 @@ struct llama_model {
    ggml_cgraph * build_graph(const llm_graph_params & params) const;

 private:
-    llama_model_params params;
-
    struct impl;
    std::unique_ptr<impl> pimpl;
 };
--- a/llama/llama.cpp/src/llama-quant.cpp
+++ b/llama/llama.cpp/src/llama-quant.cpp
@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    }

    std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());
--- a/llama/llama.cpp/src/llama-sampling.cpp
+++ b/llama/llama.cpp/src/llama-sampling.cpp
--- a/llama/llama.cpp/src/llama-sampling.h
+++ b/llama/llama.cpp/src/llama-sampling.h
@@ -14,19 +14,7 @@ struct llama_grammar;
 struct llama_sampler_chain {
    llama_sampler_chain_params params;

-    // has .backend_init() been called?
-    bool is_init = false;
-
-    struct info {
-        bool is_backend;
-
-        llama_sampler * ptr;
-    };
-
-    std::vector<info> samplers;
-
-    // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
-    std::vector<llama_token_data> cur;
+    std::vector<struct llama_sampler *> samplers;

    // timing

@@ -36,9 +24,9 @@ struct llama_sampler_chain {
 };

 struct llama_sampler * llama_sampler_init_dry_testing(
-        int32_t context_size,
-        float   dry_multiplier,
-        float   dry_base,
-        int32_t dry_allowed_length,
-        int32_t dry_penalty_last_n,
-        const std::vector<std::vector<llama_token>> & seq_breakers);
+                         int32_t   context_size,
+                           float   dry_multiplier,
+                           float   dry_base,
+                         int32_t   dry_allowed_length,
+                         int32_t   dry_penalty_last_n,
+  const std::vector<std::vector<llama_token>>& seq_breakers);
--- a/llama/llama.cpp/src/llama-vocab.cpp
+++ b/llama/llama.cpp/src/llama-vocab.cpp
@@ -314,12 +314,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
-            case LLAMA_VOCAB_PRE_TYPE_YOUTU:
-                regex_exprs = {
-                    "[가-힣ㄱ-ㆎ]+|[！…“”‘’—：；，、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+",
-                    "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                };
-                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                regex_exprs = {
                    "[\r\n]",
@@ -361,7 +355,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
-            case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -1856,11 +1849,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "deepseek-v3") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
                clean_spaces = false;
-            } else if (
-                    tokenizer_pre == "youtu") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
-                clean_spaces = false;
-                ignore_merges = true;
            } else if (
                    tokenizer_pre == "falcon") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -1879,8 +1867,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "jina-v2-es" ||
                    tokenizer_pre == "jina-v2-de" ||
                    tokenizer_pre == "a.x-4.0" ||
-                    tokenizer_pre == "mellum"  ||
-                    tokenizer_pre == "modern-bert" ) {
+                    tokenizer_pre == "mellum") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
            } else if (
                    tokenizer_pre == "jina-v1-en" ||
@@ -2016,10 +2003,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "minimax-m2") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
                clean_spaces = false;
-            } else if (
-                tokenizer_pre == "solar-open") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
-                clean_spaces = false;
            } else {
                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -2193,8 +2176,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        //       for now, we apply this workaround to find the tokens based on their text

        for (const auto & t : token_to_id) {
-            auto & attr = id_to_token[t.second].attr;
-
            // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
            if (special_eot_id == LLAMA_TOKEN_NULL) {
                if (false
@@ -2210,10 +2191,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<end_of_utterance>" // smoldocling
                   ) {
                    special_eot_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2224,10 +2205,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<|eom_id|>"
                        ) {
                    special_eom_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2244,10 +2225,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<|code_prefix|>" // GLM-4.5
                        ) {
                    special_fim_pre_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2264,10 +2245,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<|code_suffix|>" // GLM-4.5
                        ) {
                    special_fim_suf_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2284,10 +2265,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<|code_middle|>" // GLM-4.5
                        ) {
                    special_fim_mid_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2301,10 +2282,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<PAD>"
                        ) {
                    special_fim_pad_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2319,10 +2300,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<reponame>"    // Granite
                        ) {
                    special_fim_rep_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2333,41 +2314,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<|file_sep|>" // Qwen
                        ) {
                    special_fim_sep_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
        }

-        // auto-detect unused tokens: e.g. control tokens with the word "unused"
-        // ideally, these tokens should be marked as unused during conversion
-        {
-            uint32_t n_unused = 0;
-
-            for (const auto & t : token_to_id) {
-                auto & attr = id_to_token[t.second].attr;
-
-                if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                    continue;
-                }
-
-                if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
-                    if (strstr(t.first.c_str(), "unused") != NULL) {
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
-                    }
-                }
-
-                if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
-                    n_unused++;
-                }
-            }
-
-            LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
-        }
-
        // maintain a list of tokens that cause end-of-generation
        // this is currently determined based on the token text, which is obviously not ideal
        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
@@ -2386,16 +2341,12 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        }

        for (const auto & t : token_to_id) {
-            auto & attr = id_to_token[t.second].attr;
-
            if (false
                    || t.first == "<|eot_id|>"
                    || t.first == "<|im_end|>"
                    || t.first == "<|end|>"
                    || t.first == "<|return|>" // o200k_harmony
                    || t.first == "<|call|>"   // o200k_harmony
-                    || t.first == "<|flush|>"  // solar-open
-                    || t.first == "<|calls|>"  // solar-open
                    || t.first == "<end_of_turn>"
                    || t.first == "<|endoftext|>"
                    || t.first == "<|eom_id|>"
@@ -2405,28 +2356,24 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<end_of_utterance>" // smoldocling
               ) {
                special_eog_ids.insert(t.second);
-                if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                    LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                            __func__, t.second, t.first.c_str());
-                    attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                    id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
            } else {
-                if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
-                    // token is control, but not marked as EOG -> print a debug log
-                    if (special_eog_ids.count(t.second) == 0) {
-                        LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
-                                __func__, t.second, t.first.c_str());
-                    }
+                // token is control, but not marked as EOG -> print a debug log
+                if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
+                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                            __func__, t.second, t.first.c_str());
                }
            }
        }

        // @ngxson : quick hack for gpt-oss, always render these tokens
        for (const auto & t : token_to_id) {
-            auto & attr = id_to_token[t.second].attr;
-
            if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
-                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
            }
        }

@@ -2446,42 +2393,34 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
        }

-        // TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
-        //       we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
+        // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
+        //       we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
        //       we remove the "<|end|>" token from the EOG list
        {
            bool has_return = false;
            bool has_call   = false;
            bool has_end    = false;
-            bool has_flush  = false;

            llama_token end_id = LLAMA_TOKEN_NULL;

            LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
            for (auto tid : special_eog_ids) {
-                auto & text = id_to_token[tid].text;
+                LLAMA_LOG_INFO("%s:   - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());

-                LLAMA_LOG_INFO("%s:   - %d ('%s')\n", __func__, tid, text.c_str());
-
-                if (text == "<|return|>") {
+                if (id_to_token[tid].text == "<|return|>") {
                    has_return = true;
-                } else if (text == "<|call|>" || text == "<|calls|>") {
+                } else if (id_to_token[tid].text == "<|call|>") {
                    has_call = true;
-                } else if (text == "<|flush|>") {
-                    has_flush = true;
-                } else if (text == "<|end|>") {
+                } else if (id_to_token[tid].text == "<|end|>") {
                    has_end = true;
                    end_id = tid;
                }
            }

-            if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
+            if (has_return && has_call && has_end) {
                special_eog_ids.erase(end_id);
-
-                auto & attr = id_to_token[end_id].attr;
-                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
-
-                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+                id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
            }
        }
    }
@@ -2579,13 +2518,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
            }
-        } else if (_contains_any(model_name, {"modern-bert"})) {
-            if (token_to_id.count("[MASK]") == 0 ) {
-                LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
-            }
-            else {
-                _set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
-            }
        }
    }
 }
--- a/llama/llama.cpp/src/llama-vocab.h
+++ b/llama/llama.cpp/src/llama-vocab.h
@@ -51,8 +51,6 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
    LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2      = 41,
    LLAMA_VOCAB_PRE_TYPE_AFMOE           = 42,
-    LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN      = 43,
-    LLAMA_VOCAB_PRE_TYPE_YOUTU           = 44,
 };

 struct LLM_KV;
--- a/llama/llama.cpp/src/llama.cpp
+++ b/llama/llama.cpp/src/llama.cpp
@@ -71,9 +71,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
    }, &ud);

    llama_model_params mparams_copy = *mparams;
-    mparams_copy.no_alloc  = true;
-    mparams_copy.use_mmap  = false;
-    mparams_copy.use_mlock = false;
+    mparams_copy.no_alloc = true;
+    mparams_copy.use_mmap = false;

    llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
    if (model == nullptr) {
@@ -111,20 +110,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
        }
    }
    for (size_t i = 0; i < ret.size(); i++) {
-        size_t free;
-        size_t total;
+        size_t free, total;
        ggml_backend_dev_memory(model->devices[i], &free, &total);
-
-        // devices can return 0 bytes for free and total memory if they do not
-        // have any to report. in this case, we will use the host memory as a fallback
-        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
-        if (free == 0 && total == 0) {
-            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-            if (cpu_dev == nullptr) {
-                throw std::runtime_error(format("%s: no CPU backend found", __func__));
-            }
-            ggml_backend_dev_memory(cpu_dev, &free, &total);
-        }
        ret[i].free  = free;
        ret[i].total = total;
    }
@@ -152,15 +139,12 @@ enum layer_fraction_t {
 };
 // this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue

-class llama_params_fit_exception : public std::runtime_error {
-    using std::runtime_error::runtime_error;
-};
-
 static void llama_params_fit_impl(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    constexpr int64_t MiB = 1024*1024;
+    const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
    typedef std::vector<llama_device_memory_data> dmds_t;
    const llama_model_params default_mparams = llama_model_default_params();

@@ -179,12 +163,6 @@ static void llama_params_fit_impl(
        return;
    }

-    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
-    margins.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        margins.push_back(margins_s[id]);
-    }
-
    std::vector<std::string> dev_names;
    {
        dev_names.reserve(nd);
@@ -202,12 +180,11 @@ static void llama_params_fit_impl(
        }
    }

-    int64_t sum_free            = 0;
-    int64_t sum_projected_free  = 0;
-    int64_t sum_projected_used  = 0;
-    int64_t sum_projected_model = 0;
-    std::vector<int64_t> projected_free_per_device;
-    projected_free_per_device.reserve(nd);
+    int64_t sum_total          = 0;
+    int64_t sum_projected_free = 0;
+    int64_t min_projected_free = INT64_MAX;
+    int64_t sum_projected_used = 0;
+    int64_t sum_projected_ctx  = 0;

    if (nd > 1) {
        LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -217,98 +194,59 @@ static void llama_params_fit_impl(

        const int64_t projected_used = dmd.mb.total();
        const int64_t projected_free = dmd.free - projected_used;
-        projected_free_per_device.push_back(projected_free);

-        sum_free            += dmd.free;
-        sum_projected_used  += projected_used;
-        sum_projected_free  += projected_free;
-        sum_projected_model += dmd.mb.model;
+        sum_total          += dmd.total;
+        sum_projected_used += projected_used;
+        sum_projected_free += projected_free;
+        min_projected_free  = std::min(min_projected_free, projected_free);
+        sum_projected_ctx  += dmd.mb.context;

        if (nd > 1) {
-            LLAMA_LOG_INFO("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
-                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
+            LLAMA_LOG_INFO("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
+                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
+                projected_free >= 0 ? "surplus" : "deficit");
        }
    }
-    assert(sum_free >= 0 && sum_projected_used >= 0);
+    assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
+    assert(sum_projected_used >= sum_projected_ctx);
    LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-        __func__, sum_projected_used/MiB, sum_free/MiB);
-    if (nd == 1) {
-        if (projected_free_per_device[0] >= margins[0]) {
+        __func__, sum_projected_used/MiB, sum_total/MiB);
+    if (min_projected_free >= margin) {
+        if (nd == 1) {
            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
-            return;
-        }
-    } else {
-        bool changes_needed = false;
-        for (size_t id = 0; id < nd; id++) {
-            if (projected_free_per_device[id] < margins[id]) {
-                changes_needed = true;
-                break;
-            }
-        }
-        if (!changes_needed) {
-            LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
+                __func__, min_projected_free/MiB, margin/MiB);
            return;
        }
+        LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
+            __func__, min_projected_free/MiB, margin/MiB);
+        return;
    }

    // step 2: try reducing memory use by reducing the context size

    {
-        int64_t global_surplus = sum_projected_free;
-        for (size_t id = 0; id < nd; id++) {
-            global_surplus -= margins[id];
-        }
+        int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
        if (global_surplus < 0) {
-            if (nd == 1) {
-                LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
-                    __func__, margins[0]/MiB, -global_surplus/MiB);
-            } else {
-                LLAMA_LOG_INFO(
-                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
-                    __func__, -global_surplus/MiB);
-            }
+            LLAMA_LOG_INFO(nd == 1 ?
+                "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
+                "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
+                __func__, margin/MiB, -global_surplus/MiB);
            if (cparams->n_ctx == 0) {
                if (hp_nct > n_ctx_min) {
-                    int64_t sum_used_target = sum_free;
-                    for (size_t id = 0; id < nd; id++) {
-                        sum_used_target -= margins[id];
-                    }
-                    if (nd > 1) {
-                        // for multiple devices we need to be more conservative in terms of how much context we think can fit:
-                        //   - for dense models only whole layers can be assigned to devices
-                        //   - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
-                        //   - on average we expect a waste of 0.5 layers/tensors per device
-                        //   - use slightly more than the expected average for nd devices to be safe
-                        const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
-                        sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
-                    }
-
-                    int64_t sum_projected_used_min_ctx = 0;
-                    cparams->n_ctx = n_ctx_min;
-                    const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-                    for (const auto & dmd : dmds_min_ctx) {
-                        sum_projected_used_min_ctx += dmd.mb.total();
-                    }
-                    if (sum_used_target > sum_projected_used_min_ctx) {
-                        // linear interpolation between minimum and maximum context size:
-                        cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
-                            / (sum_projected_used - sum_projected_used_min_ctx);
-                        cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
-
-                        const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
-                        const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
-                        LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
-                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
+                    const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
+                    const uint32_t ctx_reduction = std::min(
+                        uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
+                    cparams->n_ctx = hp_nct - ctx_reduction;
+                    const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
+                    global_surplus += memory_reduction;
+                    LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                        __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
+                    if (global_surplus >= 0) {
                        if (nd == 1) {
                            LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
                            return;
                        }
                        LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
-                    } else {
-                        const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
-                        LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
-                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                    }
                } else {
                    LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
@@ -321,28 +259,32 @@ static void llama_params_fit_impl(
    }

    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
-        throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
+        throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
    }
    if (nd > 1) {
        if (!tensor_split) {
-            throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
+            throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
        }
        if (mparams->tensor_split) {
            for (size_t id = 0; id < nd; id++) {
                if (mparams->tensor_split[id] != 0.0f) {
-                    throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
+                    throw std::runtime_error("model_params::tensor_split already set by user, abort");
                }
            }
        }
        if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
+            throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
+        }
+        if (hp_ngl < 2*nd) {
+            throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
+                + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
        }
    }
    if (!tensor_buft_overrides) {
-        throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
+        throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
    }
    if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
-        throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
+        throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
    }

    // step 3: iteratively fill the back to front with "dense" layers
@@ -395,11 +337,6 @@ static void llama_params_fit_impl(

        // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
        layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
-
-        uint32_t n_full() const {
-            assert(n_layer >= n_part);
-            return n_layer - n_part;
-        }
    };

    const size_t ntbo = llama_max_tensor_buft_overrides();
@@ -408,7 +345,8 @@ static void llama_params_fit_impl(
    auto set_ngl_tensor_split_tbo = [&](
            const std::vector<ngl_t> & ngl_per_device,
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
-            llama_model_params & mparams) {
+            llama_model_params & mparams,
+            const bool add_nonrepeating) {
        mparams.n_gpu_layers = 0;
        for (size_t id = 0; id < nd; id++) {
            mparams.n_gpu_layers += ngl_per_device[id].n_layer;
@@ -416,25 +354,29 @@ static void llama_params_fit_impl(
                tensor_split[id] = ngl_per_device[id].n_layer;
            }
        }
-        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
-        uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
+        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
+        uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides

+        if (add_nonrepeating) {
+            mparams.n_gpu_layers += 1;
+            tensor_split[nd - 1] += 1;
+        }
        mparams.tensor_split = tensor_split;

        size_t itbo = 0;
        for (size_t id = 0; id < nd; id++) {
-            il0 += ngl_per_device[id].n_full();
+            il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
            for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
                if (itbo + 1 >= ntbo) {
                    tensor_buft_overrides[itbo].pattern = nullptr;
                    tensor_buft_overrides[itbo].buft    = nullptr;
                    itbo++;
                    mparams.tensor_buft_overrides = tensor_buft_overrides;
-                    throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
-                        + std::to_string(ntbo) + " is insufficient for model");
+                    throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
+                        + std::to_string(ntbo) + " is insufficient for model\n");
                }
                tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
-                tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
+                tensor_buft_overrides[itbo].buft = overflow_bufts[id];
                itbo++;
            }
            il0 += ngl_per_device[id].n_part;
@@ -449,9 +391,10 @@ static void llama_params_fit_impl(
    auto get_memory_for_layers = [&](
            const char * func_name,
            const std::vector<ngl_t> & ngl_per_device,
-            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
+            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
+            const bool add_nonrepeating) -> std::vector<int64_t> {
        llama_model_params mparams_copy = *mparams;
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);

        const dmds_t dmd_nl = llama_get_device_memory_data(
            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
@@ -484,9 +427,9 @@ static void llama_params_fit_impl(
        const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

-        for (size_t id = 0; id < nd; id++) {
-            global_surplus_cpu_moe += dmds_cpu_moe[id].free;
-            global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
+        for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
+            global_surplus_cpu_moe += dmd.free;
+            global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
        }

        if (global_surplus_cpu_moe > 0) {
@@ -505,18 +448,27 @@ static void llama_params_fit_impl(
    std::vector<int64_t> targets; // maximum acceptable memory use per device
    targets.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
-        targets.push_back(dmds_full[id].free - margins[id]);
+        targets.push_back(dmds_full[id].free - margin);
        LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
    }

-    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
+    // whether for the optimal memory use we expect to load at least some MoE tensors:
+    const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
+
+    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
    overflow_bufts.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
+    for (size_t id = 0; id < nd - 1; ++id) {
+        overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
    }
+    overflow_bufts.push_back(ggml_backend_cpu_buffer_type());

    std::vector<ngl_t> ngl_per_device(nd);
-    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
+    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
+    if (hp_nex > 0) {
+        for (size_t id = 0; id < nd; id++) {
+            ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
+        }
+    }

    // optimize the number of layers per device using the method of false position:
    //   - ngl_per_device has 0 layers for each device, lower bound
@@ -524,30 +476,22 @@ static void llama_params_fit_impl(
    //   - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
    //   - check memory use of our guess, replace either the low or high bound
    //   - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
-    //   - the last device has the output layer, which cannot be a partial layer
    if (hp_nex == 0) {
        LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
    } else {
        LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
    }
+    uint32_t n_unassigned = hp_ngl;
    for (int id = nd - 1; id >= 0; id--) {
-        uint32_t n_unassigned = hp_ngl + 1;
-        for (size_t jd = id + 1; jd < nd; ++jd) {
-            assert(n_unassigned >= ngl_per_device[jd].n_layer);
-            n_unassigned -= ngl_per_device[jd].n_layer;
-        }
-
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        ngl_per_device_high[id].n_layer = n_unassigned;
        if (hp_nex > 0) {
-            ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
+            ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
        }
        if (ngl_per_device_high[id].n_layer > 0) {
-            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
+            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
            if (mem_high[id] > targets[id]) {
-                assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-                LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
                while (delta > 1) {
                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                    step_size = std::max(step_size, uint32_t(1));
@@ -556,26 +500,25 @@ static void llama_params_fit_impl(
                    std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                    ngl_per_device_test[id].n_layer += step_size;
                    if (hp_nex) {
-                        ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
-                            step_size - 1 : step_size; // the first layer is the output layer which must always be full
+                        ngl_per_device_test[id].n_part += step_size;
                    }
-                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);

                    if (mem_test[id] <= targets[id]) {
-                        ngl_per_device = ngl_per_device_test;
-                        mem            = mem_test;
+                        ngl_per_device  = ngl_per_device_test;
+                        mem             = mem_test;
+                        n_unassigned   -= ngl_per_device[id].n_layer;
                        LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                    } else {
                        ngl_per_device_high = ngl_per_device_test;
                        mem_high            = mem_test;
-                        LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
+                        LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                    }
                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                }
            } else {
-                assert(ngl_per_device_high[id].n_layer == n_unassigned);
-                ngl_per_device = ngl_per_device_high;
-                mem            = mem_high;
+                ngl_per_device  = ngl_per_device_high;
+                n_unassigned   -= ngl_per_device[id].n_layer;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
            }
        }
@@ -586,7 +529,7 @@ static void llama_params_fit_impl(
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
    }
    if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
        return;
    }

@@ -606,20 +549,24 @@ static void llama_params_fit_impl(
    assert(id_dense_start < nd);

    LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
-    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
+    for (size_t id = 0; id <= id_dense_start; id++) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        for (size_t jd = id_dense_start; jd < nd; jd++) {
-            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
+            const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
            ngl_per_device_high[id].n_layer += n_layer_move;
            ngl_per_device_high[jd].n_layer -= n_layer_move;
            ngl_per_device_high[jd].n_part = 0;
        }
        size_t id_dense_start_high = nd - 1;
-        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
+        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);

        if (mem_high[id] > targets[id]) {
-            assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
-            uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
+            assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
+            assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
+            assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
+                   >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+            uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
+                - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
            while (delta > 1) {
                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                step_size = std::max(step_size, uint32_t(1));
@@ -635,11 +582,11 @@ static void llama_params_fit_impl(
                    ngl_per_device_test[id].n_layer += n_convert_jd;
                    n_converted_test += n_convert_jd;

-                    if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
+                    if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
                        break;
                    }
                }
-                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);

                if (mem_test[id] <= targets[id]) {
                    ngl_per_device = ngl_per_device_test;
@@ -654,38 +601,32 @@ static void llama_params_fit_impl(
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                }
-                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
-                delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
+                delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
+                    - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
            }
        } else {
            ngl_per_device = ngl_per_device_high;
-            mem            = mem_high;
            id_dense_start = id_dense_start_high;
            LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
        }

        // try to fit at least part of one more layer
-        if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
+        if (ngl_per_device[id_dense_start].n_layer > 0) {
            std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
            size_t id_dense_start_test = id_dense_start;
            ngl_per_device_test[id_dense_start_test].n_layer--;
            ngl_per_device_test[id_dense_start_test].n_part--;
            ngl_per_device_test[id].n_layer++;
            ngl_per_device_test[id].n_part++;
-            if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
+            if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
                id_dense_start_test++;
            }
            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
-            std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
-            if (id < nd - 1) {
-                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
-            }
            LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
+            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+            if (mem_test[id] < targets[id]) {
                ngl_per_device = ngl_per_device_test;
-                overflow_bufts = overflow_bufts_test;
                mem            = mem_test;
                id_dense_start = id_dense_start_test;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
@@ -693,10 +634,9 @@ static void llama_params_fit_impl(

                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
+                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+                if (mem_test[id] < targets[id]) {
                    ngl_per_device = ngl_per_device_test;
-                    overflow_bufts = overflow_bufts_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
@@ -705,10 +645,9 @@ static void llama_params_fit_impl(
            } else {
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
+                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+                if (mem_test[id] < targets[id]) {
                    ngl_per_device = ngl_per_device_test;
-                    overflow_bufts = overflow_bufts_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
@@ -723,41 +662,30 @@ static void llama_params_fit_impl(
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }

-    // print info for devices that were not changed during the conversion from dense only to full layers:
-    for (size_t id = id_dense_start + 1; id < nd; id++) {
-        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LLAMA_LOG_INFO(
-            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
-            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
-    }
-
-    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
+    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
 }

-enum llama_params_fit_status llama_params_fit(
+bool llama_params_fit(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    const int64_t t0_us = llama_time_us();
-    llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
+    bool ok = true;
    try {
-        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
+        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
        LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
-    } catch (const llama_params_fit_exception & e) {
-        LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
-        status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
    } catch (const std::runtime_error & e) {
-        LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
-        status = LLAMA_PARAMS_FIT_STATUS_ERROR;
+        LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
+        ok = false;
    }
    const int64_t t1_us = llama_time_us();
    LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
-    return status;
+    return ok;
 }

 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
-        /*.no_perf =*/ true,
+        /*.no_perf                     =*/ true,
    };

    return result;
@@ -830,7 +758,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
    model.t_start_us = tm.t_start_us;

    try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);

        ml.print_info();

--- a/llama/llama.cpp/src/models/afmoe.cpp
+++ b/llama/llama.cpp/src/models/afmoe.cpp
@@ -22,15 +22,8 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));

    for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
        ggml_tensor * inpSA = inpL;

-        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
-        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
-                              (il + 1) % hparams.n_no_rope_layer_step != 0;
-
        // dual attention normalization (pre)
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
@@ -63,16 +56,19 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
            cb(Qcur, "Qcur_normed", il);
            cb(Kcur, "Kcur_normed", il);

+            // RoPE only for sliding_attention layers
+            const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+                                ((il + 1) % hparams.n_no_rope_layer_step) != 0;
            if (use_rope) {
                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
                cb(Qcur, "Qcur_rope", il);

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
                cb(Kcur, "Kcur_rope", il);
            }
--- a/llama/llama.cpp/src/models/bert.cpp
+++ b/llama/llama.cpp/src/models/bert.cpp
@@ -142,13 +142,11 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
            cb(cur, "ffn_out", il);
        } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
-            const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
-            auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
            cur = build_ffn(cur,
-                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    model.layers[il].ffn_up, NULL, NULL,
                    model.layers[il].ffn_gate, NULL, NULL,
                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
-                    type_op, LLM_FFN_PAR, il);
+                    model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            cur = build_ffn(cur,
--- a/llama/llama.cpp/src/models/cogvlm.cpp
+++ b/llama/llama.cpp/src/models/cogvlm.cpp
@@ -3,14 +3,12 @@
 llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
-    const float   kq_scale    = 1.0f / sqrtf(float(n_embd_head));
+    float         kq_scale    = 1.0f / sqrtf(float(n_embd_head));

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    GGML_ASSERT(n_embd_head == hparams.n_rot);

-    ggml_tensor * inpL;
-    ggml_tensor * cur;
-
+    ggml_tensor *inpL, *cur;
    inpL = build_inp_embd(model.tok_embd);

    ggml_tensor * inp_pos = build_inp_pos();
@@ -46,7 +44,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
        }

        ggml_tensor * inpSA = inpL;
-        cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cur                 = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);

        // build self attention
        {
--- a/llama/llama.cpp/src/models/cohere2-iswa.cpp
+++ b/llama/llama.cpp/src/models/cohere2-iswa.cpp
@@ -21,9 +21,6 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const

    for (int il = 0; il < n_layer; ++il) {
        const bool is_swa = hparams.is_swa(il);
-        // UNUSED:
-        // const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        // const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

        // norm
        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
--- a/llama/llama.cpp/src/models/deepseek2.cpp
+++ b/llama/llama.cpp/src/models/deepseek2.cpp
@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                model.layers[il].ffn_exp_probs_b,
                n_expert, n_expert_used,
                LLM_FFN_SILU, hparams.expert_weights_norm,
-                hparams.expert_weights_scale, hparams.expert_weights_scale,
+                true, hparams.expert_weights_scale,
                (llama_expert_gating_func_type) hparams.expert_gating_func,
                il);
            cb(moe_out, "ffn_moe_out", il);
--- a/llama/llama.cpp/src/models/gemma-embedding.cpp
+++ b/llama/llama.cpp/src/models/gemma-embedding.cpp
@@ -1,5 +1,7 @@
 #include "models.h"

+
+
 llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -10,8 +12,10 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
    inpL = build_inp_embd(model.tok_embd);

    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
-    cb(inpL, "inp_scaled", -1);
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+    }

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();
--- a/llama/llama.cpp/src/models/gemma2-iswa.cpp
+++ b/llama/llama.cpp/src/models/gemma2-iswa.cpp
@@ -19,9 +19,6 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
@@ -46,12 +43,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll

            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);

            Kcur = ggml_rope_ext(
                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);

            cb(Qcur, "Qcur", il);
--- a/llama/llama.cpp/src/models/gemma3.cpp
+++ b/llama/llama.cpp/src/models/gemma3.cpp
@@ -10,9 +10,10 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
    inpL = build_inp_embd(model.tok_embd);

    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
-    cb(inpL, "inp_scaled", -1);
-
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+    }
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

--- a/llama/llama.cpp/src/models/gemma3n-iswa.cpp
+++ b/llama/llama.cpp/src/models/gemma3n-iswa.cpp
@@ -1,5 +1,7 @@
 #include "models.h"

+
+
 llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params),
    model(model),
@@ -13,9 +15,10 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
    inpL = build_inp_embd(model.tok_embd);

    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
-    cb(inpL, "inp_scaled", -1);
-
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+    }
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

@@ -245,7 +248,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
 // equivalent to get_per_layer_inputs() in python code
 // output shape: [n_embd_altup, n_layer, n_tokens]
 ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
-    auto inp = std::make_unique<llm_graph_input_embd>();
+    auto          inp = std::make_unique<llm_graph_input_embd>();
    ggml_tensor * inp_per_layer;
    if (ubatch.token) {
        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
@@ -255,20 +258,10 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
        inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
        cb(inp_per_layer, "inp_per_layer_selected", -1);
-        res->add_input(std::move(inp));
    } else {
-        // Vision embedding path: use padding token (ID=0) embedding
-        const int64_t embd_size = model.tok_embd_per_layer->ne[0];  // n_embd_altup * n_layer
-
-        // Extract and dequantize padding token embedding (column 0)
-        ggml_tensor * padding_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
-        ggml_tensor * padding_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size);
-        inp_per_layer = ggml_cpy(ctx0, padding_q, padding_f32);
-
-        // Reshape to [n_embd_altup, n_layer, 1]
-        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
-        cb(inp_per_layer, "inp_per_layer_vision", -1);
+        GGML_ABORT("TODO: support embd input");
    }
+    res->add_input(std::move(inp));
    return inp_per_layer;
 }

@@ -286,7 +279,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
                                              -1);  // [n_embd_altup, n_layer, n_tokens]
    cb(per_layer_proj, "per_layer_proj", -1);

-    inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
+    inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
    inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
    cb(inp_per_layer, "inp_per_layer", -1);

--- a/llama/llama.cpp/src/models/llama-iswa.cpp
+++ b/llama/llama.cpp/src/models/llama-iswa.cpp
@@ -25,12 +25,8 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
        ggml_tensor * inpSA = inpL;

-        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
                              (il + 1) % hparams.n_no_rope_layer_step != 0;

@@ -71,13 +67,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
            if (use_rope) {
                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, rope_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, rope_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );
            } else if (inp_attn_scale) {
--- a/llama/llama.cpp/src/models/llama.cpp
+++ b/llama/llama.cpp/src/models/llama.cpp
@@ -1,7 +1,6 @@
 #include "models.h"

-template <bool embed>
-llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -15,14 +14,7 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

-    using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>;
-
-    inp_attn_type * inp_attn = nullptr;
-    if constexpr (embed) {
-        inp_attn = build_attn_inp_no_cache();
-    } else {
-        inp_attn = build_attn_inp_kv();
-    }
+    auto * inp_attn = build_attn_inp_kv();

    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -153,16 +145,11 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
    cb(cur, "result_norm", -1);
    res->t_embd = cur;

-    if constexpr (!embed) {
-        // lm_head
-        cur = build_lora_mm(model.output, cur);
+    // lm_head
+    cur = build_lora_mm(model.output, cur);

-        cb(cur, "result_output", -1);
-        res->t_logits = cur;
-    }
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
 }
-
-template struct llm_build_llama<false>;
-template struct llm_build_llama<true>;
--- a/llama/llama.cpp/src/models/maincoder.cpp
+++ b/llama/llama.cpp/src/models/maincoder.cpp
@@ -1,117 +0,0 @@
-#include "models.h"
-
-llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
-
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-    ggml_tensor * cur;
-    ggml_tensor * inpL;
-
-    inpL = build_inp_embd(model.tok_embd);
-
-    // inp_pos - contains the positions
-    ggml_tensor * inp_pos = build_inp_pos();
-
-    auto * inp_attn = build_attn_inp_kv();
-
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-    for (int il = 0; il < n_layer; ++il) {
-        ggml_tensor * inpSA = inpL;
-
-        // norm
-        cur = build_norm(inpL,
-                model.layers[il].attn_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "attn_norm", il);
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            cb(Qcur, "Qcur", il);
-
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            cb(Kcur, "Kcur", il);
-
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            cb(Vcur, "Vcur", il);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-            Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-
-            Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-
-            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
-            cb(Qcur, "Qcur_normed", il);
-
-            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
-            cb(Kcur, "Kcur_normed", il);
-
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
-            cur = build_attn(inp_attn,
-                    model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-        }
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
-            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-        }
-        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-        cb(ffn_inp, "ffn_inp", il);
-
-        // feed-forward network
-        cur = build_norm(ffn_inp,
-                model.layers[il].ffn_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "ffn_norm", il);
-
-        cur = build_ffn(cur,
-                model.layers[il].ffn_up,   NULL, NULL,
-                model.layers[il].ffn_gate, NULL, NULL,
-                model.layers[il].ffn_down, NULL, NULL,
-                NULL,
-                LLM_FFN_SILU, LLM_FFN_PAR, il);
-        cb(cur, "ffn_out", il);
-
-        cur = ggml_add(ctx0, cur, ffn_inp);
-
-        cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
-
-        // input for next layer
-        inpL = cur;
-    }
-    cur = inpL;
-
-    cur = build_norm(cur,
-            model.output_norm, NULL,
-            LLM_NORM_RMS, -1);
-
-    cb(cur, "result_norm", -1);
-    res->t_embd = cur;
-
-    // lm_head
-    cur = build_lora_mm(model.output, cur);
-
-    cb(cur, "result_output", -1);
-    res->t_logits = cur;
-
-    ggml_build_forward_expand(gf, cur);
-}
--- a/llama/llama.cpp/src/models/mimo2-iswa.cpp
+++ b/llama/llama.cpp/src/models/mimo2-iswa.cpp
@@ -1,123 +0,0 @@
-
-#include "models.h"
-
-llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    ggml_tensor * cur;
-    ggml_tensor * inpL;
-
-    inpL = build_inp_embd(model.tok_embd);
-
-    ggml_tensor * inp_pos = build_inp_pos();
-    auto * inp_attn = build_attn_inp_kv_iswa();
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-    for (int il = 0; il < n_layer; ++il) {
-        ggml_tensor * inpSA = inpL;
-
-        uint32_t n_head_l    = hparams.n_head(il);
-        uint32_t n_head_kv_l = hparams.n_head_kv(il);
-        const float freq_base_l  = model.get_rope_freq_base(cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
-        cur = inpL;
-
-        // self_attention
-        {
-            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-            cb(cur, "attn_norm", il);
-
-            // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            cb(Qcur, "Qcur", il);
-
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            cb(Kcur, "Kcur", il);
-
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            cb(Vcur, "Vcur", il);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l,    n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
-
-            Qcur = ggml_rope_ext(
-                ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow
-                );
-
-            Kcur = ggml_rope_ext(
-                ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow
-                );
-
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
-            ggml_tensor * sinks = model.layers[il].attn_sinks;
-
-            cur = build_attn(inp_attn,
-                    model.layers[il].wo, NULL,
-                    Qcur, Kcur, Vcur, nullptr, sinks, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il);
-        }
-
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
-            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-        }
-
-        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-        cb(ffn_inp, "ffn_inp", il);
-
-        cur = build_norm(ffn_inp,
-                model.layers[il].ffn_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "ffn_norm", il);
-
-        // feed-forward network
-        if (model.layers[il].ffn_gate_inp == nullptr) {
-            // dense branch
-            cur = build_ffn(cur,
-                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
-                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
-                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, il);
-            cb(cur, "ffn_out", il);
-        } else {
-            // MoE branch
-            cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
-                                model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
-                                model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
-                                0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
-            cb(cur, "ffn_moe_out", il);
-        }
-
-        cur = ggml_add(ctx0, cur, ffn_inp);
-
-        cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
-
-        // input for next layer
-        inpL = cur;
-    }
-
-    cur = inpL;
-
-    cur = build_norm(cur,
-            model.output_norm, NULL,
-            LLM_NORM_RMS, -1);
-
-    cb(cur, "result_norm", -1);
-    res->t_embd = cur;
-
-    // lm_head
-    cur = build_lora_mm(model.output, cur);
-
-    cb(cur, "result_output", -1);
-    res->t_logits = cur;
-
-    ggml_build_forward_expand(gf, cur);
-}
--- a/llama/llama.cpp/src/models/models.h
+++ b/llama/llama.cpp/src/models/models.h
@@ -303,7 +303,6 @@ struct llm_build_llada_moe : public llm_graph_context {
    llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
 };

-template <bool embed>
 struct llm_build_llama : public llm_graph_context {
    llm_build_llama(const llama_model & model, const llm_graph_params & params);
 };
@@ -312,18 +311,10 @@ struct llm_build_llama_iswa : public llm_graph_context {
    llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
 };

-struct llm_build_maincoder : public llm_graph_context {
-    llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
-};
-
 struct llm_build_mamba : public llm_graph_context_mamba {
    llm_build_mamba(const llama_model & model, const llm_graph_params & params);
 };

-struct llm_build_mimo2_iswa : public llm_graph_context {
-    llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params);
-};
-
 struct llm_build_minicpm3 : public llm_graph_context {
    llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
 };
@@ -336,10 +327,6 @@ struct llm_build_mistral3 : public llm_graph_context {
    llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
 };

-struct llm_build_modern_bert : public llm_graph_context {
-    llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
-};
-
 struct llm_build_mpt : public llm_graph_context {
    llm_build_mpt(const llama_model & model, const llm_graph_params & params);
 };
@@ -409,11 +396,6 @@ struct llm_build_plamo : public llm_graph_context {
    llm_build_plamo(const llama_model & model, const llm_graph_params & params);
 };

-template <bool iswa>
-struct llm_build_plamo3 : public llm_graph_context {
-    llm_build_plamo3(const llama_model & model, const llm_graph_params & params);
-};
-
 struct llm_build_plm : public llm_graph_context {
    llm_build_plm(const llama_model & model, const llm_graph_params & params);
 };
--- a/llama/llama.cpp/src/models/modern-bert.cpp
+++ b/llama/llama.cpp/src/models/modern-bert.cpp
@@ -1,116 +0,0 @@
-#include "models.h"
-
-llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
-    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
-
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-    ggml_tensor * cur;
-    ggml_tensor * inpL;
-    ggml_tensor * inp_pos = build_inp_pos();
-
-    // construct input embeddings (token, type, position)
-    inpL = build_inp_embd(model.tok_embd);
-    cb(inpL, "inp_embd", -1);
-
-    // embed layer norm
-    inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1);
-    cb(inpL, "inp_norm", -1);
-
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-    auto * inp_attn = build_attn_inp_no_cache();
-
-    for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base(cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
-        cur = inpL;
-
-        // attention layer norm
-        if (model.layers[il].attn_norm) {
-            cur = build_norm(inpL,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM, il);
-            cb(cur, "attn_norm", il);
-        }
-
-        // self attention
-        cur = build_lora_mm(model.layers[il].wqkv, cur);
-        cb(cur, "wqkv", il);
-
-        const size_t type_size = ggml_type_size(cur->type);
-
-        ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd));
-        ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd));
-        ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa));
-
-        // RoPE
-        Qcur = ggml_rope_ext(
-                ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow
-                );
-
-        Kcur = ggml_rope_ext(
-                ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow
-                );
-
-        cb(Qcur, "Qcur", il);
-        cb(Kcur, "Kcur", il);
-        cb(Vcur, "Vcur", il);
-
-        cur = build_attn(inp_attn,
-                    model.layers[il].wo, nullptr,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-        cb(cur, "kqv_out", il);
-
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
-            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
-        }
-
-        // re-add the layer input
-        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
-        cb(ffn_inp, "ffn_inp", il);
-
-        // attention layer norm
-        cur = build_norm(ffn_inp,
-                model.layers[il].ffn_norm, NULL,
-                LLM_NORM, il);
-        cb(cur, "ffn_norm", il);
-
-        cur = build_ffn(cur,
-                model.layers[il].ffn_up,   NULL, NULL,
-                NULL,                      NULL, NULL,
-                model.layers[il].ffn_down, NULL, NULL,
-                NULL,
-                LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
-
-        // attentions bypass the intermediate layer
-        cur = ggml_add(ctx0, cur, ffn_inp);
-
-        // input for next layer
-        inpL = cur;
-    }
-
-    cur = inpL;
-
-    cur = build_norm(cur,
-            model.output_norm, NULL,
-            LLM_NORM, -1);
-    cb(cur, "final_norm_out", -1);
-
-    if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
-        // extracting cls token
-        cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
-        cb(cur, "cls_pooled_embd", -1);
-    }
-
-    cb(cur, "res_embd", -1);
-    res->t_embd = cur;
-    ggml_build_forward_expand(gf, cur);
-}
--- a/llama/llama.cpp/src/models/openai-moe-iswa.cpp
+++ b/llama/llama.cpp/src/models/openai-moe-iswa.cpp
@@ -14,9 +14,6 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
        ggml_tensor * inpSA = inpL;

        // norm
@@ -52,13 +49,13 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,

            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );

            Kcur = ggml_rope_ext(
                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );

--- a/llama/llama.cpp/src/models/plamo3.cpp
+++ b/llama/llama.cpp/src/models/plamo3.cpp
@@ -1,128 +0,0 @@
-#include "models.h"
-
-template <bool iswa>
-llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
-    llm_graph_context(params) {
-    const int64_t head_dim_q = hparams.n_embd_head_k;
-    const int64_t head_dim_v = hparams.n_embd_head_v;
-
-    ggml_tensor * cur;
-    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
-    ggml_tensor * inp_pos = build_inp_pos();
-
-    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
-    inp_attn_type * inp_attn = nullptr;
-
-    if constexpr (iswa) {
-        inp_attn = build_attn_inp_kv_iswa();
-    } else {
-        inp_attn = build_attn_inp_kv();
-    }
-
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-    for (int il = 0; il < n_layer; ++il) {
-        ggml_tensor * residual = inpL;
-
-        float freq_base_l  = 0.0f;
-        float freq_scale_l = 0.0f;
-        if constexpr (iswa) {
-            freq_base_l  = model.get_rope_freq_base (cparams, il);
-            freq_scale_l = model.get_rope_freq_scale(cparams, il);
-        } else {
-            freq_base_l  = freq_base;
-            freq_scale_l = freq_scale;
-        }
-
-        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "attn_norm", il);
-
-        ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
-        cb(cur, "wqkv", il);
-
-        const int32_t n_head    = hparams.n_head(il);
-        const int32_t n_head_kv = hparams.n_head_kv(il);
-
-        const int64_t q_offset = 0;
-        const int64_t k_offset = head_dim_q * n_head;
-        const int64_t v_offset = k_offset + head_dim_q * n_head_kv;
-
-        ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head, n_tokens,
-                head_dim_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
-        ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head_kv, n_tokens,
-                head_dim_q * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
-        ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim_v, n_head_kv, n_tokens,
-                head_dim_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));
-
-        cb(Qcur, "Qcur", il);
-        cb(Kcur, "Kcur", il);
-        cb(Vcur, "Vcur", il);
-
-        Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
-        cb(Qcur, "attn_q_norm", il);
-        Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
-        cb(Kcur, "attn_k_norm", il);
-
-        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow);
-        Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow);
-
-        const float attn_scale = 1.0f / sqrtf(float(head_dim_q));
-
-        cur = build_attn(inp_attn,
-                model.layers[il].wo, NULL,
-                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il);
-        cb(cur, "attn_out", il);
-
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur      = ggml_get_rows(ctx0, cur, inp_out_ids);
-            residual = ggml_get_rows(ctx0, residual, inp_out_ids);
-        }
-
-        cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "attn_post_norm", il);
-
-        cur = ggml_add(ctx0, cur, residual);
-        cb(cur, "attn_residual", il);
-
-        residual = cur;
-
-        cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "ffn_norm", il);
-
-        cur = build_ffn(cur,
-                model.layers[il].ffn_up,   NULL, NULL,
-                NULL,                      NULL, NULL,
-                model.layers[il].ffn_down, NULL, NULL,
-                NULL,
-                LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
-        cb(cur, "ffn_out", il);
-
-        cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "ffn_post_norm", il);
-
-        cur = ggml_add(ctx0, cur, residual);
-        cb(cur, "ffn_residual", il);
-
-        cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
-        inpL = cur;
-    }
-
-    cur = inpL;
-
-    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
-    res->t_embd = cur;
-
-    cur = build_lora_mm(model.output, cur);
-    res->t_logits = cur;
-
-    ggml_build_forward_expand(gf, cur);
-}
-
-// Explicit template instantiations
-template struct llm_build_plamo3<false>;
-template struct llm_build_plamo3<true>;
--- a/llama/llama.cpp/src/models/smallthinker.cpp
+++ b/llama/llama.cpp/src/models/smallthinker.cpp
@@ -26,16 +26,10 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
        ggml_tensor * inpSA  = inpL;
+        ggml_tensor * probs  = nullptr;

-        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
-        const bool use_rope = hparams.n_no_rope_layer_step == n_layer ||
-                              il % hparams.n_no_rope_layer_step != 0;
-
-        ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL);  // [n_expert, n_tokens]
+        probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL);  // [n_expert, n_tokens]
        cb(probs, "ffn_moe_logits", il);

        // norm
@@ -58,11 +52,11 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

-            if (use_rope) {
-                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+            if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                                    ext_factor, attn_factor, beta_fast, beta_slow);

-                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                                    ext_factor, attn_factor, beta_fast, beta_slow);
            }
            cb(Qcur, "Qcur", il);
--- a/llama/llama.cpp/src/unicode.cpp
+++ b/llama/llama.cpp/src/unicode.cpp
@@ -985,11 +985,6 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
        { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
        { "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
        { "\\p{S}", unicode_cpt_flags::SYMBOL },
-        { "\\p{Lu}", unicode_cpt_flags::LETTER }, // Uppercase letter
-        { "\\p{Ll}", unicode_cpt_flags::LETTER }, // Lowercase letter
-        { "\\p{Lt}", unicode_cpt_flags::LETTER }, // Titlecase letter
-        { "\\p{Lm}", unicode_cpt_flags::LETTER }, // Modifier letter
-        { "\\p{Lo}", unicode_cpt_flags::LETTER }, // Other letter
    };

    static const std::map<int, int> k_ucat_cpt = {
@@ -1100,26 +1095,22 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                        continue;
                    }

-                    // Match \p{...} Unicode properties of varying lengths
-                    if (regex_expr[i + 0] == '\\' && i + 3 < regex_expr.size() &&
+                    if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
                        regex_expr[i + 1] == 'p' &&
-                        regex_expr[i + 2] == '{') {
-                        // Find the closing brace
-                        size_t closing_brace = regex_expr.find('}', i + 3);
-                        if (closing_brace != std::string::npos && closing_brace <= i + 10) { // reasonable limit
-                            const std::string pat = regex_expr.substr(i, closing_brace - i + 1);
-                            if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
-                                if (!inside) {
-                                    regex_expr_collapsed += '[';
-                                }
-                                regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
-                                regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
-                                if (!inside) {
-                                    regex_expr_collapsed += ']';
-                                }
-                                i = closing_brace;
-                                continue;
+                        regex_expr[i + 2] == '{' &&
+                        regex_expr[i + 4] == '}') {
+                        const std::string pat = regex_expr.substr(i, 5);
+                        if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
+                            if (!inside) {
+                                regex_expr_collapsed += '[';
                            }
+                            regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
+                            regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
+                            if (!inside) {
+                                regex_expr_collapsed += ']';
+                            }
+                            i += 4;
+                            continue;
                        }
                    }

--- a/llama/llama.cpp/tools/mtmd/clip-impl.h
+++ b/llama/llama.cpp/tools/mtmd/clip-impl.h
@@ -45,14 +45,13 @@
 #define KEY_SPATIAL_MERGE_SIZE  "clip.vision.spatial_merge_size"
 #define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"

-#define KEY_MM_PATCH_MERGE_TYPE    "clip.vision.mm_patch_merge_type"
-#define KEY_IMAGE_GRID_PINPOINTS   "clip.vision.image_grid_pinpoints"
-#define KEY_IMAGE_CROP_RESOLUTION  "clip.vision.image_crop_resolution"
-#define KEY_WIN_ATTN_PATTERN       "clip.vision.n_wa_pattern"
-#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
-#define KEY_ATTN_WINDOW_SIZE       "clip.vision.window_size"
-#define KEY_MINICPMV_VERSION       "clip.minicpmv_version"
-#define KEY_MINICPMV_QUERY_NUM     "clip.minicpmv_query_num"
+#define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
+#define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
+#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
+#define KEY_WIN_ATTN_PATTERN      "clip.vision.n_wa_pattern"
+#define KEY_ATTN_WINDOW_SIZE      "clip.vision.window_size"
+#define KEY_MINICPMV_VERSION      "clip.minicpmv_version"
+#define KEY_MINICPMV_QUERY_NUM    "clip.minicpmv_query_num"

 // audio-specific
 #define KEY_AUDIO_PROJ_TYPE     "clip.audio.projector_type" // for models with mixed modalities
@@ -139,62 +138,6 @@
 #define TN_TOK_BOI         "v.boi"
 #define TN_TOK_EOI         "v.eoi"

-// (conformer) lfm2
-#define TN_PRE_ENCODE_OUT  "a.pre_encode.out.%s"
-#define TN_FFN_NORM        "%s.blk.%d.ffn_norm.%s"
-#define TN_FFN_NORM_1      "%s.blk.%d.ffn_norm_1.%s"
-#define TN_FFN_UP_1        "%s.blk.%d.ffn_up_1.%s"
-#define TN_FFN_DOWN_1      "%s.blk.%d.ffn_down_1.%s"
-#define TN_POS_BIAS_U      "%s.blk.%d.pos_bias_u"
-#define TN_POS_BIAS_V      "%s.blk.%d.pos_bias_v"
-#define TN_NORM_CONV       "%s.blk.%d.norm_conv.%s"
-#define TN_LINEAR_POS      "%s.blk.%d.linear_pos.%s"
-#define TN_CONV_DW         "%s.blk.%d.conv_dw.%s"
-#define TN_CONV_NORM       "%s.blk.%d.conv_norm.%s"
-#define TN_CONV_PW1        "%s.blk.%d.conv_pw1.%s"
-#define TN_CONV_PW2        "%s.blk.%d.conv_pw2.%s"
-
-// mobilenetv5 (gemma3n) definitions
-#define TN_MNV5_STEM_CONV        "v.conv_stem.conv.weight"
-#define TN_MNV5_STEM_BIAS        "v.conv_stem.conv.bias"
-#define TN_MNV5_STEM_BN          "v.conv_stem.bn.weight"
-
-// Stage 0 Block (Edge Residual)
-#define TN_MNV5_BLK_S0_EXP_W     "v.blk.%d.%d.conv_exp.weight"
-#define TN_MNV5_BLK_S0_BN1_W     "v.blk.%d.%d.bn1.weight"
-#define TN_MNV5_BLK_S0_PWL_W     "v.blk.%d.%d.conv_pwl.weight"
-#define TN_MNV5_BLK_S0_BN2_W     "v.blk.%d.%d.bn2.weight"
-
-// Stage 1+ Block (Universal Inverted Residual)
-#define TN_MNV5_BLK_DW_START_W   "v.blk.%d.%d.dw_start.conv.weight"
-#define TN_MNV5_BLK_DW_START_BN  "v.blk.%d.%d.dw_start.bn.weight"
-#define TN_MNV5_BLK_DW_MID_W     "v.blk.%d.%d.dw_mid.conv.weight"
-#define TN_MNV5_BLK_DW_MID_BN    "v.blk.%d.%d.dw_mid.bn.weight"
-#define TN_MNV5_BLK_PW_EXP_W     "v.blk.%d.%d.pw_exp.conv.weight"
-#define TN_MNV5_BLK_PW_EXP_BN    "v.blk.%d.%d.pw_exp.bn.weight"
-#define TN_MNV5_BLK_PW_PROJ_W    "v.blk.%d.%d.pw_proj.conv.weight"
-#define TN_MNV5_BLK_PW_PROJ_BN   "v.blk.%d.%d.pw_proj.bn.weight"
-#define TN_MNV5_BLK_LAYER_SCALE  "v.blk.%d.%d.layer_scale.gamma"
-
-// Attention Components
-#define TN_MNV5_ATTN_Q_W         "v.blk.%d.%d.attn.query.proj.weight"
-#define TN_MNV5_ATTN_K_W         "v.blk.%d.%d.attn.key.proj.weight"
-#define TN_MNV5_ATTN_V_W         "v.blk.%d.%d.attn.value.proj.weight"
-#define TN_MNV5_ATTN_O_W         "v.blk.%d.%d.attn.output.proj.weight"
-#define TN_MNV5_ATTN_K_DW        "v.blk.%d.%d.attn.key.down_conv.weight"
-#define TN_MNV5_ATTN_K_NORM      "v.blk.%d.%d.attn.key.norm.weight"
-#define TN_MNV5_ATTN_V_DW        "v.blk.%d.%d.attn.value.down_conv.weight"
-#define TN_MNV5_ATTN_V_NORM      "v.blk.%d.%d.attn.value.norm.weight"
-#define TN_MNV5_ATTN_NORM        "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
-
-// MSFA
-#define TN_MNV5_MSFA_FFN_EXP_W   "v.msfa.ffn.pw_exp.conv.weight"
-#define TN_MNV5_MSFA_FFN_EXP_BN  "v.msfa.ffn.pw_exp.bn.weight"
-#define TN_MNV5_MSFA_FFN_PROJ_W  "v.msfa.ffn.pw_proj.conv.weight"
-#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
-#define TN_MNV5_MSFA_NORM        "v.msfa.norm.weight"
-
-
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

@@ -212,8 +155,6 @@ enum projector_type {
    PROJECTOR_TYPE_QWEN2VL,
    PROJECTOR_TYPE_QWEN3VL,
    PROJECTOR_TYPE_GEMMA3,
-    PROJECTOR_TYPE_GEMMA3NV,
-    PROJECTOR_TYPE_GEMMA3NA,
    PROJECTOR_TYPE_IDEFICS3,
    PROJECTOR_TYPE_PIXTRAL,
    PROJECTOR_TYPE_QWEN25VL,
@@ -224,15 +165,12 @@ enum projector_type {
    PROJECTOR_TYPE_GLMA,
    PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
    PROJECTOR_TYPE_VOXTRAL,
-    PROJECTOR_TYPE_MUSIC_FLAMINGO,
    PROJECTOR_TYPE_LFM2,
    PROJECTOR_TYPE_KIMIVL,
    PROJECTOR_TYPE_LIGHTONOCR,
    PROJECTOR_TYPE_COGVLM,
    PROJECTOR_TYPE_JANUS_PRO,
-    PROJECTOR_TYPE_LFM2A,
    PROJECTOR_TYPE_GLM4V,
-    PROJECTOR_TYPE_YOUTUVL,
    PROJECTOR_TYPE_UNKNOWN,
 };

@@ -246,8 +184,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_QWEN25VL,  "qwen2.5vl_merger"},
    { PROJECTOR_TYPE_QWEN3VL,   "qwen3vl_merger"},
    { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
-    { PROJECTOR_TYPE_GEMMA3NV,  "gemma3nv"},
-    { PROJECTOR_TYPE_GEMMA3NA,  "gemma3na"},
    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
    { PROJECTOR_TYPE_ULTRAVOX,  "ultravox"},
@@ -257,15 +193,12 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_GLMA,      "glma"},
    { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"},
    { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},
-    { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
    { PROJECTOR_TYPE_LFM2,      "lfm2"},
    { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
    { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
    { PROJECTOR_TYPE_COGVLM,    "cogvlm"},
    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
-    { PROJECTOR_TYPE_LFM2A,     "lfm2a"},
    { PROJECTOR_TYPE_GLM4V,     "glm4v"},
-    { PROJECTOR_TYPE_YOUTUVL,   "youtuvl"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
--- a/llama/llama.cpp/tools/mtmd/clip-model.h
+++ b/llama/llama.cpp/tools/mtmd/clip-model.h
@@ -4,7 +4,6 @@
 #include "clip.h"
 #include "clip-impl.h"

-#include <array>
 #include <vector>
 #include <unordered_set>
 #include <cstdint>
@@ -61,7 +60,6 @@ struct clip_hparams {
    std::unordered_set<int32_t> vision_feature_layer;
    int32_t attn_window_size = 0;
    int32_t n_wa_pattern = 0;
-    std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)

    // audio
    int32_t n_mel_bins = 0; // whisper preprocessor
@@ -144,74 +142,11 @@ struct clip_layer {
    ggml_tensor * deepstack_fc2_w = nullptr;
    ggml_tensor * deepstack_fc2_b = nullptr;

-    // lfm2
-    ggml_tensor * ff_norm_w     = nullptr;
-    ggml_tensor * ff_norm_b     = nullptr;
-    ggml_tensor * ff_norm_1_w   = nullptr;
-    ggml_tensor * ff_norm_1_b   = nullptr;
-    ggml_tensor * ff_up_1_w     = nullptr;
-    ggml_tensor * ff_up_1_b     = nullptr;
-    ggml_tensor * ff_down_1_w   = nullptr;
-    ggml_tensor * ff_down_1_b   = nullptr;
-    ggml_tensor * pos_bias_u    = nullptr;
-    ggml_tensor * pos_bias_v    = nullptr;
-    ggml_tensor * norm_conv_w   = nullptr;
-    ggml_tensor * norm_conv_b   = nullptr;
-    ggml_tensor * linear_pos_w  = nullptr;
-
-    ggml_tensor * conv_norm_w   = nullptr;
-    ggml_tensor * conv_norm_b   = nullptr;
-    ggml_tensor * conv_dw_w     = nullptr;
-    ggml_tensor * conv_dw_b     = nullptr;
-    ggml_tensor * conv_pw1_w    = nullptr;
-    ggml_tensor * conv_pw1_b    = nullptr;
-    ggml_tensor * conv_pw2_w    = nullptr;
-    ggml_tensor * conv_pw2_b    = nullptr;
-
    bool has_deepstack() const {
        return deepstack_fc1_w != nullptr;
    }
 };

-// Expanded MobileNetV5 block structure for Gemma3n vision encoder
-struct mobilenetv5_block {
-    // Stage 0 (Edge Residual)
-    ggml_tensor * s0_conv_exp_w = nullptr;
-    ggml_tensor * s0_bn1_w      = nullptr;
-    ggml_tensor * s0_conv_pwl_w = nullptr;
-    ggml_tensor * s0_bn2_w      = nullptr;
-
-    // Stage 1+ (Universal Inverted Residual)
-    ggml_tensor * dw_start_w    = nullptr;
-    ggml_tensor * dw_start_bn_w = nullptr;
-
-    ggml_tensor * pw_exp_w      = nullptr;
-    ggml_tensor * pw_exp_bn_w   = nullptr;
-
-    ggml_tensor * dw_mid_w      = nullptr;
-    ggml_tensor * dw_mid_bn_w   = nullptr;
-
-    ggml_tensor * pw_proj_w     = nullptr;
-    ggml_tensor * pw_proj_bn_w  = nullptr;
-
-    ggml_tensor * layer_scale_w = nullptr;
-
-    // Attention (MQA) components
-    ggml_tensor * attn_q_w = nullptr;
-    ggml_tensor * attn_k_w = nullptr;
-    ggml_tensor * attn_v_w = nullptr;
-    ggml_tensor * attn_o_w = nullptr;
-
-    // Optional downsampling/norm in attention
-    ggml_tensor * attn_k_dw_w   = nullptr;
-    ggml_tensor * attn_k_norm_w = nullptr;
-    ggml_tensor * attn_v_dw_w   = nullptr;
-    ggml_tensor * attn_v_norm_w = nullptr;
-
-    // Block norm (often present in attention blocks)
-    ggml_tensor * attn_norm_w   = nullptr;
-};
-
 struct clip_model {
    clip_modality modality = CLIP_MODALITY_VISION;
    projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -328,23 +263,6 @@ struct clip_model {
    ggml_tensor * mm_input_proj_w = nullptr;
    ggml_tensor * mm_soft_emb_norm_w = nullptr;

-    // mobilenetv5 for gemma3n
-    std::vector<mobilenetv5_block> mobilenet_blocks;
-    std::vector<int> mobilenet_stage_ends;
-    ggml_tensor * mobilenet_stem_conv_w = nullptr;
-    ggml_tensor * mobilenet_stem_conv_b = nullptr;
-    ggml_tensor * mobilenet_stem_norm_w = nullptr;
-    ggml_tensor * mm_post_proj_norm_w = nullptr;
-
-    // Multi-Scale Fusion Adapter (MSFA) components
-    ggml_tensor * msfa_concat_conv_w = nullptr;
-    ggml_tensor * msfa_concat_norm_w = nullptr;
-    ggml_tensor * msfa_ffn_expand_w = nullptr;
-    ggml_tensor * msfa_ffn_project_w = nullptr;
-    ggml_tensor * msfa_ffn_expand_bn = nullptr;
-    ggml_tensor * msfa_ffn_project_bn = nullptr;
-
-
    // pixtral, glm4v
    ggml_tensor * token_embd_img_break = nullptr;
    ggml_tensor * mm_patch_merger_w = nullptr;
@@ -368,16 +286,9 @@ struct clip_model {
    ggml_tensor * mm_boi = nullptr;
    ggml_tensor * mm_eoi = nullptr;

-    // lfm2 audio
-    std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
-    std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
-    ggml_tensor * pre_encode_out_w = nullptr;
-    ggml_tensor * pre_encode_out_b = nullptr;
-
    bool audio_has_avgpool() const {
        return proj_type == PROJECTOR_TYPE_QWEN2A
-            || proj_type == PROJECTOR_TYPE_VOXTRAL
-            || proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
+            || proj_type == PROJECTOR_TYPE_VOXTRAL;
    }

    bool audio_has_stack_frames() const {
--- a/llama/llama.cpp/tools/mtmd/clip.cpp
+++ b/llama/llama.cpp/tools/mtmd/clip.cpp
@@ -801,10 +801,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_siglip>(ctx, img);
            } break;
-        case PROJECTOR_TYPE_GEMMA3NV:
-            {
-                builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
-            } break;
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_LIGHTONOCR:
            {
@@ -835,7 +831,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_GLMA:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
            } break;
@@ -855,18 +850,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_llava>(ctx, img);
            } break;
-        case PROJECTOR_TYPE_LFM2A:
-            {
-                builder = std::make_unique<clip_graph_conformer>(ctx, img);
-            } break;
        case PROJECTOR_TYPE_GLM4V:
            {
                builder = std::make_unique<clip_graph_glm4v>(ctx, img);
            } break;
-        case PROJECTOR_TYPE_YOUTUVL:
-            {
-                builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
-            } break;
        default:
            GGML_ABORT("missing cgraph builder");
    }
@@ -1167,14 +1154,6 @@ struct clip_model_loader {
                        // test model (tinygemma3) has a different value, we optionally read it
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                    } break;
-
-                case PROJECTOR_TYPE_GEMMA3NV:
-                    {
-                        // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
-                        // Similar configuration to Gemma3
-                        hparams.n_merge = 1;  // MobileNetV5 handles resizing internally
-                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
-                    } break;
                case PROJECTOR_TYPE_QWEN2VL:
                case PROJECTOR_TYPE_QWEN25VL:
                case PROJECTOR_TYPE_QWEN3VL:
@@ -1192,20 +1171,6 @@ struct clip_model_loader {
                            LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                        }
                    } break;
-                case PROJECTOR_TYPE_YOUTUVL:
-                    {
-                        hparams.n_merge = 2;
-                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
-                        get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
-                        std::vector<int> wa_layer_indexes_vec;
-                        get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
-                        for (auto & layer : wa_layer_indexes_vec) {
-                            hparams.wa_layer_indexes.insert(layer);
-                        }
-                        // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
-                        hparams.set_limit_image_tokens(1, 62500);
-                        hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
-                    } break;
                case PROJECTOR_TYPE_GLM4V:
                    {
                        hparams.rope_theta = 10000.0f;
@@ -1224,7 +1189,6 @@ struct clip_model_loader {
                case PROJECTOR_TYPE_QWEN2A:
                case PROJECTOR_TYPE_GLMA:
                case PROJECTOR_TYPE_VOXTRAL:
-                case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                    {
                        bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
                                             model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
@@ -1240,15 +1204,6 @@ struct clip_model_loader {
                        hparams.audio_window_len   = 400;
                        hparams.audio_hop_len      = 160;
                    } break;
-                case PROJECTOR_TYPE_LFM2A:
-                    {
-                        // audio preprocessing params
-                        hparams.audio_chunk_len        = 1; // in seconds
-                        hparams.audio_sample_rate      = 16000;
-                        hparams.audio_n_fft            = 512;
-                        hparams.audio_window_len       = 400;
-                        hparams.audio_hop_len          = 160;
-                    } break;
                default:
                    break;
            }
@@ -1274,14 +1229,7 @@ struct clip_model_loader {
                LOG_INF("%s: has_llava_proj:     %d\n", __func__, hparams.has_llava_projector);
                LOG_INF("%s: minicpmv_version:   %d\n", __func__, hparams.minicpmv_version);
                LOG_INF("%s: n_merge:            %d\n", __func__, hparams.n_merge);
-                LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
-                if (!hparams.wa_layer_indexes.empty()) {
-                    LOG_INF("%s: wa_layer_indexes:  ", __func__);
-                    for (auto & layer : hparams.wa_layer_indexes) {
-                        LOG_INF("%d ", layer);
-                    }
-                    LOG_INF("\n");
-                }
+                LOG_INF("%s: n_wa_pattern:       %d\n", __func__, hparams.n_wa_pattern);
                if (hparams.image_min_pixels > 0) {
                    LOG_INF("%s: image_min_pixels:   %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
                }
@@ -1363,10 +1311,6 @@ struct clip_model_loader {

        model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);

-        if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
-            hparams.n_layer = 0; // gemma3n does not use normal layer structure
-        }
-
        // layers
        model.layers.resize(hparams.n_layer);
        for (int il = 0; il < hparams.n_layer; ++il) {
@@ -1441,7 +1385,6 @@ struct clip_model_loader {
            }
        }

-
        switch (model.proj_type) {
            case PROJECTOR_TYPE_MLP:
            case PROJECTOR_TYPE_MLP_NORM:
@@ -1554,14 +1497,6 @@ struct clip_model_loader {
                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                } break;
-            case PROJECTOR_TYPE_YOUTUVL:
-                {
-                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);        // merger.ln_q (RMS norm)
-                    model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));  // merger.mlp.0
-                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
-                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));  // merger.mlp.2
-                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
-                } break;
            case PROJECTOR_TYPE_GLM4V:
                {
                    model.projection     = get_tensor(TN_MM_PROJECTOR);
@@ -1581,112 +1516,11 @@ struct clip_model_loader {
                    model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
                    model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
                } break;
-            case PROJECTOR_TYPE_GEMMA3NV:
-                {
-                    model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
-                    model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
-                    model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
-
-                    model.msfa_ffn_expand_w  = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
-                    model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded
-                    model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
-                    model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
-
-                    model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
-
-                    // Dynamically load blocks stage by stage
-                    for (int stage = 0; stage < 4; ++stage) {
-                        int blocks_found_in_stage = 0;
-
-                        for (int blk_idx = 0; ; ++blk_idx) {
-                            bool found_block = false;
-                            mobilenetv5_block block;
-
-                            // 1. Check for Edge Residual (S0)
-                            block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
-                            if (block.s0_conv_exp_w) {
-                                found_block = true;
-                                block.s0_bn1_w      = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
-                                block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
-                                block.s0_bn2_w      = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
-                            }
-                            // 2. Check for UIR (Universal Inverted Residual)
-                            else {
-                                // Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
-                                block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
-                                block.pw_exp_w   = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
-
-                                if (block.dw_start_w || block.pw_exp_w) {
-                                    found_block = true;
-                                    if (block.dw_start_w) {
-                                        block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
-                                    }
-                                    if (block.pw_exp_w) {
-                                        block.pw_exp_bn_w   = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
-                                    }
-                                    block.dw_mid_w      = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
-                                    if (block.dw_mid_w) {
-                                        block.dw_mid_bn_w   = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
-                                    }
-                                    block.pw_proj_w     = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
-                                    if (block.pw_proj_w) {
-                                        block.pw_proj_bn_w  = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
-                                    }
-                                    block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
-                                }
-                            }
-
-                            // 3. Check for Attention (MQA)
-                            // Even if UIR/Edge check failed, this might be a pure attention block
-                            ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
-                            if (attn_q_check) {
-                                found_block = true;
-                                block.attn_q_w = attn_q_check;
-                                block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
-                                block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
-                                block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
-                                block.attn_k_dw_w   = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
-                                block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
-                                block.attn_v_dw_w   = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
-                                block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
-                                block.attn_norm_w   = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
-                                // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
-                                if (!block.layer_scale_w) {
-                                    block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
-                                }
-                            }
-
-                            if (found_block) {
-                                model.mobilenet_blocks.push_back(block);
-                                blocks_found_in_stage++;
-                            } else {
-                                // End of blocks for this stage
-                                break;
-                            }
-                        }
-
-                        // Track where this stage ends in the flat vector
-                        if (blocks_found_in_stage > 0) {
-                            model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
-                            LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
-                        }
-                    }
-                    model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
-                    model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
-                } break;
            case PROJECTOR_TYPE_IDEFICS3:
                {
                    model.projection = get_tensor(TN_MM_PROJECTOR);
                } break;
            case PROJECTOR_TYPE_LFM2:
-                {
-                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
-                    model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
-                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
-                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
-                    model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
-                    model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
-                } break;
            case PROJECTOR_TYPE_KIMIVL:
                {
                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
@@ -1746,17 +1580,6 @@ struct clip_model_loader {
                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
                } break;
-            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
-                {
-                    model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
-                    model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
-                    model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
-                    model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
-                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
-                    model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
-                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
-                    model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
-                } break;
            case PROJECTOR_TYPE_INTERNVL:
                {
                    model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -1805,52 +1628,6 @@ struct clip_model_loader {
                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
                } break;
-            case PROJECTOR_TYPE_LFM2A:
-                {
-                    for (int i : {0, 2, 3, 5, 6}) {
-                        model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight"));
-                        model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias"));
-                    }
-                    model.pre_encode_out_w    = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight"));
-                    model.pre_encode_out_b    = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias"));
-
-                    model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
-                    model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
-                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
-                    model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
-                    model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight"));
-                    model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias"));
-
-                    for (int il = 0; il < hparams.n_layer; ++il) {
-                        auto & layer = model.layers[il];
-
-                        layer.ff_norm_w   = get_tensor(string_format(TN_FFN_NORM,   prefix, il, "weight"));
-                        layer.ff_norm_b   = get_tensor(string_format(TN_FFN_NORM,   prefix, il, "bias"));
-                        layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
-                        layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"));
-                        layer.ff_up_1_w   = get_tensor(string_format(TN_FFN_UP_1,   prefix, il, "weight"));
-                        layer.ff_up_1_b   = get_tensor(string_format(TN_FFN_UP_1,   prefix, il, "bias"));
-                        layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
-                        layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"));
-
-                        layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il));
-                        layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il));
-
-                        layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"));
-                        layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"));
-
-                        layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight"));
-
-                        layer.conv_norm_w  = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"));
-                        layer.conv_norm_b  = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"));
-                        layer.conv_dw_w    = get_tensor(string_format(TN_CONV_DW,   prefix, il, "weight"));
-                        layer.conv_dw_b    = get_tensor(string_format(TN_CONV_DW,   prefix, il, "bias"));
-                        layer.conv_pw1_w   = get_tensor(string_format(TN_CONV_PW1,  prefix, il, "weight"));
-                        layer.conv_pw1_b   = get_tensor(string_format(TN_CONV_PW1,  prefix, il, "bias"));
-                        layer.conv_pw2_w   = get_tensor(string_format(TN_CONV_PW2,  prefix, il, "weight"));
-                        layer.conv_pw2_b   = get_tensor(string_format(TN_CONV_PW2,  prefix, il, "bias"));
-                    }
-                } break;
            default:
                GGML_ASSERT(false && "unknown projector type");
        }
@@ -2155,7 +1932,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params

    try {
        clip_model_loader loader(fname);
-        bool skip_audio = false;

        if (loader.has_vision) {
            ctx_vision = new clip_ctx(ctx_params);
@@ -2165,14 +1941,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
                loader.warmup(*ctx_vision);
            }

-            // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
-            // we can remove this check when we implement audio support for Gemma 3N
-            skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
-
            // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
        }

-        if (loader.has_audio && !skip_audio) {
+        if (loader.has_audio) {
            ctx_audio = new clip_ctx(ctx_params);
            loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
            loader.load_tensors(*ctx_audio);
@@ -2896,57 +2668,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                // res_imgs->data[0] = *res;
                res_imgs->entries.push_back(std::move(img_f32));
            } break;
-        case PROJECTOR_TYPE_YOUTUVL:
-            {
-                const int patch_size = params.patch_size;  // typically 16
-                const int merge_size = params.n_merge;      // typically 2
-                const int align_size = patch_size * merge_size;  // 32
-
-                const int max_num_patches = params.image_max_pixels > 0 ?
-                    params.image_max_pixels / (patch_size * patch_size) : 256;
-
-                // Linear search for optimal scale to fit within max_num_patches
-                float scale = 1.0f;
-                int target_height = original_size.height;
-                int target_width = original_size.width;
-
-                auto get_scaled_image_size = [align_size](float scale, int size) -> int {
-                    float scaled_size = size * scale;
-                    // Round up to nearest multiple of align_size
-                    int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
-                    // Ensure at least one patch
-                    return std::max(align_size, aligned);
-                };
-
-                // Linear search with 0.02 step size
-                while (scale > 0.0f) {
-                    target_height = get_scaled_image_size(scale, original_size.height);
-                    target_width = get_scaled_image_size(scale, original_size.width);
-
-                    int num_patches_h = target_height / patch_size;
-                    int num_patches_w = target_width / patch_size;
-                    int num_patches = num_patches_h * num_patches_w;
-
-                    if (num_patches > max_num_patches) {
-                        scale -= 0.02f;
-                    } else {
-                        break;
-                    }
-                }
-
-                clip_image_size new_size = {target_width, target_height};
-
-                // Resize the image
-                clip_image_u8 resized;
-                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
-
-                // Normalize to float32
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
-
-                // Add to results
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;

        case PROJECTOR_TYPE_IDEFICS3:
            {
@@ -3010,16 +2731,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                res_imgs->entries.push_back(std::move(img_f32));
            } break;

-        case PROJECTOR_TYPE_GEMMA3NV:
-            {
-                clip_image_u8 resized_image;
-                int sz = params.image_size;
-                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;
-
        case PROJECTOR_TYPE_JANUS_PRO:
            {
                // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
@@ -3189,7 +2900,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
            return (img->nx / params.patch_size) / 2;
        default:
            break;
@@ -3205,7 +2915,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
            return (img->ny / params.patch_size) / 2;
        default:
            break;
@@ -3266,7 +2975,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_YOUTUVL:
            {
                // dynamic size (2 conv, so double patch size)
                int x_patch = img->nx / (params.patch_size * 2);
@@ -3282,12 +2990,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                int scale_factor = ctx->model.hparams.n_merge;
                n_patches /= (scale_factor * scale_factor);
            } break;
-        case PROJECTOR_TYPE_GEMMA3NV:
-            {
-                // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
-                // regardless of input size (see architecture description)
-                n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
-            } break;
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_KIMIVL:
            {
@@ -3313,7 +3015,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_QWEN2A:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                n_patches = img->nx;

@@ -3346,10 +3047,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
            {
                n_patches += 2; // for BOI and EOI token embeddings
            } break;
-        case PROJECTOR_TYPE_LFM2A:
-            {
-                n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
-            } break;
        default:
            GGML_ABORT("unsupported projector type");
    }
@@ -3400,6 +3097,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const int pos_w = image_size_width  / patch_size;
    const int pos_h = image_size_height / patch_size;

+    const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl

    auto get_inp_tensor = [&gf](const char * name) {
        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@@ -3548,11 +3246,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_YOUTUVL:
            {
                // pw * ph = number of tokens output by ViT after apply patch merger
                // ipw * ipw = number of vision token been processed inside ViT
-                const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
                const int merge_ratio = 2;
                const int pw  = image_size_width  / patch_size / merge_ratio;
                const int ph  = image_size_height / patch_size / merge_ratio;
@@ -3563,7 +3259,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                std::vector<int> inv_idx(ph * pw);

                if (use_window_attn) {
-                    const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
+                    const int attn_window_size = 112;
                    const int grid_window = attn_window_size / patch_size / merge_ratio;
                    int dst = 0;
                    // [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3680,7 +3376,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("patches", patches);
            } break;
        case PROJECTOR_TYPE_GEMMA3:
-        case PROJECTOR_TYPE_GEMMA3NV:
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_INTERNVL:
        case PROJECTOR_TYPE_QWEN2A:
@@ -3688,7 +3383,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
        case PROJECTOR_TYPE_JANUS_PRO:
        case PROJECTOR_TYPE_COGVLM:
            {
@@ -3711,27 +3405,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                }
                set_input_i32("pos_w", pos_data);
            } break;
-        case PROJECTOR_TYPE_LFM2A:
-            {
-                GGML_ASSERT(imgs.entries.size() == 1);
-                const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());
-
-                auto d_model = 512;
-                auto seq_len = n_frames * 2 - 1;
-                std::vector<float> pos_emb(d_model*seq_len);
-                std::vector<double> inv_freq(d_model / 2);
-                for (size_t i = 0; i < inv_freq.size(); ++i) {
-                    inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i)));
-                }
-                for (int64_t pos = 0; pos < seq_len; ++pos) {
-                    for (size_t i = 0; i < inv_freq.size(); ++i) {
-                        const float ang = (n_frames - pos - 1) * inv_freq[i];
-                        pos_emb[pos*d_model + 2*i + 0] = sinf(ang);  // even
-                        pos_emb[pos*d_model + 2*i + 1] = cosf(ang);  // odd
-                    }
-                }
-                set_input_f32("pos_emb", pos_emb);
-            } break;
        default:
            GGML_ABORT("Unknown projector type");
    }
@@ -3802,19 +3475,16 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_JANUS_PRO:
-        case PROJECTOR_TYPE_YOUTUVL:
            return ctx->model.mm_1_b->ne[0];
        case PROJECTOR_TYPE_QWEN3VL:
            // main path + deepstack paths
            return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
        case PROJECTOR_TYPE_GEMMA3:
-        case PROJECTOR_TYPE_GEMMA3NV:
            return ctx->model.mm_input_proj_w->ne[0];
        case PROJECTOR_TYPE_IDEFICS3:
            return ctx->model.projection->ne[1];
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_INTERNVL:
            return ctx->model.mm_3_w->ne[1];
@@ -3829,8 +3499,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_COGVLM:
            return ctx->model.mm_4h_to_h_w->ne[1];
-        case PROJECTOR_TYPE_LFM2A:
-            return ctx->model.position_embeddings->ne[0];
        case PROJECTOR_TYPE_GLM4V:
            return ctx->model.mm_ffn_down_w->ne[1];
        default:
@@ -3839,7 +3507,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
 }

 int clip_is_minicpmv(const struct clip_ctx * ctx) {
-    // TODO: remove this function
    if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
        return ctx->model.hparams.minicpmv_version;
    }
@@ -3847,26 +3514,24 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
 }

 bool clip_is_glm(const struct clip_ctx * ctx) {
-    // TODO: remove this function
    return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
 }

 bool clip_is_mrope(const struct clip_ctx * ctx) {
-    switch (ctx->proj_type()) {
-        case PROJECTOR_TYPE_QWEN2VL:
-        case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_QWEN3VL:
-        case PROJECTOR_TYPE_GLM4V:
-            return true;
-        default:
-            return false;
-    }
+    return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
+        || ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
 }

 bool clip_is_llava(const struct clip_ctx * ctx) {
    return ctx->model.hparams.has_llava_projector;
 }

+bool clip_is_gemma3(const struct clip_ctx * ctx) {
+    return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
+}
+
 bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
    return ctx->model.modality == CLIP_MODALITY_VISION;
 }
@@ -3876,16 +3541,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
 }

 bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
-    switch (ctx->proj_type()) {
-        case PROJECTOR_TYPE_ULTRAVOX:
-        case PROJECTOR_TYPE_QWEN2A:
-        case PROJECTOR_TYPE_GLMA:
-        case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
-            return true;
-        default:
-            return false;
-    }
+    return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
+        || ctx->proj_type() == PROJECTOR_TYPE_GLMA
+        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
 }

 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
--- a/llama/llama.cpp/tools/mtmd/clip.h
+++ b/llama/llama.cpp/tools/mtmd/clip.h
@@ -106,8 +106,7 @@ int clip_is_minicpmv(const struct clip_ctx * ctx);
 bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_mrope(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
-// note for contributor: this clip_is_(model) pattern is deprecated
-//                       do NOT add new functions like this
+bool clip_is_gemma3(const struct clip_ctx * ctx);

 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

--- a/llama/llama.cpp/tools/mtmd/models/conformer.cpp
+++ b/llama/llama.cpp/tools/mtmd/models/conformer.cpp
@@ -1,217 +0,0 @@
-#include "models.h"
-
-ggml_cgraph * clip_graph_conformer::build() {
-    const int n_frames   = img.nx;
-    const int n_pos      = n_frames / 2;
-    const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
-    GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
-
-    ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd);
-    ggml_set_name(pos_emb, "pos_emb");
-    ggml_set_input(pos_emb);
-    ggml_build_forward_expand(gf, pos_emb);
-
-    ggml_tensor * inp = build_inp_raw(1);
-    cb(inp, "input", -1);
-
-    auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
-
-    // pre encode, conv subsampling
-    {
-        // layer.0 - conv2d
-        cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
-        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]);
-        cb(cur, "conformer.pre_encode.conv.{}", 0);
-
-        // layer.1 - relu
-        cur = ggml_relu_inplace(ctx0, cur);
-
-        // layer.2 conv2d dw
-        cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
-        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]);
-        cb(cur, "conformer.pre_encode.conv.{}", 2);
-
-        // layer.3 conv2d
-        cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
-        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]);
-        cb(cur, "conformer.pre_encode.conv.{}", 3);
-
-        // layer.4 - relu
-        cur = ggml_relu_inplace(ctx0, cur);
-
-        // layer.5 conv2d dw
-        cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
-        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]);
-        cb(cur, "conformer.pre_encode.conv.{}", 5);
-
-        // layer.6 conv2d
-        cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
-        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]);
-        cb(cur, "conformer.pre_encode.conv.{}", 6);
-
-        // layer.7 - relu
-        cur = ggml_relu_inplace(ctx0, cur);
-
-        // flatten channel and frequency axis
-        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
-        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
-
-        // calculate out
-        cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
-        cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
-        cb(cur, "conformer.pre_encode.out", -1);
-    }
-
-    // pos_emb
-    cb(pos_emb, "pos_emb", -1);
-
-    for (int il = 0; il < hparams.n_layer; il++) {
-        const auto & layer = model.layers[il];
-
-        auto * residual = cur;
-
-        cb(cur, "layer.in", il);
-
-        // feed_forward1
-        cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
-        cb(cur, "conformer.layers.{}.norm_feed_forward1", il);
-
-        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU,
-                        il);
-        cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);
-
-        const auto fc_factor = 0.5f;
-        residual             = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
-
-        // self-attention
-        {
-            cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
-            cb(cur, "conformer.layers.{}.norm_self_att", il);
-
-            ggml_tensor * Qcur     = ggml_mul_mat(ctx0, layer.q_w, cur);
-            Qcur                   = ggml_add(ctx0, Qcur, layer.q_b);
-            Qcur                   = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
-            ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
-            Q_bias_u               = ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3);
-            ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
-            Q_bias_v               = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);
-
-            // TODO @ngxson : some cont can/should be removed when ggml_mul_mat support these cases
-            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
-            Kcur               = ggml_add(ctx0, Kcur, layer.k_b);
-            Kcur               = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
-            Kcur               = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
-
-            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
-            Vcur               = ggml_add(ctx0, Vcur, layer.v_b);
-            Vcur               = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
-            Vcur               = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));
-
-            // build_attn won't fit due to matrix_ac and matrix_bd separation
-            ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
-            matrix_ac               = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
-            cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
-
-            auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
-            cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
-            p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
-            p = ggml_permute(ctx0, p, 0, 2, 1, 3);
-
-            auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
-            matrix_bd        = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
-
-            // rel shift
-            {
-                const auto pos_len = matrix_bd->ne[0];
-                const auto q_len   = matrix_bd->ne[1];
-                const auto h       = matrix_bd->ne[2];
-                matrix_bd          = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
-                matrix_bd          = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
-                matrix_bd          = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
-                matrix_bd          = ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
-                                                        matrix_bd->nb[2], matrix_bd->nb[0] * q_len);
-                matrix_bd          = ggml_cont_3d(ctx0, matrix_bd, pos_len, q_len, h);
-            }
-
-            matrix_bd     = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
-                                               matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0);
-            auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
-            scores        = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
-            cb(scores, "conformer.layers.{}.self_attn.id0", il);
-
-            ggml_tensor * attn = ggml_soft_max(ctx0, scores);
-            ggml_tensor * x    = ggml_mul_mat(ctx0, attn, Vcur);
-            x                  = ggml_permute(ctx0, x, 2, 0, 1, 3);
-            x                  = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
-
-            ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
-            out               = ggml_add(ctx0, out, layer.o_b);
-            cb(out, "conformer.layers.{}.self_attn.linear_out", il);
-
-            cur = out;
-        }
-
-        residual = ggml_add(ctx0, residual, cur);
-        cur      = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
-        cb(cur, "conformer.layers.{}.norm_conv", il);
-
-        // conv
-        {
-            auto * x = cur;
-            x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x);
-            x = ggml_add(ctx0, x, layer.conv_pw1_b);
-            cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
-
-            // ggml_glu doesn't support sigmoid
-            // TODO @ngxson : support this ops in ggml
-            {
-                int64_t       d    = x->ne[0] / 2;
-                ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
-                x                  = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
-                x                  = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-            }
-
-            // use ggml_ssm_conv for f32 precision
-            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
-            x = ggml_roll(ctx0, x, 4, 0, 0, 0);
-            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
-            x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
-            x = ggml_add(ctx0, x, layer.conv_dw_b);
-
-            x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
-            x = ggml_silu(ctx0, x);
-
-            // pointwise_conv2
-            x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x);
-            x = ggml_add(ctx0, x, layer.conv_pw2_b);
-
-            cur = x;
-        }
-
-        residual = ggml_add(ctx0, residual, cur);
-
-        cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
-        cb(cur, "conformer.layers.{}.norm_feed_forward2", il);
-
-        cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b,
-                        FFN_SILU, il);  // TODO(tarek): read activation for ffn from hparams
-        cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);
-
-        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
-        cb(residual, "conformer.layers.{}.conv.id", il);
-
-        cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il);
-        cb(cur, "conformer.layers.{}.norm_out", il);
-    }
-
-    // audio adapter
-    cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-    cb(cur, "audio_adapter.model.{}", 0);
-    cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);
-
-    cb(cur, "projected", -1);
-
-    ggml_build_forward_expand(gf, cur);
-
-    return gf;
-}
--- a/llama/llama.cpp/tools/mtmd/models/mobilenetv5.cpp
+++ b/llama/llama.cpp/tools/mtmd/models/mobilenetv5.cpp
@@ -1,451 +0,0 @@
-#include "models.h"
-
-// Helpers for MobileNetV5 Blocks
-// RMS Norm 2D - normalizes over channels for each spatial position
-ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
-    // inp: [W, H, C, B]
-
-    ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
-    cur = ggml_cont(ctx0, cur);
-    cur = ggml_rms_norm(ctx0, cur, eps);
-
-    if (weight) {
-        cur = ggml_mul(ctx0, cur, weight);
-    }
-
-    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
-    cur = ggml_cont(ctx0, cur);
-
-    return cur;
-}
-
-// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
-ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
-    const int64_t ih = inp->ne[1];  // height
-    const int64_t iw = inp->ne[0];  // width
-
-    // Calculate output size (ceil division)
-    const int64_t oh = (ih + stride_h - 1) / stride_h;
-    const int64_t ow = (iw + stride_w - 1) / stride_w;
-
-    // Calculate padding needed
-    const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
-    const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);
-
-    // Split padding asymmetrically
-    const int pad_h_top = pad_h / 2;
-    const int pad_h_bottom = pad_h - pad_h_top;
-    const int pad_w_left = pad_w / 2;
-    const int pad_w_right = pad_w - pad_w_left;
-
-    // Apply padding if needed
-    // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
-    // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
-    if (pad_h > 0 || pad_w > 0) {
-        inp = ggml_pad_ext(ctx0, inp,
-            pad_w_left, pad_w_right,     // width padding (dim 0)
-            pad_h_top, pad_h_bottom,      // height padding (dim 1)
-            0, 0,                         // no channel padding (dim 2)
-            0, 0);                        // no batch padding (dim 3)
-    }
-
-    return inp;
-}
-
-
-// Edge Residual Block (Stage 0)
-ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
-    ggml_tensor * cur = inp;
-
-    // 1. Expansion Conv (3x3)
-    if (stride == 2) {
-        // Case: Downsampling (Block 0)
-        // Replicates Conv2dSame(kernel=3, stride=2)
-        cur = pad_same_2d(cur, 3, 3, stride, stride);
-        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
-    } else {
-        // Case: Normal 3x3 Block (Block 1, 2)
-        // Replicates Conv2d(kernel=3, stride=1, padding=1)
-        cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
-    }
-
-    // BN + Activation
-    if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
-    cur = ggml_gelu(ctx0, cur);
-
-    // 2. Pointwise Linear Conv (1x1)
-    // 1x1 Convs usually have padding=0 and stride=1
-    cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
-    if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);
-
-    // 3. Residual Connection
-    // Only apply residual if spatial dimensions and channels match (stride 1)
-    if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
-        cur = ggml_add(ctx0, cur, inp);
-    }
-
-    return cur;
-}
-
-// Universal Inverted Residual Block (Stage 1+)
-ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
-    ggml_tensor * cur = inp;
-
-    // 1. Depthwise Start (Optional)
-    // NOTE: dw_start always has stride=1 (no downsampling here)
-    if (block.dw_start_w) {
-        int k = block.dw_start_w->ne[0]; // 3 or 5
-        int p = k / 2;
-        cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
-        if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
-    }
-
-    // 2. Pointwise Expansion (1x1)
-    if (block.pw_exp_w) {
-        // Standard 1x1 conv, pad=0, stride=1
-        cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
-        if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
-        cur = ggml_gelu(ctx0, cur);
-    }
-
-    // 3. Depthwise Mid (Optional)
-    // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
-    if (block.dw_mid_w) {
-        int k = block.dw_mid_w->ne[0]; // 3 or 5
-
-        if (stride > 1) {
-            // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
-            cur = pad_same_2d(cur, k, k, stride, stride);
-            cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
-        } else {
-            // Case: Stride 1 -> Use Standard Symmetric Padding
-            int p = k / 2;
-            cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
-        }
-
-        if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
-        cur = ggml_gelu(ctx0, cur);
-    }
-
-    // 4. Pointwise Projection (1x1)
-    if (block.pw_proj_w) {
-        cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
-        if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
-    }
-
-    // Apply Layer Scaling if present
-    if (block.layer_scale_w) {
-        cur = ggml_mul(ctx0, cur, block.layer_scale_w);
-    }
-
-    // 5. Residual Connection
-    bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
-    bool same_channel = (inp->ne[2] == cur->ne[2]);
-    if (same_spatial && same_channel) {
-        cur = ggml_add(ctx0, cur, inp);
-    }
-
-    return cur;
-}
-
-// Attention Block (MQA)
-ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
-    ggml_tensor * cur = inp;
-
-    // Norm
-    if (block.attn_norm_w) {
-        cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
-    }
-
-    // 1. Q Calculation
-    ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);
-
-    // 2. K Calculation (Downsampled)
-    // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
-    ggml_tensor * k_inp = cur;
-    if (block.attn_k_dw_w) {
-        int k_size = block.attn_k_dw_w->ne[0];  // Usually 3
-        k_inp = pad_same_2d(cur, k_size, k_size, 2, 2);  // Apply SAME padding
-        k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1);  // padding=0
-        if (block.attn_k_norm_w) {
-            k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
-        }
-    }
-    ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);
-
-    // 3. V Calculation (Downsampled)
-    // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
-    ggml_tensor * v_inp = cur;
-    if (block.attn_v_dw_w) {
-        int v_size = block.attn_v_dw_w->ne[0];  // Usually 3
-        v_inp = pad_same_2d(cur, v_size, v_size, 2, 2);  // Apply SAME padding
-        v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1);  // padding=0
-        if (block.attn_v_norm_w) {
-            v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
-        }
-    }
-    ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);
-
-    const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
-    const int D = k->ne[2]; // Head dimension
-    const int n_head = q->ne[2] / D;
-    const int N = W * H;
-
-    // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
-    q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
-    q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
-    q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
-    q = ggml_cont(ctx0, q);
-
-    const int Wk = k->ne[0]; const int Hk = k->ne[1];
-    const int M = Wk * Hk;
-
-    // Process K: [Wk, Hk, D, B] -> [D, M, 1, B]
-    k = ggml_reshape_3d(ctx0, k, M, D, B);
-    k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
-    k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
-    k = ggml_cont(ctx0, k);
-
-    // Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
-    v = ggml_reshape_3d(ctx0, v, M, D, B);
-    v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
-    v = ggml_cont(ctx0, v); // [M, D, 1, B]
-
-    // Multi-Query Attention
-    float scale = 1.0f / sqrtf((float)D);
-
-    // Step 1: Compute Q @ K.T
-    ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
-
-    scores = ggml_scale(ctx0, scores, scale);
-
-    scores = ggml_soft_max(ctx0, scores);
-
-    ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
-
-    kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
-    kqv = ggml_cont(ctx0, kqv);
-
-
-    kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
-    kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
-    kqv = ggml_cont(ctx0, kqv);
-
-    // Output projection
-    cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);
-
-    // Residual & Layer Scale
-    if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
-        if (block.layer_scale_w) {
-            cur = ggml_mul(ctx0, cur, block.layer_scale_w);
-        }
-        cur = ggml_add(ctx0, cur, inp);
-    }
-
-    return cur;
-}
-
-ggml_cgraph * clip_graph_mobilenetv5::build() {
-    ggml_tensor * inp = build_inp_raw();
-
-    // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
-    ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2);  // Apply SAME padding
-
-    cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1);  // padding=0
-    if (model.mobilenet_stem_conv_b) {
-        cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
-    }
-    if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
-    cur = ggml_gelu(ctx0, cur);
-
-
-    // 2. Blocks
-    std::vector<ggml_tensor*> intermediate_features;
-    const int total_blocks = model.mobilenet_blocks.size();
-
-    auto is_stage_start = [&](int i) {
-        if (i == 0) return true;
-        for (int end_idx : model.mobilenet_stage_ends) {
-            if (i == end_idx + 1) return true;
-        }
-        return false;
-    };
-
-    auto is_fusion_point = [&](int i) {
-        if (model.mobilenet_stage_ends.size() >= 4) {
-                if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
-                if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
-        } else {
-            if (i == total_blocks - 1) return true;
-        }
-        return false;
-    };
-
-    for (int i = 0; i < total_blocks; i++) {
-        const auto & block = model.mobilenet_blocks[i];
-        int stride = is_stage_start(i) ? 2 : 1;
-
-        if (block.s0_conv_exp_w)      cur = build_edge_residual(cur, block, stride);
-        else if (block.attn_q_w)      cur = build_mobilenet_attn(cur, block);
-        else                          cur = build_inverted_residual(cur, block, stride);
-
-        if (is_fusion_point(i)) {
-
-            intermediate_features.push_back(cur);
-        }
-    }
-
-    // 3. Multi-Scale Fusion Adapter (MSFA)
-    if (!intermediate_features.empty()) {
-
-        // A. Reference Resolution: PyTorch implementation uses inputs[0]
-        // We assume intermediate_features[0] is the "High Resolution" target.
-        // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32).
-        ggml_tensor* target_feat = intermediate_features[0];
-        int high_res_w = target_feat->ne[0];
-        int high_res_h = target_feat->ne[1];
-
-        std::vector<ggml_tensor*> resized_feats;
-
-        // B. Resize inputs to match inputs[0] (High Resolution)
-        for (auto feat : intermediate_features) {
-            int feat_w = feat->ne[0];
-            int feat_h = feat->ne[1];
-
-            // PyTorch: if feat_size < high_resolution: interpolate
-            if (feat_w < high_res_w || feat_h < high_res_h) {
-                // Calculate scale factor.
-                // Note: PyTorch 'nearest' works on arbitrary float scales.
-                // ggml_upscale generally takes integer factors or target sizes depending on helper.
-                // Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2).
-                int scale_w = high_res_w / feat_w;
-                // int scale_h = high_res_h / feat_h;
-
-                // Safety check for non-integer scaling if strictly replicating
-                GGML_ASSERT(high_res_w % feat_w == 0);
-
-                // Upsample (Nearest Neighbor)
-                // 2 is the scale factor
-                feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
-            }
-            resized_feats.push_back(feat);
-        }
-
-        // C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
-        cur = resized_feats[0];
-        for (size_t k = 1; k < resized_feats.size(); ++k) {
-            cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
-        }
-
-        // D. FFN (UniversalInvertedResidual)
-        // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm
-
-        // 1. Expansion
-        if (model.msfa_ffn_expand_w) {
-            // 1x1 Conv
-            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);
-
-            if (model.msfa_ffn_expand_bn) {
-                cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
-            }
-
-            cur = ggml_gelu(ctx0, cur);
-
-        }
-
-        // 2. Projection (No DW because kernel_size=0)
-        if (model.msfa_ffn_project_w) {
-            // 1x1 Conv
-            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);
-
-            // UniversalInvertedResidual typically has a norm after projection
-            if (model.msfa_ffn_project_bn) {
-                cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
-            }
-
-        }
-
-        // E. Final Downsample to Target Resolution (Output Resolution)
-        // PyTorch: matches self.output_resolution (e.g. 16x16)
-        const int target_out_res = 16;
-        int current_w = cur->ne[0];
-
-        if (current_w > target_out_res) {
-            int s = current_w / target_out_res;
-
-            GGML_ASSERT(current_w % target_out_res == 0);
-
-            // Avg Pool: Kernel=s, Stride=s
-            cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
-
-        }
-
-        // F. Final Norm
-        if (model.msfa_concat_norm_w) {
-            cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
-
-        }
-    }
-
-    // 4. Gemma 3n Multimodal Projection (Embedder)
-    // Input: 'cur' is [Width, Height, Channels, Batch]
-    int W = cur->ne[0];
-    int H = cur->ne[1];
-    int C = cur->ne[2];
-    int B = cur->ne[3];
-
-    GGML_ASSERT(C == hparams.n_embd);
-
-    // 1. Permute and Flatten to [Channels, Tokens, Batch]
-    // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
-    cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
-    cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
-    cur = ggml_cont(ctx0, cur);
-    cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
-    cur = ggml_cont(ctx0, cur);
-
-
-    // 2. FEATURE SCALING
-    // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
-    const float scale_factor = sqrtf((float)C);
-    cur = ggml_scale(ctx0, cur, scale_factor);
-
-
-    // 3. SOFT EMBEDDING NORM
-    // PyTorch: self._norm(x) * self.weight
-    // We must normalize regardless, then multiply if weight exists.
-    {
-        const float eps = 1e-6f; // Gemma3n uses 1e-6
-        cur = ggml_rms_norm(ctx0, cur, eps);
-
-        if (model.mm_soft_emb_norm_w) {
-            // Weight shape is (2048,) -> Element-wise broadcast multiply
-            cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
-        }
-
-    }
-
-    // 4. PROJECTION
-    // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
-    // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
-    if (model.mm_input_proj_w) {
-        cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
-    }
-
-    // 5. POST PROJECTION NORM
-    // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
-    // with_scale=False means weight is registered as buffer with value 1.0
-    // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
-    {
-        const float eps = 1e-6f;
-        cur = ggml_rms_norm(ctx0, cur, eps);
-
-        if (model.mm_post_proj_norm_w) {
-            // If weight is loaded, multiply (should be ~1.0 anyway)
-            cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
-        }
-    }
-
-    ggml_build_forward_expand(gf, cur);
-    return gf;
-}
--- a/llama/llama.cpp/tools/mtmd/models/models.h
+++ b/llama/llama.cpp/tools/mtmd/models/models.h
@@ -2,11 +2,6 @@

 #include "../clip-graph.h"

-/*
- * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
- * We encourage human contributors to ensure the quality and reliability of the codebase.
- */
-
 struct clip_graph_siglip : clip_graph {
    clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
@@ -27,11 +22,6 @@ struct clip_graph_qwen3vl : clip_graph {
    ggml_cgraph * build() override;
 };

-struct clip_graph_youtuvl : clip_graph {
-    clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
-    ggml_cgraph * build() override;
-};
-
 struct clip_graph_minicpmv : clip_graph {
    clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
@@ -67,45 +57,7 @@ struct clip_graph_whisper_enc : clip_graph {
    ggml_cgraph * build() override;
 };

-struct clip_graph_conformer : clip_graph {
-    clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
-    ggml_cgraph * build() override;
-};
-
 struct clip_graph_glm4v : clip_graph {
    clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
 };
-
-struct clip_graph_mobilenetv5 : clip_graph {
-    clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
-    ggml_cgraph * build() override;
-
-    ggml_tensor * rms_norm_2d(
-        ggml_tensor * inp,
-        ggml_tensor * weight,
-        float eps = 1e-6f);
-
-    ggml_tensor* pad_same_2d(
-        ggml_tensor* inp,
-        int kernel_h,
-        int kernel_w,
-        int stride_h,
-        int stride_w,
-        int dilation_h = 1,
-        int dilation_w = 1);
-
-    ggml_tensor * build_edge_residual(
-        ggml_tensor * inp,
-        const mobilenetv5_block & block,
-        int stride);
-
-    ggml_tensor * build_inverted_residual(
-        ggml_tensor * inp,
-        const mobilenetv5_block & block,
-        int stride);
-
-    ggml_tensor * build_mobilenet_attn(
-        ggml_tensor * inp,
-        const mobilenetv5_block & block);
-};
--- a/llama/llama.cpp/tools/mtmd/models/siglip.cpp
+++ b/llama/llama.cpp/tools/mtmd/models/siglip.cpp
@@ -50,15 +50,10 @@ ggml_cgraph * clip_graph_siglip::build() {
        const int scale_factor = model.hparams.n_merge;
        cur = build_patch_merge_permute(cur, scale_factor);

-        // projection, in LFM2-VL input norm is optional
-        if (model.mm_input_norm_w) {
-            cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
-            cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
-        }
-
-        if (model.mm_input_norm_b) {
-            cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
-        }
+        // projection
+        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);

        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
--- a/llama/llama.cpp/tools/mtmd/models/whisper-enc.cpp
+++ b/llama/llama.cpp/tools/mtmd/models/whisper-enc.cpp
@@ -86,15 +86,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
            FFN_GELU_ERF,
            -1);

-    } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
-        // projector
-        cur = build_ffn(cur,
-            model.mm_1_w, model.mm_1_b,
-            nullptr, nullptr,
-            model.mm_2_w, model.mm_2_b,
-            FFN_GELU_ERF,
-            -1);
-
    } else if (proj_type == PROJECTOR_TYPE_GLMA) {
            cur = ggml_norm(ctx0, cur, hparams.eps);
            cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
--- a/llama/llama.cpp/tools/mtmd/models/youtuvl.cpp
+++ b/llama/llama.cpp/tools/mtmd/models/youtuvl.cpp
@@ -1,179 +0,0 @@
-#include "models.h"
-
-ggml_cgraph * clip_graph_youtuvl::build() {
-    GGML_ASSERT(model.class_embedding == nullptr);
-    const int batch_size       = 1;
-    const bool use_window_attn = !hparams.wa_layer_indexes.empty();
-    const int n_pos            = n_patches;
-    const int num_position_ids = n_pos * 4;
-    const int m = 2;
-    const int Wp = n_patches_x;
-    const int Hp = n_patches_y;
-    const int Hm = Hp / m;
-    const int Wm = Wp / m;
-    norm_type norm_t = NORM_TYPE_NORMAL;
-
-    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
-
-    ggml_tensor * inp = build_inp_raw();
-
-    // change conv3d to linear
-    // reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
-    {
-        inp = ggml_reshape_4d(
-            ctx0, inp,
-            Wm * m * patch_size, m * patch_size, Hm, 3);
-        inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
-        inp = ggml_cont_4d(
-            ctx0, inp,
-            m * patch_size * 3, Wm, m * patch_size, Hm);
-
-        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
-        inp = ggml_cont_4d(
-            ctx0, inp,
-            m * patch_size * 3, patch_size, m, Hm * Wm);
-
-        inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
-        inp = ggml_cont_4d(
-            ctx0, inp,
-            patch_size, 3, patch_size, Hm * Wm * m * m);
-
-        inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
-        inp = ggml_cont_3d(
-            ctx0, inp,
-            3*patch_size* patch_size,  Hm * Wm * m * m, 1);
-    }
-    inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
-
-    if (model.patch_bias) {
-        inp = ggml_add(ctx0, inp, model.patch_bias);
-    }
-
-    inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
-
-    ggml_tensor * inpL           = inp;
-    ggml_tensor * window_mask    = nullptr;
-    ggml_tensor * window_idx     = nullptr;
-    ggml_tensor * inv_window_idx = nullptr;
-
-    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
-    ggml_set_name(positions, "positions");
-    ggml_set_input(positions);
-
-    // pre-layernorm
-    if (model.pre_ln_w) {
-        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
-    }
-    if (use_window_attn) {
-        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
-        ggml_set_name(inv_window_idx, "inv_window_idx");
-        ggml_set_input(inv_window_idx);
-        // mask for window attention
-        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
-        ggml_set_name(window_mask, "window_mask");
-        ggml_set_input(window_mask);
-
-        // if flash attn is used, we need to pad the mask and cast to f16
-        if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
-            window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
-        }
-
-        // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
-        GGML_ASSERT(batch_size == 1);
-        inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
-        inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
-        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
-    }
-
-    // loop over layers
-    for (int il = 0; il < n_layer; il++) {
-        const auto & layer = model.layers[il];
-        const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
-
-        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
-
-        // layernorm1
-        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
-        // self-attention
-        {
-            ggml_tensor * Qcur = ggml_add(ctx0,
-                ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
-            ggml_tensor * Kcur = ggml_add(ctx0,
-                ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
-            ggml_tensor * Vcur = ggml_add(ctx0,
-                ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
-
-            Qcur = ggml_rope_multi(
-                ctx0, Qcur, positions, nullptr,
-                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-            Kcur = ggml_rope_multi(
-                ctx0, Kcur, positions, nullptr,
-                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
-
-            ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
-
-            cur = build_attn(layer.o_w, layer.o_b,
-                Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
-        }
-        // re-add the layer input, e.g., residual
-        cur = ggml_add(ctx0, cur, inpL);
-
-        inpL = cur; // inpL = residual, cur = hidden_states
-
-        // layernorm2
-        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
-
-        // ffn
-        cur = build_ffn(cur,
-            layer.ff_up_w, layer.ff_up_b,
-            nullptr, nullptr,
-            layer.ff_down_w, layer.ff_down_b,
-            hparams.ffn_op, il);
-
-        // residual 2
-        cur = ggml_add(ctx0, inpL, cur);
-
-        inpL = cur;
-    }
-
-    ggml_tensor * embeddings = inpL;
-    if (use_window_attn) {
-        const int spatial_merge_unit = 4;
-        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
-        ggml_set_name(window_idx, "window_idx");
-        ggml_set_input(window_idx);
-        GGML_ASSERT(batch_size == 1);
-        embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
-        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
-        embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
-        cb(embeddings, "window_order_restored", -1);
-    }
-
-    // post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
-    if (model.post_ln_w) {
-        embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
-    }
-
-    // Now apply merger (VLPatchMerger):
-    // 1. Apply RMS norm (ln_q in VLPatchMerger)
-    embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
-    cb(embeddings, "merger_normed", -1);
-
-    // 2. First reshape for spatial merge (merge 2x2 patches)
-    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
-    cb(embeddings, "merger_reshaped", -1);
-
-    embeddings = build_ffn(embeddings,
-                    model.mm_0_w, model.mm_0_b,
-                    nullptr, nullptr,
-                    model.mm_1_w, model.mm_1_b,
-                    FFN_GELU,
-                    -1);
-    ggml_build_forward_expand(gf, embeddings);
-
-    return gf;
-}
--- a/llama/llama.cpp/tools/mtmd/mtmd-audio.cpp
+++ b/llama/llama.cpp/tools/mtmd/mtmd-audio.cpp
@@ -9,250 +9,207 @@
 #include <fstream>
 #include <algorithm>

-// some of the code here is copied from whisper.cpp
+// most of the code here is copied from whisper.cpp

 constexpr bool DEBUG = false;

-void mtmd_audio_cache::fill_sin_cos_table(int n) {
-    sin_vals.resize(n);
-    cos_vals.resize(n);
-    for (int i = 0; i < n; i++) {
-        double theta = (2 * M_PI * i) / n;
-        sin_vals[i]  = sinf(theta);
-        cos_vals[i]  = cosf(theta);
-    }
-}
+struct mtmd_audio_mel_filters {
+    int32_t n_mel;
+    int32_t n_fft;

-void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
-    hann_window.resize(length);
-    int offset = -1;
-    if (periodic) {
-        offset = 0;
-    }
-    for (int i = 0; i < length; i++) {
-        hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
-    }
-}
+    std::vector<float> data;
+};

-void mtmd_audio_cache::fill_mel_filterbank_matrix(int   n_mel,
-                                                  int   n_fft,
-                                                  int   sample_rate,
-                                                  float fmin,
-                                                  float fmax,
-                                                  bool  slaney_area_norm,
-                                                  float scale) {
-    GGML_ASSERT(n_mel > 0 && n_fft > 1);
-    if (fmax <= 0.0f) {
-        fmax = 0.5f * sample_rate;
-    }
+// note: this global cache is shared among all preprocessors
+//       if we want to use multiple preprocessors at the same time,
+//       we will need to enclose it in the preprocessor class in the future
+static struct mtmd_audio_global_cache {
+    // precomputed sin/cos table for FFT
+    std::vector<float> sin_vals;
+    std::vector<float> cos_vals;

-    // Slaney scale (matches librosa default)
-    const double min_log_hz  = 1000.0;
-    const double lin_slope   = 3 / 200.;
-    const double min_log_mel = min_log_hz * lin_slope;
-    const double log_step    = log(6.4) / 27.0;
-    auto         hz_to_mel   = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
-        return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
-    };
-    auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
-        return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
-    };
+    // hann window
+    std::vector<float> hann_window;

-    // infer N_fft from n_fft_bins
-    const double bin_hz_step = double(sample_rate) / double(n_fft);
+    // mel filter bank
+    mtmd_audio_mel_filters filters;

-    // mel grid: n_mel + 2 edges
-    const double        m_lo = hz_to_mel(fmin);
-    const double        m_hi = hz_to_mel(fmax);
-    std::vector<double> mel_pts(n_mel + 2);
-    for (int i = 0; i < n_mel + 2; ++i) {
-        mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
-    }
-
-    // convert to Hz
-    std::vector<double> hz_pts(n_mel + 2);
-    for (int i = 0; i < n_mel + 2; ++i) {
-        hz_pts[i] = mel_to_hz(mel_pts[i]);
-    }
-
-    const int n_fft_bins = n_fft / 2 + 1;
-
-    // filterbank
-    std::vector<float> out(n_mel * n_fft_bins, 0);
-    for (int m = 0; m < n_mel; ++m) {
-        const double f_left   = hz_pts[m];
-        const double f_center = hz_pts[m + 1];
-        const double f_right  = hz_pts[m + 2];
-
-        const double denom_l = std::max(1e-30, f_center - f_left);
-        const double denom_r = std::max(1e-30, f_right - f_center);
-        const double enorm   = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
-
-        for (int k = 0; k < n_fft_bins; ++k) {
-            const double f = k * bin_hz_step;
-            double       w = 0.0;
-            if (f >= f_left && f <= f_center) {
-                w = (f - f_left) / denom_l;
-            } else if (f > f_center && f <= f_right) {
-                w = (f_right - f) / denom_r;
-            }
-            out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
+    void fill_sin_cos_table(int n) {
+        sin_vals.resize(n);
+        cos_vals.resize(n);
+        for (int i = 0; i < n; i++) {
+            double theta = (2 * M_PI * i) / n;
+            sin_vals[i] = sinf(theta);
+            cos_vals[i] = cosf(theta);
        }
    }

-    filters.n_mel = n_mel;
-    filters.n_fft = n_fft;
-    filters.data  = std::move(out);
+    void fill_hann_window(int length, bool periodic) {
+        hann_window.resize(length);
+        int offset = -1;
+        if (periodic) {
+            offset = 0;
+        }
+        for (int i = 0; i < length; i++) {
+            hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
+        }
+    }

-    if (DEBUG) {  // debug
-        for (size_t i = 0; i < filters.data.size(); ++i) {
-            if (filters.data[i] != 0.0f) {
-                printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
+    // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
+    // n_fft_bins is derived as (n_fft / 2 + 1). Example: if n_fft=512 -> n_fft_bins=257.
+    void fill_mel_filterbank_matrix(
+        int n_mel,
+        int n_fft,
+        int sample_rate,            // e.g. 16000
+        float fmin = 0.0f,          // e.g. 0.0
+        float fmax = -1.0f,         // e.g. sr/2; pass -1 for auto
+        bool slaney_area_norm = true,
+        float scale = 1.0f          // optional extra scaling applied to each filter weight, e.g. 1.0f/1000.0f
+    ) {
+        GGML_ASSERT(n_mel > 0 && n_fft > 1);
+        if (fmax <= 0.0f) {
+            fmax = 0.5f * sample_rate;
+        }
+
+        // Slaney scale (matches librosa default)
+        const double min_log_hz = 1000.0;
+        const double lin_slope = 3 / 200.;
+        const double min_log_mel = min_log_hz * lin_slope;
+        const double log_step = log(6.4) / 27.0;
+        auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
+            return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
+        };
+        auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
+            return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
+        };
+
+        // frequency spacing (Hz) between adjacent FFT bins
+        const double bin_hz_step = double(sample_rate) / double(n_fft);
+
+        // mel grid: n_mel + 2 edges
+        const double m_lo = hz_to_mel(fmin);
+        const double m_hi = hz_to_mel(fmax);
+        std::vector<double> mel_pts(n_mel + 2);
+        for (int i = 0; i < n_mel + 2; ++i) {
+            mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
+        }
+
+        // convert to Hz
+        std::vector<double> hz_pts(n_mel + 2);
+        for (int i = 0; i < n_mel + 2; ++i) {
+            hz_pts[i] = mel_to_hz(mel_pts[i]);
+        }
+
+        const int n_fft_bins = n_fft / 2 + 1;
+
+        // filterbank
+        std::vector<float> out(n_mel * n_fft_bins, 0);
+        for (int m = 0; m < n_mel; ++m) {
+            const double f_left   = hz_pts[m];
+            const double f_center = hz_pts[m + 1];
+            const double f_right  = hz_pts[m + 2];
+
+            const double denom_l = std::max(1e-30, f_center - f_left);
+            const double denom_r = std::max(1e-30, f_right  - f_center);
+            const double enorm   = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
+
+            for (int k = 0; k < n_fft_bins; ++k) {
+                const double f = k * bin_hz_step;
+                double w = 0.0;
+                if (f >= f_left && f <= f_center) {
+                    w = (f - f_left) / denom_l;
+                } else if (f > f_center && f <= f_right) {
+                    w = (f_right - f) / denom_r;
+                }
+                out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
+            }
+        }
+
+        filters.n_mel = n_mel;
+        filters.n_fft = n_fft;
+        filters.data  = std::move(out);
+
+        if (DEBUG) { // debug
+            for (size_t i = 0; i < filters.data.size(); ++i) {
+                if (filters.data[i] != 0.0f) {
+                    printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
+                }
            }
        }
    }
-}
+} g_cache;

-// Unified DFT implementation for both forward and inverse transforms
-// Template parameters:
-//   Inverse: false = DFT with exp(-2πi·k·n/N), no scaling
-//            true  = IDFT with exp(+2πi·k·n/N), scales by 1/N
-//   RealInput: true = input is real-valued (stride 1), avoids imaginary computations
-//              false = input is complex-valued (interleaved real/imag, stride 2)
-template <bool Inverse, bool RealInput>
-static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) {
-    const int n_sin_cos_vals = cache.sin_vals.size();
-    const int sin_cos_step   = n_sin_cos_vals / N;
-
-    constexpr float sign  = Inverse ? 1.0f : -1.0f;
-    const float     scale = Inverse ? (1.0f / N) : 1.0f;
+// naive Discrete Fourier Transform
+// input is real-valued
+// output is complex-valued
+static void dft(const float * in, int N, float * out) {
+    const int n_sin_cos_vals = g_cache.sin_vals.size();
+    const int sin_cos_step = n_sin_cos_vals / N;

    for (int k = 0; k < N; k++) {
        float re = 0;
        float im = 0;

        for (int n = 0; n < N; n++) {
-            int   idx     = (k * n * sin_cos_step) % n_sin_cos_vals;
-            float cos_val = cache.cos_vals[idx];
-            float sin_val = cache.sin_vals[idx];
-
-            if constexpr (RealInput) {
-                // Real input: in_im = 0, simplifies to:
-                // re += in_re * cos_val
-                // im += sign * in_re * sin_val
-                float in_re = in[n];
-                re += in_re * cos_val;
-                im += sign * in_re * sin_val;
-            } else {
-                float in_re = in[n * 2 + 0];
-                float in_im = in[n * 2 + 1];
-                // (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i
-                re += in_re * cos_val - sign * in_im * sin_val;
-                im += sign * in_re * sin_val + in_im * cos_val;
-            }
+            int idx = (k * n * sin_cos_step) % (n_sin_cos_vals); // t = 2*M_PI*k*n/N
+            re += in[n] * g_cache.cos_vals[idx]; // cos(t)
+            im -= in[n] * g_cache.sin_vals[idx]; // sin(t)
        }

-        out[k * 2 + 0] = re * scale;
-        out[k * 2 + 1] = im * scale;
+        out[k*2 + 0] = re;
+        out[k*2 + 1] = im;
    }
 }

-// Cooley-Tukey FFT/IFFT unified implementation
-// Template parameters:
-//   Inverse: false = FFT with exp(-2πi·k/N), no scaling
-//            true  = IFFT with exp(+2πi·k/N), scales by 0.5 at each level
-//   RealInput: true = input is real-valued (stride 1)
-//              false = input is complex-valued (interleaved real/imag, stride 2)
-template <bool Inverse, bool RealInput>
-static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
-    const int n_sin_cos_vals = cache.sin_vals.size();
-
+// Cooley-Tukey FFT
+// poor man's implementation - use something better
+// input is real-valued
+// output is complex-valued
+static void fft(float * in, int N, float * out) {
+    const int n_sin_cos_vals = g_cache.sin_vals.size();
    if (N == 1) {
        out[0] = in[0];
-        if constexpr (RealInput) {
-            out[1] = 0.0f;
-        } else {
-            out[1] = in[1];
-        }
+        out[1] = 0;
        return;
    }

    const int half_N = N / 2;
-    if (N - half_N * 2 == 1) {
-        // Odd N: fall back to DFT
-        dft_impl<Inverse, RealInput>(cache, in, N, out);
+    if (N - half_N*2 == 1) {
+        dft(in, N, out);
        return;
    }

-    // Split into even and odd
-    if constexpr (RealInput) {
-        // Real input: stride is 1, copy only real values
-        float * even = in + N;
-        for (int i = 0; i < half_N; ++i) {
-            even[i] = in[2 * i];
-        }
-        float * even_fft = out + 2 * N;
-        fft_impl<Inverse, true>(cache, even, half_N, even_fft);
-
-        float * odd = even;
-        for (int i = 0; i < half_N; ++i) {
-            odd[i] = in[2 * i + 1];
-        }
-        float * odd_fft = even_fft + N;
-        fft_impl<Inverse, true>(cache, odd, half_N, odd_fft);
-    } else {
-        // Complex input: stride is 2, copy complex pairs
-        float * even = in + N * 2;
-        for (int i = 0; i < half_N; ++i) {
-            even[i * 2 + 0] = in[2 * i * 2 + 0];
-            even[i * 2 + 1] = in[2 * i * 2 + 1];
-        }
-        float * even_fft = out + 2 * N;
-        fft_impl<Inverse, false>(cache, even, half_N, even_fft);
-
-        float * odd = even;
-        for (int i = 0; i < half_N; ++i) {
-            odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0];
-            odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1];
-        }
-        float * odd_fft = even_fft + N;
-        fft_impl<Inverse, false>(cache, odd, half_N, odd_fft);
+    float* even = in + N;
+    for (int i = 0; i < half_N; ++i) {
+        even[i]= in[2*i];
    }
+    float* even_fft = out + 2 * N;
+    fft(even, half_N, even_fft);

-    float * even_fft = out + 2 * N;
-    float * odd_fft  = even_fft + N;
+    float* odd = even;
+    for (int i = 0; i < half_N; ++i) {
+        odd[i] = in[2*i + 1];
+    }
+    float* odd_fft = even_fft + N;
+    fft(odd, half_N, odd_fft);

    const int sin_cos_step = n_sin_cos_vals / N;
-
-    constexpr float sign  = Inverse ? 1.0f : -1.0f;
-    constexpr float scale = Inverse ? 0.5f : 1.0f;
-
    for (int k = 0; k < half_N; k++) {
-        int   idx = k * sin_cos_step;  // t = 2*M_PI*k/N
-        float re  = cache.cos_vals[idx];
-        float im  = sign * cache.sin_vals[idx];
+        int idx = k * sin_cos_step; // t = 2*M_PI*k/N
+        float re =  g_cache.cos_vals[idx]; // cos(t)
+        float im = -g_cache.sin_vals[idx]; // sin(t)

-        float re_odd = odd_fft[2 * k + 0];
-        float im_odd = odd_fft[2 * k + 1];
+        float re_odd = odd_fft[2*k + 0];
+        float im_odd = odd_fft[2*k + 1];

-        out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd);
-        out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd);
+        out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
+        out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;

-        out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd);
-        out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd);
+        out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
+        out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
    }
 }

-// Forward FFT for real input (used by mel spectrogram)
-static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
-    fft_impl<false, true>(cache, in, N, out);
-}
-
-// Inverse FFT for complex input
-static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
-    fft_impl<true, false>(cache, in, N, out);
-}
-
 struct filter_params {
    int32_t n_mel;
    int32_t n_fft_bins;
@@ -265,27 +222,20 @@ struct filter_params {
    bool    norm_per_feature = false;
 };

-static void log_mel_spectrogram_worker_thread(int                        ith,
-                                              const float *              hann,
-                                              const std::vector<float> & samples,
-                                              int                        n_samples,
-                                              int                        frame_size,
-                                              int                        frame_step,
-                                              int                        n_threads,
-                                              const filter_params &      params,
-                                              const mtmd_audio_cache &   cache,
-                                              mtmd_audio_mel &           out) {
+static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
+                                              int n_samples, int frame_size, int frame_step, int n_threads,
+                                              const filter_params & params, mtmd_audio_mel & out) {
    std::vector<float> fft_in(frame_size * 2, 0.0);
    std::vector<float> fft_out(frame_size * 2 * 2 * 2);

    int n_fft_bins = params.n_fft_bins;
    int i = ith;

-    const auto & filters = cache.filters;
+    const auto & filters = g_cache.filters;

    // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
    GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
-    GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size());
+    GGML_ASSERT(g_cache.sin_vals.size() == g_cache.cos_vals.size());
    // calculate FFT only when fft_in are not all zero
    for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
        const int offset = i * frame_step;
@@ -301,7 +251,7 @@ static void log_mel_spectrogram_worker_thread(int                        ith,
        }

        // FFT
-        fft(cache, fft_in.data(), frame_size, fft_out.data());
+        fft(fft_in.data(), frame_size, fft_out.data());

        // Calculate modulus^2 of complex numbers
        // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
@@ -348,7 +298,6 @@ static bool log_mel_spectrogram(
        const int     n_samples_in,
        const int     n_threads,
        const filter_params & params,
-        const mtmd_audio_cache & cache,
        mtmd_audio_mel & out) {
    //const int64_t t_start_us = ggml_time_us();

@@ -356,9 +305,9 @@ static bool log_mel_spectrogram(
    int n_samples = n_samples_in;

    // Hann window
-    const float * hann       = cache.hann_window.data();
-    const int     frame_size = (params.n_fft_bins - 1) * 2;
-    const int     frame_step = params.hop_length;
+    const float * hann = g_cache.hann_window.data();
+    const int frame_size = (params.n_fft_bins - 1) * 2;
+    const int frame_step = params.hop_length;

    // Padding
    std::vector<float> samples_padded;
@@ -386,9 +335,9 @@ static bool log_mel_spectrogram(

    // preemphasis
    if (params.preemph) {
-        const int   pad_amount = frame_size / 2;
+        const int pad_amount = frame_size / 2;
        const float preemph = 0.97f;
-        float       prev = samples_padded[pad_amount];
+        float prev = samples_padded[pad_amount];
        for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) {
            float cur = samples_padded[i];
            samples_padded[i] = cur - preemph * prev;
@@ -423,14 +372,14 @@ static bool log_mel_spectrogram(
    {
        std::vector<std::thread> workers(n_threads - 1);
        for (int iw = 0; iw < n_threads - 1; ++iw) {
-            workers[iw] =
-                std::thread(log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), n_samples,
-                            frame_size, frame_step, n_threads, std::cref(params), std::cref(cache), std::ref(out));
+            workers[iw] = std::thread(
+                    log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded),
+                    n_samples, frame_size, frame_step, n_threads,
+                    std::cref(params), std::ref(out));
        }

        // main thread
-        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params,
-                                          cache, out);
+        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params, out);
        for (int iw = 0; iw < n_threads - 1; ++iw) {
            workers[iw].join();
        }
@@ -455,7 +404,7 @@ static bool log_mel_spectrogram(

            for (int j = 0; j < effective_n_len; ++j) {
                auto &value = out.data[i * out.n_len + j];
-                value        = (value - mean) / mstd;
+                value = (value - mean) / mstd;
            }

            // pad the rest with zeros
@@ -501,14 +450,18 @@ static bool log_mel_spectrogram(
 //

 void mtmd_audio_preprocessor_whisper::initialize() {
-    cache.fill_sin_cos_table(hparams.audio_n_fft);
-    cache.fill_hann_window(hparams.audio_window_len, true);
-    cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
+    g_cache.fill_sin_cos_table(hparams.audio_n_fft);
+    g_cache.fill_hann_window(hparams.audio_window_len, true);
+    g_cache.fill_mel_filterbank_matrix(
+        hparams.n_mel_bins,
+        hparams.audio_n_fft,
+        hparams.audio_sample_rate);
 }

-bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 samples,
-                                                 size_t                        n_samples,
-                                                 std::vector<mtmd_audio_mel> & output) {
+bool mtmd_audio_preprocessor_whisper::preprocess(
+        const float * samples,
+        size_t n_samples,
+        std::vector<mtmd_audio_mel> & output) {
    if (n_samples == 0) {
        // empty audio
        return false;
@@ -518,7 +471,7 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
    // if input is too short, pad with zeros
    // this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram
    // TODO: maybe handle this better
-    size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1);  // +1 second margin
+    size_t min_samples = (size_t)hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
    if (n_samples < min_samples) {
        smpl.resize(min_samples, 0.0f);
        std::memcpy(smpl.data(), samples, n_samples * sizeof(float));
@@ -533,19 +486,22 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
    params.hop_length       = hparams.audio_hop_len;
    params.sample_rate      = hparams.audio_sample_rate;
    params.center_padding   = false;
-    params.preemph          = 0.0f;  // disabled
+    params.preemph          = 0.0f; // disabled
    params.use_natural_log  = false;
    params.norm_per_feature = false;

-    // make sure the cache is initialized
-    GGML_ASSERT(!cache.sin_vals.empty());
-    GGML_ASSERT(!cache.cos_vals.empty());
-    GGML_ASSERT(!cache.filters.data.empty());
+    // make sure the global cache is initialized
+    GGML_ASSERT(!g_cache.sin_vals.empty());
+    GGML_ASSERT(!g_cache.cos_vals.empty());
+    GGML_ASSERT(!g_cache.filters.data.empty());

    mtmd_audio_mel out_full;
-    bool           ok = log_mel_spectrogram(samples, n_samples,
-                                            4,  // n_threads
-                                            params, cache, out_full);
+    bool ok = log_mel_spectrogram(
+                samples,
+                n_samples,
+                4, // n_threads
+                params,
+                out_full);
    if (!ok) {
        return false;
    }
@@ -556,21 +512,21 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
        printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
    }
    const size_t frames_per_chunk = 3000;
-    GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk);
-    for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
-        int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
-        if ((size_t) n_len < frames_per_chunk) {
-            break;  // last uncomplete chunk will always be a padded chunk, safe to ignore
+    GGML_ASSERT((size_t)out_full.n_len > frames_per_chunk);
+    for (size_t off = 0; off < (size_t)out_full.n_len; off += frames_per_chunk) {
+        int n_len = std::min(frames_per_chunk, (size_t)out_full.n_len - off);
+        if ((size_t)n_len < frames_per_chunk) {
+            break; // last incomplete chunk will always be a padded chunk, safe to ignore
        }

        mtmd_audio_mel out_chunk;
        out_chunk.n_len     = n_len;
        out_chunk.n_mel     = out_full.n_mel;
-        out_chunk.n_len_org = out_full.n_mel;  // unused
+        out_chunk.n_len_org = out_full.n_mel; // unused
        out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);

        for (int i = 0; i < out_full.n_mel; i++) {
-            auto src = out_full.data.begin() + i * out_full.n_len + off;
+            auto src = out_full.data.begin() + i*out_full.n_len + off;
            out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
        }

@@ -579,152 +535,3 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s

    return true;
 }
-
-//
-// mtmd_audio_preprocessor_conformer
-//
-
-void mtmd_audio_preprocessor_conformer::initialize() {
-    cache.fill_sin_cos_table(hparams.audio_n_fft);
-    cache.fill_hann_window(hparams.audio_window_len, true);
-    cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
-}
-
-bool mtmd_audio_preprocessor_conformer::preprocess(const float *                 samples,
-                                                   size_t                        n_samples,
-                                                   std::vector<mtmd_audio_mel> & output) {
-    // empty audio
-    if (n_samples == 0) {
-        return false;
-    }
-
-    filter_params params;
-    params.n_mel            = hparams.n_mel_bins;
-    params.n_fft_bins       = 1 + (hparams.audio_n_fft / 2);
-    params.hann_window_size = hparams.audio_window_len;
-    params.hop_length       = hparams.audio_hop_len;
-    params.sample_rate      = hparams.audio_sample_rate;
-    params.center_padding   = true;
-    params.preemph          = 0.97f;
-    params.use_natural_log  = true;
-    params.norm_per_feature = true;
-
-    // make sure the cache is initialized
-    GGML_ASSERT(!cache.sin_vals.empty());
-    GGML_ASSERT(!cache.cos_vals.empty());
-    GGML_ASSERT(!cache.filters.data.empty());
-
-    mtmd_audio_mel out_full;
-    bool           ok = log_mel_spectrogram(samples, n_samples,
-                                            4,  // n_threads
-                                            params, cache, out_full);
-    if (!ok) {
-        return false;
-    }
-
-    output.push_back(std::move(out_full));
-    return true;
-}
-
-//
-// mtmd_audio_streaming_istft implementation
-//
-
-mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length) :
-    n_fft(n_fft),
-    hop_length(hop_length),
-    n_fft_bins(n_fft / 2 + 1),
-    overlap_buffer(n_fft, 0.0f),
-    window_sum_buffer(n_fft, 0.0f),
-    padding_to_remove((n_fft - hop_length) / 2),
-    ifft_in(n_fft * 2 * 4, 0.0f),  // extra space for recursive IFFT
-    ifft_out(n_fft * 2 * 4, 0.0f) {
-    cache.fill_sin_cos_table(n_fft);
-    cache.fill_hann_window(n_fft, true);
-}
-
-void mtmd_audio_streaming_istft::reset() {
-    std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f);
-    std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f);
-    padding_to_remove = (n_fft - hop_length) / 2;
-}
-
-std::vector<float> mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) {
-    std::vector<float> output(hop_length);
-
-    // copy frequencies
-    for (int j = 0; j < n_fft_bins; j++) {
-        ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0];
-        ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1];
-    }
-
-    // mirror negative frequencies
-    for (int j = 1; j < n_fft_bins - 1; j++) {
-        int mirror_idx              = n_fft - j;
-        ifft_in[mirror_idx * 2 + 0] = ifft_in[j * 2 + 0];
-        ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1];  // conjugate
-    }
-
-    ifft(cache, ifft_in.data(), n_fft, ifft_out.data());
-
-    // update window sum and overlap buffer
-    for (int j = 0; j < n_fft; j++) {
-        window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j];
-        overlap_buffer[j] += ifft_out[j * 2] * cache.hann_window[j];
-    }
-
-    // extract hop_length samples with normalization
-    for (int i = 0; i < hop_length; i++) {
-        if (window_sum_buffer[i] > 1e-8f) {
-            output[i] = overlap_buffer[i] / window_sum_buffer[i];
-        } else {
-            output[i] = overlap_buffer[i];
-        }
-    }
-
-    // shift buffers left by hop_length
-    std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin());
-    std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f);
-
-    std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin());
-    std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f);
-
-    // Remove padding if needed
-    int to_remove = std::min(padding_to_remove, (int) output.size());
-    padding_to_remove -= to_remove;
-    output.erase(output.begin(), output.begin() + to_remove);
-
-    return output;
-}
-
-std::vector<float> mtmd_audio_streaming_istft::flush() {
-    std::vector<float> output;
-
-    // Extract remaining samples from overlap buffer
-    // Continue until we've extracted all meaningful samples
-    int remaining = n_fft - hop_length;
-    while (remaining > 0) {
-        int chunk_size = std::min(remaining, hop_length);
-
-        for (int i = 0; i < chunk_size; i++) {
-            float sample;
-            if (window_sum_buffer[i] > 1e-8f) {
-                sample = overlap_buffer[i] / window_sum_buffer[i];
-            } else {
-                sample = overlap_buffer[i];
-            }
-            output.push_back(sample);
-        }
-
-        // Shift buffers
-        std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin());
-        std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f);
-
-        std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin());
-        std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f);
-
-        remaining -= chunk_size;
-    }
-
-    return output;
-}
--- a/llama/llama.cpp/tools/mtmd/mtmd-audio.h
+++ b/llama/llama.cpp/tools/mtmd/mtmd-audio.h
@@ -17,38 +17,6 @@ struct mtmd_audio_mel {
    std::vector<float> data;
 };

-struct mtmd_audio_mel_filters {
-    int32_t n_mel;
-    int32_t n_fft;
-
-    std::vector<float> data;
-};
-
-// cache for audio processing, each processor instance owns its own cache
-struct mtmd_audio_cache {
-    std::vector<float> sin_vals;
-    std::vector<float> cos_vals;
-
-    std::vector<float> hann_window;
-
-    mtmd_audio_mel_filters filters;
-
-    void fill_sin_cos_table(int n);
-
-    void fill_hann_window(int length, bool periodic);
-
-    // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
-    // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
-    void fill_mel_filterbank_matrix(int   n_mel,
-                                    int   n_fft,
-                                    int   sample_rate,               // e.g. 16000
-                                    float fmin             = 0.0f,   // e.g. 0.0
-                                    float fmax             = -1.0f,  // e.g. sr/2; pass -1 for auto
-                                    bool  slaney_area_norm = true,
-                                    float scale = 1.0f  // optional extra scaling
-    );
-};
-
 struct mtmd_audio_preprocessor {
    const clip_hparams & hparams;

@@ -63,51 +31,4 @@ struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
    mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
    void initialize() override;
    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
-
-  private:
-    mtmd_audio_cache cache;
-};
-
-struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
-    mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
-    void initialize() override;
-    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
-
-  private:
-    mtmd_audio_cache cache;
-};
-
-//
-// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
-//
-struct mtmd_audio_streaming_istft {
-    mtmd_audio_streaming_istft(int n_fft, int hop_length);
-
-    // reset streaming state
-    void reset();
-
-    // process a single STFT frame (streaming)
-    // frame_spectrum: [n_fft_bins x 2] interleaved real/imag
-    // returns: up to hop_length samples
-    std::vector<float> process_frame(const float * frame_spectrum);
-
-    // flush remaining samples at end of stream
-    std::vector<float> flush();
-
-  private:
-    int n_fft;
-    int hop_length;
-    int n_fft_bins;
-
-    // Own cache for output processing
-    mtmd_audio_cache cache;
-
-    // Streaming state
-    std::vector<float> overlap_buffer;
-    std::vector<float> window_sum_buffer;
-    int                padding_to_remove;
-
-    // Working buffers for IFFT
-    std::vector<float> ifft_in;
-    std::vector<float> ifft_out;
 };
--- a/llama/llama.cpp/tools/mtmd/mtmd.cpp
+++ b/llama/llama.cpp/tools/mtmd/mtmd.cpp
@@ -276,7 +276,7 @@ struct mtmd_context {
        }

        // set boi/eoi
-        if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
+        if (proj == PROJECTOR_TYPE_GEMMA3) {
            // <start_of_image> ... (image embeddings) ... <end_of_image>
            img_beg = "<start_of_image>";
            img_end = "<end_of_image>";
@@ -293,7 +293,7 @@ struct mtmd_context {
            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
            img_end = "[IMG_END]";

-        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
+        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
            // <|vision_start|> ... (image embeddings) ... <|vision_end|>
            img_beg = "<|vision_start|>";
            img_end = "<|vision_end|>";
@@ -339,13 +339,8 @@ struct mtmd_context {
            case PROJECTOR_TYPE_QWEN25O:
            case PROJECTOR_TYPE_ULTRAVOX:
            case PROJECTOR_TYPE_VOXTRAL:
-            case PROJECTOR_TYPE_GLMA:
-            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
                audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                break;
-            case PROJECTOR_TYPE_LFM2A:
-                audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
-                break;
            default:
                GGML_ABORT("unsupported audio projector type");
        }
@@ -363,9 +358,6 @@ struct mtmd_context {
            // [BEGIN_AUDIO] ... (embeddings) ...
            aud_beg = "[BEGIN_AUDIO]";

-        } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
-            // <sound> ... (embeddings) ...
-            aud_beg = "<sound>";
        }
    }

@@ -872,15 +864,10 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
 }

 bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
-    switch (ctx->proj_type_v()) {
-        case PROJECTOR_TYPE_QWEN2VL:
-        case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_QWEN3VL:
-        case PROJECTOR_TYPE_YOUTUVL:
-            return true;
-        default:
-            return false;
+    if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
+        return true;
    }
+    return false;
 }

 bool mtmd_decode_use_mrope(mtmd_context * ctx) {
--- a/llama/llama.cpp/tools/mtmd/mtmd.h
+++ b/llama/llama.cpp/tools/mtmd/mtmd.h
@@ -27,9 +27,6 @@
 * - Make sure the C API is aligned with the libllama C API (as in llama.h)
 * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
 * - Keep the API minimal, do not expose internal details unless necessary
- *
- * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
- * We encourage human contributors to ensure the quality and reliability of the codebase.
 */

 #ifdef LLAMA_SHARED
--- a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
+++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
@@ -23,7 +23,7 @@ problem.
 8 files changed, 21 insertions(+), 2 deletions(-)

 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index 1b59924b8..a8a61b1e2 100644
+index 8547ecc84..9f37ca70c 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
@@ -112,7 +112,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -64,7 +64,7 @@ index 1b59924b8..a8a61b1e2 100644
     /* .init_tensor     = */ NULL, // no initialization required
     /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
 diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
-index d7a93848d..51bf8bc55 100644
+index da624c587..efc63e092 100644
 --- a/ggml/src/ggml-cann/ggml-cann.cpp
 +++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -831,6 +831,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
@@ -84,7 +84,7 @@ index d7a93848d..51bf8bc55 100644
 
 /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index f021de1d7..9f3cb06ad 100644
+index ab0f6fe9c..6519af435 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -583,6 +583,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -112,7 +112,7 @@ index f021de1d7..9f3cb06ad 100644
 
 static void * ggml_cuda_host_malloc(size_t size) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
-index 56b59f0af..790cabca0 100644
+index 70bf6f3d9..f2b7fe692 100644
 --- a/ggml/src/ggml-metal/ggml-metal.cpp
 +++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
@@ -132,10 +132,10 @@ index 56b59f0af..790cabca0 100644
 
 static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
-index 608ba1e55..d010ca00e 100644
+index 0d37587f6..ff373d413 100644
 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
 +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
-@@ -3499,6 +3499,7 @@ struct ggml_backend_opencl_buffer_context {
+@@ -3417,6 +3417,7 @@ struct ggml_backend_opencl_buffer_context {
 static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
     delete ctx;
@@ -144,10 +144,10 @@ index 608ba1e55..d010ca00e 100644
 
 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index d7c8ad8c1..281fa1bdb 100644
+index 18a45d2d9..89041805e 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -557,6 +557,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -556,6 +556,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
     RPC_STATUS_ASSERT(status);
     delete ctx;
@@ -156,7 +156,7 @@ index d7c8ad8c1..281fa1bdb 100644
 
 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 8f8176b67..5f8f3c210 100644
+index e996d98be..84b679315 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -356,6 +356,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -184,10 +184,10 @@ index 8f8176b67..5f8f3c210 100644
 
 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index b1a51a436..885864111 100644
+index 34ec09d40..120191ca0 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -12751,6 +12751,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -12365,6 +12365,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     ggml_vk_destroy_buffer(ctx->dev_buffer);
     delete ctx;
@@ -195,7 +195,7 @@ index b1a51a436..885864111 100644
 }
 
 static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -12894,6 +12895,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -12508,6 +12509,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
 static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
     ggml_vk_host_free(vk_instance.devices[0], buffer->context);
--- a/llama/patches/0002-pretokenizer.patch
+++ b/llama/patches/0002-pretokenizer.patch
@@ -10,10 +10,10 @@ logs instead of throwing an error
 1 file changed, 3 insertions(+), 11 deletions(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index a20c6525e..09874b67a 100644
+index 7b01a2edf..63250cdf1 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -1832,16 +1832,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         if (type == LLAMA_VOCAB_TYPE_BPE) {
             add_space_prefix = false;
             clean_spaces = true;
@@ -31,8 +31,8 @@ index a20c6525e..09874b67a 100644
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -2032,7 +2023,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
-                 pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
+@@ -2015,7 +2006,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+                 pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
                 clean_spaces = false;
             } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
--- a/llama/patches/0003-clip-unicode.patch
+++ b/llama/patches/0003-clip-unicode.patch
@@ -10,7 +10,7 @@ filesystems for paths that include wide characters
 1 file changed, 39 insertions(+)

 diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
-index 97c83de5f..26710548e 100644
+index 35e3aef0a..84a3796b5 100644
 --- a/tools/mtmd/clip.cpp
 +++ b/tools/mtmd/clip.cpp
@@ -24,6 +24,19 @@
@@ -33,7 +33,7 @@ index 97c83de5f..26710548e 100644
 struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
 
 //#define CLIP_DEBUG_FUNCTIONS
-@@ -1842,7 +1855,29 @@ struct clip_model_loader {
+@@ -1619,7 +1632,29 @@ struct clip_model_loader {
         {
             std::vector<uint8_t> read_buf;
 
@@ -63,7 +63,7 @@ index 97c83de5f..26710548e 100644
             if (!fin) {
                 throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
             }
-@@ -1869,7 +1904,11 @@ struct clip_model_loader {
+@@ -1646,7 +1681,11 @@ struct clip_model_loader {
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
                 }
             }
--- a/llama/patches/0004-solar-pro.patch
+++ b/llama/patches/0004-solar-pro.patch
@@ -19,10 +19,10 @@ adds support for the Solar Pro architecture
 create mode 100644 src/models/solar.cpp

 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
-index b0932794d..12e14f55c 100644
+index 4192af7c0..bd44d73e7 100644
 --- a/src/CMakeLists.txt
 +++ b/src/CMakeLists.txt
-@@ -129,6 +129,7 @@ add_library(llama
+@@ -125,6 +125,7 @@ add_library(llama
             models/seed-oss.cpp
             models/smallthinker.cpp
             models/smollm3.cpp
@@ -31,10 +31,10 @@ index b0932794d..12e14f55c 100644
             models/starcoder.cpp
             models/starcoder2.cpp
 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 2ead96546..36855e408 100644
+index 8caf80afc..2ce8ffec0 100644
 --- a/src/llama-arch.cpp
 +++ b/src/llama-arch.cpp
-@@ -89,6 +89,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -87,6 +87,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE_MOE,      "granitemoe"       },
     { LLM_ARCH_GRANITE_HYBRID,   "granitehybrid"    },
     { LLM_ARCH_CHAMELEON,        "chameleon"        },
@@ -42,7 +42,7 @@ index 2ead96546..36855e408 100644
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
     { LLM_ARCH_PLM,              "plm"              },
     { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
-@@ -215,6 +216,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+@@ -208,6 +209,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_OUTPUT_SCALE,                 "%s.attention.output_scale"                 },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,           "%s.attention.temperature_length"           },
     { LLM_KV_ATTENTION_TEMPERATURE_SCALE,            "%s.attention.temperature_scale"            },
@@ -50,7 +50,7 @@ index 2ead96546..36855e408 100644
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA,               "%s.attention.key_length_mla"               },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,             "%s.attention.value_length_mla"             },
 
-@@ -347,6 +349,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
+@@ -339,6 +341,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_ATTN_QKV,                               "blk.%d.attn_qkv" },
     { LLM_TENSOR_LAYER_OUT_NORM,                         "blk.%d.layer_output_norm" },
     { LLM_TENSOR_ATTN_OUT_NORM,                          "blk.%d.attn_output_norm" },
@@ -58,9 +58,9 @@ index 2ead96546..36855e408 100644
     { LLM_TENSOR_POS_EMBD,                               "position_embd" },
     { LLM_TENSOR_FFN_ACT,                                "blk.%d.ffn.act" },
     { LLM_TENSOR_TOKEN_EMBD_NORM,                        "token_embd_norm" },
-@@ -2254,6 +2257,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
-                 LLM_TENSOR_FFN_DOWN,
-                 LLM_TENSOR_FFN_UP,
+@@ -2176,6 +2179,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
+             return {
+                 LLM_TENSOR_TOKEN_EMBD,
             };
 +        case LLM_ARCH_SOLAR:
 +            return {
@@ -81,7 +81,7 @@ index 2ead96546..36855e408 100644
         default:
             GGML_ABORT("unknown architecture for tensor mapping");
     }
-@@ -2422,6 +2441,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -2344,6 +2363,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_LAUREL_POST_NORM,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -90,10 +90,10 @@ index 2ead96546..36855e408 100644
     {LLM_TENSOR_POS_NET_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_POS_NET_NORM1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 68ec6a18b..7fab2fa93 100644
+index 6cbf9b1f8..14d461c76 100644
 --- a/src/llama-arch.h
 +++ b/src/llama-arch.h
-@@ -93,6 +93,7 @@ enum llm_arch {
+@@ -91,6 +91,7 @@ enum llm_arch {
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_GRANITE_HYBRID,
     LLM_ARCH_CHAMELEON,
@@ -101,7 +101,7 @@ index 68ec6a18b..7fab2fa93 100644
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
-@@ -219,6 +220,7 @@ enum llm_kv {
+@@ -212,6 +213,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
     LLM_KV_ATTENTION_TEMPERATURE_SCALE,
@@ -109,7 +109,7 @@ index 68ec6a18b..7fab2fa93 100644
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
-@@ -473,6 +475,7 @@ enum llm_tensor {
+@@ -465,6 +467,7 @@ enum llm_tensor {
     LLM_TENSOR_ENC_OUTPUT_NORM,
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
@@ -118,10 +118,10 @@ index 68ec6a18b..7fab2fa93 100644
     LLM_TENSOR_CONVNEXT_DW,
     LLM_TENSOR_CONVNEXT_NORM,
 diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index c847ef91b..700e4bcf5 100644
+index fe1fa4341..aabff2f06 100644
 --- a/src/llama-hparams.cpp
 +++ b/src/llama-hparams.cpp
-@@ -167,6 +167,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
+@@ -163,6 +163,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
     return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
 }
 
@@ -137,7 +137,7 @@ index c847ef91b..700e4bcf5 100644
     if (il < n_layer) {
         return swa_layers[il];
 diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index 7ae3ec292..cc2ce7f15 100644
+index f6e95b5d2..c6e673276 100644
 --- a/src/llama-hparams.h
 +++ b/src/llama-hparams.h
@@ -65,6 +65,8 @@ struct llama_hparams {
@@ -149,7 +149,7 @@ index 7ae3ec292..cc2ce7f15 100644
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q           = 0;
     uint32_t n_lora_kv          = 0;
-@@ -266,6 +268,9 @@ struct llama_hparams {
+@@ -259,6 +261,9 @@ struct llama_hparams {
 
     uint32_t n_pos_per_embd() const;
 
@@ -160,10 +160,10 @@ index 7ae3ec292..cc2ce7f15 100644
 
     bool has_kv(uint32_t il) const;
 diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index e66febaa0..cf200c61f 100644
+index ca2ea2461..8916a6242 100644
 --- a/src/llama-model-loader.cpp
 +++ b/src/llama-model-loader.cpp
-@@ -489,7 +489,7 @@ namespace GGUFMeta {
+@@ -466,7 +466,7 @@ namespace GGUFMeta {
     template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
@@ -173,10 +173,10 @@ index e66febaa0..cf200c61f 100644
 llama_model_loader::llama_model_loader(
         const std::string & fname,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 5de6493b9..d3c60d418 100644
+index ae8207ee1..00cd579e0 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -2074,6 +2074,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1995,6 +1995,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                }
             } break;
@@ -198,7 +198,7 @@ index 5de6493b9..d3c60d418 100644
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
-@@ -5626,6 +5641,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -5429,6 +5444,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
@@ -233,7 +233,7 @@ index 5de6493b9..d3c60d418 100644
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                         layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-@@ -7838,6 +7881,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
+@@ -7534,6 +7577,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_chameleon>(*this, params);
             } break;
@@ -244,7 +244,7 @@ index 5de6493b9..d3c60d418 100644
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
-@@ -8116,6 +8163,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -7798,6 +7845,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_GRANITE_HYBRID:
         case LLM_ARCH_CHAMELEON:
@@ -253,10 +253,10 @@ index 5de6493b9..d3c60d418 100644
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_SMOLLM3:
 diff --git a/src/llama-model.h b/src/llama-model.h
-index 79200a0d9..740cb7094 100644
+index c6eb95318..b378b23ec 100644
 --- a/src/llama-model.h
 +++ b/src/llama-model.h
-@@ -79,6 +79,7 @@ enum llm_type {
+@@ -76,6 +76,7 @@ enum llm_type {
     LLM_TYPE_15B,
     LLM_TYPE_16B,
     LLM_TYPE_20B,
@@ -264,7 +264,7 @@ index 79200a0d9..740cb7094 100644
     LLM_TYPE_26B,
     LLM_TYPE_27B,
     LLM_TYPE_30B,
-@@ -410,6 +411,8 @@ struct llama_layer {
+@@ -405,6 +406,8 @@ struct llama_layer {
     struct ggml_tensor * ffn_act_beta    = nullptr;
     struct ggml_tensor * ffn_act_eps     = nullptr;
 
@@ -274,10 +274,10 @@ index 79200a0d9..740cb7094 100644
 
     struct llama_layer_convnext convnext;
 diff --git a/src/models/models.h b/src/models/models.h
-index 72b2b760c..4e2162c77 100644
+index ffb36acc6..6d84a185d 100644
 --- a/src/models/models.h
 +++ b/src/models/models.h
-@@ -533,6 +533,11 @@ struct llm_build_smollm3 : public llm_graph_context {
+@@ -515,6 +515,11 @@ struct llm_build_smollm3 : public llm_graph_context {
     llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
 };
 
--- a/llama/patches/0005-fix-deepseek-deseret-regex.patch
+++ b/llama/patches/0005-fix-deepseek-deseret-regex.patch
@@ -12,7 +12,7 @@ regex
 2 files changed, 22 insertions(+), 1 deletion(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 09874b67a..0049d59bf 100644
+index 63250cdf1..dd86a1745 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -25,7 +25,7 @@ index 09874b67a..0049d59bf 100644
                     "\\s+$",
                     "[一-龥ࠀ-一가-퟿]+",
 diff --git a/src/unicode.cpp b/src/unicode.cpp
-index b47dcbe61..6d1084f26 100644
+index bb44edfad..13ced055f 100644
 --- a/src/unicode.cpp
 +++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
--- a/llama/patches/0008-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
+++ b/llama/patches/0008-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
 1 file changed, 2 insertions(+)

 diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index 6192a8704..993ec027f 100644
+index 4c04c3300..f4747f262 100644
 --- a/ggml/src/CMakeLists.txt
 +++ b/ggml/src/CMakeLists.txt
@@ -345,6 +345,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -26,4 +26,4 @@ index 6192a8704..993ec027f 100644
 +    add_custom_target(ggml-cpu)
     if (GGML_SYSTEM_ARCH STREQUAL "x86")
         ggml_add_cpu_backend_variant(x64)
-         ggml_add_cpu_backend_variant(sse42              SSE42)
+         ggml_add_cpu_backend_variant(sse42        SSE42)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
jmorganca	f1c7d8718f	refactor: consolidate imagegen default constants Add defaults.go with DefaultWidth and DefaultHeight constants. Steps are model-dependent and now handled by each model: - Z-Image uses 9 steps (turbo model) - Other models define their own defaults Updated all files using hardcoded 1024x1024 values to use constants.	2026-01-15 00:42:53 -08:00
jmorganca	9d07e26b62	fix: capture multiple stderr lines for better error reporting Previously only the last stderr line was stored, which often missed important context when the subprocess failed. Now stores up to 10 recent stderr lines and joins them in error messages, providing more complete diagnostic information.	2026-01-14 22:27:37 -08:00
jmorganca	8485b6546e	docs: clarify no-op interface methods in imagegen server Add explanatory comments for methods that are required by the llm.LlamaServer interface but don't apply to image generation: - Load: models loaded by subprocess at startup - WaitUntilRunning: NewServer already blocks until ready - Embedding/Tokenize/Detokenize: not applicable to image gen - GetDeviceInfos: GPU tracking handled by subprocess	2026-01-14 22:25:24 -08:00
jmorganca	d9ae425d54	refactor: remove unused HasTensorLayers wrapper function The package-level HasTensorLayers(modelName string) was just a thin wrapper around ResolveModelName(modelName) != "". Callers can use ResolveModelName directly. The ModelManifest.HasTensorLayers() method remains for checking if a manifest has tensor layers.	2026-01-14 22:21:49 -08:00
jmorganca	9e1d79ac67	refactor: consolidate imagegen default constants Add defaults.go with DefaultWidth, DefaultHeight, and DefaultSteps constants. Update all files using hardcoded 1024/1024/9 defaults to use these constants instead: - server.go: completion request defaults - cli.go: DefaultOptions and flag defaults - runner/runner.go: request defaults - api/handler.go: size parsing defaults - cmd/engine/main.go: flag defaults - models/zimage/zimage.go: generate config defaults - models/qwen_image/qwen_image.go: width/height defaults	2026-01-14 22:21:00 -08:00
jmorganca	7273d9925e	fix: log errors in imagegen generate handler instead of discarding Previously errors from runner.Completion were silently discarded with `_ = err`. Now they are properly logged with slog.Error for debugging.	2026-01-14 22:18:03 -08:00
jmorganca	4896240528	fix: imagegen models now properly expire after keepalive timeout The refCount was incorrectly initialized to 1 in loadImageGen, causing the reference count to be 2 after useLoadedRunner incremented it. When requests completed, refCount would decrement to 1 (not 0), so the expiration timer was never set up and models stayed loaded forever. Removing the explicit refCount:1 lets it default to 0, matching the behavior of regular LLM model loading.	2026-01-14 18:02:10 -08:00
vincent d warmerdam	349d814814	docs: add marimo integration (#13326 ) * docs added * fix title * add marimo to docs.json --------- Co-authored-by: Devon Rifkin <drifkin@drifkin.net>	2026-01-14 17:37:38 -08:00
Yuhong Sun	c8743031e0	docs: add onyx integration (#13135 ) * Ready for team review * Update docs/integrations/onyx.mdx Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> * update docs.json --------- Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> Co-authored-by: Devon Rifkin <drifkin@drifkin.net>	2026-01-14 17:32:05 -08:00
Jeffrey Morgan	4adb9cf4bb	scripts: fix macOS auto-update signature verification failure (#13713 ) Add --norsrc flag to ditto commands when creating Ollama-darwin.zip to exclude AppleDouble resource fork files (._* files) from the archive. The mlx.metallib file has extended attributes, which causes ditto to include a ._mlx.metallib AppleDouble file in the zip. Since this file is not part of the code signature seal, macOS rejects the bundle during auto-update verification with: "a sealed resource is missing or invalid" "file added: .../._mlx.metallib" The --norsrc flag prevents ditto from preserving resource forks and extended attributes, ensuring only signed files are included in the release archive.	2026-01-14 07:48:10 -08:00
Daniel Hiltgen	74f475e735	Revert "Documentation edits made through Mintlify web editor" (#13688 ) This reverts commit `c6d4c0c7f2`. Merge after 0.14.0 ships for the updated Linux documentation.	2026-01-14 07:42:34 -08:00
Maternion	875cecba74	docs: update default context window size to 4096 tokens (#13709 )	2026-01-14 01:01:28 -08:00
Josh Daniel Bañares	7d411a4686	docs: update web search param in examples (#13711 )	2026-01-14 00:38:39 -08:00
Daniel Hiltgen	02a2401596	mlx: bundle openblas dependency (#13706 )	2026-01-13 15:29:47 -08:00
Daniel Hiltgen	e4b488a7b5	CI: dedup cuda libraries to reduce payload size (#13704 )	2026-01-13 11:25:31 -08:00
Daniel Hiltgen	98079ddd79	ci: add missing mlx components to release build (#13702 )	2026-01-13 09:13:09 -08:00
Jeffrey Morgan	d70942f47b	x/imagegen/cli: skip local model check (#13699 )	2026-01-12 22:38:10 -08:00
Jeffrey Morgan	58e4701557	scripts: increase notarization timeout to 20m (#13697 ) The 100MB mlx.metallib file significantly increased the app bundle size, causing Apple's notarization service to timeout with the previous 10m limit.	2026-01-12 20:38:38 -08:00
Jeffrey Morgan	dbf47ee55a	cmake: use CMAKE_SYSTEM_PROCESSOR instead of CMAKE_OSX_ARCHITECTURES for mlx.metallib install (#13696 ) The CMake condition for installing mlx.metallib checks CMAKE_OSX_ARCHITECTURES, but this variable is only set when explicitly passed - not auto-detected. The arm64 build was missing this flag, causing the metallib to not be installed, which then caused codesign to fail on the unexpanded glob pattern.	2026-01-12 20:05:11 -08:00
Jeffrey Morgan	af7ea6e96e	x/imagegen: install mlx.metallib and fix macOS rpath handling, add mlx library directories to LD_LIBRARY_PATH (#13695 ) - Install mlx.metallib for arm64 builds (required for Metal GPU acceleration) - Apply rpath settings to all macOS builds, not just x86_64 - Add CMAKE_BUILD_WITH_INSTALL_RPATH to avoid install_name_tool errors - Update build_darwin.sh to copy, sign, and package the metallib	2026-01-12 19:03:11 -08:00
Jeffrey Morgan	8f1e0140e7	x/imagegen: fix mlx build in Dockerfile and macOS build script (#13693 )	2026-01-12 15:52:43 -08:00
Parth Sareen	35c3c9e3c2	anthropic: allow non-thinking models when using Anthropic API (#13692 )	2026-01-12 15:13:26 -08:00
Parth Sareen	d06acbcb19	x/cmd: enable web search and web fetch with flag (#13690 )	2026-01-12 13:59:40 -08:00
Jeffrey Morgan	9667c2282f	x/imagegen: add naive TeaCache and FP8 quantization support (#13683 ) TeaCache: - Timestep embedding similarity caching for diffusion models - Polynomial rescaling with configurable thresholds - Reduces transformer forward passes by ~30-50% FP8 quantization: - Support for FP8 quantized models (8-bit weights with scales) - QuantizedMatmul on Metal, Dequantize on CUDA - Client-side quantization via ollama create --quantize fp8 Other bug fixes: - Fix `/api/show` API for image generation models - Server properly returns model info (architecture, parameters, quantization) - Memory allocation optimizations - CLI improvements for image generation	2026-01-12 13:45:22 -08:00
Jeffrey Morgan	a937a68317	server: fix slow 'ollama rm' of models with many layers (#13680 ) RemoveLayers was calling Manifests() for each layer to check if it was shared with other models. For models with many blobs (e.g., tensor models), this caused O(N*M) manifest reads. Now loads manifests once and builds a set of in-use digests.	2026-01-12 13:17:48 -08:00
Parth Sareen	2185112d84	x/cmd: connect /set flags to behavior in experimental mode (#13684 )	2026-01-12 00:40:44 -08:00
Parth Sareen	91926601dc	x: add missing /set, /show, /load, /save commands to experimental mode (#13682 )	2026-01-11 23:12:31 -08:00
Jeffrey Morgan	361d6c16c2	x/imagegen/transfer: fix timeout and progress reporting (#13679 ) Removes 5-minute HTTP client timeout that caused "context deadline exceeded" errors on large file downloads. Stall detection (10s) already handles unresponsive connections. Fixes progress bar total going down on resume by calculating total from all blobs upfront and reporting already-downloaded bytes as completed immediately.	2026-01-11 15:33:53 -08:00
Patrick Devine	7e2496e88e	Fix cmake install command in README (#13678 ) Update installation command for MLX component in README.	2026-01-11 13:16:42 -08:00
WhatToPutHere	5b84e29882	docs: fix troubleshooting page (#13674 ) Updated the link in the log output description to point to the correct troubleshooting guide format.	2026-01-11 00:58:07 -08:00