wip?

server: add python tool parsing logic
2026-01-15 19:09:25 -05:00 · 2025-05-07 19:00:44 -07:00 · 2025-05-02 16:23:54 -07:00
237 changed files with 9262 additions and 17391 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -103,6 +103,11 @@ jobs:
        arch: [amd64]
        preset: ['CPU']
        include:
+          - os: windows
+            arch: amd64
+            preset: 'CUDA 11'
+            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+            cuda-version: '11.3'
          - os: windows
            arch: amd64
            preset: 'CUDA 12'
@@ -319,6 +324,7 @@ jobs:
            case "$COMPONENT" in
              bin/ollama)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/*.so)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_v11)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_v12)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
              lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
@@ -426,22 +432,6 @@ jobs:
          docker buildx imagetools inspect ollama/ollama:${{ steps.metadata.outputs.version }}
        working-directory: ${{ runner.temp }}

-  # Trigger downstream release process
-  trigger:
-    runs-on: ubuntu-latest
-    environment: release
-    needs: [darwin-build, windows-build, windows-depends]
-    steps:
-      - name: Trigger downstream release process
-        run: |
-          curl -L \
-            -X POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \
-            -d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\"}}"
-
  # Aggregate all the assets and ship a release
  release:
    needs: [darwin-sign, windows-sign, linux-build]
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -46,7 +46,7 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            container: nvidia/cuda:12.8.1-devel-ubuntu22.04
+            container: nvidia/cuda:11.8.0-devel-ubuntu22.04
            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
          - preset: ROCm
            container: rocm/dev-ubuntu-22.04:6.1.2
@@ -78,7 +78,7 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
+            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
            flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
          - preset: ROCm
            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
@@ -102,7 +102,7 @@ jobs:
          $ErrorActionPreference = "Stop"
          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
+            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
          }

          $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -19,8 +19,8 @@ linters:
    - nolintlint
    - nosprintfhostport
    - staticcheck
+    - tenv
    - unconvert
-    - usetesting
    - wastedassign
    - whitespace
  disable:
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -17,6 +17,14 @@
      "name": "CUDA",
      "inherits": [ "Default" ]
    },
+    {
+      "name": "CUDA 11",
+      "inherits": [ "CUDA" ],
+      "cacheVariables": {
+        "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
+        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
+      }
+    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
@@ -70,6 +78,11 @@
      "configurePreset": "CUDA",
      "targets": [ "ggml-cuda" ]
    },
+    {
+      "name": "CUDA 11",
+      "inherits": [ "CUDA" ],
+      "configurePreset": "CUDA 11"
+    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,10 +7,14 @@ ARG JETPACK5VERSION=r35.4.1
 ARG JETPACK6VERSION=r36.4.0
 ARG CMAKEVERSION=3.31.2

+# CUDA v11 requires gcc v10. v10.3 has regressions, so use the Rocky Linux 8.5 AppStream repo, which has the latest compatible version
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
 RUN yum install -y yum-utils \
-    && dnf install -y ccache \
+    && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
+    && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
+    && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
+ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH

 FROM --platform=linux/arm64 almalinux:8 AS base-arm64
 # install epel-release for ccache
@@ -34,6 +38,15 @@ RUN --mount=type=cache,target=/root/.ccache \
        && cmake --build --parallel --preset 'CPU' \
        && cmake --install build --component CPU --strip --parallel 8

+FROM base AS cuda-11
+ARG CUDA11VERSION=11.3
+RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
+ENV PATH=/usr/local/cuda-11/bin:$PATH
+RUN --mount=type=cache,target=/root/.ccache \
+    cmake --preset 'CUDA 11' \
+        && cmake --build --parallel --preset 'CUDA 11' \
+        && cmake --install build --component CUDA --strip --parallel 8
+
 FROM base AS cuda-12
 ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
@@ -85,9 +98,11 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -trimpath -buildmode=pie -o /bin/ollama .

 FROM --platform=linux/amd64 scratch AS amd64
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12

 FROM --platform=linux/arm64 scratch AS arm64
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
 COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
 COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
--- a/Makefile.sync
+++ b/Makefile.sync
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=de4c07f93783a1a96456a44dc16b9db538ee1618
+FETCH_HEAD=2016f07bd106c73699ecbaace80f55db5ed95dac

 .PHONY: help
 help:
--- a/README.md
+++ b/README.md
@@ -61,8 +61,6 @@ Here are some example models that can be downloaded:
 | QwQ                | 32B        | 20GB  | `ollama run qwq`                 |
 | DeepSeek-R1        | 7B         | 4.7GB | `ollama run deepseek-r1`         |
 | DeepSeek-R1        | 671B       | 404GB | `ollama run deepseek-r1:671b`    |
-| Llama 4            | 109B       | 67GB  | `ollama run llama4:scout`        |
-| Llama 4            | 400B       | 245GB | `ollama run llama4:maverick`     |
 | Llama 3.3          | 70B        | 43GB  | `ollama run llama3.3`            |
 | Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`            |
 | Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`         |
@@ -79,7 +77,7 @@ Here are some example models that can be downloaded:
 | Code Llama         | 7B         | 3.8GB | `ollama run codellama`           |
 | Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored`   |
 | LLaVA              | 7B         | 4.5GB | `ollama run llava`               |
-| Granite-3.3         | 8B         | 4.9GB | `ollama run granite3.3`          |
+| Granite-3.2         | 8B         | 4.9GB | `ollama run granite3.2`          |

 > [!NOTE]
 > You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -287,7 +285,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
 - [HTML UI](https://github.com/rtcfirefly/ollama-ui)
 - [Saddle](https://github.com/jikkuatwork/saddle)
- [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
+- [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
 - [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
 - [Chatbot UI v2](https://github.com/mckaywrigley/chatbot-ui)
 - [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
@@ -314,8 +312,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
 - [IntelliBar](https://intellibar.app/) (AI-powered assistant for macOS)
- [Jirapt](https://github.com/AliAhmedNada/jirapt) (Jira Integration to generate issues, tasks, epics)
- [ojira](https://github.com/AliAhmedNada/ojira) (Jira chrome plugin to easily generate descriptions for tasks)
 - [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories)
 - [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
 - [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
@@ -329,14 +325,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [RWKV-Runner](https://github.com/josStorer/RWKV-Runner) (RWKV offline LLM deployment tool, also usable as a client for ChatGPT and Ollama)
 - [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models)
 - [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support, and multiple large language models.)
+- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support and multiple large language models.)
 - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
 - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
 - [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG)
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord)
+- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord)
 - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
 - [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine)
- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy-to-use GUI with sample custom LLM for Drivers Education)
+- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy-to-use GUI with sample custom LLM for Drivers Education)
 - [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application)
 - [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
 - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
@@ -345,16 +341,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
 - [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
 - [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows, and Mac)
- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for Linux and macOS made with GTK4 and Adwaita)
+- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac)
+- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for Linux and macOS made with GTK4 and Adwaita)
 - [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration)
 - [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
 - [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j
+- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
 - [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
 - [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VSCode extension for multi-file/whole-repo coding
 - [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
+- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
 - [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
 - [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
 - [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
@@ -372,7 +368,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface)
 - [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol)
 - [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app)
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard, and said in the meetings)
+- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings)
 - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
 - [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
 - [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI)
@@ -390,7 +386,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
 - [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
 - [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally)
+- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally)
 - [AntSK](https://github.com/AIDotNet/AntSK) (Out-of-the-box & Adaptable RAG Chatbot)
 - [MaxKB](https://github.com/1Panel-dev/MaxKB/) (Ready-to-use & flexible RAG Chatbot)
 - [yla](https://github.com/danielekp/yla) (Web interface to freely interact with your customized models)
@@ -398,13 +394,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [1Panel](https://github.com/1Panel-dev/1Panel/) (Web-based Linux Server Management Tool)
 - [AstrBot](https://github.com/Soulter/AstrBot/) (User-friendly LLM-based multi-platform chatbot with a WebUI, supporting RAG, LLM agents, and plugins integration)
 - [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)
- [Flufy](https://github.com/Aharon-Bensadoun/Flufy) (A beautiful chat interface for interacting with Ollama's API. Built with React, TypeScript, and Material-UI.)
 - [Ellama](https://github.com/zeozeozeo/ellama) (Friendly native app to chat with an Ollama instance)
 - [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history
 - [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).)
 - [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
 - [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
- [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)

 ### Cloud

@@ -446,7 +440,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama
 - [DeepShell](https://github.com/Abyss-c0re/deepshell) Your self-hosted AI assistant. Interactive Shell, Files and Folders analysis.
 - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
+- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull and download models from Ollama Registry in your terminal.
 - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)

 ### Apple Vision Pro
@@ -474,7 +468,7 @@ See the [API documentation](./docs/api.md) for all endpoints.

 ### Libraries

- [LangChain](https://python.langchain.com/docs/integrations/chat/ollama/) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
+- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
 - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
 - [crewAI](https://github.com/crewAIInc/crewAI)
 - [Yacana](https://remembersoftwares.github.io/yacana/) (User-friendly multi-agent framework for brainstorming and executing predetermined flows with built-in tool integration)
@@ -521,21 +515,20 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
 - [GoLamify](https://github.com/prasad89/golamify)
 - [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
+- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
 - [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
 - [Ollama for Zig](https://github.com/dravenk/ollama-zig)
 - [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
 - [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
 - [Ollama for D](https://github.com/kassane/ollama-d)
- [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)

 ### Mobile

- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS, and iPad)
+- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS and iPad)
 - [Enchanted](https://github.com/AugustDev/enchanted)
 - [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
 - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
+- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
 - [Ollama Android Chat](https://github.com/sunshine0523/OllamaServer) (No need for Termux, start the Ollama service with one click on an Android device)
 - [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)

@@ -559,7 +552,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
 - [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
 - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use Ollama as a copilot like GitHub Copilot)
+- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use Ollama as a copilot like GitHub Copilot)
 - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
 - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
 - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
@@ -569,8 +562,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 - [ChatGPTBox: All in one browser extension](https://github.com/josStorer/chatGPTBox) with [Integrating Tutorial](https://github.com/josStorer/chatGPTBox/issues/616#issuecomment-1975186467)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depend on ollama server)
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front-end Open WebUI service.)
+- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depend on ollama server)
+- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front-end Open WebUI service.)
 - [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama)
 - [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
 - [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.)
@@ -584,7 +577,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
 - [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
 - [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)

 ### Supported backends

--- a/api/client_test.go
+++ b/api/client_test.go
@@ -1,6 +1,7 @@
 package api

 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"net/http"
@@ -136,7 +137,7 @@ func TestClientStream(t *testing.T) {
 			client := NewClient(&url.URL{Scheme: "http", Host: ts.Listener.Addr().String()}, http.DefaultClient)

 			var receivedChunks []ChatResponse
-			err := client.stream(t.Context(), http.MethodPost, "/v1/chat", nil, func(chunk []byte) error {
+			err := client.stream(context.Background(), http.MethodPost, "/v1/chat", nil, func(chunk []byte) error {
 				var resp ChatResponse
 				if err := json.Unmarshal(chunk, &resp); err != nil {
 					return fmt.Errorf("failed to unmarshal chunk: %w", err)
@@ -222,7 +223,7 @@ func TestClientDo(t *testing.T) {
 				ID      string `json:"id"`
 				Success bool   `json:"success"`
 			}
-			err := client.do(t.Context(), http.MethodPost, "/v1/messages", nil, &resp)
+			err := client.do(context.Background(), http.MethodPost, "/v1/messages", nil, &resp)

 			if tc.wantErr != "" {
 				if err == nil {
--- a/api/types.go
+++ b/api/types.go
@@ -271,6 +271,9 @@ type Options struct {
 	RepeatPenalty    float32  `json:"repeat_penalty,omitempty"`
 	PresencePenalty  float32  `json:"presence_penalty,omitempty"`
 	FrequencyPenalty float32  `json:"frequency_penalty,omitempty"`
+	Mirostat         int      `json:"mirostat,omitempty"`
+	MirostatTau      float32  `json:"mirostat_tau,omitempty"`
+	MirostatEta      float32  `json:"mirostat_eta,omitempty"`
 	Stop             []string `json:"stop,omitempty"`
 }

@@ -280,7 +283,12 @@ type Runner struct {
 	NumBatch  int   `json:"num_batch,omitempty"`
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
+	LowVRAM   bool  `json:"low_vram,omitempty"`
+	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
+	LogitsAll bool  `json:"logits_all,omitempty"`
+	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
+	UseMLock  bool  `json:"use_mlock,omitempty"`
 	NumThread int   `json:"num_thread,omitempty"`
 }

@@ -463,6 +471,13 @@ type ProcessModelResponse struct {
 	SizeVRAM  int64        `json:"size_vram"`
 }

+type RetrieveModelResponse struct {
+	Id      string `json:"id"`
+	Object  string `json:"object"`
+	Created int64  `json:"created"`
+	OwnedBy string `json:"owned_by"`
+}
+
 type TokenResponse struct {
 	Token string `json:"token"`
 }
@@ -645,6 +660,9 @@ func DefaultOptions() Options {
 		RepeatPenalty:    1.1,
 		PresencePenalty:  0.0,
 		FrequencyPenalty: 0.0,
+		Mirostat:         0,
+		MirostatTau:      5.0,
+		MirostatEta:      0.1,
 		Seed:             -1,

 		Runner: Runner{
@@ -653,6 +671,8 @@ func DefaultOptions() Options {
 			NumBatch:  512,
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
+			LowVRAM:   false,
+			UseMLock:  false,
 			UseMMap:   nil,
 		},
 	}
--- a/app/lifecycle/logging.go
+++ b/app/lifecycle/logging.go
@@ -4,14 +4,20 @@ import (
 	"fmt"
 	"log/slog"
 	"os"
+	"path/filepath"
 	"strconv"
 	"strings"

 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/logutil"
 )

 func InitLogging() {
+	level := slog.LevelInfo
+
+	if envconfig.Debug() {
+		level = slog.LevelDebug
+	}
+
 	var logFile *os.File
 	var err error
 	// Detect if we're a GUI app on windows, and if not, send logs to console
@@ -27,8 +33,20 @@ func InitLogging() {
 			return
 		}
 	}
+	handler := slog.NewTextHandler(logFile, &slog.HandlerOptions{
+		Level:     level,
+		AddSource: true,
+		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
+			if attr.Key == slog.SourceKey {
+				source := attr.Value.Any().(*slog.Source)
+				source.File = filepath.Base(source.File)
+			}
+			return attr
+		},
+	})
+
+	slog.SetDefault(slog.New(handler))

-	slog.SetDefault(logutil.NewLogger(logFile, envconfig.LogLevel()))
 	slog.Info("ollama app started")
 }

--- a/benchmark/server_benchmark_test.go
+++ b/benchmark/server_benchmark_test.go
@@ -78,7 +78,7 @@ func BenchmarkColdStart(b *testing.B) {

 	for _, tt := range tests {
 		b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) {
-			ctx := b.Context()
+			ctx := context.Background()

 			// Set number of tokens as our throughput metric
 			b.SetBytes(int64(tt.maxTokens))
@@ -113,7 +113,7 @@ func BenchmarkWarmStart(b *testing.B) {

 	for _, tt := range tests {
 		b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) {
-			ctx := b.Context()
+			ctx := context.Background()

 			// Pre-warm the model
 			warmup(client, m, tt.prompt, b)
@@ -140,7 +140,7 @@ func setup(b *testing.B) *api.Client {
 	if err != nil {
 		b.Fatal(err)
 	}
-	if _, err := client.Show(b.Context(), &api.ShowRequest{Model: modelName(b)}); err != nil {
+	if _, err := client.Show(context.Background(), &api.ShowRequest{Model: modelName(b)}); err != nil {
 		b.Fatalf("Model unavailable: %v", err)
 	}

--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -31,7 +31,6 @@ import (
 	"github.com/olekukonko/tablewriter"
 	"github.com/spf13/cobra"
 	"golang.org/x/crypto/ssh"
-	"golang.org/x/sync/errgroup"
 	"golang.org/x/term"

 	"github.com/ollama/ollama/api"
@@ -42,7 +41,6 @@ import (
 	"github.com/ollama/ollama/runner"
 	"github.com/ollama/ollama/server"
 	"github.com/ollama/ollama/types/model"
-	"github.com/ollama/ollama/types/syncmap"
 	"github.com/ollama/ollama/version"
 )

@@ -108,7 +106,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	}
 	spinner.Stop()

-	req.Model = args[0]
+	req.Name = args[0]
 	quantize, _ := cmd.Flags().GetString("quantize")
 	if quantize != "" {
 		req.Quantize = quantize
@@ -119,54 +117,34 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}

-	var g errgroup.Group
-	g.SetLimit(max(runtime.GOMAXPROCS(0)-1, 1))
-
-	files := syncmap.NewSyncMap[string, string]()
-	for f, digest := range req.Files {
-		g.Go(func() error {
+	if len(req.Files) > 0 {
+		fileMap := map[string]string{}
+		for f, digest := range req.Files {
 			if _, err := createBlob(cmd, client, f, digest, p); err != nil {
 				return err
 			}
-
-			// TODO: this is incorrect since the file might be in a subdirectory
-			//       instead this should take the path relative to the model directory
-			//       but the current implementation does not allow this
-			files.Store(filepath.Base(f), digest)
-			return nil
-		})
+			fileMap[filepath.Base(f)] = digest
+		}
+		req.Files = fileMap
 	}

-	adapters := syncmap.NewSyncMap[string, string]()
-	for f, digest := range req.Adapters {
-		g.Go(func() error {
+	if len(req.Adapters) > 0 {
+		fileMap := map[string]string{}
+		for f, digest := range req.Adapters {
 			if _, err := createBlob(cmd, client, f, digest, p); err != nil {
 				return err
 			}
-
-			// TODO: same here
-			adapters.Store(filepath.Base(f), digest)
-			return nil
-		})
+			fileMap[filepath.Base(f)] = digest
+		}
+		req.Adapters = fileMap
 	}

-	if err := g.Wait(); err != nil {
-		return err
-	}
-
-	req.Files = files.Items()
-	req.Adapters = adapters.Items()
-
 	bars := make(map[string]*progress.Bar)
 	fn := func(resp api.ProgressResponse) error {
 		if resp.Digest != "" {
 			bar, ok := bars[resp.Digest]
 			if !ok {
-				msg := resp.Status
-				if msg == "" {
-					msg = fmt.Sprintf("pulling %s...", resp.Digest[7:19])
-				}
-				bar = progress.NewBar(msg, resp.Total, resp.Completed)
+				bar = progress.NewBar(fmt.Sprintf("pulling %s...", resp.Digest[7:19]), resp.Total, resp.Completed)
 				bars[resp.Digest] = bar
 				p.Add(resp.Digest, bar)
 			}
@@ -235,7 +213,7 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string, digest stri
 		}
 	}()

-	if err := client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil {
+	if err = client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil {
 		return "", err
 	}
 	return digest, nil
@@ -1429,6 +1407,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_LLM_LIBRARY"],
 				envVars["OLLAMA_GPU_OVERHEAD"],
 				envVars["OLLAMA_LOAD_TIMEOUT"],
+				envVars["OLLAMA_CONTEXT_LENGTH"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -2,6 +2,7 @@ package cmd

 import (
 	"bytes"
+	"context"
 	"encoding/json"
 	"io"
 	"net/http"
@@ -336,7 +337,7 @@ func TestDeleteHandler(t *testing.T) {
 	t.Cleanup(mockServer.Close)

 	cmd := &cobra.Command{}
-	cmd.SetContext(t.Context())
+	cmd.SetContext(context.TODO())
 	if err := DeleteHandler(cmd, []string{"test-model"}); err != nil {
 		t.Fatalf("DeleteHandler failed: %v", err)
 	}
@@ -398,6 +399,11 @@ func TestGetModelfileName(t *testing.T) {
 			var expectedFilename string

 			if tt.fileExists {
+				tempDir, err := os.MkdirTemp("", "modelfiledir")
+				defer os.RemoveAll(tempDir)
+				if err != nil {
+					t.Fatalf("temp modelfile dir creation failed: %v", err)
+				}
 				var fn string
 				if tt.modelfileName != "" {
 					fn = tt.modelfileName
@@ -405,7 +411,7 @@ func TestGetModelfileName(t *testing.T) {
 					fn = "Modelfile"
 				}

-				tempFile, err := os.CreateTemp(t.TempDir(), fn)
+				tempFile, err := os.CreateTemp(tempDir, fn)
 				if err != nil {
 					t.Fatalf("temp modelfile creation failed: %v", err)
 				}
@@ -524,7 +530,7 @@ func TestPushHandler(t *testing.T) {

 			cmd := &cobra.Command{}
 			cmd.Flags().Bool("insecure", false, "")
-			cmd.SetContext(t.Context())
+			cmd.SetContext(context.TODO())

 			// Redirect stderr to capture progress output
 			oldStderr := os.Stderr
@@ -629,7 +635,7 @@ func TestListHandler(t *testing.T) {
 			t.Setenv("OLLAMA_HOST", mockServer.URL)

 			cmd := &cobra.Command{}
-			cmd.SetContext(t.Context())
+			cmd.SetContext(context.TODO())

 			// Capture stdout
 			oldStdout := os.Stdout
@@ -684,7 +690,7 @@ func TestCreateHandler(t *testing.T) {
 						return
 					}

-					if req.Model != "test-model" {
+					if req.Name != "test-model" {
 						t.Errorf("expected model name 'test-model', got %s", req.Name)
 					}

@@ -724,7 +730,7 @@ func TestCreateHandler(t *testing.T) {
 			}))
 			t.Setenv("OLLAMA_HOST", mockServer.URL)
 			t.Cleanup(mockServer.Close)
-			tempFile, err := os.CreateTemp(t.TempDir(), "modelfile")
+			tempFile, err := os.CreateTemp("", "modelfile")
 			if err != nil {
 				t.Fatal(err)
 			}
@@ -744,7 +750,7 @@ func TestCreateHandler(t *testing.T) {
 			}

 			cmd.Flags().Bool("insecure", false, "")
-			cmd.SetContext(t.Context())
+			cmd.SetContext(context.TODO())

 			// Redirect stderr to capture progress output
 			oldStderr := os.Stderr
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -531,8 +531,6 @@ func extractFileData(input string) (string, []api.ImageData, error) {
 			return "", imgs, err
 		}
 		fmt.Fprintf(os.Stderr, "Added image '%s'\n", nfp)
-		input = strings.ReplaceAll(input, "'"+nfp+"'", "")
-		input = strings.ReplaceAll(input, "'"+fp+"'", "")
 		input = strings.ReplaceAll(input, fp, "")
 		imgs = append(imgs, data)
 	}
--- a/cmd/interactive_test.go
+++ b/cmd/interactive_test.go
@@ -1,8 +1,6 @@
 package cmd

 import (
-	"os"
-	"path/filepath"
 	"testing"

 	"github.com/stretchr/testify/assert"
@@ -52,24 +50,3 @@ d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8
 	assert.Contains(t, res[9], "ten.PNG")
 	assert.Contains(t, res[9], "E:")
 }
-
-// Ensure that file paths wrapped in single quotes are removed with the quotes.
-func TestExtractFileDataRemovesQuotedFilepath(t *testing.T) {
-	dir := t.TempDir()
-	fp := filepath.Join(dir, "img.jpg")
-	data := make([]byte, 600)
-	copy(data, []byte{
-		0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 'J', 'F', 'I', 'F',
-		0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0xff, 0xd9,
-	})
-	if err := os.WriteFile(fp, data, 0o600); err != nil {
-		t.Fatalf("failed to write test image: %v", err)
-	}
-
-	input := "before '" + fp + "' after"
-	cleaned, imgs, err := extractFileData(input)
-	assert.NoError(t, err)
-	assert.Len(t, imgs, 1)
-	assert.Equal(t, cleaned, "before  after")
-}
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -4,9 +4,9 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"io"
 	"io/fs"
 	"log/slog"
-	"os"
 	"slices"
 	"strings"

@@ -89,7 +89,7 @@ type ModelConverter interface {
 	// KV maps parameters to LLM key-values
 	KV(*Tokenizer) ggml.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
-	Tensors([]Tensor) []*ggml.Tensor
+	Tensors([]Tensor) []ggml.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
@@ -106,13 +106,13 @@ type AdapterConverter interface {
 	// KV maps parameters to LLM key-values
 	KV(ggml.KV) ggml.KV
 	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-	Tensors([]Tensor) []*ggml.Tensor
+	Tensors([]Tensor) []ggml.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
 }

-func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
 	bts, err := fs.ReadFile(fsys, "adapter_config.json")
 	if err != nil {
 		return err
@@ -147,14 +147,14 @@ func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
 		return err
 	}

-	return writeFile(f, conv.KV(baseKV), conv.Tensors(ts))
+	return writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
 }

 // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
 // and files it finds in the input path.
 // Supported input model formats include safetensors.
 // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func ConvertModel(fsys fs.FS, f *os.File) error {
+func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 	bts, err := fs.ReadFile(fsys, "config.json")
 	if err != nil {
 		return err
@@ -189,8 +189,6 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &phi3Model{}
 	case "Qwen2ForCausalLM":
 		conv = &qwen2Model{}
-	case "Qwen2_5_VLForConditionalGeneration":
-		conv = &qwen25VLModel{}
 	case "BertModel":
 		conv = &bertModel{}
 	case "CohereForCausalLM":
@@ -241,13 +239,13 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		return err
 	}

-	return writeFile(f, conv.KV(t), conv.Tensors(ts))
+	return writeFile(ws, conv.KV(t), conv.Tensors(ts))
 }

-func writeFile(f *os.File, kv ggml.KV, ts []*ggml.Tensor) error {
+func writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
 	for i := range ts {
 		ts[i].Shape = slices.Clone(ts[i].Shape)
 		slices.Reverse(ts[i].Shape)
 	}
-	return ggml.WriteGGUF(f, kv, ts)
+	return ggml.WriteGGUF(ws, kv, ts)
 }
--- a/convert/convert_bert.go
+++ b/convert/convert_bert.go
@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }

-func (p *bertModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
+func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
 		if slices.Contains([]string{
 			"embeddings.position_ids",
@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []*ggml.Tensor {
 			continue
 		}

-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_commandr.go
+++ b/convert/convert_commandr.go
@@ -43,10 +43,10 @@ func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }

-func (p *commandrModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
+func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_gemma.go
+++ b/convert/convert_gemma.go
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }

-func (p *gemmaModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
+func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
 		if !strings.HasPrefix(t.Name(), "v.") && strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}

-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_gemma2_adapter.go
+++ b/convert/convert_gemma2_adapter.go
@@ -21,8 +21,8 @@ func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
 	return kv
 }

-func (p *gemma2Adapter) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
+func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []*ggml.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@@ -126,11 +126,11 @@ func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }

-func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
+func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor

 	if p.RopeScaling.factors != nil {
-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     "rope_freqs.weight",
 			Kind:     0,
 			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
@@ -145,7 +145,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 			}
 		}

-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_llama4.go
+++ b/convert/convert_llama4.go
@@ -88,13 +88,13 @@ func (p *llama4Model) Replacements() []string {
 }

 // Tensors implements ModelConverter.
-func (p *llama4Model) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
+func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor

 	var textTensors []Tensor
 	for _, t := range ts {
 		if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
-			out = append(out, &ggml.Tensor{
+			out = append(out, ggml.Tensor{
 				Name:     t.Name(),
 				Kind:     t.Kind(),
 				Shape:    t.Shape(),
@@ -112,7 +112,7 @@ func (p *llama4Model) Tensors(ts []Tensor) []*ggml.Tensor {
 				// clone tensor since we need separate repackers
 				tt := t.Clone()
 				tt.SetRepacker(p.repack(nil, nil, tensor.S(i*halfDim, (i+1)*halfDim)))
-				out = append(out, &ggml.Tensor{
+				out = append(out, ggml.Tensor{
 					Name:     strings.ReplaceAll(tt.Name(), "ffn_gate_up_exps", name),
 					Kind:     tt.Kind(),
 					Shape:    newShape,
@@ -125,7 +125,7 @@ func (p *llama4Model) Tensors(ts []Tensor) []*ggml.Tensor {
 			t.SetRepacker(p.repack())
 			newShape := slices.Clone(t.Shape())
 			newShape[1], newShape[2] = newShape[2], newShape[1]
-			out = append(out, &ggml.Tensor{
+			out = append(out, ggml.Tensor{
 				Name:     t.Name(),
 				Kind:     t.Kind(),
 				Shape:    newShape,
--- a/convert/convert_llama_adapter.go
+++ b/convert/convert_llama_adapter.go
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
 	return kv
 }

-func (p *llamaAdapter) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
+func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []*ggml.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    shape,
--- a/convert/convert_mistral.go
+++ b/convert/convert_mistral.go
@@ -89,8 +89,8 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	return kv
 }

-func (p *mistral3Model) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
+func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor

 	for _, t := range ts {
 		if !strings.HasPrefix(t.Name(), "v.") {
@@ -100,7 +100,7 @@ func (p *mistral3Model) Tensors(ts []Tensor) []*ggml.Tensor {
 			}
 		}

-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }

-func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
 		return true
 	})

-	var out []*ggml.Tensor
+	var out []ggml.Tensor
 	for n, e := range experts {
 		// TODO(mxyng): sanity check experts
-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     n,
 			Kind:     e[0].Kind(),
 			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),
--- a/convert/convert_phi3.go
+++ b/convert/convert_phi3.go
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
 	return kv
 }

-func (p *phi3Model) Tensors(ts []Tensor) []*ggml.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
 	var addRopeFactors sync.Once

-	out := make([]*ggml.Tensor, 0, len(ts)+2)
+	out := make([]ggml.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
 		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
-				out = append(out, &ggml.Tensor{
+				out = append(out, ggml.Tensor{
 					Name:     "rope_factors_long.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
 					WriterTo: p.RopeScaling.LongFactor,
-				}, &ggml.Tensor{
+				}, ggml.Tensor{
 					Name:     "rope_factors_short.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []*ggml.Tensor {
 			})
 		}

-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_qwen2.go
+++ b/convert/convert_qwen2.go
@@ -15,7 +15,6 @@ type qwen2Model struct {
 		Type                          string     `json:"type"`
 		Factor                        ropeFactor `json:"factor"`
 		OriginalMaxPositionEmbeddings uint32     `json:"original_max_position_embeddings"`
-		MropeSection                  []int32    `json:"mrope_section"`
 	} `json:"rope_scaling"`
 	RMSNormEPS float32 `json:"rms_norm_eps"`
 }
@@ -40,18 +39,16 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
 	case "yarn":
 		kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type
 		kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor
-	case "mrope", "default":
-		kv["qwen2.rope.mrope_section"] = q.RopeScaling.MropeSection
 	default:
 		panic("unknown rope scaling type")
 	}
 	return kv
 }

-func (q *qwen2Model) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
+func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
-		out = append(out, &ggml.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_qwen25vl.go
+++ b/convert/convert_qwen25vl.go
@@ -1,102 +0,0 @@
-package convert
-
-import (
-	"cmp"
-	"slices"
-	"strings"
-
-	"github.com/ollama/ollama/fs/ggml"
-)
-
-type qwen25VLModel struct {
-	qwen2Model
-
-	VisionModel struct {
-		Depth               uint32  `json:"depth"`
-		HiddenSize          uint32  `json:"hidden_size"`
-		NumHeads            uint32  `json:"num_heads"`
-		InChannels          uint32  `json:"in_chans"`
-		PatchSize           uint32  `json:"patch_size"`
-		SpatialMergeSize    uint32  `json:"spatial_merge_size"`
-		SpatialPatchSize    uint32  `json:"spatial_patch_size"`
-		WindowSize          uint32  `json:"window_size"`
-		RMSNormEps          float32 `json:"layer_norm_epsilon"`
-		RopeTheta           float32 `json:"rope_theta"`
-		FullAttentionBlocks []int32 `json:"fullatt_block_indexes"`
-		TemporalPatchSize   uint32  `json:"temporal_patch_size"`
-	} `json:"vision_config"`
-}
-
-var _ ModelConverter = (*qwen25VLModel)(nil)
-
-func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
-	kv := q.ModelParameters.KV(t)
-	kv["general.architecture"] = "qwen25vl"
-
-	for k, v := range q.qwen2Model.KV(t) {
-		if strings.HasPrefix(k, "qwen2.") {
-			kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v
-		}
-	}
-
-	if q.VisionModel.FullAttentionBlocks == nil {
-		kv["qwen25vl.vision.fullatt_block_indexes"] = []int32{7, 15, 23, 31}
-	}
-
-	kv["qwen25vl.vision.block_count"] = cmp.Or(q.VisionModel.Depth, 32)
-	kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize
-	kv["qwen25vl.vision.attention.head_count"] = cmp.Or(q.VisionModel.NumHeads, 16)
-	kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels
-	kv["qwen25vl.vision.patch_size"] = cmp.Or(q.VisionModel.PatchSize, 14)
-	kv["qwen25vl.vision.spatial_merge_size"] = cmp.Or(q.VisionModel.SpatialMergeSize, 2)
-	kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize
-	kv["qwen25vl.vision.window_size"] = cmp.Or(q.VisionModel.WindowSize, 112)
-	kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
-	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
-	kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
-	kv["qwen25vl.vision.temporal_patch_size"] = cmp.Or(q.VisionModel.TemporalPatchSize, 2)
-
-	return kv
-}
-
-func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
-
-	for _, t := range ts {
-		if strings.Contains(t.Name(), "patch_embed.proj") {
-			for t := range splitDim(t, 2,
-				strings.NewReplacer("patch_embed.proj", "patch_embd_0"),
-				strings.NewReplacer("patch_embed.proj", "patch_embd_1"),
-			) {
-				t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
-				out = append(out, t)
-			}
-		} else if strings.Contains(t.Name(), "attn.qkv") {
-			out = append(out, slices.Collect(splitDim(t, 0,
-				strings.NewReplacer("attn.qkv", "attn_q"),
-				strings.NewReplacer("attn.qkv", "attn_k"),
-				strings.NewReplacer("attn.qkv", "attn_v"),
-			))...)
-		} else {
-			out = append(out, &ggml.Tensor{
-				Name:     t.Name(),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-		}
-	}
-
-	return out
-}
-
-func (p *qwen25VLModel) Replacements() []string {
-	return append(
-		p.qwen2Model.Replacements(),
-		"visual", "v",
-		"blocks", "blk",
-		"attn.proj", "attn_out",
-		"norm1", "ln1",
-		"norm2", "ln2",
-	)
-}
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -130,7 +130,6 @@ func TestConvertModel(t *testing.T) {
 			if err != nil {
 				t.Fatal(err)
 			}
-			defer expectFile.Close()

 			var expect map[string]string
 			if err := json.NewDecoder(expectFile).Decode(&expect); err != nil {
--- a/convert/fs.go
+++ b/convert/fs.go
@@ -0,0 +1,83 @@
+package convert
+
+import (
+	"archive/zip"
+	"errors"
+	"io"
+	"io/fs"
+	"os"
+	"path/filepath"
+)
+
+// ZipReader is an fs.FS view of a zip archive. Small entries are served
+// directly from the archive; entries of limit bytes or more are extracted
+// to disk under p on first access and served from there.
+type ZipReader struct {
+	r *zip.Reader
+	p string
+
+	// limit is the size threshold, in bytes, for reading an entry directly
+	// from the zip archive. Entries smaller than limit are returned as-is;
+	// entries of limit bytes or more are extracted under p first.
+	limit int64
+}
+
+// NewZipReader returns an fs.FS over r that extracts entries of limit bytes
+// or more into the directory p.
+func NewZipReader(r *zip.Reader, p string, limit int64) fs.FS {
+	return &ZipReader{r, p, limit}
+}
+
+// Open implements fs.FS.
+//
+// Entries smaller than z.limit are returned straight from the archive and the
+// caller is responsible for closing them. Entries of z.limit bytes or more are
+// copied to filepath.Join(z.p, name), unless that file already exists, and the
+// on-disk file is opened instead. Names that are not local (see filepath.IsLocal)
+// are rejected with zip.ErrInsecurePath before anything is written.
+func (z *ZipReader) Open(name string) (fs.File, error) {
+	r, err := z.r.Open(name)
+	if err != nil {
+		return nil, err
+	}
+
+	fi, err := r.Stat()
+	if err != nil {
+		r.Close()
+		return nil, err
+	}
+
+	if fi.Size() < z.limit {
+		// hand r to the caller still open; closing it here would return a dead handle
+		return r, nil
+	}
+
+	// past this point r is only the copy source, so it can be closed on return
+	defer r.Close()
+
+	if !filepath.IsLocal(name) {
+		return nil, zip.ErrInsecurePath
+	}
+
+	n := filepath.Join(z.p, name)
+	if _, err := os.Stat(n); errors.Is(err, os.ErrNotExist) {
+		w, err := os.Create(n)
+		if err != nil {
+			return nil, err
+		}
+
+		_, err = io.Copy(w, r)
+		if cerr := w.Close(); err == nil {
+			err = cerr
+		}
+		if err != nil {
+			// remove the partial file so a later Open does not serve truncated data
+			os.Remove(n)
+			return nil, err
+		}
+	} else if err != nil {
+		return nil, err
+	}
+
+	return os.Open(n)
+}
--- a/convert/tensor.go
+++ b/convert/tensor.go
@@ -1,56 +0,0 @@
-package convert
-
-import (
-	"iter"
-	"slices"
-	"strings"
-
-	"github.com/ollama/ollama/fs/ggml"
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-)
-
-// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
-// is split evenly based on the number of replacers provided.
-func splitDim(t Tensor, dim int, replacers ...*strings.Replacer) iter.Seq[*ggml.Tensor] {
-	return func(yield func(*ggml.Tensor) bool) {
-		for i, replacer := range replacers {
-			shape := slices.Clone(t.Shape())
-			shape[dim] = shape[dim] / uint64(len(replacers))
-
-			slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
-			slice[dim] = tensor.S(i*int(shape[dim]), (i+1)*int(shape[dim]))
-
-			tt := t.Clone()
-			tt.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
-				dims := make([]int, len(shape))
-				for i := range shape {
-					dims[i] = int(shape[i])
-				}
-
-				var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-				t, err := t.Slice(slice...)
-				if err != nil {
-					return nil, err
-				}
-
-				t = tensor.Materialize(t)
-				// flatten tensor so it can be written as a vector
-				if err := t.Reshape(t.Shape().TotalSize()); err != nil {
-					return nil, err
-				}
-
-				return native.VectorF32(t.(*tensor.Dense))
-			})
-
-			if !yield(&ggml.Tensor{
-				Name:     replacer.Replace(t.Name()),
-				Kind:     t.Kind(),
-				Shape:    shape,
-				WriterTo: tt,
-			}) {
-				break
-			}
-		}
-	}
-}
--- a/discover/cuda_common.go
+++ b/discover/cuda_common.go
@@ -3,7 +3,6 @@
 package discover

 import (
-	"fmt"
 	"log/slog"
 	"os"
 	"regexp"
@@ -60,8 +59,6 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {

 	// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
 	if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
-		// The detected driver is older than Feb 2023
-		slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
 		return "v11"
 	}
 	return "v12"
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -670,7 +670,7 @@ func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, e
 }

 func getVerboseState() C.uint16_t {
-	if envconfig.LogLevel() < slog.LevelInfo {
+	if envconfig.Debug() {
 		return C.uint16_t(1)
 	}
 	return C.uint16_t(0)
--- a/discover/gpu_info.h
+++ b/discover/gpu_info.h
@@ -27,14 +27,12 @@

 #endif

-#ifndef LOG
 #define LOG(verbose, ...) \
  do { \
    if (verbose) { \
      fprintf(stderr, __VA_ARGS__); \
    } \
  } while (0)
-#endif

 #ifdef __cplusplus
 extern "C" {
--- a/discover/gpu_info_cudart.c
+++ b/discover/gpu_info_cudart.c
@@ -1,7 +1,6 @@
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

 #include <string.h>
-#include <inttypes.h>
 #include "gpu_info_cudart.h"

 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
@@ -59,7 +58,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
    LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
-    if (ret == CUDART_ERROR_INSUFFICIENT_DRIVER) {
+    if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
      resp->err = strdup("your nvidia driver is too old or missing.  If you have a CUDA GPU please upgrade to run ollama");
      return;
    }
@@ -169,9 +168,9 @@ void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
  resp->free = memInfo.free;
  resp->used = memInfo.used;

-  LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "\n", resp->gpu_id, resp->total);
-  LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "\n", resp->gpu_id, resp->free);
-  LOG(h.verbose, "[%s] CUDA usedMem %" PRId64 "\n", resp->gpu_id, resp->used);
+  LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
+  LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
+  LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
 }

@@ -181,4 +180,4 @@ void cudart_release(cudart_handle_t h) {
  h.handle = NULL;
 }

-#endif  // __APPLE__
+#endif  // __APPLE__
--- a/discover/gpu_info_nvcuda.c
+++ b/discover/gpu_info_nvcuda.c
@@ -1,7 +1,6 @@
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

 #include <string.h>
-#include <inttypes.h>
 #include "gpu_info_nvcuda.h"

 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
@@ -194,8 +193,8 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
  resp->total = memInfo.total;
  resp->free = memInfo.free;

-  LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "mb\n", resp->gpu_id, resp->total / 1024 / 1024);
-  LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "mb\n", resp->gpu_id, resp->free / 1024 / 1024);
+  LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
+  LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);

  
@@ -248,4 +247,4 @@ void nvcuda_release(nvcuda_handle_t h) {
  h.handle = NULL;
 }

-#endif  // __APPLE__
+#endif  // __APPLE__
--- a/discover/path.go
+++ b/discover/path.go
@@ -12,7 +12,7 @@ import (
 // '../lib/ollama' on Linux and the executable's directory on macOS
 // note: distribution builds, additional GPU-specific libraries are
 // found in subdirectories of the returned path, such as
-// 'cuda_v12', 'rocm', etc.
+// 'cuda_v11', 'cuda_v12', 'rocm', etc.
 var LibOllamaPath string = func() string {
 	exe, err := os.Executable()
 	if err != nil {
--- a/docs/api.md
+++ b/docs/api.md
@@ -394,6 +394,9 @@ curl http://localhost:11434/api/generate -d '{
    "repeat_penalty": 1.2,
    "presence_penalty": 1.5,
    "frequency_penalty": 1.0,
+    "mirostat": 1,
+    "mirostat_tau": 0.8,
+    "mirostat_eta": 0.6,
    "penalize_newline": true,
    "stop": ["\n", "user:"],
    "numa": false,
@@ -401,7 +404,10 @@ curl http://localhost:11434/api/generate -d '{
    "num_batch": 2,
    "num_gpu": 1,
    "main_gpu": 0,
+    "low_vram": false,
+    "vocab_only": false,
    "use_mmap": true,
+    "use_mlock": false,
    "num_thread": 8
  }
 }'
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).

 ## How can I specify the context window size?

-By default, Ollama uses a context window size of 4096 tokens. 
+By default, Ollama uses a context window size of 4096 tokens. If you have a single GPU with 4 GB of VRAM or less, the default is 2048 tokens instead.

 This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use: 

@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
 To change this when using `ollama run`, use `/set parameter`:

 ```shell
-/set parameter num_ctx 4096
+/set parameter num_ctx 8192
 ```

 When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
  "model": "llama3.2",
  "prompt": "Why is the sky blue?",
  "options": {
-    "num_ctx": 4096
+    "num_ctx": 8192
  }
 }'
 ```
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -1,6 +1,6 @@
 # GPU
 ## Nvidia
-Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.
+Ollama supports Nvidia GPUs with compute capability 5.0+.

 Check your compute compatibility to see if your card is supported:
 [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -150,6 +150,9 @@ PARAMETER <parameter> <parametervalue>

 | Parameter      | Description                                                                                                                                                                                                                                             | Value Type | Example Usage        |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
+| mirostat       | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)                                                                                                                                         | int        | mirostat 0           |
+| mirostat_eta   | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1)                        | float      | mirostat_eta 0.1     |
+| mirostat_tau   | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0)                                                                                                         | float      | mirostat_tau 5.0     |
 | num_ctx        | Sets the size of the context window used to generate the next token. (Default: 2048)                                                                                                                                                                    | int        | num_ctx 4096         |
 | repeat_last_n  | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)                                                                                                                                           | int        | repeat_last_n 64     |
 | repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)                                                                     | float      | repeat_penalty 1.1   |
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto
 In the server log, you will see a message that looks something like this (varies from release to release):

 ```
-Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
+Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
 ```

 **Experimental LLM Library Override**
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -149,22 +149,9 @@ func Bool(k string) func() bool {
 	}
 }

-// LogLevel returns the log level for the application.
-// Values are 0 or false INFO (Default), 1 or true DEBUG, 2 TRACE
-func LogLevel() slog.Level {
-	level := slog.LevelInfo
-	if s := Var("OLLAMA_DEBUG"); s != "" {
-		if b, _ := strconv.ParseBool(s); b {
-			level = slog.LevelDebug
-		} else if i, _ := strconv.ParseInt(s, 10, 64); i != 0 {
-			level = slog.Level(i * -4)
-		}
-	}
-
-	return level
-}
-
 var (
+	// Debug enables additional debug information.
+	Debug = Bool("OLLAMA_DEBUG")
 	// FlashAttention enables the experimental flash attention feature.
 	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
 	// KvCacheType is the quantization type for the K/V cache.
@@ -182,7 +169,7 @@ var (
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
-	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
+	ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
 )

 func String(s string) func() string {
@@ -222,6 +209,8 @@ var (
 	MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
 	// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
 	MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
+	// MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable.
+	MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
 )

 func Uint64(key string, defaultValue uint64) func() uint64 {
@@ -238,6 +227,20 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
 	}
 }

+func Int64(key string, defaultValue int64) func() int64 {
+	return func() int64 {
+		if s := Var(key); s != "" {
+			if n, err := strconv.ParseInt(s, 10, 64); err != nil {
+				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+			} else {
+				return n
+			}
+		}
+
+		return defaultValue
+	}
+}
+
 // Set aside VRAM per GPU
 var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)

@@ -249,7 +252,7 @@ type EnvVar struct {

 func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
-		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", LogLevel(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
+		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
 		"OLLAMA_KV_CACHE_TYPE":     {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
 		"OLLAMA_GPU_OVERHEAD":      {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
@@ -266,7 +269,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
+		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
 		"OLLAMA_NEW_ENGINE":        {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},

 		// Informational
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -1,13 +1,11 @@
 package envconfig

 import (
-	"log/slog"
 	"math"
 	"testing"
 	"time"

 	"github.com/google/go-cmp/cmp"
-	"github.com/ollama/ollama/logutil"
 )

 func TestHost(t *testing.T) {
@@ -280,9 +278,9 @@ func TestVar(t *testing.T) {
 }

 func TestContextLength(t *testing.T) {
-	cases := map[string]uint{
-		"":     4096,
-		"2048": 2048,
+	cases := map[string]int64{
+		"":     -1,
+		"4096": 4096,
 	}

 	for k, v := range cases {
@@ -294,34 +292,3 @@ func TestContextLength(t *testing.T) {
 		})
 	}
 }
-
-func TestLogLevel(t *testing.T) {
-	cases := map[string]slog.Level{
-		// Default to INFO
-		"":      slog.LevelInfo,
-		"false": slog.LevelInfo,
-		"f":     slog.LevelInfo,
-		"0":     slog.LevelInfo,
-
-		// True values enable Debug
-		"true": slog.LevelDebug,
-		"t":    slog.LevelDebug,
-
-		// Positive values increase verbosity
-		"1": slog.LevelDebug,
-		"2": logutil.LevelTrace,
-
-		// Negative values decrease verbosity
-		"-1": slog.LevelWarn,
-		"-2": slog.LevelError,
-	}
-
-	for k, v := range cases {
-		t.Run(k, func(t *testing.T) {
-			t.Setenv("OLLAMA_DEBUG", k)
-			if i := LogLevel(); i != v {
-				t.Errorf("%s: expected %d, got %d", k, v, i)
-			}
-		})
-	}
-}
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -36,12 +36,12 @@ func (kv KV) ParameterCount() uint64 {
 	return keyValue(kv, "general.parameter_count", uint64(0))
 }

-func (kv KV) FileType() FileType {
+func (kv KV) FileType() fileType {
 	if t := kv.Uint("general.file_type"); t > 0 {
-		return FileType(t)
+		return fileType(t)
 	}

-	return FileTypeUnknown
+	return fileTypeUnknown
 }

 func (kv KV) BlockCount() uint64 {
@@ -125,7 +125,6 @@ func (kv KV) OllamaEngineRequired() bool {
 		"gemma3",
 		"mistral3",
 		"llama4",
-		"qwen25vl",
 	}, kv.Architecture())
 }

@@ -150,7 +149,7 @@ func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ..
 		return val.(T)
 	}

-	slog.Debug("key not found", "key", key, "default", defaultValue[0])
+	slog.Warn("key not found", "key", key, "default", defaultValue[0])
 	return defaultValue[0]
 }

@@ -227,11 +226,7 @@ func (t Tensor) block() (n int) {
 }

 func (t Tensor) blockSize() uint64 {
-	return (TensorType)(t.Kind).BlockSize()
-}
-
-func (t TensorType) BlockSize() uint64 {
-	switch t {
+	switch t.Kind {
 	case
 		0,  // F32
 		1,  // F16
@@ -257,77 +252,73 @@ func (t TensorType) BlockSize() uint64 {
 }

 func (t Tensor) typeSize() uint64 {
-	return TensorType(t.Kind).TypeSize()
-}
+	blockSize := t.blockSize()

-func (t TensorType) TypeSize() uint64 {
-	blockSize := t.BlockSize()
-
-	switch t {
-	case TensorTypeF32:
+	switch t.Kind {
+	case 0: // FP32
 		return 4
-	case TensorTypeF16:
+	case 1: // FP16
 		return 2
-	case TensorTypeQ4_0:
+	case 2: // Q4_0
 		return 2 + blockSize/2
-	case TensorTypeQ4_1:
+	case 3: // Q4_1
 		return 2 + 2 + blockSize/2
-	case TensorTypeQ5_0:
+	case 6: // Q5_0
 		return 2 + 4 + blockSize/2
-	case TensorTypeQ5_1:
+	case 7: // Q5_1
 		return 2 + 2 + 4 + blockSize/2
-	case TensorTypeQ8_0:
+	case 8: // Q8_0
 		return 2 + blockSize
-	case TensorTypeQ8_1:
+	case 9: // Q8_1
 		return 2 + 2 + blockSize
-	case TensorTypeQ2_K:
+	case 10: // Q2_K
 		return blockSize/16 + blockSize/4 + 2 + 2
-	case TensorTypeQ3_K:
+	case 11: // Q3_K
 		return blockSize/8 + blockSize/4 + 12 + 2
-	case TensorTypeQ4_K:
+	case 12: // Q4_K
 		return 2 + 2 + 12 + blockSize/2
-	case TensorTypeQ5_K:
+	case 13: // Q5_K
 		return 2 + 2 + 12 + blockSize/8 + blockSize/2
-	case TensorTypeQ6_K:
+	case 14: // Q6_K
 		return blockSize/2 + blockSize/4 + blockSize/16 + 2
-	case TensorTypeQ8_K:
+	case 15: // Q8_K
 		return 4 + blockSize + 2*blockSize/16
-	case tensorTypeIQ2_XXS:
+	case 16: // IQ2_XXS
 		return 2 + 2*blockSize/8
-	case tensorTypeIQ2_XS:
+	case 17: // IQ2_XS
 		return 2 + 2*blockSize/8 + blockSize/32
-	case tensorTypeIQ3_XXS:
+	case 18: // IQ3_XXS
 		return 2 + blockSize/4 + blockSize/8
-	case tensorTypeIQ1_S:
+	case 19: // IQ1_S
 		return 2 + blockSize/8 + blockSize/16
-	case tensorTypeIQ4_NL:
+	case 20: // IQ4_NL
 		return 2 + blockSize/2
-	case tensorTypeIQ3_S:
+	case 21: // IQ3_S
 		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
-	case tensorTypeIQ2_S:
+	case 22: // IQ2_S
 		return 2 + blockSize/4 + blockSize/16
-	case tensorTypeIQ4_XS:
+	case 23: // IQ4_XS
 		return 2 + 2 + blockSize/2 + blockSize/64
-	case TensorTypeI8:
+	case 24: // I8
 		return 1
-	case TensorTypeI16:
+	case 25: // I16
 		return 2
-	case TensorTypeI32:
+	case 26: // I32
 		return 4
-	case TensorTypeI64:
+	case 27: // I64
 		return 8
-	case TensorTypeF64:
+	case 28: // F64
 		return 8
-	case tensorTypeIQ1_M:
+	case 29: // IQ1_M
 		return blockSize/8 + blockSize/16 + blockSize/32
-	case TensorTypeBF16:
+	case 30: // BF16
 		return 2
 	default:
 		return 0
 	}
 }

-func (t Tensor) Elements() uint64 {
+func (t Tensor) parameters() uint64 {
 	var count uint64 = 1
 	for _, n := range t.Shape {
 		count *= n
@@ -336,11 +327,11 @@ func (t Tensor) Elements() uint64 {
 }

 func (t Tensor) Size() uint64 {
-	return t.Elements() * t.typeSize() / t.blockSize()
+	return t.parameters() * t.typeSize() / t.blockSize()
 }

 func (t Tensor) Type() string {
-	return TensorType(t.Kind).String()
+	return fileType(t.Kind).String()
 }

 type container interface {
@@ -489,7 +480,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 		var ropeFreqsCount uint64
 		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
 			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
-				ropeFreqsCount = ropeFreqsWeights.Elements()
+				ropeFreqsCount = ropeFreqsWeights.parameters()
 			}
 		}

--- a/fs/ggml/gguf.go
+++ b/fs/ggml/gguf.go
@@ -9,12 +9,8 @@ import (
 	"io"
 	"log/slog"
 	"maps"
-	"os"
-	"runtime"
 	"slices"
 	"strings"
-
-	"golang.org/x/sync/errgroup"
 )

 type containerGGUF struct {
@@ -229,7 +225,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 		}

 		llm.tensors = append(llm.tensors, &tensor)
-		llm.parameters += tensor.Elements()
+		llm.parameters += tensor.parameters()
 	}

 	// patch KV with parameter count
@@ -492,38 +488,25 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
 		return err
 	}

-	if t == ggufTypeString {
-		for _, e := range any(s).([]string) {
-			if err := binary.Write(w, binary.LittleEndian, uint64(len(e))); err != nil {
-				return err
-			}
-
-			if err := binary.Write(w, binary.LittleEndian, []byte(e)); err != nil {
-				return err
-			}
-		}
-		return nil
-	}
-
 	return binary.Write(w, binary.LittleEndian, s)
 }

-func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
+func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
 	alignment := kv.Uint("general.alignment", 32)

-	if err := binary.Write(f, binary.LittleEndian, []byte("GGUF")); err != nil {
+	if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil {
 		return err
 	}

-	if err := binary.Write(f, binary.LittleEndian, uint32(3)); err != nil {
+	if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil {
 		return err
 	}

-	if err := binary.Write(f, binary.LittleEndian, uint64(len(ts))); err != nil {
+	if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil {
 		return err
 	}

-	if err := binary.Write(f, binary.LittleEndian, uint64(len(kv))); err != nil {
+	if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil {
 		return err
 	}

@@ -531,12 +514,12 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
 	slices.Sort(keys)

 	for _, key := range keys {
-		if err := ggufWriteKV(f, key, kv[key]); err != nil {
+		if err := ggufWriteKV(ws, key, kv[key]); err != nil {
 			return err
 		}
 	}

-	slices.SortStableFunc(ts, func(a, b *Tensor) int {
+	slices.SortStableFunc(ts, func(a, b Tensor) int {
 		if i, j := a.block(), b.block(); i < 0 && j > 0 {
 			return 1
 		} else if i > 0 && j < 0 {
@@ -547,34 +530,21 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
 	})

 	var s uint64
-	for i := range ts {
-		ts[i].Offset = s
-		if err := ggufWriteTensorInfo(f, ts[i]); err != nil {
+	for _, t := range ts {
+		t.Offset = s + uint64(ggufPadding(int64(s), int64(alignment)))
+		if err := ggufWriteTensorInfo(ws, t); err != nil {
 			return err
 		}
-		s += ts[i].Size()
-		s += uint64(ggufPadding(int64(s), int64(alignment)))
+		s += t.Size()
 	}

-	offset, err := f.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return err
-	}
-	offset += ggufPadding(offset, int64(alignment))
-
-	var g errgroup.Group
-	g.SetLimit(runtime.GOMAXPROCS(0))
-	// TODO consider reducing if tensors size * gomaxprocs is larger than free memory
 	for _, t := range ts {
-		t := t
-		w := io.NewOffsetWriter(f, offset+int64(t.Offset))
-		g.Go(func() error {
-			_, err := t.WriteTo(w)
+		if err := ggufWriteTensor(ws, t, int64(alignment)); err != nil {
 			return err
-		})
+		}
 	}

-	return g.Wait()
+	return nil
 }

 func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
@@ -589,10 +559,8 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {

 	var err error
 	switch v := v.(type) {
-	case uint32, FileType:
+	case uint32:
 		err = writeGGUF(ws, ggufTypeUint32, v)
-	case uint64:
-		err = writeGGUF(ws, ggufTypeUint64, v)
 	case float32:
 		err = writeGGUF(ws, ggufTypeFloat32, v)
 	case bool:
@@ -601,20 +569,32 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
 		err = writeGGUFString(ws, v)
 	case []int32:
 		err = writeGGUFArray(ws, ggufTypeInt32, v)
-	case *array[int32]:
-		err = writeGGUFArray(ws, ggufTypeInt32, v.values)
 	case []uint32:
 		err = writeGGUFArray(ws, ggufTypeUint32, v)
-	case *array[uint32]:
-		err = writeGGUFArray(ws, ggufTypeUint32, v.values)
 	case []float32:
 		err = writeGGUFArray(ws, ggufTypeFloat32, v)
-	case *array[float32]:
-		err = writeGGUFArray(ws, ggufTypeFloat32, v.values)
 	case []string:
-		err = writeGGUFArray(ws, ggufTypeString, v)
-	case *array[string]:
-		err = writeGGUFArray(ws, ggufTypeString, v.values)
+		if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil {
+			return err
+		}
+
+		if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil {
+			return err
+		}
+
+		if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil {
+			return err
+		}
+
+		for _, e := range v {
+			if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil {
+				return err
+			}
+
+			if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil {
+				return err
+			}
+		}
 	default:
 		return fmt.Errorf("improper type for '%s'", k)
 	}
@@ -622,7 +602,7 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
 	return err
 }

-func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error {
+func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
 	slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset)
 	if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil {
 		return err
@@ -649,6 +629,20 @@ func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error {
 	return binary.Write(ws, binary.LittleEndian, t.Offset)
 }

+func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error {
+	offset, err := ws.Seek(0, io.SeekCurrent)
+	if err != nil {
+		return err
+	}
+
+	if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil {
+		return err
+	}
+
+	_, err = t.WriteTo(ws)
+	return err
+}
+
 func ggufPadding(offset, align int64) int64 {
 	return (align - offset%align) % align
 }
--- a/fs/ggml/gguf_test.go
+++ b/fs/ggml/gguf_test.go
@@ -1,63 +0,0 @@
-package ggml
-
-import (
-	"bytes"
-	"os"
-	"slices"
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-)
-
-func TestWriteGGUF(t *testing.T) {
-	w, err := os.CreateTemp(t.TempDir(), "*.bin")
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer w.Close()
-
-	if err := WriteGGUF(w, KV{
-		"general.alignment": uint32(16),
-	}, []*Tensor{
-		{Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
-		{Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
-		{Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
-		{Name: "test.3", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
-		{Name: "test.4", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
-		{Name: "test.5", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
-	}); err != nil {
-		t.Fatal(err)
-	}
-
-	r, err := os.Open(w.Name())
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer r.Close()
-
-	ff, _, err := Decode(r, 0)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if diff := cmp.Diff(ff.KV(), KV{
-		"general.alignment":       uint32(16),
-		"general.parameter_count": uint64(36),
-	}); diff != "" {
-		t.Errorf("Mismatch (-want +got):\n%s", diff)
-	}
-
-	if diff := cmp.Diff(ff.Tensors(), Tensors{
-		Offset: 336,
-		items: []*Tensor{
-			{Name: "test.0", Offset: 0, Shape: []uint64{2, 3}},
-			{Name: "test.1", Offset: 32, Shape: []uint64{2, 3}},
-			{Name: "test.2", Offset: 64, Shape: []uint64{2, 3}},
-			{Name: "test.3", Offset: 96, Shape: []uint64{2, 3}},
-			{Name: "test.4", Offset: 128, Shape: []uint64{2, 3}},
-			{Name: "test.5", Offset: 160, Shape: []uint64{2, 3}},
-		},
-	}, cmp.AllowUnexported(Tensors{})); diff != "" {
-		t.Errorf("Mismatch (-want +got):\n%s", diff)
-	}
-}
--- a/fs/ggml/type.go
+++ b/fs/ggml/type.go
@@ -1,341 +1,185 @@
 package ggml

-import (
-	"fmt"
-	"log/slog"
-	"strings"
-)
+import "fmt"

-// FileType is the Go equivalent to llama_ftype used for gguf file typing
-type FileType uint32
+type fileType uint32

 const (
-	FileTypeF32 FileType = iota
-	FileTypeF16
-	FileTypeQ4_0
-	FileTypeQ4_1
-	fileTypeQ4_1_F16 // unused by GGML
-	fileTypeQ4_2     // unused by GGML
-	fileTypeQ4_3     // unused by GGML
-	FileTypeQ8_0
-	FileTypeQ5_0
-	FileTypeQ5_1
-	FileTypeQ2_K
-	FileTypeQ3_K_S
-	FileTypeQ3_K_M
-	FileTypeQ3_K_L
-	FileTypeQ4_K_S
-	FileTypeQ4_K_M
-	FileTypeQ5_K_S
-	FileTypeQ5_K_M
-	FileTypeQ6_K
-	fileTypeIQ2_XXS // not supported by ollama
-	fileTypeIQ2_XS  // not supported by ollama
-	FileTypeQ2_K_S
-	fileTypeIQ3_XS  // not supported by ollama
-	fileTypeIQ3_XXS // not supported by ollama
-	fileTypeIQ1_S   // not supported by ollama
-	fileTypeIQ4_NL  // not supported by ollama
-	fileTypeIQ3_S   // not supported by ollama
-	fileTypeIQ3_M   // not supported by ollama
-	fileTypeIQ2_S   // not supported by ollama
-	fileTypeIQ2_M   // not supported by ollama
-	fileTypeIQ4_XS  // not supported by ollama
-	fileTypeIQ1_M   // not supported by ollama
-	FileTypeBF16
-	fileTypeQ4_0_4_4 // unused by GGML
-	fileTypeQ4_0_4_8 // unused by GGML
-	fileTypeQ4_0_8_8 // unused by GGML
-	fileTypeTQ1_0    // not supported by ollama
-	fileTypeTQ2_0    // not supported by ollama
+	fileTypeF32 fileType = iota
+	fileTypeF16
+	fileTypeQ4_0
+	fileTypeQ4_1
+	fileTypeQ4_1_F16
+	fileTypeQ4_2 // unused
+	fileTypeQ4_3 // unused
+	fileTypeQ8_0
+	fileTypeQ5_0
+	fileTypeQ5_1
+	fileTypeQ2_K
+	fileTypeQ3_K_S
+	fileTypeQ3_K_M
+	fileTypeQ3_K_L
+	fileTypeQ4_K_S
+	fileTypeQ4_K_M
+	fileTypeQ5_K_S
+	fileTypeQ5_K_M
+	fileTypeQ6_K
+	fileTypeIQ2_XXS
+	fileTypeIQ2_XS
+	fileTypeQ2_K_S
+	fileTypeIQ3_XS
+	fileTypeIQ3_XXS
+	fileTypeIQ1_S
+	fileTypeIQ4_NL
+	fileTypeIQ3_S
+	fileTypeIQ3_M
+	fileTypeIQ2_S
+	fileTypeIQ2_M
+	fileTypeIQ4_XS
+	fileTypeIQ1_M
+	fileTypeBF16

-	FileTypeUnknown = 1024
+	fileTypeUnknown
 )

-// ParseFileType parses the provided GGUF file type
-// Only Ollama supported types are considered valid
-func ParseFileType(s string) (FileType, error) {
+func ParseFileType(s string) (fileType, error) {
 	switch s {
 	case "F32":
-		return FileTypeF32, nil
+		return fileTypeF32, nil
 	case "F16":
-		return FileTypeF16, nil
+		return fileTypeF16, nil
 	case "Q4_0":
-		return FileTypeQ4_0, nil
+		return fileTypeQ4_0, nil
 	case "Q4_1":
-		return FileTypeQ4_1, nil
+		return fileTypeQ4_1, nil
+	case "Q4_1_F16":
+		return fileTypeQ4_1_F16, nil
 	case "Q8_0":
-		return FileTypeQ8_0, nil
+		return fileTypeQ8_0, nil
 	case "Q5_0":
-		return FileTypeQ5_0, nil
+		return fileTypeQ5_0, nil
 	case "Q5_1":
-		return FileTypeQ5_1, nil
+		return fileTypeQ5_1, nil
 	case "Q2_K":
-		return FileTypeQ2_K, nil
+		return fileTypeQ2_K, nil
 	case "Q3_K_S":
-		return FileTypeQ3_K_S, nil
+		return fileTypeQ3_K_S, nil
 	case "Q3_K_M":
-		return FileTypeQ3_K_M, nil
+		return fileTypeQ3_K_M, nil
 	case "Q3_K_L":
-		return FileTypeQ3_K_L, nil
+		return fileTypeQ3_K_L, nil
 	case "Q4_K_S":
-		return FileTypeQ4_K_S, nil
-	case "Q4_K_M", "Q4_K":
-		return FileTypeQ4_K_M, nil
+		return fileTypeQ4_K_S, nil
+	case "Q4_K_M":
+		return fileTypeQ4_K_M, nil
 	case "Q5_K_S":
-		return FileTypeQ5_K_S, nil
-	case "Q5_K_M", "Q5_K":
-		return FileTypeQ5_K_M, nil
+		return fileTypeQ5_K_S, nil
+	case "Q5_K_M":
+		return fileTypeQ5_K_M, nil
 	case "Q6_K":
-		return FileTypeQ6_K, nil
+		return fileTypeQ6_K, nil
+	case "IQ2_XXS":
+		return fileTypeIQ2_XXS, nil
+	case "IQ2_XS":
+		return fileTypeIQ2_XS, nil
 	case "Q2_K_S":
-		return FileTypeQ2_K_S, nil
+		return fileTypeQ2_K_S, nil
+	case "IQ3_XS":
+		return fileTypeIQ3_XS, nil
+	case "IQ3_XXS":
+		return fileTypeIQ3_XXS, nil
+	case "IQ1_S":
+		return fileTypeIQ1_S, nil
+	case "IQ4_NL":
+		return fileTypeIQ4_NL, nil
+	case "IQ3_S":
+		return fileTypeIQ3_S, nil
+	case "IQ3_M":
+		return fileTypeIQ3_M, nil
+	case "IQ2_S":
+		return fileTypeIQ2_S, nil
+	case "IQ2_M":
+		return fileTypeIQ2_M, nil
+	case "IQ4_XS":
+		return fileTypeIQ4_XS, nil
+	case "IQ1_M":
+		return fileTypeIQ1_M, nil
 	case "BF16":
-		return FileTypeBF16, nil
+		return fileTypeBF16, nil
 	default:
-		supportedFileTypes := []FileType{
-			FileTypeF32,
-			FileTypeF16,
-			FileTypeQ4_K_S,
-			FileTypeQ4_K_M,
-			FileTypeQ8_0,
-			// fsggml.FileTypeBF16, // TODO
-		}
-		strs := make([]string, len(supportedFileTypes))
-		for i := range supportedFileTypes {
-			strs[i] = supportedFileTypes[i].String()
-		}
-
-		return FileTypeUnknown, fmt.Errorf("unsupported quantization type %s - supported types are %s", s, strings.Join(strs, ", "))
+		return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
 	}
 }

-func (t FileType) String() string {
+func (t fileType) String() string {
 	switch t {
-	case FileTypeF32:
+	case fileTypeF32:
 		return "F32"
-	case FileTypeF16:
+	case fileTypeF16:
 		return "F16"
-	case FileTypeQ4_0:
+	case fileTypeQ4_0:
 		return "Q4_0"
-	case FileTypeQ4_1:
+	case fileTypeQ4_1:
 		return "Q4_1"
-	case FileTypeQ8_0:
+	case fileTypeQ4_1_F16:
+		return "Q4_1_F16"
+	case fileTypeQ8_0:
 		return "Q8_0"
-	case FileTypeQ5_0:
+	case fileTypeQ5_0:
 		return "Q5_0"
-	case FileTypeQ5_1:
+	case fileTypeQ5_1:
 		return "Q5_1"
-	case FileTypeQ2_K:
+	case fileTypeQ2_K:
 		return "Q2_K"
-	case FileTypeQ3_K_S:
+	case fileTypeQ3_K_S:
 		return "Q3_K_S"
-	case FileTypeQ3_K_M:
+	case fileTypeQ3_K_M:
 		return "Q3_K_M"
-	case FileTypeQ3_K_L:
+	case fileTypeQ3_K_L:
 		return "Q3_K_L"
-	case FileTypeQ4_K_S:
+	case fileTypeQ4_K_S:
 		return "Q4_K_S"
-	case FileTypeQ4_K_M:
+	case fileTypeQ4_K_M:
 		return "Q4_K_M"
-	case FileTypeQ5_K_S:
+	case fileTypeQ5_K_S:
 		return "Q5_K_S"
-	case FileTypeQ5_K_M:
+	case fileTypeQ5_K_M:
 		return "Q5_K_M"
-	case FileTypeQ6_K:
+	case fileTypeQ6_K:
 		return "Q6_K"
-	case FileTypeQ2_K_S:
+	case fileTypeIQ2_XXS:
+		return "IQ2_XXS"
+	case fileTypeIQ2_XS:
+		return "IQ2_XS"
+	case fileTypeQ2_K_S:
 		return "Q2_K_S"
-	case FileTypeBF16:
+	case fileTypeIQ3_XS:
+		return "IQ3_XS"
+	case fileTypeIQ3_XXS:
+		return "IQ3_XXS"
+	case fileTypeIQ1_S:
+		return "IQ1_S"
+	case fileTypeIQ4_NL:
+		return "IQ4_NL"
+	case fileTypeIQ3_S:
+		return "IQ3_S"
+	case fileTypeIQ3_M:
+		return "IQ3_M"
+	case fileTypeIQ2_S:
+		return "IQ2_S"
+	case fileTypeIQ4_XS:
+		return "IQ4_XS"
+	case fileTypeIQ2_M:
+		return "IQ2_M"
+	case fileTypeIQ1_M:
+		return "IQ1_M"
+	case fileTypeBF16:
 		return "BF16"
 	default:
 		return "unknown"
 	}
 }

-func (t FileType) Value() uint32 {
+func (t fileType) Value() uint32 {
 	return uint32(t)
 }
-
-func (ftype FileType) ToTensorType() TensorType {
-	switch ftype {
-	case FileTypeF32:
-		return TensorTypeF32
-	case FileTypeF16:
-		return TensorTypeF16
-	case FileTypeQ4_0:
-		return TensorTypeQ4_0
-	case FileTypeQ4_1:
-		return TensorTypeQ4_1
-	case FileTypeQ8_0:
-		return TensorTypeQ8_0
-	case FileTypeQ5_0:
-		return TensorTypeQ5_0
-	case FileTypeQ5_1:
-		return TensorTypeQ5_1
-	case FileTypeQ2_K:
-		return TensorTypeQ2_K
-	case FileTypeQ3_K_S:
-		return TensorTypeQ3_K
-	case FileTypeQ3_K_M:
-		return TensorTypeQ3_K
-	case FileTypeQ3_K_L:
-		return TensorTypeQ3_K
-	case FileTypeQ4_K_S:
-		return TensorTypeQ4_K
-	case FileTypeQ4_K_M:
-		return TensorTypeQ4_K
-	case FileTypeQ5_K_S:
-		return TensorTypeQ5_K
-	case FileTypeQ5_K_M:
-		return TensorTypeQ5_K
-	case FileTypeQ6_K:
-		return TensorTypeQ6_K
-	case FileTypeQ2_K_S:
-		return TensorTypeQ2_K
-	case FileTypeBF16:
-		return TensorTypeBF16
-	default:
-		slog.Warn("unsupported file type", "type", ftype)
-		return 0 // F32
-	}
-}
-
-// TensorType is equivalent to ggml_type for individual tensor types
-// Note: these are not the same as FileType
-type TensorType uint32
-
-const (
-	TensorTypeF32 TensorType = iota
-	TensorTypeF16
-	TensorTypeQ4_0
-	TensorTypeQ4_1
-	tensorTypeQ4_2 // unused by GGML
-	tensorTypeQ4_3 // unused by GGML
-	TensorTypeQ5_0
-	TensorTypeQ5_1
-	TensorTypeQ8_0
-	TensorTypeQ8_1
-	TensorTypeQ2_K
-	TensorTypeQ3_K
-	TensorTypeQ4_K
-	TensorTypeQ5_K
-	TensorTypeQ6_K
-	TensorTypeQ8_K
-	tensorTypeIQ2_XXS // not supported by ollama
-	tensorTypeIQ2_XS  // not supported by ollama
-	tensorTypeIQ3_XXS // not supported by ollama
-	tensorTypeIQ1_S   // not supported by ollama
-	tensorTypeIQ4_NL  // not supported by ollama
-	tensorTypeIQ3_S   // not supported by ollama
-	tensorTypeIQ2_S   // not supported by ollama
-	tensorTypeIQ4_XS  // not supported by ollama
-	TensorTypeI8
-	TensorTypeI16
-	TensorTypeI32
-	TensorTypeI64
-	TensorTypeF64
-	tensorTypeIQ1_M // not supported by ollama
-	TensorTypeBF16
-	tensorTypeQ4_0_4_4   // unused by GGML
-	tensorTypeQ4_0_4_8   // unused by GGML
-	tensorTypeQ4_0_8_8   // unused by GGML
-	tensorTypeTQ1_0      // not supported by ollama
-	tensorTypeTQ2_0      // not supported by ollama
-	tensorTypeIQ4_NL_4_4 // unused by GGML
-	tensorTypeIQ4_NL_4_8 // unused by GGML
-	tensorTypeIQ4_NL_8_8 // unused by GGML
-)
-
-// ParseFileType parses the provided GGUF file type
-// Only Ollama supported types are considered valid
-func ParseTensorType(s string) (TensorType, error) {
-	switch s {
-	case "F32":
-		return TensorTypeF32, nil
-	case "F16":
-		return TensorTypeF16, nil
-	case "Q4_0":
-		return TensorTypeQ4_0, nil
-	case "Q4_1":
-		return TensorTypeQ4_1, nil
-	case "Q5_0":
-		return TensorTypeQ5_0, nil
-	case "Q5_1":
-		return TensorTypeQ5_1, nil
-	case "Q8_0":
-		return TensorTypeQ8_0, nil
-	case "Q8_1":
-		return TensorTypeQ8_1, nil
-	case "Q2_K":
-		return TensorTypeQ2_K, nil
-	case "Q3_K":
-		return TensorTypeQ3_K, nil
-	case "Q4_K":
-		return TensorTypeQ4_K, nil
-	case "Q5_K":
-		return TensorTypeQ5_K, nil
-	case "Q6_K":
-		return TensorTypeQ6_K, nil
-	case "Q8_K":
-		return TensorTypeQ8_K, nil
-	case "F64":
-		return TensorTypeF64, nil
-	case "BF16":
-		return TensorTypeBF16, nil
-	default:
-		return 0, fmt.Errorf("unsupported quantization type %s", s)
-	}
-}
-
-func (t TensorType) IsQuantized() bool {
-	switch t {
-	case TensorTypeF32, TensorTypeF16, TensorTypeBF16:
-		return false
-	default:
-		return true
-	}
-}
-
-func (t TensorType) RowSize(ne uint64) uint64 {
-	return t.TypeSize() * ne / t.BlockSize()
-}
-
-func (t TensorType) String() string {
-	switch t {
-	case TensorTypeF32:
-		return "F32"
-	case TensorTypeF16:
-		return "F16"
-	case TensorTypeQ4_0:
-		return "Q4_0"
-	case TensorTypeQ4_1:
-		return "Q4_1"
-	case TensorTypeQ5_0:
-		return "Q5_0"
-	case TensorTypeQ5_1:
-		return "Q5_1"
-	case TensorTypeQ8_0:
-		return "Q8_0"
-	case TensorTypeQ8_1:
-		return "Q8_1"
-	case TensorTypeQ2_K:
-		return "Q2_K"
-	case TensorTypeQ3_K:
-		return "Q3_K"
-	case TensorTypeQ4_K:
-		return "Q4_K"
-	case TensorTypeQ5_K:
-		return "Q5_K"
-	case TensorTypeQ6_K:
-		return "Q6_K"
-	case TensorTypeQ8_K:
-		return "Q8_K"
-	case TensorTypeF64:
-		return "F64"
-	case TensorTypeBF16:
-		return "BF16"
-	default:
-		return "unknown"
-	}
-}
--- a/go.mod
+++ b/go.mod
@@ -11,7 +11,7 @@ require (
 	github.com/spf13/cobra v1.7.0
 	github.com/stretchr/testify v1.9.0
 	github.com/x448/float16 v0.8.4
-	golang.org/x/sync v0.12.0
+	golang.org/x/sync v0.11.0
 )

 require (
@@ -70,12 +70,12 @@ require (
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
 	github.com/ugorji/go/codec v1.2.12 // indirect
 	golang.org/x/arch v0.8.0 // indirect
-	golang.org/x/crypto v0.36.0
+	golang.org/x/crypto v0.33.0
 	golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
-	golang.org/x/net v0.38.0 // indirect
-	golang.org/x/sys v0.31.0
-	golang.org/x/term v0.30.0
-	golang.org/x/text v0.23.0
+	golang.org/x/net v0.35.0 // indirect
+	golang.org/x/sys v0.30.0
+	golang.org/x/term v0.29.0
+	golang.org/x/text v0.22.0
 	google.golang.org/protobuf v1.34.1
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -214,8 +214,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
 golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
-golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
+golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus=
+golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M=
 golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
@@ -257,8 +257,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
 golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
 golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
 golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
-golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
-golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
+golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
+golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -268,8 +268,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
 golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
-golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
+golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
+golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
 golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -285,17 +285,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc
 golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik=
-golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
+golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
+golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
-golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
+golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU=
+golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
-golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
+golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
+golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
 golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -34,15 +34,13 @@ func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
 func TestAllMiniLMEmbeddings(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()

 	req := api.EmbeddingRequest{
 		Model:  "all-minilm",
 		Prompt: "why is the sky blue?",
 	}

-	res, err := embeddingTestHelper(ctx, client, t, req)
+	res, err := embeddingTestHelper(ctx, t, req)

 	if err != nil {
 		t.Fatalf("error: %v", err)
@@ -64,15 +62,13 @@ func TestAllMiniLMEmbeddings(t *testing.T) {
 func TestAllMiniLMEmbed(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()

 	req := api.EmbedRequest{
 		Model: "all-minilm",
 		Input: "why is the sky blue?",
 	}

-	res, err := embedTestHelper(ctx, client, t, req)
+	res, err := embedTestHelper(ctx, t, req)

 	if err != nil {
 		t.Fatalf("error: %v", err)
@@ -102,15 +98,13 @@ func TestAllMiniLMEmbed(t *testing.T) {
 func TestAllMiniLMBatchEmbed(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()

 	req := api.EmbedRequest{
 		Model: "all-minilm",
 		Input: []string{"why is the sky blue?", "why is the grass green?"},
 	}

-	res, err := embedTestHelper(ctx, client, t, req)
+	res, err := embedTestHelper(ctx, t, req)

 	if err != nil {
 		t.Fatalf("error: %v", err)
@@ -150,8 +144,6 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 func TestAllMiniLMEmbedTruncate(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()

 	truncTrue, truncFalse := true, false

@@ -190,7 +182,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
 	res := make(map[string]*api.EmbedResponse)

 	for _, req := range reqs {
-		response, err := embedTestHelper(ctx, client, t, req.Request)
+		response, err := embedTestHelper(ctx, t, req.Request)
 		if err != nil {
 			t.Fatalf("error: %v", err)
 		}
@@ -206,7 +198,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
 	}

 	// check that truncate set to false returns an error if context length is exceeded
-	_, err := embedTestHelper(ctx, client, t, api.EmbedRequest{
+	_, err := embedTestHelper(ctx, t, api.EmbedRequest{
 		Model:    "all-minilm",
 		Input:    "why is the sky blue?",
 		Truncate: &truncFalse,
@@ -218,7 +210,9 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
 	}
 }

-func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
+func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
 	if err := PullIfMissing(ctx, client, req.Model); err != nil {
 		t.Fatalf("failed to pull model %s: %v", req.Model, err)
 	}
@@ -232,7 +226,9 @@ func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T,
 	return response, nil
 }

-func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
+func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
 	if err := PullIfMissing(ctx, client, req.Model); err != nil {
 		t.Fatalf("failed to pull model %s: %v", req.Model, err)
 	}
--- a/integration/model_arch_test.go
+++ b/integration/model_arch_test.go
@@ -48,6 +48,17 @@ var (
 	}
 )

+func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
+	deadline, hasDeadline := t.Deadline()
+	if !hasDeadline {
+		return 8 * time.Minute, 10 * time.Minute
+	} else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 {
+		t.Skip("too little time")
+		return time.Duration(0), time.Duration(0)
+	}
+	return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second))
+}
+
 func TestModelsGenerate(t *testing.T) {
 	softTimeout, hardTimeout := getTimeouts(t)
 	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
--- a/integration/quantization_test.go
+++ b/integration/quantization_test.go
@@ -1,130 +0,0 @@
-//go:build integration && models
-
-package integration
-
-import (
-	"bytes"
-	"context"
-	"fmt"
-	"log/slog"
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/ollama/ollama/api"
-)
-
-func TestQuantization(t *testing.T) {
-	sourceModels := []string{
-		"qwen2.5:0.5b-instruct-fp16",
-	}
-	quantizations := []string{
-		"Q8_0",
-		"Q4_K_S",
-		"Q4_K_M",
-		"Q4_K",
-	}
-	softTimeout, hardTimeout := getTimeouts(t)
-	started := time.Now()
-	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
-	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-
-	for _, base := range sourceModels {
-		if err := PullIfMissing(ctx, client, base); err != nil {
-			t.Fatalf("pull failed %s", err)
-		}
-		for _, quant := range quantizations {
-			newName := fmt.Sprintf("%s__%s", base, quant)
-			t.Run(newName, func(t *testing.T) {
-				if time.Now().Sub(started) > softTimeout {
-					t.Skip("skipping remaining tests to avoid excessive runtime")
-				}
-				req := &api.CreateRequest{
-					Model:        newName,
-					Quantization: quant,
-					From:         base,
-				}
-				fn := func(resp api.ProgressResponse) error {
-					// fmt.Print(".")
-					return nil
-				}
-				t.Logf("quantizing: %s -> %s", base, quant)
-				if err := client.Create(ctx, req, fn); err != nil {
-					t.Fatalf("create failed %s", err)
-				}
-				defer func() {
-					req := &api.DeleteRequest{
-						Model: newName,
-					}
-					t.Logf("deleting: %s -> %s", base, quant)
-					if err := client.Delete(ctx, req); err != nil {
-						t.Logf("failed to clean up %s: %s", req.Model, err)
-					}
-				}()
-				// Check metadata on the model
-				resp, err := client.Show(ctx, &api.ShowRequest{Name: newName})
-				if err != nil {
-					t.Fatalf("unable to show model: %s", err)
-				}
-				if !strings.Contains(resp.Details.QuantizationLevel, quant) {
-					t.Fatalf("unexpected quantization for %s:\ngot: %s", newName, resp.Details.QuantizationLevel)
-				}
-
-				stream := true
-				genReq := api.GenerateRequest{
-					Model:     newName,
-					Prompt:    "why is the sky blue?",
-					KeepAlive: &api.Duration{Duration: 3 * time.Second},
-					Options: map[string]any{
-						"seed":        42,
-						"temperature": 0.0,
-					},
-					Stream: &stream,
-				}
-				t.Logf("verifying: %s -> %s", base, quant)
-
-				// Some smaller quantizations can cause models to have poor quality
-				// or get stuck in repetition loops, so we stop as soon as we have any matches
-				anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", "nitrogen", "oxygen"}
-				reqCtx, reqCancel := context.WithCancel(ctx)
-				atLeastOne := false
-				var buf bytes.Buffer
-				genfn := func(response api.GenerateResponse) error {
-					buf.Write([]byte(response.Response))
-					fullResp := strings.ToLower(buf.String())
-					for _, resp := range anyResp {
-						if strings.Contains(fullResp, resp) {
-							atLeastOne = true
-							t.Log(fullResp)
-							reqCancel()
-							break
-						}
-					}
-					return nil
-				}
-
-				done := make(chan int)
-				var genErr error
-				go func() {
-					genErr = client.Generate(reqCtx, &genReq, genfn)
-					done <- 0
-				}()
-
-				select {
-				case <-done:
-					if genErr != nil && !atLeastOne {
-						t.Fatalf("failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
-					}
-				case <-ctx.Done():
-					t.Error("outer test context done while waiting for generate")
-				}
-
-				t.Logf("passed")
-
-			})
-		}
-	}
-}
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -217,7 +217,6 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
 					slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err)
 					return
 				}
-				defer fp.Close()
 				data, err := io.ReadAll(fp)
 				if err != nil {
 					slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err)
@@ -359,14 +358,3 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) {
 		}
 	}
 }
-
-func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
-	deadline, hasDeadline := t.Deadline()
-	if !hasDeadline {
-		return 8 * time.Minute, 10 * time.Minute
-	} else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 {
-		t.Skip("too little time")
-		return time.Duration(0), time.Duration(0)
-	}
-	return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second))
-}
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -239,7 +239,7 @@ func (c *Causal) findStartLoc() (int, error) {
 		}
 	}

-	return 0, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
+	return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, len(c.cells))
 }

 func (c *Causal) updateSlidingWindow() {
--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "de4c07f93783a1a96456a44dc16b9db538ee1618";
+char const *LLAMA_COMMIT = "2016f07bd106c73699ecbaace80f55db5ed95dac";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
--- a/llama/llama.cpp/.rsync-filter
+++ b/llama/llama.cpp/.rsync-filter
@@ -10,11 +10,11 @@ include common/stb_image.*
 include include/
 include include/llama.*
 include include/llama-*.*
-include tools/
-include tools/mtmd/
-include tools/mtmd/clip.*
-include tools/mtmd/clip-impl.*
-include tools/mtmd/llava.*
+include examples/
+include examples/llava/
+include examples/llava/clip.*
+include examples/llava/clip-impl.*
+include examples/llava/llava.*
 include src/
 include src/llama.*
 include src/llama-*.*
--- a/llama/llama.cpp/common/common.cpp
+++ b/llama/llama.cpp/common/common.cpp
@@ -1096,6 +1096,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.n_threads         = params.cpuparams.n_threads;
    cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
                                params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
+    cparams.logits_all        = params.logits_all;
    cparams.embeddings        = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
    cparams.rope_freq_base    = params.rope_freq_base;
@@ -1113,7 +1114,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.offload_kqv       = !params.no_kv_offload;
    cparams.flash_attn        = params.flash_attn;
    cparams.no_perf           = params.no_perf;
-    cparams.op_offload        = !params.no_op_offload;

    if (params.reranking) {
        cparams.embeddings    = true;
@@ -1565,20 +1565,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c

    return result;
 }
-
-ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
-    const int64_t ne_datapoint = llama_n_ctx(ctx);
-    const int64_t ndata        = (tokens.size() - ne_datapoint - 1) / stride;
-    ggml_opt_dataset_t result = ggml_opt_dataset_init(
-        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
-
-    llama_token * data   = (llama_token *) ggml_opt_dataset_data(result)->data;
-    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
-
-    for (int64_t idata = 0; idata < ndata; ++idata) {
-        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
-        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
-    }
-
-    return result;
-}
--- a/llama/llama.cpp/common/common.h
+++ b/llama/llama.cpp/common/common.h
@@ -66,6 +66,7 @@ enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_INFILL,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
@@ -95,7 +96,6 @@ enum common_sampler_type {
    COMMON_SAMPLER_TYPE_XTC         = 8,
    COMMON_SAMPLER_TYPE_INFILL      = 9,
    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
-    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };

 // dimensionality reduction methods, used by cvector-generator
@@ -161,7 +161,6 @@ struct common_params_sampling {
    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
-        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
        COMMON_SAMPLER_TYPE_TOP_P,
@@ -324,6 +323,7 @@ struct common_params {
    bool ctx_shift         = true;  // context shift on inifinite text generation

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
+    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
@@ -332,7 +332,6 @@ struct common_params {
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
    bool check_tensors     = false; // validate tensor data
-    bool no_op_offload     = false; // globally disable offload host tensor operations to device

    bool single_turn       = false; // single turn chat conversation

@@ -341,10 +340,8 @@ struct common_params {

    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

-    // multimodal models (see tools/mtmd)
+    // multimodal models (see examples/llava)
    struct common_params_model mmproj;
-    bool mmproj_use_gpu = true;     // use GPU for multimodal model
-    bool no_mmproj = false;         // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)

    // embedding
@@ -410,14 +407,13 @@ struct common_params {

    bool process_output = false; // collect data for the output tensor
    bool compute_ppl    = true;  // whether to compute perplexity
-    bool parse_special  = false; // whether to parse special tokens during imatrix tokenization

    // cvector-generator params
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
+    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill

@@ -666,9 +662,3 @@ const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

 }
-
-//
-// training utils
-//
-
-ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
--- a/llama/llama.cpp/common/json-schema-to-grammar.cpp
+++ b/llama/llama.cpp/common/json-schema-to-grammar.cpp
@@ -16,9 +16,6 @@ using json = nlohmann::ordered_json;
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
    auto has_max = max_items != std::numeric_limits<int>::max();

-    if (max_items == 0) {
-        return "";
-    }
    if (min_items == 0 && max_items == 1) {
        return item_rule + "?";
    }
--- a/llama/llama.cpp/common/sampling.cpp
+++ b/llama/llama.cpp/common/sampling.cpp
@@ -1,7 +1,6 @@
 #include "sampling.h"

 #include "common.h"
-#include "log.h"

 #include <cmath>
 #include <unordered_map>
@@ -230,48 +229,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                params.logit_bias.data()));

    if (params.mirostat == 0) {
-        for (const auto & cnstr : params.samplers) {
-            switch (cnstr) {
-                case COMMON_SAMPLER_TYPE_DRY:
-                    {
-                        std::vector<const char *> c_breakers;
-                        c_breakers.reserve(params.dry_sequence_breakers.size());
-                        for (const auto & str : params.dry_sequence_breakers) {
-                            c_breakers.push_back(str.c_str());
-                        }
+        if (params.top_n_sigma >= 0) {
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k        (params.top_k));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp         (params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma  (params.top_n_sigma));
+        } else {
+            for (const auto & cnstr : params.samplers) {
+                switch (cnstr) {
+                    case COMMON_SAMPLER_TYPE_DRY:
+                        {
+                            std::vector<const char *> c_breakers;
+                            c_breakers.reserve(params.dry_sequence_breakers.size());
+                            for (const auto & str : params.dry_sequence_breakers) {
+                                c_breakers.push_back(str.c_str());
+                            }

-                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
-                    }
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_K:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p       (params.top_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
-                    break;
-                case COMMON_SAMPLER_TYPE_MIN_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p       (params.min_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_XTC:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc         (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                    break;
-                case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical     (params.typ_p, params.min_keep));
-                    break;
-                case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                    break;
-                case COMMON_SAMPLER_TYPE_INFILL:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill      (vocab));
-                    break;
-                case COMMON_SAMPLER_TYPE_PENALTIES:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties   (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                    break;
-                default:
-                    GGML_ASSERT(false && "unknown sampler type");
+                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                        }
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_K:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_MIN_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_XTC:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                        break;
+                    case COMMON_SAMPLER_TYPE_INFILL:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
+                        break;
+                    case COMMON_SAMPLER_TYPE_PENALTIES:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                        break;
+                    default:
+                        GGML_ASSERT(false && "unknown sampler type");
+                }
            }
        }
        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
@@ -473,7 +475,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
        case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
        case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
-        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
        case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
@@ -489,7 +490,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
        case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
        case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
-        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
        case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
@@ -504,7 +504,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
        { "dry",         COMMON_SAMPLER_TYPE_DRY },
        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
-        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
@@ -518,7 +517,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
-        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
@@ -535,16 +533,14 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
        auto sampler = sampler_canonical_name_map.find(name);
        if (sampler != sampler_canonical_name_map.end()) {
            samplers.push_back(sampler->second);
-            continue;
-        }
-        if (allow_alt_names) {
-            sampler = sampler_alt_name_map.find(name);
-            if (sampler != sampler_alt_name_map.end()) {
-                samplers.push_back(sampler->second);
-                continue;
+        } else {
+            if (allow_alt_names) {
+                sampler = sampler_alt_name_map.find(name);
+                if (sampler != sampler_alt_name_map.end()) {
+                    samplers.push_back(sampler->second);
+                }
            }
        }
-        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
    }

    return samplers;
@@ -556,7 +552,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
@@ -571,8 +566,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
        const auto sampler = sampler_name_map.find(c);
        if (sampler != sampler_name_map.end()) {
            samplers.push_back(sampler->second);
-        } else {
-            LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
        }
    }

--- a/llama/llama.cpp/examples/llava/clip-impl.h
+++ b/llama/llama.cpp/examples/llava/clip-impl.h
@@ -2,6 +2,8 @@
 #include "gguf.h"
 #include "clip.h"

+#include "clip.h"
+
 #include <climits>
 #include <cstdarg>
 #include <string>
@@ -15,29 +17,33 @@
 #define KEY_FTYPE               "general.file_type"
 #define KEY_NAME                "general.name"
 #define KEY_DESCRIPTION         "general.description"
+#define KEY_HAS_TEXT_ENC        "clip.has_text_encoder"
+#define KEY_HAS_VIS_ENC         "clip.has_vision_encoder"
+#define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
+#define KEY_HAS_MINICPMV_PROJ   "clip.has_minicpmv_projector"
+#define KEY_HAS_GLM_PROJ        "clip.has_glm_projector"
 #define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
+#define KEY_HAS_QWEN2VL_MERGER  "clip.has_qwen2vl_merger"
 #define KEY_USE_GELU            "clip.use_gelu"
 #define KEY_USE_SILU            "clip.use_silu"
-#define KEY_N_EMBD              "clip.vision.embedding_length"
-#define KEY_N_FF                "clip.vision.feed_forward_length"
-#define KEY_N_BLOCK             "clip.vision.block_count"
-#define KEY_N_HEAD              "clip.vision.attention.head_count"
-#define KEY_LAYER_NORM_EPS      "clip.vision.attention.layer_norm_epsilon"
-#define KEY_PROJ_DIM            "clip.vision.projection_dim"
+#define KEY_N_EMBD              "clip.%s.embedding_length"
+#define KEY_N_FF                "clip.%s.feed_forward_length"
+#define KEY_N_BLOCK             "clip.%s.block_count"
+#define KEY_N_HEAD              "clip.%s.attention.head_count"
+#define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
+#define KEY_PROJ_DIM            "clip.%s.projection_dim"
+#define KEY_TOKENS              "tokenizer.ggml.tokens"
+#define KEY_N_POSITIONS         "clip.text.context_length"
 #define KEY_IMAGE_SIZE          "clip.vision.image_size"
 #define KEY_PATCH_SIZE          "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN          "clip.vision.image_mean"
 #define KEY_IMAGE_STD           "clip.vision.image_std"
-#define KEY_FEATURE_LAYER       "clip.vision.feature_layer"
-#define KEY_PROJ_SCALE_FACTOR   "clip.vision.projector.scale_factor"
 #define KEY_PROJ_TYPE           "clip.projector_type"
-#define KEY_SPATIAL_MERGE_SIZE  "clip.vision.spatial_merge_size"
+#define KEY_FEATURE_LAYER       "clip.vision.feature_layer"

 #define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
 #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
-#define KEY_WIN_ATTN_PATTERN      "clip.vision.n_wa_pattern"
-#define KEY_ATTN_WINDOW_SIZE      "clip.vision.window_size"


 //
@@ -53,16 +59,10 @@
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
 #define TN_ATTN_OUTPUT     "%s.blk.%d.attn_out.%s"
-#define TN_ATTN_K_NORM     "%s.blk.%d.attn_k_norm.%s"
-#define TN_ATTN_Q_NORM     "%s.blk.%d.attn_q_norm.%s"
 #define TN_FFN_DOWN        "%s.blk.%d.ffn_down.%s"
-#define TN_FFN_GATE        "%s.blk.%d.ffn_gate.%s"
 #define TN_FFN_UP          "%s.blk.%d.ffn_up.%s"
-#define TN_FFN_GATE        "%s.blk.%d.ffn_gate.%s"
-#define TN_LN_1            "%s.blk.%d.ln1.%s" // layer norm
-#define TN_LN_2            "%s.blk.%d.ln2.%s" // layer norm
-#define TN_LS_1            "%s.blk.%d.ls1.%s" // layer scale
-#define TN_LS_2            "%s.blk.%d.ls2.%s" // layer scale
+#define TN_LN_1            "%s.blk.%d.ln1.%s"
+#define TN_LN_2            "%s.blk.%d.ln2.%s"
 #define TN_LN_PRE          "%s.pre_ln.%s"
 #define TN_LN_POST         "%s.post_ln.%s"
 #define TN_LLAVA_PROJ      "mm.%d.%s"
@@ -70,14 +70,8 @@
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
 #define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE   "model.image_newline"
-#define TN_MM_INP_NORM     "mm.input_norm.weight"
 #define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N   "mm.soft_emb_norm.weight"    // gemma3
-#define TN_MM_PROJECTOR    "mm.model.fc.weight"         // idefics3
-#define TN_MM_PATCH_MERGER "mm.patch_merger.weight"     // mistral small 3.1
-#define TN_TOK_IMG_BREAK   "v.token_embd.img_break"     // pixtral
-#define TN_TOK_GLM_BOI     "adapter.boi"                // glm-edge (these embeddings are not in text model)
-#define TN_TOK_GLM_EOI     "adapter.eoi"                // glm-edge (these embeddings are not in text model)

 // mimicpmv
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
@@ -93,23 +87,18 @@
 #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
 #define TN_GLM_ADAPTER_GATE     "adapter.linear.gate.%s"
 #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
-
-// align x to upper multiple of n
-#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
+#define TN_GLM_BOI_W            "adapter.boi"
+#define TN_GLM_EOI_W            "adapter.eoi"

 enum projector_type {
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_LDPV2,
-    PROJECTOR_TYPE_MINICPMV,
+    PROJECTOR_TYPE_RESAMPLER,
    PROJECTOR_TYPE_GLM_EDGE,
-    PROJECTOR_TYPE_QWEN2VL,
+    PROJECTOR_TYPE_MERGER,
    PROJECTOR_TYPE_GEMMA3,
-    PROJECTOR_TYPE_IDEFICS3,
-    PROJECTOR_TYPE_PIXTRAL,
-    PROJECTOR_TYPE_QWEN25VL,
-    PROJECTOR_TYPE_INTERNVL,
    PROJECTOR_TYPE_UNKNOWN,
 };

@@ -117,14 +106,10 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_MLP,       "mlp" },
    { PROJECTOR_TYPE_LDP,       "ldp" },
    { PROJECTOR_TYPE_LDPV2,     "ldpv2"},
-    { PROJECTOR_TYPE_MINICPMV,  "resampler"},
+    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
    { PROJECTOR_TYPE_GLM_EDGE,  "adapter"},
-    { PROJECTOR_TYPE_QWEN2VL,   "qwen2vl_merger"},
-    { PROJECTOR_TYPE_QWEN25VL,  "qwen2.5vl_merger"},
+    { PROJECTOR_TYPE_MERGER,    "qwen2vl_merger"},
    { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
-    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
-    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
-    { PROJECTOR_TYPE_INTERNVL,  "internvl"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -239,15 +224,6 @@ struct clip_image_u8_batch {

 struct clip_image_f32_batch {
    std::vector<clip_image_f32_ptr> entries;
-
-    clip_image_f32_batch clone() const {
-        clip_image_f32_batch new_batch;
-        new_batch.entries.reserve(entries.size());
-        for (const auto & entry : entries) {
-            new_batch.entries.emplace_back(new clip_image_f32(*entry));
-        }
-        return new_batch;
-    }
 };

 //
--- a/llama/llama.cpp/examples/llava/clip.cpp
+++ b/llama/llama.cpp/examples/llava/clip.cpp
--- a/llama/llama.cpp/examples/llava/clip.h
+++ b/llama/llama.cpp/examples/llava/clip.h
@@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
 CLIP_API void clip_free(struct clip_ctx * ctx);

 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
+CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);

 CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
 CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
@@ -59,29 +59,18 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);

-GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
-    "use clip_n_output_tokens instead");
-GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
-    "use clip_n_output_tokens instead");
-
-CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
-
-// for M-RoPE, this will be the number of token positions in X and Y directions
-// for other models, X will be the total number of tokens and Y will be 1
-CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
-
-// this should be equal to the embedding dimension of the text model
-CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);

 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
 CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

-CLIP_API struct clip_image_size      * clip_image_size_init(void);
-CLIP_API struct clip_image_u8        * clip_image_u8_init (void);
-CLIP_API struct clip_image_f32       * clip_image_f32_init(void);
-CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
+CLIP_API struct clip_image_size      * clip_image_size_init();
+CLIP_API struct clip_image_u8        * clip_image_u8_init ();
+CLIP_API struct clip_image_f32       * clip_image_f32_init();
+CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava

 // nx, ny are the output image dimensions
 CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
@@ -125,6 +114,8 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);

+CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
+
 CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);


--- a/llama/llama.cpp/examples/llava/llava.cpp
+++ b/llama/llama.cpp/examples/llava/llava.cpp
@@ -2,7 +2,6 @@
 #include "llava.h"

 #include "llama.h"
-#include "ggml-cpp.h"

 #include <algorithm>
 #include <cerrno>
@@ -113,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 }

 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
-static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
+static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
    struct {
        struct ggml_context * ctx;
    } model;
@@ -176,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>

    model.ctx = ggml_init(params);

-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
    // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
    // fill it with the image embeddings, ignoring the base
    for (size_t i = 1; i < num_images; i++) {
@@ -210,17 +209,13 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
    struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side,  size_ele * clip_n_mmproj_embd(ctx_clip), 0);
    // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
    ggml_build_forward_expand(gf, flatten);
-
-    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
-    GGML_ASSERT(backend != nullptr && "failed to initialize CPU backend");
-    ggml_backend_graph_compute(backend.get(), gf);
-
+    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
    struct ggml_tensor* result = ggml_graph_node(gf, -1);

    memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
    // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
+    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));

    // Debug: Test single segments
    // Current findings: sending base image, sending a segment embedding all works similar to python
@@ -318,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                image_embd_v[i],
                clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
+            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
        }
        *n_img_pos = n_img_pos_out;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -347,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    }
    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
        // flat / default llava-1.5 type embedding
+        *n_img_pos = clip_n_patches(ctx_clip);
        clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
-        *n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
        if (!encoded) {
            LOG_ERR("Unable to encode image\n");
@@ -386,8 +381,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);

        int n_img_pos_out;
-        clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
-        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
+        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
        *n_img_pos = n_img_pos_out;

        for (size_t i = 0; i < image_embd_v.size(); i++) {
--- a/llama/llama.cpp/examples/llava/llava.go
+++ b/llama/llama.cpp/examples/llava/llava.go
@@ -1,4 +1,4 @@
-package mtmd
+package llava

 // #cgo CXXFLAGS: -std=c++11
 // #cgo CPPFLAGS: -I${SRCDIR}/../../include -I${SRCDIR}/../../common
--- a/llama/llama.cpp/examples/llava/llava.h
+++ b/llama/llama.cpp/examples/llava/llava.h
--- a/llama/llama.cpp/include/llama.h
+++ b/llama/llama.cpp/include/llama.h
@@ -4,7 +4,6 @@
 #include "ggml.h"
 #include "ggml-cpu.h"
 #include "ggml-backend.h"
-#include "ggml-opt.h"

 #include <stddef.h>
 #include <stdint.h>
@@ -112,8 +111,6 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
        LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
        LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
-        LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
-        LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
    };

    enum llama_rope_type {
@@ -354,19 +351,20 @@ extern "C" {
        enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
        enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

+        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        // TODO: move at the end of the struct
+        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embeddings;  // if true, extract embeddings (together with logits)
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf;     // whether to measure performance timings
+        bool cross_attn;  // whether to use cross attention
+
        // Abort callback
        // if it returns true, execution of llama_decode() will be aborted
        // currently works only with CPU execution
        ggml_abort_callback abort_callback;
        void *              abort_callback_data;
-
-        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-        bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf;     // whether to measure performance timings
-        bool op_offload;  // whether to offload host tensor operations to device
-        bool cross_attn;  // whether to use cross attention
    };

    // model quantization parameters
@@ -448,10 +446,6 @@ extern "C" {
                                 size_t    n_paths,
              struct llama_model_params    params);

-    LLAMA_API void llama_model_save_to_file(
-            const struct llama_model * model,
-                        const char * path_model);
-
    DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
            "use llama_model_free instead");

@@ -935,19 +929,14 @@ extern "C" {
    // Frees a batch of tokens allocated with llama_batch_init()
    LLAMA_API void llama_batch_free(struct llama_batch batch);

-    // Process a batch of tokens.
-    // In contrast to llama_decode() - this call does not use KV cache.
-    // For encode-decoder contexts, processes the batch using the encoder.
-    // Can store the encoder output internally for later use by the decoder's cross-attention layers.
+    // Processes a batch of tokens with the encoder part of the encoder-decoder model.
+    // Stores the encoder output internally for later use by the decoder cross-attention layers.
    //   0 - success
    // < 0 - error. the KV cache state is restored to the state before this call
    LLAMA_API int32_t llama_encode(
            struct llama_context * ctx,
              struct llama_batch   batch);

-    // Process a batch of tokens.
-    // Requires KV cache.
-    // For encode-decoder contexts, processes the batch using the decoder.
    // Positive return values does not mean a fatal error, but rather a warning.
    //   0 - success
    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
@@ -1248,7 +1237,6 @@ extern "C" {
        "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");

    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    /// Setting k <= 0 makes this a noop
    LLAMA_API struct llama_sampler * llama_sampler_init_top_k      (int32_t k);

    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
@@ -1444,37 +1432,6 @@ extern "C" {
    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);

-    //
-    // training
-    //
-
-    // function that returns whether or not a given tensor contains trainable parameters
-    typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
-
-    // always returns true
-    LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
-
-    struct llama_opt_params {
-        uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
-
-        llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
-        void * param_filter_ud;              // userdata for determining which tensors contain trainable parameters
-
-        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
-        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
-    };
-
-    LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
-
-    LLAMA_API void llama_opt_epoch(
-            struct llama_context    * lctx,
-            ggml_opt_dataset_t        dataset,
-            ggml_opt_result_t         result_train,
-            ggml_opt_result_t         result_eval,
-            int64_t                   idata_split,
-            ggml_opt_epoch_callback   callback_train,
-            ggml_opt_epoch_callback   callback_eval);
-
 #ifdef __cplusplus
 }
 #endif
--- a/llama/llama.cpp/src/llama-adapter.cpp
+++ b/llama/llama.cpp/src/llama-adapter.cpp
@@ -253,9 +253,6 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
    std::vector<ggml_backend_buffer_type_t> buft_extra;
    {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-        if (!cpu_dev) {
-            throw std::runtime_error(format("%s: no CPU backend found", __func__));
-        }
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
@@ -294,9 +291,6 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-                if (!cpu_dev) {
-                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
-                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);

                break;
--- a/llama/llama.cpp/src/llama-arch.cpp
+++ b/llama/llama.cpp/src/llama-arch.cpp
@@ -20,7 +20,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_REFACT,           "refact"           },
    { LLM_ARCH_BERT,             "bert"             },
    { LLM_ARCH_NOMIC_BERT,       "nomic-bert"       },
-    { LLM_ARCH_NOMIC_BERT_MOE,   "nomic-bert-moe"   },
    { LLM_ARCH_JINA_BERT_V2,     "jina-bert-v2"     },
    { LLM_ARCH_BLOOM,            "bloom"            },
    { LLM_ARCH_STABLELM,         "stablelm"         },
@@ -74,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
    { LLM_ARCH_PLM,              "plm"              },
    { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
+    { LLM_ARCH_MISTRAL3,         "mistral3"         },
    { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };

@@ -109,7 +109,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"              },
    { LLM_KV_EXPERT_WEIGHTS_NORM,               "%s.expert_weights_norm"               },
    { LLM_KV_EXPERT_GATING_FUNC,                "%s.expert_gating_func"                },
-    { LLM_KV_MOE_EVERY_N_LAYERS,                "%s.moe_every_n_layers"                },
    { LLM_KV_POOLING_TYPE,                      "%s.pooling_type"                      },
    { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                       },
    { LLM_KV_DECODER_START_TOKEN_ID,            "%s.decoder_start_token_id"            },
@@ -512,24 +511,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
        },
    },
-    {
-        LLM_ARCH_NOMIC_BERT_MOE,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
-            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
-            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
-            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
-            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
-        },
-    },
    {
        LLM_ARCH_JINA_BERT_V2,
        {
@@ -1606,6 +1587,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
        },
    },
+    {
+        LLM_ARCH_MISTRAL3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+        }
+    },
    {
        LLM_ARCH_UNKNOWN,
        {
--- a/llama/llama.cpp/src/llama-arch.h
+++ b/llama/llama.cpp/src/llama-arch.h
@@ -24,7 +24,6 @@ enum llm_arch {
    LLM_ARCH_REFACT,
    LLM_ARCH_BERT,
    LLM_ARCH_NOMIC_BERT,
-    LLM_ARCH_NOMIC_BERT_MOE,
    LLM_ARCH_JINA_BERT_V2,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
@@ -76,6 +75,7 @@ enum llm_arch {
    LLM_ARCH_CHAMELEON,
    LLM_ARCH_SOLAR,
    LLM_ARCH_WAVTOKENIZER_DEC,
+    LLM_ARCH_MISTRAL3,
    LLM_ARCH_PLM,
    LLM_ARCH_BAILINGMOE,
    LLM_ARCH_UNKNOWN,
@@ -113,7 +113,6 @@ enum llm_kv {
    LLM_KV_EXPERT_WEIGHTS_SCALE,
    LLM_KV_EXPERT_WEIGHTS_NORM,
    LLM_KV_EXPERT_GATING_FUNC,
-    LLM_KV_MOE_EVERY_N_LAYERS,
    LLM_KV_POOLING_TYPE,
    LLM_KV_LOGIT_SCALE,
    LLM_KV_DECODER_START_TOKEN_ID,
--- a/llama/llama.cpp/src/llama-batch.cpp
+++ b/llama/llama.cpp/src/llama-batch.cpp
@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
    return ubatch;
 }

-llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
    GGML_ASSERT(batch.n_tokens >= 0);
    this->batch = &batch;
    this->n_embd = n_embd;
@@ -203,7 +203,6 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
    for (size_t i = 0; i < n_tokens; ++i) {
        ids[i] = i;
    }
-
    if (simple_split) {
        seq.resize(1);
        llama_sbatch_seq & s = seq[0];
@@ -213,7 +212,6 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
        s.length = n_tokens;
        return;
    }
-
    std::sort(ids.begin(), ids.end(),
            [&batch](size_t a, size_t b) {
                int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
@@ -241,7 +239,6 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
                return n_seq_a > n_seq_b;
            }
    );
-
    // init seq
    llama_sbatch_seq * last_seq = nullptr;

@@ -265,7 +262,6 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
        seq.push_back(new_seq);
        last_seq = &seq.back();
    }
-
    // keep shared prompts first at the end, then sort by length descending.
    std::sort(seq.begin(), seq.end(),
            [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
--- a/llama/llama.cpp/src/llama-batch.h
+++ b/llama/llama.cpp/src/llama-batch.h
@@ -70,8 +70,7 @@ struct llama_sbatch {
    // sequence-wise split
    llama_ubatch split_seq(size_t n_ubatch);

-    llama_sbatch() = default;
-    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };

 // temporary allocate memory for the input batch if needed
--- a/llama/llama.cpp/src/llama-chat.cpp
+++ b/llama/llama.cpp/src/llama-chat.cpp
@@ -35,7 +35,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "mistral-v3",        LLM_CHAT_TEMPLATE_MISTRAL_V3        },
    { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
    { "mistral-v7",        LLM_CHAT_TEMPLATE_MISTRAL_V7        },
-    { "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
    { "phi3",              LLM_CHAT_TEMPLATE_PHI_3             },
    { "phi4",              LLM_CHAT_TEMPLATE_PHI_4             },
    { "falcon3",           LLM_CHAT_TEMPLATE_FALCON_3          },
@@ -51,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "deepseek3",         LLM_CHAT_TEMPLATE_DEEPSEEK_3        },
    { "command-r",         LLM_CHAT_TEMPLATE_COMMAND_R         },
    { "llama3",            LLM_CHAT_TEMPLATE_LLAMA_3           },
-    { "chatglm3",          LLM_CHAT_TEMPLATE_CHATGLM_3         },
-    { "chatglm4",          LLM_CHAT_TEMPLATE_CHATGLM_4         },
+    { "chatglm3",          LLM_CHAT_TEMPLATE_CHATGML_3         },
+    { "chatglm4",          LLM_CHAT_TEMPLATE_CHATGML_4         },
    { "glmedge",           LLM_CHAT_TEMPLATE_GLMEDGE           },
    { "minicpm",           LLM_CHAT_TEMPLATE_MINICPM           },
    { "exaone3",           LLM_CHAT_TEMPLATE_EXAONE_3          },
@@ -63,7 +62,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "yandex",            LLM_CHAT_TEMPLATE_YANDEX            },
    { "bailing",           LLM_CHAT_TEMPLATE_BAILING           },
    { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4            },
-    { "smolvlm",           LLM_CHAT_TEMPLATE_SMOLVLM           },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -83,9 +81,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
    if (tmpl_contains("<|im_start|>")) {
        return tmpl_contains("<|im_sep|>")
            ? LLM_CHAT_TEMPLATE_PHI_4
-            : tmpl_contains("<end_of_utterance>")
-                ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
-                : LLM_CHAT_TEMPLATE_CHATML;
+            : LLM_CHAT_TEMPLATE_CHATML;
    } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
        if (tmpl_contains("[SYSTEM_PROMPT]")) {
            return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -123,12 +119,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        }
    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
        return LLM_CHAT_TEMPLATE_PHI_3;
-    } else if (tmpl_contains("[gMASK]<sop>")) {
-        return LLM_CHAT_TEMPLATE_CHATGLM_4;
    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
        return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
-    } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
-        return LLM_CHAT_TEMPLATE_GLMEDGE;
    } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
        return LLM_CHAT_TEMPLATE_ZEPHYR;
    } else if (tmpl_contains("bos_token + message['role']")) {
@@ -157,7 +149,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_LLAMA_3;
    } else if (tmpl_contains("[gMASK]sop")) {
        // chatglm3-6b
-        return LLM_CHAT_TEMPLATE_CHATGLM_3;
+        return LLM_CHAT_TEMPLATE_CHATGML_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGML_4;
    } else if (tmpl_contains(LU8("<用户>"))) {
        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
        return LLM_CHAT_TEMPLATE_MINICPM;
@@ -203,20 +197,19 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "<|im_start|>assistant\n";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
        // Official mistral 'v7' template
        // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
-        //      https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
-        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
        for (auto message : chat) {
            std::string role(message->role);
            std::string content(message->content);
            if (role == "system") {
-                ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
+                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
            } else if (role == "user") {
-                ss << "[INST]" << trailing_space << content << "[/INST]";
-            } else {
-                ss << trailing_space << content << "</s>";
+                ss << "[INST] " << content << "[/INST]";
+            }
+            else {
+                ss << " " << content << "</s>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
@@ -439,7 +432,7 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
        // chatglm3-6b
        ss << "[gMASK]" << "sop";
        for (auto message : chat) {
@@ -449,14 +442,14 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "<|assistant|>";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
        ss << "[gMASK]" << "<sop>";
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|" << role << "|>" << "\n" << message->content;
        }
        if (add_ass) {
-            ss << "<|assistant|>\n";
+            ss << "<|assistant|>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
        for (auto message : chat) {
@@ -627,23 +620,7 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "<|header_start|>assistant<|header_end|>\n\n";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
-        // SmolVLM
-        ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
-        for (auto message : chat) {
-            std::string role(message->role);
-            if (role == "system") {
-                ss << message->content << "\n\n";
-            } else if (role == "user") {
-                ss << "User: " << message->content << "<end_of_utterance>\n";
-            } else {
-                ss << "Assistant: " << message->content << "<end_of_utterance>\n";
-            }
-        }
-        if (add_ass) {
-            ss << "Assistant:";
-        }
-    } else {
+    }  else {
        // template not supported
        return -1;
    }
--- a/llama/llama.cpp/src/llama-chat.h
+++ b/llama/llama.cpp/src/llama-chat.h
@@ -14,7 +14,6 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_MISTRAL_V3,
    LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
    LLM_CHAT_TEMPLATE_MISTRAL_V7,
-    LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
    LLM_CHAT_TEMPLATE_PHI_3,
    LLM_CHAT_TEMPLATE_PHI_4,
    LLM_CHAT_TEMPLATE_FALCON_3,
@@ -30,8 +29,8 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_DEEPSEEK_3,
    LLM_CHAT_TEMPLATE_COMMAND_R,
    LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGLM_3,
-    LLM_CHAT_TEMPLATE_CHATGLM_4,
+    LLM_CHAT_TEMPLATE_CHATGML_3,
+    LLM_CHAT_TEMPLATE_CHATGML_4,
    LLM_CHAT_TEMPLATE_GLMEDGE,
    LLM_CHAT_TEMPLATE_MINICPM,
    LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -42,7 +41,6 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_YANDEX,
    LLM_CHAT_TEMPLATE_BAILING,
    LLM_CHAT_TEMPLATE_LLAMA4,
-    LLM_CHAT_TEMPLATE_SMOLVLM,
    LLM_CHAT_TEMPLATE_UNKNOWN,
 };

--- a/llama/llama.cpp/src/llama-context.cpp
+++ b/llama/llama.cpp/src/llama-context.cpp
--- a/llama/llama.cpp/src/llama-context.h
+++ b/llama/llama.cpp/src/llama-context.h
@@ -8,7 +8,6 @@
 #include "llama-kv-cache.h"

 #include "ggml-cpp.h"
-#include "ggml-opt.h"

 #include <map>
 #include <vector>
@@ -29,12 +28,7 @@ struct llama_context {

    void synchronize();

-    const llama_model   & get_model()   const;
-    const llama_cparams & get_cparams() const;
-
-    ggml_backend_sched_t get_sched() const;
-
-    ggml_context * get_ctx_compute() const;
+    const llama_model & get_model() const;

    uint32_t n_ctx()         const;
    uint32_t n_ctx_per_seq() const;
@@ -136,32 +130,6 @@ struct llama_context {
    llama_perf_context_data perf_get_data() const;
    void perf_reset();

-    //
-    // training
-    //
-
-    void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
-
-    void opt_epoch(
-            ggml_opt_dataset_t      dataset,
-            ggml_opt_result_t       result_train,
-            ggml_opt_result_t       result_eval,
-            int64_t                 idata_split,
-            ggml_opt_epoch_callback callback_train,
-            ggml_opt_epoch_callback callback_eval);
-
-    void opt_epoch_iter(
-            ggml_opt_dataset_t               dataset,
-            ggml_opt_result_t                result,
-            const std::vector<llama_token> & tokens,
-            const std::vector<llama_token> & labels_sparse,
-            llama_batch                    & batch,
-            ggml_opt_epoch_callback          callback,
-            bool                             train,
-            int64_t                          idata_in_loop,
-            int64_t                          ndata_in_loop,
-            int64_t                          t_loop_start);
-
 private:
    //
    // output
@@ -171,30 +139,51 @@ private:
    // Returns max number of outputs for which space was reserved.
    int32_t output_reserve(int32_t n_outputs);

+    // make the outputs have the same order they had in the user-provided batch
+    // TODO: maybe remove this
+    void output_reorder();
+
    //
    // graph
    //

-public:
    int32_t graph_max_nodes() const;

    // zero-out inputs and create the ctx_compute for the compute graph
    ggml_cgraph * graph_init();

-    // returns the result of ggml_backend_sched_graph_compute_async execution
-    ggml_status graph_compute(
-            ggml_cgraph * gf,
-                   bool   batched);
-
-private:
    llm_graph_result_ptr graph_build(
            ggml_context * ctx,
             ggml_cgraph * gf,
      const llama_ubatch & ubatch,
          llm_graph_type   gtype);

+    // returns the result of ggml_backend_sched_graph_compute_async execution
+    ggml_status graph_compute(
+            ggml_cgraph * gf,
+                   bool   batched);
+
    llm_graph_cb graph_get_cb() const;

+    // used by kv_self_update()
+    ggml_tensor * build_rope_shift(
+        ggml_context * ctx0,
+        ggml_tensor * cur,
+        ggml_tensor * shift,
+        ggml_tensor * factors,
+              float   freq_base,
+              float   freq_scale,
+        ggml_backend_buffer * bbuf) const;
+
+    llm_graph_result_ptr build_kv_self_shift(
+            ggml_context * ctx0,
+            ggml_cgraph * gf) const;
+
+    llm_graph_result_ptr build_kv_self_defrag(
+            ggml_context * ctx0,
+            ggml_cgraph * gf,
+            const std::vector<struct llama_kv_defrag_move> & moves) const;
+
    // TODO: read/write lora adapters and cvec
    size_t state_write_data(llama_io_write_i & io);
    size_t state_read_data (llama_io_read_i  & io);
@@ -211,10 +200,14 @@ private:
    llama_cparams       cparams;
    llama_adapter_cvec  cvec;
    llama_adapter_loras loras;
+    llama_sbatch        sbatch;

    llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably

-    std::unique_ptr<llama_memory_i> memory;
+    std::unique_ptr<llama_kv_cache_unified> kv_self;
+
+    // TODO: remove
+    bool logits_all = false;

    // decode output (2-dimensional array: [n_outputs][n_vocab])
    size_t  logits_size = 0; // capacity (of floats) for logits
@@ -241,9 +234,6 @@ private:

    ggml_context_ptr ctx_compute;

-    // training
-    ggml_opt_context_t opt_ctx = nullptr;
-
    ggml_threadpool_t threadpool       = nullptr;
    ggml_threadpool_t threadpool_batch = nullptr;

--- a/llama/llama.cpp/src/llama-cparams.h
+++ b/llama/llama.cpp/src/llama-cparams.h
@@ -29,9 +29,8 @@ struct llama_cparams {
    bool offload_kqv;
    bool flash_attn;
    bool no_perf;
-    bool warmup;
-    bool op_offload;
    bool cross_attn;
+    bool warmup;

    enum llama_pooling_type pooling_type;

--- a/llama/llama.cpp/src/llama-graph.cpp
+++ b/llama/llama.cpp/src/llama-graph.cpp
@@ -55,21 +55,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
    if (ubatch->pos && pos) {
        const int64_t n_tokens = ubatch->n_tokens;

-        if (ubatch->token && n_pos_per_embd == 4) {
-            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
-            // the 3 first dims are the same, and 4th dim is all 0
-            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
-            // copy the first dimension
-            for (int i = 0; i < n_tokens; ++i) {
-                pos_data[               i] = ubatch->pos[i];
-                pos_data[    n_tokens + i] = ubatch->pos[i];
-                pos_data[2 * n_tokens + i] = ubatch->pos[i];
-                pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
-            }
-            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
-        } else {
-            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
-        }
+        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
    }
 }

@@ -85,7 +71,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
            ) * f_attn_temp_scale + 1.0;
        }

-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
    }
 }

@@ -284,7 +270,24 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {

        // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
        for (uint32_t i = 0; i < n_kv; ++i) {
-            data[i] = kv_self->s_copy(i);
+            const uint32_t  cell_id = i + kv_self->head;
+
+            //////////////////////////////////////////////
+            // TODO: this should not mutate the KV cache !
+            llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
+
+            // prevent out-of-bound sources
+            if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) {
+                kv_cell.src = cell_id;
+            }
+
+            data[i] = kv_cell.src;
+
+            // TODO: do not mutate the KV cache
+            // ensure copy only happens once
+            if (kv_cell.src != (int32_t) cell_id) {
+                kv_cell.src = cell_id;
+            }
        }
    }
 }
@@ -300,7 +303,18 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {

        // clear unused states
        for (int i = 0; i < n_kv; ++i) {
-            data[i] = kv_self->s_mask(i);
+            const uint32_t  cell_id = i + kv_self->head;
+
+            //////////////////////////////////////////////
+            // TODO: this should not mutate the KV cache !
+            llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
+
+            data[i] = (float) (kv_cell.src >= 0);
+
+            // only clear once
+            if (kv_cell.src < 0) {
+                kv_cell.src = cell_id;
+            }
        }
    }
 }
@@ -584,7 +598,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
    res              (std::make_unique<llm_graph_result>()) {
    }

-int64_t llm_graph_context::n_pos_per_embd() const {
+int64_t llm_graph_context::n_pos_per_token() const {
    return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }

@@ -788,17 +802,13 @@ ggml_tensor * llm_graph_context::build_ffn(
            } break;
    }

-    if (gate && type_gate == LLM_FFN_PAR) {
+    if (type_gate == LLM_FFN_PAR) {
        cur = ggml_mul(ctx0, cur, tmp);
        cb(cur, "ffn_gate_par", il);
    }

    if (down) {
        cur = build_lora_mm(down, cur);
-        if (arch == LLM_ARCH_GLM4) {
-            // GLM4 seems to have numerical issues with half-precision accumulators
-            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
-        }
    }

    if (down_b) {
@@ -906,35 +916,28 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
    ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
    cb(up, "ffn_moe_up", il);

-    ggml_tensor * experts = nullptr;
-    if (gate_exps) {
-        cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-        cb(cur, "ffn_moe_gate", il);
-    } else {
-        cur = up;
-    }
+    ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(gate, "ffn_moe_gate", il);

    switch (type_op) {
        case LLM_FFN_SILU:
            {
-                cur = ggml_silu(ctx0, cur);
-                cb(cur, "ffn_moe_silu", il);
+                gate = ggml_silu(ctx0, gate);
+                cb(gate, "ffn_moe_silu", il);
            } break;
        case LLM_FFN_GELU:
            {
-                cur = ggml_gelu(ctx0, cur);
-                cb(cur, "ffn_moe_gelu", il);
+                gate = ggml_gelu(ctx0, gate);
+                cb(gate, "ffn_moe_gelu", il);
            } break;
        default:
            GGML_ABORT("fatal error");
    }

-    if (gate_exps) {
-        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
-        cb(cur, "ffn_moe_gate_par", il);
-    }
+    ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
+    cb(par, "ffn_moe_gate_par", il);

-    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
    cb(experts, "ffn_moe_down", il);

    if (!weight_before_ffn) {
@@ -977,7 +980,6 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
        //cb(inp->tokens, "inp_tokens", -1);
        ggml_set_input(inp->tokens);
-        res->t_tokens = inp->tokens;

        cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);

@@ -1018,11 +1020,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }

 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());

    auto & cur = inp->pos;

-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
    ggml_set_input(cur);

    res->add_input(std::move(inp));
@@ -1031,12 +1033,11 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }

 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);

    auto & cur = inp->attn_scale;

-    // this need to be 1x1xN for broadcasting
-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
    ggml_set_input(cur);

    res->add_input(std::move(inp));
@@ -1084,7 +1085,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
 }

 ggml_tensor * llm_graph_context::build_inp_s_copy() const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);

    auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);

@@ -1101,7 +1102,7 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
 }

 ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);

    auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);

@@ -1234,19 +1235,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
        ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

        if (v_mla) {
-#if 0
-            // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
-            // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
            cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
            cur = ggml_mul_mat(ctx0, v_mla, cur);
-#else
-            // It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
-            // The permutations are noops and only change how the tensor data is interpreted.
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_mul_mat(ctx0, v_mla, cur);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
-#endif
        }

        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
@@ -1426,6 +1416,8 @@ ggml_tensor * llm_graph_context::build_attn(

    // store to KV cache
    {
+        GGML_ASSERT(!kv_self->recurrent);
+
        const auto kv_head = kv_self->head;

        GGML_ASSERT(kv_self->size == n_ctx);
@@ -1594,7 +1586,7 @@ ggml_tensor * llm_graph_context::build_copy_mask_state(
         ggml_tensor * state_mask,
             int32_t   n_state,
             int32_t   n_seqs) const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);

    const auto n_kv    = kv_self->n;
    const auto kv_head = kv_self->head;
@@ -1626,7 +1618,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
         ggml_tensor * state_mask,
  const llama_ubatch & ubatch,
                 int   il) const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);

    const auto token_shift_count = hparams.token_shift_count;

@@ -1647,7 +1639,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
         ggml_tensor * token_shift,
  const llama_ubatch & ubatch,
                 int   il) const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);

    const auto token_shift_count = hparams.token_shift_count;
    const auto n_embd = hparams.n_embd;
--- a/llama/llama.cpp/src/llama-graph.h
+++ b/llama/llama.cpp/src/llama-graph.h
@@ -19,7 +19,6 @@ struct llama_cparams;

 class llama_memory_i;
 class llama_kv_cache_unified;
-class llama_kv_cache_recurrent;

 // certain models (typically multi-modal) can produce different types of graphs
 enum llm_graph_type {
@@ -92,27 +91,29 @@ public:

 class llm_graph_input_pos : public llm_graph_input_i {
 public:
-    llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
+    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
    virtual ~llm_graph_input_pos() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * pos = nullptr; // I32 [n_batch]

-    const int64_t n_pos_per_embd = 1;
+    const int64_t n_pos_per_token = 1;
 };

 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
    virtual ~llm_graph_input_attn_temp() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * attn_scale = nullptr; // F32 [n_batch]

+    const int64_t n_pos_per_token = 1;
+
    const uint32_t n_attn_temp_floor_scale;
    const float    f_attn_temp_scale;
 };
@@ -188,26 +189,26 @@ public:

 class llm_graph_input_s_copy : public llm_graph_input_i {
 public:
-    llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_copy() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * s_copy; // I32 [kv_size]

-    const llama_kv_cache_recurrent * kv_self;
+    const llama_kv_cache_unified * kv_self;
 };

 class llm_graph_input_s_mask : public llm_graph_input_i {
 public:
-    llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_mask() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * s_mask; // F32 [1, n_kv]

-    const llama_kv_cache_recurrent * kv_self;
+    const llama_kv_cache_unified * kv_self;
 };

 class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -309,7 +310,6 @@ class llm_graph_result_i {
 public:
    virtual ~llm_graph_result_i() = default;

-    virtual ggml_tensor * get_tokens()      = 0;
    virtual ggml_tensor * get_logits()      = 0;
    virtual ggml_tensor * get_embd()        = 0;
    virtual ggml_tensor * get_embd_pooled() = 0;
@@ -324,7 +324,6 @@ class llm_graph_result : public llm_graph_result_i {
 public:
    virtual ~llm_graph_result() = default;

-    ggml_tensor * get_tokens()      override { return t_tokens; }
    ggml_tensor * get_logits()      override { return t_logits; }
    ggml_tensor * get_embd()        override { return t_embd; }
    ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
@@ -341,7 +340,6 @@ public:
    }

    // important graph nodes
-    ggml_tensor * t_tokens      = nullptr;
    ggml_tensor * t_logits      = nullptr;
    ggml_tensor * t_embd        = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;
@@ -365,8 +363,8 @@ struct llm_graph_params {
    const llama_cparams & cparams;
    const llama_ubatch  & ubatch;

-    ggml_backend_sched_t sched;
-    ggml_backend_t backend_cpu;
+    ggml_backend_sched * sched;
+    ggml_backend * backend_cpu;

    const llama_adapter_cvec  * cvec;
    const llama_adapter_loras * loras;
@@ -417,9 +415,9 @@ struct llm_graph_context {

    ggml_context * ctx0 = nullptr;

-    ggml_backend_sched_t sched;
+    ggml_backend_sched * sched;

-    ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+    ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?

    const llama_adapter_cvec  * cvec;
    const llama_adapter_loras * loras;
@@ -432,7 +430,7 @@ struct llm_graph_context {

    llm_graph_context(const llm_graph_params & params);

-    int64_t n_pos_per_embd() const;
+    int64_t n_pos_per_token() const;

    void cb(ggml_tensor * cur, const char * name, int il) const;

--- a/llama/llama.cpp/src/llama-hparams.h
+++ b/llama/llama.cpp/src/llama-hparams.h
@@ -72,7 +72,6 @@ struct llama_hparams {
    float    expert_weights_scale = 0.0;
    bool     expert_weights_norm  = false;
    uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
-    uint32_t moe_every_n_layers   = 0;

    float f_norm_eps;
    float f_norm_rms_eps;
--- a/llama/llama.cpp/src/llama-kv-cache.cpp
+++ b/llama/llama.cpp/src/llama-kv-cache.cpp
--- a/llama/llama.cpp/src/llama-kv-cache.h
+++ b/llama/llama.cpp/src/llama-kv-cache.h
@@ -2,72 +2,32 @@

 #include "llama.h"
 #include "llama-io.h"
-#include "llama-graph.h"
 #include "llama-memory.h"

 #include "ggml-cpp.h"

+#include <functional>
 #include <set>
 #include <vector>

 struct llama_cparams;
 struct llama_hparams;
 struct llama_ubatch;
-struct llama_sbatch;
-struct llama_model;
-struct llama_context;

 struct llama_kv_cache : public llama_memory_i {
-    virtual ~llama_kv_cache() = default;
+    using llama_memory_i::llama_memory_i;

-    // call if batch processing fails - restores the cache state
-    virtual void restore() = 0;
+    virtual void restore() = 0; // call if batch processing fails - restores the cache state
+    virtual void commit() = 0;  // call after successful batch processing - clears any pending state

-    // call after successful batch processing - clears any pending state
-    virtual void commit()  = 0;
+    virtual int32_t get_n_tokens()   const = 0;
+    virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache

-    // process any pending defrag/shift/etc. operations
-    // optionally call once before processing a new batch
-    virtual bool update(llama_context & lctx) = 0;
-
-    // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
-    virtual void defrag_sched(float thold) = 0;
-
-    // simulate full cache, used for allocating worst-case compute buffers
-    virtual void set_full() = 0;
-
-    //
-    // batch processing
-    //
-
-    virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
-
-    // different KV caches require different batch splitting strategies
-    virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0;
-
-    // find an empty slot of size "n_tokens" in the cache
-    virtual bool find_slot(const llama_ubatch & batch) = 0;
-
-    // getters
-    virtual int32_t   get_n_tokens()   const = 0;
-    virtual int32_t   get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
-    virtual llama_pos get_pos_max()    const = 0;
-    virtual bool      get_can_shift()  const = 0;
+    virtual bool get_can_shift() const = 0;

    bool get_can_edit() const override { return get_can_shift(); }
-
-    //
-    // state write/read
-    //
-
-    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
-    virtual void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) = 0;
 };

-//
-// llama_kv_cache_guard
-//
-
 struct llama_kv_cache_guard {
    llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {}

@@ -82,7 +42,7 @@ struct llama_kv_cache_guard {
 private:
    llama_kv_cache * kv;
 };
- 
+
 // block of KV slots to move when defragging
 struct llama_kv_defrag_move {
    uint32_t src;
@@ -90,50 +50,65 @@ struct llama_kv_defrag_move {
    uint32_t len;
 };

-//
-// llama_kv_cache_unified
-//
+struct llama_kv_cell {
+    llama_pos pos   = -1;
+    llama_pos delta =  0;
+    int32_t   src   = -1; // used by recurrent state models to copy states
+    int32_t   tail  = -1;

+    std::set<llama_seq_id> seq_id;
+
+    bool has_seq_id(const llama_seq_id & id) const {
+        return seq_id.find(id) != seq_id.end();
+    }
+
+    bool is_empty() const {
+        return seq_id.empty();
+    }
+
+    bool is_same_seq(const llama_kv_cell & other) const {
+        return seq_id == other.seq_id;
+    }
+};
+
+// ring-buffer of cached KV data
+// TODO: pimpl
 // TODO: add notion of max sequences
 class llama_kv_cache_unified : public llama_kv_cache {
 public:
-    struct kv_cell {
-        llama_pos pos   = -1;
-        llama_pos delta =  0;
-
-        std::set<llama_seq_id> seq_id;
-
-        bool has_seq_id(const llama_seq_id & id) const {
-            return seq_id.find(id) != seq_id.end();
-        }
-
-        bool is_empty() const {
-            return seq_id.empty();
-        }
-
-        bool is_same_seq(const kv_cell & other) const {
-            return seq_id == other.seq_id;
-        }
+    // can be used to query data from the model if needed
+    struct callbacks {
+        std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
    };

-    static uint32_t get_padding(const llama_cparams & cparams);
-
    llama_kv_cache_unified(
-            const llama_model & model,
+            const llama_hparams & hparams,
+            callbacks             cbs);
+
+    virtual ~llama_kv_cache_unified() = default;
+
+    // TODO: become constructor
+    bool init(
+            const llama_model & model,   // TODO: do not reference the model
+          const llama_cparams & cparams,
                    ggml_type   type_k,
                    ggml_type   type_v,
-                         bool   v_trans,
-                         bool   offload,
                     uint32_t   kv_size,
-                     uint32_t   padding);
+                         bool   offload);

-    ~llama_kv_cache_unified() = default;
+    int32_t get_n_tokens()   const override;
+    int32_t get_used_cells() const override;

-    //
-    // llama_memory_i
-    //
+    size_t total_size() const;
+
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos pos_max() const;

    void clear() override;
+    void defrag() override;
+
+    virtual void restore() override;
+    virtual void commit() override;

    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
@@ -143,76 +118,25 @@ public:

    llama_pos seq_pos_max(llama_seq_id seq_id) const override;

-    //
-    // llama_kv_cache
-    //
-
-    void restore() override;
-    void commit()  override;
-
-    bool update(llama_context & ctx) override;
-
-    void defrag_sched(float thold) override;
-
-    void set_full() override;
-
-    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
-
-    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
+    bool get_can_shift() const override;

+    // find an empty slot of size "n_tokens" in the cache
    // updates the cache head
    // Note: On success, it's important that cache.head points
    // to the first cell of the slot.
-    bool find_slot(const llama_ubatch & batch) override;
+    bool find_slot(const llama_ubatch & batch);

-    int32_t get_n_tokens()   const override;
-    int32_t get_used_cells() const override;
+    // TODO: maybe not needed
+    uint32_t get_padding(const llama_cparams & cparams) const;

-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos get_pos_max() const override;
+    // find how many cells are currently in use
+    uint32_t cell_max() const;

-    bool get_can_shift() const override;
-
-    // state write/load
-
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
-
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
-    uint32_t used = 0; // used cells (i.e. at least one seq_id)
-
-    // computed before each graph build
-    uint32_t n = 0;
-
-    std::vector<kv_cell> cells;
-
-    std::vector<ggml_tensor *> k_l; // per layer
-    std::vector<ggml_tensor *> v_l;
-
-private:
-    const llama_model & model;
-    const llama_hparams & hparams;
-
-    bool has_shift = false;
-    bool do_defrag = false;
-
-    bool v_trans   = true;  // the value tensor is transposed
-    bool can_shift = false;
-
-    // required padding
-    uint32_t padding = 1;
-
-    ggml_type type_k = GGML_TYPE_F16;
-    ggml_type type_v = GGML_TYPE_F16;
-
-    std::vector<ggml_context_ptr>        ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
+    size_t size_k_bytes() const;
+    size_t size_v_bytes() const;

    // defrag
+
    struct {
        std::vector<llama_kv_defrag_move> moves;
    } defrag_info;
@@ -221,6 +145,7 @@ private:
    bool defrag_prepare(int32_t n_max_nodes);

    // commit/restore cache
+
    struct slot_range {
        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
        uint32_t c1 = 0;
@@ -231,125 +156,25 @@ private:
        std::vector<slot_range> ranges;
    } pending;

-    // find how many cells are currently in use
-    uint32_t cell_max() const;
-
-    size_t total_size() const;
-
-    size_t size_k_bytes() const;
-    size_t size_v_bytes() const;
-
-    ggml_tensor * build_rope_shift(
-            const llama_cparams & cparams,
-                   ggml_context * ctx,
-                    ggml_tensor * cur,
-                    ggml_tensor * shift,
-                    ggml_tensor * factors,
-                          float   freq_base,
-                          float   freq_scale) const;
-
-    llm_graph_result_ptr build_graph_shift(
-            const llama_cparams & cparams,
-                   ggml_context * ctx,
-                    ggml_cgraph * gf) const;
-
-    llm_graph_result_ptr build_graph_defrag(
-            const llama_cparams & cparams,
-                   ggml_context * ctx,
-                    ggml_cgraph * gf,
-                    const std::vector<llama_kv_defrag_move> & moves) const;
-
-    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
-    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
-
-    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
-    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
-};
-
-//
-// llama_kv_cache_recurrent
-//
-
-class llama_kv_cache_recurrent : public llama_kv_cache {
-public:
-    struct kv_cell {
-        llama_pos pos  = -1;
-        int32_t   src  = -1; // used to copy states
-        int32_t   tail = -1;
-
-        std::set<llama_seq_id> seq_id;
-
-        bool has_seq_id(const llama_seq_id & id) const {
-            return seq_id.find(id) != seq_id.end();
-        }
-
-        bool is_empty() const {
-            return seq_id.empty();
-        }
-
-        bool is_same_seq(const kv_cell & other) const {
-            return seq_id == other.seq_id;
-        }
-    };
-
-    llama_kv_cache_recurrent(
-            const llama_model & model,
-                    ggml_type   type_k,
-                    ggml_type   type_v,
-                         bool   offload,
-                     uint32_t   kv_size);
-
-    ~llama_kv_cache_recurrent() = default;
-
-    //
-    // llama_memory_i
-    //
-
-    void clear() override;
-
-    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
-    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
-    void seq_keep(llama_seq_id seq_id) override;
-    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) override;
-    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
-
-    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
-
-    //
-    // llama_kv_cache
-    //
-
-    void restore() override;
-    void commit()  override;
-
-    bool update(llama_context & lctx) override;
-
-    void defrag_sched(float thold) override;
-
-    void set_full() override;
-
-    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
-
-    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
-
-    bool find_slot(const llama_ubatch & batch) override;
-
-    int32_t get_n_tokens()   const override;
-    int32_t get_used_cells() const override;
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos get_pos_max() const override;
-
-    bool get_can_shift() const override;
-
-    // TODO: temporary methods - they are not really const as they do const_cast<>, fix this
-    int32_t s_copy(int i) const;
-    float   s_mask(int i) const;
-
    // state write/load

-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1);
+
+    // members
+
+    const llama_hparams & hparams;
+
+    callbacks cbs;
+
+    bool has_shift = false;
+    bool do_defrag = false;
+
+    // TODO: remove this and implement llama_kv_cache_recurrent instead
+    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+
+    bool v_trans   = true;  // the value tensor is transposed
+    bool can_shift = false;

    // Note: The value of head isn't only used to optimize searching
    // for a free KV slot. llama_decode_impl also uses it, so it
@@ -361,41 +186,18 @@ public:
    // computed before each graph build
    uint32_t n = 0;

-    std::vector<kv_cell> cells;
+    std::vector<llama_kv_cell> cells;

    std::vector<ggml_tensor *> k_l; // per layer
    std::vector<ggml_tensor *> v_l;

 private:
-    //const llama_model & model;
-    const llama_hparams & hparams;
-
-    // commit/restore cache
-    // TODO: rework for recurrent cache
-    struct slot_range {
-        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
-        uint32_t c1 = 0;
-    };
-
-    // pending cell updates that are not yet committed
-    struct {
-        std::vector<slot_range> ranges;
-    } pending;
-
    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;

    std::vector<ggml_context_ptr>        ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

-    // find how many cells are currently in use
-    uint32_t cell_max() const;
-
-    size_t total_size() const;
-
-    size_t size_k_bytes() const;
-    size_t size_v_bytes() const;
-
    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;

@@ -403,6 +205,11 @@ private:
    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
 };

+// TODO: temporarily reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
+//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
+//public:
+//    using llama_kv_cache_unified::llama_kv_cache_unified;
+//};

 //
 // kv cache view
--- a/llama/llama.cpp/src/llama-memory.h
+++ b/llama/llama.cpp/src/llama-memory.h
@@ -2,22 +2,12 @@

 #include "llama.h"

-struct llama_memory_params {
-    // kv cache
-    ggml_type type_k;
-    ggml_type type_v;
-
-    // parameters for other types of memory
-    // ...
-};
-
 // general concept of LLM memory
 // the KV cache is a type of LLM memory, but there can be other types
 class llama_memory_i {
 public:
-    virtual ~llama_memory_i() = default;
-
    virtual void clear() = 0;
+    virtual void defrag() = 0;

    virtual bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) = 0;
    virtual void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
--- a/llama/llama.cpp/src/llama-model-loader.cpp
+++ b/llama/llama.cpp/src/llama-model-loader.cpp
@@ -301,12 +301,12 @@ namespace GGUFMeta {
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

        switch (arr_info.gt) {
-            case GGUF_TYPE_UINT32:
-            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
-                                                (std::is_same<T, uint32_t>::value)); break;
-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T,    float>::value)); break;
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_INT32:   GGML_ASSERT(
+                                            (std::is_same<T,  int32_t>::value) ||
+                                            (std::is_same<T, uint32_t>::value));  break;
            default:
-                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
        }

        result.resize(arr_info.length);
@@ -332,12 +332,12 @@ namespace GGUFMeta {
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

        switch (arr_info.gt) {
-            case GGUF_TYPE_UINT32:
-            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
-                                                (std::is_same<T, uint32_t>::value)); break;
-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T,    float>::value)); break;
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_INT32:   GGML_ASSERT(
+                                            (std::is_same<T,  int32_t>::value) ||
+                                            (std::is_same<T, uint32_t>::value));  break;
            default:
-                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
        }

        if (arr_info.length > N_MAX) {
@@ -826,10 +826,6 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
        mmaps_used.reserve(files.size());
        for (const auto & file : files) {
            auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
-            if (!reg) {
-                throw std::runtime_error(format("%s: no CPU backend found", __func__));
-            }
-
            auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
            std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
            mmaps_used.emplace_back(mapping->size(), 0);
--- a/llama/llama.cpp/src/llama-model-saver.cpp
+++ b/llama/llama.cpp/src/llama-model-saver.cpp
@@ -1,281 +0,0 @@
-#include "llama-model-saver.h"
-
-#include "gguf.h"
-
-#include "llama.h"
-#include "llama-hparams.h"
-#include "llama-model.h"
-#include "llama-vocab.h"
-
-#include <string>
-
-llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
-    gguf_ctx = gguf_init_empty();
-}
-
-llama_model_saver::~llama_model_saver() {
-    gguf_free(gguf_ctx);
-}
-
-void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
-    gguf_set_val_u32(gguf_ctx, llm_kv(key).c_str(), value);
-}
-
-void llama_model_saver::add_kv(const enum llm_kv key, const int32_t value) {
-    gguf_set_val_i32(gguf_ctx, llm_kv(key).c_str(), value);
-}
-
-void llama_model_saver::add_kv(const enum llm_kv key, const float value) {
-    gguf_set_val_f32(gguf_ctx, llm_kv(key).c_str(), value);
-}
-
-void llama_model_saver::add_kv(const enum llm_kv key, const bool value) {
-    gguf_set_val_bool(gguf_ctx, llm_kv(key).c_str(), value);
-}
-
-void llama_model_saver::add_kv(const enum llm_kv key, const char * value) {
-    gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), value);
-}
-
-[[noreturn]]
-void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
-    GGML_UNUSED(key);
-    GGML_UNUSED(value);
-    GGML_ABORT("fatal error"); // this should never be called, only needed to make the template below compile
-}
-
-template <typename Container>
-void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
-    const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
-    GGML_ASSERT(n_values <= value.size());
-
-    if (n_values == 0) {
-        return;
-    }
-
-    if (per_layer) {
-        bool all_values_the_same = true;
-        for (size_t i = 1; i < n_values; ++i) {
-            if (value[i] != value[0]) {
-                all_values_the_same = false;
-                break;
-            }
-        }
-        if (all_values_the_same) {
-            add_kv(key, value[0]);
-            return;
-        }
-    }
-
-    if (std::is_same<typename Container::value_type, uint8_t>::value) {
-        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT8, value.data(), n_values);
-    } else if (std::is_same<typename Container::value_type, int8_t>::value) {
-        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values);
-    } else if (std::is_same<typename Container::value_type, uint32_t>::value) {
-        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values);
-    } else if (std::is_same<typename Container::value_type, int32_t>::value) {
-        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values);
-    } else if (std::is_same<typename Container::value_type, float>::value) {
-        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_FLOAT32, value.data(), n_values);
-    } else if (std::is_same<Container, std::string>::value) {
-        gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), reinterpret_cast<const char *>(value.data()));
-    } else {
-        GGML_ABORT("fatal error");
-    }
-}
-
-void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
-    std::vector<const char *> tmp(value.size());
-    for (size_t i = 0; i < value.size(); ++i) {
-        tmp[i] = value[i].c_str();
-    }
-    gguf_set_arr_str(gguf_ctx, llm_kv(key).c_str(), tmp.data(), tmp.size());
-}
-
-void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
-    if (!tensor) {
-        return;
-    }
-    if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) {
-        GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME
-        return;
-    }
-    gguf_add_tensor(gguf_ctx, tensor);
-}
-
-void llama_model_saver::add_kv_from_model() {
-    const llama_hparams & hparams = model.hparams;
-    const llama_vocab   & vocab   = model.vocab;
-
-    const int32_t n_vocab = vocab.n_tokens();
-    std::vector<std::string> tokens(n_vocab);
-    std::vector<float>       scores(n_vocab);
-    std::vector<int32_t>     token_types(n_vocab);
-
-    for (int32_t id = 0; id < n_vocab; ++id) {
-        const llama_vocab::token_data & token_data = vocab.get_token_data(id);
-
-        tokens[id] = token_data.text;
-        scores[id] = token_data.score;
-
-        switch(token_data.attr) {
-            case LLAMA_TOKEN_ATTR_UNKNOWN:      token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN;      break;
-            case LLAMA_TOKEN_ATTR_UNUSED:       token_types[id] = LLAMA_TOKEN_TYPE_UNUSED;       break;
-            case LLAMA_TOKEN_ATTR_NORMAL:       token_types[id] = LLAMA_TOKEN_TYPE_NORMAL;       break;
-            case LLAMA_TOKEN_ATTR_CONTROL:      token_types[id] = LLAMA_TOKEN_TYPE_CONTROL;      break;
-            case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
-            case LLAMA_TOKEN_ATTR_BYTE:         token_types[id] = LLAMA_TOKEN_TYPE_BYTE;         break;
-            case LLAMA_TOKEN_ATTR_UNDEFINED:
-            default:                            token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED;    break;
-        }
-    }
-
-    // add_kv(LLM_KV_GENERAL_TYPE,                      ???);
-    add_kv(LLM_KV_GENERAL_ARCHITECTURE,              model.arch_name());
-    // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION,      ???);
-    // add_kv(LLM_KV_GENERAL_ALIGNMENT,                 ???);
-    add_kv(LLM_KV_GENERAL_NAME,                      model.name);
-    // add_kv(LLM_KV_GENERAL_AUTHOR,                    ???);
-    // add_kv(LLM_KV_GENERAL_VERSION,                   ???);
-    // add_kv(LLM_KV_GENERAL_URL,                       ???);
-    // add_kv(LLM_KV_GENERAL_DESCRIPTION,               ???);
-    // add_kv(LLM_KV_GENERAL_LICENSE,                   ???);
-    // add_kv(LLM_KV_GENERAL_SOURCE_URL,                ???);
-    // add_kv(LLM_KV_GENERAL_SOURCE_HF_REPO,            ???);
-
-    add_kv(LLM_KV_VOCAB_SIZE,                        vocab.n_tokens());
-    add_kv(LLM_KV_CONTEXT_LENGTH,                    hparams.n_ctx_train);
-    add_kv(LLM_KV_EMBEDDING_LENGTH,                  hparams.n_embd);
-    add_kv(LLM_KV_BLOCK_COUNT,                       hparams.n_layer);
-    add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
-    add_kv(LLM_KV_FEED_FORWARD_LENGTH,               hparams.n_ff_arr, true);
-    add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
-    add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
-    add_kv(LLM_KV_USE_PARALLEL_RESIDUAL,             hparams.use_par_res);
-    // add_kv(LLM_KV_TENSOR_DATA_LAYOUT,                ???);
-    add_kv(LLM_KV_EXPERT_COUNT,                      hparams.n_expert);
-    add_kv(LLM_KV_EXPERT_USED_COUNT,                 hparams.n_expert_used);
-    add_kv(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
-    add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale);
-    add_kv(LLM_KV_POOLING_TYPE,                      uint32_t(hparams.pooling_type));
-    add_kv(LLM_KV_LOGIT_SCALE,                       hparams.f_logit_scale);
-    add_kv(LLM_KV_DECODER_START_TOKEN_ID,            hparams.dec_start_token_id);
-    add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING,            hparams.f_attn_logit_softcapping);
-    add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING,           hparams.f_final_logit_softcapping);
-    add_kv(LLM_KV_SWIN_NORM,                         hparams.swin_norm);
-    add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS,            hparams.rescale_every_n_layers);
-    add_kv(LLM_KV_TIME_MIX_EXTRA_DIM,                hparams.time_mix_extra_dim);
-    add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM,              hparams.time_decay_extra_dim);
-    add_kv(LLM_KV_RESIDUAL_SCALE,                    hparams.f_residual_scale);
-    add_kv(LLM_KV_EMBEDDING_SCALE,                   hparams.f_embedding_scale);
-
-    add_kv(LLM_KV_ATTENTION_HEAD_COUNT,              hparams.n_head_arr, true);
-    add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV,           hparams.n_head_kv_arr, true);
-    add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS,          hparams.f_max_alibi_bias);
-    add_kv(LLM_KV_ATTENTION_CLAMP_KQV,               hparams.f_clamp_kqv);
-    add_kv(LLM_KV_ATTENTION_KEY_LENGTH,              hparams.n_embd_head_k);
-    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH,            hparams.n_embd_head_v);
-    add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS,           hparams.f_norm_eps);
-    add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
-    add_kv(LLM_KV_ATTENTION_CAUSAL,                  hparams.causal_attn);
-    add_kv(LLM_KV_ATTENTION_Q_LORA_RANK,             hparams.n_lora_q);
-    add_kv(LLM_KV_ATTENTION_KV_LORA_RANK,            hparams.n_lora_kv);
-    add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,  hparams.n_rel_attn_bkts);
-    add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa);
-    add_kv(LLM_KV_ATTENTION_SCALE,                   hparams.f_attention_scale);
-
-    const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
-
-    add_kv(LLM_KV_ROPE_DIMENSION_COUNT,              hparams.n_rot);
-    add_kv(LLM_KV_ROPE_FREQ_BASE,                    hparams.rope_freq_base_train);
-    // add_kv(LLM_KV_ROPE_SCALE_LINEAR,                 rope_scaling_factor); // old name
-    add_kv(LLM_KV_ROPE_SCALING_TYPE,                 llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
-    add_kv(LLM_KV_ROPE_SCALING_FACTOR,               rope_scaling_factor);
-    add_kv(LLM_KV_ROPE_SCALING_ATTN_FACTOR,          hparams.rope_attn_factor);
-    add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,         hparams.n_ctx_orig_yarn);
-    add_kv(LLM_KV_ROPE_SCALING_FINETUNED,            hparams.rope_finetuned);
-    add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,         hparams.rope_yarn_log_mul);
-
-    // TODO: implement split file support
-    // add_kv(LLM_KV_SPLIT_NO,                          ???);
-    // add_kv(LLM_KV_SPLIT_COUNT,                       ???);
-    // add_kv(LLM_KV_SPLIT_TENSORS_COUNT,               ???);
-
-    add_kv(LLM_KV_SSM_INNER_SIZE,                    hparams.ssm_d_inner);
-    add_kv(LLM_KV_SSM_CONV_KERNEL,                   hparams.ssm_d_conv);
-    add_kv(LLM_KV_SSM_STATE_SIZE,                    hparams.ssm_d_state);
-    add_kv(LLM_KV_SSM_TIME_STEP_RANK,                hparams.ssm_dt_rank);
-    add_kv(LLM_KV_SSM_DT_B_C_RMS,                    hparams.ssm_dt_b_c_rms);
-
-    add_kv(LLM_KV_WKV_HEAD_SIZE,                     hparams.wkv_head_size);
-
-    add_kv(LLM_KV_TOKENIZER_MODEL,                   vocab.get_tokenizer_model());
-    add_kv(LLM_KV_TOKENIZER_PRE,                     vocab.get_tokenizer_pre());
-    add_kv(LLM_KV_TOKENIZER_LIST,                    tokens);
-    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE,              token_types);
-    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,        vocab.n_token_types());
-    add_kv(LLM_KV_TOKENIZER_SCORES,                  scores);
-    add_kv(LLM_KV_TOKENIZER_MERGES,                  vocab.get_bpe_merges());
-    // FIXME llama_token is type i32 but when reading in a GGUF file u32 is expected, not an issue for writing though
-    add_kv(LLM_KV_TOKENIZER_BOS_ID,                  uint32_t(vocab.token_bos()));
-    add_kv(LLM_KV_TOKENIZER_EOS_ID,                  uint32_t(vocab.token_eos()));
-    add_kv(LLM_KV_TOKENIZER_EOT_ID,                  uint32_t(vocab.token_eot()));
-    add_kv(LLM_KV_TOKENIZER_EOM_ID,                  uint32_t(vocab.token_eom()));
-    add_kv(LLM_KV_TOKENIZER_UNK_ID,                  uint32_t(vocab.token_unk()));
-    add_kv(LLM_KV_TOKENIZER_SEP_ID,                  uint32_t(vocab.token_sep()));
-    add_kv(LLM_KV_TOKENIZER_PAD_ID,                  uint32_t(vocab.token_pad()));
-    // add_kv(LLM_KV_TOKENIZER_CLS_ID,                  uint32_t(vocab.token_bos())); // deprecated
-    // add_kv(LLM_KV_TOKENIZER_MASK_ID,                 ???);
-    add_kv(LLM_KV_TOKENIZER_ADD_BOS,                 vocab.get_add_bos());
-    add_kv(LLM_KV_TOKENIZER_ADD_EOS,                 vocab.get_add_eos());
-    add_kv(LLM_KV_TOKENIZER_ADD_PREFIX,              vocab.get_add_space_prefix());
-    add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,         vocab.get_remove_extra_whitespaces());
-    add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,    vocab.get_precompiled_charsmap());
-    // add_kv(LLM_KV_TOKENIZER_HF_JSON,                 ???);
-    // add_kv(LLM_KV_TOKENIZER_RWKV,                    ???);
-    add_kv(LLM_KV_TOKENIZER_FIM_PRE_ID,              uint32_t(vocab.token_fim_pre()));
-    add_kv(LLM_KV_TOKENIZER_FIM_SUF_ID,              uint32_t(vocab.token_fim_suf()));
-    add_kv(LLM_KV_TOKENIZER_FIM_MID_ID,              uint32_t(vocab.token_fim_mid()));
-    add_kv(LLM_KV_TOKENIZER_FIM_PAD_ID,              uint32_t(vocab.token_fim_pad()));
-    add_kv(LLM_KV_TOKENIZER_FIM_REP_ID,              uint32_t(vocab.token_fim_rep()));
-    add_kv(LLM_KV_TOKENIZER_FIM_SEP_ID,              uint32_t(vocab.token_fim_sep()));
-
-    // TODO: implement LoRA support
-    // add_kv(LLM_KV_ADAPTER_TYPE,                      ???);
-    // add_kv(LLM_KV_ADAPTER_LORA_ALPHA,                ???);
-
-    // deprecated
-    // add_kv(LLM_KV_TOKENIZER_PREFIX_ID,               ???);
-    // add_kv(LLM_KV_TOKENIZER_SUFFIX_ID,               ???);
-    // add_kv(LLM_KV_TOKENIZER_MIDDLE_ID,               ???);
-}
-
-void llama_model_saver::add_tensors_from_model() {
-    if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
-        add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
-    }
-    add_tensor(model.type_embd);
-    add_tensor(model.pos_embd);
-    add_tensor(model.tok_norm);
-    add_tensor(model.tok_norm_b);
-    add_tensor(model.output_norm);
-    add_tensor(model.output_norm_b);
-    add_tensor(model.output);
-    add_tensor(model.output_b);
-    add_tensor(model.output_norm_enc);
-    add_tensor(model.cls);
-    add_tensor(model.cls_b);
-    add_tensor(model.cls_out);
-    add_tensor(model.cls_out_b);
-
-    for (const struct llama_layer & layer : model.layers) {
-        for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
-            add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
-        }
-    }
-}
-
-void llama_model_saver::save(const std::string & path_model) {
-    gguf_write_to_file(gguf_ctx, path_model.c_str(), false);
-}
-
--- a/llama/llama.cpp/src/llama-model-saver.h
+++ b/llama/llama.cpp/src/llama-model-saver.h
@@ -1,37 +0,0 @@
-#pragma once
-
-#include "llama.h"
-#include "llama-arch.h"
-
-#include <vector>
-
-struct llama_model_saver {
-    struct gguf_context * gguf_ctx = nullptr;
-    const struct llama_model & model;
-    const struct LLM_KV llm_kv;
-
-    llama_model_saver(const struct llama_model & model);
-    ~llama_model_saver();
-
-    void add_kv(enum llm_kv key, uint32_t     value);
-    void add_kv(enum llm_kv key, int32_t      value);
-    void add_kv(enum llm_kv key, float        value);
-    void add_kv(enum llm_kv key, bool         value);
-    void add_kv(enum llm_kv key, const char * value);
-
-    [[noreturn]]
-    void add_kv(enum llm_kv key, char value); // needed to make the template below compile
-
-    template <typename Container>
-    void add_kv(enum llm_kv key, const Container & value, bool per_layer = false);
-
-    void add_kv(enum llm_kv key, const std::vector<std::string> & value);
-
-    void add_tensor(const struct ggml_tensor * tensor);
-
-    void add_kv_from_model();
-
-    void add_tensors_from_model();
-
-    void save(const std::string & path_model);
-};
--- a/llama/llama.cpp/src/llama-model.cpp
+++ b/llama/llama.cpp/src/llama-model.cpp
@@ -40,17 +40,14 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_335M:          return "335M";
        case LLM_TYPE_410M:          return "410M";
        case LLM_TYPE_450M:          return "450M";
-        case LLM_TYPE_475M:          return "475M";
        case LLM_TYPE_770M:          return "770M";
        case LLM_TYPE_780M:          return "780M";
        case LLM_TYPE_0_5B:          return "0.5B";
-        case LLM_TYPE_0_6B:          return "0.6B";
        case LLM_TYPE_1B:            return "1B";
        case LLM_TYPE_1_3B:          return "1.3B";
        case LLM_TYPE_1_4B:          return "1.4B";
        case LLM_TYPE_1_5B:          return "1.5B";
        case LLM_TYPE_1_6B:          return "1.6B";
-        case LLM_TYPE_1_7B:          return "1.7B";
        case LLM_TYPE_1_8B:          return "1.8B";
        case LLM_TYPE_2B:            return "2B";
        case LLM_TYPE_2_8B:          return "2.8B";
@@ -69,7 +66,6 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_15B:           return "15B";
        case LLM_TYPE_16B:           return "16B";
        case LLM_TYPE_20B:           return "20B";
-        case LLM_TYPE_27B:           return "27B";
        case LLM_TYPE_30B:           return "30B";
        case LLM_TYPE_32B:           return "32B";
        case LLM_TYPE_34B:           return "34B";
@@ -78,9 +74,7 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_65B:           return "65B";
        case LLM_TYPE_70B:           return "70B";
        case LLM_TYPE_236B:          return "236B";
-        case LLM_TYPE_290B:          return "290B";
        case LLM_TYPE_314B:          return "314B";
-        case LLM_TYPE_405B:          return "405B";
        case LLM_TYPE_671B:          return "671B";
        case LLM_TYPE_SMALL:         return "0.1B";
        case LLM_TYPE_MEDIUM:        return "0.4B";
@@ -94,10 +88,10 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_16x3_8B:       return "16x3.8B";
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
        case LLM_TYPE_57B_A14B:      return "57B.A14B";
+        case LLM_TYPE_27B:           return "27B";
+        case LLM_TYPE_290B:          return "290B";
        case LLM_TYPE_17B_16E:       return "17Bx16E (Scout)";
        case LLM_TYPE_17B_128E:      return "17Bx128E (Maverick)";
-        case LLM_TYPE_30B_A3B:       return "30B.A3B";
-        case LLM_TYPE_235B_A22B:     return "235B.A22B";
        default:                     return "?B";
    }
 }
@@ -117,10 +111,6 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE,   "longrope"   },
 };

-std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
-    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
-}
-
 static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
@@ -303,10 +293,6 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
    // add extra buffer types, only if no GPU device is present
    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
-
    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
@@ -605,7 +591,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 80: type = LLM_TYPE_70B; break;
-                    case 162: type = LLM_TYPE_405B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
@@ -724,19 +709,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                }
            } break;
        case LLM_ARCH_NOMIC_BERT:
-        case LLM_ARCH_NOMIC_BERT_MOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);
-                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);

                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
-                    if (arch == LLM_ARCH_NOMIC_BERT) {
-                        type = LLM_TYPE_137M;
-                    } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
-                        type = LLM_TYPE_475M;
-                    }
+                    type = LLM_TYPE_137M;
                }
            } break;
        case LLM_ARCH_BLOOM:
@@ -797,7 +776,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            // fall through
        case LLM_ARCH_QWEN2:
            {
-                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
@@ -827,10 +805,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
-                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
-                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
-                    case 40: type = LLM_TYPE_14B; break;
-                    case 64: type = LLM_TYPE_32B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
@@ -840,8 +814,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
-                    case 48: type = LLM_TYPE_30B_A3B; break;
-                    case 94: type = LLM_TYPE_235B_A22B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
@@ -1453,6 +1425,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_MISTRAL3: break;
        default: throw std::runtime_error("unsupported model architecture");
    }

@@ -1521,9 +1494,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    }

    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@@ -1691,11 +1661,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
                    std::regex pattern(overrides->pattern);
                    if (std::regex_search(tensor_name, pattern)) {
+                        LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
                        buft = overrides->buft;
-                        LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
-                                tensor_name.c_str(),
-                                ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
-                                ggml_backend_buft_name(buft));
                        break;
                    }
                }
@@ -1712,9 +1679,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            auto * buft_dev = ggml_backend_buft_get_device(buft);
            if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-                if (!cpu_dev) {
-                    throw std::runtime_error("no CPU backend found");
-                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);
            }

@@ -1942,9 +1906,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);

-                        if (n_ff > 0) {
-                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        }
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@@ -1954,11 +1916,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        }

-                        if (n_ff > 0) {
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        }
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);

                        // optional MLP bias
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
@@ -2173,7 +2133,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                } break;
            case LLM_ARCH_BERT:
            case LLM_ARCH_NOMIC_BERT:
-            case LLM_ARCH_NOMIC_BERT_MOE:
                {
                    tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
                    type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
@@ -2207,31 +2166,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                        }

-                        if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
-                            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-                        }
-
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {n_embd, n_embd}, 0);

                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);

-                        if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
-                            layer.bo         = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff,   n_expert}, 0);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff,   n_embd, n_expert}, 0);
-                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,   "weight", i), {n_embd, n_expert}, 0);
-                        } else {
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,        "weight", i), {n_embd, n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN,      "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,        "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN,      "weight", i), {n_ff, n_embd}, 0);

-                            if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
-                                layer.bo         = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-                                layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
-                                layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-                            } else {
-                                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                            }
+                        if (arch == LLM_ARCH_BERT) {
+                            layer.bo         = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
+                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+                        } else {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        }

                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
@@ -3602,11 +3550,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                    // output
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
@@ -4239,9 +4183,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        if (!dev) {
            // FIXME: workaround for CPU backend buft having a NULL device
            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-            if (!dev) {
-                throw std::runtime_error(format("%s: no CPU backend found", __func__));
-            }
        }
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
@@ -4371,7 +4312,7 @@ uint64_t llama_model::n_elements() const {
 }

 void llama_model::print_info() const {
-    const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
+    const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);

    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
        bool is_var = false;
@@ -4432,7 +4373,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: causal attn      = %d\n",     __func__, hparams.causal_attn);
        LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
        LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
-        LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type.c_str());
+        LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);
        LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",     __func__, hparams.n_ctx_orig_yarn);
@@ -4579,19 +4520,6 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
    return it->second;
 }

-ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
-    // choose long/short freq factors based on the context size
-    if (layers[il].rope_freqs != nullptr) {
-        return layers[il].rope_freqs;
-    }
-
-    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
-        return layers[il].rope_long;
-    }
-
-    return layers[il].rope_short;
-}
-
 struct llm_build_llama : public llm_graph_context {
    llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -4632,7 +4560,7 @@ struct llm_build_llama : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4954,7 +4882,7 @@ struct llm_build_mllama: public llm_graph_context {
                // self attention layer

                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -5078,7 +5006,6 @@ struct llm_build_deci : public llm_graph_context {
            ggml_tensor * inpSA = inpL;
            const int64_t n_head_kv = hparams.n_head_kv(il);
            const int64_t n_head    = hparams.n_head(il);
-            const int64_t n_ff      = hparams.n_ff(il);

            if (n_head == 0) {
                // attention-free layer of Llama-3_1-Nemotron-51B
@@ -5098,7 +5025,7 @@ struct llm_build_deci : public llm_graph_context {
            } else if (n_head > 0) {
                // self-attention
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -5154,11 +5081,6 @@ struct llm_build_deci : public llm_graph_context {
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

-            // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
-            if (n_ff == 0) {
-                continue;
-            }
-
            // For Granite architecture
            if (hparams.f_residual_scale) {
                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
@@ -6152,11 +6074,6 @@ struct llm_build_bert : public llm_graph_context {
                cur = build_lora_mm(model.layers[il].wqkv, cur);
                cb(cur, "wqkv", il);

-                if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
-                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                    cb(cur, "bqkv", il);
-                }
-
                Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
                Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -6209,29 +6126,13 @@ struct llm_build_bert : public llm_graph_context {
            cb(ffn_inp, "ffn_inp", il);

            // feed-forward network
-            if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
-                // MoE branch
-                cur = build_moe_ffn(cur,
-                        model.layers[il].ffn_gate_inp,
-                        model.layers[il].ffn_up_exps,
-                        nullptr,
-                        model.layers[il].ffn_down_exps,
-                        nullptr,
-                        hparams.n_expert,
-                        hparams.n_expert_used,
-                        LLM_FFN_GELU,
-                        false, false,
-                        0.0f,
-                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
-                cb(cur, "ffn_moe_out", il);
-            } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+            if (model.arch == LLM_ARCH_BERT) {
                cur = build_ffn(cur,
                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
                        NULL,                      NULL,                        NULL,
                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                        NULL,
                        LLM_FFN_GELU, LLM_FFN_SEQ, il);
-                cb(cur, "ffn_out", il);
            } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
                cur = build_ffn(cur,
                        model.layers[il].ffn_up,   NULL,                        NULL,
@@ -6239,7 +6140,6 @@ struct llm_build_bert : public llm_graph_context {
                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                        NULL,
                        LLM_FFN_GELU, LLM_FFN_PAR, il);
-                cb(cur, "ffn_out", il);
            } else {
                cur = build_ffn(cur,
                        model.layers[il].ffn_up,   NULL, NULL,
@@ -6247,8 +6147,8 @@ struct llm_build_bert : public llm_graph_context {
                        model.layers[il].ffn_down, NULL, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
-                cb(cur, "ffn_out", il);
            }
+            cb(cur, "ffn_out", il);

            // attentions bypass the intermediate layer
            cur = ggml_add(ctx0, cur, ffn_inp);
@@ -7585,7 +7485,7 @@ struct llm_build_phi3 : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for 128k context
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);

                ggml_tensor* attn_norm_output = build_norm(inpL,
                        model.layers[il].attn_norm,
@@ -8337,7 +8237,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

-            ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+            ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);

            // norm
            cur = build_norm(inpL,
@@ -9104,7 +9004,7 @@ struct llm_build_mamba : public llm_graph_context {
             ggml_tensor * state_mask,
      const llama_ubatch & ubatch,
                     int   il) const {
-        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);

        const auto kv_head = kv_self->head;

@@ -9405,7 +9305,7 @@ struct llm_build_cohere2 : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for 128k context
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -10343,7 +10243,7 @@ struct llm_build_deepseek : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11707,7 +11607,7 @@ struct llm_build_exaone : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11852,7 +11752,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
            ggml_tensor * state_mask,
            const llama_ubatch & ubatch,
            int   il) const {
-        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);

        const auto n_tokens = ubatch.n_tokens;
        const auto n_seqs = ubatch.n_seqs;
@@ -12248,7 +12148,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
            ggml_tensor *& first_layer_value,
            const llama_ubatch & ubatch,
            int   il) const {
-        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);

        const auto n_tokens = ubatch.n_tokens;
        const auto n_seqs = ubatch.n_seqs;
@@ -12796,7 +12696,7 @@ struct llm_build_solar : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -13247,7 +13147,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -13367,46 +13267,36 @@ struct llm_build_bailingmoe : public llm_graph_context {
    }
 };

-llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
+llama_memory_i * llama_model::create_memory() const {
    llama_memory_i * res;

    switch (arch) {
-        case LLM_ARCH_BERT:
-        case LLM_ARCH_JINA_BERT_V2:
-        case LLM_ARCH_NOMIC_BERT:
-        case LLM_ARCH_NOMIC_BERT_MOE:
-            {
-                res = nullptr;
-            } break;
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
            {
-                res = new llama_kv_cache_recurrent(
-                        *this,
-                        GGML_TYPE_F32,
-                        GGML_TYPE_F32,
-                        cparams.offload_kqv,
-                        std::max((uint32_t) 1, cparams.n_seq_max));
+                res = new llama_kv_cache_unified(hparams, {
+                    /*.get_rope_factors =*/ nullptr
+                });
            } break;
        default:
            {
-                const auto padding = llama_kv_cache_unified::get_padding(cparams);
+                res = new llama_kv_cache_unified(hparams, {
+                    /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
+                        // choose long/short freq factors based on the context size
+                        if (layers[il].rope_freqs != nullptr) {
+                            return layers[il].rope_freqs;
+                        }

-                cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
+                        if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+                            return layers[il].rope_long;
+                        }

-                LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
-
-                res = new llama_kv_cache_unified(
-                        *this,
-                        params.type_k,
-                        params.type_v,
-                        !cparams.flash_attn,
-                        cparams.offload_kqv,
-                        cparams.n_ctx,
-                        padding);
+                        return layers[il].rope_short;
+                    }
+                });
            }
    }

@@ -13459,7 +13349,6 @@ llm_graph_result_ptr llama_model::build_graph(
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_NOMIC_BERT:
-        case LLM_ARCH_NOMIC_BERT_MOE:
            {
                llm = std::make_unique<llm_build_bert>(*this, params, gf);
            } break;
@@ -13797,6 +13686,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
+        case LLM_ARCH_PLAMO:
+        case LLM_ARCH_ORION:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
@@ -13814,6 +13705,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_SOLAR:
        case LLM_ARCH_BAILINGMOE:
+        case LLM_ARCH_MISTRAL3:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
@@ -13822,7 +13714,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
-        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
@@ -13835,7 +13726,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
-        case LLM_ARCH_PLAMO:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
@@ -13843,7 +13733,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
-        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_MINICPM3:
@@ -13916,14 +13805,6 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
-        // one-off fix for very popular models (so we are not flooded with issues)
-        // do not extend this list unless absolutely necessary
-        // Mistral-Small-2503 does not have built-in chat template
-        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
-        if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
-            return "mistral-v7-tekken";
-        }
-
        return nullptr;
    }

--- a/llama/llama.cpp/src/llama-model.h
+++ b/llama/llama.cpp/src/llama-model.h
@@ -37,17 +37,14 @@ enum llm_type {
    LLM_TYPE_335M,
    LLM_TYPE_410M,
    LLM_TYPE_450M,
-    LLM_TYPE_475M,
    LLM_TYPE_770M,
    LLM_TYPE_780M,
    LLM_TYPE_0_5B,
-    LLM_TYPE_0_6B,
    LLM_TYPE_1B,
    LLM_TYPE_1_3B,
    LLM_TYPE_1_4B,
    LLM_TYPE_1_5B,
    LLM_TYPE_1_6B,
-    LLM_TYPE_1_7B,
    LLM_TYPE_1_8B,
    LLM_TYPE_2B,
    LLM_TYPE_2_8B,
@@ -67,7 +64,6 @@ enum llm_type {
    LLM_TYPE_16B,
    LLM_TYPE_20B,
    LLM_TYPE_22B,
-    LLM_TYPE_27B,
    LLM_TYPE_30B,
    LLM_TYPE_32B,
    LLM_TYPE_34B,
@@ -77,9 +73,7 @@ enum llm_type {
    LLM_TYPE_70B,
    LLM_TYPE_90B,
    LLM_TYPE_236B,
-    LLM_TYPE_290B,
    LLM_TYPE_314B,
-    LLM_TYPE_405B,
    LLM_TYPE_671B,
    LLM_TYPE_SMALL,
    LLM_TYPE_MEDIUM,
@@ -93,14 +87,12 @@ enum llm_type {
    LLM_TYPE_16x3_8B,
    LLM_TYPE_10B_128x3_66B,
    LLM_TYPE_57B_A14B,
+    LLM_TYPE_27B,
+    LLM_TYPE_290B,
    LLM_TYPE_17B_16E, // llama4 Scout
    LLM_TYPE_17B_128E, // llama4 Maverick
-    LLM_TYPE_30B_A3B,
-    LLM_TYPE_235B_A22B,
 };

-std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
-
 struct llama_layer_posnet {
    // resnet
    struct ggml_tensor * norm1   = nullptr;
@@ -413,11 +405,8 @@ struct llama_model {

    const struct ggml_tensor * get_tensor(const char * name) const;

-    ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
-
-    // note: can mutate `cparams`
    // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
+    llama_memory_i * create_memory() const; // TODO: params

    // TODO: move this to new llm_arch_model_i interface
    llm_graph_result_ptr build_graph(
--- a/llama/llama.cpp/src/llama-quant.cpp
+++ b/llama/llama.cpp/src/llama-quant.cpp
@@ -519,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        nthread = std::thread::hardware_concurrency();
    }

-    // mmap consistently increases speed on Linux, and also increases speed on Windows with
+    // mmap consistently increases speed Linux, and also increases speed on Windows with
    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
 #if defined(__linux__) || defined(_WIN32)
    constexpr bool use_mmap = true;
@@ -529,7 +529,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

    llama_model_kv_override * kv_overrides = nullptr;
    if (params->kv_overrides) {
-        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
        kv_overrides = v->data();
    }

@@ -744,6 +744,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        // This used to be a regex, but <regex> has an extreme cost to compile times.
        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

+        // don't quantize vision stuff
+        quantize &= name.find("v.") == std::string::npos;
+        quantize &= name.find("mm.") == std::string::npos;
+
        // quantize only 2D and 3D tensors (experts)
        quantize &= (ggml_n_dims(tensor) >= 2);

--- a/llama/llama.cpp/src/llama-sampling.cpp
+++ b/llama/llama.cpp/src/llama-sampling.cpp
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
    // }

    if (k <= 0) {
-        return;
+        k = cur_p->size;
    }

    k = std::min(k, (int) cur_p->size);
@@ -298,7 +298,6 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
        }
        cur_p->sorted = true;
    }
-
    cur_p->size = k;
 }

@@ -1750,35 +1749,23 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
 static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;

-    if (ctx->n <= 0.0f || cur_p->size <= 1) {
-        return;
-    }
-
    // find max logit and calculate mean
    float max = cur_p->data[0].logit;
    float logits_sum = 0;
-    size_t valid_count = 0;
    for (size_t i = 0; i < cur_p->size; ++i) {
-        // Only count non-negative infinity values
-        if (cur_p->data[i].logit != -INFINITY) {
-            if (cur_p->data[i].logit > max) {
-                max = cur_p->data[i].logit;
-            }
-            logits_sum += cur_p->data[i].logit;
-            valid_count++;
+        if (cur_p->data[i].logit > max) {
+            max = cur_p->data[i].logit;
        }
+        logits_sum += cur_p->data[i].logit;
    }
-    float mean = valid_count > 0 ? logits_sum/valid_count : 0;
+    float mean = logits_sum/cur_p->size;

    // calculate standard deviation
    float acc = 0;
    for (size_t i = 0; i < cur_p->size; ++i) {
-        // Skip -infinity in std calculation
-        if (cur_p->data[i].logit != -INFINITY) {
-            acc += pow(cur_p->data[i].logit - mean, 2);
-        }
+        acc += pow(cur_p->data[i].logit - mean, 2);
    }
-    float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
+    float std = sqrt(acc/cur_p->size);

    //apply mask
    for (size_t i = 0; i < cur_p->size; ++i) {
--- a/llama/llama.cpp/src/llama-vocab.cpp
+++ b/llama/llama.cpp/src/llama-vocab.cpp
@@ -1,7 +1,5 @@
 #include "llama-vocab.h"

-#include "ggml.h"
-#include "gguf.h"
 #include "llama-impl.h"
 #include "llama-model-loader.h"

@@ -417,13 +415,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                };
                break;
-            case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
-                regex_exprs = {
-                    // original regex from tokenizer.json
-                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
-                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                };
-                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
@@ -1236,9 +1227,6 @@ struct fragment_buffer_variant {
 struct llama_vocab::impl {
    uint32_t n_token_types = 0; // for BERT-style token types

-    std::string tokenizer_model;
-    std::string tokenizer_pre;
-
    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
    enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

@@ -1374,6 +1362,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

    // determine vocab type
    {
+        std::string tokenizer_model;
+        std::string tokenizer_pre;
+
        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
        ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);

@@ -1468,10 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

            const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
            if (precompiled_charsmap_keyidx != -1) {
-                const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
-                GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
-
-                const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
+                size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
                const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
                precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
 #ifdef IS_BIG_ENDIAN
@@ -1509,8 +1497,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "llama3"   ||
                    tokenizer_pre == "llama-v3" ||
                    tokenizer_pre == "llama-bpe"||
-                    tokenizer_pre == "falcon3"  ||
-                    tokenizer_pre == "pixtral") {
+                    tokenizer_pre == "falcon3") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                ignore_merges = true;
                add_bos = true;
@@ -1637,10 +1624,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "bailingmoe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                clean_spaces = false;
-            } else if (
-                tokenizer_pre == "seed-coder") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
-                clean_spaces = false;
            } else {
                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -2786,14 +2769,6 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
    pimpl->load(ml, kv);
 }

-std::string llama_vocab::get_tokenizer_model() const {
-    return pimpl->tokenizer_model;
-}
-
-std::string llama_vocab::get_tokenizer_pre() const {
-    return pimpl->tokenizer_pre;
-}
-
 enum llama_vocab_type llama_vocab::get_type() const {
    return pimpl->type;
 }
@@ -3016,20 +2991,6 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
    return it->second;
 }

-std::vector<std::string> llama_vocab::get_bpe_merges() const {
-    std::vector<std::string> result(pimpl->bpe_ranks.size());
-
-    for (const auto & pair : pimpl->bpe_ranks) {
-        result[pair.second] = pair.first.first + " " + pair.first.second;
-    }
-
-    return result;
-}
-
-std::vector<char> llama_vocab::get_precompiled_charsmap() const {
-    return pimpl->precompiled_charsmap;
-}
-
 int32_t llama_vocab::tokenize(
                  const char * text,
                     int32_t   text_len,
--- a/llama/llama.cpp/src/llama-vocab.h
+++ b/llama/llama.cpp/src/llama-vocab.h
@@ -21,9 +21,6 @@ struct llama_vocab {

    void load(llama_model_loader & ml, const LLM_KV & kv);

-    std::string get_tokenizer_model() const;
-    std::string get_tokenizer_pre() const;
-
    enum llama_vocab_type     get_type()     const;
    enum llama_vocab_pre_type get_pre_type() const;

@@ -83,9 +80,6 @@ struct llama_vocab {
    int max_token_len() const;

    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
-    std::vector<std::string> get_bpe_merges() const;
-
-    std::vector<char> get_precompiled_charsmap() const;

    int32_t tokenize(
                   const char * text,
--- a/llama/llama.cpp/src/llama.cpp
+++ b/llama/llama.cpp/src/llama.cpp
@@ -4,7 +4,6 @@
 #include "llama-mmap.h"
 #include "llama-vocab.h"
 #include "llama-model-loader.h"
-#include "llama-model-saver.h"
 #include "llama-model.h"

 #include "ggml.h"
@@ -254,13 +253,6 @@ struct llama_model * llama_model_load_from_splits(
    return llama_model_load_from_file_impl(splits.front(), splits, params);
 }

-void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
-    llama_model_saver ms(*model);
-    ms.add_kv_from_model();
-    ms.add_tensors_from_model();
-    ms.save(path_model);
-}
-
 //
 // chat templates
 //
@@ -346,4 +338,3 @@ const char * llama_print_system_info(void) {

    return s.c_str();
 }
-
--- a/llama/llama.cpp/tools/mtmd/clip.cpp
+++ b/llama/llama.cpp/tools/mtmd/clip.cpp
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -2,11 +2,10 @@ package llama

 /*
 #cgo CFLAGS: -std=c11
-#cgo windows CFLAGS: -Wno-dll-attribute-on-redeclaration
 #cgo CXXFLAGS: -std=c++17
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common
-#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/tools/mtmd
+#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/examples/llava
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/src
 #cgo CPPFLAGS: -I${SRCDIR}/../ml/backend/ggml/ggml/include

@@ -40,8 +39,8 @@ import (
 	"unsafe"

 	_ "github.com/ollama/ollama/llama/llama.cpp/common"
+	_ "github.com/ollama/ollama/llama/llama.cpp/examples/llava"
 	_ "github.com/ollama/ollama/llama/llama.cpp/src"
-	_ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd"
 	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
 )

@@ -199,6 +198,7 @@ type ModelParams struct {
 	NumGpuLayers int
 	MainGpu      int
 	UseMmap      bool
+	UseMlock     bool
 	TensorSplit  []float32
 	Progress     func(float32)
 	VocabOnly    bool
@@ -217,6 +217,7 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 	cparams.n_gpu_layers = C.int(params.NumGpuLayers)
 	cparams.main_gpu = C.int32_t(params.MainGpu)
 	cparams.use_mmap = C.bool(params.UseMmap)
+	cparams.use_mlock = C.bool(params.UseMlock)
 	cparams.vocab_only = C.bool(params.VocabOnly)

 	if len(params.TensorSplit) > 0 {
@@ -460,6 +461,24 @@ func (m *Model) NEmbd() int {
 	return int(C.llama_model_n_embd(m.c))
 }

+func Quantize(infile, outfile string, ftype uint32) error {
+	cinfile := C.CString(infile)
+	defer C.free(unsafe.Pointer(cinfile))
+
+	coutfile := C.CString(outfile)
+	defer C.free(unsafe.Pointer(coutfile))
+
+	params := C.llama_model_quantize_default_params()
+	params.nthread = -1
+	params.ftype = ftype
+
+	if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
+		return fmt.Errorf("llama_model_quantize: %d", rc)
+	}
+
+	return nil
+}
+
 // vision processing
 type ClipContext struct {
 	c *C.struct_clip_ctx
@@ -587,6 +606,9 @@ type SamplingParams struct {
 	PenaltyRepeat  float32
 	PenaltyFreq    float32
 	PenaltyPresent float32
+	Mirostat       int
+	MirostatTau    float32
+	MirostatEta    float32
 	PenalizeNl     bool
 	Seed           uint32
 	Grammar        string
@@ -603,6 +625,9 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
 	cparams.penalty_repeat = C.float(params.PenaltyRepeat)
 	cparams.penalty_freq = C.float(params.PenaltyFreq)
 	cparams.penalty_present = C.float(params.PenaltyFreq)
+	cparams.mirostat = C.int32_t(params.Mirostat)
+	cparams.mirostat_tau = C.float(params.MirostatTau)
+	cparams.mirostat_eta = C.float(params.MirostatEta)
 	cparams.seed = C.uint32_t(params.Seed)

 	grammar := C.CString(params.Grammar)
@@ -637,8 +662,8 @@ func SchemaToGrammar(schema []byte) []byte {
 	cStr := C.CString(string(schema))
 	defer C.free(unsafe.Pointer(cStr))

-	// Allocate buffer for grammar based on schema length but with upper bound
-	maxLen := min(1024*1024, len(schema)*4)
+	// Allocate buffer for grammar output with reasonable size
+	const maxLen = 32768 // 32KB
 	buf := make([]byte, maxLen)

 	// Call C function to convert schema to grammar
--- a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
+++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
@@ -24,7 +24,7 @@ problem.
 9 files changed, 21 insertions(+), 2 deletions(-)

 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index b30b4cb3..0ce73a99 100644
+index 273075f4..dd11f304 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
@@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -43,7 +43,7 @@ index b30b4cb3..0ce73a99 100644
 }
 
 static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-@@ -1871,6 +1871,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+@@ -1867,6 +1867,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_aligned_free(buffer->context, buffer->size);
@@ -55,7 +55,7 @@ index b30b4cb3..0ce73a99 100644
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-@@ -1918,7 +1923,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+@@ -1914,7 +1919,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
 };
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
 
 /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index b4b85abc..cb0d8528 100644
+index a7febef7..31750b6f 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -96,7 +96,7 @@ index b4b85abc..cb0d8528 100644
 }
 
 static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
-@@ -790,6 +791,7 @@ struct ggml_backend_cuda_split_buffer_context {
+@@ -789,6 +790,7 @@ struct ggml_backend_cuda_split_buffer_context {
 static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     delete ctx;
@@ -104,7 +104,7 @@ index b4b85abc..cb0d8528 100644
 }
 
 static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1067,6 +1069,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
+@@ -1062,6 +1064,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
 
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     CUDA_CHECK(cudaFreeHost(buffer->context));
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
 
 static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 576f9581..1b56f858 100644
+index 266d8af4..12886cd3 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -5214,6 +5214,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     }
 
     free(ctx);
@@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
 
 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index 4f0abb5a..de1ec184 100644
+index a0667b7d..bd83adc5 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -483,6 +483,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
     GGML_ASSERT(status);
     delete ctx;
@@ -161,10 +161,10 @@ index 4f0abb5a..de1ec184 100644
 
 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 0ea72994..ae3a3c33 100644
+index 1de34c96..4600f61e 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -320,6 +320,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
     ggml_sycl_set_device(ctx->device);
 
     delete ctx;
@@ -172,7 +172,7 @@ index 0ea72994..ae3a3c33 100644
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -765,6 +766,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context {
 static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
     delete ctx;
@@ -180,7 +180,7 @@ index 0ea72994..ae3a3c33 100644
 }
 
 static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1099,6 +1101,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
 
 static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_sycl_host_free(buffer->context);
@@ -189,10 +189,10 @@ index 0ea72994..ae3a3c33 100644
 
 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index e2b357fd..68768029 100644
+index 39f3cd34..c569a8a5 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -8962,6 +8962,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     ggml_vk_destroy_buffer(ctx->dev_buffer);
     delete ctx;
@@ -200,7 +200,7 @@ index e2b357fd..68768029 100644
 }
 
 static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -9105,6 +9106,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
 static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
     ggml_vk_host_free(vk_instance.devices[0], buffer->context);
--- a/llama/patches/0002-pretokenizer.patch
+++ b/llama/patches/0002-pretokenizer.patch
@@ -10,10 +10,10 @@ logs instead of throwing an error
 1 file changed, 3 insertions(+), 11 deletions(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 9389ca80..806c1b3d 100644
+index 48060517..a35b498c 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -1503,16 +1503,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         if (type == LLAMA_VOCAB_TYPE_BPE) {
             add_space_prefix = false;
             clean_spaces = true;
@@ -31,8 +31,8 @@ index 9389ca80..806c1b3d 100644
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -1651,7 +1642,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
-                 pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
+@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+                 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                 clean_spaces = false;
             } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
--- a/llama/patches/0003-embeddings.patch
+++ b/llama/patches/0003-embeddings.patch
@@ -11,10 +11,10 @@ instead of forcing one or the error
 1 file changed, 3 insertions(+), 3 deletions(-)

 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 62246c10..dca22d8b 100644
+index 983385f8..32f59819 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -901,7 +901,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     int64_t n_outputs_all = 0;
 
     // count outputs
@@ -23,7 +23,7 @@ index 62246c10..dca22d8b 100644
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs_all += batch.logits[i] != 0;
         }
-@@ -982,7 +982,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
         //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
         //}
 
@@ -32,7 +32,7 @@ index 62246c10..dca22d8b 100644
         auto * t_embd   = cparams.embeddings ? res->get_embd() : nullptr;
 
         if (t_embd && res->get_embd_pooled()) {
-@@ -1151,7 +1151,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
+@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
--- a/llama/patches/0004-clip-unicode.patch
+++ b/llama/patches/0004-clip-unicode.patch
@@ -6,16 +6,16 @@ Subject: [PATCH] clip-unicode
 fixes loading vision models in llama.cpp on windows
 filesystems for paths that include wide characters
 ---
- tools/mtmd/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
+ examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

-diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
-index 41ba45a7..cdd8ca44 100644
--- a/tools/mtmd/clip.cpp
-+++ b/tools/mtmd/clip.cpp
-@@ -31,6 +31,19 @@
- #include <numeric>
- #include <functional>
+diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
+index 75970615..d57b4bd6 100644
+--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
+@@ -29,6 +29,19 @@
+ #include <limits>
+ #include <array>
 
 +#if defined(_WIN32)
 +#define WIN32_LEAN_AND_MEAN
@@ -32,8 +32,8 @@ index 41ba45a7..cdd8ca44 100644
 +
 struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
 
- enum ffn_op_type {
-@@ -2190,7 +2203,29 @@ struct clip_model_loader {
+ //#define CLIP_DEBUG_FUNCTIONS
+@@ -1430,7 +1443,29 @@ struct clip_model_loader {
         {
             std::vector<uint8_t> read_buf;
 
@@ -63,7 +63,7 @@ index 41ba45a7..cdd8ca44 100644
             if (!fin) {
                 throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
             }
-@@ -2217,7 +2252,11 @@ struct clip_model_loader {
+@@ -1457,7 +1492,11 @@ struct clip_model_loader {
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
                 }
             }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
ParthSareen	23e8ac9428	wip?	2025-05-07 19:00:44 -07:00
ParthSareen	611d3a17ed	server: add python tool parsing logic	2025-05-02 16:23:54 -07:00