Compare commits

..

1 Commit

Author SHA1 Message Date
Bruce MacDonald 5a2cd7b48a runner: add test for unicode token processing 2025-05-14 11:29:11 -07:00
152 changed files with 2745 additions and 7391 deletions

View File

@@ -103,18 +103,21 @@ jobs:
arch: [amd64]
preset: ['CPU']
include:
- os: windows
arch: amd64
preset: 'CUDA 11'
install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
cuda-version: '11.3'
- os: windows
arch: amd64
preset: 'CUDA 12'
install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
cuda-version: '12.8'
flags: ''
- os: windows
arch: amd64
preset: 'ROCm 6'
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
rocm-version: '6.2'
flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
env:
@@ -157,9 +160,6 @@ jobs:
echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append
- if: matrix.preset == 'CPU'
run: |
echo "CC=clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
@@ -178,9 +178,9 @@ jobs:
key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}
- name: Build target "${{ matrix.preset }}"
run: |
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
cmake --preset "${{ matrix.preset }}"
cmake --build --parallel --preset "${{ matrix.preset }}"
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
env:
@@ -246,7 +246,7 @@ jobs:
dist\${{ matrix.os }}-${{ matrix.arch }}-app.exe
windows-sign:
runs-on: windows
runs-on: windows-2022
environment: release
needs: [windows-depends, windows-build]
steps:
@@ -322,21 +322,16 @@ jobs:
- run: |
for COMPONENT in bin/* lib/ollama/*; do
case "$COMPONENT" in
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_sbsa/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v11) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v12) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
esac
done
working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
- run: |
echo "Manifests"
for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in ; do
echo $ARCHIVE
cat $ARCHIVE
done
- run: |
for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
@@ -475,18 +470,8 @@ jobs:
- uses: actions/download-artifact@v4
with:
pattern: dist-linux-*
path: stage
merge-multiple: false
- name: Merge linux amd64 payload
working-directory: stage/dist-linux-amd64-archive
run: |
tar zxf ollama-linux-amd64.tgz
tar zxf ../dist-linux-amd64-rocm/ollama-linux-amd64.tgz
rm -f ollama-linux-amd64.tgz ../dist-linux-amd64-rocm/ollama-linux-amd64.tgz
tar -c -f- --owner 0 --group 0 . | pigz -9vc > ../ollama-linux-amd64.tgz
- name: Cleanup linux payloads
run: |
find stage -name ollama-linux\*.tgz -exec mv {} dist/ \;
path: dist
merge-multiple: true
- run: find . -type f -not -name 'sha256sum.txt' | xargs sha256sum | tee sha256sum.txt
working-directory: dist
- name: Create or update Release

View File

@@ -36,7 +36,7 @@ jobs:
| xargs python3 -c "import sys; from pathlib import Path; print(any(Path(x).match(glob) for x in sys.argv[1:] for glob in '$*'.split(' ')))"
}
echo changed=$(changed 'llama/llama.cpp/**/*' 'ml/backend/ggml/ggml/**/*') | tee -a $GITHUB_OUTPUT
echo changed=$(changed 'llama/llama.cpp/**' 'ml/backend/ggml/ggml/**') | tee -a $GITHUB_OUTPUT
linux:
needs: [changes]
@@ -46,7 +46,7 @@ jobs:
include:
- preset: CPU
- preset: CUDA
container: nvidia/cuda:12.8.1-devel-ubuntu22.04
container: nvidia/cuda:11.8.0-devel-ubuntu22.04
flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
- preset: ROCm
container: rocm/dev-ubuntu-22.04:6.1.2
@@ -78,11 +78,11 @@ jobs:
include:
- preset: CPU
- preset: CUDA
install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
- preset: ROCm
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
flags: '-DAMDGPU_TARGETS=gfx1010'
runs-on: windows
steps:
- run: |
@@ -102,7 +102,7 @@ jobs:
$ErrorActionPreference = "Stop"
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
}
$cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
@@ -120,9 +120,6 @@ jobs:
echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append
echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append
- if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }}
uses: actions/cache/save@v4
with:
@@ -136,8 +133,8 @@ jobs:
path: ${{ github.workspace }}\.ccache
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
- run: |
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
cmake --build --parallel --preset "${{ matrix.preset }}"
env:

View File

@@ -51,8 +51,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)
add_compile_definitions(NDEBUG)
set(GGML_CPU ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
@@ -78,13 +76,14 @@ if(CMAKE_CUDA_COMPILER)
find_package(CUDAToolkit)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
set(OLLAMA_CUDA_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/cuda_v${CUDAToolkit_VERSION_MAJOR})
install(TARGETS ggml-cuda
RUNTIME_DEPENDENCIES
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
PRE_INCLUDE_REGEXES cublas cublasLt cudart
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
RUNTIME DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
LIBRARY DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
)
endif()
@@ -115,11 +114,7 @@ if(CMAKE_HIP_COMPILER)
set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
install(TARGETS ggml-hip
RUNTIME_DEPENDENCY_SET rocm
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
)
install(RUNTIME_DEPENDENCY_SET rocm
RUNTIME_DEPENDENCIES
DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}
PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
PRE_EXCLUDE_REGEXES ".*"

View File

@@ -17,12 +17,20 @@
"name": "CUDA",
"inherits": [ "Default" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
}
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
}
},
{
@@ -50,7 +58,6 @@
"name": "ROCm 6",
"inherits": [ "ROCm" ],
"cacheVariables": {
"CMAKE_HIP_FLAGS": "-parallel-jobs=4",
"AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
}
}
@@ -71,6 +78,11 @@
"configurePreset": "CUDA",
"targets": [ "ggml-cuda" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 11"
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],

View File

@@ -7,13 +7,12 @@ ARG JETPACK5VERSION=r35.4.1
ARG JETPACK6VERSION=r36.4.0
ARG CMAKEVERSION=3.31.2
# We require gcc v10 minimum. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
# CUDA v11 requires gcc v10. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
RUN yum install -y yum-utils \
&& yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
&& rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
&& dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
&& dnf install -y ccache \
&& yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
@@ -39,6 +38,15 @@ RUN --mount=type=cache,target=/root/.ccache \
&& cmake --build --parallel --preset 'CPU' \
&& cmake --install build --component CPU --strip --parallel 8
FROM base AS cuda-11
ARG CUDA11VERSION=11.3
RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
ENV PATH=/usr/local/cuda-11/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'CUDA 11' \
&& cmake --build --parallel --preset 'CUDA 11' \
&& cmake --install build --component CUDA --strip --parallel 8
FROM base AS cuda-12
ARG CUDA12VERSION=12.8
RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
@@ -90,15 +98,17 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
go build -trimpath -buildmode=pie -o /bin/ollama .
FROM --platform=linux/amd64 scratch AS amd64
COPY --from=cuda-12 dist/lib/ollama /lib/ollama
COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
FROM --platform=linux/arm64 scratch AS arm64
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa
COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5
COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6
COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
FROM scratch AS rocm
COPY --from=rocm-6 dist/lib/ollama /lib/ollama
COPY --from=rocm-6 dist/lib/ollama/rocm /lib/ollama/rocm
FROM ${FLAVOR} AS archive
COPY --from=cpu dist/lib/ollama /lib/ollama

View File

@@ -40,10 +40,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
## Quickstart
To run and chat with [Gemma 3](https://ollama.com/library/gemma3):
To run and chat with [Llama 3.2](https://ollama.com/library/llama3.2):
```shell
ollama run gemma3
ollama run llama3.2
```
## Model library
@@ -405,11 +405,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
- [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
- [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)
- [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.)
- [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
- [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
### Cloud
@@ -453,8 +448,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
- [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
- [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples
- [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/))
### Apple Vision Pro
@@ -591,7 +584,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
- [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
- [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
- [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Editor tool to analyze scripts via Ollama)
### Supported backends

View File

@@ -24,10 +24,7 @@ import (
"net/http"
"net/url"
"runtime"
"strconv"
"time"
"github.com/ollama/ollama/auth"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/version"
@@ -79,14 +76,6 @@ func NewClient(base *url.URL, http *http.Client) *Client {
}
}
func getAuthorizationToken(ctx context.Context, challenge string) (string, error) {
token, err := auth.Sign(ctx, []byte(challenge))
if err != nil {
return "", err
}
return token, nil
}
func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error {
var reqBody io.Reader
var data []byte
@@ -108,21 +97,6 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
}
requestURL := c.base.JoinPath(path)
var token string
if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
now := strconv.FormatInt(time.Now().Unix(), 10)
chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
token, err = getAuthorizationToken(ctx, chal)
if err != nil {
return err
}
q := requestURL.Query()
q.Set("ts", now)
requestURL.RawQuery = q.Encode()
}
request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), reqBody)
if err != nil {
return err
@@ -132,10 +106,6 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
request.Header.Set("Accept", "application/json")
request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
if token != "" {
request.Header.Set("Authorization", token)
}
respObj, err := c.http.Do(request)
if err != nil {
return err
@@ -173,22 +143,6 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
}
requestURL := c.base.JoinPath(path)
var token string
if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
var err error
now := strconv.FormatInt(time.Now().Unix(), 10)
chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
token, err = getAuthorizationToken(ctx, chal)
if err != nil {
return err
}
q := requestURL.Query()
q.Set("ts", now)
requestURL.RawQuery = q.Encode()
}
request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), buf)
if err != nil {
return err
@@ -198,10 +152,6 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
request.Header.Set("Accept", "application/x-ndjson")
request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
if token != "" {
request.Header.Set("Authorization", token)
}
response, err := c.http.Do(request)
if err != nil {
return err

View File

@@ -83,12 +83,6 @@ type GenerateRequest struct {
// Options lists model-specific options. For example, temperature can be
// set through this field, if the model supports it.
Options map[string]any `json:"options"`
// Think controls whether thinking/reasoning models will think before
// responding. Needs to be a pointer so we can distinguish between false
// (request that thinking _not_ be used) and unset (use the old behavior
// before this option was introduced)
Think *bool `json:"think,omitempty"`
}
// ChatRequest describes a request sent by [Client.Chat].
@@ -114,10 +108,6 @@ type ChatRequest struct {
// Options lists model-specific options.
Options map[string]any `json:"options"`
// Think controls whether thinking/reasoning models will think before
// responding
Think *bool `json:"think,omitempty"`
}
type Tools []Tool
@@ -136,11 +126,8 @@ func (t Tool) String() string {
// role ("system", "user", or "assistant"), the content and an optional list
// of images.
type Message struct {
Role string `json:"role"`
Content string `json:"content"`
// Thinking contains the text that was inside thinking tags in the
// original model output when ChatRequest.Think is enabled.
Thinking string `json:"thinking,omitempty"`
Role string `json:"role"`
Content string `json:"content"`
Images []ImageData `json:"images,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
}
@@ -491,10 +478,6 @@ type GenerateResponse struct {
// Response is the textual response itself.
Response string `json:"response"`
// Thinking contains the text that was inside thinking tags in the
// original model output when ChatRequest.Think is enabled.
Thinking string `json:"thinking,omitempty"`
// Done specifies if the response is complete.
Done bool `json:"done"`

View File

@@ -372,50 +372,3 @@ func TestPropertyType_MarshalJSON(t *testing.T) {
})
}
}
func TestThinking_UnmarshalJSON(t *testing.T) {
trueVal := true
falseVal := false
tests := []struct {
name string
input string
expectedThinking *bool
expectedError bool
}{
{
name: "true",
input: `{ "think": true }`,
expectedThinking: &trueVal,
},
{
name: "false",
input: `{ "think": false }`,
expectedThinking: &falseVal,
},
{
name: "unset",
input: `{ }`,
expectedThinking: nil,
},
{
name: "invalid",
input: `{ "think": "true" }`,
expectedThinking: nil,
expectedError: true,
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
var req GenerateRequest
err := json.Unmarshal([]byte(test.input), &req)
if test.expectedError {
require.Error(t, err)
} else {
require.NoError(t, err)
assert.Equal(t, test.expectedThinking, req.Think)
}
})
}
}

View File

@@ -0,0 +1,178 @@
package benchmark
import (
"context"
"flag"
"fmt"
"testing"
"time"
"github.com/ollama/ollama/api"
)
// Command line flags
var modelFlag string
func init() {
flag.StringVar(&modelFlag, "m", "", "Name of the model to benchmark")
flag.Lookup("m").DefValue = "model"
}
// modelName returns the model name from flags, failing the test if not set
func modelName(b *testing.B) string {
if modelFlag == "" {
b.Fatal("Error: -m flag is required for benchmark tests")
}
return modelFlag
}
type TestCase struct {
name string
prompt string
maxTokens int
}
// runGenerateBenchmark contains the common generate and metrics logic
func runGenerateBenchmark(b *testing.B, ctx context.Context, client *api.Client, req *api.GenerateRequest) {
start := time.Now()
var ttft time.Duration
var metrics api.Metrics
err := client.Generate(ctx, req, func(resp api.GenerateResponse) error {
if ttft == 0 && resp.Response != "" {
ttft = time.Since(start)
}
if resp.Done {
metrics = resp.Metrics
}
return nil
})
// Report custom metrics as part of the benchmark results
b.ReportMetric(float64(ttft.Milliseconds()), "ttft_ms")
b.ReportMetric(float64(metrics.LoadDuration.Milliseconds()), "load_ms")
// Token throughput metrics
promptThroughput := float64(metrics.PromptEvalCount) / metrics.PromptEvalDuration.Seconds()
genThroughput := float64(metrics.EvalCount) / metrics.EvalDuration.Seconds()
b.ReportMetric(promptThroughput, "prompt_tok/s")
b.ReportMetric(genThroughput, "gen_tok/s")
// Token counts
b.ReportMetric(float64(metrics.PromptEvalCount), "prompt_tokens")
b.ReportMetric(float64(metrics.EvalCount), "gen_tokens")
if err != nil {
b.Fatal(err)
}
}
// BenchmarkColdStart runs benchmarks with model loading from cold state
func BenchmarkColdStart(b *testing.B) {
client := setup(b)
tests := []TestCase{
{"short_prompt", "Write a long story", 100},
{"medium_prompt", "Write a detailed economic analysis", 500},
{"long_prompt", "Write a comprehensive AI research paper", 1000},
}
m := modelName(b)
for _, tt := range tests {
b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) {
ctx := b.Context()
// Set number of tokens as our throughput metric
b.SetBytes(int64(tt.maxTokens))
for b.Loop() {
b.StopTimer()
// Ensure model is unloaded before each iteration
unload(client, m, b)
b.StartTimer()
req := &api.GenerateRequest{
Model: m,
Prompt: tt.prompt,
Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1},
}
runGenerateBenchmark(b, ctx, client, req)
}
})
}
}
// BenchmarkWarmStart runs benchmarks with pre-loaded model
func BenchmarkWarmStart(b *testing.B) {
client := setup(b)
tests := []TestCase{
{"short_prompt", "Write a long story", 100},
{"medium_prompt", "Write a detailed economic analysis", 500},
{"long_prompt", "Write a comprehensive AI research paper", 1000},
}
m := modelName(b)
for _, tt := range tests {
b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) {
ctx := b.Context()
// Pre-warm the model
warmup(client, m, tt.prompt, b)
// Set number of tokens as our throughput metric
b.SetBytes(int64(tt.maxTokens))
for b.Loop() {
req := &api.GenerateRequest{
Model: m,
Prompt: tt.prompt,
Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1},
}
runGenerateBenchmark(b, ctx, client, req)
}
})
}
}
// setup verifies server and model availability
func setup(b *testing.B) *api.Client {
client, err := api.ClientFromEnvironment()
if err != nil {
b.Fatal(err)
}
if _, err := client.Show(b.Context(), &api.ShowRequest{Model: modelName(b)}); err != nil {
b.Fatalf("Model unavailable: %v", err)
}
return client
}
// warmup ensures the model is loaded and warmed up
func warmup(client *api.Client, model string, prompt string, b *testing.B) {
for range 3 {
err := client.Generate(
context.Background(),
&api.GenerateRequest{
Model: model,
Prompt: prompt,
Options: map[string]any{"num_predict": 50, "temperature": 0.1},
},
func(api.GenerateResponse) error { return nil },
)
if err != nil {
b.Logf("Error during model warm-up: %v", err)
}
}
}
// unload forces model unloading using KeepAlive: 0 parameter
func unload(client *api.Client, model string, b *testing.B) {
req := &api.GenerateRequest{
Model: model,
KeepAlive: &api.Duration{Duration: 0},
}
if err := client.Generate(context.Background(), req, func(api.GenerateResponse) error { return nil }); err != nil {
b.Logf("Unload error: %v", err)
}
time.Sleep(1 * time.Second)
}
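
These benchmarks report time-to-first-token, load time, and token throughput through `b.ReportMetric`. A minimal sketch of the throughput derivation on its own, assuming the `api.Metrics` fields used above (the `throughput` helper name is illustrative and not part of this diff):

```go
package benchmark

import "time"

// throughput converts a token count and elapsed duration into tokens/second,
// the same derivation used for the prompt_tok/s and gen_tok/s metrics above;
// a zero or negative duration yields 0 rather than dividing by zero.
func throughput(tokens int, d time.Duration) float64 {
	if d <= 0 {
		return 0
	}
	return float64(tokens) / d.Seconds()
}
```

Running the benchmarks presumably looks like `go test -bench . -m <model>` from this package, since `modelName` fails the run when `-m` is unset.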

View File

@@ -39,7 +39,6 @@ import (
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/readline"
"github.com/ollama/ollama/runner"
"github.com/ollama/ollama/server"
"github.com/ollama/ollama/types/model"
@@ -47,23 +46,6 @@ import (
"github.com/ollama/ollama/version"
)
// ensureThinkingSupport emits a warning if the model does not advertise thinking support
func ensureThinkingSupport(ctx context.Context, client *api.Client, name string) {
if name == "" {
return
}
resp, err := client.Show(ctx, &api.ShowRequest{Model: name})
if err != nil {
return
}
for _, cap := range resp.Capabilities {
if cap == model.CapabilityThinking {
return
}
}
fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name)
}
var errModelfileNotFound = errors.New("specified Modelfile wasn't found")
func getModelfileName(cmd *cobra.Command) (string, error) {
@@ -283,9 +265,6 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
req := &api.GenerateRequest{
Model: opts.Model,
KeepAlive: opts.KeepAlive,
// pass Think here so we fail before getting to the chat prompt if the model doesn't support it
Think: opts.Think,
}
return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
@@ -320,22 +299,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
}
opts.Format = format
thinkFlag := cmd.Flags().Lookup("think")
if thinkFlag.Changed {
think, err := cmd.Flags().GetBool("think")
if err != nil {
return err
}
opts.Think = &think
} else {
opts.Think = nil
}
hidethinking, err := cmd.Flags().GetBool("hidethinking")
if err != nil {
return err
}
opts.HideThinking = hidethinking
keepAlive, err := cmd.Flags().GetString("keepalive")
if err != nil {
return err
@@ -399,11 +362,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
return err
}
opts.Think, err = inferThinkingOption(&info.Capabilities, &opts, thinkFlag.Changed)
if err != nil {
return err
}
opts.MultiModal = slices.Contains(info.Capabilities, model.CapabilityVision)
// TODO: remove the projector info and vision info checks below,
@@ -789,38 +747,11 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
case float64:
v = fmt.Sprintf("%g", vData)
case []any:
targetWidth := 10 // Small width where we are displaying the data in a column
var itemsToShow int
totalWidth := 1 // Start with 1 for opening bracket
// Find how many we can fit
for i := range vData {
itemStr := fmt.Sprintf("%v", vData[i])
width := runewidth.StringWidth(itemStr)
// Add separator width (", ") for all items except the first
if i > 0 {
width += 2
}
// Check if adding this item would exceed our width limit
if totalWidth+width > targetWidth && i > 0 {
break
}
totalWidth += width
itemsToShow++
}
// Format the output
if itemsToShow < len(vData) {
v = fmt.Sprintf("%v", vData[:itemsToShow])
v = strings.TrimSuffix(v, "]")
v += fmt.Sprintf(" ...+%d more]", len(vData)-itemsToShow)
} else {
v = fmt.Sprintf("%v", vData)
n := 3
if len(vData) < n {
n = len(vData)
}
v = fmt.Sprintf("%v", vData[:n])
default:
v = fmt.Sprintf("%T", vData)
}
@@ -841,19 +772,10 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
head := func(s string, n int) (rows [][]string) {
scanner := bufio.NewScanner(strings.NewReader(s))
count := 0
for scanner.Scan() {
text := strings.TrimSpace(scanner.Text())
if text == "" {
continue
for scanner.Scan() && (len(rows) < n || n < 0) {
if text := scanner.Text(); text != "" {
rows = append(rows, []string{"", strings.TrimSpace(text)})
}
count++
if n < 0 || count <= n {
rows = append(rows, []string{"", text})
}
}
if n >= 0 && count > n {
rows = append(rows, []string{"", "..."})
}
return
}
@@ -965,19 +887,17 @@ func PullHandler(cmd *cobra.Command, args []string) error {
type generateContextKey string
type runOptions struct {
Model string
ParentModel string
Prompt string
Messages []api.Message
WordWrap bool
Format string
System string
Images []api.ImageData
Options map[string]any
MultiModal bool
KeepAlive *api.Duration
Think *bool
HideThinking bool
Model string
ParentModel string
Prompt string
Messages []api.Message
WordWrap bool
Format string
System string
Images []api.ImageData
Options map[string]any
MultiModal bool
KeepAlive *api.Duration
}
type displayResponseState struct {
@@ -1033,26 +953,6 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState)
}
}
func thinkingOutputOpeningText(plainText bool) string {
text := "Thinking...\n"
if plainText {
return text
}
return readline.ColorGrey + readline.ColorBold + text + readline.ColorDefault + readline.ColorGrey
}
func thinkingOutputClosingText(plainText bool) string {
text := "...done thinking.\n\n"
if plainText {
return text
}
return readline.ColorGrey + readline.ColorBold + text + readline.ColorDefault
}
func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
client, err := api.ClientFromEnvironment()
if err != nil {
@@ -1080,34 +980,14 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
var latest api.ChatResponse
var fullResponse strings.Builder
var role string
var thinkTagOpened bool = false
var thinkTagClosed bool = false
fn := func(response api.ChatResponse) error {
if response.Message.Content != "" || !opts.HideThinking {
p.StopAndClear()
}
p.StopAndClear()
latest = response
role = response.Message.Role
if response.Message.Thinking != "" && !opts.HideThinking {
if !thinkTagOpened {
fmt.Print(thinkingOutputOpeningText(false))
thinkTagOpened = true
}
displayResponse(response.Message.Thinking, opts.WordWrap, state)
}
content := response.Message.Content
if thinkTagOpened && !thinkTagClosed && content != "" {
fmt.Print(thinkingOutputClosingText(false))
thinkTagClosed = true
}
// purposefully not putting thinking blocks in the response, which would
// only be needed if we later added tool calling to the cli (they get
// filtered out anyway since current models don't expect them unless you're
// about to finish some tool calls)
fullResponse.WriteString(content)
displayResponse(content, opts.WordWrap, state)
@@ -1124,7 +1004,6 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
Messages: opts.Messages,
Format: json.RawMessage(opts.Format),
Options: opts.Options,
Think: opts.Think,
}
if opts.KeepAlive != nil {
@@ -1186,32 +1065,13 @@ func generate(cmd *cobra.Command, opts runOptions) error {
}()
var state *displayResponseState = &displayResponseState{}
var thinkTagOpened bool = false
var thinkTagClosed bool = false
plainText := !term.IsTerminal(int(os.Stdout.Fd()))
fn := func(response api.GenerateResponse) error {
p.StopAndClear()
latest = response
content := response.Response
if response.Response != "" || !opts.HideThinking {
p.StopAndClear()
}
if response.Thinking != "" && !opts.HideThinking {
if !thinkTagOpened {
fmt.Print(thinkingOutputOpeningText(plainText))
thinkTagOpened = true
}
displayResponse(response.Thinking, opts.WordWrap, state)
}
if thinkTagOpened && !thinkTagClosed && content != "" {
fmt.Print(thinkingOutputClosingText(plainText))
thinkTagClosed = true
}
displayResponse(content, opts.WordWrap, state)
return nil
@@ -1237,7 +1097,6 @@ func generate(cmd *cobra.Command, opts runOptions) error {
System: opts.System,
Options: opts.Options,
KeepAlive: opts.KeepAlive,
Think: opts.Think,
}
if err := client.Generate(ctx, &request, fn); err != nil {
@@ -1341,11 +1200,11 @@ func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
return err
}
if err := client.Heartbeat(cmd.Context()); err != nil {
if !(strings.Contains(err.Error(), " refused") || strings.Contains(err.Error(), "could not connect")) {
if !strings.Contains(err.Error(), " refused") {
return err
}
if err := startApp(cmd.Context(), client); err != nil {
return fmt.Errorf("ollama server not responding - %w", err)
return errors.New("could not connect to ollama app, is it running?")
}
}
return nil
@@ -1423,7 +1282,7 @@ func NewCLI() *cobra.Command {
}
createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")
createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
showCmd := &cobra.Command{
Use: "show MODEL",
@@ -1453,8 +1312,6 @@ func NewCLI() *cobra.Command {
runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
runCmd.Flags().String("format", "", "Response format (e.g. json)")
runCmd.Flags().Bool("think", false, "Whether to use thinking mode for supported models")
runCmd.Flags().Bool("hidethinking", false, "Hide thinking output (if provided)")
stopCmd := &cobra.Command{
Use: "stop MODEL",
@@ -1506,6 +1363,7 @@ func NewCLI() *cobra.Command {
PreRunE: checkServerHeartbeat,
RunE: ListRunningHandler,
}
copyCmd := &cobra.Command{
Use: "cp SOURCE DESTINATION",
Short: "Copy a model",
@@ -1594,45 +1452,3 @@ func NewCLI() *cobra.Command {
return rootCmd
}
// If the user has explicitly set thinking options, either through the CLI or
// through the `/set think` or `set nothink` interactive options, then we
// respect them. Otherwise, we check model capabilities to see if the model
// supports thinking. If the model does support thinking, we enable it.
// Otherwise, we unset the thinking option (which is different than setting it
// to false).
//
// If capabilities are not provided, we fetch them from the server.
func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicitlySetByUser bool) (*bool, error) {
if explicitlySetByUser {
return runOpts.Think, nil
}
if caps == nil {
client, err := api.ClientFromEnvironment()
if err != nil {
return nil, err
}
ret, err := client.Show(context.Background(), &api.ShowRequest{
Model: runOpts.Model,
})
if err != nil {
return nil, err
}
caps = &ret.Capabilities
}
thinkingSupported := false
for _, cap := range *caps {
if cap == model.CapabilityThinking {
thinkingSupported = true
}
}
if thinkingSupported {
thinking := true
return &thinking, nil
}
return nil, nil
}

View File

@@ -225,7 +225,6 @@ Weigh anchor!
System
You are a pirate!
Ahoy, matey!
...
`
if diff := cmp.Diff(expect, b.String()); diff != "" {

View File

@@ -62,8 +62,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fmt.Fprintln(os.Stderr, " /set noformat Disable formatting")
fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats")
fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats")
fmt.Fprintln(os.Stderr, " /set think Enable thinking")
fmt.Fprintln(os.Stderr, " /set nothink Disable thinking")
fmt.Fprintln(os.Stderr, "")
}
@@ -130,7 +128,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
var sb strings.Builder
var multiline MultilineState
var thinkExplicitlySet bool = opts.Think != nil
for {
line, err := scanner.Readline()
@@ -198,19 +195,11 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.Model = args[1]
opts.Messages = []api.Message{}
fmt.Printf("Loading model '%s'\n", opts.Model)
opts.Think, err = inferThinkingOption(nil, &opts, thinkExplicitlySet)
if err != nil {
return err
}
if err := loadOrUnloadModel(cmd, &opts); err != nil {
if strings.Contains(err.Error(), "not found") {
fmt.Printf("error: %v\n", err)
continue
}
if strings.Contains(err.Error(), "does not support thinking") {
fmt.Printf("error: %v\n", err)
continue
}
return err
}
continue
@@ -271,22 +260,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
return err
}
fmt.Println("Set 'quiet' mode.")
case "think":
think := true
opts.Think = &think
thinkExplicitlySet = true
if client, err := api.ClientFromEnvironment(); err == nil {
ensureThinkingSupport(cmd.Context(), client, opts.Model)
}
fmt.Println("Set 'think' mode.")
case "nothink":
think := false
opts.Think = &think
thinkExplicitlySet = true
if client, err := api.ClientFromEnvironment(); err == nil {
ensureThinkingSupport(cmd.Context(), client, opts.Model)
}
fmt.Println("Set 'nothink' mode.")
case "format":
if len(args) < 3 || args[2] != "json" {
fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
@@ -475,11 +448,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
assistant, err := chat(cmd, opts)
if err != nil {
if strings.Contains(err.Error(), "does not support thinking") {
fmt.Printf("error: %v\n", err)
sb.Reset()
continue
}
return err
}
if assistant != nil {

View File

@@ -5,7 +5,7 @@ import (
"errors"
"os"
"os/exec"
"regexp"
"strings"
"github.com/ollama/ollama/api"
)
@@ -19,12 +19,11 @@ func startApp(ctx context.Context, client *api.Client) error {
if err != nil {
return err
}
r := regexp.MustCompile(`^.*/Ollama\s?\d*.app`)
m := r.FindStringSubmatch(link)
if len(m) != 1 {
if !strings.Contains(link, "Ollama.app") {
return errors.New("could not find ollama app")
}
if err := exec.Command("/usr/bin/open", "-j", "-a", m[0], "--args", "--fast-startup").Run(); err != nil {
path := strings.Split(link, "Ollama.app")
if err := exec.Command("/usr/bin/open", "-a", path[0]+"Ollama.app").Run(); err != nil {
return err
}
return waitForServer(ctx, client)

View File

@@ -4,27 +4,17 @@ import (
"context"
"errors"
"fmt"
"log/slog"
"os"
"os/exec"
"path"
"path/filepath"
"strings"
"syscall"
"unsafe"
"github.com/ollama/ollama/api"
"golang.org/x/sys/windows"
)
const (
Installer = "OllamaSetup.exe"
)
func startApp(ctx context.Context, client *api.Client) error {
if len(isProcRunning(Installer)) > 0 {
return fmt.Errorf("upgrade in progress...")
}
// log.Printf("XXX Attempting to find and start ollama app")
AppName := "ollama app.exe"
exe, err := os.Executable()
if err != nil {
@@ -45,11 +35,14 @@ func startApp(ctx context.Context, client *api.Client) error {
}
}
}
// log.Printf("XXX attempting to start app %s", appExe)
cmd_path := "c:\\Windows\\system32\\cmd.exe"
cmd := exec.Command(cmd_path, "/c", appExe, "--hide", "--fast-startup")
cmd := exec.Command(cmd_path, "/c", appExe)
// TODO - these hide flags aren't working - still pops up a command window for some reason
cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: 0x08000000, HideWindow: true}
// TODO this didn't help either...
cmd.Stdin = strings.NewReader("")
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
@@ -63,50 +56,3 @@ func startApp(ctx context.Context, client *api.Client) error {
}
return waitForServer(ctx, client)
}
func isProcRunning(procName string) []uint32 {
pids := make([]uint32, 2048)
var ret uint32
if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
slog.Debug("failed to check for running installers", "error", err)
return nil
}
if ret > uint32(len(pids)) {
pids = make([]uint32, ret+10)
if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
slog.Debug("failed to check for running installers", "error", err)
return nil
}
}
if ret < uint32(len(pids)) {
pids = pids[:ret]
}
var matches []uint32
for _, pid := range pids {
if pid == 0 {
continue
}
hProcess, err := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION|windows.PROCESS_VM_READ, false, pid)
if err != nil {
continue
}
defer windows.CloseHandle(hProcess)
var module windows.Handle
var cbNeeded uint32
cb := (uint32)(unsafe.Sizeof(module))
if err := windows.EnumProcessModules(hProcess, &module, cb, &cbNeeded); err != nil {
continue
}
var sz uint32 = 1024 * 8
moduleName := make([]uint16, sz)
cb = uint32(len(moduleName)) * (uint32)(unsafe.Sizeof(uint16(0)))
if err := windows.GetModuleBaseName(hProcess, module, &moduleName[0], cb); err != nil && err != syscall.ERROR_INSUFFICIENT_BUFFER {
continue
}
exeFile := path.Base(strings.ToLower(syscall.UTF16ToString(moduleName)))
if strings.EqualFold(exeFile, procName) {
matches = append(matches, pid)
}
}
return matches
}

View File

@@ -1,63 +0,0 @@
package cmd
import (
"encoding/json"
"io"
"net/http"
"net/http/httptest"
"os"
"strings"
"testing"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/types/model"
)
// Test that a warning is printed when thinking is requested but not supported.
func TestWarnMissingThinking(t *testing.T) {
cases := []struct {
capabilities []model.Capability
expectWarn bool
}{
{capabilities: []model.Capability{model.CapabilityThinking}, expectWarn: false},
{capabilities: []model.Capability{}, expectWarn: true},
}
for _, tc := range cases {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/api/show" || r.Method != http.MethodPost {
t.Fatalf("unexpected request to %s %s", r.URL.Path, r.Method)
}
var req api.ShowRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
t.Fatalf("decode request: %v", err)
}
resp := api.ShowResponse{Capabilities: tc.capabilities}
if err := json.NewEncoder(w).Encode(resp); err != nil {
t.Fatalf("encode response: %v", err)
}
}))
defer srv.Close()
t.Setenv("OLLAMA_HOST", srv.URL)
client, err := api.ClientFromEnvironment()
if err != nil {
t.Fatal(err)
}
oldStderr := os.Stderr
r, w, _ := os.Pipe()
os.Stderr = w
ensureThinkingSupport(t.Context(), client, "m")
w.Close()
os.Stderr = oldStderr
out, _ := io.ReadAll(r)
warned := strings.Contains(string(out), "warning:")
if tc.expectWarn && !warned {
t.Errorf("expected warning, got none")
}
if !tc.expectWarn && warned {
t.Errorf("did not expect warning, got: %s", string(out))
}
}
}

View File

@@ -53,11 +53,8 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
}
for _, sv := range t.SpecialVocabulary {
kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
if len(sv.IDs) > 0 {
kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs
}
kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
}
return kv

View File

@@ -139,8 +139,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
}
for _, t := range ts {
if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") ||
strings.HasSuffix(t.Name(), "attn_q_proj.weight") || strings.HasSuffix(t.Name(), "attn_k_proj.weight") {
if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
if !p.skipRepack {
t.SetRepacker(p.repack)
}
@@ -182,9 +181,9 @@ func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]floa
}
var heads uint32
if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_q_proj.weight") {
if strings.HasSuffix(name, "attn_q.weight") {
heads = p.NumAttentionHeads
} else if strings.HasSuffix(name, "attn_k.weight") || strings.HasSuffix(name, "attn_k_proj.weight") {
} else if strings.HasSuffix(name, "attn_k.weight") {
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
} else {
return nil, fmt.Errorf("unknown tensor for repack: %s", name)

View File

@@ -2,6 +2,9 @@ package convert
import (
"fmt"
"io"
"slices"
"strings"
"github.com/ollama/ollama/fs/ggml"
)
@@ -27,38 +30,65 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
}
func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
merges := make([]merge, 0, p.NumHiddenLayers*6)
for i := range p.NumHiddenLayers {
merges = append(merges, merge{
fmt.Sprintf("blk.%d.*.w1.weight", i),
fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
}, merge{
fmt.Sprintf("blk.%d.*.w1.bias", i),
fmt.Sprintf("blk.%d.ffn_gate_exps.bias", i),
}, merge{
fmt.Sprintf("blk.%d.*.w2.weight", i),
fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
}, merge{
fmt.Sprintf("blk.%d.*.w2.bias", i),
fmt.Sprintf("blk.%d.ffn_up_exps.bias", i),
}, merge{
fmt.Sprintf("blk.%d.*.w3.weight", i),
fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
}, merge{
fmt.Sprintf("blk.%d.*.w3.bias", i),
fmt.Sprintf("blk.%d.ffn_down_exps.bias", i),
oldnew := []string{
"model.layers", "blk",
"w1", "ffn_gate_exps",
"w2", "ffn_down_exps",
"w3", "ffn_up_exps",
}
for i := range p.NumLocalExperts {
oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".")
}
// group experts of the same layer (model.layers.%d) and type (w[123]) into a single tensor
namer := strings.NewReplacer(oldnew...)
experts := make(map[string]experts)
// merge experts into a single tensor while removing them from ts
ts = slices.DeleteFunc(ts, func(t Tensor) bool {
if !strings.Contains(t.Name(), ".block_sparse_moe.experts.") {
return false
}
name := namer.Replace(t.Name())
experts[name] = append(experts[name], t)
return true
})
var out []*ggml.Tensor
for n, e := range experts {
// TODO(mxyng): sanity check experts
out = append(out, &ggml.Tensor{
Name: n,
Kind: e[0].Kind(),
Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),
WriterTo: e,
})
}
out, ts := mergeTensors(ts, merges...)
return append(out, p.llamaModel.Tensors(ts)...)
}
func (p *mixtralModel) Replacements() []string {
return append(
p.llamaModel.Replacements(),
"model.layers", "blk",
"block_sparse_moe.gate", "ffn_gate_inp",
"block_sparse_moe.experts.", ".",
)
}
type experts []Tensor
func (e experts) WriteTo(w io.Writer) (int64, error) {
// TODO(mxyng): experts _should_ be numerically sorted by expert but this should check
for _, t := range e {
// the canonical merged experts tensor stacks all experts along a new, 0 axis,
// e.g. `tensor.Stack(0, e[0], e[1:]...)`, which requires allocating temporary buffers
// this accomplishes the same thing by writing each expert tensor in sequence
if _, err := t.WriteTo(w); err != nil {
return 0, err
}
}
return 0, nil
}
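
The experts merge above hinges on a name rewrite: every per-expert tensor of the same layer and projection collapses to one canonical name, and tensors that end up sharing a name are grouped and written sequentially as a single stacked tensor. A small sketch of that rewrite, with illustrative tensor names and only two experts for brevity (the converter appends one pattern per expert up to NumLocalExperts):

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	namer := strings.NewReplacer(
		"model.layers", "blk",
		"w1", "ffn_gate_exps",
		"w2", "ffn_down_exps",
		"w3", "ffn_up_exps",
		".block_sparse_moe.experts.0.", ".",
		".block_sparse_moe.experts.1.", ".",
	)
	// Both expert tensors map to the same key, so they are grouped together.
	fmt.Println(namer.Replace("model.layers.3.block_sparse_moe.experts.0.w1.weight")) // blk.3.ffn_gate_exps.weight
	fmt.Println(namer.Replace("model.layers.3.block_sparse_moe.experts.1.w1.weight")) // blk.3.ffn_gate_exps.weight
}
```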

View File

@@ -94,9 +94,7 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
var out []*ggml.Tensor
var text []Tensor
for _, t := range ts {
if !strings.HasPrefix(t.Name(), "v.") && !strings.HasPrefix(t.Name(), "mm.") {
text = append(text, t)
} else if t.Name() == "v.position_embd.gate" {
if t.Name() == "v.position_embd.gate" {
for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
tt := t.Clone()
tt.SetRepacker(m.repack(name))
@@ -107,21 +105,23 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
WriterTo: tt,
})
}
} else {
if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
t.SetRepacker(m.repack(t.Name()))
} else if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
t.SetRepacker(m.repack(t.Name()))
} else if strings.HasSuffix(t.Name(), "attn_gate") || strings.HasSuffix(t.Name(), "ffn_gate") {
t.SetRepacker(m.repack(t.Name()))
}
} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
t.SetRepacker(m.repack(t.Name()))
out = append(out, &ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
out = append(out, &ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
} else {
text = append(text, t)
}
}
@@ -137,35 +137,16 @@ func (m *mllamaModel) repack(name string) Repacker {
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_k.weight") {
heads := m.VisionModel.AttentionHeads
if err := t.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
return nil, err
}
t, err = tensor.Tanh(t)
if err != nil {
return nil, err
}
if err := t.T(0, 2, 1, 3); err != nil {
return nil, err
}
if err := t.Reshape(dims...); err != nil {
return nil, err
}
if err := t.Transpose(); err != nil {
return nil, err
}
} else {
t, err = tensor.Tanh(t)
if name == "v.position_embd.gate" {
t, err = tensor.Sub(float32(1), t)
if err != nil {
return nil, err
}
if name == "v.position_embd.gate" {
t, err = tensor.Sub(float32(1), t)
if err != nil {
return nil, err
}
}
}
t = tensor.Materialize(t)

View File

@@ -65,17 +65,17 @@ func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
for _, t := range ts {
if strings.Contains(t.Name(), "patch_embed.proj") {
for t := range splitDim(t, 2,
split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_0")},
split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_1")},
strings.NewReplacer("patch_embed.proj", "patch_embd_0"),
strings.NewReplacer("patch_embed.proj", "patch_embd_1"),
) {
t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
out = append(out, t)
}
} else if strings.Contains(t.Name(), "attn.qkv") {
out = append(out, slices.Collect(splitDim(t, 0,
split{Replacer: strings.NewReplacer("attn.qkv", "attn_q")},
split{Replacer: strings.NewReplacer("attn.qkv", "attn_k")},
split{Replacer: strings.NewReplacer("attn.qkv", "attn_v")},
strings.NewReplacer("attn.qkv", "attn_q"),
strings.NewReplacer("attn.qkv", "attn_k"),
strings.NewReplacer("attn.qkv", "attn_v"),
))...)
} else {
out = append(out, &ggml.Tensor{

View File

@@ -47,7 +47,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
}
t.Cleanup(func() { r.Close() })
m, err := ggml.Decode(r, -1)
m, _, err := ggml.Decode(r, -1)
if err != nil {
t.Fatal(err)
}
@@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
}
defer r.Close()
m, err := ggml.Decode(r, -1)
m, _, err := ggml.Decode(r, -1)
if err != nil {
t.Fatal(err)
}

View File

@@ -1,129 +1,56 @@
package convert
import (
"cmp"
"io"
"iter"
"path"
"slices"
"strings"
"github.com/ollama/ollama/fs/ggml"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/fs/ggml"
)
type split struct {
*strings.Replacer
dim int
// fn is an optional function to apply to the tensor after slicing
fn func(tensor.Tensor) (tensor.Tensor, error)
}
// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
// is split evenly based on the number of replacers provided unless a specific count is given.
func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
// is split evenly based on the number of replacers provided.
func splitDim(t Tensor, dim int, replacers ...*strings.Replacer) iter.Seq[*ggml.Tensor] {
return func(yield func(*ggml.Tensor) bool) {
var offset int
for _, split := range splits {
t := t.Clone()
for i, replacer := range replacers {
shape := slices.Clone(t.Shape())
shape[dim] = cmp.Or(uint64(split.dim), shape[dim]/uint64(len(splits)))
shape[dim] = shape[dim] / uint64(len(replacers))
slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
slice[dim] = tensor.S(offset, offset+int(shape[dim]))
offset += int(shape[dim])
slice[dim] = tensor.S(i*int(shape[dim]), (i+1)*int(shape[dim]))
t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
tt := t.Clone()
tt.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
dims := make([]int, len(shape))
for i := range shape {
dims[i] = int(shape[i])
}
var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
tt, err := tt.Slice(slice...)
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
t, err := t.Slice(slice...)
if err != nil {
return nil, err
}
tt = tensor.Materialize(tt)
if split.fn != nil {
tt, err = split.fn(tt)
if err != nil {
return nil, err
}
}
t = tensor.Materialize(t)
// flatten tensor so it can be written as a vector
if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
if err := t.Reshape(t.Shape().TotalSize()); err != nil {
return nil, err
}
return native.VectorF32(tt.(*tensor.Dense))
return native.VectorF32(t.(*tensor.Dense))
})
if !yield(&ggml.Tensor{
Name: split.Replace(t.Name()),
Name: replacer.Replace(t.Name()),
Kind: t.Kind(),
Shape: shape,
WriterTo: t,
WriterTo: tt,
}) {
break
}
}
}
}
type merge struct {
pattern, name string
}
// mergeTensors merges tensors that match a given pattern into a single tensor.
func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []Tensor) {
var matched []Tensor
for i := range merges {
matched, unmatched = slicesSplitFunc(unmatched, func(t Tensor) bool {
matched, _ := path.Match(merges[i].pattern, t.Name())
return matched
})
if len(matched) > 0 {
out = append(out, &ggml.Tensor{
Name: merges[i].name,
Kind: matched[0].Kind(),
Shape: append([]uint64{uint64(len(matched))}, matched[0].Shape()...),
WriterTo: mergeGroup(matched),
})
}
}
return out, unmatched
}
// slicesSplitFunc splits a slice into two slices based on a predicate function.
func slicesSplitFunc[S ~[]E, E comparable](s S, fn func(e E) bool) (matched, unmatched S) {
for _, e := range s {
if fn(e) {
matched = append(matched, e)
} else {
unmatched = append(unmatched, e)
}
}
return matched, unmatched
}
type mergeGroup []Tensor
func (g mergeGroup) WriteTo(w io.Writer) (int64, error) {
for _, t := range g {
if _, err := t.WriteTo(w); err != nil {
return 0, err
}
}
return 0, nil
}

View File

@@ -1,402 +0,0 @@
package convert
import (
"bytes"
"encoding/binary"
"io"
"iter"
"slices"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/fs/ggml"
"github.com/pdevine/tensor"
)
type fakeTensor struct {
name string
shape []uint64
data []float32
repacker Repacker
}
func (f fakeTensor) Name() string {
return f.name
}
func (f fakeTensor) Shape() []uint64 {
return f.shape
}
func (f fakeTensor) Kind() uint32 {
return 0
}
func (f *fakeTensor) SetRepacker(fn Repacker) {
f.repacker = fn
}
func (f fakeTensor) Clone() Tensor {
return &fakeTensor{
name: f.name,
shape: slices.Clone(f.shape),
data: slices.Clone(f.data),
repacker: f.repacker,
}
}
func (f fakeTensor) WriteTo(w io.Writer) (n int64, err error) {
data := f.data
if f.repacker != nil {
data, err = f.repacker(f.name, data, f.shape)
if err != nil {
return 0, err
}
}
if err := binary.Write(w, binary.LittleEndian, data); err != nil {
return 0, err
}
return int64(len(data) * 4), nil
}
func mul(shape []uint64) int {
n := 1
for _, dim := range shape {
n *= int(dim)
}
return n
}
func TestSplitDim(t *testing.T) {
r := fakeTensor{
name: "a.b",
shape: []uint64{3, 4},
data: []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
}
t.Run("no split", func(t *testing.T) {
for tt := range splitDim(&r, 0, split{Replacer: strings.NewReplacer("a", "x")}) {
if tt.Name != "x.b" {
t.Fatalf("expected name 'x', got '%s'", tt.Name)
}
if !slices.Equal(tt.Shape, []uint64{3, 4}) {
t.Fatalf("expected shape [3, 4], got %v", tt.Shape)
}
var b bytes.Buffer
if _, err := tt.WriteTo(&b); err != nil {
t.Fatal(err)
}
f32s := make([]float32, mul(tt.Shape))
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
t.Fatal(err)
}
if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}) {
t.Fatalf("expected data [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], got %v", f32s)
}
}
})
t.Run("even split", func(t *testing.T) {
next, stop := iter.Pull(splitDim(&r, 1,
split{Replacer: strings.NewReplacer("a", "x")},
split{Replacer: strings.NewReplacer("b", "y")},
))
defer stop()
{
tt, ok := next()
if !ok {
t.Fatal("expected at least one split")
}
if tt.Name != "x.b" {
t.Fatal("expected name 'x.b', got", tt.Name)
}
if !slices.Equal(tt.Shape, []uint64{3, 2}) {
t.Fatal("expected shape [3, 2], got", tt.Shape)
}
var b bytes.Buffer
if _, err := tt.WriteTo(&b); err != nil {
t.Fatal(err)
}
f32s := make([]float32, mul(tt.Shape))
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
t.Fatal(err)
}
if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) {
t.Fatal("expected data [0, 1, 4, 5, 8, 9], got", f32s)
}
}
{
tt, ok := next()
if !ok {
t.Fatal("expected at least one split")
}
if tt.Name != "a.y" {
t.Fatal("expected name 'a.y', got", tt.Name)
}
if !slices.Equal(tt.Shape, []uint64{3, 2}) {
t.Fatal("expected shape [3, 2], got", tt.Shape)
}
var b bytes.Buffer
if _, err := tt.WriteTo(&b); err != nil {
t.Fatal(err)
}
f32s := make([]float32, mul(tt.Shape))
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
t.Fatal(err)
}
if !slices.Equal(f32s, []float32{2, 3, 6, 7, 10, 11}) {
t.Fatal("expected data [2, 3, 6, 7, 10, 11], got", f32s)
}
}
})
t.Run("uneven split", func(t *testing.T) {
next, stop := iter.Pull(splitDim(&r, 0,
split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
))
defer stop()
{
tt, ok := next()
if !ok {
t.Fatal("expected at least one split")
}
if tt.Name != "x.b" {
t.Fatal("expected name 'x.b', got", tt.Name)
}
if !slices.Equal(tt.Shape, []uint64{2, 4}) {
t.Fatal("expected shape [2, 4], got", tt.Shape)
}
var b bytes.Buffer
if _, err := tt.WriteTo(&b); err != nil {
t.Fatal(err)
}
f32s := make([]float32, mul(tt.Shape))
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
t.Fatal(err)
}
if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7}) {
t.Fatal("expected data [0, 1, 2, 3, 4, 5, 6, 7], got", f32s)
}
}
{
tt, ok := next()
if !ok {
t.Fatal("expected at least one split")
}
if tt.Name != "a.y" {
t.Fatal("expected name 'a.y', got", tt.Name)
}
if !slices.Equal(tt.Shape, []uint64{1, 4}) {
t.Fatal("expected shape [1, 4], got", tt.Shape)
}
var b bytes.Buffer
if _, err := tt.WriteTo(&b); err != nil {
t.Fatal(err)
}
f32s := make([]float32, mul(tt.Shape))
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
t.Fatal(err)
}
if !slices.Equal(f32s, []float32{8, 9, 10, 11}) {
t.Fatal("expected data [8, 9, 10, 11], got", f32s)
}
}
})
t.Run("split with transpose", func(t *testing.T) {
next, stop := iter.Pull(splitDim(&r, 1,
split{Replacer: strings.NewReplacer("a", "x")},
split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
return tensor.Transpose(tt, 1, 0)
}},
))
defer stop()
{
tt, ok := next()
if !ok {
t.Fatal("expected at least one split")
}
if tt.Name != "x.b" {
t.Fatal("expected name 'x.b', got", tt.Name)
}
if !slices.Equal(tt.Shape, []uint64{3, 2}) {
t.Fatal("expected shape [3, 2], got", tt.Shape)
}
var b bytes.Buffer
if _, err := tt.WriteTo(&b); err != nil {
t.Fatal(err)
}
f32s := make([]float32, mul(tt.Shape))
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
t.Fatal(err)
}
if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) {
t.Fatal("expected data [0, 1, 4, 5, 8, 9], got", f32s)
}
}
{
tt, ok := next()
if !ok {
t.Fatal("expected at least one split")
}
if tt.Name != "a.y" {
t.Fatal("expected name 'a.y', got", tt.Name)
}
if !slices.Equal(tt.Shape, []uint64{3, 2}) {
t.Fatal("expected shape [3, 2], got", tt.Shape)
}
var b bytes.Buffer
if _, err := tt.WriteTo(&b); err != nil {
t.Fatal(err)
}
f32s := make([]float32, mul(tt.Shape))
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
t.Fatal(err)
}
if !slices.Equal(f32s, []float32{2, 6, 10, 3, 7, 11}) {
t.Fatal("expected data [2, 6, 10, 3, 7, 11], got", f32s)
}
}
})
}
func TestMerge(t *testing.T) {
unmatched := []Tensor{
&fakeTensor{
name: "a.0.b",
shape: []uint64{5, 2},
data: []float32{10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
},
&fakeTensor{
name: "a.1.b",
shape: []uint64{5, 2},
data: []float32{20, 21, 22, 23, 24, 25, 26, 27, 28, 29},
},
&fakeTensor{
name: "c.0.d",
shape: []uint64{5, 2},
data: []float32{30, 31, 32, 33, 34, 35, 36, 37, 38, 39},
},
&fakeTensor{
name: "c.1.d",
shape: []uint64{5, 2},
data: []float32{40, 41, 42, 43, 44, 45, 46, 47, 48, 49},
},
&fakeTensor{
name: "e.0.f",
shape: []uint64{5, 2},
data: []float32{50, 51, 52, 53, 54, 55, 56, 57, 58, 59},
},
}
checkMatched := func(t *testing.T, n int, matched []*ggml.Tensor) {
for i := range n {
got := matched[i]
if diff := cmp.Diff([]uint64{2, 5, 2}, got.Shape); diff != "" {
t.Errorf("unexpected (-want +got):\n%s", diff)
}
var b bytes.Buffer
if _, err := got.WriteTo(&b); err != nil {
t.Fatal(err)
}
f32s := make([]float32, 20)
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
t.Fatal(err)
}
offset := 10 + (i * 20)
want := make([]float32, 20)
for j := range 20 {
want[j] = float32(offset + j)
}
if diff := cmp.Diff(want, f32s); diff != "" {
t.Errorf("unexpected data (-want +got):\n%s", diff)
}
}
}
t.Run("single merge", func(t *testing.T) {
matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"})
if len(unmatched) != 3 {
t.Error("expected 3 remaining tensors, got", len(unmatched))
}
if len(matched) != 1 {
t.Error("expected 1 merged tensor, got", len(matched))
}
checkMatched(t, 1, matched)
})
t.Run("multiple merges", func(t *testing.T) {
matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"}, merge{"c.*.d", "c.d"})
if len(unmatched) != 1 {
t.Error("expected 1 remaining tensors, got", len(unmatched))
}
if len(matched) != 2 {
t.Error("expected 2 merged tensor, got", len(matched))
}
checkMatched(t, 2, matched)
})
t.Run("no match", func(t *testing.T) {
matched, unmatched := mergeTensors(unmatched, merge{"x.*.y", "x.y"})
if len(unmatched) != 5 {
t.Error("expected 5 remaining tensors, got", len(unmatched))
}
if len(matched) != 0 {
t.Error("expected no merged tensors, got", len(matched))
}
})
}

View File

@@ -110,7 +110,6 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
}
if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
// noop
} else if err != nil {
return nil, err
} else {
@@ -172,34 +171,6 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
}
}
if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) {
} else if err != nil {
return nil, err
} else {
defer f.Close()
var p map[string]json.RawMessage
if err := json.NewDecoder(f).Decode(&p); err != nil {
return nil, err
}
for _, st := range specialTokenTypes {
if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok {
var ids []int32
if err := json.Unmarshal(bts, &ids); err != nil {
// value is not a list so the existing ID is used
continue
}
if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool {
return sv.Type == st
}); i >= 0 {
t.SpecialVocabulary[i].IDs = ids
}
}
}
}
return t, nil
}
@@ -309,9 +280,6 @@ type SpecialVocabulary struct {
ID int
Content string
AddToken bool
// IDs is populated by generation_config.json
IDs []int32
}
func (sv SpecialVocabulary) Key() string {

View File

@@ -247,67 +247,6 @@ func TestParseTokenizer(t *testing.T) {
Pre: "default",
},
},
{
name: "generation config eos token ids",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"added_tokens": [
{
"id": 0,
"content": "<bos>",
"special": true
},
{
"id": 1,
"content": "<eos>",
"special": true
},
{
"id": 2,
"content": "<eot>",
"special": true
},
{
"id": 3,
"content": "<eom>",
"special": true
}
],
"model": {
"vocab": {
"<bos>": 0,
"<eos>": 1,
"<eot>": 2,
"<eom>": 3
}
}
}`),
"tokenizer_config.json": strings.NewReader(`{
"add_bos_token": true,
"add_eos_token": false,
"bos_token": "<bos>",
"eos_token": "<eos>"
}`),
"generation_config.json": strings.NewReader(`{
"bos_token_id": 0,
"eos_token_id": [1, 2, 3]
}`),
}),
specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
want: &Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
Tokens: []string{"<bos>", "<eos>", "<eot>", "<eom>"},
Scores: []float32{0, 1, 2, 3},
Types: []int32{3, 3, 3, 3},
},
SpecialVocabulary: []*SpecialVocabulary{
{Type: "eos", Content: "<eos>", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false},
{Type: "bos", Content: "<bos>", ID: 0, AddToken: true},
},
Pre: "default",
},
},
}
for _, tt := range cases {

View File

@@ -3,7 +3,6 @@
package discover
import (
"fmt"
"log/slog"
"os"
"regexp"
@@ -56,13 +55,10 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
}
}
}
return "sbsa"
}
// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
// The detected driver is older than Feb 2023
slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
return "v11"
}
return "v12"

View File

@@ -12,7 +12,7 @@ import (
// '../lib/ollama' on Linux and the executable's directory on macOS
// note: distribution builds, additional GPU-specific libraries are
// found in subdirectories of the returned path, such as
// 'cuda_v12', 'rocm', etc.
// 'cuda_v11', 'cuda_v12', 'rocm', etc.
var LibOllamaPath string = func() string {
exe, err := os.Executable()
if err != nil {

View File

@@ -43,7 +43,6 @@ Generate a response for a given prompt with a provided model. This is a streamin
- `prompt`: the prompt to generate a response for
- `suffix`: the text after the model response
- `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
- `think`: (for thinking models) should the model think before responding?
Advanced parameters (optional):
@@ -491,13 +490,11 @@ Generate the next message in a chat with a provided model. This is a streaming e
- `model`: (required) the [model name](#model-names)
- `messages`: the messages of the chat, this can be used to keep a chat memory
- `tools`: list of tools in JSON for the model to use if supported
- `think`: (for thinking models) should the model think before responding?
The `message` object has the following fields:
- `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
- `content`: the content of the message
- `thinking`: (for thinking models) the model's thinking process
- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
- `tool_calls` (optional): a list of tools in JSON that the model wants to use
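
To make the request shape concrete, here is a minimal hedged sketch that assembles a chat request body as plain JSON from the fields listed above; the model name and prompt are illustrative, and it deliberately avoids assuming any typed client structs:

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Field names follow the documentation above; "llama3.2" is illustrative.
	body := map[string]any{
		"model": "llama3.2",
		"think": true, // thinking models only
		"messages": []map[string]any{
			{"role": "user", "content": "Why is the sky blue?"},
		},
	}

	b, err := json.MarshalIndent(body, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b)) // POST this body to /api/chat
}
```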

59
docs/benchmark.md Normal file
View File

@@ -0,0 +1,59 @@
# Benchmark
Go benchmark tests that measure end-to-end performance of a running Ollama server. Run these tests to evaluate model inference performance on your hardware and measure the impact of code changes.
## When to use
Run these benchmarks when:
- Making changes to the model inference engine
- Modifying model loading/unloading logic
- Changing prompt processing or token generation code
- Implementing a new model architecture
- Testing performance across different hardware setups
## Prerequisites
- Ollama server running locally with `ollama serve` on `127.0.0.1:11434`
## Usage and Examples
>[!NOTE]
>All commands must be run from the root directory of the Ollama project.
Basic syntax:
```bash
go test -bench=. ./benchmark/... -m $MODEL_NAME
```
Required flags:
- `-bench=.`: Run all benchmarks
- `-m`: Model name to benchmark
Optional flags:
- `-count N`: Number of times to run the benchmark (useful for statistical analysis)
- `-timeout T`: Maximum time for the benchmark to run (e.g. "10m" for 10 minutes)
Common usage patterns:
Single benchmark run with a model specified:
```bash
go test -bench=. ./benchmark/... -m llama3.3
```
## Output metrics
The benchmark reports several key metrics:
- `gen_tok/s`: Generated tokens per second
- `prompt_tok/s`: Prompt processing tokens per second
- `ttft_ms`: Time to first token in milliseconds
- `load_ms`: Model load time in milliseconds
- `gen_tokens`: Total tokens generated
- `prompt_tokens`: Total prompt tokens processed
Each benchmark runs two scenarios:
- Cold start: Model is loaded from disk for each test
- Warm start: Model is pre-loaded in memory
Three prompt lengths are tested for each scenario:
- Short prompt (100 tokens)
- Medium prompt (500 tokens)
- Long prompt (1000 tokens)
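
The throughput and latency figures listed under "Output metrics" above are simple derived quantities. As a rough illustration (not the benchmark package's actual code, and using one common convention of measuring the generation rate from the first token onward), they relate to raw measurements like this:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Hypothetical raw measurements for a single generation.
	start := time.Now()
	firstToken := start.Add(180 * time.Millisecond) // when the first token arrived
	done := start.Add(2 * time.Second)              // when generation finished
	genTokens := 64                                 // tokens produced by the model

	ttftMS := firstToken.Sub(start).Milliseconds()
	genTokPerSec := float64(genTokens) / done.Sub(firstToken).Seconds()

	fmt.Printf("ttft_ms=%d gen_tok/s=%.1f\n", ttftMS, genTokPerSec) // ttft_ms=180 gen_tok/s=35.2
}
```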

View File

@@ -118,7 +118,7 @@ To run tests, use `go test`:
go test ./...
```
> NOTE: In rare cirumstances, you may need to change a package using the new
> NOTE: In rare cirumstances, you may nedd to change a package using the new
> "synctest" package in go1.24.
>
> If you do not have the "synctest" package enabled, you will not see build or

View File

@@ -1,6 +1,6 @@
# GPU
## Nvidia
Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.
Ollama supports Nvidia GPUs with compute capability 5.0+.
Check your compute compatibility to see if your card is supported:
[https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)

View File

@@ -132,12 +132,22 @@ success
### Supported Quantizations
- `q4_0`
- `q4_1`
- `q5_0`
- `q5_1`
- `q8_0`
#### K-means Quantizations
- `q3_K_S`
- `q3_K_M`
- `q3_K_L`
- `q4_K_S`
- `q4_K_M`
- `q5_K_S`
- `q5_K_M`
- `q6_K`
## Sharing your model on ollama.com

View File

@@ -112,8 +112,8 @@ sudo systemctl status ollama
> While AMD has contributed the `amdgpu` driver upstream to the official linux
> kernel source, the version is older and may not support all ROCm features. We
> recommend you install the latest driver from
> [AMD](https://www.amd.com/en/support/download/linux-drivers.html) for best support
> of your Radeon GPU.
> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
> GPU.
## Customizing

View File

@@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto
In the server log, you will see a message that looks something like this (varies from release to release):
```
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
```
**Experimental LLM Library Override**

View File

@@ -183,8 +183,6 @@ var (
NewEngine = Bool("OLLAMA_NEW_ENGINE")
// ContextLength sets the default context length
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
// Auth enables authentication between the Ollama client and server
UseAuth = Bool("OLLAMA_AUTH")
)
func String(s string) func() string {

View File

@@ -6,6 +6,7 @@ import (
"fmt"
"io"
"log/slog"
"math"
"slices"
"strings"
@@ -15,7 +16,6 @@ import (
type GGML struct {
container
model
Length int64
}
type model interface {
@@ -34,8 +34,7 @@ func (kv KV) Kind() string {
}
func (kv KV) ParameterCount() uint64 {
val, _ := keyValue(kv, "general.parameter_count", uint64(0))
return val
return keyValue(kv, "general.parameter_count", uint64(0))
}
func (kv KV) FileType() FileType {
@@ -54,27 +53,16 @@ func (kv KV) EmbeddingLength() uint64 {
return uint64(kv.Uint("embedding_length"))
}
func (kv KV) HeadCountMax() uint64 {
// TODO(drifkin): using the max value can cause an overestimation. In the
// future if array values become more popular, we can adapt the more invasive
// <https://github.com/ollama/ollama/pull/10225>
return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
func (kv KV) HeadCount() uint64 {
return uint64(kv.Uint("attention.head_count"))
}
func (kv KV) HeadCountMin() uint64 {
return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
func (kv KV) HeadCountKV() uint64 {
return uint64(kv.Uint("attention.head_count_kv", 1))
}
func (kv KV) HeadCountKVMax() uint64 {
return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1))
}
func (kv KV) HeadCountKVMin() uint64 {
return uint64(kv.UintOrMinArrayValue("attention.head_count_kv", 1))
}
func (kv KV) EmbeddingHeadCountMax() uint64 {
if heads := kv.HeadCountMin(); heads > 0 {
func (kv KV) EmbeddingHeadCount() uint64 {
if heads := kv.HeadCount(); heads > 0 {
return kv.EmbeddingLength() / heads
}
@@ -82,11 +70,15 @@ func (kv KV) EmbeddingHeadCountMax() uint64 {
}
func (kv KV) EmbeddingHeadCountK() uint64 {
return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
}
func (kv KV) EmbeddingHeadCountV() uint64 {
return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
}
func (kv KV) GQA() uint64 {
return kv.HeadCount() / kv.HeadCountKV()
}
func (kv KV) ContextLength() uint64 {
@@ -98,72 +90,35 @@ func (kv KV) ChatTemplate() string {
}
func (kv KV) String(key string, defaultValue ...string) string {
val, _ := keyValue(kv, key, append(defaultValue, "")...)
return val
return keyValue(kv, key, append(defaultValue, "")...)
}
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
val, _ := keyValue(kv, key, append(defaultValue, 0)...)
return val
return keyValue(kv, key, append(defaultValue, 0)...)
}
func (kv KV) Float(key string, defaultValue ...float32) float32 {
val, _ := keyValue(kv, key, append(defaultValue, 0)...)
return val
return keyValue(kv, key, append(defaultValue, 0)...)
}
func (kv KV) Bool(key string, defaultValue ...bool) bool {
val, _ := keyValue(kv, key, append(defaultValue, false)...)
return val
}
func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
_, max := kv.UintOrArrayValue(key, defaultValue)
return max
}
func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
min, _ := kv.UintOrArrayValue(key, defaultValue)
return min
}
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
if u32, ok := keyValue(kv, key, uint32(0)); ok {
return u32, u32
} else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok {
min := slices.Min(u32s.values)
max := slices.Max(u32s.values)
return min, max
} else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok {
min := slices.Min(i32s.values)
max := slices.Max(i32s.values)
if min < 0 || max < 0 {
slog.Warn("array values are unexpectedly negative", "key", key, "min", min, "max", max)
}
return uint32(min), uint32(max)
}
return defaultValue, defaultValue
return keyValue(kv, key, append(defaultValue, false)...)
}
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
return val.values
return keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]}).values
}
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
return val.values
return keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]}).values
}
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
return val.values
return keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]}).values
}
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
return val.values
return keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]}).values
}
func (kv KV) OllamaEngineRequired() bool {
@@ -188,17 +143,17 @@ type arrayValueTypes interface {
*array[string] | *array[float32] | *array[float64] | *array[bool]
}
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) T {
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
key = kv.Architecture() + "." + key
}
if val, ok := kv[key].(T); ok {
return val, true
if val, ok := kv[key]; ok {
return val.(T)
}
slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
return defaultValue[0], false
slog.Debug("key not found", "key", key, "default", defaultValue[0])
return defaultValue[0]
}
type Tensors struct {
@@ -432,12 +387,12 @@ func DetectContentType(b []byte) string {
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
var magic uint32
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
return nil, err
return nil, 0, err
}
var c container
@@ -447,34 +402,33 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
case FILE_MAGIC_GGUF_BE:
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
default:
return nil, errors.New("invalid file magic")
return nil, 0, errors.New("invalid file magic")
}
model, err := c.Decode(rs)
if err != nil {
return nil, err
return nil, 0, err
}
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return nil, err
return nil, 0, err
}
// final model type
return &GGML{
container: c,
model: model,
Length: offset,
}, nil
}, offset, nil
}
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
embedding := f.KV().EmbeddingLength()
heads := f.KV().HeadCountMax()
headsKV := f.KV().HeadCountKVMax()
heads := f.KV().HeadCount()
headsKV := f.KV().HeadCountKV()
vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
embeddingHeads := f.KV().EmbeddingHeadCountMax()
embeddingHeads := f.KV().EmbeddingHeadCount()
embeddingHeadsK := f.KV().EmbeddingHeadCountK()
embeddingHeadsV := f.KV().EmbeddingHeadCountV()
@@ -699,15 +653,24 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
numPatches*numPatches*headCount)
case "qwen25vl":
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
temporalPatchSize := uint64(2)
numPatches := maxPixels / (patchSize * patchSize)
// Calculate max possible patches based on max_pixels
maxHeight := uint64(math.Sqrt(float64(maxPixels)))
maxWidth := maxPixels / maxHeight
maxGridHeight := maxHeight / patchSize
maxGridWidth := maxWidth / patchSize
// Account for merged patches (2x2 grid)
numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
// Calculate graph size based on typical operations in ProcessImage and createPatches
graphSize = 4 * (maxPixels*numChannels + // Original image storage
// Normalized pixels
maxPixels*numChannels +
// Patches storage (numPatches * channels * patchSize^2)
numPatches*numChannels*patchSize*patchSize +
// Self-attention calculations
// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
// Self-attention calculations (similar to other architectures)
numPatches*numPatches*headCount +
// Additional buffer for processing
embeddingLength*numPatches)

View File

@@ -269,33 +269,3 @@ func TestKeyValue(t *testing.T) {
t.Errorf("unexpected uint8s (-got +want):\n%s", diff)
}
}
func TestHeadCount(t *testing.T) {
valuesArray := []int32{1, 5, 3, 4}
cases := []struct {
kv KV
want uint64
}{
{
kv: KV{
"general.architecture": "abc",
"abc.attention.head_count": &array[int32]{values: valuesArray, size: len(valuesArray)},
},
want: uint64(5),
},
{
kv: KV{
"general.architecture": "abc",
"abc.attention.head_count": uint32(3),
},
want: uint64(3),
},
}
for _, tt := range cases {
got := tt.kv.HeadCountMax()
if got != tt.want {
t.Errorf("unexpected max value: got=%d want=%d", got, tt.want)
}
}
}

View File

@@ -527,17 +527,23 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
return err
}
for _, key := range slices.Sorted(maps.Keys(kv)) {
keys := slices.Collect(maps.Keys(kv))
slices.Sort(keys)
for _, key := range keys {
if err := ggufWriteKV(f, key, kv[key]); err != nil {
return err
}
}
slices.SortStableFunc(ts, func(a, b *Tensor) int {
if i, j := a.block(), b.block(); i > 0 && j > 0 {
if i, j := a.block(), b.block(); i < 0 && j > 0 {
return 1
} else if i > 0 && j < 0 {
return -1
} else {
return cmp.Compare(i, j)
}
return cmp.Compare(a.Name, b.Name)
})
var s uint64

View File

@@ -2,82 +2,62 @@ package ggml
import (
"bytes"
"math/rand/v2"
"os"
"strings"
"slices"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestWriteGGUF(t *testing.T) {
r := rand.New(rand.NewPCG(0, 0))
for range 8 {
t.Run("shuffle", func(t *testing.T) {
t.Parallel()
w, err := os.CreateTemp(t.TempDir(), "*.bin")
if err != nil {
t.Fatal(err)
}
defer w.Close()
ts := []*Tensor{
{Name: "token_embd.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "blk.1.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "blk.2.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "blk.3.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "blk.4.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "blk.5.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
{Name: "output_norm.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))},
{Name: "output.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))},
}
if err := WriteGGUF(w, KV{
"general.alignment": uint32(16),
}, []*Tensor{
{Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.3", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.4", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.5", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
}); err != nil {
t.Fatal(err)
}
r.Shuffle(len(ts), func(i, j int) {
ts[i], ts[j] = ts[j], ts[i]
})
r, err := os.Open(w.Name())
if err != nil {
t.Fatal(err)
}
defer r.Close()
w, err := os.CreateTemp(t.TempDir(), strings.ReplaceAll(t.Name(), "/", "_")+"*.bin")
if err != nil {
t.Fatal(err)
}
defer w.Close()
ff, _, err := Decode(r, 0)
if err != nil {
t.Fatal(err)
}
if err := WriteGGUF(w, KV{
"general.alignment": uint32(16),
}, ts); err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(ff.KV(), KV{
"general.alignment": uint32(16),
"general.parameter_count": uint64(36),
}); diff != "" {
t.Errorf("Mismatch (-want +got):\n%s", diff)
}
r, err := os.Open(w.Name())
if err != nil {
t.Fatal(err)
}
defer r.Close()
ff, err := Decode(r, 0)
if err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(KV{
"general.alignment": uint32(16),
"general.parameter_count": uint64(54),
}, ff.KV()); diff != "" {
t.Errorf("Mismatch (-want +got):\n%s", diff)
}
if diff := cmp.Diff(Tensors{
Offset: 608,
items: []*Tensor{
{Name: "blk.0.attn_norm.weight", Offset: 0, Shape: []uint64{2, 3}},
{Name: "blk.1.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}},
{Name: "blk.2.attn_norm.weight", Offset: 64, Shape: []uint64{2, 3}},
{Name: "blk.3.attn_norm.weight", Offset: 96, Shape: []uint64{2, 3}},
{Name: "blk.4.attn_norm.weight", Offset: 128, Shape: []uint64{2, 3}},
{Name: "blk.5.attn_norm.weight", Offset: 160, Shape: []uint64{2, 3}},
{Name: "output.weight", Offset: 192, Shape: []uint64{3, 2}},
{Name: "output_norm.weight", Offset: 224, Shape: []uint64{3, 2}},
{Name: "token_embd.weight", Offset: 256, Shape: []uint64{2, 3}},
},
}, ff.Tensors(), cmp.AllowUnexported(Tensors{})); diff != "" {
t.Errorf("Mismatch (-want +got):\n%s", diff)
}
})
if diff := cmp.Diff(ff.Tensors(), Tensors{
Offset: 336,
items: []*Tensor{
{Name: "test.0", Offset: 0, Shape: []uint64{2, 3}},
{Name: "test.1", Offset: 32, Shape: []uint64{2, 3}},
{Name: "test.2", Offset: 64, Shape: []uint64{2, 3}},
{Name: "test.3", Offset: 96, Shape: []uint64{2, 3}},
{Name: "test.4", Offset: 128, Shape: []uint64{2, 3}},
{Name: "test.5", Offset: 160, Shape: []uint64{2, 3}},
},
}, cmp.AllowUnexported(Tensors{})); diff != "" {
t.Errorf("Mismatch (-want +got):\n%s", diff)
}
}

View File

@@ -1,347 +0,0 @@
package gguf
import (
"bytes"
"cmp"
"encoding/binary"
"errors"
"fmt"
"io"
"iter"
"os"
"slices"
"strings"
)
const (
typeUint8 uint32 = iota
typeInt8
typeUint16
typeInt16
typeUint32
typeInt32
typeFloat32
typeBool
typeString
typeArray
typeUint64
typeInt64
typeFloat64
)
var ErrUnsupported = errors.New("unsupported")
type File struct {
Magic [4]byte
Version uint32
keyValues *lazy[KeyValue]
tensors *lazy[TensorInfo]
offset int64
file *os.File
reader *bufferedReader
bts []byte
}
func Open(path string) (f *File, err error) {
f = &File{bts: make([]byte, 4096)}
f.file, err = os.Open(path)
if err != nil {
return nil, err
}
f.reader = newBufferedReader(f.file, 32<<10)
if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil {
return nil, err
}
if bytes.Equal(f.Magic[:], []byte("gguf")) {
return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic)
}
if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil {
return nil, err
}
if f.Version < 2 {
return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version)
}
f.tensors, err = newLazy(f, f.readTensor)
if err != nil {
return nil, err
}
f.tensors.successFunc = func() error {
offset := f.reader.offset
alignment := cmp.Or(f.KeyValue("general.alignment").Int(), 32)
f.offset = offset + (alignment-offset%alignment)%alignment
return nil
}
f.keyValues, err = newLazy(f, f.readKeyValue)
if err != nil {
return nil, err
}
return f, nil
}
func (f *File) readTensor() (TensorInfo, error) {
name, err := readString(f)
if err != nil {
return TensorInfo{}, err
}
dims, err := read[uint32](f)
if err != nil {
return TensorInfo{}, err
}
shape := make([]uint64, dims)
for i := range dims {
shape[i], err = read[uint64](f)
if err != nil {
return TensorInfo{}, err
}
}
type_, err := read[uint32](f)
if err != nil {
return TensorInfo{}, err
}
offset, err := read[uint64](f)
if err != nil {
return TensorInfo{}, err
}
return TensorInfo{
Name: name,
Offset: offset,
Shape: shape,
Type: TensorType(type_),
}, nil
}
func (f *File) readKeyValue() (KeyValue, error) {
key, err := readString(f)
if err != nil {
return KeyValue{}, err
}
t, err := read[uint32](f)
if err != nil {
return KeyValue{}, err
}
value, err := func() (any, error) {
switch t {
case typeUint8:
return read[uint8](f)
case typeInt8:
return read[int8](f)
case typeUint16:
return read[uint16](f)
case typeInt16:
return read[int16](f)
case typeUint32:
return read[uint32](f)
case typeInt32:
return read[int32](f)
case typeUint64:
return read[uint64](f)
case typeInt64:
return read[int64](f)
case typeFloat32:
return read[float32](f)
case typeFloat64:
return read[float64](f)
case typeBool:
return read[bool](f)
case typeString:
return readString(f)
case typeArray:
return readArray(f)
default:
return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
}
}()
if err != nil {
return KeyValue{}, err
}
return KeyValue{
Key: key,
Value: Value{value},
}, nil
}
func read[T any](f *File) (t T, err error) {
err = binary.Read(f.reader, binary.LittleEndian, &t)
return t, err
}
func readString(f *File) (string, error) {
n, err := read[uint64](f)
if err != nil {
return "", err
}
if int(n) > len(f.bts) {
f.bts = make([]byte, n)
}
bts := f.bts[:n]
if _, err := io.ReadFull(f.reader, bts); err != nil {
return "", err
}
defer clear(bts)
return string(bts), nil
}
func readArray(f *File) (any, error) {
t, err := read[uint32](f)
if err != nil {
return nil, err
}
n, err := read[uint64](f)
if err != nil {
return nil, err
}
switch t {
case typeUint8:
return readArrayData[uint8](f, n)
case typeInt8:
return readArrayData[int8](f, n)
case typeUint16:
return readArrayData[uint16](f, n)
case typeInt16:
return readArrayData[int16](f, n)
case typeUint32:
return readArrayData[uint32](f, n)
case typeInt32:
return readArrayData[int32](f, n)
case typeUint64:
return readArrayData[uint64](f, n)
case typeInt64:
return readArrayData[int64](f, n)
case typeFloat32:
return readArrayData[float32](f, n)
case typeFloat64:
return readArrayData[float64](f, n)
case typeBool:
return readArrayData[bool](f, n)
case typeString:
return readArrayString(f, n)
default:
return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
}
}
func readArrayData[T any](f *File, n uint64) (s []T, err error) {
s = make([]T, n)
for i := range n {
e, err := read[T](f)
if err != nil {
return nil, err
}
s[i] = e
}
return s, nil
}
func readArrayString(f *File, n uint64) (s []string, err error) {
s = make([]string, n)
for i := range n {
e, err := readString(f)
if err != nil {
return nil, err
}
s[i] = e
}
return s, nil
}
func (f *File) Close() error {
f.keyValues.stop()
f.tensors.stop()
return f.file.Close()
}
func (f *File) KeyValue(key string) KeyValue {
if !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "tokenizer.") {
key = f.KeyValue("general.architecture").String() + "." + key
}
if index := slices.IndexFunc(f.keyValues.values, func(kv KeyValue) bool {
return kv.Key == key
}); index >= 0 {
return f.keyValues.values[index]
}
for keyValue, ok := f.keyValues.next(); ok; keyValue, ok = f.keyValues.next() {
if keyValue.Key == key {
return keyValue
}
}
return KeyValue{}
}
func (f *File) NumKeyValues() int {
return int(f.keyValues.count)
}
func (f *File) KeyValues() iter.Seq2[int, KeyValue] {
return f.keyValues.All()
}
func (f *File) TensorInfo(name string) TensorInfo {
if index := slices.IndexFunc(f.tensors.values, func(t TensorInfo) bool {
return t.Name == name
}); index >= 0 {
return f.tensors.values[index]
}
// fast-forward through key values if we haven't already
_ = f.keyValues.rest()
for tensor, ok := f.tensors.next(); ok; tensor, ok = f.tensors.next() {
if tensor.Name == name {
return tensor
}
}
return TensorInfo{}
}
func (f *File) NumTensors() int {
return int(f.tensors.count)
}
func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] {
// fast forward through key values if we haven't already
f.keyValues.rest()
return f.tensors.All()
}
func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) {
t := f.TensorInfo(name)
if t.NumBytes() == 0 {
return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name)
}
// fast forward through tensor info if we haven't already
_ = f.tensors.rest()
return t, io.NewSectionReader(f.file, f.offset+int64(t.Offset), t.NumBytes()), nil
}
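
Condensing the reader above into its intended call pattern, a hedged sketch (the file path is illustrative, error handling is minimal, and the tensor name simply mirrors the test below):

```go
package main

import (
	"fmt"
	"io"

	"github.com/ollama/ollama/fs/gguf"
)

func main() {
	// "model.gguf" is an illustrative path.
	f, err := gguf.Open("model.gguf")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Key-values and tensor infos are decoded lazily, on first request.
	fmt.Println("architecture:", f.KeyValue("general.architecture").String())

	ti, r, err := f.TensorReader("token_embd.weight")
	if err != nil {
		panic(err)
	}
	n, err := io.Copy(io.Discard, r)
	if err != nil {
		panic(err)
	}
	fmt.Printf("read %d of %d bytes of %s\n", n, ti.NumBytes(), ti.Name)
}
```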

View File

@@ -1,249 +0,0 @@
package gguf_test
import (
"bytes"
"os"
"strconv"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/fs/gguf"
)
func createBinFile(tb testing.TB) string {
tb.Helper()
f, err := os.CreateTemp(tb.TempDir(), "")
if err != nil {
tb.Fatal(err)
}
defer f.Close()
kv := ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(8),
"llama.embedding_length": uint32(3),
"llama.attention.head_count": uint32(2),
"llama.attention.head_count_kv": uint32(2),
"llama.attention.key_length": uint32(3),
"llama.rope.dimension_count": uint32(4),
"llama.rope.freq_base": float32(10000.0),
"llama.rope.freq_scale": float32(1.0),
"llama.attention.layer_norm_rms_epsilon": float32(1e-6),
"tokenizer.ggml.eos_token_id": uint32(0),
"tokenizer.ggml.eos_token_ids": []int32{1, 2, 3},
"tokenizer.ggml.tokens": []string{"hello", "world"},
"tokenizer.ggml.scores": []float32{0, 1},
}
tensors := []*ggml.Tensor{
{
Name: "token_embd.weight",
Kind: 0,
Shape: []uint64{2, 3},
WriterTo: bytes.NewBuffer(make([]byte, 4*2*3)),
},
{
Name: "output.weight",
Kind: 0,
Shape: []uint64{3, 2},
WriterTo: bytes.NewBuffer(make([]byte, 4*3*2)),
},
}
for i := range 8 {
tensors = append(tensors, &ggml.Tensor{
Name: "blk." + strconv.Itoa(i) + ".attn_q.weight",
Kind: 0,
Shape: []uint64{3, 3},
WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
}, &ggml.Tensor{
Name: "blk." + strconv.Itoa(i) + ".attn_k.weight",
Kind: 0,
Shape: []uint64{3, 3},
WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
}, &ggml.Tensor{
Name: "blk." + strconv.Itoa(i) + ".attn_v.weight",
Kind: 0,
Shape: []uint64{3, 3},
WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
}, &ggml.Tensor{
Name: "blk." + strconv.Itoa(i) + ".attn_output.weight",
Kind: 0,
Shape: []uint64{3, 3},
WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
})
}
if err := ggml.WriteGGUF(f, kv, tensors); err != nil {
tb.Fatal(err)
}
return f.Name()
}
func TestRead(t *testing.T) {
f, err := gguf.Open(createBinFile(t))
if err != nil {
t.Fatal(err)
}
defer f.Close()
if got := f.KeyValue("does.not.exist").Valid(); got {
t.Errorf(`KeyValue("does.not.exist").Exists() = %v, want false`, got)
}
if got := f.KeyValue("general.architecture").String(); got != "llama" {
t.Errorf(`KeyValue("general.architecture").String() = %q, want %q`, got, "llama")
}
if got := f.TensorInfo("token_embd.weight"); got.Name != "token_embd.weight" {
t.Errorf(`TensorInfo("token_embd.weight").Name = %q, want %q`, got.Name, "token_embd.weight")
} else if diff := cmp.Diff(got.Shape, []uint64{2, 3}); diff != "" {
t.Errorf(`TensorInfo("token_embd.weight").Shape mismatch (-got +want):\n%s`, diff)
} else if got.Type != gguf.TensorTypeF32 {
t.Errorf(`TensorInfo("token_embd.weight").Type = %d, want %d`, got.Type, gguf.TensorTypeF32)
}
if got := f.KeyValue("block_count").Uint(); got != 8 {
t.Errorf(`KeyValue("block_count").Uint() = %d, want %d`, got, 8)
}
if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.tokens").Strings(), []string{"hello", "world"}); diff != "" {
t.Errorf("KeyValue(\"tokenizer.ggml.tokens\").Strings() mismatch (-got +want):\n%s", diff)
}
if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.scores").Floats(), []float64{0, 1}); diff != "" {
t.Errorf("KeyValue(\"tokenizer.ggml.scores\").Ints() mismatch (-got +want):\n%s", diff)
}
var kvs []string
for _, kv := range f.KeyValues() {
if !kv.Valid() {
t.Error("found invalid key-value pair:", kv)
}
kvs = append(kvs, kv.Key)
}
if len(kvs) != f.NumKeyValues() {
t.Errorf("iterated key count = %d, want %d", len(kvs), f.NumKeyValues())
}
if diff := cmp.Diff(kvs, []string{
"general.architecture",
"llama.block_count",
"llama.embedding_length",
"llama.attention.head_count",
"llama.attention.head_count_kv",
"llama.attention.key_length",
"llama.rope.dimension_count",
"llama.rope.freq_base",
"llama.rope.freq_scale",
"llama.attention.layer_norm_rms_epsilon",
"tokenizer.ggml.eos_token_id",
"tokenizer.ggml.eos_token_ids",
"tokenizer.ggml.tokens",
"tokenizer.ggml.scores",
}, cmpopts.SortSlices(strings.Compare)); diff != "" {
t.Errorf("KeyValues() mismatch (-got +want):\n%s", diff)
}
var tis []string
for _, ti := range f.TensorInfos() {
if !ti.Valid() {
t.Error("found invalid tensor info:", ti)
}
tis = append(tis, ti.Name)
}
if len(tis) != f.NumTensors() {
t.Errorf("iterated tensor count = %d, want %d", len(tis), f.NumTensors())
}
if diff := cmp.Diff(tis, []string{
"token_embd.weight",
"output.weight",
"blk.0.attn_q.weight",
"blk.0.attn_k.weight",
"blk.0.attn_v.weight",
"blk.0.attn_output.weight",
"blk.1.attn_q.weight",
"blk.1.attn_k.weight",
"blk.1.attn_v.weight",
"blk.1.attn_output.weight",
"blk.2.attn_q.weight",
"blk.2.attn_k.weight",
"blk.2.attn_v.weight",
"blk.2.attn_output.weight",
"blk.3.attn_q.weight",
"blk.3.attn_k.weight",
"blk.3.attn_v.weight",
"blk.3.attn_output.weight",
"blk.4.attn_q.weight",
"blk.4.attn_k.weight",
"blk.4.attn_v.weight",
"blk.4.attn_output.weight",
"blk.5.attn_q.weight",
"blk.5.attn_k.weight",
"blk.5.attn_v.weight",
"blk.5.attn_output.weight",
"blk.6.attn_q.weight",
"blk.6.attn_k.weight",
"blk.6.attn_v.weight",
"blk.6.attn_output.weight",
"blk.7.attn_q.weight",
"blk.7.attn_k.weight",
"blk.7.attn_v.weight",
"blk.7.attn_output.weight",
}, cmpopts.SortSlices(strings.Compare)); diff != "" {
t.Errorf("TensorInfos() mismatch (-got +want):\n%s", diff)
}
ti, r, err := f.TensorReader("output.weight")
if err != nil {
t.Fatalf(`TensorReader("output.weight") error: %v`, err)
}
if ti.Name != "output.weight" {
t.Errorf(`TensorReader("output.weight").Name = %q, want %q`, ti.Name, "output.weight")
} else if diff := cmp.Diff(ti.Shape, []uint64{3, 2}); diff != "" {
t.Errorf(`TensorReader("output.weight").Shape mismatch (-got +want):\n%s`, diff)
} else if ti.Type != gguf.TensorTypeF32 {
t.Errorf(`TensorReader("output.weight").Type = %d, want %d`, ti.Type, gguf.TensorTypeF32)
}
var b bytes.Buffer
if _, err := b.ReadFrom(r); err != nil {
t.Fatalf(`ReadFrom TensorReader("output.weight") error: %v`, err)
}
if b.Len() != int(ti.NumBytes()) {
t.Errorf(`ReadFrom TensorReader("output.weight") length = %d, want %d`, b.Len(), ti.NumBytes())
}
}
func BenchmarkRead(b *testing.B) {
b.ReportAllocs()
p := createBinFile(b)
for b.Loop() {
f, err := gguf.Open(p)
if err != nil {
b.Fatal(err)
}
if got := f.KeyValue("general.architecture").String(); got != "llama" {
b.Errorf("got = %q, want %q", got, "llama")
}
// Iterate through some tensors
for range f.TensorInfos() {
}
f.Close()
}
}

View File

@@ -1,90 +0,0 @@
package gguf
import (
"reflect"
"slices"
)
type KeyValue struct {
Key string
Value
}
func (kv KeyValue) Valid() bool {
return kv.Key != "" && kv.Value.value != nil
}
type Value struct {
value any
}
func value[T any](v Value, kinds ...reflect.Kind) (t T) {
vv := reflect.ValueOf(v.value)
if slices.Contains(kinds, vv.Kind()) {
t = vv.Convert(reflect.TypeOf(t)).Interface().(T)
}
return
}
func values[T any](v Value, kinds ...reflect.Kind) (ts []T) {
switch vv := reflect.ValueOf(v.value); vv.Kind() {
case reflect.Slice:
if slices.Contains(kinds, vv.Type().Elem().Kind()) {
ts = make([]T, vv.Len())
for i := range vv.Len() {
ts[i] = vv.Index(i).Convert(reflect.TypeOf(ts[i])).Interface().(T)
}
}
}
return
}
// Int returns Value as a signed integer. If it is not a signed integer, it returns 0.
func (v Value) Int() int64 {
return value[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
}
// Ints returns Value as a signed integer slice. If it is not a signed integer slice, it returns nil.
func (v Value) Ints() (i64s []int64) {
return values[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
}
// Uint converts an unsigned integer value to uint64. If the value is not a unsigned integer, it returns 0.
func (v Value) Uint() uint64 {
return value[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
}
// Uints returns Value as a unsigned integer slice. If it is not a unsigned integer slice, it returns nil.
func (v Value) Uints() (u64s []uint64) {
return values[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
}
// Float returns Value as a float. If it is not a float, it returns 0.
func (v Value) Float() float64 {
return value[float64](v, reflect.Float32, reflect.Float64)
}
// Floats returns Value as a float slice. If it is not a float slice, it returns nil.
func (v Value) Floats() (f64s []float64) {
return values[float64](v, reflect.Float32, reflect.Float64)
}
// Bool returns Value as a boolean. If it is not a boolean, it returns false.
func (v Value) Bool() bool {
return value[bool](v, reflect.Bool)
}
// Bools returns Value as a boolean slice. If it is not a boolean slice, it returns nil.
func (v Value) Bools() (bools []bool) {
return values[bool](v, reflect.Bool)
}
// String returns Value as a string. If it is not a string, it returns an empty string.
func (v Value) String() string {
return value[string](v, reflect.String)
}
// Strings returns Value as a string slice. If it is not a string slice, it returns nil.
func (v Value) Strings() (strings []string) {
return values[string](v, reflect.String)
}

View File

@@ -1,208 +0,0 @@
package gguf
import (
"testing"
"github.com/google/go-cmp/cmp"
)
func split(name string, values map[string][]any) (matched []any, unmatched []any) {
for key, value := range values {
if key == name {
matched = value
} else {
unmatched = append(unmatched, value...)
}
}
return
}
func TestValue(t *testing.T) {
values := map[string][]any{
"int64": {int(42), int8(42), int16(42), int32(42), int64(42)},
"uint64": {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)},
"float64": {float32(42), float64(42)},
"string": {"42", "hello"},
"bool": {true, false},
}
t.Run("int64", func(t *testing.T) {
matched, unmatched := split("int64", values)
for _, v := range matched {
kv := KeyValue{"key", Value{v}}
if i64 := kv.Int(); i64 != 42 {
t.Errorf("expected 42, got %d", i64)
}
}
for _, v := range unmatched {
kv := KeyValue{"key", Value{v}}
if i64 := kv.Int(); i64 != 0 {
t.Errorf("expected 42, got %d", i64)
}
}
})
t.Run("uint64", func(t *testing.T) {
matched, unmatched := split("uint64", values)
for _, v := range matched {
kv := KeyValue{"key", Value{v}}
if u64 := kv.Uint(); u64 != 42 {
t.Errorf("expected 42, got %d", u64)
}
}
for _, v := range unmatched {
kv := KeyValue{"key", Value{v}}
if u64 := kv.Uint(); u64 != 0 {
t.Errorf("expected 42, got %d", u64)
}
}
})
t.Run("float64", func(t *testing.T) {
matched, unmatched := split("float64", values)
for _, v := range matched {
kv := KeyValue{"key", Value{v}}
if f64 := kv.Float(); f64 != 42 {
t.Errorf("expected 42, got %f", f64)
}
}
for _, v := range unmatched {
kv := KeyValue{"key", Value{v}}
if f64 := kv.Float(); f64 != 0 {
t.Errorf("expected 42, got %f", f64)
}
}
})
t.Run("string", func(t *testing.T) {
matched, unmatched := split("string", values)
for _, v := range matched {
kv := KeyValue{"key", Value{v}}
if s := kv.String(); s != v {
t.Errorf("expected 42, got %s", s)
}
}
for _, v := range unmatched {
kv := KeyValue{"key", Value{v}}
if s := kv.String(); s != "" {
t.Errorf("expected 42, got %s", s)
}
}
})
t.Run("bool", func(t *testing.T) {
matched, unmatched := split("bool", values)
for _, v := range matched {
kv := KeyValue{"key", Value{v}}
if b := kv.Bool(); b != v {
t.Errorf("expected true, got %v", b)
}
}
for _, v := range unmatched {
kv := KeyValue{"key", Value{v}}
if b := kv.Bool(); b != false {
t.Errorf("expected false, got %v", b)
}
}
})
}
func TestValues(t *testing.T) {
values := map[string][]any{
"int64s": {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}},
"uint64s": {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}},
"float64s": {[]float32{42}, []float64{42}},
"strings": {[]string{"42"}, []string{"hello"}},
"bools": {[]bool{true}, []bool{false}},
}
t.Run("int64s", func(t *testing.T) {
matched, unmatched := split("int64s", values)
for _, v := range matched {
kv := KeyValue{"key", Value{v}}
if diff := cmp.Diff(kv.Ints(), []int64{42}); diff != "" {
t.Errorf("diff: %s", diff)
}
}
for _, v := range unmatched {
kv := KeyValue{"key", Value{v}}
if i64s := kv.Ints(); i64s != nil {
t.Errorf("expected nil, got %v", i64s)
}
}
})
t.Run("uint64s", func(t *testing.T) {
matched, unmatched := split("uint64s", values)
for _, v := range matched {
kv := KeyValue{"key", Value{v}}
if diff := cmp.Diff(kv.Uints(), []uint64{42}); diff != "" {
t.Errorf("diff: %s", diff)
}
}
for _, v := range unmatched {
kv := KeyValue{"key", Value{v}}
if u64s := kv.Uints(); u64s != nil {
t.Errorf("expected nil, got %v", u64s)
}
}
})
t.Run("float64s", func(t *testing.T) {
matched, unmatched := split("float64s", values)
for _, v := range matched {
kv := KeyValue{"key", Value{v}}
if diff := cmp.Diff(kv.Floats(), []float64{42}); diff != "" {
t.Errorf("diff: %s", diff)
}
}
for _, v := range unmatched {
kv := KeyValue{"key", Value{v}}
if f64s := kv.Floats(); f64s != nil {
t.Errorf("expected nil, got %v", f64s)
}
}
})
t.Run("strings", func(t *testing.T) {
matched, unmatched := split("strings", values)
for _, v := range matched {
kv := KeyValue{"key", Value{v}}
if diff := cmp.Diff(kv.Strings(), v); diff != "" {
t.Errorf("diff: %s", diff)
}
}
for _, v := range unmatched {
kv := KeyValue{"key", Value{v}}
if s := kv.Strings(); s != nil {
t.Errorf("expected nil, got %v", s)
}
}
})
t.Run("bools", func(t *testing.T) {
matched, unmatched := split("bools", values)
for _, v := range matched {
kv := KeyValue{"key", Value{v}}
if diff := cmp.Diff(kv.Bools(), v); diff != "" {
t.Errorf("diff: %s", diff)
}
}
for _, v := range unmatched {
kv := KeyValue{"key", Value{v}}
if b := kv.Bools(); b != nil {
t.Errorf("expected nil, got %v", b)
}
}
})
}

View File

@@ -1,89 +0,0 @@
package gguf
import (
"encoding/binary"
"iter"
"log/slog"
)
type lazy[T any] struct {
count uint64
next func() (T, bool)
stop func()
values []T
// successFunc is called when all values have been successfully read.
successFunc func() error
}
func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) {
it := lazy[T]{}
if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil {
return nil, err
}
it.values = make([]T, 0)
it.next, it.stop = iter.Pull(func(yield func(T) bool) {
for i := range it.count {
t, err := fn()
if err != nil {
slog.Error("error reading tensor", "index", i, "error", err)
return
}
it.values = append(it.values, t)
if !yield(t) {
break
}
}
if it.successFunc != nil {
it.successFunc()
}
})
return &it, nil
}
func (g *lazy[T]) Values() iter.Seq[T] {
return func(yield func(T) bool) {
for _, v := range g.All() {
if !yield(v) {
break
}
}
}
}
func (g *lazy[T]) All() iter.Seq2[int, T] {
return func(yield func(int, T) bool) {
for i := range int(g.count) {
if i < len(g.values) {
if !yield(i, g.values[i]) {
break
}
} else {
t, ok := g.next()
if !ok {
break
}
if !yield(i, t) {
break
}
}
}
}
}
func (g *lazy[T]) rest() (collected bool) {
for {
_, ok := g.next()
collected = collected || ok
if !ok {
break
}
}
return collected
}

View File

@@ -1,23 +0,0 @@
package gguf
import (
"bufio"
"io"
)
type bufferedReader struct {
offset int64
*bufio.Reader
}
func newBufferedReader(rs io.ReadSeeker, size int) *bufferedReader {
return &bufferedReader{
Reader: bufio.NewReaderSize(rs, size),
}
}
func (rs *bufferedReader) Read(p []byte) (n int, err error) {
n, err = rs.Reader.Read(p)
rs.offset += int64(n)
return n, err
}

View File

@@ -1,288 +0,0 @@
package gguf
import (
"log/slog"
"strings"
)
type TensorInfo struct {
Name string
Offset uint64
Shape []uint64
Type TensorType
}
func (ti TensorInfo) Valid() bool {
return ti.Name != "" && ti.NumBytes() > 0
}
func (ti TensorInfo) NumValues() int64 {
var numItems int64 = 1
for _, dim := range ti.Shape {
numItems *= int64(dim)
}
return numItems
}
// NumBytes returns the number of bytes in the tensor.
func (ti TensorInfo) NumBytes() int64 {
return int64(float64(ti.NumValues()) * ti.Type.NumBytes())
}
func (ti TensorInfo) LogValue() slog.Value {
return slog.GroupValue(
slog.String("name", ti.Name),
slog.Int64("offset", int64(ti.Offset)),
slog.Any("shape", ti.Shape),
slog.Int64("num_values", ti.NumValues()),
slog.Int64("num_bytes", ti.NumBytes()),
slog.Any("type", ti.Type),
)
}
type TensorType uint32
const (
TensorTypeF32 TensorType = iota
TensorTypeF16
TensorTypeQ4_0
TensorTypeQ4_1
// unexported // unused in gguf
tensorTypeQ4_2
tensorTypeQ4_3
TensorTypeQ5_0
TensorTypeQ5_1
TensorTypeQ8_0
TensorTypeQ8_1
TensorTypeQ2_K
TensorTypeQ3_K
TensorTypeQ4_K
TensorTypeQ5_K
TensorTypeQ6_K
TensorTypeQ8_K
// unexported // unquantizable by ollama
tensorTypeIQ2_XXS
tensorTypeIQ2_XS
tensorTypeIQ3_XXS
tensorTypeIQ1_S
tensorTypeIQ4_NL
tensorTypeIQ3_S
tensorTypeIQ2_S
tensorTypeIQ4_XS
TensorTypeI8
TensorTypeI16
TensorTypeI32
TensorTypeI64
TensorTypeF64
// unexported // unquantizable by ollama
tensorTypeIQ1_M
TensorTypeBF16
// unexported // unused in gguf
tensorTypeQ4_0_4_4
tensorTypeQ4_0_4_8
tensorTypeQ4_0_8_8
// unexported // unquantizable by ollama
tensorTypeTQ1_0
tensorTypeTQ2_0
// unexported // unused in gguf
tensorTypeIQ4_NL_4_4
tensorTypeIQ4_NL_4_8
tensorTypeIQ4_NL_8_8
)
func (tt TensorType) NumBytes() float64 {
return float64(tt.typeSize()) / float64(tt.blockSize())
}
func (tt TensorType) typeSize() int64 {
switch tt {
case TensorTypeF32:
return 4
case TensorTypeF16:
return 2
case TensorTypeQ4_0:
return 2 + tt.blockSize()/2
case TensorTypeQ4_1:
return 2 + 2 + tt.blockSize()/2
case TensorTypeQ5_0:
return 2 + 4 + tt.blockSize()/2
case TensorTypeQ5_1:
return 2 + 2 + 4 + tt.blockSize()/2
case TensorTypeQ8_0:
return 2 + tt.blockSize()
case TensorTypeQ8_1:
return 2 + 2 + tt.blockSize()
case TensorTypeQ2_K:
return tt.blockSize()/16 + tt.blockSize()/4 + 2 + 2
case TensorTypeQ3_K:
return tt.blockSize()/8 + tt.blockSize()/4 + 12 + 2
case TensorTypeQ4_K:
return 2 + 2 + 12 + tt.blockSize()/2
case TensorTypeQ5_K:
return 2 + 2 + 12 + tt.blockSize()/8 + tt.blockSize()/2
case TensorTypeQ6_K:
return tt.blockSize()/2 + tt.blockSize()/4 + tt.blockSize()/16 + 2
case TensorTypeQ8_K:
return 4 + tt.blockSize() + 2*tt.blockSize()/16
case tensorTypeIQ2_XXS:
return 2 + 2*tt.blockSize()/8
case tensorTypeIQ2_XS:
return 2 + 2*tt.blockSize()/8 + tt.blockSize()/32
case tensorTypeIQ3_XXS:
return 2 + tt.blockSize()/4 + tt.blockSize()/8
case tensorTypeIQ1_S:
return 2 + tt.blockSize()/8 + tt.blockSize()/16
case tensorTypeIQ4_NL:
return 2 + tt.blockSize()/2
case tensorTypeIQ3_S:
return 2 + tt.blockSize()/4 + tt.blockSize()/8 + tt.blockSize()/32 + 4
case tensorTypeIQ2_S:
return 2 + tt.blockSize()/4 + tt.blockSize()/16
case tensorTypeIQ4_XS:
return 2 + 2 + tt.blockSize()/2 + tt.blockSize()/64
case TensorTypeI8:
return 1
case TensorTypeI16:
return 2
case TensorTypeI32:
return 4
case TensorTypeI64:
return 8
case TensorTypeF64:
return 8
case tensorTypeIQ1_M:
return tt.blockSize()/8 + tt.blockSize()/16 + tt.blockSize()/32
case TensorTypeBF16:
return 2
default:
return 0
}
}
func (tt TensorType) blockSize() int64 {
switch tt {
case TensorTypeF32,
TensorTypeF16,
TensorTypeI8,
TensorTypeI16,
TensorTypeI32,
TensorTypeI64,
TensorTypeF64,
TensorTypeBF16:
return 1
case TensorTypeQ4_0,
TensorTypeQ4_1,
TensorTypeQ5_0,
TensorTypeQ5_1,
TensorTypeQ8_0,
TensorTypeQ8_1,
tensorTypeIQ4_NL:
return 32
default:
return 256
}
}
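
As a worked example of the byte accounting in `NumBytes`, `typeSize`, and `blockSize` above (a small illustrative snippet, assuming the package import path used elsewhere in this change):

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/fs/gguf"
)

func main() {
	// F32  : typeSize 4,                 blockSize 1   -> 4 bytes per value
	// Q4_0 : typeSize 2 + 32/2 = 18,     blockSize 32  -> 0.5625 bytes per value
	// Q6_K : typeSize 128+64+16+2 = 210, blockSize 256 -> 0.8203125 bytes per value
	fmt.Println(gguf.TensorTypeF32.NumBytes())  // 4
	fmt.Println(gguf.TensorTypeQ4_0.NumBytes()) // 0.5625
	fmt.Println(gguf.TensorTypeQ6_K.NumBytes()) // 0.8203125
}
```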
func (tt TensorType) String() string {
switch tt {
case TensorTypeF32:
return "f32"
case TensorTypeF16:
return "f16"
case TensorTypeQ4_0:
return "q4_0"
case TensorTypeQ4_1:
return "q4_1"
case tensorTypeQ4_2:
return "q4_2"
case tensorTypeQ4_3:
return "q4_3"
case TensorTypeQ5_0:
return "q5_0"
case TensorTypeQ5_1:
return "q5_1"
case TensorTypeQ8_0:
return "q8_0"
case TensorTypeQ8_1:
return "q8_1"
case TensorTypeQ2_K:
return "q2_k"
case TensorTypeQ3_K:
return "q3_k"
case TensorTypeQ4_K:
return "q4_k"
case TensorTypeQ5_K:
return "q5_k"
case TensorTypeQ6_K:
return "q6_k"
case TensorTypeQ8_K:
return "q8_k"
case tensorTypeIQ2_XXS:
return "iq2_xxs"
case tensorTypeIQ2_XS:
return "iq2_xs"
case tensorTypeIQ3_XXS:
return "iq3_xxs"
case tensorTypeIQ1_S:
return "iq1_s"
case tensorTypeIQ4_NL:
return "iq4_nl"
case tensorTypeIQ3_S:
return "iq3_s"
case tensorTypeIQ2_S:
return "iq2_s"
case tensorTypeIQ4_XS:
return "iq4_xs"
case TensorTypeI8:
return "i8"
case TensorTypeI16:
return "i16"
case TensorTypeI32:
return "i32"
case TensorTypeI64:
return "i64"
case TensorTypeF64:
return "f64"
case tensorTypeIQ1_M:
return "iq1_m"
case TensorTypeBF16:
return "bf16"
case tensorTypeQ4_0_4_4:
return "q4_0_4_4"
case tensorTypeQ4_0_4_8:
return "q4_0_4_8"
case tensorTypeQ4_0_8_8:
return "q4_0_8_8"
case tensorTypeTQ1_0:
return "tq1_0"
case tensorTypeTQ2_0:
return "tq2_0"
case tensorTypeIQ4_NL_4_4:
return "iq4_nl_4_4"
case tensorTypeIQ4_NL_4_8:
return "iq4_nl_4_8"
case tensorTypeIQ4_NL_8_8:
return "iq4_nl_8_8"
default:
return "unknown"
}
}
func (tt TensorType) LogValue() slog.Value {
return slog.GroupValue(
slog.Uint64("value", uint64(tt)),
slog.String("name", strings.ToUpper(tt.String())),
slog.Int64("size", tt.typeSize()),
slog.Int64("block_size", tt.blockSize()),
slog.Float64("num_bytes", tt.NumBytes()),
)
}
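For context, the two helpers above combine to give a tensor's storage footprint: typeSize is bytes per block and blockSize is elements per block. Below is a minimal, hedged sketch of that relationship; the helper name tensorBytes and its parameter are illustrative and not part of this file, and it assumes the element count is a whole number of blocks.

// tensorBytes reports the storage needed for a tensor with the given element
// count, assuming numElements is an exact multiple of tt.blockSize().
func tensorBytes(tt TensorType, numElements int64) int64 {
	blocks := numElements / tt.blockSize() // number of quantization blocks
	return blocks * tt.typeSize()          // bytes per block times block count
}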

2
go.mod
View File

@@ -19,7 +19,7 @@ require (
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/dlclark/regexp2 v1.11.4
github.com/emirpasic/gods/v2 v2.0.0-alpha
github.com/google/go-cmp v0.7.0
github.com/google/go-cmp v0.6.0
github.com/mattn/go-runewidth v0.0.14
github.com/nlpodyssey/gopickle v0.3.0
github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c

4
go.sum
View File

@@ -112,8 +112,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=

View File

@@ -19,7 +19,7 @@ func TestVisionModels(t *testing.T) {
}
testCases := []testCase{
{
model: "qwen2.5vl",
model: "llava:7b",
},
{
model: "llama3.2-vision",
@@ -60,7 +60,6 @@ func TestVisionModels(t *testing.T) {
}
func TestIntegrationSplitBatch(t *testing.T) {
skipUnderMinVRAM(t, 6)
image, err := base64.StdEncoding.DecodeString(imageEncoding)
require.NoError(t, err)
req := api.GenerateRequest{

View File

@@ -45,8 +45,6 @@ var (
"qwen2.5-coder:latest",
"qwen:latest",
"solar-pro:latest",
"codellama:latest",
"nous-hermes:latest",
}
)

View File

File diff suppressed because one or more lines are too long

View File

@@ -30,11 +30,6 @@ type Causal struct {
// ** current forward pass **
// curReserve indicates that this forward pass is only for
// memory reservation and we should not update our metadata
// based on it.
curReserve bool
// the active layer for Get and Put
curLayer int
@@ -164,13 +159,12 @@ func (c *Causal) Close() {
}
func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
c.curReserve = reserve
c.curBatchSize = len(batch.Positions)
c.curSequences = batch.Sequences
c.curPositions = batch.Positions
c.opts.Except = nil
if !c.curReserve {
if !reserve {
c.updateSlidingWindow()
var err error
@@ -217,9 +211,10 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
c.curCellRange.max = len(c.cells) - 1
}
c.curMask = c.buildMask(ctx)
var err error
c.curMask, err = c.buildMask(ctx)
return nil
return err
}
func newRange() cellRange {
@@ -302,7 +297,7 @@ func roundUp(length, pad int) int {
// Builds a mask of history x batch indicating whether for each token in the batch the
// token in the history should apply. This is based on both the sequence and causality (the
// position of the history is not ahead of the token in the batch).
func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
// Align and pad the two dimensions as required by the backend
batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)
@@ -310,11 +305,6 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1
length := c.curCellRange.max - c.curCellRange.min + 1
if c.curReserve {
return ctx.Input().Empty(c.config.MaskDType, length, batchSize)
}
mask := make([]float32, batchSize*length)
for i := range c.curBatchSize {
@@ -335,7 +325,10 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
mask[i] = float32(math.Inf(-1))
}
maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
maskTensor, err := ctx.Input().FromFloatSlice(mask, length, batchSize)
if err != nil {
return nil, err
}
if c.config.MaskDType != ml.DTypeF32 {
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
@@ -343,7 +336,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
maskTensor = out
}
return maskTensor
return maskTensor, nil
}
func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
@@ -498,7 +491,12 @@ func (c *Causal) SetCausal(ctx ml.Context, opts CausalOptions) {
if !slices.Equal(c.opts.Except, opts.Except) {
c.opts = opts
if ctx != nil {
c.curMask = c.buildMask(ctx)
var err error
c.curMask, err = c.buildMask(ctx)
if err != nil {
// This error should never occur because we have previously built a mask with the same shape
panic(fmt.Errorf("SetCausal: %w", err))
}
}
}
}
@@ -654,7 +652,10 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
}
}
kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
kShift, err := ctx.Input().FromIntSlice(offsets, len(offsets))
if err != nil {
return err
}
for i, key := range c.keys {
if key == nil {

View File

@@ -344,7 +344,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
}
cache.SetLayer(0)
tensor := context.FromFloatSlice(test.in, test.inShape...)
tensor, _ := context.FromFloatSlice(test.in, test.inShape...)
cache.Put(context, tensor, tensor)
out, _, mask := cache.Get(context)
@@ -386,7 +386,7 @@ func TestCanResume(t *testing.T) {
}
cache.SetLayer(0)
tensor := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
tensor, _ := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
cache.Put(context, tensor, tensor)
// with window size 4, nothing has slid out of the window yet
@@ -413,7 +413,7 @@ func TestCanResume(t *testing.T) {
}
cache.SetLayer(0)
tensor = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
tensor, _ = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
cache.Put(context, tensor, tensor)
// only the latest position has overlapping windows
@@ -470,24 +470,24 @@ func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
return c.Empty(dtype, shape...)
}
func (c *testContext) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
copy(t.data, s)
return t
return t, nil
}
func (c *testContext) FromIntSlice(s []int32, shape ...int) ml.Tensor {
func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
f := make([]float32, len(s))
for i := range f {
f[i] = float32(s[i])
}
out := c.FromFloatSlice(f, shape...)
out, _ := c.FromFloatSlice(f, shape...)
out.(*testTensor).dtype = ml.DTypeI32
return out
return out, nil
}
func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
@@ -496,7 +496,7 @@ func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tenso
s = append(s, i)
}
out := c.FromFloatSlice(s, len(s))
out, _ := c.FromFloatSlice(s, len(s))
out.(*testTensor).dtype = dtype
return out
}
@@ -508,7 +508,7 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
func (c *testContext) Compute(...ml.Tensor) {}
func (c *testContext) Reserve() {}
func (c *testContext) Reserve() error { return nil }
func (c *testContext) MaxGraphNodes() int {
return 10

View File

@@ -544,7 +544,7 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
cparams.penalty_last_n = C.int32_t(params.RepeatLastN)
cparams.penalty_repeat = C.float(params.PenaltyRepeat)
cparams.penalty_freq = C.float(params.PenaltyFreq)
cparams.penalty_present = C.float(params.PenaltyPresent)
cparams.penalty_present = C.float(params.PenaltyFreq)
cparams.seed = C.uint32_t(params.Seed)
grammar := C.CString(params.Grammar)
@@ -580,7 +580,7 @@ func SchemaToGrammar(schema []byte) []byte {
defer C.free(unsafe.Pointer(cStr))
// Allocate buffer for grammar based on schema length but with upper bound
maxLen := max(32768, min(1024*1024, len(schema)*4))
maxLen := min(1024*1024, len(schema)*4)
buf := make([]byte, maxLen)
// Call C function to convert schema to grammar
@@ -602,7 +602,7 @@ type Grammar struct {
mu sync.Mutex
}
func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []int32) *Grammar {
func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar {
cGrammar := C.CString(grammar)
defer C.free(unsafe.Pointer(cGrammar))
@@ -622,7 +622,7 @@ func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogToke
cEogTokens[i] = C.uint32_t(token)
}
g := C.grammar_init(cGrammar, unsafe.SliceData(cTokens), C.size_t(len(cTokens)), unsafe.SliceData(cPieces), unsafe.SliceData(cEogTokens), C.size_t(len(cEogTokens)))
g := C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), C.size_t(len(cEogTokens)))
if g == nil {
return nil
}
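The two grammar_init lines above differ only in how a Go slice is handed to C: one indexes element zero and casts, the other uses unsafe.SliceData. As a standalone, hedged illustration of that difference (plain Go, no cgo, not from this repo):

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	tokens := []uint32{1, 2, 3}
	// SliceData returns a pointer to the slice's backing array, same as &tokens[0].
	fmt.Println(unsafe.SliceData(tokens) == &tokens[0]) // true

	var empty []uint32
	// For a nil slice, SliceData returns nil instead of panicking,
	// which is one reason to prefer it over &empty[0] when the slice may be empty.
	fmt.Println(unsafe.SliceData(empty) == nil) // true
}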

View File

@@ -1,156 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Fri, 18 Apr 2025 15:58:19 -0700
Subject: [PATCH] graph memory reporting on failure
---
ggml/include/ggml-alloc.h | 6 ++++++
ggml/include/ggml-backend.h | 6 ++++++
ggml/src/ggml-alloc.c | 38 +++++++++++++++++++++++++++++++++----
ggml/src/ggml-backend.cpp | 10 ++++++++++
4 files changed, 56 insertions(+), 4 deletions(-)
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
index 2cb150fd..781b1e10 100644
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+struct ggml_allocr_buffer_status {
+ size_t size;
+ bool allocated;
+};
+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 778927f6..74e46716 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -304,6 +304,12 @@ extern "C" {
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+ struct ggml_backend_buffer_status {
+ size_t size;
+ bool allocated;
+ };
+ GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 5fd379f6..04812990 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -364,6 +364,7 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers]
+ size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;
@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
GGML_ASSERT(galloc->buffers != NULL);
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
+ GGML_ASSERT(galloc->buffer_sizes != NULL);
+
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL);
@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
+ free(galloc->buffer_sizes);
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
+ bool success = true;
+
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer
@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
- if (galloc->buffers[i] == NULL) {
+ if (galloc->buffers[i]) {
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+ } else {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
- return false;
+ galloc->buffer_sizes[i] = new_size;
+ success = false;
}
- ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+ } else {
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
}
}
- return true;
+ return success;
}
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}
+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
+
+ for (int i = 0; i < buffer_id; i++) {
+ if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
+ // This buffer is the same as a previous one due to the same buffer type being used multiple times
+ // (See above.) However, we need a different check because multiple buffers might be NULL in our
+ // case and we still want to know the attempted size.
+
+ struct ggml_allocr_buffer_status status = {0, true};
+ return status;
+ }
+ }
+
+ struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
+ return status;
+}
+
// utils
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 0ce73a99..be335e8c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}
+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+ struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+ struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
+
+ return status;
+}
+
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

View File

@@ -1,102 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Thu, 24 Apr 2025 14:48:51 -0700
Subject: [PATCH] ggml: Export GPU UUIDs
This enables matching up devices and information reported by the backend
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 33 ++++++++++++++++++++++++++++++++
ggml/src/ggml-metal/ggml-metal.m | 1 +
3 files changed, 35 insertions(+)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 74e46716..a880df33 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -152,6 +152,7 @@ extern "C" {
struct ggml_backend_dev_props {
const char * name;
const char * description;
+ const char * uuid;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cb0d8528..4c829153 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
+ std::string uuid;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
}
+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+ return ctx->uuid.c_str();
+}
+
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
+ props->uuid = ggml_backend_cuda_device_get_uuid(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
+ #if !defined(GGML_USE_HIP)
+ char uuid[64];
+ snprintf(uuid, sizeof(uuid),
+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+ (unsigned char)prop.uuid.bytes[0],
+ (unsigned char)prop.uuid.bytes[1],
+ (unsigned char)prop.uuid.bytes[2],
+ (unsigned char)prop.uuid.bytes[3],
+ (unsigned char)prop.uuid.bytes[4],
+ (unsigned char)prop.uuid.bytes[5],
+ (unsigned char)prop.uuid.bytes[6],
+ (unsigned char)prop.uuid.bytes[7],
+ (unsigned char)prop.uuid.bytes[8],
+ (unsigned char)prop.uuid.bytes[9],
+ (unsigned char)prop.uuid.bytes[10],
+ (unsigned char)prop.uuid.bytes[11],
+ (unsigned char)prop.uuid.bytes[12],
+ (unsigned char)prop.uuid.bytes[13],
+ (unsigned char)prop.uuid.bytes[14],
+ (unsigned char)prop.uuid.bytes[15]
+ );
+ dev_ctx->uuid = uuid;
+ #else
+ dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
+ #endif
+
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 1b56f858..ee4f2dcb 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
+ props->uuid = "0";
props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = (struct ggml_backend_dev_caps) {

View File

@@ -1,32 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Sun, 22 Jun 2025 09:22:05 -0700
Subject: [PATCH] temporary prevent rocm+cuda mixed loading
---
ggml/src/ggml-backend-reg.cpp | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 4e67d243..8f49f084 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -573,8 +573,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
- ggml_backend_load_best("cuda", silent, dir_path);
- ggml_backend_load_best("hip", silent, dir_path);
+
+ // Avoid mixed hip+cuda configurations
+ const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
+ const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
+ if (!hip_devices && !rocr_devices) {
+ ggml_backend_load_best("cuda", silent, dir_path);
+ } else {
+ ggml_backend_load_best("hip", silent, dir_path);
+ }
+
ggml_backend_load_best("kompute", silent, dir_path);
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);

View File

@@ -1,9 +1,12 @@
package llm
import (
"cmp"
"fmt"
"log/slog"
"maps"
"os"
"slices"
"strconv"
"strings"
@@ -82,11 +85,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
var graphOffload uint64
// Projectors loaded into GPU0 only
var llamaEngineProjectorWeights uint64
// Projectors loaded with output layer
var ollamaEngineProjectorWeights uint64
var ollamaEngineProjectorGraph uint64
var projectorWeights uint64
var projectorGraph uint64
// Conditional output size on GPU 0
var memoryLayerOutput uint64
@@ -111,23 +111,21 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
for _, projector := range projectors {
llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
weight := projectorMemoryRequirements(projector)
projectorWeights += weight
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
if llamaEngineProjectorWeights == 0 {
ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
opts.NumCtx = max(opts.NumCtx, 2048)
if projectorWeights == 0 && projectorGraph == 0 {
projectorWeights, projectorGraph = f.VisionGraphSize()
}
layers := f.Tensors().GroupLayers()
// add one layer worth of memory as a buffer
if blk0, ok := layers["blk.0"]; ok {
layerSize = blk0.Size()
} else {
slog.Warn("model missing blk.0 layer size")
}
// add one layer (choosing the max layer) worth of memory as a buffer
layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
return cmp.Compare(a.Size(), b.Size())
}).Size()
var kvct string
if envconfig.FlashAttention() &&
@@ -151,12 +149,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}
if graphPartialOffload == 0 {
headsKV := f.KV().HeadCountKVMin()
if headsKV == 0 {
headsKV = 1
}
gqa := f.KV().HeadCountMax() / headsKV
graphPartialOffload = gqa * kvTotal / 6
graphPartialOffload = f.KV().GQA() * kvTotal / 6
}
if graphFullOffload == 0 {
graphFullOffload = graphPartialOffload
@@ -170,7 +163,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
graphFullOffload = graphPartialOffload
}
// Output layer handled at the end if we have space
if layer, ok := layers["output_norm"]; ok {
memoryLayerOutput += layer.Size()
}
@@ -180,7 +172,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
memoryLayerOutput += layer.Size()
}
gpuZeroOverhead := llamaEngineProjectorWeights
// Output layer handled at the end if we have space
gpuZeroOverhead := projectorWeights + projectorGraph
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
var layerCount int
@@ -223,8 +216,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
if len(gpusWithSpace) > 0 {
gpuZeroID = gpusWithSpace[0].i
gpuAllocations[gpuZeroID] += gpuZeroOverhead
} else {
overflow += gpuZeroOverhead
}
// For all the layers, find where they can fit on the GPU(s)
@@ -265,24 +256,21 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}
// Determine if we need to consider output then find where it fits
memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
if memoryLastLayer > 0 {
if opts.NumGPU < 0 || layerCount < opts.NumGPU {
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[layerCount%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+memoryLastLayer {
gpuAllocations[g.i] += memoryLastLayer
layerCounts[g.i]++
layerCount++
break
}
if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[layerCount%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > overhead+used+memoryLayerOutput {
gpuAllocations[g.i] += memoryLayerOutput
layerCounts[g.i]++
layerCount++
break
}
}
if layerCount < int(f.KV().BlockCount())+1 {
fullyLoaded = false
overflow += memoryLastLayer
overflow += memoryLayerOutput
}
}
@@ -340,8 +328,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
memoryLayerOutput: memoryLayerOutput,
graphFullOffload: graphFullOffload,
graphPartialOffload: graphPartialOffload,
projectorWeights: llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
projectorGraph: ollamaEngineProjectorGraph,
projectorWeights: projectorWeights,
projectorGraph: projectorGraph,
}
if gpus[0].Library == "cpu" {
@@ -427,7 +415,7 @@ func projectorMemoryRequirements(filename string) (weights uint64) {
}
defer file.Close()
ggml, err := ggml.Decode(file, 1024)
ggml, _, err := ggml.Decode(file, 1024)
if err != nil {
return 0
}

View File

@@ -121,7 +121,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
}
defer f.Close()
ggml, err := ggml.Decode(f, maxArraySize)
ggml, _, err := ggml.Decode(f, maxArraySize)
return ggml, err
}
@@ -139,13 +139,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
gpus = discover.GetCPUInfo()
}
// Verify the requested context size is <= the model training size
trainCtx := f.KV().ContextLength()
if opts.NumCtx/numParallel > int(trainCtx) && trainCtx > 0 {
slog.Warn("requested context size too large for model", "num_ctx", opts.NumCtx, "num_parallel", numParallel, "n_ctx_train", trainCtx)
opts.NumCtx = int(trainCtx) * numParallel
}
estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
if len(gpus) > 1 || gpus[0].Library != "cpu" {
switch {
@@ -318,7 +311,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--mmproj", projectors[0])
}
// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
// without any LD_LIBRARY_PATH flags
for {
@@ -804,8 +797,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
res, err := http.DefaultClient.Do(serverReq)
if err != nil {
slog.Error("post predict", "error", err)
return errors.New("model runner has unexpectedly stopped, this may be due to resource limitations or an internal error, check ollama server logs for details")
return fmt.Errorf("POST predict: %v", err)
}
defer res.Body.Close()

View File

@@ -5,8 +5,8 @@ import (
"context"
"encoding/binary"
"fmt"
"log/slog"
"math"
"os"
"slices"
"strconv"
"strings"
@@ -15,11 +15,6 @@ import (
)
type Backend interface {
Load(ctx context.Context, progress func(float32)) error
// BackendMemory returns the memory allocations that were made for this model
BackendMemory() BackendMemory
Config() fs.Config
Get(name string) Tensor
NewContext() Context
@@ -57,6 +52,10 @@ type CacheConfig struct {
// BackendParams controls how the backend loads and executes models
type BackendParams struct {
// Progress is a callback function that allows reporting percentage completion
// of model loading
Progress func(float32)
// NumThreads sets the number of threads to use if running on the CPU
NumThreads int
@@ -73,130 +72,9 @@ type BackendParams struct {
FlashAttention bool
}
// ErrNoMem is returned when panicking due to insufficient memory. It includes
// the attempted memory allocation.
type ErrNoMem struct {
BackendMemory
}
var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error))
func (e ErrNoMem) Error() string {
return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
}
type AllocationStatus int
const (
// Unallocated memory - have not yet attempted to allocate
Unallocated AllocationStatus = iota
// Failed memory - tried to allocate the memory and did not succeed
Failed
// Allocated memory - tried and succeeded to allocate memory
Allocated
)
// Memory is the size of an allocation and whether it was successful.
type Memory struct {
Size uint64
Status AllocationStatus
}
func (m Memory) String() string {
s := fmt.Sprint(m.Size)
switch m.Status {
case Unallocated:
s += "U"
case Failed:
s += "F"
case Allocated:
s += "A"
}
return s
}
// DeviceMemory provides a breakdown of the memory needed
// per device, such as a CPU or GPU.
type DeviceMemory struct {
// Name is the name of the device as labeled by the backend. It
// may not be persistent across instances of the runner.
Name string
// UUID is a unique persistent identifier for the device for matching
// with system management libraries
UUID string
// Weights is the per-layer memory needed for the model weights.
Weights []Memory
// Cache is the per-layer memory needed for the KV cache.
Cache []Memory
// Graph is the size of the compute graph. It is not per-layer.
Graph Memory
}
func memoryPresent(mem []Memory) bool {
return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
}
func (m DeviceMemory) LogValue() slog.Value {
var attrs []slog.Attr
if memoryPresent(m.Weights) {
attrs = append(attrs, slog.Any("Weights", m.Weights))
}
if memoryPresent(m.Cache) {
attrs = append(attrs, slog.Any("Cache", m.Cache))
}
if m.Graph.Size != 0 {
attrs = append(attrs, slog.Any("Graph", m.Graph))
}
if len(attrs) > 0 && m.UUID != "" {
attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...)
}
return slog.GroupValue(attrs...)
}
// BackendMemory provides the amount of memory required to load the model
// per device based on the BackendParams. In some cases, not all required
// allocations will be known at this point. However, the size of the most recent
// allocation is guaranteed to be provided so that if it failed, the caller can
// accommodate that to make forward progress.
type BackendMemory struct {
// InputWeights are always located on the CPU and cannot be moved
InputWeights Memory
// CPU model components are located in system memory. This does not
// include unified memory allocated through the GPU.
CPU DeviceMemory
// GPU model components are located on one or more GPUs.
GPUs []DeviceMemory
}
func (m BackendMemory) LogValue() slog.Value {
var attrs []slog.Attr
if m.InputWeights.Size != 0 {
attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
}
attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU))
for _, g := range m.GPUs {
attrs = append(attrs, slog.Any(g.Name, g))
}
return slog.GroupValue(attrs...)
}
var backends = make(map[string]func(string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) {
if _, ok := backends[name]; ok {
panic("backend: backend already registered")
}
@@ -204,9 +82,9 @@ func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)
backends[name] = f
}
func NewBackend(modelPath string, params BackendParams) (Backend, error) {
func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) {
if backend, ok := backends["ggml"]; ok {
return backend(modelPath, params)
return backend(ctx, f, params)
}
return nil, fmt.Errorf("unsupported backend")
@@ -215,8 +93,8 @@ func NewBackend(modelPath string, params BackendParams) (Backend, error) {
type Context interface {
Empty(dtype DType, shape ...int) Tensor
Zeros(dtype DType, shape ...int) Tensor
FromFloatSlice(s []float32, shape ...int) Tensor
FromIntSlice(s []int32, shape ...int) Tensor
FromFloatSlice(s []float32, shape ...int) (Tensor, error)
FromIntSlice(s []int32, shape ...int) (Tensor, error)
// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
Arange(start, stop, step float32, dtype DType) Tensor
@@ -228,7 +106,7 @@ type Context interface {
// graph, simply preallocates memory. Typically called with a
// worst case graph to ensure all resources are available for
// future inference.
Reserve()
Reserve() error
MaxGraphNodes() int
Close()
@@ -241,6 +119,21 @@ type Context interface {
Layer(int) Context
}
// RopeOptions contains optional parameters for RoPE function
type RopeOptions struct {
OriginalContextLen uint32
}
// RopeOption defines a function that modifies RopeOpts
type RopeOption func(*RopeOptions)
// WithContextLen sets a custom context length
func WithContextLen(len uint32) RopeOption {
return func(opts *RopeOptions) {
opts.OriginalContextLen = len
}
}
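As a usage sketch of the functional option above (the tensor variables, head dimension, and rope type value are illustrative, not taken from a model in this diff; 2 corresponds to the NeoX-style constant defined later in the ggml backend), a call site can override the default original context length like so:

// hypothetical call inside a model's forward pass; nil rope factors are allowed
q = q.RoPE(ctx, positions, nil, headDim, 2, ropeBase, ropeScale,
	ml.WithContextLen(32768))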
type Tensor interface {
Dim(n int) int
Stride(n int) int
@@ -254,8 +147,6 @@ type Tensor interface {
Neg(ctx Context) Tensor
Add(ctx Context, t2 Tensor) Tensor
Mul(ctx Context, t2 Tensor) Tensor
Div(ctx Context, t2 Tensor) Tensor
Mulmat(ctx Context, t2 Tensor) Tensor
MulmatFullPrec(ctx Context, t2 Tensor) Tensor
MulmatID(ctx Context, t2, ids Tensor) Tensor
@@ -264,11 +155,11 @@ type Tensor interface {
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
Scale(ctx Context, s float64) Tensor
SumRows(ctx Context) Tensor
AvgPool2D(ctx Context, k, s int, p float32) Tensor
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32, options ...RopeOption) Tensor
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
Sin(ctx Context) Tensor

View File

@@ -10,6 +10,7 @@ import "C"
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
@@ -29,7 +30,6 @@ import (
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml"
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
"github.com/ollama/ollama/ml/nn/rope"
"golang.org/x/sync/errgroup"
)
@@ -44,15 +44,8 @@ func devices() []*C.struct_ggml_backend_device {
}
type Backend struct {
// modelPath is the location of the model data
modelPath string
meta *fsggml.GGML
// tensorLoadTargets maps from the name of the tensor in the file
// to the name that is used by the model definition
tensorLoadTargets map[string][]string
sched *C.struct_ggml_backend_sched
schedBackends []*C.struct_ggml_backend
schedBufts []*C.struct_ggml_backend_buffer_type
@@ -65,26 +58,14 @@ type Backend struct {
// layers is the backend used for repeating layers
layers map[int]*C.struct_ggml_backend_buffer_type
// requiredMemory is the cumulative memory allocations needed by the backend
requiredMemory *ml.BackendMemory
// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory
flashAttention bool
// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
maxGraphNodes int
}
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
r, err := os.Open(modelPath)
if err != nil {
return nil, err
}
defer r.Close()
meta, err := fsggml.Decode(r, -1)
func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
meta, n, err := fsggml.Decode(r, -1)
if err != nil {
return nil, err
}
@@ -99,9 +80,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
"num_key_values", len(meta.KV()),
)
var requiredMemory ml.BackendMemory
btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory)
type deviceBufferType struct {
d *C.struct_ggml_backend_device
bts []*C.struct_ggml_backend_buffer_type
@@ -122,8 +100,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
}
blocks := int(meta.KV().BlockCount())
// create list of buffer types for the cpu
cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
for _, d := range append(accels, append(gpus, cpus...)...) {
@@ -131,33 +107,17 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
case C.GGML_BACKEND_DEVICE_TYPE_CPU,
C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
}
}
requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
requiredMemory.CPU.UUID = C.GoString(props.uuid)
requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
// create list of buffer types for each gpu
var gpuDeviceBufferTypes []deviceBufferType
requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
for i, d := range gpus {
for _, d := range gpus {
bt := C.ggml_backend_dev_buffer_type(d)
gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
d: d,
bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
})
btDeviceMemory[bt] = &requiredMemory.GPUs[i]
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(d, &props)
requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
}
useDefaultSplit := true
@@ -196,6 +156,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
// inputs always use cpu
input := cpuDeviceBufferType
blocks := int(meta.KV().BlockCount())
// define a range of gpu layers. anything outside of this range is assigned to the cpu
gpuRangeStart := max(0, blocks-params.NumGPULayers)
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
@@ -236,7 +198,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
// contexts are shared by tensors of the same buffer type
ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
for _, bt := range bts {
if _, ok := ctxs[bt]; !ok {
ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
@@ -262,16 +224,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
C.ggml_set_name(tt, cname)
slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
if layer == -1 {
// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
requiredMemory.InputWeights.Status = ml.Allocated
requiredMemory.InputWeights.Size += uint64(size)
} else {
btDeviceMemory[bt].Weights[layer].Size += uint64(size)
}
//nolint:staticcheck // TODO: check if buffer type supports this tensor
return tt
}
@@ -293,22 +245,22 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
for _, t := range meta.Tensors().Items() {
switch {
case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
createTensor(tensor{source: t}, input.bts, -1)
createTensor(tensor{source: t}, input.bts)
if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
createTensor(tensor{source: t, target: "output.weight"}, output.bts)
}
case contains(t.Name, "cls", "output", "output_norm"):
createTensor(tensor{source: t}, output.bts, blocks)
createTensor(tensor{source: t}, output.bts)
case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
// TODO: assign vision tensors to the gpu if possible
createTensor(tensor{source: t}, output.bts, blocks)
createTensor(tensor{source: t}, output.bts)
case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
// these tensors should be repeated per layer
for i, layer := range layers {
createTensor(tensor{
source: t,
target: "blk." + strconv.Itoa(i) + "." + t.Name,
}, layer.bts, i)
}, layer.bts)
}
default:
layerIndex := -1
@@ -319,10 +271,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
if layerIndex >= 0 {
createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
createTensor(tensor{source: t}, layers[layerIndex].bts)
} else {
// load all other tensors on the cpu
createTensor(tensor{source: t}, input.bts, -1)
createTensor(tensor{source: t}, input.bts)
}
}
}
@@ -335,18 +287,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
for i := range btDeviceMemory[bt].Weights {
if btDeviceMemory[bt].Weights[i].Size != 0 {
if b != nil {
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
} else {
btDeviceMemory[bt].Weights[i].Status = ml.Failed
}
}
}
if b == nil {
panic(ml.ErrNoMem{BackendMemory: requiredMemory})
return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt)))
}
C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
@@ -365,6 +307,73 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
}
var doneBytes atomic.Uint64
totalBytes := uint64(n) - meta.Tensors().Offset
g, ctx := errgroup.WithContext(ctx)
g.SetLimit(runtime.GOMAXPROCS(0))
for _, t := range meta.Tensors().Items() {
t := t
g.Go(func() error {
tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
for i := range tts {
target := targets[t.Name][i]
if target == "" {
target = t.Name
}
tt, ok := tensors[target]
if !ok {
return fmt.Errorf("unassigned tensor: %s", t.Name)
}
tts[i] = tt
}
// Create a new FD for each goroutine so that each FD is read sequentially, rather than
// seeking around within an FD shared between all goroutines.
file, err := os.Open(r.Name())
if err != nil {
slog.Warn("file open error", "file", r.Name(), "error", err)
return err
}
defer file.Close()
sr := io.NewSectionReader(file, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
bts := make([]byte, 128*format.KibiByte)
var s uint64
for s < t.Size() {
// Stop if either the parent context has been canceled or if any of the other tensors returned an error
if err := ctx.Err(); err != nil {
return err
}
n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
if err != nil {
slog.Warn("file read error", "file", r.Name(), "error", err)
return err
}
for _, tt := range tts {
C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
}
s += uint64(n)
if params.Progress != nil {
done := doneBytes.Add(uint64(n))
params.Progress(float32(done) / float32(totalBytes))
}
}
return nil
})
}
if err := g.Wait(); err != nil {
return nil, err
}
// map devices to backend buffer types so new tensors can be assigned to the correct device
deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
@@ -388,11 +397,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
return &Backend{
modelPath: modelPath,
flashAttention: params.FlashAttention,
meta: meta,
tensorLoadTargets: targets,
tensors: tensors,
flashAttention: params.FlashAttention,
meta: meta,
tensors: tensors,
sched: C.ggml_backend_sched_new(
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
@@ -411,9 +418,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
}
return m
}(),
requiredMemory: &requiredMemory,
btDeviceMemory: btDeviceMemory,
maxGraphNodes: maxGraphNodes,
maxGraphNodes: maxGraphNodes,
}, nil
}
@@ -421,81 +426,6 @@ func init() {
ml.RegisterBackend("ggml", New)
}
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
var doneBytes atomic.Uint64
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
g, ctx := errgroup.WithContext(ctx)
g.SetLimit(runtime.GOMAXPROCS(0))
for _, t := range b.meta.Tensors().Items() {
t := t
g.Go(func() error {
tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
for i := range tts {
target := b.tensorLoadTargets[t.Name][i]
if target == "" {
target = t.Name
}
tt, ok := b.tensors[target]
if !ok {
return fmt.Errorf("unassigned tensor: %s", t.Name)
}
tts[i] = tt
}
// Create a new FD for each goroutine so that each FD is read sequentially, rather than
// seeking around within an FD shared between all goroutines.
file, err := os.Open(b.modelPath)
if err != nil {
slog.Warn("file open error", "file", b.modelPath, "error", err)
return err
}
defer file.Close()
sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
bts := make([]byte, 128*format.KibiByte)
var s uint64
for s < t.Size() {
// Stop if either the parent context has been canceled or if any of the other tensors returned an error
if err := ctx.Err(); err != nil {
return err
}
n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
if err != nil {
slog.Warn("file read error", "file", b.modelPath, "error", err)
return err
}
for _, tt := range tts {
C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
}
s += uint64(n)
if progress != nil {
done := doneBytes.Add(uint64(n))
progress(float32(done) / float32(totalBytes))
}
}
return nil
})
}
if err := g.Wait(); err != nil {
return err
}
return nil
}
func (b *Backend) BackendMemory() ml.BackendMemory {
return *b.requiredMemory
}
func (b *Backend) Config() fs.Config {
return b.meta.KV()
}
@@ -527,7 +457,6 @@ func (b *Backend) NewContextSize(n int) ml.Context {
no_alloc: true,
}),
allocatedBuffers: &allocatedBuffers,
layer: -1,
}
}
@@ -554,9 +483,6 @@ type Context struct {
// maxGraphNodes is the maximum allowed number of graph nodes in this context
maxGraphNodes int
// layer is the graph layer that this context is allocating for - assumed to be cache
layer int
}
func (c *Context) Input() ml.Context {
@@ -567,7 +493,6 @@ func (c *Context) Input() ml.Context {
buft: c.b.input,
allocatedBuffers: c.allocatedBuffers,
maxGraphNodes: c.maxGraphNodes,
layer: -1,
}
}
@@ -582,7 +507,6 @@ func (c *Context) Layer(i int) ml.Context {
buft: buft,
allocatedBuffers: c.allocatedBuffers,
maxGraphNodes: c.maxGraphNodes,
layer: i,
}
}
@@ -602,9 +526,7 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
}
func (c *Context) Compute(tensors ...ml.Tensor) {
if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
panic(fmt.Errorf("error computing ggml graph: %v", status))
}
C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
C.ggml_backend_sched_reset(c.b.sched)
needSync := true
@@ -622,34 +544,22 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
}
}
func (c *Context) Reserve() {
reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)
func (c *Context) Reserve() error {
if !C.ggml_backend_sched_reserve(c.b.sched, c.graph) {
C.ggml_backend_sched_reset(c.b.sched)
return errors.New("failed to reserve graph")
}
slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
for _, bt := range c.b.schedBufts {
c.b.btDeviceMemory[bt].Graph = ml.Memory{}
}
for i := range c.b.schedBackends {
bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
graph.Size += uint64(bufferStatus.size)
if bufferStatus.allocated && graph.Status != ml.Failed {
graph.Status = ml.Allocated
} else {
graph.Status = ml.Failed
}
size := C.ggml_backend_sched_get_buffer_size(c.b.sched, c.b.schedBackends[i])
slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
"size", format.HumanBytes2(uint64(bufferStatus.size)))
"size", format.HumanBytes2(uint64(size)))
}
if !reserved {
panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
}
C.ggml_backend_sched_reset(c.b.sched)
return nil
}
func (c *Context) MaxGraphNodes() int {
@@ -669,7 +579,7 @@ func pad(length, pad C.size_t) C.size_t {
return ((length + pad - 1) / pad) * pad
}
func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
if c.buft == nil {
panic("set Input or Layer before creating tensors")
}
@@ -692,7 +602,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
if len(shape) < 1 || shape[0] == 0 {
var shape C.int64_t = 0
return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil
} else if len(shape) > 4 {
panic("unsupported number of dimensions")
}
@@ -705,43 +615,40 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
if c.layer >= 0 {
cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
cache.Size += uint64(size)
if b != nil {
cache.Status = ml.Allocated
} else {
cache.Status = ml.Failed
}
}
if b == nil {
panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
}
*c.allocatedBuffers = append(*c.allocatedBuffers, b)
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
return &Tensor{b: c.b, t: t}
return &Tensor{b: c.b, t: t}, nil
}
func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
return c.newTensor(dtype, shape)
t, err := c.newTensor(dtype, shape)
if err != nil {
panic(err)
}
return t
}
func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
t := c.newTensor(dtype, shape)
t, err := c.newTensor(dtype, shape)
if err != nil {
panic(err)
}
C.ggml_set_zero(t.(*Tensor).t)
return t
}
func checkShape[S ~[]E, E any](s S, shape ...int) {
func checkShape[S ~[]E, E any](s S, shape ...int) error {
n := len(s)
if n == 0 {
return
return nil
}
for _, v := range shape {
@@ -749,32 +656,44 @@ func checkShape[S ~[]E, E any](s S, shape ...int) {
}
if n != 1 {
panic(fmt.Errorf("invalid shape: %v", shape))
return fmt.Errorf("invalid shape: %v", shape)
}
return nil
}
func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
checkShape(s, shape...)
func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
if err := checkShape(s, shape...); err != nil {
return nil, err
}
t := c.newTensor(ml.DTypeF32, shape)
t, err := c.newTensor(ml.DTypeF32, shape)
if err != nil {
return nil, err
}
if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}
return t
return t, nil
}
func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
checkShape(s, shape...)
func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
if err := checkShape(s, shape...); err != nil {
return nil, err
}
t := c.newTensor(ml.DTypeI32, shape)
t, err := c.newTensor(ml.DTypeI32, shape)
if err != nil {
return nil, err
}
if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}
return t
return t, nil
}
func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
@@ -792,7 +711,12 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
arange = append(arange, int32(i))
}
return c.Input().FromIntSlice(arange, len(arange))
t, err := c.Input().FromIntSlice(arange, len(arange))
if err != nil {
panic(err)
}
return t
default:
panic("unsupported dtype for arange")
}
@@ -943,13 +867,6 @@ func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
}
}
func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
}
}
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
b: t.b,
@@ -998,8 +915,6 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
if len(shape) != 4 {
panic("expected 4 dimensions")
} else if shape[3] != 0 {
panic("cuda does not support 4d tensors")
}
return &Tensor{
@@ -1067,13 +982,6 @@ func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
}
}
func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
}
}
func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
@@ -1145,15 +1053,28 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
}
}
func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
const (
ropeTypeNorm C.int = 0
ropeTypeNeox C.int = 2
ropeTypeMrope C.int = 8
ropeTypeVision C.int = 24
)
func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32, options ...ml.RopeOption) ml.Tensor {
// Default options
opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}
opts := &ml.RopeOptions{
OriginalContextLen: 131072,
}
// Apply any provided options
for _, option := range options {
option(opts)
}
if ropeFactors == nil {
ropeFactors = &Tensor{b: t.b}
}
dequant := t.t
if C.ggml_is_quantized(t.t._type) {
dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
@@ -1164,11 +1085,11 @@ func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase
t: C.ggml_rope_ext(
ctx.(*Context).ctx,
dequant,
positions.(*Tensor).t,
opts.Factors.(*Tensor).t,
positionIDs.(*Tensor).t,
ropeFactors.(*Tensor).t,
C.int(ropeDim),
C.int(opts.Type),
C.int(opts.OriginalContextLength),
C.int(ropeType),
C.int(opts.OriginalContextLen),
C.float(ropeBase),
C.float(ropeScale),
C.float(0.0),

View File

@@ -66,12 +66,6 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
struct ggml_allocr_buffer_status {
size_t size;
bool allocated;
};
GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);

View File

@@ -152,7 +152,6 @@ extern "C" {
struct ggml_backend_dev_props {
const char * name;
const char * description;
const char * uuid;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
@@ -305,12 +304,6 @@ extern "C" {
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
struct ggml_backend_buffer_status {
size_t size;
bool allocated;
};
GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

View File

@@ -364,7 +364,6 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers]
size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;
@@ -388,9 +387,6 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
GGML_ASSERT(galloc->buffers != NULL);
galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
GGML_ASSERT(galloc->buffer_sizes != NULL);
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL);
@@ -457,7 +453,6 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
free(galloc->buffer_sizes);
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
@@ -753,8 +748,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
bool success = true;
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer
@@ -776,20 +769,15 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
if (galloc->buffers[i]) {
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
} else {
if (galloc->buffers[i] == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
galloc->buffer_sizes[i] = new_size;
success = false;
return false;
}
} else {
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
}
}
return success;
return true;
}
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -946,24 +934,6 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}
struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
for (int i = 0; i < buffer_id; i++) {
if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
// This buffer is the same as a previous one due to the same buffer type being used multiple times
// (See above.) However, we need a different check because multiple buffers might be NULL in our
// case and we still want to know the attempted size.
struct ggml_allocr_buffer_status status = {0, true};
return status;
}
}
struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
return status;
}
// utils
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {

View File

@@ -573,16 +573,8 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
// Avoid mixed hip+cuda configurations
const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES");
if (!hip_devices && !rocr_devices) {
ggml_backend_load_best("cuda", silent, dir_path);
} else {
ggml_backend_load_best("hip", silent, dir_path);
}
ggml_backend_load_best("cuda", silent, dir_path);
ggml_backend_load_best("hip", silent, dir_path);
ggml_backend_load_best("kompute", silent, dir_path);
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);

View File

@@ -1629,16 +1629,6 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}
struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
return status;
}
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

View File

@@ -3,7 +3,7 @@ package cpu
// #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
// #cgo CXXFLAGS: -std=c++17
// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_LLAMAFILE
// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE
// #cgo linux CPPFLAGS: -D_GNU_SOURCE
// #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,arm64 LDFLAGS: -framework Accelerate

View File

@@ -2884,7 +2884,6 @@ struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
std::string uuid;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2897,11 +2896,6 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
}
static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ctx->uuid.c_str();
}
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -2916,7 +2910,6 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
props->uuid = ggml_backend_cuda_device_get_uuid(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -3465,32 +3458,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
#if !defined(GGML_USE_HIP)
char uuid[64];
snprintf(uuid, sizeof(uuid),
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
(unsigned char)prop.uuid.bytes[0],
(unsigned char)prop.uuid.bytes[1],
(unsigned char)prop.uuid.bytes[2],
(unsigned char)prop.uuid.bytes[3],
(unsigned char)prop.uuid.bytes[4],
(unsigned char)prop.uuid.bytes[5],
(unsigned char)prop.uuid.bytes[6],
(unsigned char)prop.uuid.bytes[7],
(unsigned char)prop.uuid.bytes[8],
(unsigned char)prop.uuid.bytes[9],
(unsigned char)prop.uuid.bytes[10],
(unsigned char)prop.uuid.bytes[11],
(unsigned char)prop.uuid.bytes[12],
(unsigned char)prop.uuid.bytes[13],
(unsigned char)prop.uuid.bytes[14],
(unsigned char)prop.uuid.bytes[15]
);
dev_ctx->uuid = uuid;
#else
dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
#endif
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,

View File

@@ -5703,7 +5703,6 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
props->uuid = "0";
props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = (struct ggml_backend_dev_caps) {

View File

@@ -4,6 +4,6 @@ package metal
//go:generate sh -c "{ echo // Code generated by 'go generate'. DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal"
// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
// #cgo CPPFLAGS: -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
// #cgo LDFLAGS: -framework Metal -framework MetalKit
import "C"

View File

@@ -1,21 +0,0 @@
// fast provides implementations of fast (fused) operations for increased performance.
package fast
import (
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn/rope"
)
// fastRoPE is an interface for tensors that support fast rotary positional embedding.
type fastRoPE interface {
RoPE(ctx ml.Context, positionIDs ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor
}
// RoPE applies rotary positional embedding to tensor `t`.
func RoPE(ctx ml.Context, t, positions ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor {
if t, ok := t.(fastRoPE); ok {
return t.RoPE(ctx, positions, dim, base, scale, options...)
}
panic("RoPE not implemented for this tensor type")
}

View File

@@ -1,33 +0,0 @@
package rope
import "github.com/ollama/ollama/ml"
// Options contains optional parameters for RoPE function
type Options struct {
OriginalContextLength int
Type int
Factors ml.Tensor
}
// WithOriginalContextLength sets a custom context length
func WithOriginalContextLength(n int) func(*Options) {
return func(opts *Options) {
opts.OriginalContextLength = n
}
}
// WithTypeNeoX sets RoPE type to NeoX
func WithTypeNeoX() func(*Options) {
return func(opts *Options) {
opts.Type = 2
}
}
// WithFactors sets custom rope factors
func WithFactors(factors ml.Tensor) func(*Options) {
return func(opts *Options) {
if factors != nil {
opts.Factors = factors
}
}
}
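A hedged sketch of how these options compose at a call site; the combination below is illustrative only (the model diffs in this change use the options individually), and assumes key, positions, ropeFactors, and the rope parameters are in scope:
rotated := fast.RoPE(ctx, key, positions, ropeDim, ropeBase, ropeScale,
	rope.WithTypeNeoX(),
	rope.WithFactors(ropeFactors),
	rope.WithOriginalContextLength(131072),
)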

View File

@@ -2,30 +2,16 @@ package input
import "github.com/ollama/ollama/ml"
// Multimodal is a multimodal embedding or a component of one.
// For example, it could be a row of an image that can be processed
// independently.
type Multimodal struct {
// Tensor is the embedding data. Implementations may choose what to
// store here or it may be nil if not needed. However, any ml.Tensor
// objects must be stored here and not in Data.
Tensor ml.Tensor
// Data is implementation-specific opaque data, such as metadata on how
// to layout Tensor. It may be nil if not needed. It may also store larger
// objects such as complete images if they are to be processed later.
Data any
}
// Input represents one token in the input stream
type Input struct {
// Token is a single element of text.
Token int32
// Multimodal represents a non-text element such as an
// image (or part of one if the image can be processed in pieces).
// It may be used either together with Token or on its own.
Multimodal []Multimodal
// Multimodal is opaque data representing a non-text
// element such as an image (or part of one if the image
// can be processed in pieces). It may be used either together
// with Token or on its own.
Multimodal any
// MultimodalHash is a unique representation of the data
// stored in Multimodal, used for caching and comparing
@@ -46,7 +32,7 @@ type Input struct {
// Positions slice.
type MultimodalIndex struct {
Index int
Multimodal []Multimodal
Multimodal any
}
// Batch contains the inputs for a model forward pass

View File

@@ -40,13 +40,12 @@ type MultimodalProcessor interface {
// EncodeMultimodal processes a single input (such as an image) and
// generates an output (typically an embedding) that can be used by the model.
//
// The return value is one or more tensors, each with optional model-specific
// opaque metadata. Typically, the tensors might be views into an embedding
// with each view representing a chunk of data that can be processed independently
// in different batches.
// The return value is most typically an ml.Tensor; however, different
// types are possible, such as an object containing a tensor plus
// additional metadata, a slice of tensors, or even just the original input.
//
// The result may be cached by the runner.
EncodeMultimodal(ml.Context, []byte) ([]input.Multimodal, error)
EncodeMultimodal(ml.Context, []byte) (any, error)
// PostTokenize is called after tokenization to allow the model to edit the
// input stream to correctly arrange multimodal elements.
@@ -98,8 +97,14 @@ func Register(name string, f func(fs.Config) (Model, error)) {
}
// New initializes a new model instance with the provided configuration based on the metadata in the model file
func New(modelPath string, params ml.BackendParams) (Model, error) {
b, err := ml.NewBackend(modelPath, params)
func New(ctx context.Context, modelPath string, params ml.BackendParams) (Model, error) {
r, err := os.Open(modelPath)
if err != nil {
return nil, err
}
defer r.Close()
b, err := ml.NewBackend(ctx, r, params)
if err != nil {
return nil, err
}
@@ -128,7 +133,7 @@ func NewTextProcessor(s string) (TextProcessor, error) {
return nil, err
}
defer r.Close()
meta, err := fsggml.Decode(r, -1)
meta, _, err := fsggml.Decode(r, -1)
if err != nil {
return nil, err
}
@@ -287,7 +292,11 @@ func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Ten
return nil, errors.New("batch size cannot be less than 1")
}
batch.Inputs = ctx.Input().FromIntSlice(inputs, len(inputs))
var err error
batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
if err != nil {
return nil, err
}
cache := m.Config().Cache
if cache != nil {

View File

@@ -7,8 +7,6 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
@@ -45,13 +43,10 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
// TODO: set EOT to EOS otherwise 0 will stop generation
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
},
),
Layers: make([]Layer, c.Uint("block_count")),
@@ -85,10 +80,11 @@ type SelfAttention struct {
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenState.Dim(1)
ropeType := uint32(2)
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
if opts.largeModelScaling {
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -98,7 +94,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
@@ -128,7 +124,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
}
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return fast.RoPE(ctx, key, shift, m.Options.attnKeyLen, m.Options.ropeBase, m.Options.ropeScale, rope.WithTypeNeoX()), nil
return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
}
type MLP struct {
@@ -175,8 +171,15 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))

View File

@@ -60,16 +60,12 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
EOS: int32(1),
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{
int32(c.Uint("tokenizer.ggml.eos_token_id")),
int32(c.Uint("tokenizer.ggml.eot_token_id", 106)),
},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
EOT: int32(106),
AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false),
},
),
ImageProcessor: newImageProcessor(c),
@@ -86,7 +82,7 @@ func New(c fs.Config) (model.Model, error) {
return &m, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
@@ -101,30 +97,33 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
return nil, err
}
pixelValues := ctx.Input().FromFloatSlice(f32s,
pixelValues, err := ctx.Input().FromFloatSlice(f32s,
m.ImageProcessor.imageSize,
m.ImageProcessor.imageSize,
m.ImageProcessor.numChannels,
)
if err != nil {
return nil, err
}
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
return []input.Multimodal{{Tensor: visionOutputs}}, nil
return visionOutputs, nil
}
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
for _, inp := range inputs {
if len(inp.Multimodal) == 0 {
if inp.Multimodal == nil {
result = append(result, inp)
} else {
inputMultimodal := inp.Multimodal[0].Tensor
inputMultimodal := inp.Multimodal.(ml.Tensor)
result = append(result,
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
input.Input{Token: 255999}, // "<start_of_image>"
input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
input.Input{Token: 255999}, // "<start_of_image>"
input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
)
// add image token placeholders
@@ -141,8 +140,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
}

View File

@@ -7,8 +7,6 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model/input"
)
@@ -75,6 +73,7 @@ type TextSelfAttention struct {
func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor {
batchSize := hiddenState.Dim(1)
ropeType := uint32(2)
ropeBase := opts.ropeLocalBase
if (layer+1)%gemmaGlobalCacheCount == 0 {
@@ -84,7 +83,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
q = sa.QueryNorm.Forward(ctx, q, opts.eps)
q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
if opts.largeModelScaling {
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -95,7 +94,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
k = sa.KeyNorm.Forward(ctx, k, opts.eps)
k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
@@ -113,7 +112,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
ropeBase = m.TextConfig.ropeGlobalBase
}
return fast.RoPE(ctx, key, shift, m.TextConfig.attnKeyLen, ropeBase, m.TextConfig.ropeScale, rope.WithTypeNeoX()), nil
return key.RoPE(ctx, shift, nil, uint32(m.TextConfig.attnKeyLen), uint32(2), ropeBase, m.TextConfig.ropeScale), nil
}
type TextMLP struct {
@@ -166,7 +165,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
// set image embeddings
var except []int
for _, image := range batch.Multimodal {
visionOutputs := image.Multimodal[0].Tensor
visionOutputs := image.Multimodal.(ml.Tensor)
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
for i := range visionOutputs.Dim(1) {

View File

@@ -1,23 +1,22 @@
package llama
import (
"cmp"
"fmt"
"math"
"strings"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type Options struct {
hiddenSize, numHeads, numKVHeads int
headDim, ropeDim int
eps, ropeBase, ropeScale float32
ropeDim uint32
}
type Model struct {
@@ -33,6 +32,10 @@ type Model struct {
}
func New(c fs.Config) (model.Model, error) {
if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
}
m := Model{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
@@ -40,13 +43,13 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
// TODO: set EOT to EOS otherwise 0 will stop generation
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
},
),
Layers: make([]Layer, c.Uint("block_count")),
@@ -54,11 +57,10 @@ func New(c fs.Config) (model.Model, error) {
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
headDim: int(c.Uint("attention.key_length")),
ropeDim: int(c.Uint("rope.dimension_count")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
},
}
@@ -75,31 +77,31 @@ type SelfAttention struct {
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
}
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
ropeDim := cmp.Or(opts.ropeDim, headDim)
headDim := opts.hiddenSize / opts.numHeads
ropeType := uint32(0)
query := sa.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
key := sa.Key.Forward(ctx, hiddenState)
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
scaleFactor := 1.0 / math.Sqrt(float64(headDim))
kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
return sa.Output.Forward(ctx, attention)
return sa.Output.Forward(ctx, kqv)
}
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
}
type MLP struct {
@@ -120,11 +122,11 @@ type Layer struct {
MLP *MLP
}
func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
residual := hiddenState
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
// In the final layer (outputs != nil), optimize by pruning to just the token positions
// we need logits for.
@@ -142,19 +144,27 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tenso
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
for i, layer := range m.Layers {
m.Cache.SetLayer(i)
var outputs ml.Tensor
var lastLayerOutputs ml.Tensor
if i == len(m.Layers)-1 {
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
lastLayerOutputs = outputs
}
hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)
hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options)
}
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
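One small Go idiom worth flagging in the hunks above: cmp.Or (Go 1.22+) returns its first non-zero argument, so headDim falls back to hiddenSize/numHeads when attention.key_length is absent, and ropeDim falls back to headDim. A tiny illustration with made-up numbers:
headDim := cmp.Or(0, 4096/32)  // 0 means "unset", so headDim == 128
ropeDim := cmp.Or(64, headDim) // an explicit value wins, so ropeDim == 64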

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"image"
"slices"
"sync"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
@@ -40,13 +41,13 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
// TODO: set EOT to EOS otherwise 0 will stop generation
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
},
),
ImageProcessor: newImageProcessor(c),
@@ -62,7 +63,7 @@ func New(c fs.Config) (model.Model, error) {
return &m, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
if len(m.VisionModel.Layers) < 1 {
return nil, model.ErrNoVisionModel
}
@@ -77,7 +78,10 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
return nil, err
}
tilesLocal := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
tilesLocal, err := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
if err != nil {
return nil, err
}
ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize
@@ -88,86 +92,81 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
pixelValues := tilesLocal
if len(pixelsGlobal) > 0 {
tilesGlobal := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
tilesGlobal, err := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
if err != nil {
return nil, err
}
pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3)
}
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
var multimodal []input.Multimodal
aspectRatio := image.Point{ratioW, ratioH}
var offset int
patchesPerChunk := projectedOutputs.Dim(1)
if aspectRatio.Y*aspectRatio.X > 1 {
patchesPerChunk = projectedOutputs.Dim(1) / (aspectRatio.X*aspectRatio.Y + 1)
for range aspectRatio.Y {
for x := range aspectRatio.X {
view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
projectedOutputs.Dim(0), projectedOutputs.Stride(1),
patchesPerChunk)
var separator separator
if x < aspectRatio.X-1 {
separator.x = true // <|tile_x_separator|>
} else {
separator.y = true // <|tile_y_separator|>
}
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator})
offset += patchesPerChunk
}
}
}
view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
projectedOutputs.Dim(0), projectedOutputs.Stride(1),
patchesPerChunk)
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator{}})
return multimodal, nil
return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil
}
type separator struct {
x bool
y bool
type chunks struct {
*Model
ml.Tensor
aspectRatio image.Point
dataOnce sync.Once
data []float32
}
type chunk struct {
*chunks
s, n int
}
func (r *chunk) floats() []float32 {
r.dataOnce.Do(func() {
temp := r.Backend().NewContext()
defer temp.Close()
temp.Forward(r.Tensor).Compute(r.Tensor)
r.data = r.Floats()
})
return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
}
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
for _, inp := range inputs {
if len(inp.Multimodal) == 0 {
if inp.Multimodal == nil {
result = append(result, inp)
continue
}
t := inp.Multimodal.(*chunks)
var imageInputs []input.Input
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>
for i, mm := range inp.Multimodal {
patchesPerChunk := mm.Tensor.Dim(1)
var offset int
patchesPerChunk := t.Dim(1)
if t.aspectRatio.Y*t.aspectRatio.X > 1 {
patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1)
if i < len(inp.Multimodal)-1 {
separator := mm.Data.(*separator)
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
if separator.x {
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
for range t.aspectRatio.Y {
for x := range t.aspectRatio.X {
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
if x < t.aspectRatio.X-1 {
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
}
offset += patchesPerChunk
}
if separator.y {
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
}
} else {
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
}
}
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
result = append(result, imageInputs...)
}
@@ -175,8 +174,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
}
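The chunks/chunk types above use a lazy-materialization pattern: the projected vision tensor is only computed and copied to host memory the first time a chunk's floats are requested, and sync.Once guards the computation so repeated or concurrent calls share one result. A stripped-down sketch of the same pattern, with illustrative names that are not part of the package:
type lazyFloats struct {
	once sync.Once
	data []float32
	fill func() []float32 // runs the forward pass and reads the tensor back
}

func (l *lazyFloats) get() []float32 {
	l.once.Do(func() { l.data = l.fill() })
	return l.data
}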

View File

@@ -8,8 +8,6 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model/input"
)
@@ -33,8 +31,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
if useRope {
query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
query = query.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
key = key.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
}
if opts.useQKNorm {
@@ -63,9 +61,9 @@ func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOp
}
type TextExperts struct {
Gate *nn.Linear `gguf:"ffn_gate_exps"`
Up *nn.Linear `gguf:"ffn_up_exps"`
Down *nn.Linear `gguf:"ffn_down_exps"`
Gate ml.Tensor `gguf:"ffn_gate_exps.weight"`
Up ml.Tensor `gguf:"ffn_up_exps.weight"`
Down ml.Tensor `gguf:"ffn_down_exps.weight"`
}
func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tensor, opts *TextOptions) ml.Tensor {
@@ -76,13 +74,13 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens
hiddenStates = hiddenStates.Repeat(ctx, 1, opts.numExpertsUsed)
hiddenStates = hiddenStates.Mul(ctx, scores)
upStates := e.Up.Weight.MulmatID(ctx, hiddenStates, experts)
gateStates := e.Gate.Weight.MulmatID(ctx, hiddenStates, experts)
downStates := e.Down.Weight.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts)
upStates := e.Up.MulmatID(ctx, hiddenStates, experts)
gateStates := e.Gate.MulmatID(ctx, hiddenStates, experts)
downStates := e.Down.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts)
nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
for i := 1; i < opts.numExpertsUsed; i++ {
nextStates = nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
}
return nextStates
@@ -212,7 +210,12 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
for _, mi := range batch.Multimodal {
img := mi.Multimodal[0].Tensor
f32s := mi.Multimodal.(*chunk).floats()
img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
if err != nil {
panic(err)
}
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
}
@@ -223,7 +226,11 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
}
attentionScales = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
var err error
attentionScales, err = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
if err != nil {
panic(err)
}
}
for i, layer := range m.Layers {
@@ -248,5 +255,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
}
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
return key.RoPE(ctx, shift, m.Layers[layer].Attention.RopeFactors, uint32(0), uint32(m.ropeDim), m.ropeBase, m.ropeScale), nil
}

View File

@@ -245,7 +245,10 @@ func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) {
}
}
ropeFreqs := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
ropeFreqs, err := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
if err != nil {
panic(err)
}
ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches)

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"image"
"slices"
"sync"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
@@ -31,26 +32,31 @@ var _ model.MultimodalProcessor = (*Model)(nil)
var _ model.TextProcessor = (*Model)(nil)
func New(c fs.Config) (model.Model, error) {
textModel, err := NewTextModel(c)
if err != nil {
return nil, err
}
m := &Model{
TextModel: textModel,
VisionModel: newVisionModel(c),
ImageProcessor: newImageProcessor(c),
MultiModalProjector: newMultiModalProjector(c),
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
// TODO: set EOT to EOS otherwise 0 will stop generation
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
},
),
TextModel: newTextModel(c),
VisionModel: newVisionModel(c),
ImageProcessor: newImageProcessor(c),
MultiModalProjector: newMultiModalProjector(c),
}
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
@@ -99,7 +105,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector {
}
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
@@ -114,20 +120,46 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
return nil, err
}
pixelValues := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
pixelValues, err := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
if err != nil {
return nil, err
}
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)
// split into patches to be sent to the text transformer
rows := make([]input.Multimodal, size.Y)
parent := imageFeatures{tensor: features}
rows := make([]*imageRow, size.Y)
for i := range rows {
rows[i].Tensor = features.View(ctx, features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X)
rows[i] = &imageRow{parent: &parent, s: i, shape: []int{features.Dim(0), size.X}}
}
return rows, nil
}
type imageFeatures struct {
tensor ml.Tensor
dataOnce sync.Once
data []float32
}
type imageRow struct {
parent *imageFeatures
s int
shape []int
}
func (r *imageRow) data() []float32 {
n := 1
for _, s := range r.shape {
n *= s
}
return r.parent.data[r.s*n : (r.s+1)*n]
}
// PostTokenize arranges Mistral 3's inputs for the forward pass
// In Mistral 3 and Pixtral, the input patches are arranged as follows:
// [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
@@ -136,14 +168,15 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
for _, inp := range inputs {
if len(inp.Multimodal) == 0 {
if inp.Multimodal == nil {
result = append(result, inp)
} else {
for i, row := range inp.Multimodal {
inputMultimodal := inp.Multimodal.([]*imageRow)
for i, row := range inputMultimodal {
// [IMG]
result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)})
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...)
if i == len(inp.Multimodal)-1 {
result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.shape[1]})
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.shape[1]-1)...)
if i == len(inputMultimodal)-1 {
// [IMG_END]
result = append(result, input.Input{Token: 13})
} else {
@@ -158,8 +191,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
}
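As a concrete example of the layout described in PostTokenize above: an image whose features split into 2 rows of 3 patch tokens each would be emitted as [IMG] [IMG] [IMG] [IMG_BREAK] [IMG] [IMG] [IMG] [IMG_END], with each row's features attached to the first [IMG] of that row and SameBatch set to the row width so the whole row is scheduled in a single batch.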

View File

@@ -1,24 +1,27 @@
package mistral3
import (
"cmp"
"fmt"
"math"
"strings"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type TextOptions struct {
hiddenSize, numHeads, numKVHeads int
headDim, ropeDim int
eps, ropeBase, ropeScale float32
hiddenSize, numHeads, numKVHeads, headDim int
eps, ropeBase, ropeScale float32
ropeDim uint32
}
type TextModel struct {
model.Base
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []Layer `gguf:"blk"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
@@ -36,15 +39,19 @@ type SelfAttention struct {
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
ropeType := uint32(0)
headDim := opts.headDim
if headDim == 0 {
headDim = opts.hiddenSize / opts.numHeads
}
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -55,7 +62,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
}
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale), nil
return key.RoPE(ctx, shift, nil, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
}
type MLP struct {
@@ -102,7 +109,20 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
// image embeddings
for _, image := range batch.Multimodal {
imageFeature := image.Multimodal[0].Tensor
row := image.Multimodal.(*imageRow)
row.parent.dataOnce.Do(func() {
// use a new, throwaway context so the image tensor is not added to the graph
temp := m.Backend().NewContext()
temp.Forward(row.parent.tensor).Compute(row.parent.tensor)
row.parent.data = row.parent.tensor.Floats()
temp.Close()
})
imageFeature, err := ctx.Input().FromFloatSlice(row.data(), row.shape...)
if err != nil {
panic(err)
}
ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1))))
}
@@ -121,18 +141,24 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
return m.Output.Forward(ctx, hiddenState)
}
func newTextModel(c fs.Config) *TextModel {
return &TextModel{
func NewTextModel(c fs.Config) (*TextModel, error) {
if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
}
textModel := &TextModel{
Layers: make([]Layer, c.Uint("block_count")),
TextOptions: &TextOptions{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
headDim: int(c.Uint("attention.key_length")),
ropeDim: int(c.Uint("rope.dimension_count")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
},
}
return textModel, nil
}

View File

@@ -110,8 +110,15 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor)
}
}
h := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
w := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
h, err := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
if err != nil {
panic(err)
}
w, err := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
if err != nil {
panic(err)
}
h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
@@ -144,7 +151,10 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
}
}
positionIDs := ctx.Input().FromIntSlice(positions, len(positions))
positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
if err != nil {
panic(err)
}
positionEmbedding := m.positionalEmbedding(ctx, positionIDs)
cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
@@ -160,7 +170,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
func newVisionModel(c fs.Config) *VisionModel {
return &VisionModel{
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 24)),
VisionModelOptions: &VisionModelOptions{
hiddenSize: int(c.Uint("vision.embedding_length", 1024)),
numHeads: int(c.Uint("vision.attention.head_count", 16)),

View File

@@ -3,7 +3,6 @@ package mllama
import (
"bytes"
"image"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
@@ -38,13 +37,13 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
// TODO: set EOT to EOS otherwise 0 will stop generation
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
},
),
ImageProcessor: newImageProcessor(c),
@@ -59,7 +58,7 @@ func New(c fs.Config) (model.Model, error) {
return &m, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
@@ -74,20 +73,21 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
return nil, err
}
if ratio.numTiles() < m.maxNumTiles {
// Pad tiles to maxNumTiles
f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles)
f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
if err != nil {
return nil, err
}
pixelValues := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
aspectRatio := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
if err != nil {
return nil, err
}
positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
projectedOutputs := m.Projector.Forward(ctx, crossAttentionStates)
return []input.Multimodal{{Tensor: projectedOutputs}}, nil
return m.Projector.Forward(ctx, crossAttentionStates), nil
}
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
@@ -103,11 +103,18 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
var crossAttentionStates ml.Tensor
if len(batch.Multimodal) > 0 {
crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor
crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor)
}
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
// TODO: attention mask, cross attention mask
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil

View File

@@ -8,8 +8,6 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
)
type TextSelfAttention struct {
@@ -23,14 +21,15 @@ type TextSelfAttention struct {
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
ropeType := uint32(0)
query := sa.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
key := sa.Key.Forward(ctx, hiddenState)
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -45,7 +44,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
// This will only get called for layers in the cache, which are just the self attention layers
if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
}
return key, nil
@@ -200,8 +199,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
type TextModelOptions struct {
hiddenSize, numHeads, numKVHeads int
ropeDim int
eps, ropeBase, ropeScale float32
ropeDim uint32
crossAttentionLayers []int32
}
@@ -241,10 +240,10 @@ func newTextModel(c fs.Config) *TextModel {
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
ropeDim: int(c.Uint("rope.dimension_count")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
},
}
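The hunk above only swaps the call site used to apply rotary position embeddings (fast.RoPE with rope.WithFactors versus the tensor method RoPE with an explicit rope type); the operation itself is unchanged. As a reference, here is a minimal sketch of the interleaved RoPE rotation for a single head, in plain Go with float64 slices. The function name and the even-head-dimension assumption are illustrative only, and NeoX-style RoPE (used by the qwen models elsewhere in this diff) rotates the two halves of the vector instead of adjacent pairs.

```go
package main

import (
	"fmt"
	"math"
)

// applyRoPE rotates consecutive dimension pairs (i, i+1) of a head vector by
// an angle that depends on the token position and the pair index.
// base is the frequency base (e.g. 10000, matching rope.freq_base).
// Assumes an even head dimension.
func applyRoPE(x []float64, pos int, base float64) []float64 {
	d := len(x)
	out := make([]float64, d)
	for i := 0; i < d; i += 2 {
		theta := math.Pow(base, -float64(i)/float64(d)) // shared by the pair
		angle := float64(pos) * theta
		cos, sin := math.Cos(angle), math.Sin(angle)
		out[i] = x[i]*cos - x[i+1]*sin
		out[i+1] = x[i]*sin + x[i+1]*cos
	}
	return out
}

func main() {
	q := []float64{1, 0, 1, 0}
	// The same vector at different positions rotates differently, which is
	// what lets attention scores encode relative distance.
	fmt.Println(applyRoPE(q, 0, 10000))
	fmt.Println(applyRoPE(q, 5, 10000))
}
```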

View File

@@ -16,6 +16,8 @@ type VisionSelfAttention struct {
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
Gate ml.Tensor `gguf:"attn_gate"`
}
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
@@ -23,16 +25,27 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
query := sa.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
key := sa.Key.Forward(ctx, hiddenState)
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), nil)
scores := key.Mulmat(ctx, query)
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
scores = scores.Softmax(ctx)
attention := value.Mulmat(ctx, scores)
attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
return sa.Output.Forward(ctx, attention)
hiddenState = sa.Output.Forward(ctx, attention)
return hiddenState
}
type VisionMLP struct {
@@ -63,18 +76,21 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
// self attention
hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
if e.AttentionGate != nil {
hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
}
hiddenState = hiddenState.Add(ctx, residual)
residual = hiddenState
// feed forward
hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
hiddenState = hiddenState.Add(ctx, residual)
if e.MLPGate != nil {
hiddenState = hiddenState.Mul(ctx, e.MLPGate)
}
hiddenState = hiddenState.Add(ctx, residual)
return hiddenState
}
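One side of the vision self-attention hunk expands the nn.Attention helper into an explicit Mulmat / Scale / Softmax / Mulmat sequence. The sketch below shows the same scaled dot-product attention for a single head with plain Go slices, without masking or batching; it illustrates the math, not the ggml tensor API.

```go
package main

import (
	"fmt"
	"math"
)

// softmax normalizes a row of scores in place.
func softmax(row []float64) {
	max := row[0]
	for _, v := range row {
		if v > max {
			max = v
		}
	}
	sum := 0.0
	for i, v := range row {
		row[i] = math.Exp(v - max)
		sum += row[i]
	}
	for i := range row {
		row[i] /= sum
	}
}

// attention computes softmax(Q K^T / sqrt(d)) V for a single head.
// q, k, v are [seq][headDim] matrices.
func attention(q, k, v [][]float64) [][]float64 {
	d := float64(len(q[0]))
	out := make([][]float64, len(q))
	for i := range q {
		scores := make([]float64, len(k))
		for j := range k {
			for c := range q[i] {
				scores[j] += q[i][c] * k[j][c]
			}
			scores[j] /= math.Sqrt(d)
		}
		softmax(scores)
		out[i] = make([]float64, len(v[0]))
		for j := range v {
			for c := range v[j] {
				out[i][c] += scores[j] * v[j][c]
			}
		}
	}
	return out
}

func main() {
	q := [][]float64{{1, 0}, {0, 1}}
	k := [][]float64{{1, 0}, {0, 1}}
	v := [][]float64{{1, 2}, {3, 4}}
	fmt.Println(attention(q, k, v))
}
```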

View File

@@ -7,7 +7,5 @@ import (
_ "github.com/ollama/ollama/model/models/llama4"
_ "github.com/ollama/ollama/model/models/mistral3"
_ "github.com/ollama/ollama/model/models/mllama"
_ "github.com/ollama/ollama/model/models/qwen2"
_ "github.com/ollama/ollama/model/models/qwen25vl"
_ "github.com/ollama/ollama/model/models/qwen3"
)

View File

@@ -1,164 +0,0 @@
package qwen2
import (
"cmp"
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type Options struct {
hiddenSize, numHeads, numKVHeads int
headDim, ropeDim int
eps, ropeBase, ropeScale float32
}
type Attention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
}
func (attn Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenStates.Dim(1)
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
ropeDim := cmp.Or(opts.ropeDim, headDim)
query := attn.Query.Forward(ctx, hiddenStates)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
key := attn.Key.Forward(ctx, hiddenStates)
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
value := attn.Value.Forward(ctx, hiddenStates)
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
return attn.Output.Forward(ctx, attention)
}
type MLP struct {
Gate *nn.Linear `gguf:"ffn_gate"`
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (mlp MLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
return mlp.Down.Forward(ctx, hiddenStates)
}
type DecoderLayer struct {
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
Attention *Attention
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
MLP *MLP
}
func (d DecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
residual := hiddenStates
hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
if outputs != nil {
hiddenStates = hiddenStates.Rows(ctx, outputs)
residual = residual.Rows(ctx, outputs)
}
hiddenStates = hiddenStates.Add(ctx, residual)
residual = hiddenStates
hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = d.MLP.Forward(ctx, hiddenStates)
return hiddenStates.Add(ctx, residual)
}
type Model struct {
model.Base
model.BytePairEncoding
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []DecoderLayer `gguf:"blk"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
Options
}
// Forward implements model.Model.
func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
for i, layer := range m.Layers {
m.Cache.SetLayer(i)
var outputs ml.Tensor
if i == len(m.Layers)-1 {
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
}
hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, &m.Options)
}
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
hiddenStates = m.Output.Forward(ctx, hiddenStates)
return hiddenStates, nil
}
func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
}
func New(c fs.Config) (model.Model, error) {
m := Model{
Layers: make([]DecoderLayer, c.Uint("block_count")),
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
Options: Options{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
headDim: int(c.Uint("attention.key_length")),
ropeDim: int(c.Uint("rope.dimension_count")),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
eps: c.Float("attention.layer_norm_rms_epsilon"),
},
}
m.Cache = kvcache.NewCausalCache(m.Shift)
return &m, nil
}
func init() {
model.Register("qwen2", New)
}
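The qwen2 MLP in this file computes down(silu(gate(x)) * up(x)). A minimal scalar version of that SwiGLU feed-forward block, with hypothetical weight matrices, looks like this:

```go
package main

import (
	"fmt"
	"math"
)

// silu is x * sigmoid(x), the activation applied to the gate projection.
func silu(x float64) float64 {
	return x / (1 + math.Exp(-x))
}

// matVec multiplies a [out][in] weight matrix by an input vector.
func matVec(w [][]float64, x []float64) []float64 {
	out := make([]float64, len(w))
	for i, row := range w {
		for j, v := range row {
			out[i] += v * x[j]
		}
	}
	return out
}

// swigluMLP mirrors the gate/up/down structure: down(silu(gate(x)) * up(x)).
func swigluMLP(gate, up, down [][]float64, x []float64) []float64 {
	g := matVec(gate, x)
	u := matVec(up, x)
	h := make([]float64, len(g))
	for i := range g {
		h[i] = silu(g[i]) * u[i]
	}
	return matVec(down, h)
}

func main() {
	gate := [][]float64{{1, 0}, {0, 1}}
	up := [][]float64{{1, 1}, {1, -1}}
	down := [][]float64{{0.5, 0.5}}
	fmt.Println(swigluMLP(gate, up, down, []float64{2, 3}))
}
```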

View File

@@ -5,6 +5,7 @@ import (
"fmt"
"image"
"slices"
"sync"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
@@ -34,13 +35,12 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
},
),
TextModel: NewTextModel(c),
@@ -69,12 +69,15 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *
m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
numPatches := grid.Temporal * grid.Height * grid.Width
pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
pixelValues, err := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
if err != nil {
return nil, nil, fmt.Errorf("failed to create tensor from image: %w", err)
}
return pixelValues, grid, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
@@ -85,7 +88,31 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
}
visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
return []input.Multimodal{{Tensor: visionOutputs}}, nil
return &chunks{Model: m, Tensor: visionOutputs}, nil
}
type chunks struct {
*Model
ml.Tensor
dataOnce sync.Once
data []float32
}
type chunk struct {
*chunks
s, n int
}
func (r *chunk) floats() []float32 {
r.dataOnce.Do(func() {
temp := r.Backend().NewContext()
defer temp.Close()
temp.Forward(r.Tensor).Compute(r.Tensor)
r.data = r.Floats()
})
return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
}
// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
@@ -115,15 +142,18 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
result = append(result, input.Input{Token: pre[i]})
}
patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
// This is an image token with multimodal data
chunksData := inp.Multimodal.(*chunks)
patchesPerChunk := chunksData.Dim(1)
// First add the vision start token
result = append(result, input.Input{Token: visionStartToken})
result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 2})
// Add the image token with the multimodal tensor data at the first position
// Create a chunk with proper s and n values
result = append(result, input.Input{
Token: imageToken,
Multimodal: inp.Multimodal,
Multimodal: &chunk{chunks: chunksData, s: 0, n: patchesPerChunk},
MultimodalHash: inp.MultimodalHash,
SameBatch: patchesPerChunk,
})
@@ -139,8 +169,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
}
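The chunks/chunk types above defer computing the vision tensor's float data until the text model actually asks for a slice of it, guarding the computation with sync.Once. The pattern in isolation, with a stand-in compute function, is roughly:

```go
package main

import (
	"fmt"
	"sync"
)

// lazyFloats computes an expensive result at most once, the first time any
// consumer asks for a slice of it; a simplified picture of the dataOnce/data
// fields in the chunk type above.
type lazyFloats struct {
	once    sync.Once
	data    []float32
	compute func() []float32
}

func (l *lazyFloats) slice(start, n int) []float32 {
	l.once.Do(func() {
		l.data = l.compute()
	})
	return l.data[start : start+n]
}

func main() {
	calls := 0
	l := &lazyFloats{compute: func() []float32 {
		calls++
		return []float32{1, 2, 3, 4, 5, 6}
	}}
	fmt.Println(l.slice(0, 3)) // triggers the computation
	fmt.Println(l.slice(3, 3)) // reuses the cached result
	fmt.Println("compute calls:", calls)
}
```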

View File

@@ -7,15 +7,13 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model/input"
)
type TextOptions struct {
hiddenSize, numHeads, numKVHeads int
ropeDim, originalContextLength int
eps, ropeBase, ropeScale float32
ctxLen, hiddenSize, numHeads, numKVHeads int
eps, ropeBase, ropeScale float32
ropeDim, defaultContextLen uint32
}
type TextModel struct {
@@ -31,14 +29,15 @@ func NewTextModel(c fs.Config) *TextModel {
m := TextModel{
Layers: make([]Layer, c.Uint("block_count")),
TextOptions: &TextOptions{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
ropeDim: int(c.Uint("rope.dimension_count", 128)),
originalContextLength: int(c.Uint("context_length", 128000)),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ctxLen: int(c.Uint("context_length")),
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count", 128),
defaultContextLen: c.Uint("context_length", 128000),
},
}
@@ -60,11 +59,11 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -78,7 +77,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
// Shift applies rotary position embeddings to the key tensor for causal attention caching
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
return key.RoPE(ctx, shift, nil, m.ropeDim, 2, m.ropeBase, m.ropeScale, ml.WithContextLen(m.defaultContextLen)), nil
}
// MLP implements the feed-forward network component with SwiGLU activation
@@ -130,7 +129,12 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
for _, mi := range batch.Multimodal {
img := mi.Multimodal[0].Tensor
f32s := mi.Multimodal.(*chunk).floats()
img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
if err != nil {
panic(err)
}
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
}

View File

@@ -1,6 +1,7 @@
package qwen25vl
import (
"fmt"
"math"
"slices"
@@ -43,8 +44,10 @@ func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int
}
}
mask := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
mask, err := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
if err != nil {
panic(err)
}
// Reshape to match [seqLength, seqLength, 1] for broadcasting
mask = mask.Reshape(ctx, seqLength, seqLength, 1)
@@ -300,7 +303,10 @@ func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int)
}
}
t := ctx.Input().FromIntSlice(index, len(index))
t, err := ctx.Input().FromIntSlice(index, len(index))
if err != nil {
panic(err)
}
return t, bounds
}
@@ -320,7 +326,10 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim)))
}
}
freqs := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
freqs, err := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
if err != nil {
panic(fmt.Errorf("failed to create tensor from frequencies: %w", err))
}
// Create position coordinates (y,x pairs) for the grid
// In PyTorch: Equivalent to generating position ids with torch.arange()
@@ -330,7 +339,10 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
coords = append(coords, int32(y), int32(x))
}
}
pos := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
pos, err := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
if err != nil {
panic(fmt.Errorf("failed to create tensor from positions: %w", err))
}
// Reshape and permute positions to match spatial merging pattern
pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge)
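blockDiagonalMask above builds a seqLength by seqLength additive mask so window attention only sees tokens in the same window. A self-contained sketch of that construction, assuming bounds are cumulative window end indices (the real code's bookkeeping may differ), is:

```go
package main

import (
	"fmt"
	"math"
)

// blockDiagonalMask builds a flat seqLen*seqLen mask where entry (i, j) is 0
// if i and j fall in the same window and -Inf otherwise, so softmax zeroes
// cross-window attention. bounds are cumulative window end indices,
// e.g. []int{4, 7} for windows [0,4) and [4,7).
func blockDiagonalMask(seqLen int, bounds []int) []float32 {
	mask := make([]float32, seqLen*seqLen)
	negInf := float32(math.Inf(-1))
	for i := range mask {
		mask[i] = negInf
	}
	start := 0
	for _, end := range bounds {
		for i := start; i < end; i++ {
			for j := start; j < end; j++ {
				mask[i*seqLen+j] = 0
			}
		}
		start = end
	}
	return mask
}

func main() {
	m := blockDiagonalMask(4, []int{2, 4})
	for i := 0; i < 4; i++ {
		fmt.Println(m[i*4 : i*4+4])
	}
}
```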

View File

@@ -1,233 +0,0 @@
package qwen3
import (
"cmp"
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type Options struct {
hiddenSize, numHeads, numKVHeads int
eps float32
ropeBase, ropeScale float32
keyLength, valueLength int
numExperts, numExpertsUsed int
normTopKProb bool
}
func (o Options) headDim() int {
return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
}
type Attention struct {
QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
Query *nn.Linear `gguf:"attn_q"`
KeyNorm *nn.RMSNorm `gguf:"attn_k_norm"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
}
func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenStates.Dim(1)
query := sa.Query.Forward(ctx, hiddenStates)
key := sa.Key.Forward(ctx, hiddenStates)
value := sa.Value.Forward(ctx, hiddenStates)
query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
query = sa.QueryNorm.Forward(ctx, query, opts.eps)
key = sa.KeyNorm.Forward(ctx, key, opts.eps)
query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
return sa.Output.Forward(ctx, attention)
}
type MLP interface {
Forward(ml.Context, ml.Tensor, *Options) ml.Tensor
}
type sparse struct {
Router *nn.Linear `gguf:"ffn_gate_inp"`
Gate *nn.Linear `gguf:"ffn_gate_exps"`
Up *nn.Linear `gguf:"ffn_up_exps"`
Down *nn.Linear `gguf:"ffn_down_exps"`
}
func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
routerLogits := mlp.Router.Forward(ctx, hiddenStates)
routingWeights := routerLogits.Softmax(ctx)
selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts)
if opts.normTopKProb {
routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
}
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
upStates := mlp.Up.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
hiddenStates = mlp.Gate.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
hiddenStates = hiddenStates.SILU(ctx)
hiddenStates = hiddenStates.Mul(ctx, upStates)
experts := mlp.Down.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
experts = experts.Mul(ctx, routingWeights)
nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
for i := 1; i < opts.numExpertsUsed; i++ {
nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
}
return nextStates
}
type dense struct {
Gate *nn.Linear `gguf:"ffn_gate"`
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *Options) ml.Tensor {
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
return mlp.Down.Forward(ctx, hiddenStates)
}
type Layer struct {
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
*Attention
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
MLP
}
func (d *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
residual := hiddenStates
hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
if outputs != nil {
hiddenStates = hiddenStates.Rows(ctx, outputs)
residual = residual.Rows(ctx, outputs)
}
hiddenStates = hiddenStates.Add(ctx, residual)
residual = hiddenStates
hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = d.MLP.Forward(ctx, hiddenStates, opts)
return hiddenStates.Add(ctx, residual)
}
type Model struct {
model.Base
model.BytePairEncoding
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
Layers []Layer `gguf:"blk"`
*Options
}
// Forward implements model.Model.
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
for i, layer := range m.Layers {
m.Cache.SetLayer(i)
var outputs ml.Tensor
if i == len(m.Layers)-1 {
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
}
hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options)
}
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
return m.Output.Forward(ctx, hiddenStates), nil
}
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
}
var _ model.Model = (*Model)(nil)
func New(c fs.Config) (model.Model, error) {
layers := make([]Layer, c.Uint("block_count"))
for i := range layers {
if c.String("general.architecture") == "qwen3moe" {
layers[i].MLP = &sparse{}
} else {
layers[i].MLP = &dense{}
}
}
m := Model{
BytePairEncoding: model.NewBytePairEncoding(
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
Layers: layers,
Options: &Options{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
keyLength: int(c.Uint("attention.key_length")),
valueLength: int(c.Uint("attention.value_length")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
numExperts: int(c.Uint("expert_count")),
numExpertsUsed: int(c.Uint("expert_used_count")),
normTopKProb: c.Bool("norm_top_k_prob", true),
},
}
m.Cache = kvcache.NewCausalCache(m.Shift)
return &m, nil
}
func init() {
model.Register("qwen3", New)
model.Register("qwen3moe", New)
}
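The sparse MLP in this file routes each token through its top-k experts: softmax over the router logits, TopK selection, optional renormalization of the selected weights, then a weighted sum of the expert outputs. A per-token sketch of the routing step, ignoring the batched MulmatID calls, could look like this:

```go
package main

import (
	"fmt"
	"math"
	"sort"
)

// routeTopK picks the k highest-probability experts for one token and returns
// their indices and (optionally renormalized) weights, mirroring the
// router/TopK/normTopKProb steps in the sparse MLP.
func routeTopK(logits []float64, k int, normalize bool) ([]int, []float64) {
	// softmax over all experts
	max := logits[0]
	for _, v := range logits {
		if v > max {
			max = v
		}
	}
	probs := make([]float64, len(logits))
	sum := 0.0
	for i, v := range logits {
		probs[i] = math.Exp(v - max)
		sum += probs[i]
	}
	for i := range probs {
		probs[i] /= sum
	}

	// sort expert indices by probability and keep the top k
	idx := make([]int, len(probs))
	for i := range idx {
		idx[i] = i
	}
	sort.Slice(idx, func(a, b int) bool { return probs[idx[a]] > probs[idx[b]] })
	idx = idx[:k]

	weights := make([]float64, k)
	total := 0.0
	for i, e := range idx {
		weights[i] = probs[e]
		total += probs[e]
	}
	if normalize {
		for i := range weights {
			weights[i] /= total
		}
	}
	return idx, weights
}

func main() {
	experts, weights := routeTopK([]float64{0.1, 2.0, -1.0, 1.5}, 2, true)
	fmt.Println(experts, weights)
}
```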

View File

@@ -3,16 +3,118 @@ package model
import (
"cmp"
"context"
"fmt"
"iter"
"log/slog"
"slices"
"strings"
"sync"
"github.com/dlclark/regexp2"
heap "github.com/emirpasic/gods/v2/trees/binaryheap"
"github.com/ollama/ollama/logutil"
)
type Special int32
const (
SpecialBOS Special = iota
SpecialEOS
)
const (
TOKEN_TYPE_NORMAL = iota + 1
TOKEN_TYPE_UNKNOWN
TOKEN_TYPE_CONTROL
TOKEN_TYPE_USER_DEFINED
TOKEN_TYPE_UNUSED
TOKEN_TYPE_BYTE
)
type TextProcessor interface {
Encode(s string, addSpecial bool) ([]int32, error)
Decode([]int32) (string, error)
Is(int32, Special) bool
Vocabulary() *Vocabulary
}
type Vocabulary struct {
Values []string
Types []int32
Scores []float32
Merges []string
BOS, EOS, EOT int32
AddBOS, AddEOS, AddEOT bool
specialOnce sync.Once
special []string
valuesOnce sync.Once
values map[string]int32
mergeOnce sync.Once
merge map[string]int32
}
func (v *Vocabulary) Is(id int32, special Special) bool {
switch special {
case SpecialBOS:
return id == v.BOS
case SpecialEOS:
return id == v.EOS || id == v.EOT
default:
return false
}
}
func (v *Vocabulary) Encode(s string) int32 {
v.valuesOnce.Do(func() {
v.values = make(map[string]int32, len(v.Values))
for i, value := range v.Values {
v.values[value] = int32(i)
}
})
if id, ok := v.values[s]; ok {
return id
}
return -1
}
func (v *Vocabulary) Decode(id int32) string {
return v.Values[id]
}
func (v *Vocabulary) SpecialVocabulary() []string {
v.specialOnce.Do(func() {
for i := range v.Values {
if slices.Contains([]int{105, 106}, i) {
v.special = append(v.special, v.Values[i])
} else if v.Types[i] == TOKEN_TYPE_CONTROL {
v.special = append(v.special, v.Values[i])
}
}
})
return v.special
}
func (v *Vocabulary) Merge(left, right string) int {
v.mergeOnce.Do(func() {
v.merge = make(map[string]int32, len(v.Merges))
for i, merge := range v.Merges {
v.merge[merge] = int32(i)
}
})
if id, ok := v.merge[left+" "+right]; ok {
return int(id)
}
return -1
}
type BytePairEncoding struct {
pre *regexp2.Regexp
vocab *Vocabulary
@@ -202,23 +304,30 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
}
}
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
if addSpecial && len(ids) > 0 {
ids = bpe.vocab.addSpecials(ids)
if bpe.vocab.AddBOS {
if ids[0] == bpe.vocab.BOS {
slog.Warn("adding bos token to prompt which already has it", "id", bpe.vocab.BOS)
}
slog.Debug("adding bos token to prompt", "id", bpe.vocab.BOS)
ids = append([]int32{bpe.vocab.BOS}, ids...)
}
if bpe.vocab.AddEOS {
if ids[len(ids)-1] == bpe.vocab.EOS {
slog.Warn("adding eos token to prompt which already has it", "id", bpe.vocab.EOS)
}
slog.Debug("adding eos token to prompt", "id", bpe.vocab.EOS)
ids = append(ids, bpe.vocab.EOS)
}
}
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
return ids, nil
}
type lazyIdsString struct {
ids []int32
}
func (l lazyIdsString) LogValue() slog.Value {
return slog.AnyValue(fmt.Sprint(l.ids))
}
func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
var sb strings.Builder
for _, id := range ids {
@@ -243,6 +352,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
}
}
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String(), "from", lazyIdsString{ids: ids})
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
return sb.String(), nil
}
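Vocabulary.Merge above looks up the rank of an adjacent token pair using a "left right" key; lower ranks merge earlier. The sketch below shows that lookup driving a naive greedy merge loop (the real encoder uses a binary heap to the same effect); the example merges table is made up.

```go
package main

import "fmt"

// mergeRank returns the priority of merging two adjacent tokens, or -1 if the
// pair is not in the merges table; lower rank merges first. This mirrors
// Vocabulary.Merge's "left right" key lookup.
func mergeRank(ranks map[string]int, left, right string) int {
	if r, ok := ranks[left+" "+right]; ok {
		return r
	}
	return -1
}

// bpeMerge repeatedly merges the lowest-ranked adjacent pair until no
// mergeable pair remains; a simplified picture of the heap-driven loop in the
// BPE encoder.
func bpeMerge(tokens []string, ranks map[string]int) []string {
	for {
		best, bestRank := -1, -1
		for i := 0; i+1 < len(tokens); i++ {
			if r := mergeRank(ranks, tokens[i], tokens[i+1]); r >= 0 && (bestRank < 0 || r < bestRank) {
				best, bestRank = i, r
			}
		}
		if best < 0 {
			return tokens
		}
		merged := tokens[best] + tokens[best+1]
		tokens = append(tokens[:best+1], tokens[best+2:]...)
		tokens[best] = merged
	}
}

func main() {
	ranks := map[string]int{"h e": 0, "he llo": 1, "l lo": 2, "l o": 3}
	fmt.Println(bpeMerge([]string{"h", "e", "l", "l", "o"}, ranks)) // [hello]
}
```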

View File

@@ -182,12 +182,27 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error)
}
}
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
if addSpecial && len(ids) > 0 {
ids = spm.vocab.addSpecials(ids)
if spm.vocab.AddBOS {
if ids[0] == spm.vocab.BOS {
slog.Warn("adding bos token to prompt which already has it", "id", spm.vocab.BOS)
}
slog.Debug("adding bos token to prompt", "id", spm.vocab.BOS)
ids = append([]int32{spm.vocab.BOS}, ids...)
}
if spm.vocab.AddEOS {
if ids[len(ids)-1] == spm.vocab.EOS {
slog.Warn("adding eos token to prompt which already has it", "id", spm.vocab.EOS)
}
slog.Debug("adding eos token to prompt", "id", spm.vocab.EOS)
ids = append(ids, spm.vocab.EOS)
}
}
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
return ids, nil
}
@@ -246,6 +261,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
}
}
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
return sb.String(), nil
}

View File

@@ -1,17 +0,0 @@
package model
const (
TOKEN_TYPE_NORMAL = iota + 1
TOKEN_TYPE_UNKNOWN
TOKEN_TYPE_CONTROL
TOKEN_TYPE_USER_DEFINED
TOKEN_TYPE_UNUSED
TOKEN_TYPE_BYTE
)
type TextProcessor interface {
Encode(s string, addSpecial bool) ([]int32, error)
Decode([]int32) (string, error)
Is(int32, Special) bool
Vocabulary() *Vocabulary
}

View File

@@ -1,112 +0,0 @@
package model
import (
"log/slog"
"slices"
"sync"
)
type Special int32
const (
SpecialBOS Special = iota
SpecialEOS
)
type Vocabulary struct {
Values []string
Types []int32
Scores []float32
Merges []string
BOS, EOS []int32
AddBOS, AddEOS bool
specialOnce sync.Once
special []string
valuesOnce sync.Once
values map[string]int32
mergeOnce sync.Once
merge map[string]int32
}
func (v *Vocabulary) Is(id int32, special Special) bool {
switch special {
case SpecialBOS:
return slices.Contains(v.BOS, id)
case SpecialEOS:
return slices.Contains(v.EOS, id)
default:
return false
}
}
func (v *Vocabulary) addSpecials(ids []int32) []int32 {
if v.AddBOS && len(v.BOS) > 0 {
if slices.Contains(v.BOS, ids[0]) {
slog.Warn("adding bos token to prompt which already has it", "id", v.BOS)
}
slog.Debug("adding bos token to prompt", "id", v.BOS)
ids = append([]int32{v.BOS[0]}, ids...)
}
if v.AddEOS && len(v.EOS) > 0 {
if slices.Contains(v.BOS, ids[len(ids)-1]) {
slog.Warn("adding eos token to prompt which already has it", "id", v.EOS)
}
slog.Debug("adding eos token to prompt", "id", v.EOS)
ids = append(ids, v.EOS[0])
}
return ids
}
func (v *Vocabulary) Encode(s string) int32 {
v.valuesOnce.Do(func() {
v.values = make(map[string]int32, len(v.Values))
for i, value := range v.Values {
v.values[value] = int32(i)
}
})
if id, ok := v.values[s]; ok {
return id
}
return -1
}
func (v *Vocabulary) Decode(id int32) string {
return v.Values[id]
}
func (v *Vocabulary) SpecialVocabulary() []string {
v.specialOnce.Do(func() {
for i := range v.Values {
if v.Types[i] == TOKEN_TYPE_CONTROL || v.Types[i] == TOKEN_TYPE_USER_DEFINED {
v.special = append(v.special, v.Values[i])
}
}
})
return v.special
}
func (v *Vocabulary) Merge(left, right string) int {
v.mergeOnce.Do(func() {
v.merge = make(map[string]int32, len(v.Merges))
for i, merge := range v.Merges {
v.merge[merge] = int32(i)
}
})
if id, ok := v.merge[left+" "+right]; ok {
return int(id)
}
return -1
}
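Both tokenizer hunks add or inline the same BOS/EOS handling: prepend BOS and append EOS when configured, warning if the prompt already carries them. A single-id simplification of that logic:

```go
package main

import (
	"fmt"
	"log/slog"
)

// addSpecials prepends a BOS id and appends an EOS id when requested, warning
// when the prompt already starts or ends with that token; a single-id
// simplification of the Vocabulary.addSpecials method above.
func addSpecials(ids []int32, bos, eos int32, addBOS, addEOS bool) []int32 {
	if addBOS && len(ids) > 0 {
		if ids[0] == bos {
			slog.Warn("prompt already starts with bos", "id", bos)
		}
		ids = append([]int32{bos}, ids...)
	}
	if addEOS && len(ids) > 0 {
		if ids[len(ids)-1] == eos {
			slog.Warn("prompt already ends with eos", "id", eos)
		}
		ids = append(ids, eos)
	}
	return ids
}

func main() {
	fmt.Println(addSpecials([]int32{5, 6, 7}, 1, 2, true, true)) // [1 5 6 7 2]
}
```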

Some files were not shown because too many files have changed in this diff.