gofmt

pr feedback
server: cache gguf model capabilities rather than reading from disk
2026-02-23 18:46:44 -05:00 · 2025-06-16 16:34:46 -07:00 · 2025-06-16 16:08:38 -07:00 · 2025-06-16 15:17:36 -07:00 · 2025-06-16 15:17:02 -07:00 · 2025-06-16 15:16:58 -07:00
137 changed files with 1712 additions and 140398 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -23,7 +23,7 @@ jobs:
          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT

  darwin-build:
-    runs-on: macos-13-xlarge
+    runs-on: macos-13
    environment: release
    needs: setup-environment
    strategy:
@@ -54,6 +54,48 @@ jobs:
          name: build-${{ matrix.os }}-${{ matrix.arch }}
          path: dist/*

+  darwin-sign:
+    runs-on: macos-13
+    environment: release
+    needs: darwin-build
+    steps:
+      - uses: actions/checkout@v4
+      - run: |
+          echo $MACOS_SIGNING_KEY | base64 --decode > certificate.p12
+          security create-keychain -p password build.keychain
+          security default-keychain -s build.keychain
+          security unlock-keychain -p password build.keychain
+          security import certificate.p12 -k build.keychain -P $MACOS_SIGNING_KEY_PASSWORD -T /usr/bin/codesign
+          security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k password build.keychain
+          security set-keychain-settings -lut 3600 build.keychain
+        env:
+          MACOS_SIGNING_KEY: ${{ secrets.MACOS_SIGNING_KEY }}
+          MACOS_SIGNING_KEY_PASSWORD: ${{ secrets.MACOS_SIGNING_KEY_PASSWORD }}
+      - uses: actions/download-artifact@v4
+        with:
+          name: build-darwin-amd64
+          path: dist/darwin-amd64
+      - uses: actions/download-artifact@v4
+        with:
+          name: build-darwin-arm64
+          path: dist/darwin-arm64
+      - run: |
+          export VERSION=${GITHUB_REF_NAME#v}
+          ./scripts/build_darwin.sh sign macapp
+        env:
+          APPLE_IDENTITY: ${{ secrets.APPLE_IDENTITY }}
+          APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
+          APPLE_TEAM_ID: ${{ vars.APPLE_TEAM_ID }}
+          APPLE_ID: ${{ vars.APPLE_ID }}
+          SDKROOT: /Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
+          DEVELOPER_DIR: /Applications/Xcode_14.1.0.app/Contents/Developer
+      - uses: actions/upload-artifact@v4
+        with:
+          name: dist-darwin
+          path: |
+            dist/Ollama-darwin.zip
+            dist/ollama-darwin.tgz
+
  windows-depends:
    strategy:
      matrix:
@@ -61,18 +103,21 @@ jobs:
        arch: [amd64]
        preset: ['CPU']
        include:
+          - os: windows
+            arch: amd64
+            preset: 'CUDA 11'
+            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+            cuda-version: '11.3'
          - os: windows
            arch: amd64
            preset: 'CUDA 12'
            install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
            cuda-version: '12.8'
-            flags: ''
          - os: windows
            arch: amd64
            preset: 'ROCm 6'
            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
            rocm-version: '6.2'
-            flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    env:
@@ -115,9 +160,6 @@ jobs:
          echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
-          echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
-          echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append
-          echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append
      - if: matrix.preset == 'CPU'
        run: |
          echo "CC=clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
@@ -136,9 +178,9 @@ jobs:
          key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}
      - name: Build target "${{ matrix.preset }}"
        run: |
-          Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
-          cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
+          Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
+          cmake --preset "${{ matrix.preset }}"
          cmake --build --parallel --preset "${{ matrix.preset }}"
          cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
        env:
@@ -188,11 +230,61 @@ jobs:
          go-version-file: go.mod
      - run: |
          go build -o dist/${{ matrix.os }}-${{ matrix.arch }}/ .
+      - if: matrix.arch == 'arm64'
+        run: |
+          Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vc_redist.arm64.exe" -OutFile "dist\windows-arm64\vc_redist.arm64.exe"
+      - run: |
+          $env:VERSION='${{ github.ref_name }}' -Replace "v(.*)", '$1'
+          & .\scripts\build_windows.ps1 buildApp
+        env:
+          VCToolsRedistDir: stub
      - uses: actions/upload-artifact@v4
        with:
          name: build-${{ matrix.os }}-${{ matrix.arch }}
          path: |
            dist\${{ matrix.os }}-${{ matrix.arch }}\*.exe
+            dist\${{ matrix.os }}-${{ matrix.arch }}-app.exe
+
+  windows-sign:
+    runs-on: windows-2022
+    environment: release
+    needs: [windows-depends, windows-build]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: google-github-actions/auth@v2
+        with:
+          project_id: ollama
+          credentials_json: ${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}
+      - run: |
+          $ErrorActionPreference = "Stop"
+          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${{ runner.temp }}\sdksetup.exe"
+          Start-Process "${{ runner.temp }}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
+
+          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${{ runner.temp }}\plugin.zip"
+          Expand-Archive -Path "${{ runner.temp }}\plugin.zip" -DestinationPath "${{ runner.temp }}\plugin\"
+          & "${{ runner.temp }}\plugin\*\kmscng.msi" /quiet
+
+          echo "${{ vars.OLLAMA_CERT }}" >ollama_inc.crt
+      - uses: actions/download-artifact@v4
+        with:
+          pattern: build-windows-*
+          path: dist\
+          merge-multiple: true
+      - uses: actions/download-artifact@v4
+        with:
+          pattern: depends-windows-amd64-*
+          path: dist\windows-amd64\
+          merge-multiple: true
+      - run: |
+          & .\scripts\build_windows.ps1 gatherDependencies sign buildInstaller distZip
+        env:
+          KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
+      - uses: actions/upload-artifact@v4
+        with:
+          name: dist-windows
+          path: |
+            dist\OllamaSetup.exe
+            dist\ollama-windows-*.zip

  linux-build:
    strategy:
@@ -225,26 +317,21 @@ jobs:
            CGO_CFLAGS=${{ env.CGO_CFLAGS }}
            CGO_CXXFLAGS=${{ env.CGO_CXXFLAGS }}
          outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
-          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
+          cache-from: type=registry,ref=ollama/ollama:latest
          cache-to: type=inline
      - run: |
          for COMPONENT in bin/* lib/ollama/*; do
            case "$COMPONENT" in
-              bin/ollama)                echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/*.so*)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/cuda_sbsa)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/cuda_jetpack5)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
-              lib/ollama/cuda_jetpack6)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
-              lib/ollama/rocm)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
+              bin/ollama)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/*.so)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_v11)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_v12)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
+              lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
+              lib/ollama/rocm)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
            esac
          done
        working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
-      - run: |
-          echo "Manifests"
-          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in ; do
-            echo $ARCHIVE
-            cat $ARCHIVE
-          done
      - run: |
          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
            tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
@@ -298,8 +385,8 @@ jobs:
          context: .
          platforms: ${{ matrix.os }}/${{ matrix.arch }}
          build-args: ${{ matrix.build-args }}
-          outputs: type=image,name=${{ vars.DOCKER_REPO }},push-by-digest=true,name-canonical=true,push=true
-          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
+          outputs: type=image,name=ollama/ollama,push-by-digest=true,name-canonical=true,push=true
+          cache-from: type=registry,ref=ollama/ollama:latest
          cache-to: type=inline
      - run: |
          mkdir -p ${{ matrix.os }}-${{ matrix.arch }}
@@ -331,7 +418,7 @@ jobs:
            latest=false
            suffix=${{ matrix.suffix }}
          images: |
-            ${{ vars.DOCKER_REPO }}
+            ollama/ollama
          tags: |
            type=ref,enable=true,priority=600,prefix=pr-,event=pr
            type=semver,pattern={{version}}
@@ -341,24 +428,56 @@ jobs:
          path: ${{ runner.temp }}
          merge-multiple: true
      - run: |
-          docker buildx imagetools create $(echo '${{ steps.metadata.outputs.json }}' | jq -cr '.tags | map("-t", .) | join(" ")') $(cat *-${{ matrix.suffix }}.txt | xargs printf '${{ vars.DOCKER_REPO }}@%s ')
-          docker buildx imagetools inspect ${{ vars.DOCKER_REPO }}:${{ steps.metadata.outputs.version }}
+          docker buildx imagetools create $(echo '${{ steps.metadata.outputs.json }}' | jq -cr '.tags | map("-t", .) | join(" ")') $(cat *-${{ matrix.suffix }}.txt | xargs printf 'ollama/ollama@%s ')
+          docker buildx imagetools inspect ollama/ollama:${{ steps.metadata.outputs.version }}
        working-directory: ${{ runner.temp }}

  # Trigger downstream release process
  trigger:
    runs-on: ubuntu-latest
    environment: release
-    needs: [darwin-build, windows-build, windows-depends, linux-build]
+    needs: [darwin-build, windows-build, windows-depends]
+    steps:
+      - name: Trigger downstream release process
+        run: |
+          curl -L \
+            -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \
+            -d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\"}}"
+
+  # Aggregate all the assets and ship a release
+  release:
+    needs: [darwin-sign, windows-sign, linux-build]
+    runs-on: linux
+    environment: release
    permissions:
      contents: write
    env:
      GH_TOKEN: ${{ github.token }}
    steps:
      - uses: actions/checkout@v4
-      - name: Create or update Release for tag
+      - uses: actions/download-artifact@v4
+        with:
+          name: dist-darwin
+          path: dist
+      - uses: actions/download-artifact@v4
+        with:
+          name: dist-windows
+          path: dist
+      - uses: actions/download-artifact@v4
+        with:
+          pattern: dist-linux-*
+          path: dist
+          merge-multiple: true
+      - run: find . -type f -not -name 'sha256sum.txt' | xargs sha256sum | tee sha256sum.txt
+        working-directory: dist
+      - name: Create or update Release
        run: |
          RELEASE_VERSION="$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)"
+
          echo "Looking for existing release for ${RELEASE_VERSION}"
          OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${RELEASE_VERSION}\") | .tagName")
          if [ -n "$OLD_TAG" ]; then
@@ -372,12 +491,5 @@ jobs:
              --generate-notes \
              --prerelease
          fi
-      - name: Trigger downstream release process
-        run: |
-          curl -L \
-            -X POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \
-            -d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\", \"origin\": \"${GITHUB_REPOSITORY}\", \"publish\": \"1\"}}"
+          echo "Uploading artifacts for tag ${GITHUB_REF_NAME}"
+          gh release upload ${GITHUB_REF_NAME} dist/* --clobber
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -36,7 +36,7 @@ jobs:
              | xargs python3 -c "import sys; from pathlib import Path; print(any(Path(x).match(glob) for x in sys.argv[1:] for glob in '$*'.split(' ')))"
          }

-          echo changed=$(changed 'llama/llama.cpp/**/*' 'ml/backend/ggml/ggml/**/*') | tee -a $GITHUB_OUTPUT
+          echo changed=$(changed 'llama/llama.cpp/**' 'ml/backend/ggml/ggml/**') | tee -a $GITHUB_OUTPUT

  linux:
    needs: [changes]
@@ -46,7 +46,7 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            container: nvidia/cuda:12.8.1-devel-ubuntu22.04
+            container: nvidia/cuda:11.8.0-devel-ubuntu22.04
            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
          - preset: ROCm
            container: rocm/dev-ubuntu-22.04:6.1.2
@@ -78,11 +78,11 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
+            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
            flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
          - preset: ROCm
            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
-            flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
+            flags: '-DAMDGPU_TARGETS=gfx1010'
    runs-on: windows
    steps:
      - run: |
@@ -102,7 +102,7 @@ jobs:
          $ErrorActionPreference = "Stop"
          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
+            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
          }

          $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
@@ -120,9 +120,6 @@ jobs:
          echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
          echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
-          echo "HIPCXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append
-          echo "HIP_PLATFORM=amd" | Out-File -FilePath $env:GITHUB_ENV -Append
-          echo "CMAKE_PREFIX_PATH=$hipPath" | Out-File -FilePath $env:GITHUB_ENV -Append
      - if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }}
        uses: actions/cache/save@v4
        with:
@@ -136,8 +133,8 @@ jobs:
          path: ${{ github.workspace }}\.ccache
          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
      - run: |
-          Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
+          Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
          cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
          cmake --build --parallel --preset "${{ matrix.preset }}"
        env:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -78,13 +78,14 @@ if(CMAKE_CUDA_COMPILER)

    find_package(CUDAToolkit)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
+    set(OLLAMA_CUDA_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/cuda_v${CUDAToolkit_VERSION_MAJOR})
    install(TARGETS ggml-cuda
        RUNTIME_DEPENDENCIES
            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR}
            PRE_INCLUDE_REGEXES cublas cublasLt cudart
            PRE_EXCLUDE_REGEXES ".*"
-        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
-        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA
+        RUNTIME DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
+        LIBRARY DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA
    )
 endif()

@@ -115,11 +116,7 @@ if(CMAKE_HIP_COMPILER)

        set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
        install(TARGETS ggml-hip
-            RUNTIME_DEPENDENCY_SET rocm
-            RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
-            LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT HIP
-        )
-        install(RUNTIME_DEPENDENCY_SET rocm
+            RUNTIME_DEPENDENCIES
                DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}
                PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
                PRE_EXCLUDE_REGEXES ".*"
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -6,8 +6,7 @@
      "binaryDir": "${sourceDir}/build",
      "installDir": "${sourceDir}/dist",
      "cacheVariables": {
-        "CMAKE_BUILD_TYPE": "Release",
-        "CMAKE_MSVC_RUNTIME_LIBRARY": "MultiThreaded"
+        "CMAKE_BUILD_TYPE": "Release"
      }
    },
    {
@@ -18,12 +17,20 @@
      "name": "CUDA",
      "inherits": [ "Default" ]
    },
+    {
+      "name": "CUDA 11",
+      "inherits": [ "CUDA" ],
+      "cacheVariables": {
+        "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
+        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
+      }
+    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
-        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
+        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
      }
    },
    {
@@ -51,7 +58,6 @@
      "name": "ROCm 6",
      "inherits": [ "ROCm" ],
      "cacheVariables": {
-        "CMAKE_HIP_FLAGS": "-parallel-jobs=4",
        "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
      }
    }
@@ -72,6 +78,11 @@
      "configurePreset": "CUDA",
      "targets": [ "ggml-cuda" ]
    },
+    {
+      "name": "CUDA 11",
+      "inherits": [ "CUDA" ],
+      "configurePreset": "CUDA 11"
+    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -65,7 +65,7 @@ continuation of the sentence:
 Examples:

      llm/backend/mlx: support the llama architecture
-      CONTRIBUTING: provide clarity on good commit messages, and bad
+      CONTRIBUTING: provide clarity on good commit messages, and bad

 Bad Examples:

--- a/Dockerfile
+++ b/Dockerfile
@@ -7,13 +7,12 @@ ARG JETPACK5VERSION=r35.4.1
 ARG JETPACK6VERSION=r36.4.0
 ARG CMAKEVERSION=3.31.2

-# We require gcc v10 minimum.  v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
+# CUDA v11 requires gcc v10.  v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
 RUN yum install -y yum-utils \
    && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
    && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
    && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
-    && dnf install -y ccache \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
 ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH

@@ -39,6 +38,15 @@ RUN --mount=type=cache,target=/root/.ccache \
        && cmake --build --parallel --preset 'CPU' \
        && cmake --install build --component CPU --strip --parallel 8

+FROM base AS cuda-11
+ARG CUDA11VERSION=11.3
+RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
+ENV PATH=/usr/local/cuda-11/bin:$PATH
+RUN --mount=type=cache,target=/root/.ccache \
+    cmake --preset 'CUDA 11' \
+        && cmake --build --parallel --preset 'CUDA 11' \
+        && cmake --install build --component CUDA --strip --parallel 8
+
 FROM base AS cuda-12
 ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
@@ -90,21 +98,23 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -trimpath -buildmode=pie -o /bin/ollama .

 FROM --platform=linux/amd64 scratch AS amd64
-COPY --from=cuda-12 dist/lib/ollama /lib/ollama
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
+COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12

 FROM --platform=linux/arm64 scratch AS arm64
-COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa
-COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5
-COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
+COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
+COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
+COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6

 FROM scratch AS rocm
-COPY --from=rocm-6 dist/lib/ollama /lib/ollama
+COPY --from=rocm-6 dist/lib/ollama/rocm /lib/ollama/rocm

 FROM ${FLAVOR} AS archive
 COPY --from=cpu dist/lib/ollama /lib/ollama
 COPY --from=build /bin/ollama /bin/ollama

-FROM ubuntu:24.04
+FROM ubuntu:20.04
 RUN apt-get update \
    && apt-get install -y ca-certificates \
    && apt-get clean \
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 <div align="center">
   <a href="https://ollama.com">
-    <img alt="ollama" width="240" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
+    <img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
  </a>
 </div>

@@ -10,7 +10,7 @@ Get up and running with large language models.

 ### macOS

-[Download](https://ollama.com/download/Ollama.dmg)
+[Download](https://ollama.com/download/Ollama-darwin.zip)

 ### Windows

@@ -360,7 +360,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
 - [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
 - [Local Multimodal AI Chat](https://github.com/Leon-Sander/Local-Multimodal-AI-Chat) (Ollama-based LLM Chat with support for multiple features, including PDF RAG, voice chat, image-based interactions, and integration with OpenAI.)
- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG and deep research on Mac/Windows/Linux)
+- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
 - [OrionChat](https://github.com/EliasPereirah/OrionChat) - OrionChat is a web interface for chatting with different AI providers
 - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
 - [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page)
@@ -409,8 +409,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) 
 - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
 - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)

 ### Cloud

@@ -456,7 +454,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
 - [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples
 - [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/))
- [ollama-bash-toolshed](https://github.com/attogram/ollama-bash-toolshed) - Bash scripts to chat with tool using models. Add new tools to your shed with ease. Runs on Ollama.

 ### Apple Vision Pro

@@ -595,12 +592,10 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
 - [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
 - [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)
- [NativeMind](https://github.com/NativeMindBrowser/NativeMindExtension) (Private, on-device AI Assistant, no cloud dependencies)
- [GMAI - Gradle Managed AI](https://gmai.premex.se/) (Gradle plugin for automated Ollama lifecycle management during build phases)

 ### Supported backends

- [llama.cpp](https://github.com/ggml-org/llama.cpp) project founded by Georgi Gerganov.
+- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.

 ### Observability
 - [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native intergration to Ollama.
--- a/api/client.go
+++ b/api/client.go
@@ -222,6 +222,10 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 			return fmt.Errorf("unmarshal: %w", err)
 		}

+		if errorResponse.Error != "" {
+			return errors.New(errorResponse.Error)
+		}
+
 		if response.StatusCode >= http.StatusBadRequest {
 			return StatusError{
 				StatusCode:   response.StatusCode,
@@ -230,10 +234,6 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 			}
 		}

-		if errorResponse.Error != "" {
-			return errors.New(errorResponse.Error)
-		}
-
 		if err := fn(bts); err != nil {
 			return err
 		}
--- a/api/client_test.go
+++ b/api/client_test.go
@@ -89,16 +89,6 @@ func TestClientStream(t *testing.T) {
 			},
 			wantErr: "mid-stream error",
 		},
-		{
-			name: "http status error takes precedence over general error",
-			responses: []any{
-				testError{
-					message:    "custom error message",
-					statusCode: http.StatusInternalServerError,
-				},
-			},
-			wantErr: "500",
-		},
 		{
 			name: "successful stream completion",
 			responses: []any{
--- a/api/types.go
+++ b/api/types.go
@@ -85,11 +85,10 @@ type GenerateRequest struct {
 	Options map[string]any `json:"options"`

 	// Think controls whether thinking/reasoning models will think before
-	// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
-	// for supported models. Needs to be a pointer so we can distinguish between false
+	// responding. Needs to be a pointer so we can distinguish between false
 	// (request that thinking _not_ be used) and unset (use the old behavior
 	// before this option was introduced)
-	Think *ThinkValue `json:"think,omitempty"`
+	Think *bool `json:"think,omitempty"`
 }

 // ChatRequest describes a request sent by [Client.Chat].
@@ -117,9 +116,8 @@ type ChatRequest struct {
 	Options map[string]any `json:"options"`

 	// Think controls whether thinking/reasoning models will think before
-	// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
-	// for supported models.
-	Think *ThinkValue `json:"think,omitempty"`
+	// responding
+	Think *bool `json:"think,omitempty"`
 }

 type Tools []Tool
@@ -145,7 +143,6 @@ type Message struct {
 	Thinking  string      `json:"thinking,omitempty"`
 	Images    []ImageData `json:"images,omitempty"`
 	ToolCalls []ToolCall  `json:"tool_calls,omitempty"`
-	ToolName  string      `json:"tool_name,omitempty"`
 }

 func (m *Message) UnmarshalJSON(b []byte) error {
@@ -225,68 +222,20 @@ func (pt PropertyType) String() string {
 	return fmt.Sprintf("%v", []string(pt))
 }

-type ToolProperty struct {
-	AnyOf       []ToolProperty `json:"anyOf,omitempty"`
-	Type        PropertyType   `json:"type"`
-	Items       any            `json:"items,omitempty"`
-	Description string         `json:"description"`
-	Enum        []any          `json:"enum,omitempty"`
-}
-
-// ToTypeScriptType converts a ToolProperty to a TypeScript type string
-func (tp ToolProperty) ToTypeScriptType() string {
-	if len(tp.AnyOf) > 0 {
-		var types []string
-		for _, anyOf := range tp.AnyOf {
-			types = append(types, anyOf.ToTypeScriptType())
-		}
-		return strings.Join(types, " | ")
-	}
-
-	if len(tp.Type) == 0 {
-		return "any"
-	}
-
-	if len(tp.Type) == 1 {
-		return mapToTypeScriptType(tp.Type[0])
-	}
-
-	var types []string
-	for _, t := range tp.Type {
-		types = append(types, mapToTypeScriptType(t))
-	}
-	return strings.Join(types, " | ")
-}
-
-// mapToTypeScriptType maps JSON Schema types to TypeScript types
-func mapToTypeScriptType(jsonType string) string {
-	switch jsonType {
-	case "string":
-		return "string"
-	case "number", "integer":
-		return "number"
-	case "boolean":
-		return "boolean"
-	case "array":
-		return "any[]"
-	case "object":
-		return "Record<string, any>"
-	case "null":
-		return "null"
-	default:
-		return "any"
-	}
-}
-
 type ToolFunction struct {
 	Name        string `json:"name"`
 	Description string `json:"description"`
 	Parameters  struct {
-		Type       string                  `json:"type"`
-		Defs       any                     `json:"$defs,omitempty"`
-		Items      any                     `json:"items,omitempty"`
-		Required   []string                `json:"required"`
-		Properties map[string]ToolProperty `json:"properties"`
+		Type       string   `json:"type"`
+		Defs       any      `json:"$defs,omitempty"`
+		Items      any      `json:"items,omitempty"`
+		Required   []string `json:"required"`
+		Properties map[string]struct {
+			Type        PropertyType `json:"type"`
+			Items       any          `json:"items,omitempty"`
+			Description string       `json:"description"`
+			Enum        []any        `json:"enum,omitempty"`
+		} `json:"properties"`
 	} `json:"parameters"`
 }

@@ -518,14 +467,13 @@ type ListModelResponse struct {

 // ProcessModelResponse is a single model description in [ProcessResponse].
 type ProcessModelResponse struct {
-	Name          string       `json:"name"`
-	Model         string       `json:"model"`
-	Size          int64        `json:"size"`
-	Digest        string       `json:"digest"`
-	Details       ModelDetails `json:"details,omitempty"`
-	ExpiresAt     time.Time    `json:"expires_at"`
-	SizeVRAM      int64        `json:"size_vram"`
-	ContextLength int          `json:"context_length"`
+	Name      string       `json:"name"`
+	Model     string       `json:"model"`
+	Size      int64        `json:"size"`
+	Digest    string       `json:"digest"`
+	Details   ModelDetails `json:"details,omitempty"`
+	ExpiresAt time.Time    `json:"expires_at"`
+	SizeVRAM  int64        `json:"size_vram"`
 }

 type TokenResponse struct {
@@ -558,8 +506,6 @@ type GenerateResponse struct {
 	Context []int `json:"context,omitempty"`

 	Metrics
-
-	ToolCalls []ToolCall `json:"tool_calls,omitempty"`
 }

 // ModelDetails provides details about a model.
@@ -729,113 +675,6 @@ func DefaultOptions() Options {
 	}
 }

-// ThinkValue represents a value that can be a boolean or a string ("high", "medium", "low")
-type ThinkValue struct {
-	// Value can be a bool or string
-	Value interface{}
-}
-
-// IsValid checks if the ThinkValue is valid
-func (t *ThinkValue) IsValid() bool {
-	if t == nil || t.Value == nil {
-		return true // nil is valid (means not set)
-	}
-
-	switch v := t.Value.(type) {
-	case bool:
-		return true
-	case string:
-		return v == "high" || v == "medium" || v == "low"
-	default:
-		return false
-	}
-}
-
-// IsBool returns true if the value is a boolean
-func (t *ThinkValue) IsBool() bool {
-	if t == nil || t.Value == nil {
-		return false
-	}
-	_, ok := t.Value.(bool)
-	return ok
-}
-
-// IsString returns true if the value is a string
-func (t *ThinkValue) IsString() bool {
-	if t == nil || t.Value == nil {
-		return false
-	}
-	_, ok := t.Value.(string)
-	return ok
-}
-
-// AsBool returns the value as a bool (true if enabled in any way)
-func (t *ThinkValue) AsBool() bool {
-	if t == nil || t.Value == nil {
-		return false
-	}
-
-	switch v := t.Value.(type) {
-	case bool:
-		return v
-	case string:
-		// Any string value ("high", "medium", "low") means thinking is enabled
-		return v == "high" || v == "medium" || v == "low"
-	default:
-		return false
-	}
-}
-
-// AsString returns the value as a string
-func (t *ThinkValue) AsString() string {
-	if t == nil || t.Value == nil {
-		return ""
-	}
-
-	switch v := t.Value.(type) {
-	case string:
-		return v
-	case bool:
-		if v {
-			return "medium" // Default level when just true
-		}
-		return ""
-	default:
-		return ""
-	}
-}
-
-// UnmarshalJSON implements json.Unmarshaler
-func (t *ThinkValue) UnmarshalJSON(data []byte) error {
-	// Try to unmarshal as bool first
-	var b bool
-	if err := json.Unmarshal(data, &b); err == nil {
-		t.Value = b
-		return nil
-	}
-
-	// Try to unmarshal as string
-	var s string
-	if err := json.Unmarshal(data, &s); err == nil {
-		// Validate string values
-		if s != "high" && s != "medium" && s != "low" {
-			return fmt.Errorf("invalid think value: %q (must be \"high\", \"medium\", \"low\", true, or false)", s)
-		}
-		t.Value = s
-		return nil
-	}
-
-	return fmt.Errorf("think must be a boolean or string (\"high\", \"medium\", \"low\")")
-}
-
-// MarshalJSON implements json.Marshaler
-func (t *ThinkValue) MarshalJSON() ([]byte, error) {
-	if t == nil || t.Value == nil {
-		return []byte("null"), nil
-	}
-	return json.Marshal(t.Value)
-}
-
 type Duration struct {
 	time.Duration
 }
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -374,21 +374,24 @@ func TestPropertyType_MarshalJSON(t *testing.T) {
 }

 func TestThinking_UnmarshalJSON(t *testing.T) {
+	trueVal := true
+	falseVal := false
+
 	tests := []struct {
 		name             string
 		input            string
-		expectedThinking *ThinkValue
+		expectedThinking *bool
 		expectedError    bool
 	}{
 		{
 			name:             "true",
 			input:            `{ "think": true }`,
-			expectedThinking: &ThinkValue{Value: true},
+			expectedThinking: &trueVal,
 		},
 		{
 			name:             "false",
 			input:            `{ "think": false }`,
-			expectedThinking: &ThinkValue{Value: false},
+			expectedThinking: &falseVal,
 		},
 		{
 			name:             "unset",
@@ -396,23 +399,8 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
 			expectedThinking: nil,
 		},
 		{
-			name:             "string_high",
-			input:            `{ "think": "high" }`,
-			expectedThinking: &ThinkValue{Value: "high"},
-		},
-		{
-			name:             "string_medium",
-			input:            `{ "think": "medium" }`,
-			expectedThinking: &ThinkValue{Value: "medium"},
-		},
-		{
-			name:             "string_low",
-			input:            `{ "think": "low" }`,
-			expectedThinking: &ThinkValue{Value: "low"},
-		},
-		{
-			name:             "invalid_string",
-			input:            `{ "think": "invalid" }`,
+			name:             "invalid",
+			input:            `{ "think": "true" }`,
 			expectedThinking: nil,
 			expectedError:    true,
 		},
@@ -426,12 +414,7 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
 				require.Error(t, err)
 			} else {
 				require.NoError(t, err)
-				if test.expectedThinking == nil {
-					assert.Nil(t, req.Think)
-				} else {
-					require.NotNil(t, req.Think)
-					assert.Equal(t, test.expectedThinking.Value, req.Think.Value)
-				}
+				assert.Equal(t, test.expectedThinking, req.Think)
 			}
 		})
 	}
--- a/api/types_typescript_test.go
+++ b/api/types_typescript_test.go
@@ -1,142 +0,0 @@
-package api
-
-import (
-	"testing"
-)
-
-func TestToolParameterToTypeScriptType(t *testing.T) {
-	tests := []struct {
-		name     string
-		param    ToolProperty
-		expected string
-	}{
-		{
-			name: "single string type",
-			param: ToolProperty{
-				Type: PropertyType{"string"},
-			},
-			expected: "string",
-		},
-		{
-			name: "single number type",
-			param: ToolProperty{
-				Type: PropertyType{"number"},
-			},
-			expected: "number",
-		},
-		{
-			name: "integer maps to number",
-			param: ToolProperty{
-				Type: PropertyType{"integer"},
-			},
-			expected: "number",
-		},
-		{
-			name: "boolean type",
-			param: ToolProperty{
-				Type: PropertyType{"boolean"},
-			},
-			expected: "boolean",
-		},
-		{
-			name: "array type",
-			param: ToolProperty{
-				Type: PropertyType{"array"},
-			},
-			expected: "any[]",
-		},
-		{
-			name: "object type",
-			param: ToolProperty{
-				Type: PropertyType{"object"},
-			},
-			expected: "Record<string, any>",
-		},
-		{
-			name: "null type",
-			param: ToolProperty{
-				Type: PropertyType{"null"},
-			},
-			expected: "null",
-		},
-		{
-			name: "multiple types as union",
-			param: ToolProperty{
-				Type: PropertyType{"string", "number"},
-			},
-			expected: "string | number",
-		},
-		{
-			name: "string or null union",
-			param: ToolProperty{
-				Type: PropertyType{"string", "null"},
-			},
-			expected: "string | null",
-		},
-		{
-			name: "anyOf with single types",
-			param: ToolProperty{
-				AnyOf: []ToolProperty{
-					{Type: PropertyType{"string"}},
-					{Type: PropertyType{"number"}},
-				},
-			},
-			expected: "string | number",
-		},
-		{
-			name: "anyOf with multiple types in each branch",
-			param: ToolProperty{
-				AnyOf: []ToolProperty{
-					{Type: PropertyType{"string", "null"}},
-					{Type: PropertyType{"number"}},
-				},
-			},
-			expected: "string | null | number",
-		},
-		{
-			name: "nested anyOf",
-			param: ToolProperty{
-				AnyOf: []ToolProperty{
-					{Type: PropertyType{"boolean"}},
-					{
-						AnyOf: []ToolProperty{
-							{Type: PropertyType{"string"}},
-							{Type: PropertyType{"number"}},
-						},
-					},
-				},
-			},
-			expected: "boolean | string | number",
-		},
-		{
-			name: "empty type returns any",
-			param: ToolProperty{
-				Type: PropertyType{},
-			},
-			expected: "any",
-		},
-		{
-			name: "unknown type maps to any",
-			param: ToolProperty{
-				Type: PropertyType{"unknown_type"},
-			},
-			expected: "any",
-		},
-		{
-			name: "multiple types including array",
-			param: ToolProperty{
-				Type: PropertyType{"string", "array", "null"},
-			},
-			expected: "string | any[] | null",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			result := tt.param.ToTypeScriptType()
-			if result != tt.expected {
-				t.Errorf("ToTypeScriptType() = %q, want %q", result, tt.expected)
-			}
-		})
-	}
-}
--- a/benchmark/server_benchmark_test.go
+++ b/benchmark/server_benchmark_test.go
@@ -0,0 +1,178 @@
+package benchmark
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/ollama/ollama/api"
+)
+
+// modelFlag is the model name supplied via the -m command line flag.
+var modelFlag string
+
+func init() {
+	flag.StringVar(&modelFlag, "m", "", "Name of the model to benchmark")
+	flag.Lookup("m").DefValue = "model" // usage text only; the parsed default stays ""
+}
+
+// modelName reports the model selected with -m, aborting the benchmark when the flag is empty.
+func modelName(b *testing.B) string {
+	if name := modelFlag; name != "" {
+		return name
+	}
+	b.Fatal("Error: -m flag is required for benchmark tests")
+	return "" // not reached: b.Fatal ends the calling goroutine
+}
+
+type TestCase struct { // one benchmark scenario: a label, a prompt and a token budget
+	name      string // used as the last segment of the sub-benchmark name
+	prompt    string // sent as GenerateRequest.Prompt
+	maxTokens int    // sent as the num_predict option and passed to b.SetBytes
+}
+
+// runGenerateBenchmark runs one generate request and reports its latency, throughput
+// and token-count metrics on b; a request error fails the benchmark before any report.
+func runGenerateBenchmark(b *testing.B, ctx context.Context, client *api.Client, req *api.GenerateRequest) {
+	start := time.Now()
+	var ttft time.Duration
+	var metrics api.Metrics
+
+	err := client.Generate(ctx, req, func(resp api.GenerateResponse) error {
+		if ttft == 0 && resp.Response != "" {
+			ttft = time.Since(start) // time to the first non-empty response chunk
+		}
+		if resp.Done {
+			metrics = resp.Metrics
+		}
+		return nil
+	})
+	if err != nil {
+		// Fail first: metrics gathered from a failed request are meaningless.
+		b.Fatal(err)
+	}
+
+	b.ReportMetric(float64(ttft.Milliseconds()), "ttft_ms")
+	b.ReportMetric(float64(metrics.LoadDuration.Milliseconds()), "load_ms")
+	b.ReportMetric(float64(metrics.PromptEvalCount), "prompt_tokens")
+	b.ReportMetric(float64(metrics.EvalCount), "gen_tokens")
+	// Throughput is undefined for a zero duration; skip it rather than report NaN or +Inf.
+	if d := metrics.PromptEvalDuration.Seconds(); d > 0 {
+		b.ReportMetric(float64(metrics.PromptEvalCount)/d, "prompt_tok/s")
+	}
+	if d := metrics.EvalDuration.Seconds(); d > 0 {
+		b.ReportMetric(float64(metrics.EvalCount)/d, "gen_tok/s")
+	}
+}
+
+// BenchmarkColdStart measures generation when the model is unloaded before
+// every timed iteration.
+func BenchmarkColdStart(b *testing.B) {
+	client := setup(b)
+	cases := []TestCase{
+		{name: "short_prompt", prompt: "Write a long story", maxTokens: 100},
+		{name: "medium_prompt", prompt: "Write a detailed economic analysis", maxTokens: 500},
+		{name: "long_prompt", prompt: "Write a comprehensive AI research paper", maxTokens: 1000},
+	}
+	model := modelName(b)
+
+	for _, tc := range cases {
+		label := fmt.Sprintf("%s/cold/%s", model, tc.name)
+		b.Run(label, func(b *testing.B) {
+			// SetBytes is given the token budget, so it stands in as the throughput figure.
+			b.SetBytes(int64(tc.maxTokens))
+			ctx := b.Context()
+
+			for b.Loop() {
+				// The unload runs with the timer paused so it is not charged to the iteration.
+				b.StopTimer()
+				unload(client, model, b)
+				b.StartTimer()
+
+				opts := map[string]any{"num_predict": tc.maxTokens, "temperature": 0.1}
+				runGenerateBenchmark(b, ctx, client, &api.GenerateRequest{
+					Model:   model,
+					Prompt:  tc.prompt,
+					Options: opts,
+				})
+			}
+		})
+	}
+}
+
+// BenchmarkWarmStart measures generation against a model warmed up before the timed loop.
+func BenchmarkWarmStart(b *testing.B) {
+	client := setup(b)
+	cases := []TestCase{
+		{name: "short_prompt", prompt: "Write a long story", maxTokens: 100},
+		{name: "medium_prompt", prompt: "Write a detailed economic analysis", maxTokens: 500},
+		{name: "long_prompt", prompt: "Write a comprehensive AI research paper", maxTokens: 1000},
+	}
+	model := modelName(b)
+
+	for _, tc := range cases {
+		label := fmt.Sprintf("%s/warm/%s", model, tc.name)
+		b.Run(label, func(b *testing.B) {
+			ctx := b.Context()
+
+			// Warm the model up ahead of the timed loop.
+			warmup(client, model, tc.prompt, b)
+
+			// SetBytes is given the token budget, so it stands in as the throughput figure.
+			b.SetBytes(int64(tc.maxTokens))
+
+			for b.Loop() {
+				opts := map[string]any{"num_predict": tc.maxTokens, "temperature": 0.1}
+				runGenerateBenchmark(b, ctx, client, &api.GenerateRequest{
+					Model:   model,
+					Prompt:  tc.prompt,
+					Options: opts,
+				})
+			}
+		})
+	}
+}
+
+// setup creates a client from the environment and fails b unless Show succeeds for the -m model.
+func setup(b *testing.B) *api.Client {
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		b.Fatal(err)
+	}
+	showReq := &api.ShowRequest{Model: modelName(b)}
+	if _, showErr := client.Show(b.Context(), showReq); showErr != nil {
+		b.Fatalf("Model unavailable: %v", showErr)
+	}
+	return client
+}
+
+// warmup sends three short generate requests to load and warm up the model.
+// Failures are logged on b and do not stop the benchmark.
+func warmup(client *api.Client, model string, prompt string, b *testing.B) {
+	discard := func(api.GenerateResponse) error { return nil }
+
+	for attempt := 0; attempt < 3; attempt++ {
+		// A fresh request per attempt, limited to 50 predicted tokens.
+		req := &api.GenerateRequest{
+			Model:   model,
+			Prompt:  prompt,
+			Options: map[string]any{"num_predict": 50, "temperature": 0.1},
+		}
+		if err := client.Generate(context.Background(), req, discard); err != nil {
+			b.Logf("Error during model warm-up: %v", err)
+		}
+	}
+}
+
+// unload sends a prompt-less generate request with KeepAlive 0 to force an unload, then waits 1s.
+func unload(client *api.Client, model string, b *testing.B) {
+	noop := func(api.GenerateResponse) error { return nil }
+	evict := &api.GenerateRequest{Model: model, KeepAlive: &api.Duration{Duration: 0}}
+	err := client.Generate(context.Background(), evict, noop)
+	if err != nil {
+		b.Logf("Unload error: %v", err)
+	}
+	// Fixed pause, presumably to let the server finish unloading — TODO confirm.
+	time.Sleep(time.Second)
+}
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -322,23 +322,11 @@ func RunHandler(cmd *cobra.Command, args []string) error {

 	thinkFlag := cmd.Flags().Lookup("think")
 	if thinkFlag.Changed {
-		thinkStr, err := cmd.Flags().GetString("think")
+		think, err := cmd.Flags().GetBool("think")
 		if err != nil {
 			return err
 		}
-
-		// Handle different values for --think
-		switch thinkStr {
-		case "", "true":
-			// --think or --think=true
-			opts.Think = &api.ThinkValue{Value: true}
-		case "false":
-			opts.Think = &api.ThinkValue{Value: false}
-		case "high", "medium", "low":
-			opts.Think = &api.ThinkValue{Value: thinkStr}
-		default:
-			return fmt.Errorf("invalid value for --think: %q (must be true, false, high, medium, or low)", thinkStr)
-		}
+		opts.Think = &think
 	} else {
 		opts.Think = nil
 	}
@@ -595,13 +583,12 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
 			} else {
 				until = format.HumanTime(m.ExpiresAt, "Never")
 			}
-			ctxStr := strconv.Itoa(m.ContextLength)
-			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, ctxStr, until})
+			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until})
 		}
 	}

 	table := tablewriter.NewWriter(os.Stdout)
-	table.SetHeader([]string{"NAME", "ID", "SIZE", "PROCESSOR", "CONTEXT", "UNTIL"})
+	table.SetHeader([]string{"NAME", "ID", "SIZE", "PROCESSOR", "UNTIL"})
 	table.SetHeaderAlignment(tablewriter.ALIGN_LEFT)
 	table.SetAlignment(tablewriter.ALIGN_LEFT)
 	table.SetHeaderLine(false)
@@ -989,7 +976,7 @@ type runOptions struct {
 	Options      map[string]any
 	MultiModal   bool
 	KeepAlive    *api.Duration
-	Think        *api.ThinkValue
+	Think        *bool
 	HideThinking bool
 }

@@ -1029,11 +1016,10 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState)
 				}

 				switch ch {
-				case ' ', '\t':
+				case ' ':
 					state.wordBuffer = ""
-				case '\n', '\r':
+				case '\n':
 					state.lineLength = 0
-					state.wordBuffer = ""
 				default:
 					state.wordBuffer += string(ch)
 				}
@@ -1091,14 +1077,12 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 	}()

 	var state *displayResponseState = &displayResponseState{}
-	var thinkingContent strings.Builder
 	var latest api.ChatResponse
 	var fullResponse strings.Builder
+	var role string
 	var thinkTagOpened bool = false
 	var thinkTagClosed bool = false

-	role := "assistant"
-
 	fn := func(response api.ChatResponse) error {
 		if response.Message.Content != "" || !opts.HideThinking {
 			p.StopAndClear()
@@ -1111,21 +1095,14 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 			if !thinkTagOpened {
 				fmt.Print(thinkingOutputOpeningText(false))
 				thinkTagOpened = true
-				thinkTagClosed = false
 			}
-			thinkingContent.WriteString(response.Message.Thinking)
 			displayResponse(response.Message.Thinking, opts.WordWrap, state)
 		}

 		content := response.Message.Content
-		if thinkTagOpened && !thinkTagClosed && (content != "" || len(response.Message.ToolCalls) > 0) {
-			if !strings.HasSuffix(thinkingContent.String(), "\n") {
-				fmt.Println()
-			}
+		if thinkTagOpened && !thinkTagClosed && content != "" {
 			fmt.Print(thinkingOutputClosingText(false))
-			thinkTagOpened = false
 			thinkTagClosed = true
-			state = &displayResponseState{}
 		}
 		// purposefully not putting thinking blocks in the response, which would
 		// only be needed if we later added tool calling to the cli (they get
@@ -1133,13 +1110,6 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		// about to finish some tool calls)
 		fullResponse.WriteString(content)

-		if response.Message.ToolCalls != nil {
-			toolCalls := response.Message.ToolCalls
-			if len(toolCalls) > 0 {
-				fmt.Print(renderToolCalls(toolCalls, false))
-			}
-		}
-
 		displayResponse(content, opts.WordWrap, state)

 		return nil
@@ -1165,14 +1135,6 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		if errors.Is(err, context.Canceled) {
 			return nil, nil
 		}
-
-		// this error should ideally be wrapped properly by the client
-		if strings.Contains(err.Error(), "upstream error") {
-			p.StopAndClear()
-			fmt.Println("An error occurred while processing your message. Please try again.")
-			fmt.Println()
-			return nil, nil
-		}
 		return nil, err
 	}

@@ -1224,7 +1186,6 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 	}()

 	var state *displayResponseState = &displayResponseState{}
-	var thinkingContent strings.Builder
 	var thinkTagOpened bool = false
 	var thinkTagClosed bool = false

@@ -1242,31 +1203,17 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 			if !thinkTagOpened {
 				fmt.Print(thinkingOutputOpeningText(plainText))
 				thinkTagOpened = true
-				thinkTagClosed = false
 			}
-			thinkingContent.WriteString(response.Thinking)
 			displayResponse(response.Thinking, opts.WordWrap, state)
 		}

-		if thinkTagOpened && !thinkTagClosed && (content != "" || len(response.ToolCalls) > 0) {
-			if !strings.HasSuffix(thinkingContent.String(), "\n") {
-				fmt.Println()
-			}
+		if thinkTagOpened && !thinkTagClosed && content != "" {
 			fmt.Print(thinkingOutputClosingText(plainText))
-			thinkTagOpened = false
 			thinkTagClosed = true
-			state = &displayResponseState{}
 		}

 		displayResponse(content, opts.WordWrap, state)

-		if response.ToolCalls != nil {
-			toolCalls := response.ToolCalls
-			if len(toolCalls) > 0 {
-				fmt.Print(renderToolCalls(toolCalls, plainText))
-			}
-		}
-
 		return nil
 	}

@@ -1469,13 +1416,13 @@ func NewCLI() *cobra.Command {

 	createCmd := &cobra.Command{
 		Use:     "create MODEL",
-		Short:   "Create a model",
+		Short:   "Create a model from a Modelfile",
 		Args:    cobra.ExactArgs(1),
 		PreRunE: checkServerHeartbeat,
 		RunE:    CreateHandler,
 	}

-	createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\")")
+	createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\")")
 	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")

 	showCmd := &cobra.Command{
@@ -1506,8 +1453,7 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
 	runCmd.Flags().String("format", "", "Response format (e.g. json)")
-	runCmd.Flags().String("think", "", "Enable thinking mode: true/false or high/medium/low for supported models")
-	runCmd.Flags().Lookup("think").NoOptDefVal = "true"
+	runCmd.Flags().Bool("think", false, "Whether to use thinking mode for supported models")
 	runCmd.Flags().Bool("hidethinking", false, "Hide thinking output (if provided)")

 	stopCmd := &cobra.Command{
@@ -1657,7 +1603,7 @@ func NewCLI() *cobra.Command {
 // to false).
 //
 // If capabilities are not provided, we fetch them from the server.
-func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicitlySetByUser bool) (*api.ThinkValue, error) {
+func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicitlySetByUser bool) (*bool, error) {
 	if explicitlySetByUser {
 		return runOpts.Think, nil
 	}
@@ -1684,34 +1630,9 @@ func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicit
 	}

 	if thinkingSupported {
-		return &api.ThinkValue{Value: true}, nil
+		thinking := true
+		return &thinking, nil
 	}

 	return nil, nil
 }
-
-func renderToolCalls(toolCalls []api.ToolCall, plainText bool) string {
-	out := ""
-	formatExplanation := ""
-	formatValues := ""
-	if !plainText {
-		formatExplanation = readline.ColorGrey + readline.ColorBold
-		formatValues = readline.ColorDefault
-		out += formatExplanation
-	}
-	for i, toolCall := range toolCalls {
-		argsAsJSON, err := json.Marshal(toolCall.Function.Arguments)
-		if err != nil {
-			return ""
-		}
-		if i > 0 {
-			out += "\n"
-		}
-		// all tool calls are unexpected since we don't currently support registering any in the CLI
-		out += fmt.Sprintf("  Model called a non-existent function '%s()' with arguments: %s", formatValues+toolCall.Function.Name+formatExplanation, formatValues+string(argsAsJSON)+formatExplanation)
-	}
-	if !plainText {
-		out += readline.ColorDefault
-	}
-	return out
-}
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -272,29 +272,16 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 					}
 					fmt.Println("Set 'quiet' mode.")
 				case "think":
-					thinkValue := api.ThinkValue{Value: true}
-					var maybeLevel string
-					if len(args) > 2 {
-						maybeLevel = args[2]
-					}
-					if maybeLevel != "" {
-						// TODO(drifkin): validate the level, could be model dependent
-						// though... It will also be validated on the server once a call is
-						// made.
-						thinkValue.Value = maybeLevel
-					}
-					opts.Think = &thinkValue
+					think := true
+					opts.Think = &think
 					thinkExplicitlySet = true
 					if client, err := api.ClientFromEnvironment(); err == nil {
 						ensureThinkingSupport(cmd.Context(), client, opts.Model)
 					}
-					if maybeLevel != "" {
-						fmt.Printf("Set 'think' mode to '%s'.\n", maybeLevel)
-					} else {
-						fmt.Println("Set 'think' mode.")
-					}
+					fmt.Println("Set 'think' mode.")
 				case "nothink":
-					opts.Think = &api.ThinkValue{Value: false}
+					think := false
+					opts.Think = &think
 					thinkExplicitlySet = true
 					if client, err := api.ClientFromEnvironment(); err == nil {
 						ensureThinkingSupport(cmd.Context(), client, opts.Model)
@@ -398,21 +385,18 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 				case "modelfile":
 					fmt.Println(resp.Modelfile)
 				case "parameters":
-					fmt.Println("Model defined parameters:")
 					if resp.Parameters == "" {
-						fmt.Println("  No additional parameters were specified for this model.")
+						fmt.Println("No parameters were specified for this model.")
 					} else {
-						for _, l := range strings.Split(resp.Parameters, "\n") {
-							fmt.Printf("  %s\n", l)
+						if len(opts.Options) > 0 {
+							fmt.Println("User defined parameters:")
+							for k, v := range opts.Options {
+								fmt.Printf("%-*s %v\n", 30, k, v)
+							}
+							fmt.Println()
 						}
-					}
-					fmt.Println()
-					if len(opts.Options) > 0 {
-						fmt.Println("User defined parameters:")
-						for k, v := range opts.Options {
-							fmt.Printf("  %-*s %v\n", 30, k, v)
-						}
-						fmt.Println()
+						fmt.Println("Model defined parameters:")
+						fmt.Println(resp.Parameters)
 					}
 				case "system":
 					switch {
@@ -491,8 +475,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {

 			assistant, err := chat(cmd, opts)
 			if err != nil {
-				if strings.Contains(err.Error(), "does not support thinking") ||
-					strings.Contains(err.Error(), "invalid think value") {
+				if strings.Contains(err.Error(), "does not support thinking") {
 					fmt.Printf("error: %v\n", err)
 					sb.Reset()
 					continue
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -190,8 +190,6 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &gemma2Model{}
 	case "Gemma3ForCausalLM", "Gemma3ForConditionalGeneration":
 		conv = &gemma3Model{Architecture: p.Architectures[0]}
-	case "Gemma3nForConditionalGeneration":
-		conv = &gemma3nModel{}
 	case "Phi3ForCausalLM":
 		conv = &phi3Model{}
 	case "Qwen2ForCausalLM":
@@ -202,8 +200,6 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &bertModel{}
 	case "CohereForCausalLM":
 		conv = &commandrModel{}
-	case "GptOssForCausalLM":
-		conv = &gptossModel{}
 	default:
 		return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
 	}
--- a/convert/convert_gemma3n.go
+++ b/convert/convert_gemma3n.go
@@ -1,165 +0,0 @@
-package convert
-
-import (
-	"slices"
-	"strings"
-
-	"github.com/ollama/ollama/fs/ggml"
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-	"gonum.org/v1/gonum/stat/distuv"
-)
-
-type gemma3nModel struct {
-	ModelParameters
-
-	TextModel struct {
-		ActivationSparsityPattern []float32 `json:"activation_sparsity_pattern"`
-		AltupActiveIdx            uint32    `json:"altup_active_idx"`
-		AltupCoefClip             float32   `json:"altup_coef_clip"`
-		AltupCorrectScale         bool      `json:"altup_correct_scale"`
-		AltupLRMultiplier         float32   `json:"altup_lr_multiplier"`
-		AltupNumInputs            uint32    `json:"altup_num_inputs"`
-		HeadDim                   uint32    `json:"head_dim"`
-		HiddenSize                uint32    `json:"hidden_size"`
-		HiddenSizePerLayerInput   uint32    `json:"hidden_size_per_layer_input"`
-		IntermediateSize          uint32    `json:"intermediate_size"`
-		MaxPositionEmbeddings     uint32    `json:"max_position_embeddings"`
-		NumAttentionHeads         uint32    `json:"num_attention_heads"`
-		NumHiddenLayers           uint32    `json:"num_hidden_layers"`
-		NumKeyValueHeads          uint32    `json:"num_key_value_heads"`
-		NumKVSharedLayers         uint32    `json:"num_kv_shared_layers"`
-		RMSNormEPS                float32   `json:"rms_norm_eps"`
-		RopeLocalBaseFreq         float32   `json:"rope_local_base_freq"`
-		RopeTheta                 float32   `json:"rope_theta"`
-		SlidingWindow             uint32    `json:"sliding_window"`
-		LayerTypes                []string  `json:"layer_types"`
-	} `json:"text_config"`
-	VisionModel struct{} `json:"vision_config"`
-}
-
-func (m *gemma3nModel) KV(t *Tokenizer) ggml.KV {
-	kv := m.ModelParameters.KV(t)
-	kv["general.architecture"] = "gemma3n"
-	kv["gemma3n.activation_sparsity_scale"] = slices.Collect(func(yield func(float32) bool) {
-		norm := distuv.Normal{Mu: 0, Sigma: 1}
-		for _, v := range m.TextModel.ActivationSparsityPattern {
-			if !yield(float32(norm.Quantile(float64(v)))) {
-				break
-			}
-		}
-	})
-	kv["gemma3n.altup.active_idx"] = m.TextModel.AltupActiveIdx
-	kv["gemma3n.altup.correct_scale"] = m.TextModel.AltupCorrectScale
-	kv["gemma3n.altup.lr_multiplier"] = m.TextModel.AltupLRMultiplier
-	kv["gemma3n.altup.num_inputs"] = m.TextModel.AltupNumInputs
-	kv["gemma3n.attention.head_count_kv"] = m.TextModel.NumKeyValueHeads
-	kv["gemma3n.attention.head_count"] = m.TextModel.NumAttentionHeads
-	kv["gemma3n.attention.layer_norm_rms_epsilon"] = m.TextModel.RMSNormEPS
-	kv["gemma3n.attention.sliding_window"] = m.TextModel.SlidingWindow
-	kv["gemma3n.attention.sliding_window_pattern"] = slices.Collect(func(yield func(bool) bool) {
-		for _, t := range m.TextModel.LayerTypes {
-			if !yield(t == "sliding_attention") {
-				break
-			}
-		}
-	})
-	kv["gemma3n.attention.shared_kv_layers"] = m.TextModel.NumKVSharedLayers
-	kv["gemma3n.block_count"] = m.TextModel.NumHiddenLayers
-	kv["gemma3n.context_length"] = m.TextModel.MaxPositionEmbeddings
-	kv["gemma3n.embedding_length_per_layer_input"] = m.TextModel.HiddenSizePerLayerInput
-	kv["gemma3n.embedding_length"] = m.TextModel.HiddenSize
-	kv["gemma3n.feed_forward_length"] = m.TextModel.IntermediateSize
-	kv["gemma3n.head_dim"] = m.TextModel.HeadDim
-	kv["gemma3n.rope.freq_base_local"] = m.TextModel.RopeLocalBaseFreq
-	kv["gemma3n.rope.freq_base"] = m.TextModel.RopeTheta
-	return kv
-}
-
-func (m *gemma3nModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	out, ts := mergeTensors(ts,
-		merge{"altup_proj.*.weight", "altup_proj.weight"},
-		merge{"altup_unembd_proj.*.weight", "altup_unembd_proj.weight"},
-	)
-
-	for _, t := range ts {
-		switch {
-		case strings.Contains(t.Name(), "audio_tower"),
-			strings.Contains(t.Name(), "embed_audio"),
-			strings.Contains(t.Name(), "vision_tower"),
-			strings.Contains(t.Name(), "embed_vision"):
-			// TODO: handle audio and vision towers
-			continue
-		case strings.Contains(t.Name(), "altup_predict_coef"),
-			strings.Contains(t.Name(), "altup_correct_coef"):
-			if m.TextModel.AltupCoefClip > 0 {
-				t.SetRepacker(func(name string, data []float32, shape []uint64) (_ []float32, err error) {
-					dims := make([]int, len(shape))
-					for i := range shape {
-						dims[i] = int(shape[i])
-					}
-
-					var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-
-					t, err = tensor.Clamp(t, -m.TextModel.AltupCoefClip, m.TextModel.AltupCoefClip)
-					if err != nil {
-						return nil, err
-					}
-
-					if err := t.Reshape(t.Shape().TotalSize()); err != nil {
-						return nil, err
-					}
-
-					return native.VectorF32(t.(*tensor.Dense))
-				})
-			}
-		}
-
-		out = append(out, &ggml.Tensor{
-			Name:     t.Name(),
-			Kind:     t.Kind(),
-			Shape:    t.Shape(),
-			WriterTo: t,
-		})
-	}
-
-	return out
-}
-
-func (m *gemma3nModel) Replacements() []string {
-	return []string{
-		"model.language_model.embed_tokens_per_layer", "per_layer_token_embd",
-		"model.language_model.embed_tokens", "token_embd",
-		"model.language_model.per_layer_model_projection", "per_layer_model_proj",
-		"model.language_model.per_layer_projection_norm", "per_layer_proj_norm", "model.language_model.altup_projections", "altup_proj",
-		"model.language_model.altup_unembed_projections", "altup_unembd_proj",
-		"model.language_model.norm", "output_norm",
-		"model.language_model.layers", "blk",
-
-		"input_layernorm", "attn_norm",
-		"self_attn.q_proj", "attn_q",
-		"self_attn.q_norm", "attn_q_norm",
-		"self_attn.k_proj", "attn_k",
-		"self_attn.k_norm", "attn_k_norm",
-		"self_attn.v_proj", "attn_v",
-		"self_attn.o_proj", "attn_output",
-		"post_attention_layernorm", "post_attention_norm",
-		"pre_feedforward_layernorm", "ffn_norm",
-		"mlp.gate_proj", "ffn_gate",
-		"mlp.up_proj", "ffn_up",
-		"mlp.down_proj", "ffn_down",
-		"post_feedforward_layernorm", "post_ffw_norm",
-		"per_layer_input_gate", "inp_gate",
-		"per_layer_projection", "proj",
-		"post_per_layer_input_norm", "post_norm",
-		"altup.", "altup_",
-		"modality_router", "router",
-		"prediction_coefs", "predict_coef",
-		"correction_coefs", "correct_coef",
-		"correct_output_scale", "correct_scale.weight",
-		"laurel.", "laurel_",
-		"linear_left", "l",
-		"linear_right", "r",
-		"post_laurel_norm", "post_norm",
-	}
-}
--- a/convert/convert_gptoss.go
+++ b/convert/convert_gptoss.go
@@ -1,178 +0,0 @@
-package convert
-
-import (
-	"bytes"
-	"cmp"
-	"encoding/binary"
-	"io"
-	"slices"
-	"strings"
-
-	"github.com/ollama/ollama/fs/ggml"
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-)
-
-type gptossModel struct {
-	ModelParameters
-	HiddenLayers         uint32  `json:"num_hidden_layers"`
-	HiddenSize           uint32  `json:"hidden_size"`
-	IntermediateSize     uint32  `json:"intermediate_size"`
-	AttentionHeads       uint32  `json:"num_attention_heads"`
-	KeyValueHeads        uint32  `json:"num_key_value_heads"`
-	HeadDim              uint32  `json:"head_dim"`
-	Experts              uint32  `json:"num_experts"`
-	ExpertsPerToken      uint32  `json:"experts_per_token"`
-	RMSNormEpsilon       float32 `json:"rms_norm_eps"`
-	InitialContextLength uint32  `json:"initial_context_length"`
-	RopeTheta            float32 `json:"rope_theta"`
-	RopeScalingFactor    float32 `json:"rope_scaling_factor"`
-	SlidingWindow        uint32  `json:"sliding_window"`
-}
-
-var _ ModelConverter = (*gptossModel)(nil)
-
-func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
-	kv := m.ModelParameters.KV(t)
-	kv["general.architecture"] = "gptoss"
-	kv["general.file_type"] = uint32(4)
-	kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
-	kv["gptoss.block_count"] = m.HiddenLayers
-	kv["gptoss.embedding_length"] = m.HiddenSize
-	kv["gptoss.feed_forward_length"] = m.IntermediateSize
-	kv["gptoss.expert_count"] = m.Experts
-	kv["gptoss.expert_used_count"] = m.ExpertsPerToken
-	kv["gptoss.attention.head_count"] = m.AttentionHeads
-	kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
-	kv["gptoss.attention.key_length"] = m.HeadDim
-	kv["gptoss.attention.value_length"] = m.HeadDim
-	kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
-	kv["gptoss.attention.sliding_window"] = m.SlidingWindow
-	kv["gptoss.rope.freq_base"] = m.RopeTheta
-	kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
-	kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
-	kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
-	kv["tokenizer.ggml.add_bos_token"] = false
-	kv["tokenizer.ggml.eos_token_id"] = uint32(199999) // <|endoftext|>
-	kv["tokenizer.ggml.eos_token_ids"] = []int32{
-		199999, /* <|endoftext|> */
-		200002, /* <|return|> */
-		200012, /* <|call|> */
-	}
-	kv["tokenizer.ggml.add_eos_token"] = false
-	return kv
-}
-
-func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
-	mxfp4s := make(map[string]*mxfp4)
-	for _, t := range ts {
-		if strings.HasSuffix(t.Name(), ".blocks") || strings.HasSuffix(t.Name(), ".scales") {
-			dot := strings.LastIndex(t.Name(), ".")
-			name, suffix := t.Name()[:dot], t.Name()[dot+1:]
-			if _, ok := mxfp4s[name]; !ok {
-				mxfp4s[name] = &mxfp4{}
-			}
-
-			switch suffix {
-			case "blocks":
-				mxfp4s[name].blocks = t
-			case "scales":
-				mxfp4s[name].scales = t
-			}
-		} else {
-			out = append(out, &ggml.Tensor{
-				Name:     t.Name(),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-		}
-	}
-
-	for name, mxfp4 := range mxfp4s {
-		dims := mxfp4.blocks.Shape()
-		out = append(out, &ggml.Tensor{
-			Name:     name,
-			Kind:     uint32(ggml.TensorTypeMXFP4),
-			Shape:    []uint64{dims[0], dims[1], dims[2] * dims[3] * 2},
-			WriterTo: mxfp4,
-		})
-	}
-
-	return out
-}
-
-func (m *gptossModel) Replacements() []string {
-	return []string{
-		// noop replacements so other replacements will not be applied
-		".blocks", ".blocks",
-		".scales", ".scales",
-		// real replacements
-		"block", "blk",
-		"attn.norm", "attn_norm",
-		"attn.qkv", "attn_qkv",
-		"attn.sinks", "attn_sinks",
-		"attn.out", "attn_out",
-		"mlp.norm", "ffn_norm",
-		"mlp.gate", "ffn_gate_inp",
-		"mlp.mlp1_", "ffn_gate_up_exps.",
-		"mlp.mlp2_", "ffn_down_exps.",
-		"embedding", "token_embd",
-		"norm", "output_norm",
-		"unembedding", "output",
-		"scale", "weight",
-	}
-}
-
-type mxfp4 struct {
-	blocks, scales Tensor
-}
-
-func (m *mxfp4) WriteTo(w io.Writer) (int64, error) {
-	var b bytes.Buffer
-	if _, err := m.blocks.WriteTo(&b); err != nil {
-		return 0, err
-	}
-
-	blocksDims := make([]int, len(m.blocks.Shape()))
-	for i, d := range m.blocks.Shape() {
-		blocksDims[i] = int(d)
-	}
-
-	var blocks tensor.Tensor = tensor.New(tensor.WithShape(blocksDims...), tensor.WithBacking(b.Bytes()))
-
-	var s bytes.Buffer
-	if _, err := m.scales.WriteTo(&s); err != nil {
-		return 0, err
-	}
-
-	scalesDims := slices.Repeat([]int{1}, len(m.blocks.Shape()))
-	for i, d := range m.scales.Shape() {
-		scalesDims[i] = int(d)
-	}
-
-	var scales tensor.Tensor = tensor.New(tensor.WithShape(scalesDims...), tensor.WithBacking(s.Bytes()))
-
-	out, err := tensor.Concat(3, scales, blocks)
-	if err != nil {
-		return 0, err
-	}
-
-	out = tensor.Materialize(out)
-
-	if err := out.Reshape(out.Shape().TotalSize()); err != nil {
-		return 0, err
-	}
-
-	u8s, err := native.VectorU8(out.(*tensor.Dense))
-	if err != nil {
-		return 0, err
-	}
-
-	if err := binary.Write(w, binary.LittleEndian, u8s); err != nil {
-		return 0, err
-	}
-
-	return 0, nil
-}
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@@ -2,6 +2,9 @@ package convert

 import (
 	"fmt"
+	"io"
+	"slices"
+	"strings"

 	"github.com/ollama/ollama/fs/ggml"
 )
@@ -27,38 +30,65 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 }

 func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	merges := make([]merge, 0, p.NumHiddenLayers*6)
-	for i := range p.NumHiddenLayers {
-		merges = append(merges, merge{
-			fmt.Sprintf("blk.%d.*.w1.weight", i),
-			fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
-		}, merge{
-			fmt.Sprintf("blk.%d.*.w1.bias", i),
-			fmt.Sprintf("blk.%d.ffn_gate_exps.bias", i),
-		}, merge{
-			fmt.Sprintf("blk.%d.*.w2.weight", i),
-			fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
-		}, merge{
-			fmt.Sprintf("blk.%d.*.w2.bias", i),
-			fmt.Sprintf("blk.%d.ffn_up_exps.bias", i),
-		}, merge{
-			fmt.Sprintf("blk.%d.*.w3.weight", i),
-			fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
-		}, merge{
-			fmt.Sprintf("blk.%d.*.w3.bias", i),
-			fmt.Sprintf("blk.%d.ffn_down_exps.bias", i),
+	oldnew := []string{
+		"model.layers", "blk",
+		"w1", "ffn_gate_exps",
+		"w2", "ffn_down_exps",
+		"w3", "ffn_up_exps",
+	}
+
+	for i := range p.NumLocalExperts {
+		oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".")
+	}
+
+	// group experts of the same layer (model.layers.%d) and type (w[123]) into a single tensor
+	namer := strings.NewReplacer(oldnew...)
+	experts := make(map[string]experts)
+
+	// merge experts into a single tensor while removing them from ts
+	ts = slices.DeleteFunc(ts, func(t Tensor) bool {
+		if !strings.Contains(t.Name(), ".block_sparse_moe.experts.") {
+			return false
+		}
+
+		name := namer.Replace(t.Name())
+		experts[name] = append(experts[name], t)
+		return true
+	})
+
+	var out []*ggml.Tensor
+	for n, e := range experts {
+		// TODO(mxyng): sanity check experts
+		out = append(out, &ggml.Tensor{
+			Name:     n,
+			Kind:     e[0].Kind(),
+			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),
+			WriterTo: e,
 		})
 	}

-	out, ts := mergeTensors(ts, merges...)
 	return append(out, p.llamaModel.Tensors(ts)...)
 }

 func (p *mixtralModel) Replacements() []string {
 	return append(
 		p.llamaModel.Replacements(),
-		"model.layers", "blk",
 		"block_sparse_moe.gate", "ffn_gate_inp",
-		"block_sparse_moe.experts.", ".",
 	)
 }
+
+type experts []Tensor
+
+func (e experts) WriteTo(w io.Writer) (int64, error) {
+	// TODO(mxyng): experts _should_ be numerically sorted by expert but this should check
+	for _, t := range e {
+		// the canonical merged experts tensor stacks all experts along a new, 0 axis,
+		// e.g. `tensor.Stack(0, e[0], e[1:]...)`, which requires allocating temporary buffers
+		// this accomplishes the same thing by writing each expert tensor in sequence
+		if _, err := t.WriteTo(w); err != nil {
+			return 0, err
+		}
+	}
+
+	return 0, nil
+}
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -11,13 +11,14 @@ import (
 	"io"
 	"io/fs"
 	"log/slog"
-	"maps"
 	"os"
 	"path/filepath"
 	"slices"
 	"strings"
 	"testing"

+	"golang.org/x/exp/maps"
+
 	"github.com/ollama/ollama/fs/ggml"
 )

@@ -136,7 +137,9 @@ func TestConvertModel(t *testing.T) {
 				t.Fatal(err)
 			}

-			for _, k := range slices.Sorted(maps.Keys(expect)) {
+			keys := maps.Keys(expect)
+			slices.Sort(keys)
+			for _, k := range keys {
 				if v, ok := actual[k]; !ok {
 					t.Errorf("missing %s", k)
 				} else if v != expect[k] {
@@ -340,7 +343,9 @@ func TestConvertAdapter(t *testing.T) {

 			actual := generateResultsJSON(t, r, m.KV(), m.Tensors())

-			for _, k := range slices.Sorted(maps.Keys(c.Expected)) {
+			keys := maps.Keys(c.Expected)
+			slices.Sort(keys)
+			for _, k := range keys {
 				if v, ok := actual[k]; !ok {
 					t.Errorf("missing %s", k)
 				} else if v != c.Expected[k] {
--- a/convert/reader.go
+++ b/convert/reader.go
@@ -31,10 +31,8 @@ func (t tensorBase) Shape() []uint64 {
 }

 const (
-	tensorKindFP32 uint32 = iota
-	tensorKindFP16
-	tensorKindMXFP4 = 4
-	tensorKindBF16  = 30
+	tensorKindF32 uint32 = iota
+	tensorKindF16
 )

 func (t tensorBase) Kind() uint32 {
@@ -45,16 +43,16 @@ func (t tensorBase) Kind() uint32 {
 		t.name == "v.pre_tile_position_embd.weight" ||
 		t.name == "v.post_tile_position_embd.weight" {
 		// these tensors are always F32
-		return tensorKindFP32
+		return 0
 	}

 	switch len(t.shape) {
 	case 0:
 		panic("invalid tensor shape")
 	case 1:
-		return tensorKindFP32
+		return tensorKindF32
 	default:
-		return tensorKindFP16
+		return tensorKindF16
 	}
 }

--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@@ -8,12 +8,12 @@ import (
 	"fmt"
 	"io"
 	"io/fs"
-	"maps"
 	"slices"
 	"strings"

 	"github.com/d4l3k/go-bfloat16"
 	"github.com/x448/float16"
+	"golang.org/x/exp/maps"
 )

 type safetensorMetadata struct {
@@ -46,7 +46,8 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
 			return nil, err
 		}

-		keys := slices.Sorted(maps.Keys(headers))
+		keys := maps.Keys(headers)
+		slices.Sort(keys)

 		names := make(map[string]struct{}, len(keys))

@@ -93,15 +94,6 @@ type safetensor struct {
 	*tensorBase
 }

-func (st safetensor) Kind() uint32 {
-	kind := st.tensorBase.Kind()
-	if st.dtype == "BF16" && kind != tensorKindFP32 {
-		kind = tensorKindBF16
-	}
-
-	return kind
-}
-
 func (st safetensor) Clone() Tensor {
 	return &safetensor{
 		fs:     st.fs,
@@ -159,9 +151,6 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
 		}

 		f32s = bfloat16.DecodeFloat32(u8s)
-	case "U8":
-		// U8 tensors do not support repacking or type conversion.
-		return io.CopyN(w, f, st.size)
 	default:
 		return 0, fmt.Errorf("unknown data type: %s", st.dtype)
 	}
@@ -174,18 +163,15 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
 	}

 	switch st.Kind() {
-	case tensorKindFP32:
+	case tensorKindF32:
 		return 0, binary.Write(w, binary.LittleEndian, f32s)
-	case tensorKindFP16:
+	case tensorKindF16:
 		f16s := make([]uint16, len(f32s))
 		for i := range f32s {
 			f16s[i] = float16.Fromfloat32(f32s[i]).Bits()
 		}

 		return 0, binary.Write(w, binary.LittleEndian, f16s)
-	case tensorKindBF16:
-		u8s := bfloat16.EncodeFloat32(f32s)
-		return 0, binary.Write(w, binary.LittleEndian, u8s)
 	default:
 		return 0, fmt.Errorf("unknown storage type: %d", st.Kind())
 	}
--- a/convert/tensor.go
+++ b/convert/tensor.go
@@ -2,9 +2,7 @@ package convert

 import (
 	"cmp"
-	"io"
 	"iter"
-	"path"
 	"slices"
 	"strings"

@@ -76,54 +74,3 @@ func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
 		}
 	}
 }
-
-type merge struct {
-	pattern, name string
-}
-
-// mergeTensors merges tensors that match a given pattern into a single tensor.
-func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []Tensor) {
-	var matched []Tensor
-	for i := range merges {
-		matched, unmatched = slicesSplitFunc(unmatched, func(t Tensor) bool {
-			matched, _ := path.Match(merges[i].pattern, t.Name())
-			return matched
-		})
-
-		if len(matched) > 0 {
-			out = append(out, &ggml.Tensor{
-				Name:     merges[i].name,
-				Kind:     matched[0].Kind(),
-				Shape:    append([]uint64{uint64(len(matched))}, matched[0].Shape()...),
-				WriterTo: mergeGroup(matched),
-			})
-		}
-	}
-
-	return out, unmatched
-}
-
-// slicesSplitFunc splits a slice into two slices based on a predicate function.
-func slicesSplitFunc[S ~[]E, E comparable](s S, fn func(e E) bool) (matched, unmatched S) {
-	for _, e := range s {
-		if fn(e) {
-			matched = append(matched, e)
-		} else {
-			unmatched = append(unmatched, e)
-		}
-	}
-
-	return matched, unmatched
-}
-
-type mergeGroup []Tensor
-
-func (g mergeGroup) WriteTo(w io.Writer) (int64, error) {
-	for _, t := range g {
-		if _, err := t.WriteTo(w); err != nil {
-			return 0, err
-		}
-	}
-
-	return 0, nil
-}
--- a/convert/tensor_test.go
+++ b/convert/tensor_test.go
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -8,10 +8,11 @@ import (
 	"fmt"
 	"io/fs"
 	"log/slog"
-	"maps"
 	"os"
 	"slices"
 	"strings"
+
+	"golang.org/x/exp/maps"
 )

 const (
@@ -259,8 +260,11 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
 		tokens[token.ID] = token
 	}

+	keys := maps.Keys(tokens)
+	slices.Sort(keys)
+
 	v := Vocabulary{Model: "gpt2"}
-	for _, k := range slices.Sorted(maps.Keys(tokens)) {
+	for _, k := range keys {
 		token := tokens[k]
 		v.Tokens = append(v.Tokens, token.Content)
 		v.Scores = append(v.Scores, float32(token.ID))
--- a/discover/amd_linux.go
+++ b/discover/amd_linux.go
@@ -58,7 +58,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	driverMajor, driverMinor, err := AMDDriverVersion()
 	if err != nil {
 		// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
-		slog.Warn("ollama recommends running the https://www.amd.com/en/support/download/linux-drivers.html", "error", err)
+		slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
 	}

 	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
--- a/discover/cuda_common.go
+++ b/discover/cuda_common.go
@@ -3,7 +3,6 @@
 package discover

 import (
-	"fmt"
 	"log/slog"
 	"os"
 	"regexp"
@@ -56,13 +55,10 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
 				}
 			}
 		}
-		return "sbsa"
 	}

 	// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
 	if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
-		// The detected driver is older than Feb 2023
-		slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
 		return "v11"
 	}
 	return "v12"
--- a/discover/path.go
+++ b/discover/path.go
@@ -12,7 +12,7 @@ import (
 // '../lib/ollama' on Linux and the executable's directory on macOS
 // note: distribution builds, additional GPU-specific libraries are
 // found in subdirectories of the returned path, such as
-// 'cuda_v12', 'rocm', etc.
+// 'cuda_v11', 'cuda_v12', 'rocm', etc.
 var LibOllamaPath string = func() string {
 	exe, err := os.Executable()
 	if err != nil {
--- a/docs/README.md
+++ b/docs/README.md
@@ -4,7 +4,6 @@
 * [Quickstart](../README.md#quickstart)
 * [Examples](./examples.md)
 * [Importing models](./import.md)
-* [MacOS Documentation](./macos.md)
 * [Linux Documentation](./linux.md)
 * [Windows Documentation](./windows.md)
 * [Docker Documentation](./docker.md)
--- a/docs/api.md
+++ b/docs/api.md
@@ -500,30 +500,21 @@ The `message` object has the following fields:
 - `thinking`: (for thinking models) the model's thinking process
 - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
 - `tool_calls` (optional): a list of tools in JSON that the model wants to use
- `tool_name` (optional): add the name of the tool that was executed to inform the model of the result

 Advanced parameters (optional):

- `format`: the format to return a response in. Format can be `json` or a JSON schema.
+- `format`: the format to return a response in. Format can be `json` or a JSON schema. 
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)

-### Tool calling
-
-Tool calling is supported by providing a list of tools in the `tools` parameter. The model will generate a response that includes a list of tool calls. See the [Chat request (Streaming with tools)](#chat-request-streaming-with-tools) example below.
-
-Models can also explain the result of the tool call in the response. See the [Chat request (With history, with tools)](#chat-request-with-history-with-tools) example below.
-
-[See models with tool calling capabilities](https://ollama.com/search?c=tool).
-
 ### Structured outputs

 Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.

 ### Examples

-#### Chat request (Streaming)
+#### Chat request (Streaming)

 ##### Request

@@ -578,88 +569,6 @@ Final response:
 }
 ```

-#### Chat request (Streaming with tools)
-
-##### Request
-
-```shell
-curl http://localhost:11434/api/chat -d '{
-  "model": "llama3.2",
-  "messages": [
-    {
-      "role": "user",
-      "content": "what is the weather in tokyo?"
-    }
-  ],
-  "tools": [
-    {
-      "type": "function",
-      "function": {
-        "name": "get_weather",
-        "description": "Get the weather in a given city",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "city": {
-              "type": "string",
-              "description": "The city to get the weather for"
-            }
-          },
-          "required": ["city"]
-        }
-      }
-    }
-  ],
-  "stream": true
-}'
-```
-
-##### Response
-
-A stream of JSON objects is returned:
-```json
-{
-    "model": "llama3.2",
-    "created_at": "2025-07-07T20:22:19.184789Z",
-    "message": {
-        "role": "assistant",
-        "content": "",
-        "tool_calls": [
-            {
-                "function": {
-                    "name": "get_weather",
-                    "arguments": {
-                        "city": "Tokyo"
-                    }
-                },
-            }
-        ]
-    },
-    "done": false
-}
-```
-
-Final response:
-
-```json
-{
-  "model":"llama3.2",
-  "created_at":"2025-07-07T20:22:19.19314Z",
-  "message": {
-    "role": "assistant",
-    "content": ""
-  },
-  "done_reason": "stop",
-  "done": true,
-  "total_duration": 182242375,
-  "load_duration": 41295167,
-  "prompt_eval_count": 169,
-  "prompt_eval_duration": 24573166,
-  "eval_count": 15,
-  "eval_duration": 115959084
-}
-```
-
 #### Chat request (No streaming)

 ##### Request
@@ -697,74 +606,6 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```

-#### Chat request (No streaming, with tools)
-
-##### Request
-
-
-```shell
-curl http://localhost:11434/api/chat -d '{
-  "model": "llama3.2",
-  "messages": [
-    {
-      "role": "user",
-      "content": "what is the weather in tokyo?"
-    }
-  ],
-  "tools": [
-    {
-      "type": "function",
-      "function": {
-        "name": "get_weather",
-        "description": "Get the weather in a given city",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "city": {
-              "type": "string",
-              "description": "The city to get the weather for"
-            }
-          },
-          "required": ["city"]
-        }
-      }
-    }
-  ],
-  "stream": false 
-}'
-```
-
-##### Response
-
-```json
-{
-  "model": "llama3.2",
-  "created_at": "2025-07-07T20:32:53.844124Z",
-  "message": {
-    "role": "assistant",
-    "content": "",
-    "tool_calls": [
-      {
-        "function": {
-          "name": "get_weather",
-          "arguments": {
-            "city": "Tokyo"
-          }
-        },
-      }
-    ]
-  },
-  "done_reason": "stop",
-  "done": true,
-  "total_duration": 3244883583,
-  "load_duration": 2969184542,
-  "prompt_eval_count": 169,
-  "prompt_eval_duration": 141656333,
-  "eval_count": 18,
-  "eval_duration": 133293625
-}
-```
-
 #### Chat request (Structured outputs)

 ##### Request
@@ -871,87 +712,6 @@ Final response:
 }
 ```

-
-#### Chat request (With history, with tools)
-
-##### Request
-
-```shell
-curl http://localhost:11434/api/chat -d '{
-  "model": "llama3.2",
-  "messages": [
-    {
-      "role": "user",
-      "content": "what is the weather in Toronto?"
-    },
-    // the message from the model appended to history
-    {
-      "role": "assistant",
-      "content": "",
-      "tool_calls": [
-        {
-          "function": {
-            "name": "get_temperature",
-            "arguments": {
-              "city": "Toronto"
-            }
-          },
-        }
-      ]
-    },
-    // the tool call result appended to history
-    {
-      "role": "tool",
-      "content": "11 degrees celsius",
-      "tool_name": "get_temperature",
-    }
-  ],
-  "stream": false,
-  "tools": [
-    {
-      "type": "function",
-      "function": {
-        "name": "get_weather",
-        "description": "Get the weather in a given city",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "city": {
-              "type": "string",
-              "description": "The city to get the weather for"
-            }
-          },
-          "required": ["city"]
-        }
-      }
-    }
-  ]
-}'
-```
-
-##### Response
-
-```json
-{
-  "model": "llama3.2",
-  "created_at": "2025-07-07T20:43:37.688511Z",
-  "message": {
-    "role": "assistant",
-    "content": "The current temperature in Toronto is 11°C."
-  },
-  "done_reason": "stop",
-  "done": true,
-  "total_duration": 890771750,
-  "load_duration": 707634750,
-  "prompt_eval_count": 94,
-  "prompt_eval_duration": 91703208,
-  "eval_count": 11,
-  "eval_duration": 90282125
-}
-
-```
-
-
 #### Chat request (with images)

 ##### Request
@@ -1593,7 +1353,7 @@ Then there is a series of downloading responses. Until any of the download is co

 ```json
 {
-  "status": "pulling digestname",
+  "status": "downloading digestname",
  "digest": "digestname",
  "total": 2142590208,
  "completed": 241970
--- a/docs/benchmark.md
+++ b/docs/benchmark.md
@@ -0,0 +1,59 @@
+# Benchmark
+
+Go benchmark tests that measure end-to-end performance of a running Ollama server. Run these tests to evaluate model inference performance on your hardware and measure the impact of code changes.
+
+## When to use
+
+Run these benchmarks when:
+- Making changes to the model inference engine
+- Modifying model loading/unloading logic
+- Changing prompt processing or token generation code
+- Implementing a new model architecture
+- Testing performance across different hardware setups
+
+## Prerequisites
+- Ollama server running locally with `ollama serve` on `127.0.0.1:11434`
+## Usage and Examples
+
+>[!NOTE]
+>All commands must be run from the root directory of the Ollama project.
+
+Basic syntax:
+```bash
+go test -bench=. ./benchmark/... -m $MODEL_NAME
+```
+
+Required flags:
+- `-bench=.`: Run all benchmarks
+- `-m`: Model name to benchmark
+
+Optional flags:
+- `-count N`: Number of times to run the benchmark (useful for statistical analysis)
+- `-timeout T`: Maximum time for the benchmark to run (e.g. "10m" for 10 minutes)
+
+Common usage patterns:
+
+Single benchmark run with a model specified:
+```bash
+go test -bench=. ./benchmark/... -m llama3.3
+```
+
+## Output metrics
+
+The benchmark reports several key metrics:
+
+- `gen_tok/s`: Generated tokens per second
+- `prompt_tok/s`: Prompt processing tokens per second
+- `ttft_ms`: Time to first token in milliseconds
+- `load_ms`: Model load time in milliseconds
+- `gen_tokens`: Total tokens generated
+- `prompt_tokens`: Total prompt tokens processed
+
+Each benchmark runs two scenarios:
+- Cold start: Model is loaded from disk for each test
+- Warm start: Model is pre-loaded in memory
+
+Three prompt lengths are tested for each scenario:
+- Short prompt (100 tokens)
+- Medium prompt (500 tokens)
+- Long prompt (1000 tokens)
--- a/docs/development.md
+++ b/docs/development.md
@@ -118,7 +118,7 @@ To run tests, use `go test`:
 go test ./...
 ```

-> NOTE: In rare circumstances, you may need to change a package using the new
+> NOTE: In rare circumstances, you may need to change a package using the new
 > "synctest" package in go1.24.
 >
 > If you do not have the "synctest" package enabled, you will not see build or
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -20,9 +20,9 @@ Please refer to the [GPU docs](./gpu.md).

 ## How can I specify the context window size?

-By default, Ollama uses a context window size of 4096 tokens for most models. The `gpt-oss` model has a default context window size of 8192 tokens.
+By default, Ollama uses a context window size of 4096 tokens. 

-This can be overridden in Settings in the Windows and macOS App, or with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
+This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use: 

 ```shell
 OLLAMA_CONTEXT_LENGTH=8192 ollama serve
@@ -46,8 +46,6 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

-Setting the context length higher may cause the model to not be able to fit onto the GPU which make the model run more slowly.
-
 ## How can I tell if my model was loaded onto the GPU?

 Use the `ollama ps` command to see what models are currently loaded into memory.
@@ -59,8 +57,8 @@ ollama ps
 > **Output**:
 >
 > ```
-> NAME           ID              SIZE     PROCESSOR    CONTEXT    UNTIL
-> gpt-oss:20b    05afbac4bad6    16 GB    100% GPU     8192       4 minutes from now
+> NAME      	ID          	SIZE 	PROCESSOR	UNTIL
+> llama3:70b	bcfb190ca3a7	42 GB	100% GPU 	4 minutes from now
 > ```

 The `Processor` column will show which memory the model was loaded in to:
@@ -150,11 +148,9 @@ docker build -t ollama-with-ca .
 docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-with-ca
 ```

-## Does Ollama send my prompts and responses back to ollama.com?
+## Does Ollama send my prompts and answers back to ollama.com?

-If you're running a model locally, your prompts and responses will always stay on your machine. Ollama Turbo in the App allows you to run your queries on Ollama's servers if you don't have a powerful enough GPU. Web search lets a model query the web, giving you more accurate and up-to-date information. Both Turbo and web search require sending your prompts and responses to Ollama.com. This data is neither logged nor stored.
-
-If you don't want to see the Turbo and web search options in the app, you can disable them in Settings by turning on Airplane mode. In Airplane mode, all models will run locally, and your prompts and responses will stay on your machine.
+No. Ollama runs locally, and conversation data does not leave your machine.

 ## How can I expose Ollama on my network?

@@ -296,7 +292,7 @@ If too many requests are sent to the server, it will respond with a 503 error in

 ## How does Ollama handle concurrent requests?

-Ollama supports two levels of concurrent processing.  If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time.  For a given model, if there is sufficient available memory when the model is loaded, it can be configured to allow parallel request processing.
+Ollama supports two levels of concurrent processing.  If your system has sufficient available memory (system memory when using CPU inference, or VRAM for GPU inference) then multiple models can be loaded at the same time.  For a given model, if there is sufficient available memory when the model is loaded, it is configured to allow parallel request processing.

 If there is insufficient available memory to load a new model request while one or more models are already loaded, all new requests will be queued until the new model can be loaded.  As prior models become idle, one or more will be unloaded to make room for the new model.  Queued requests will be processed in order.  When using GPU inference new models must be able to completely fit in VRAM to allow concurrent model loads.

@@ -305,7 +301,7 @@ Parallel request processing for a given model results in increasing the context
 The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms:

 - `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory.  The default is 3 * the number of GPUs or 3 for CPU inference.
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time.  The default is 1, and will handle 1 request per model at a time.
+- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time.  The default will auto-select either 4 or 1 based on available memory.
 - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512

 Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting.  Once ROCm v6.2 is available, Windows Radeon will follow the defaults above.  You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM.
@@ -337,16 +333,3 @@ The currently available K/V cache quantization types are:
 How much the cache quantization impacts the model's response quality will depend on the model and the task.  Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.

 You may need to experiment with different quantization types to find the best balance between memory usage and quality.
-
-## How can I stop Ollama from starting when I login to my computer
-
-Ollama for Windows and macOS register as a login item during installation.  You can disable this if you prefer not to have Ollama automatically start.  Ollama will respect this setting across upgrades, unless you uninstall the application.
-
-**Windows**
- Remove `%APPDATA%\Microsoft\Windows\Start Menu\Programs\Startup\Ollama.lnk`
-
-**MacOS Monterey (v12)**
- Open `Settings` -> `Users & Groups` -> `Login Items` and find the `Ollama` entry, then click the `-` (minus) to remove
-
-**MacOS Ventura (v13) and later**
- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable.
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -1,14 +1,12 @@
 # GPU
 ## Nvidia
-Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.
+Ollama supports Nvidia GPUs with compute capability 5.0+.

 Check your compute compatibility to see if your card is supported:
 [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)

 | Compute Capability | Family              | Cards                                                                                                       |
 | ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
-| 12.0               | GeForce RTX 50xx    | `RTX 5060` `RTX 5060 Ti` `RTX 5070` `RTX 5070 Ti` `RTX 5080` `RTX 5090`                                     |
-|                    | NVIDIA Professioal  | `RTX PRO 4000 Blackwell` `RTX PRO 4500 Blackwell` `RTX PRO 5000 Blackwell` `RTX PRO 6000 Blackwell`         |
 | 9.0                | NVIDIA              | `H200` `H100`                                                                                               |
 | 8.9                | GeForce RTX 40xx    | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060`  |
 |                    | NVIDIA Professional | `L4` `L40` `RTX 6000`                                                                                       |
--- a/docs/import.md
+++ b/docs/import.md
@@ -53,8 +53,6 @@ FROM /path/to/safetensors/directory

 If you create the Modelfile in the same directory as the weights, you can use the command `FROM .`.

-If you do not create the Modelfile, ollama will act as if there was a Modelfile with the command `FROM .`.
-
 Now run the `ollama create` command from the directory where you created the `Modelfile`:

 ```shell
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -16,7 +16,7 @@ curl -fsSL https://ollama.com/install.sh | sh
 Download and extract the package:

 ```shell
-curl -LO https://ollama.com/download/ollama-linux-amd64.tgz
+curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
 sudo tar -C /usr -xzf ollama-linux-amd64.tgz
 ```

--- a/docs/macos.md
+++ b/docs/macos.md
@@ -1,42 +0,0 @@
-# Ollama for macOS
-
-## System Requirements
-
-* MacOS Monterey (v12) or newer
-* Apple M series (CPU and GPU support) or x86 (CPU only)
-
-
-## Filesystem Requirements
-
-The preferred method of installation is to mount the `ollama.dmg` and drag-and-drop the Ollama application to the system-wide `Applications` folder.  Upon startup, the Ollama app will verify the `ollama` CLI is present in your PATH, and if not detected, will prompt for permission to create a link in `/usr/local/bin`
-
-Once you've installed Ollama, you'll need additional space for storing the Large Language models, which can be tens to hundreds of GB in size.  If your home directory doesn't have enough space, you can change where the binaries are installed, and where the models are stored.
-
-### Changing Install Location
-
-To install the Ollama application somewhere other than `Applications`, place the Ollama application in the desired location, and ensure the CLI `Ollama.app/Contents/Resources/ollama` or a sym-link to the CLI can be found in your path.  Upon first start decline the "Move to Applications?" request.
-
-
-## Troubleshooting
-
-Ollama on MacOS stores files in a few different locations.
- `~/.ollama` contains models and configuration
- `~/.ollama/logs` contains logs
-    - *app.log* contains most recent logs from the GUI application
-    - *server.log* contains the most recent server logs
- `<install location>/Ollama.app/Contents/Resources/ollama` the CLI binary
-
-## Uninstall
-
-To fully remove Ollama from your system, remove the following files and folders:
-
-```
-sudo rm -rf /Applications/Ollama.app
-sudo rm /usr/local/bin/ollama
-rm -rf "~/Library/Application Support/Ollama"
-rm -rf "~/Library/Saved Application State/com.electron.ollama.savedState"
-rm -rf ~/Library/Caches/com.electron.ollama/
-rm -rf ~/Library/Caches/ollama
-rm -rf ~/Library/WebKit/com.electron.ollama
-rm -rf ~/.ollama
-```
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -150,7 +150,7 @@ PARAMETER <parameter> <parametervalue>

 | Parameter      | Description                                                                                                                                                                                                                                             | Value Type | Example Usage        |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
-| num_ctx        | Sets the size of the context window used to generate the next token. (Default: 4096)                                                                                                                                                                    | int        | num_ctx 4096         |
+| num_ctx        | Sets the size of the context window used to generate the next token. (Default: 2048)                                                                                                                                                                    | int        | num_ctx 4096         |
 | repeat_last_n  | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)                                                                                                                                           | int        | repeat_last_n 64     |
 | repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)                                                                     | float      | repeat_penalty 1.1   |
 | temperature    | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)                                                                                                                                     | float      | temperature 0.7      |
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -72,7 +72,7 @@ client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
 # Define the schema for the response
 class FriendInfo(BaseModel):
    name: str
-    age: int
+    age: int 
    is_available: bool

 class FriendList(BaseModel):
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -9,7 +9,7 @@ cat ~/.ollama/logs/server.log
 On **Linux** systems with systemd, the logs can be found with this command:

 ```shell
-journalctl -u ollama --no-pager --follow --pager-end
+journalctl -u ollama --no-pager --follow --pager-end 
 ```

 When you run Ollama in a **container**, the logs go to stdout/stderr in the container:
@@ -23,7 +23,7 @@ docker logs <container-name>
 If manually running `ollama serve` in a terminal, the logs will be on that terminal.

 When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` to view logs.  The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
+- `explorer %LOCALAPPDATA%\Ollama` to view logs.  The most recent server logs will be in `server.log` and older logs will be in `server-#.log` 
 - `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
 - `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored

@@ -38,12 +38,12 @@ Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.

 ## LLM libraries

-Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can workaround this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx` and the slowest but most compatible is `cpu`. Rosetta emulation under MacOS will work with the `cpu` library.
+Ollama includes multiple LLM libraries compiled for different GPUs and CPU vector features. Ollama tries to pick the best one based on the capabilities of your system. If this autodetection has problems, or you run into other problems (e.g. crashes in your GPU) you can work around this by forcing a specific LLM library. `cpu_avx2` will perform the best, followed by `cpu_avx` and the slowest but most compatible is `cpu`. Rosetta emulation under MacOS will work with the `cpu` library.

 In the server log, you will see a message that looks something like this (varies from release to release):

 ```
-Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
+Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
 ```

 **Experimental LLM Library Override**
@@ -97,7 +97,7 @@ If none of those resolve the problem, gather additional information and file an

 On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device.  If permissions are not set up correctly, Ollama will detect this and report an error in the server log.

-When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.   For example, in the following output `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44`
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.   For example, in the following output `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44` 

 If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
 - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries.  This can help show more detailed error codes that can help troubleshoot problems
--- a/docs/turbo.md
+++ b/docs/turbo.md
@@ -1,107 +0,0 @@
-# Turbo
-
-> ⚠️ Turbo is preview
-
-Ollama’s [Turbo](https://ollama.com/turbo) is a new way to run open-source models with acceleration from datacenter-grade hardware.
-
-Currently, the following models are available in Turbo:
-
- `gpt-oss:20b`
- `gpt-oss:120b`
-
-## Get started
-
-### Ollama for macOS & Windows
-
-Download Ollama
-
- Select a model such as `gpt-oss:20b` or `gpt-oss:120b`
- Click on **Turbo**. You’ll be prompted to create an account or sign in
-
-### Ollama’s CLI
-
- [Sign up](https://ollama.com/signup) for an Ollama account
- Add your Ollama key [to ollama.com](https://ollama.com/settings/keys).
-
-  On macOS and Linux:
-
-  ```shell
-  cat ~/.ollama/id_ed25519.pub
-  ```
-
-  On Windows:
-
-  ```
-  type "%USERPROFILE%\.ollama\id_ed25519.pub"
-  ```
-
- Then run a model setting `OLLAMA_HOST` to `ollama.com`:
-  ```shell
-  OLLAMA_HOST=ollama.com ollama run gpt-oss:120b
-  ```
-
-### Ollama’s Python library
-
- Download Ollama's [Python library](https://github.com/ollama/ollama-python)
- [Sign up](https://ollama.com/signup) for an Ollama account
- Create an API key by visiting https://ollama.com/settings/keys
-
-```python
-from ollama import Client
-
-client = Client(
-    host="https://ollama.com",
-    headers={'Authorization': '<api key>'}
-)
-
-messages = [
-  {
-    'role': 'user',
-    'content': 'Why is the sky blue?',
-  },
-]
-
-for part in client.chat('gpt-oss:120b', messages=messages, stream=True):
-  print(part['message']['content'], end='', flush=True)
-```
-
-### Ollama’s JavaScript library
-
- Download Ollama's [JavaScript library](https://github.com/ollama/ollama-js)
- [Sign up](https://ollama.com/signup) for an Ollama account
- Create an API key by visiting https://ollama.com/settings/keys
-
-```typescript
-import { Ollama } from 'ollama';
-
-const ollama = new Ollama({
-  host: 'https://ollama.com'
-  headers: {
-	  Authorization: "Bearer <api key>"
-  }
-});
-
-const response = await ollama.chat({
-  model: 'gpt-oss:120b',
-  messages: [{ role: 'user', content: 'Explain quantum computing' }],
-  stream: true
-});
-
-for await (const part of response) {
-    process.stdout.write(part.message.content)
-}
-```
-
-### Community integrations
-
-Turbo mode is also compatible with several community integrations.
-
-#### Open WebUI
-
- Go to **settings** → **Admin settings** → **Connections**
- Under **Ollama API,** click **+**
- For the **URL** put `https://ollama.com`
- For the **API key,** create an API key on https://ollama.com/settings/keys and add it.
- Click **Save**
-
-Now, if you navigate to the model selector, Turbo models should be available under **External**.
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -30,6 +30,20 @@ To install the Ollama application in a location different than your home directo
 OllamaSetup.exe /DIR="d:\some\location"
 ```

+### Changing Model Location
+
+To change where Ollama stores the downloaded models instead of using your home directory, set the environment variable `OLLAMA_MODELS` in your user account.
+
+1. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.
+
+2. Click on _Edit environment variables for your account_.
+
+3. Edit or create a variable named `OLLAMA_MODELS` for your user account and set it to the directory where you want the models stored.
+
+4. Click OK/Apply to save.
+
+If Ollama is already running, quit the tray application and relaunch it from the Start menu, or from a new terminal started after you saved the environment variables.
+
 ## API Access

 Here's a quick example showing API access from `powershell`
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -219,7 +219,7 @@ func Uint(key string, defaultValue uint) func() uint {

 var (
 	// NumParallel sets the number of parallel model requests. NumParallel can be configured via the OLLAMA_NUM_PARALLEL environment variable.
-	NumParallel = Uint("OLLAMA_NUM_PARALLEL", 1)
+	NumParallel = Uint("OLLAMA_NUM_PARALLEL", 0)
 	// MaxRunners sets the maximum number of loaded models. MaxRunners can be configured via the OLLAMA_MAX_LOADED_MODELS environment variable.
 	MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
 	// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
--- a/fs/config.go
+++ b/fs/config.go
@@ -10,5 +10,4 @@ type Config interface {
 	Strings(string, ...[]string) []string
 	Ints(string, ...[]int32) []int32
 	Floats(string, ...[]float32) []float32
-	Bools(string, ...[]bool) []bool
 }
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -1,7 +1,6 @@
 package ggml

 import (
-	"cmp"
 	"encoding/binary"
 	"errors"
 	"fmt"
@@ -35,8 +34,7 @@ func (kv KV) Kind() string {
 }

 func (kv KV) ParameterCount() uint64 {
-	val, _ := keyValue(kv, "general.parameter_count", uint64(0))
-	return val
+	return keyValue(kv, "general.parameter_count", uint64(0))
 }

 func (kv KV) FileType() FileType {
@@ -55,27 +53,16 @@ func (kv KV) EmbeddingLength() uint64 {
 	return uint64(kv.Uint("embedding_length"))
 }

-func (kv KV) HeadCountMax() uint64 {
-	// TODO(drifkin): using the max value can cause an overestimation. In the
-	// future if array values become more popular, we can adapt the more invasive
-	// <https://github.com/ollama/ollama/pull/10225>
-	return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
+func (kv KV) HeadCount() uint64 {
+	return uint64(kv.Uint("attention.head_count"))
 }

-func (kv KV) HeadCountMin() uint64 {
-	return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
+func (kv KV) HeadCountKV() uint64 {
+	return uint64(kv.Uint("attention.head_count_kv", 1))
 }

-func (kv KV) HeadCountKVMax() uint64 {
-	return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1))
-}
-
-func (kv KV) HeadCountKVMin() uint64 {
-	return uint64(kv.UintOrMinArrayValue("attention.head_count_kv", 1))
-}
-
-func (kv KV) EmbeddingHeadCountMax() uint64 {
-	if heads := kv.HeadCountMin(); heads > 0 {
+func (kv KV) EmbeddingHeadCount() uint64 {
+	if heads := kv.HeadCount(); heads > 0 {
 		return kv.EmbeddingLength() / heads
 	}

@@ -83,11 +70,15 @@ func (kv KV) EmbeddingHeadCountMax() uint64 {
 }

 func (kv KV) EmbeddingHeadCountK() uint64 {
-	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCountMax())))
+	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
 }

 func (kv KV) EmbeddingHeadCountV() uint64 {
-	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCountMax())))
+	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
+}
+
+func (kv KV) GQA() uint64 {
+	return kv.HeadCount() / kv.HeadCountKV()
 }

 func (kv KV) ContextLength() uint64 {
@@ -99,88 +90,44 @@ func (kv KV) ChatTemplate() string {
 }

 func (kv KV) String(key string, defaultValue ...string) string {
-	val, _ := keyValue(kv, key, append(defaultValue, "")...)
-	return val
+	return keyValue(kv, key, append(defaultValue, "")...)
 }

 func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
-	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
-	return val
+	return keyValue(kv, key, append(defaultValue, 0)...)
 }

 func (kv KV) Float(key string, defaultValue ...float32) float32 {
-	val, _ := keyValue(kv, key, append(defaultValue, 0)...)
-	return val
+	return keyValue(kv, key, append(defaultValue, 0)...)
 }

 func (kv KV) Bool(key string, defaultValue ...bool) bool {
-	val, _ := keyValue(kv, key, append(defaultValue, false)...)
-	return val
-}
-
-func (kv KV) UintOrMaxArrayValue(key string, defaultValue uint32) uint32 {
-	_, max := kv.UintOrArrayValue(key, defaultValue)
-	return max
-}
-
-func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
-	min, _ := kv.UintOrArrayValue(key, defaultValue)
-	return min
-}
-
-func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
-	if u32, ok := keyValue(kv, key, uint32(0)); ok {
-		return u32, u32
-	} else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok {
-		min := slices.Min(u32s.values)
-		max := slices.Max(u32s.values)
-		return min, max
-	} else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok {
-		min := slices.Min(i32s.values)
-		max := slices.Max(i32s.values)
-		if min < 0 || max < 0 {
-			slog.Warn("array values are unexpectedly negative", "key", key, "min", min, "max", max)
-		}
-		return uint32(min), uint32(max)
-	}
-
-	return defaultValue, defaultValue
+	return keyValue(kv, key, append(defaultValue, false)...)
 }

 func (kv KV) Strings(key string, defaultValue ...[]string) []string {
-	val, _ := keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]})
-	return val.values
+	return keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]}).values
 }

 func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
-	val, _ := keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]})
-	return val.values
+	return keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]}).values
 }

 func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
-	val, _ := keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]})
-	return val.values
+	return keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]}).values
 }

 func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
-	val, _ := keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]})
-	return val.values
-}
-
-func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
-	val, _ := keyValue(kv, key, &array[bool]{values: append(defaultValue, []bool(nil))[0]})
-	return val.values
+	return keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]}).values
 }

 func (kv KV) OllamaEngineRequired() bool {
 	return slices.Contains([]string{
 		"gemma3",
-		"gemma3n",
 		"mistral3",
 		"llama4",
 		"mllama",
 		"qwen25vl",
-		"gptoss",
 	}, kv.Architecture())
 }

@@ -196,17 +143,17 @@ type arrayValueTypes interface {
 		*array[string] | *array[float32] | *array[float64] | *array[bool]
 }

-func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
+func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) T {
 	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
 		key = kv.Architecture() + "." + key
 	}

-	if val, ok := kv[key].(T); ok {
-		return val, true
+	if val, ok := kv[key]; ok {
+		return val.(T)
 	}

-	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
-	return defaultValue[0], false
+	slog.Debug("key not found", "key", key, "default", defaultValue[0])
+	return defaultValue[0]
 }

 type Tensors struct {
@@ -282,7 +229,7 @@ func (t Tensor) block() (n int) {
 }

 func (t Tensor) blockSize() uint64 {
-	return TensorType(t.Kind).BlockSize()
+	return (TensorType)(t.Kind).BlockSize()
 }

 func (t TensorType) BlockSize() uint64 {
@@ -300,7 +247,6 @@ func (t TensorType) BlockSize() uint64 {
 	case
 		2,  // Q4_0
 		3,  // Q4_1
-		4,  // MXFP4
 		6,  // Q5_0
 		7,  // Q5_1
 		8,  // Q8_0
@@ -328,8 +274,6 @@ func (t TensorType) TypeSize() uint64 {
 		return 2 + blockSize/2
 	case TensorTypeQ4_1:
 		return 2 + 2 + blockSize/2
-	case TensorTypeMXFP4:
-		return 1 + blockSize/2
 	case TensorTypeQ5_0:
 		return 2 + 4 + blockSize/2
 	case TensorTypeQ5_1:
@@ -481,22 +425,20 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {

 func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
 	embedding := f.KV().EmbeddingLength()
-	heads := f.KV().HeadCountMax()
-	headsKV := f.KV().HeadCountKVMax()
+	heads := f.KV().HeadCount()
+	headsKV := f.KV().HeadCountKV()
 	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)

-	embeddingHeads := f.KV().EmbeddingHeadCountMax()
+	embeddingHeads := f.KV().EmbeddingHeadCount()
 	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
 	embeddingHeadsV := f.KV().EmbeddingHeadCountV()

 	layers := f.Tensors().GroupLayers()

 	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
-	var kvTotal uint64
 	kv = make([]uint64, f.KV().BlockCount())
 	for i := range kv {
 		kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
-		kvTotal += kv[i]
 	}

 	switch f.KV().Architecture() {
@@ -562,7 +504,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 			// vocab graph
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)
-	case "gemma", "gemma2", "gemma3", "gemma3n":
+	case "gemma", "gemma2", "gemma3":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
 			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
@@ -575,11 +517,6 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 				embedding*embeddingHeadsK*heads*9/16,
 		)

-		if f.KV().Architecture() == "gemma3n" {
-			fullOffload *= 4
-			partialOffload *= 4
-		}
-
 		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
 		// engine. Gemma3 always uses the Ollama engine.
 		if f.KV().Architecture() == "gemma3" {
@@ -665,18 +602,6 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 					4*qkvBias.Shape[0],
 			)
 		}
-	case "gptoss":
-		kv = make([]uint64, f.KV().BlockCount())
-		for i := range kv {
-			kv[i] = uint64(float64((embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
-			if i%2 == 0 {
-				kv[i] *= (uint64(numParallel)*4096 + batch)
-			} else {
-				kv[i] *= context
-			}
-		}
-		fullOffload = 4 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
-		partialOffload = fullOffload
 	}

 	return
@@ -761,10 +686,6 @@ func (f GGML) SupportsFlashAttention() bool {
 		return false
 	}

-	if f.KV().Architecture() == "gptoss" {
-		return false
-	}
-
 	// Check head counts match and are non-zero
 	headCountK := f.KV().EmbeddingHeadCountK()
 	headCountV := f.KV().EmbeddingHeadCountV()
--- a/fs/ggml/ggml_test.go
+++ b/fs/ggml/ggml_test.go
@@ -269,33 +269,3 @@ func TestKeyValue(t *testing.T) {
 		t.Errorf("unexpected uint8s (-got +want):\n%s", diff)
 	}
 }
-
-func TestHeadCount(t *testing.T) {
-	valuesArray := []int32{1, 5, 3, 4}
-	cases := []struct {
-		kv   KV
-		want uint64
-	}{
-		{
-			kv: KV{
-				"general.architecture":     "abc",
-				"abc.attention.head_count": &array[int32]{values: valuesArray, size: len(valuesArray)},
-			},
-			want: uint64(5),
-		},
-		{
-			kv: KV{
-				"general.architecture":     "abc",
-				"abc.attention.head_count": uint32(3),
-			},
-			want: uint64(3),
-		},
-	}
-
-	for _, tt := range cases {
-		got := tt.kv.HeadCountMax()
-		if got != tt.want {
-			t.Errorf("unexpected max value: got=%d want=%d", got, tt.want)
-		}
-	}
-}
--- a/fs/ggml/gguf.go
+++ b/fs/ggml/gguf.go
@@ -609,10 +609,6 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
 		err = writeGGUFArray(ws, ggufTypeString, v)
 	case *array[string]:
 		err = writeGGUFArray(ws, ggufTypeString, v.values)
-	case []bool:
-		err = writeGGUFArray(ws, ggufTypeBool, v)
-	case *array[bool]:
-		err = writeGGUFArray(ws, ggufTypeBool, v.values)
 	default:
 		return fmt.Errorf("improper type for '%s'", k)
 	}
--- a/fs/ggml/type.go
+++ b/fs/ggml/type.go
@@ -14,9 +14,9 @@ const (
 	FileTypeF16
 	fileTypeQ4_0
 	fileTypeQ4_1
-	fileTypeMXFP4 // originally fileTypeQ4_1_F16 // unused by GGML
-	fileTypeQ4_2  // unused by GGML
-	fileTypeQ4_3  // unused by GGML
+	fileTypeQ4_1_F16 // unused by GGML
+	fileTypeQ4_2     // unused by GGML
+	fileTypeQ4_3     // unused by GGML
 	FileTypeQ8_0
 	fileTypeQ5_0
 	fileTypeQ5_1
@@ -97,8 +97,6 @@ func (t FileType) String() string {
 		return "Q4_0"
 	case fileTypeQ4_1:
 		return "Q4_1"
-	case fileTypeMXFP4:
-		return "MXFP4"
 	case FileTypeQ8_0:
 		return "Q8_0"
 	case fileTypeQ5_0:
@@ -146,8 +144,6 @@ func (ftype FileType) ToTensorType() TensorType {
 		return TensorTypeQ4_0
 	case fileTypeQ4_1:
 		return TensorTypeQ4_1
-	case fileTypeMXFP4:
-		return TensorTypeMXFP4 // Formerly unused tensorTypeQ4_2
 	case FileTypeQ8_0:
 		return TensorTypeQ8_0
 	case fileTypeQ5_0:
@@ -191,8 +187,8 @@ const (
 	TensorTypeF16
 	TensorTypeQ4_0
 	TensorTypeQ4_1
-	TensorTypeMXFP4 // Formerly unused tensorTypeQ4_2
-	tensorTypeQ4_3  // unused by GGML
+	tensorTypeQ4_2 // unused by GGML
+	tensorTypeQ4_3 // unused by GGML
 	TensorTypeQ5_0
 	TensorTypeQ5_1
 	TensorTypeQ8_0
@@ -264,8 +260,6 @@ func ParseTensorType(s string) (TensorType, error) {
 		return TensorTypeF64, nil
 	case "BF16":
 		return TensorTypeBF16, nil
-	case "MXFP4":
-		return TensorTypeMXFP4, nil
 	default:
 		return 0, fmt.Errorf("unsupported quantization type %s", s)
 	}
@@ -318,8 +312,6 @@ func (t TensorType) String() string {
 		return "F64"
 	case TensorTypeBF16:
 		return "BF16"
-	case TensorTypeMXFP4:
-		return "MXFP4"
 	default:
 		return "unknown"
 	}
--- a/fs/gguf/gguf.go
+++ b/fs/gguf/gguf.go
@@ -65,7 +65,7 @@ func Open(path string) (f *File, err error) {
 		return nil, err
 	}

-	if f.Version < 2 {
+	if f.Version != 3 {
 		return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version)
 	}

--- a/go.mod
+++ b/go.mod
@@ -25,7 +25,6 @@ require (
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
 	golang.org/x/image v0.22.0
 	golang.org/x/tools v0.30.0
-	gonum.org/v1/gonum v0.15.0
 )

 require (
@@ -45,6 +44,7 @@ require (
 	github.com/xtgo/set v1.0.0 // indirect
 	go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
+	gonum.org/v1/gonum v0.15.0 // indirect
 	gorgonia.org/vecf32 v0.9.0 // indirect
 	gorgonia.org/vecf64 v0.9.0 // indirect
 )
@@ -71,7 +71,7 @@ require (
 	github.com/ugorji/go/codec v1.2.12 // indirect
 	golang.org/x/arch v0.8.0 // indirect
 	golang.org/x/crypto v0.36.0
-	golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect
+	golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
 	golang.org/x/net v0.38.0 // indirect
 	golang.org/x/sys v0.31.0
 	golang.org/x/term v0.30.0
--- a/integration/library_models_test.go
+++ b/integration/library_models_test.go
@@ -1,57 +0,0 @@
-//go:build integration && library
-
-package integration
-
-import (
-	"context"
-	"log/slog"
-	"testing"
-	"time"
-
-	"github.com/ollama/ollama/api"
-)
-
-// First run of this scenario on a target system will take a long time to download
-// ~1.5TB of models.  Set a sufficiently large -timeout for your network speed
-func TestLibraryModelsGenerate(t *testing.T) {
-	softTimeout, hardTimeout := getTimeouts(t)
-	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
-	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-
-	chatModels := libraryChatModels
-	for _, model := range chatModels {
-		t.Run(model, func(t *testing.T) {
-			if time.Now().Sub(started) > softTimeout {
-				t.Skip("skipping remaining tests to avoid excessive runtime")
-			}
-			if err := PullIfMissing(ctx, client, model); err != nil {
-				t.Fatalf("pull failed %s", err)
-			}
-			req := api.GenerateRequest{
-				Model:     model,
-				Prompt:    "why is the sky blue?",
-				KeepAlive: &api.Duration{Duration: 10 * time.Second},
-				Options: map[string]interface{}{
-					"temperature": 0.1,
-					"seed":        123,
-				},
-			}
-			anyResp := []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength"}
-			// Special cases
-			if model == "duckdb-nsql" {
-				anyResp = []string{"select", "from"}
-			} else if model == "granite3-guardian" || model == "shieldgemma" || model == "llama-guard3" || model == "bespoke-minicheck" {
-				anyResp = []string{"yes", "no", "safe", "unsafe"}
-			} else if model == "openthinker" || model == "nexusraven" {
-				anyResp = []string{"plugin", "im_sep", "components", "function call"}
-			} else if model == "starcoder" || model == "starcoder2" || model == "magicoder" || model == "deepseek-coder" {
-				req.Prompt = "def fibonacci():"
-				anyResp = []string{"f(n)", "sequence", "n-1", "main()", "__main__", "while"}
-			}
-			DoGenerate(ctx, t, client, req, anyResp, 120*time.Second, 30*time.Second)
-		})
-	}
-}
--- a/integration/model_arch_test.go
+++ b/integration/model_arch_test.go
@@ -19,6 +19,35 @@ import (
 	"github.com/ollama/ollama/format"
 )

+var (
+	started    = time.Now()
+	chatModels = []string{
+		"granite3-moe:latest",
+		"granite-code:latest",
+		"nemotron-mini:latest",
+		"command-r:latest",
+		"gemma2:latest",
+		"gemma:latest",
+		"internlm2:latest",
+		"phi3.5:latest",
+		"phi3:latest",
+		// "phi:latest", // flaky, sometimes generates no response on first query
+		"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
+		"falcon:latest",
+		"falcon2:latest",
+		"minicpm-v:latest",
+		"mistral:latest",
+		"orca-mini:latest",
+		"llama2:latest",
+		"llama3.1:latest",
+		"llama3.2:latest",
+		"llama3.2-vision:latest",
+		"qwen2.5-coder:latest",
+		"qwen:latest",
+		"solar-pro:latest",
+	}
+)
+
 func TestModelsGenerate(t *testing.T) {
 	softTimeout, hardTimeout := getTimeouts(t)
 	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
@@ -39,13 +68,6 @@ func TestModelsGenerate(t *testing.T) {
 		slog.Warn("No VRAM info available, testing all models, so larger ones might timeout...")
 	}

-	var chatModels []string
-	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
-		chatModels = ollamaEngineChatModels
-	} else {
-		chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
-	}
-
 	for _, model := range chatModels {
 		t.Run(model, func(t *testing.T) {
 			if time.Now().Sub(started) > softTimeout {
--- a/integration/model_perf_test.go
+++ b/integration/model_perf_test.go
@@ -1,266 +0,0 @@
-//go:build integration && perf
-
-package integration
-
-import (
-	"context"
-	"fmt"
-	"io/ioutil"
-	"log/slog"
-	"math"
-	"os"
-	"path/filepath"
-	"strconv"
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/format"
-)
-
-var (
-	// Models that don't work reliably with the large context prompt in this test case
-	longContextFlakes = []string{
-		"granite-code:latest",
-		"nemotron-mini:latest",
-		"falcon:latest",  // 2k model
-		"falcon2:latest", // 2k model
-		"minicpm-v:latest",
-		"qwen:latest",
-		"solar-pro:latest",
-	}
-)
-
-// Note: this test case can take a long time to run, particularly on models with
-// large contexts.  Run with -timeout set to a large value to get reasonable coverage
-// Example usage:
-//
-// go test --tags=integration,perf -count 1 ./integration -v -timeout 90m -run TestModelsPerf 2>&1 | tee int.log
-// cat int.log | grep MODEL_PERF_HEADER | head -1| cut -f2- -d: > perf.csv
-// cat int.log | grep MODEL_PERF_DATA | cut -f2- -d: >> perf.csv
-func TestModelsPerf(t *testing.T) {
-	softTimeout, hardTimeout := getTimeouts(t)
-	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
-	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-
-	// TODO use info API eventually
-	var maxVram uint64
-	var err error
-	if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
-		maxVram, err = strconv.ParseUint(s, 10, 64)
-		if err != nil {
-			t.Fatalf("invalid  OLLAMA_MAX_VRAM %v", err)
-		}
-	} else {
-		slog.Warn("No VRAM info available, testing all models, so larger ones might timeout...")
-	}
-
-	data, err := ioutil.ReadFile(filepath.Join("testdata", "shakespeare.txt"))
-	if err != nil {
-		t.Fatalf("failed to open test data file: %s", err)
-	}
-	longPrompt := "summarize the following: " + string(data)
-
-	var chatModels []string
-	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
-		chatModels = ollamaEngineChatModels
-	} else {
-		chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
-	}
-
-	for _, model := range chatModels {
-		t.Run(model, func(t *testing.T) {
-			if time.Now().Sub(started) > softTimeout {
-				t.Skip("skipping remaining tests to avoid excessive runtime")
-			}
-			if err := PullIfMissing(ctx, client, model); err != nil {
-				t.Fatalf("pull failed %s", err)
-			}
-			var maxContext int
-
-			resp, err := client.Show(ctx, &api.ShowRequest{Model: model})
-			if err != nil {
-				t.Fatalf("show failed: %s", err)
-			}
-			arch := resp.ModelInfo["general.architecture"].(string)
-			maxContext = int(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))
-
-			if maxVram > 0 {
-				resp, err := client.List(ctx)
-				if err != nil {
-					t.Fatalf("list models failed %v", err)
-				}
-				for _, m := range resp.Models {
-					// For these tests we want to exercise a some amount of overflow on the CPU
-					if m.Name == model && float32(m.Size)*0.75 > float32(maxVram) {
-						t.Skipf("model %s is too large %s for available VRAM %s", model, format.HumanBytes(m.Size), format.HumanBytes(int64(maxVram)))
-					}
-				}
-			}
-			slog.Info("scneario", "model", model, "max_context", maxContext)
-			loaded := false
-			defer func() {
-				// best effort unload once we're done with the model
-				if loaded {
-					client.Generate(ctx, &api.GenerateRequest{Model: model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
-				}
-			}()
-
-			// Some models don't handle the long context data well so skip them to avoid flaky test results
-			longContextFlake := false
-			for _, flake := range longContextFlakes {
-				if model == flake {
-					longContextFlake = true
-					break
-				}
-			}
-
-			// iterate through a few context sizes for coverage without excessive runtime
-			var contexts []int
-			keepGoing := true
-			if maxContext > 16384 {
-				contexts = []int{4096, 8192, 16384, maxContext}
-			} else if maxContext > 8192 {
-				contexts = []int{4096, 8192, maxContext}
-			} else if maxContext > 4096 {
-				contexts = []int{4096, maxContext}
-			} else if maxContext > 0 {
-				contexts = []int{maxContext}
-			} else {
-				t.Fatal("unknown max context size")
-			}
-			for _, numCtx := range contexts {
-				if !keepGoing && numCtx > 8192 { // Always try up to 8k before bailing out
-					break
-				}
-				skipLongPrompt := false
-
-				// Workaround bug 11172 temporarily...
-				maxPrompt := longPrompt
-				// If we fill the context too full with the prompt, many models
-				// quickly hit context shifting and go bad.
-				if len(maxPrompt) > numCtx*2 { // typically yields ~1/2 full context
-					maxPrompt = maxPrompt[:numCtx*2]
-				}
-
-				testCases := []struct {
-					prompt  string
-					anyResp []string
-				}{
-					{"why is the sky blue?", []string{"rayleigh", "scattering", "atmosphere", "nitrogen", "oxygen"}},
-					{maxPrompt, []string{"shakespeare", "oppression", "sorrows", "gutenberg", "child", "license", "sonnet", "melancholy"}},
-				}
-				var gpuPercent int
-				for _, tc := range testCases {
-					if len(tc.prompt) > 100 && (longContextFlake || skipLongPrompt) {
-						slog.Info("skipping long prompt", "model", model, "num_ctx", numCtx, "gpu_percent", gpuPercent)
-						continue
-					}
-					req := api.GenerateRequest{
-						Model:     model,
-						Prompt:    tc.prompt,
-						KeepAlive: &api.Duration{Duration: 20 * time.Second}, // long enough to ensure a ps returns
-						Options: map[string]interface{}{
-							"temperature": 0,
-							"seed":        123,
-							"num_ctx":     numCtx,
-						},
-					}
-					atLeastOne := false
-					var resp api.GenerateResponse
-
-					stream := false
-					req.Stream = &stream
-
-					// Avoid potentially getting stuck indefinitely
-					limit := 5 * time.Minute
-					genCtx, cancel := context.WithDeadlineCause(
-						ctx,
-						time.Now().Add(limit),
-						fmt.Errorf("generate on model %s with ctx %d took longer than %v", model, numCtx, limit),
-					)
-					defer cancel()
-
-					err = client.Generate(genCtx, &req, func(rsp api.GenerateResponse) error {
-						resp = rsp
-						return nil
-					})
-					if err != nil {
-						// Avoid excessive test runs, but don't consider a failure with massive context
-						if numCtx > 16384 && strings.Contains(err.Error(), "took longer") {
-							slog.Warn("max context was taking too long, skipping", "error", err)
-							keepGoing = false
-							skipLongPrompt = true
-							continue
-						}
-						t.Fatalf("generate error: ctx:%d err:%s", numCtx, err)
-					}
-					loaded = true
-					for _, expResp := range tc.anyResp {
-						if strings.Contains(strings.ToLower(resp.Response), expResp) {
-							atLeastOne = true
-							break
-						}
-					}
-					if !atLeastOne {
-						t.Fatalf("response didn't contain expected values: ctx:%d  expected:%v response:%s ", numCtx, tc.anyResp, resp.Response)
-					}
-					models, err := client.ListRunning(ctx)
-					if err != nil {
-						slog.Warn("failed to list running models", "error", err)
-						continue
-					}
-					if len(models.Models) > 1 {
-						slog.Warn("multiple models loaded, may impact performance results", "loaded", models.Models)
-					}
-					for _, m := range models.Models {
-						if m.Name == model {
-							if m.SizeVRAM == 0 {
-								slog.Info("Model fully loaded into CPU")
-								gpuPercent = 0
-								keepGoing = false
-								skipLongPrompt = true
-							} else if m.SizeVRAM == m.Size {
-								slog.Info("Model fully loaded into GPU")
-								gpuPercent = 100
-							} else {
-								sizeCPU := m.Size - m.SizeVRAM
-								cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
-								gpuPercent = int(100 - cpuPercent)
-								slog.Info("Model split between CPU/GPU", "CPU", cpuPercent, "GPU", gpuPercent)
-								keepGoing = false
-
-								// Heuristic to avoid excessive test run time
-								if gpuPercent < 90 {
-									skipLongPrompt = true
-								}
-							}
-						}
-					}
-					fmt.Fprintf(os.Stderr, "MODEL_PERF_HEADER:%s,%s,%s,%s,%s,%s,%s\n",
-						"MODEL",
-						"CONTEXT",
-						"GPU PERCENT",
-						"PROMPT COUNT",
-						"LOAD TIME",
-						"PROMPT EVAL TPS",
-						"EVAL TPS",
-					)
-					fmt.Fprintf(os.Stderr, "MODEL_PERF_DATA:%s,%d,%d,%d,%0.2f,%0.2f,%0.2f\n",
-						model,
-						numCtx,
-						gpuPercent,
-						resp.PromptEvalCount,
-						float64(resp.LoadDuration)/1000000000.0,
-						float64(resp.PromptEvalCount)/(float64(resp.PromptEvalDuration)/1000000000.0),
-						float64(resp.EvalCount)/(float64(resp.EvalDuration)/1000000000.0),
-					)
-				}
-			}
-		})
-	}
-}
--- a/integration/testdata/shakespeare.txt
+++ b/integration/testdata/shakespeare.txt
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -32,229 +32,6 @@ const (
 	smol = "llama3.2:1b"
 )

-var (
-	started = time.Now()
-
-	// Note: add newer models at the top of the list to test them first
-	ollamaEngineChatModels = []string{
-		"gemma3n:e2b",
-		"mistral-small3.2:latest",
-		"deepseek-r1:1.5b",
-		"llama3.2-vision:latest",
-		"qwen2.5-coder:latest",
-		"qwen2.5vl:3b",
-		"qwen3:0.6b", // dense
-		"qwen3:30b",  // MOE
-		"gemma3:1b",
-		"llama3.1:latest",
-		"llama3.2:latest",
-		"gemma2:latest",
-		"minicpm-v:latest",    // arch=qwen2
-		"granite-code:latest", // arch=llama
-	}
-	llamaRunnerChatModels = []string{
-		"mistral:latest",
-		"falcon3:latest",
-		"granite3-moe:latest",
-		"command-r:latest",
-		"nemotron-mini:latest",
-		"phi3.5:latest",
-		"solar-pro:latest",
-		"internlm2:latest",
-		"codellama:latest", // arch=llama
-		"phi3:latest",
-		"falcon2:latest",
-		"gemma:latest",
-		"llama2:latest",
-		"nous-hermes:latest",
-		"orca-mini:latest",
-		"qwen:latest",
-		"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
-		"falcon:latest",
-	}
-
-	// Some library models are quite large - ensure large VRAM and sufficient disk space
-	// before running scenarios based on this set
-	libraryChatModels = []string{
-		"alfred",
-		"athene-v2",
-		"aya-expanse",
-		"aya",
-		"bakllava",
-		"bespoke-minicheck",
-		"codebooga",
-		"codegeex4",
-		"codegemma",
-		"codellama",
-		"codeqwen",
-		"codestral",
-		"codeup",
-		"cogito",
-		"command-a",
-		"command-r-plus",
-		"command-r",
-		"command-r7b-arabic",
-		"command-r7b",
-		"dbrx",
-		"deepcoder",
-		"deepscaler",
-		"deepseek-coder-v2",
-		"deepseek-coder",
-		"deepseek-llm",
-		"deepseek-r1",
-		// "deepseek-v2.5", // requires 155 GB VRAM
-		"deepseek-v2",
-		// "deepseek-v3", // requires 482 GB VRAM
-		"devstral",
-		"dolphin-llama3",
-		"dolphin-mistral",
-		"dolphin-mixtral",
-		"dolphin-phi",
-		"dolphin3",
-		"dolphincoder",
-		"duckdb-nsql",
-		"everythinglm",
-		"exaone-deep",
-		"exaone3.5",
-		"falcon",
-		"falcon2",
-		"falcon3",
-		"firefunction-v2",
-		"gemma",
-		"gemma2",
-		"gemma3",
-		"gemma3n",
-		"glm4",
-		"goliath",
-		"granite-code",
-		"granite3-dense",
-		"granite3-guardian",
-		"granite3-moe",
-		"granite3.1-dense",
-		"granite3.1-moe",
-		"granite3.2-vision",
-		"granite3.2",
-		"granite3.3",
-		"hermes3",
-		"internlm2",
-		"llama-guard3",
-		"llama-pro",
-		"llama2-chinese",
-		"llama2-uncensored",
-		"llama2",
-		"llama3-chatqa",
-		"llama3-gradient",
-		"llama3-groq-tool-use",
-		"llama3.1",
-		"llama3.2-vision",
-		"llama3.2",
-		"llama3.3",
-		"llama3",
-		"llama4",
-		"llava-llama3",
-		"llava-phi3",
-		"llava",
-		"magicoder",
-		"magistral",
-		"marco-o1",
-		"mathstral",
-		"meditron",
-		"medllama2",
-		"megadolphin",
-		"minicpm-v",
-		"mistral-large",
-		"mistral-nemo",
-		"mistral-openorca",
-		"mistral-small",
-		"mistral-small3.1",
-		"mistral-small3.2",
-		"mistral",
-		"mistrallite",
-		"mixtral",
-		"moondream",
-		"nemotron-mini",
-		"nemotron",
-		"neural-chat",
-		"nexusraven",
-		"notus",
-		"nous-hermes",
-		"nous-hermes2-mixtral",
-		"nous-hermes2",
-		"nuextract",
-		"olmo2",
-		"open-orca-platypus2",
-		"openchat",
-		"opencoder",
-		"openhermes",
-		"openthinker",
-		"orca-mini",
-		"orca2",
-		// "phi", // unreliable
-		"phi3.5",
-		"phi3",
-		"phi4-mini-reasoning",
-		"phi4-mini",
-		"phi4-reasoning",
-		"phi4",
-		"phind-codellama",
-		"qwen",
-		"qwen2-math",
-		"qwen2.5-coder",
-		"qwen2.5",
-		"qwen2.5vl",
-		"qwen2",
-		"qwen3:0.6b", // dense
-		"qwen3:30b",  // MOE
-		"qwq",
-		"r1-1776",
-		"reader-lm",
-		"reflection",
-		"sailor2",
-		"samantha-mistral",
-		"shieldgemma",
-		"smallthinker",
-		"smollm",
-		"smollm2",
-		"solar-pro",
-		"solar",
-		"sqlcoder",
-		"stable-beluga",
-		"stable-code",
-		"stablelm-zephyr",
-		"stablelm2",
-		"starcoder",
-		"starcoder2",
-		"starling-lm",
-		"tinydolphin",
-		"tinyllama",
-		"tulu3",
-		"vicuna",
-		"wizard-math",
-		"wizard-vicuna-uncensored",
-		"wizard-vicuna",
-		"wizardcoder",
-		"wizardlm-uncensored",
-		"wizardlm2",
-		"xwinlm",
-		"yarn-llama2",
-		"yarn-mistral",
-		"yi-coder",
-		"yi",
-		"zephyr",
-	}
-	libraryEmbedModels = []string{
-		"all-minilm",
-		"bge-large",
-		"bge-m3",
-		"granite-embedding",
-		"mxbai-embed-large",
-		"nomic-embed-text",
-		"paraphrase-multilingual",
-		"snowflake-arctic-embed",
-		"snowflake-arctic-embed2",
-	}
-)
-
 func Init() {
 	lifecycle.InitLogging()
 }
@@ -494,10 +271,6 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
 			t.Errorf("generate stalled.  Response so far:%s", buf.String())
 		}
 	case <-done:
-		if genErr != nil && strings.Contains(genErr.Error(), "model requires more system memory") {
-			slog.Warn("model is too large for the target test system", "model", genReq.Model, "error", genErr)
-			return
-		}
 		require.NoError(t, genErr, "failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
 		// Verify the response contains the expected data
 		response := buf.String()
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -19,22 +19,12 @@ type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, e
 // The tensors are of shape embed dim, kv heads, batch size
 // The mask is of shape history size, batch size
 type Causal struct {
-	DType ml.DType
-
-	// swaWindowSize is the number of tokens that will be included in the mask
-	// during attention operations. swaMemorySize is the number of tokens that
-	// will be retained in memory for partial prefix caching. Set to math.MaxInt32
-	// for unlimited or if sliding window attention is not being used.
-	swaWindowSize int32
-	swaMemorySize int32
-
-	chunkSize int32
+	DType      ml.DType
+	windowSize int32
+	chunkSize  int32

 	opts CausalOptions

-	// maxBatch is the largest batch that we might receive
-	maxBatch int
-
 	// config controls mostly backend-specific optimizations
 	config *ml.CacheConfig

@@ -95,41 +85,32 @@ type cellRange struct {

 func NewCausalCache(shift shiftFn) *Causal {
 	return &Causal{
-		shiftFn: shift,
-		ctxs:    make(map[int]ml.Context),
-		keys:    make(map[int]ml.Tensor),
-		values:  make(map[int]ml.Tensor),
+		windowSize: math.MaxInt32,
+		shiftFn:    shift,
+		ctxs:       make(map[int]ml.Context),
+		keys:       make(map[int]ml.Tensor),
+		values:     make(map[int]ml.Tensor),
 	}
 }

 func NewSWACache(windowSize int32, shift shiftFn) *Causal {
 	return &Causal{
-		swaWindowSize: windowSize,
-		shiftFn:       shift,
-		ctxs:          make(map[int]ml.Context),
-		keys:          make(map[int]ml.Tensor),
-		values:        make(map[int]ml.Tensor),
-	}
-}
-
-func NewSWAMemCache(windowSize int32, memorySize int32, shift shiftFn) *Causal {
-	return &Causal{
-		swaWindowSize: windowSize,
-		swaMemorySize: memorySize,
-		shiftFn:       shift,
-		ctxs:          make(map[int]ml.Context),
-		keys:          make(map[int]ml.Tensor),
-		values:        make(map[int]ml.Tensor),
+		windowSize: windowSize,
+		shiftFn:    shift,
+		ctxs:       make(map[int]ml.Context),
+		keys:       make(map[int]ml.Tensor),
+		values:     make(map[int]ml.Tensor),
 	}
 }

 func NewChunkedAttentionCache(chunkSize int32, shift shiftFn) *Causal {
 	return &Causal{
-		chunkSize: chunkSize,
-		shiftFn:   shift,
-		ctxs:      make(map[int]ml.Context),
-		keys:      make(map[int]ml.Tensor),
-		values:    make(map[int]ml.Tensor),
+		windowSize: math.MaxInt32,
+		chunkSize:  chunkSize,
+		shiftFn:    shift,
+		ctxs:       make(map[int]ml.Context),
+		keys:       make(map[int]ml.Tensor),
+		values:     make(map[int]ml.Tensor),
 	}
 }

@@ -154,25 +135,11 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
 		c.config.MaskDType = ml.DTypeF32
 	}

-	if c.swaWindowSize == 0 {
-		c.swaWindowSize = math.MaxInt32
-	}
-	if c.swaMemorySize == 0 {
-		c.swaMemorySize = c.swaWindowSize
-	}
-	if int(c.swaMemorySize) > capacity {
-		c.swaMemorySize = math.MaxInt32
-	}
-
-	if c.swaMemorySize < c.swaWindowSize {
-		panic(fmt.Errorf("sliding window memory (%v) must be at least as large as the window (%v)", c.swaMemorySize, c.swaWindowSize))
-	}
-
 	var cacheSize int
-	if c.swaMemorySize == math.MaxInt32 {
+	if c.windowSize == math.MaxInt32 || capacity < int(c.windowSize) {
 		cacheSize = maxSequences * capacity
 	} else {
-		cacheSize = (maxSequences * int(c.swaMemorySize)) + maxBatch
+		cacheSize = (maxSequences * int(c.windowSize)) + maxBatch
 	}
 	cacheSize = roundUp(cacheSize, c.config.CachePadding)
 	c.cells = make([]cacheCell, cacheSize)
@@ -180,7 +147,6 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
 	c.DType = dtype
 	c.cellRanges = make(map[int]cellRange)
 	c.backend = backend
-	c.maxBatch = maxBatch
 }

 func (c *Causal) SetConfig(config ml.CacheConfig) {
@@ -214,10 +180,10 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 			c.curLoc, err = c.findStartLoc()
 		}
 		if err != nil {
-			slog.Warn("unable to find a kv cache slot", "cache", c)
 			return err
 		}

+		c.curCellRange = newRange()
 		for i, pos := range batch.Positions {
 			seq := batch.Sequences[i]

@@ -228,12 +194,19 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 				seqRange = newRange()
 			}

-			seqRange.min = min(seqRange.min, c.curLoc+i)
-			c.curCellRange.min = min(c.curCellRange.min, c.curLoc+i)
-
-			seqRange.max = max(seqRange.max, c.curLoc+i)
-			c.curCellRange.max = max(c.curCellRange.max, c.curLoc+i)
+			if c.curLoc+i > seqRange.max {
+				seqRange.max = c.curLoc + i
+			}
+			if seqRange.max > c.curCellRange.max {
+				c.curCellRange.max = seqRange.max
+			}

+			if c.curLoc+i < seqRange.min {
+				seqRange.min = c.curLoc + i
+			}
+			if seqRange.min < c.curCellRange.min {
+				c.curCellRange.min = seqRange.min
+			}
 			c.cellRanges[seq] = seqRange
 		}
 	} else {
@@ -275,16 +248,7 @@ func (c *Causal) findStartLoc() (int, error) {
 }

 func (c *Causal) updateSlidingWindow() {
-	c.curCellRange = newRange()
-
-	if c.swaMemorySize == math.MaxInt32 {
-		for _, seq := range c.curSequences {
-			if seqRange, ok := c.cellRanges[seq]; ok {
-				c.curCellRange.min = min(c.curCellRange.min, seqRange.min)
-				c.curCellRange.max = max(c.curCellRange.max, seqRange.max)
-			}
-		}
-
+	if c.windowSize == math.MaxInt32 {
 		return
 	}

@@ -314,16 +278,12 @@ func (c *Causal) updateSlidingWindow() {

 		for i := oldRange.min; i <= oldRange.max; i++ {
 			if slices.Contains(c.cells[i].sequences, seq) {
-				if c.cells[i].pos < pos-c.swaMemorySize {
+				if c.cells[i].pos < pos-c.windowSize {
 					c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq })
 				} else {
 					newRange.min = min(newRange.min, i)
 					newRange.max = max(newRange.max, i)
 				}
-				if c.cells[i].pos >= pos-c.swaWindowSize {
-					c.curCellRange.min = min(c.curCellRange.min, i)
-					c.curCellRange.max = max(c.curCellRange.max, i)
-				}
 			}
 		}

@@ -363,7 +323,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 			if !slices.Contains(c.cells[j].sequences, c.curSequences[i]) ||
 				(enabled && c.cells[j].pos > c.curPositions[i]) ||
 				c.chunkSize > 0 && c.cells[j].pos < c.curPositions[i]-c.curPositions[i]%c.chunkSize ||
-				c.cells[j].pos < c.curPositions[i]-c.swaWindowSize {
+				c.cells[j].pos < c.curPositions[i]-c.windowSize {
 				mask[i*length+(j-c.curCellRange.min)] = float32(math.Inf(-1))
 			}
 		}
@@ -521,8 +481,6 @@ func (c *Causal) defrag() {

 		c.cellRanges[seq] = seqRange
 	}
-
-	c.updateSlidingWindow()
 }

 func (c *Causal) SetLayer(layer int) {
@@ -648,7 +606,7 @@ func (c *Causal) CopyPrefix(srcSeq, dstSeq int, len int32) {
 }

 func (c *Causal) CanResume(seq int, pos int32) bool {
-	if c.swaMemorySize == math.MaxInt32 {
+	if c.windowSize == math.MaxInt32 {
 		return true
 	}

@@ -670,8 +628,8 @@ func (c *Causal) CanResume(seq int, pos int32) bool {
 		return false
 	}

-	lastWindowStart := max(0, last-c.swaMemorySize)
-	posWindowStart := max(0, pos-c.swaWindowSize)
+	lastWindowStart := max(0, last-c.windowSize)
+	posWindowStart := max(0, pos-c.windowSize)

 	return posWindowStart >= lastWindowStart
 }
@@ -681,64 +639,48 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
 		return ErrNotSupported
 	}

+	ctx := c.backend.NewContext()
+	defer ctx.Close()
+
 	seqRange := c.cellRanges[seq]
+	size := seqRange.max - seqRange.min + 1

-	for start := seqRange.min; start <= seqRange.max; start += c.maxBatch {
-		size := min(seqRange.max-start+1, c.maxBatch)
-		offsets := make([]int32, size)
+	offsets := make([]int32, size)
+	for i := range offsets {
+		cell := c.cells[seqRange.min+i]

-		var batchFirst, batchLast int
-
-		batchFirst = -1
-		for i := range offsets {
-			cell := c.cells[start+i]
-
-			if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex {
-				offsets[i] = offset
-				if batchFirst < 0 {
-					batchFirst = i
-				}
-				batchLast = i
-			}
+		if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex {
+			offsets[i] = offset
 		}
+	}

-		if batchFirst < 0 {
+	kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
+
+	for i, key := range c.keys {
+		if key == nil {
 			continue
 		}

-		offsets = offsets[batchFirst : batchLast+1]
+		kHeadDim := key.Dim(0)
+		numKVHeads := key.Dim(1)
+		rowSize := key.Stride(2)

-		ctx := c.backend.NewContext()
-		kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
+		key = key.View(ctx, rowSize*seqRange.min,
+			kHeadDim, key.Stride(1),
+			numKVHeads, key.Stride(2),
+			size,
+		)

-		for i, key := range c.keys {
-			if key == nil {
-				continue
-			}
-
-			kHeadDim := key.Dim(0)
-			numKVHeads := key.Dim(1)
-			rowSize := key.Stride(2)
-
-			key = key.View(ctx, rowSize*(start+batchFirst),
-				kHeadDim, key.Stride(1),
-				numKVHeads, key.Stride(2),
-				len(offsets),
-			)
-
-			roped, err := c.shiftFn(ctx, i, key, kShift)
-			if err != nil {
-				ctx.Close()
-				return err
-			}
-
-			ctx.Forward(roped.Copy(ctx, key))
+		roped, err := c.shiftFn(ctx, i, key, kShift)
+		if err != nil {
+			return err
 		}

-		ctx.Compute()
-		ctx.Close()
+		ctx.Forward(roped.Copy(ctx, key))
 	}

+	ctx.Compute()
+
 	return nil
 }

--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -60,8 +60,6 @@ func TestSWA(t *testing.T) {

 	cache.Init(backend, ml.DTypeF16, 1, 16, 16)

-	x := float32(math.Inf(-1))
-
 	tests := []testCase{
 		{
 			name:          "FirstBatch",
@@ -71,12 +69,7 @@ func TestSWA(t *testing.T) {
 			pos:           []int32{0, 1, 2, 3},
 			expected:      []float32{1, 2, 3, 4},
 			expectedShape: []int{1, 1, 4},
-			expectedMask: []float32{
-				0, x, x, x,
-				0, 0, x, x,
-				x, 0, 0, x,
-				x, x, 0, 0,
-			},
+			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
 		},
 		{
 			name:          "SecondBatch",
@@ -86,53 +79,7 @@ func TestSWA(t *testing.T) {
 			pos:           []int32{4, 5},
 			expected:      []float32{5, 6, 3, 4},
 			expectedShape: []int{1, 1, 4},
-			expectedMask: []float32{
-				0, x, x, 0,
-				0, 0, x, x,
-			},
-		},
-	}
-
-	testCache(t, backend, cache, tests)
-}
-
-func TestSWAMem(t *testing.T) {
-	backend := &testBackend{}
-	cache := NewSWAMemCache(1, 3, nil)
-	defer cache.Close()
-
-	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
-
-	x := float32(math.Inf(-1))
-
-	tests := []testCase{
-		{
-			name:          "FirstBatch",
-			in:            []float32{1, 2, 3, 4},
-			inShape:       []int{1, 1, 4},
-			seqs:          []int{0, 0, 0, 0},
-			pos:           []int32{0, 1, 2, 3},
-			expected:      []float32{1, 2, 3, 4},
-			expectedShape: []int{1, 1, 4},
-			expectedMask: []float32{
-				0, x, x, x,
-				0, 0, x, x,
-				x, 0, 0, x,
-				x, x, 0, 0,
-			},
-		},
-		{
-			name:          "SecondBatch",
-			in:            []float32{5, 6},
-			inShape:       []int{1, 1, 2},
-			seqs:          []int{0, 0},
-			pos:           []int32{4, 5},
-			expected:      []float32{4, 5, 6},
-			expectedShape: []int{1, 1, 3},
-			expectedMask: []float32{
-				0, 0, x,
-				x, 0, 0,
-			},
+			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1))},
 		},
 	}

@@ -490,70 +437,6 @@ func TestCanResume(t *testing.T) {
 	}
 }

-func TestCanResumeSWAMem(t *testing.T) {
-	backend := &testBackend{}
-	windowSize := int32(4)
-	memSize := int32(5)
-	cache := NewSWAMemCache(windowSize, memSize, nil)
-	defer cache.Close()
-
-	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
-
-	context := backend.NewContext()
-	defer context.Close()
-
-	err := cache.StartForward(context, input.Batch{
-		Positions: []int32{0, 1, 2, 3, 4, 5},
-		Sequences: []int{0, 0, 0, 0, 0, 0},
-	}, false)
-	if err != nil {
-		t.Fatalf("StartForward failed: %v", err)
-	}
-
-	cache.SetLayer(0)
-	tensor := context.FromFloatSlice([]float32{1, 2, 3, 4, 5, 6}, 1, 1, 6)
-	cache.Put(context, tensor, tensor)
-
-	// shift window by adding position 6
-	err = cache.StartForward(context, input.Batch{
-		Positions: []int32{6, 7},
-		Sequences: []int{0, 0},
-	}, false)
-	if err != nil {
-		t.Fatalf("StartForward failed: %v", err)
-	}
-
-	cache.SetLayer(0)
-	tensor = context.FromFloatSlice([]float32{7, 8}, 1, 1, 2)
-	cache.Put(context, tensor, tensor)
-
-	// only the latest position has overlapping windows
-	if cache.CanResume(0, 0) {
-		t.Errorf("after shift: CanResume(0, 0) = true, want false (outside window)")
-	}
-	if cache.CanResume(0, 1) {
-		t.Errorf("after shift: CanResume(0, 1) = true, want false (outside window)")
-	}
-	if cache.CanResume(0, 2) {
-		t.Errorf("after shift: CanResume(0, 2) = true, want false (outside window)")
-	}
-	if cache.CanResume(0, 3) {
-		t.Errorf("after shift: CanResume(0, 3) = true, want false (outside window)")
-	}
-	if cache.CanResume(0, 4) {
-		t.Errorf("after shift: CanResume(0, 4) = true, want false (outside window)")
-	}
-	if cache.CanResume(0, 5) {
-		t.Errorf("after shift: CanResume(0, 5) = true, want false (outside window)")
-	}
-	if !cache.CanResume(0, 6) {
-		t.Errorf("after shift: CanResume(0, 6) = false, want true (inside window)")
-	}
-	if !cache.CanResume(0, 7) {
-		t.Errorf("after shift: CanResume(0, 7) = false, want true (latest position)")
-	}
-}
-
 type testBackend struct {
 	ml.Backend
 }
--- a/llama/patches/0005-solar-pro.patch
+++ b/llama/patches/0005-solar-pro.patch
@@ -150,7 +150,7 @@ index 4cce5166..7f6617fa 100644
 llama_model_loader::llama_model_loader(
         const std::string & fname,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 3a4e72a3..db62973f 100644
+index 3a4e72a3..831b68c0 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
@@ -1402,6 +1402,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
--- a/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch
+++ b/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch
@@ -22,10 +22,10 @@ multiple batches of processing until everything is complete.
 4 files changed, 59 insertions(+), 79 deletions(-)

 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index dca22d8b..1f3a3956 100644
+index c22687e4..c5948e8f 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -947,9 +947,12 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
 
         // find KV slot
         if (!kv_self->find_slot(ubatch)) {
@@ -41,7 +41,7 @@ index dca22d8b..1f3a3956 100644
         }
 
         ggml_backend_sched_reset(sched.get());
-@@ -1965,9 +1968,12 @@ void llama_context::opt_epoch_iter(
+@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
 
             // TODO: not sure if this is needed
             if (!kv_self->find_slot(ubatch)) {
--- a/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch
+++ b/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch
@@ -10,10 +10,10 @@ Subject: [PATCH] add argsort and cuda copy for i32
 3 files changed, 192 insertions(+), 2 deletions(-)

 diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
-index 955fec59..654e2f28 100644
+index becdae07..7a44b6cf 100644
 --- a/ggml/src/ggml-cpu/ops.cpp
 +++ b/ggml/src/ggml-cpu/ops.cpp
-@@ -6822,6 +6822,45 @@ static void ggml_compute_forward_argsort_f32(
+@@ -6890,6 +6890,45 @@ static void ggml_compute_forward_argsort_f32(
     }
 }
 
@@ -59,7 +59,7 @@ index 955fec59..654e2f28 100644
 void ggml_compute_forward_argsort(
     const ggml_compute_params * params,
     ggml_tensor * dst) {
-@@ -6833,6 +6872,10 @@ void ggml_compute_forward_argsort(
+@@ -6901,6 +6940,10 @@ void ggml_compute_forward_argsort(
             {
                 ggml_compute_forward_argsort_f32(params, dst);
             } break;
@@ -195,7 +195,7 @@ index 607ded85..53b02634 100644
 +    }
 }
 diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
-index d027271f..4abd01d7 100644
+index 2d46176e..47383486 100644
 --- a/ggml/src/ggml-cuda/cpy.cu
 +++ b/ggml/src/ggml-cuda/cpy.cu
@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
@@ -257,7 +257,7 @@ index d027271f..4abd01d7 100644
 static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
     const float * xi = (const float *) cxi;
     block_q8_0 * dsti = (block_q8_0 *) cdsti;
-@@ -633,6 +678,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
+@@ -631,6 +676,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
@@ -266,7 +266,7 @@ index d027271f..4abd01d7 100644
     } else {
         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
-@@ -688,6 +735,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
+@@ -686,6 +733,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         return (void*) cpy_f32_f16<cpy_1_f32_f16>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
         return (void*) cpy_f32_f16<cpy_1_f16_f32>;
--- a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
+++ b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
@@ -7,31 +7,31 @@ This enables matching up devices and information reported by the backend
 with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
 ---
 ggml/include/ggml-backend.h      |  1 +
- ggml/src/ggml-cuda/ggml-cuda.cu  | 39 ++++++++++++++++++++++++++++++++
+ ggml/src/ggml-cuda/ggml-cuda.cu  | 33 ++++++++++++++++++++++++++++++++
 ggml/src/ggml-metal/ggml-metal.m |  1 +
- 3 files changed, 41 insertions(+)
+ 3 files changed, 35 insertions(+)

 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 74e46716..48839339 100644
+index 74e46716..a880df33 100644
 --- a/ggml/include/ggml-backend.h
 +++ b/ggml/include/ggml-backend.h
@@ -152,6 +152,7 @@ extern "C" {
     struct ggml_backend_dev_props {
         const char * name;
         const char * description;
-+        const char * id;
++        const char * uuid;
         size_t memory_free;
         size_t memory_total;
         enum ggml_backend_dev_type type;
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index cb0d8528..d6960174 100644
+index cb0d8528..4c829153 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
     int device;
     std::string name;
     std::string description;
-+    std::string id;
++    std::string uuid;
 };
 
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -39,9 +39,9 @@ index cb0d8528..d6960174 100644
     return ctx->description.c_str();
 }
 
-+static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
++static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
 +    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-+    return ctx->id.c_str();
++    return ctx->uuid.c_str();
 +}
 +
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
@@ -51,17 +51,17 @@ index cb0d8528..d6960174 100644
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
     props->name        = ggml_backend_cuda_device_get_name(dev);
     props->description = ggml_backend_cuda_device_get_description(dev);
-+    props->id          = ggml_backend_cuda_device_get_id(dev);
++    props->uuid        = ggml_backend_cuda_device_get_uuid(dev);
     props->type        = ggml_backend_cuda_device_get_type(dev);
     ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
 
-@@ -3458,6 +3465,38 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
                 dev_ctx->description = prop.name;
 
 +                #if !defined(GGML_USE_HIP)
-+                char id[64];
-+                snprintf(id, sizeof(id),
++                char uuid[64];
++                snprintf(uuid, sizeof(uuid),
 +                    "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
 +                    (unsigned char)prop.uuid.bytes[0],
 +                    (unsigned char)prop.uuid.bytes[1],
@@ -80,29 +80,23 @@ index cb0d8528..d6960174 100644
 +                    (unsigned char)prop.uuid.bytes[14],
 +                    (unsigned char)prop.uuid.bytes[15]
 +                  );
-+                dev_ctx->id = id;
++                dev_ctx->uuid = uuid;
 +                #else
-+                #ifdef _WIN32
-+                char id[16];
-+                snprintf(id, sizeof(id), "%d", i);
-+                dev_ctx->id = id;
-+                #else
-+                dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
-+                #endif
++                dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
 +                #endif
 +
                 ggml_backend_dev_t dev = new ggml_backend_device {
                     /* .iface   = */ ggml_backend_cuda_device_interface,
                     /* .reg     = */ &reg,
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 1b56f858..a9eeebc6 100644
+index 1b56f858..ee4f2dcb 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
 static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
     props->name        = ggml_backend_metal_device_get_name(dev);
     props->description = ggml_backend_metal_device_get_description(dev);
-+    props->id          = "0";
++    props->uuid        = "0";
     props->type        = ggml_backend_metal_device_get_type(dev);
     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = (struct ggml_backend_dev_caps) {
--- a/llama/patches/0018-temporary-prevent-rocm-cuda-mixed-loading.patch
+++ b/llama/patches/0018-temporary-prevent-rocm-cuda-mixed-loading.patch
@@ -1,32 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Daniel Hiltgen <daniel@ollama.com>
-Date: Sun, 22 Jun 2025 09:22:05 -0700
-Subject: [PATCH] temporary prevent rocm+cuda mixed loading
-
----
- ggml/src/ggml-backend-reg.cpp | 12 ++++++++++--
- 1 file changed, 10 insertions(+), 2 deletions(-)
-
-diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index 4e67d243..8f49f084 100644
---- a/ggml/src/ggml-backend-reg.cpp
-+++ b/ggml/src/ggml-backend-reg.cpp
-@@ -573,8 +573,16 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
- 
-     ggml_backend_load_best("blas", silent, dir_path);
-     ggml_backend_load_best("cann", silent, dir_path);
--    ggml_backend_load_best("cuda", silent, dir_path);
--    ggml_backend_load_best("hip", silent, dir_path);
-+
-+    // Avoid mixed hip+cuda configurations
-+    const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
-+    const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES"); 
-+    if (!hip_devices && !rocr_devices) {
-+        ggml_backend_load_best("cuda", silent, dir_path);
-+    } else {
-+        ggml_backend_load_best("hip", silent, dir_path);
-+    }
-+    
-     ggml_backend_load_best("kompute", silent, dir_path);
-     ggml_backend_load_best("metal", silent, dir_path);
-     ggml_backend_load_best("rpc", silent, dir_path);
--- a/llama/patches/0019-metal-add-mean-kernel-14267.patch
+++ b/llama/patches/0019-metal-add-mean-kernel-14267.patch
@@ -1,169 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Georgi Gerganov <ggerganov@gmail.com>
-Date: Thu, 19 Jun 2025 08:05:21 +0300
-Subject: [PATCH] metal : add mean kernel (#14267)
-
-* metal : add mean kernel
-
-ggml-ci
-
-* cont : dedup implementation
-
-ggml-ci
----
- ggml/src/ggml-metal/ggml-metal.m     | 33 ++++++++++++++++---
- ggml/src/ggml-metal/ggml-metal.metal | 48 ++++++++++++++++++++++------
- 2 files changed, 67 insertions(+), 14 deletions(-)
-
-diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index a9eeebc6..110c9ece 100644
---- a/ggml/src/ggml-metal/ggml-metal.m
-+++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -489,6 +489,7 @@ enum ggml_metal_kernel_type {
-     GGML_METAL_KERNEL_TYPE_COS,
-     GGML_METAL_KERNEL_TYPE_NEG,
-     GGML_METAL_KERNEL_TYPE_SUM_ROWS,
-+    GGML_METAL_KERNEL_TYPE_MEAN,
-     GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
-     GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
-     GGML_METAL_KERNEL_TYPE_ARGMAX,
-@@ -1436,6 +1437,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,                             cos,                             true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG,                             neg,                             true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                        sum_rows,                        true);
-+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN,                            mean,                            true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX,                          argmax,                          true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,                 pool_2d_avg_f32,                 true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,                 pool_2d_max_f32,                 true);
-@@ -1634,6 +1636,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
-         case GGML_OP_LOG:
-             return false; // TODO: implement
-         case GGML_OP_SUM_ROWS:
-+        case GGML_OP_MEAN:
-         case GGML_OP_SOFT_MAX:
-         case GGML_OP_GROUP_NORM:
-             return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
-@@ -2362,11 +2365,30 @@ static bool ggml_metal_encode_node(
-                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-             } break;
-         case GGML_OP_SUM_ROWS:
-+        case GGML_OP_MEAN:
-             {
-                 GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
- 
--                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
-+                id<MTLComputePipelineState> pipeline = nil;
-+
-+                switch (dst->op) {
-+                    case GGML_OP_SUM_ROWS:
-+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
-+                        break;
-+                    case GGML_OP_MEAN:
-+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MEAN].pipeline;
-+                        break;
-+                    default:
-+                        GGML_ABORT("fatal error");
-+                }
-+
-+                int nth = 32; // SIMD width
-+
-+                while (nth < ne00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
-+                    nth *= 2;
-+                }
- 
-+                nth = MIN(nth, ne00);
- 
-                 ggml_metal_kargs_sum_rows args = {
-                    /*.ne00 =*/ ne00,
-@@ -2396,11 +2418,12 @@ static bool ggml_metal_encode_node(
-                 };
- 
-                 [encoder setComputePipelineState:pipeline];
--                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
--                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
--                [encoder setBytes:&args length:sizeof(args) atIndex:2];
-+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
-+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-+                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
- 
--                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-             } break;
-         case GGML_OP_SOFT_MAX:
-             {
-diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
-index 9cfddf45..08e8d807 100644
---- a/ggml/src/ggml-metal/ggml-metal.metal
-+++ b/ggml/src/ggml-metal/ggml-metal.metal
-@@ -956,31 +956,61 @@ kernel void kernel_neg(
-     dst[tpig] = -src0[tpig];
- }
- 
-+template <bool norm>
- kernel void kernel_sum_rows(
-+        constant ggml_metal_kargs_sum_rows & args,
-         device const float * src0,
-         device       float * dst,
--        constant ggml_metal_kargs_sum_rows & args,
--        uint3 tpig[[thread_position_in_grid]]) {
--    int64_t i3 = tpig.z;
--    int64_t i2 = tpig.y;
--    int64_t i1 = tpig.x;
-+        threadgroup  float * shmem_f32 [[threadgroup(0)]],
-+        uint3   tgpig[[threadgroup_position_in_grid]],
-+        ushort3 tpitg[[thread_position_in_threadgroup]],
-+        ushort  sgitg[[simdgroup_index_in_threadgroup]],
-+        ushort  tiisg[[thread_index_in_simdgroup]],
-+        ushort3   ntg[[threads_per_threadgroup]]) {
-+    int64_t i3 = tgpig.z;
-+    int64_t i2 = tgpig.y;
-+    int64_t i1 = tgpig.x;
- 
-     if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) {
-         return;
-     }
- 
-+    if (sgitg == 0) {
-+        shmem_f32[tiisg] = 0.0f;
-+    }
-+
-     device const float * src_row = (device const float *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
-     device       float * dst_row = (device       float *) ((device       char *) dst  + i1*args.nb1  + i2*args.nb2  + i3*args.nb3);
- 
--    float row_sum = 0;
-+    float sumf = 0;
- 
--    for (int64_t i0 = 0; i0 < args.ne00; i0++) {
--        row_sum += src_row[i0];
-+    for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
-+        sumf += src_row[i0];
-     }
- 
--    dst_row[0] = row_sum;
-+    sumf = simd_sum(sumf);
-+
-+    threadgroup_barrier(mem_flags::mem_threadgroup);
-+
-+    if (tiisg == 0) {
-+        shmem_f32[sgitg] = sumf;
-+    }
-+
-+    threadgroup_barrier(mem_flags::mem_threadgroup);
-+
-+    sumf = shmem_f32[tiisg];
-+    sumf = simd_sum(sumf);
-+
-+    if (tpitg.x == 0) {
-+        dst_row[0] = norm ? sumf / args.ne00 : sumf;
-+    }
- }
- 
-+typedef decltype(kernel_sum_rows<false>) kernel_sum_rows_t;
-+
-+template [[host_name("kernel_sum_rows")]] kernel kernel_sum_rows_t kernel_sum_rows<false>;
-+template [[host_name("kernel_mean")]]     kernel kernel_sum_rows_t kernel_sum_rows<true>;
-+
- template<typename T>
- kernel void kernel_soft_max(
-         device const  char * src0,
--- a/llama/patches/0020-CUDA-add-mean-operation-14313.patch
+++ b/llama/patches/0020-CUDA-add-mean-operation-14313.patch
--- a/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch
+++ b/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch
@@ -1,50 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Oliver Simons <osimons@nvidia.com>
-Date: Tue, 22 Jul 2025 11:02:28 +0200
-Subject: [PATCH] Enable CUDA Graphs for gemma3n.
-
-Similar to
-https://github.com/ggml-org/llama.cpp/pull/14741,
-though ollama has a slightly different model graph
-than llama.cpp which requires different workaround
-checks.
----
- ggml/src/ggml-cuda/ggml-cuda.cu | 16 ++++++++++++----
- 1 file changed, 12 insertions(+), 4 deletions(-)
-
-diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 2b9fabf4..28ccf4be 100644
---- a/ggml/src/ggml-cuda/ggml-cuda.cu
-+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
-     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
-     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
- 
-+    const std::string gemma3n_per_layer_proj_src1_name   = " (reshaped)";
-+    const std::string gemma3n_node_name                  = "node_";
-+
-     for (int i = 0; i < cgraph->n_nodes; i++) {
-         ggml_tensor * node = cgraph->nodes[i];
- 
-@@ -2495,12 +2498,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
- #endif
-         }
- 
--        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
--            // disable CUDA graphs for batch size > 1 for now.
--            // Changes in batch size or context size can cause changes to the grid size of some kernels.
-+        // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
-+        // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
-+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256
-+                                                                                    && node->ne[2] == 1
-+                                                                                    && node->ne[3] == 1
-+                                                                                    && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false
-+                                                                                    && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) {
-+            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
-             use_cuda_graph = false;
- #ifndef NDEBUG
--            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
-+            GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
- #endif
-         }
- 
--- a/llama/patches/0022-BF16-macos-version-guard.patch
+++ b/llama/patches/0022-BF16-macos-version-guard.patch
@@ -1,27 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Daniel Hiltgen <daniel@ollama.com>
-Date: Wed, 30 Jul 2025 08:43:46 -0700
-Subject: [PATCH] BF16 macos version guard
-
-Only enable BF16 on supported MacOS versions (v14+)
----
- ggml/src/ggml-metal/ggml-metal.m | 6 +++++-
- 1 file changed, 5 insertions(+), 1 deletion(-)
-
-diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 110c9ece..ab46f6e3 100644
---- a/ggml/src/ggml-metal/ggml-metal.m
-+++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -89,7 +89,11 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
-         ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
- 
- #if defined(GGML_METAL_USE_BF16)
--        ctx->use_bfloat = ctx->has_bfloat;
-+        if (@available(macOS 14.0, *)) {
-+            ctx->use_bfloat = ctx->has_bfloat;
-+        } else {
-+            ctx->use_bfloat = false;
-+        }
- #else
-         ctx->use_bfloat = false;
- #endif
--- a/llama/patches/0023-MXFP4.patch
+++ b/llama/patches/0023-MXFP4.patch
--- a/llama/patches/0024-cuda-disable-graph-compat-check-for-OP_ADD.patch
+++ b/llama/patches/0024-cuda-disable-graph-compat-check-for-OP_ADD.patch
@@ -1,34 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <git@mxy.ng>
-Date: Thu, 31 Jul 2025 12:31:58 -0700
-Subject: [PATCH] cuda: disable graph compat check for OP_ADD
-
----
- ggml/src/ggml-cuda/ggml-cuda.cu | 14 --------------
- 1 file changed, 14 deletions(-)
-
-diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index bb19b06e..080e7467 100644
---- a/ggml/src/ggml-cuda/ggml-cuda.cu
-+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2509,20 +2509,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
- #endif
-         }
- 
--        // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
--        // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
--        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256
--                                                                                    && node->ne[2] == 1
--                                                                                    && node->ne[3] == 1
--                                                                                    && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false
--                                                                                    && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) {
--            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
--            use_cuda_graph = false;
--#ifndef NDEBUG
--            GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
--#endif
--        }
--
-         if (node->op == GGML_OP_CPY) {
- 
-             // Store the pointers which are updated for each token, such that these can be sent
--- a/llama/patches/0025-Disable-ggml-blas-on-macos-v13-and-older.patch
+++ b/llama/patches/0025-Disable-ggml-blas-on-macos-v13-and-older.patch
@@ -1,25 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Daniel Hiltgen <daniel@ollama.com>
-Date: Sun, 3 Aug 2025 10:00:20 -0700
-Subject: [PATCH] Disable ggml-blas on macos v13 and older
-
----
- ggml/src/ggml-blas/ggml-blas.cpp | 5 +++++
- 1 file changed, 5 insertions(+)
-
-diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
-index ec158dfa..22926d75 100644
---- a/ggml/src/ggml-blas/ggml-blas.cpp
-+++ b/ggml/src/ggml-blas/ggml-blas.cpp
-@@ -505,6 +505,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
- };
- 
- ggml_backend_reg_t ggml_backend_blas_reg(void) {
-+    // MacOS prior to v14 does not include cblas_sgemm - disable this backend if it isn't available
-+    if (&cblas_sgemm == NULL) {
-+        GGML_LOG_INFO("Disabling ggml-blas backend on old MacOS version\n");
-+        return NULL;
-+    }
-     static struct ggml_backend_reg ggml_backend_blas_reg = {
-         /* .api_version = */ GGML_BACKEND_API_VERSION,
-         /* .iface       = */ ggml_backend_blas_reg_i,
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -151,12 +151,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}

 	if graphPartialOffload == 0 {
-		headsKV := f.KV().HeadCountKVMin()
-		if headsKV == 0 {
-			headsKV = 1
-		}
-		gqa := f.KV().HeadCountMax() / headsKV
-		graphPartialOffload = gqa * kvTotal / 6
+		graphPartialOffload = f.KV().GQA() * kvTotal / 6
 	}
 	if graphFullOffload == 0 {
 		graphFullOffload = graphPartialOffload
--- a/llm/server.go
+++ b/llm/server.go
@@ -139,13 +139,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		gpus = discover.GetCPUInfo()
 	}

-	// Verify the requested context size is <= the model training size
-	trainCtx := f.KV().ContextLength()
-	if opts.NumCtx/numParallel > int(trainCtx) && trainCtx > 0 {
-		slog.Warn("requested context size too large for model", "num_ctx", opts.NumCtx, "num_parallel", numParallel, "n_ctx_train", trainCtx)
-		opts.NumCtx = int(trainCtx) * numParallel
-	}
-
 	estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
 	if len(gpus) > 1 || gpus[0].Library != "cpu" {
 		switch {
@@ -318,7 +311,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		params = append(params, "--mmproj", projectors[0])
 	}

-	// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
+	// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
 	// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
 	// without any LD_LIBRARY_PATH flags
 	for {
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -124,9 +124,9 @@ type DeviceMemory struct {
 	// may not be persistent across instances of the runner.
 	Name string

-	// ID is an identifier for the device for matching with system
-	// management libraries.
-	ID string
+	// UUID is a unique persistent identifier for the device for matching
+	// with system management libraries
+	UUID string

 	// Weights is the per-layer memory needed for the model weights.
 	Weights []Memory
@@ -156,8 +156,8 @@ func (m DeviceMemory) LogValue() slog.Value {
 		attrs = append(attrs, slog.Any("Graph", m.Graph))
 	}

-	if len(attrs) > 0 && m.ID != "" {
-		attrs = append([]slog.Attr{slog.String("ID", m.ID)}, attrs...)
+	if len(attrs) > 0 && m.UUID != "" {
+		attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...)
 	}

 	return slog.GroupValue(attrs...)
@@ -253,7 +253,6 @@ type Tensor interface {

 	Neg(ctx Context) Tensor
 	Add(ctx Context, t2 Tensor) Tensor
-	Sub(ctx Context, t2 Tensor) Tensor
 	Mul(ctx Context, t2 Tensor) Tensor
 	Div(ctx Context, t2 Tensor) Tensor

@@ -276,15 +275,13 @@ type Tensor interface {
 	Cos(ctx Context) Tensor
 	Tanh(ctx Context) Tensor
 	GELU(ctx Context) Tensor
-	QuickGELU(ctx Context) Tensor
 	SILU(ctx Context) Tensor
-	RELU(ctx Context) Tensor
 	Sigmoid(ctx Context) Tensor

 	Reshape(ctx Context, shape ...int) Tensor
 	View(ctx Context, offset int, shape ...int) Tensor
 	Permute(ctx Context, shape ...int) Tensor
-	Contiguous(ctx Context, shape ...int) Tensor
+	Contiguous(ctx Context) Tensor
 	Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor

 	Pad(ctx Context, shape ...int) Tensor
@@ -300,12 +297,6 @@ type Tensor interface {

 	TopK(ctx Context, k int) Tensor
 	Argsort(ctx Context) Tensor
-	Mean(ctx Context) Tensor
-	Variance(ctx Context) Tensor
-	Stddev(ctx Context) Tensor
-	Sqr(ctx Context) Tensor
-	Sqrt(ctx Context) Tensor
-	Clamp(ctx Context, min, max float32) Tensor
 }

 // ScaledDotProductAttention implements a fused attention
@@ -469,5 +460,4 @@ const (
 	DTypeQ80
 	DTypeQ40
 	DTypeI32
-	DTypeMXFP4
 )
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -138,7 +138,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
 	var props C.struct_ggml_backend_dev_props
 	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
-	requiredMemory.CPU.ID = C.GoString(props.id)
+	requiredMemory.CPU.UUID = C.GoString(props.uuid)
 	requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
 	requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)

@@ -155,7 +155,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
 		var props C.struct_ggml_backend_dev_props
 		C.ggml_backend_dev_get_props(d, &props)
-		requiredMemory.GPUs[i].ID = C.GoString(props.id)
+		requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
 		requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
 		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
 	}
@@ -297,9 +297,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
 				createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
 			}
-		case contains(t.Name, "cls", "output", "output_norm",
-			"altup_proj", "altup_unembd_proj",
-			"per_layer_token_embd", "per_layer_model_proj", "per_layer_proj_norm"):
+		case contains(t.Name, "cls", "output", "output_norm"):
 			createTensor(tensor{source: t}, output.bts, blocks)
 		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
 			// TODO: assign vision tensors to the gpu if possible
@@ -355,26 +353,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		bbs[c] = b
 	}

-	// Mimic llama runner logs summarizing layers and memory
-	gpuLayers := 0
-	for _, layer := range layers {
-		if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
-			gpuLayers++
-		}
-	}
-	slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
-
-	switch C.ggml_backend_dev_type(output.d) {
-	case C.GGML_BACKEND_DEVICE_TYPE_CPU:
-		slog.Info("offloading output layer to CPU")
-	case C.GGML_BACKEND_DEVICE_TYPE_GPU:
-		slog.Info("offloading output layer to GPU")
-		gpuLayers++
-	case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
-		slog.Info("offloading output layer to ACCEL")
-	}
-	slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(layers)+1))
-
 	for bs := range maps.Values(bbs) {
 		slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
 	}
@@ -420,7 +398,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
 			C.int(len(schedBackends)),
 			C.size_t(maxGraphNodes),
-			C._Bool(false),
+			C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
 			C._Bool(false),
 		),
 		schedBackends: schedBackends,
@@ -624,9 +602,7 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
 }

 func (c *Context) Compute(tensors ...ml.Tensor) {
-	if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
-		panic(fmt.Errorf("error computing ggml graph: %v", status))
-	}
+	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
 	C.ggml_backend_sched_reset(c.b.sched)

 	needSync := true
@@ -708,8 +684,6 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
 		cdtype = C.GGML_TYPE_Q4_0
 	case ml.DTypeI32:
 		cdtype = C.GGML_TYPE_I32
-	case ml.DTypeMXFP4:
-		cdtype = C.GGML_TYPE_MXFP4
 	default:
 		panic("unsupported dtype")
 	}
@@ -898,8 +872,6 @@ func (t *Tensor) DType() ml.DType {
 		return ml.DTypeQ40
 	case C.GGML_TYPE_I32:
 		return ml.DTypeI32
-	case C.GGML_TYPE_MXFP4:
-		return ml.DTypeMXFP4
 	default:
 		return ml.DTypeOther
 	}
@@ -919,13 +891,6 @@ func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	}
 }

-func (t *Tensor) Sub(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_sub(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
-	}
-}
-
 func (t *Tensor) Repeat(ctx ml.Context, dim, n int) ml.Tensor {
 	if dim < 0 || dim >= C.GGML_MAX_DIMS {
 		panic("invalid dimension")
@@ -962,35 +927,10 @@ func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
 	}
 }

-func (t *Tensor) Contiguous(ctx ml.Context, shape ...int) ml.Tensor {
-	switch len(shape) {
-	case 0:
-		return &Tensor{
-			b: t.b,
-			t: C.ggml_cont(ctx.(*Context).ctx, t.t),
-		}
-	case 1:
-		return &Tensor{
-			b: t.b,
-			t: C.ggml_cont_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
-		}
-	case 2:
-		return &Tensor{
-			b: t.b,
-			t: C.ggml_cont_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
-		}
-	case 3:
-		return &Tensor{
-			b: t.b,
-			t: C.ggml_cont_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
-		}
-	case 4:
-		return &Tensor{
-			b: t.b,
-			t: C.ggml_cont_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
-		}
-	default:
-		panic("unsupported number of dimensions")
+func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_cont(ctx.(*Context).ctx, t.t),
 	}
 }

@@ -1205,18 +1145,11 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {

 func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
 	// Default options
-	opts := rope.Options{
-		Factors:               &Tensor{},
-		OriginalContextLength: 131072,
-		ExtrapolationFactor:   0.,
-		AttentionFactor:       1.,
-		BetaFast:              32.,
-		BetaSlow:              1.,
-	}
+	opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}

 	// Apply any provided options
 	for _, option := range options {
-		option(&opts)
+		option(opts)
 	}

 	dequant := t.t
@@ -1236,10 +1169,10 @@ func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase
 			C.int(opts.OriginalContextLength),
 			C.float(ropeBase),
 			C.float(ropeScale),
-			C.float(opts.ExtrapolationFactor),
-			C.float(opts.AttentionFactor),
-			C.float(opts.BetaFast),
-			C.float(opts.BetaSlow),
+			C.float(0.0),
+			C.float(1.0),
+			C.float(32.0),
+			C.float(1.0),
 		),
 	}
 }
@@ -1258,13 +1191,6 @@ func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
 	}
 }

-func (t *Tensor) QuickGELU(ctx ml.Context) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_gelu_quick_inplace(ctx.(*Context).ctx, t.t),
-	}
-}
-
 func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
 	return &Tensor{
 		b: t.b,
@@ -1272,13 +1198,6 @@ func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
 	}
 }

-func (t *Tensor) RELU(ctx ml.Context) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_relu_inplace(ctx.(*Context).ctx, t.t),
-	}
-}
-
 func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
 	return &Tensor{
 		b: t.b,
@@ -1354,52 +1273,3 @@ func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
 		t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC),
 	}
 }
-
-func (t *Tensor) Mean(ctx ml.Context) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_mean(ctx.(*Context).ctx, t.t),
-	}
-}
-
-func (t *Tensor) Variance(ctx ml.Context) ml.Tensor {
-	return t.Add(ctx, t.Mean(ctx).Scale(ctx, -1)).
-		Sqr(ctx).
-		SumRows(ctx).
-		Scale(ctx, 1/float64(t.Dim(0)))
-}
-
-func (t *Tensor) Stddev(ctx ml.Context) ml.Tensor {
-	return t.Variance(ctx).Sqrt(ctx)
-}
-
-func (t *Tensor) Sqr(ctx ml.Context) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_sqr(ctx.(*Context).ctx, t.t),
-	}
-}
-
-func (t *Tensor) Sqrt(ctx ml.Context) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_sqrt(ctx.(*Context).ctx, t.t),
-	}
-}
-
-func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_clamp(ctx.(*Context).ctx, t.t, C.float(min), C.float(max)),
-	}
-}
-
-func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
-	// Unchecked to handle quantized types
-	t := c.newTensor(dtype, shape)
-	if len(s) > 0 {
-		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
-	}
-
-	return t
-}
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -152,7 +152,7 @@ extern "C" {
    struct ggml_backend_dev_props {
        const char * name;
        const char * description;
-        const char * id;
+        const char * uuid;
        size_t memory_free;
        size_t memory_total;
        enum ggml_backend_dev_type type;
--- a/ml/backend/ggml/ggml/include/ggml.h
+++ b/ml/backend/ggml/ggml/include/ggml.h
@@ -353,7 +353,7 @@ extern "C" {
        GGML_TYPE_F16     = 1,
        GGML_TYPE_Q4_0    = 2,
        GGML_TYPE_Q4_1    = 3,
-        GGML_TYPE_MXFP4   = 4, // Formerly removed type GGML_TYPE_Q4_2
+        // GGML_TYPE_Q4_2 = 4, support has been removed
        // GGML_TYPE_Q4_3 = 5, support has been removed
        GGML_TYPE_Q5_0    = 6,
        GGML_TYPE_Q5_1    = 7,
--- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
@@ -573,16 +573,8 @@ void ggml_backend_load_all_from_path(const char * dir_path) {

    ggml_backend_load_best("blas", silent, dir_path);
    ggml_backend_load_best("cann", silent, dir_path);
-
-    // Avoid mixed hip+cuda configurations
-    const char * hip_devices = std::getenv("HIP_VISIBLE_DEVICES");
-    const char * rocr_devices = std::getenv("ROCR_VISIBLE_DEVICES"); 
-    if (!hip_devices && !rocr_devices) {
-        ggml_backend_load_best("cuda", silent, dir_path);
-    } else {
-        ggml_backend_load_best("hip", silent, dir_path);
-    }
-    
+    ggml_backend_load_best("cuda", silent, dir_path);
+    ggml_backend_load_best("hip", silent, dir_path);
    ggml_backend_load_best("kompute", silent, dir_path);
    ggml_backend_load_best("metal", silent, dir_path);
    ggml_backend_load_best("rpc", silent, dir_path);
--- a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp
@@ -505,11 +505,6 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
 };

 ggml_backend_reg_t ggml_backend_blas_reg(void) {
-    // MacOS prior to v14 does not include cblas_sgemm - disable this backend if it isn't available
-    if (&cblas_sgemm == NULL) {
-        GGML_LOG_INFO("Disabling ggml-blas backend on old MacOS version\n");
-        return NULL;
-    }
    static struct ggml_backend_reg ggml_backend_blas_reg = {
        /* .api_version = */ GGML_BACKEND_API_VERSION,
        /* .iface       = */ ggml_backend_blas_reg_i,
--- a/ml/backend/ggml/ggml/src/ggml-common.h
+++ b/ml/backend/ggml/ggml/src/ggml-common.h
@@ -417,13 +417,6 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");

-#define MXFP4 32
-typedef struct {
-    uint8_t d;              // scale E8M0 float 
-    uint8_t qs[MXFP4 / 2];  // (32) 4 bit elements E2M1 float
-} block_mxfp4;
-static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + MXFP4/2, "wrong mxfp4 block size/padding");
-
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL

--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.h
@@ -58,8 +58,6 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const
 void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

-void ggml_vec_dot_mxfp4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
-
 #ifdef __cplusplus
 }
 #endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
@@ -362,11 +362,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_K,
        .nrows                    = 1,
    },
-    [GGML_TYPE_MXFP4] = {
-        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_mxfp4,
-        .vec_dot_type             = GGML_TYPE_F32,
-        .nrows                    = 1,
-    },
 };

 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
@@ -4965,7 +4965,6 @@ void ggml_compute_forward_clamp(
        case GGML_TYPE_I32:
        case GGML_TYPE_I64:
        case GGML_TYPE_F64:
-        case GGML_TYPE_MXFP4:
        case GGML_TYPE_COUNT:
            {
                GGML_ABORT("fatal error");
--- a/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp
@@ -250,93 +250,3 @@ ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, fl
    }
    return sum = (ggml_float)logf(sum);
 }
-
-#define MXFP4 32
-typedef struct {
-    uint8_t d;              // scale E8M0 float 
-    uint8_t qs[MXFP4 / 2];  // (32) 4 bit elements E2M1 float
-} block_mxfp4;
-static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + MXFP4/2, "wrong mxfp4 block size/padding");
-#define MXFP4_VALS {0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, 0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0}
-
-void ggml_vec_dot_mxfp4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
-    assert(nrc == 1);
-    GGML_UNUSED(nrc);
-    GGML_UNUSED(bx);
-    GGML_UNUSED(by);
-    GGML_UNUSED(bs);
-    ggml_float mxfp4_table[] = MXFP4_VALS;
-
-#if defined(GGML_SIMD)
-    float sumf = 0.0f;
-    const int np = (n & ~(GGML_F32_STEP - 1));
-    const block_mxfp4 * GGML_RESTRICT xx = (const block_mxfp4 *) vx;
-    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
-
-    GGML_F32_VEC scalev;
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-    for (int i = 0; i < np; i += GGML_F32_STEP) { // ARM: +16  AVX512: +64
-        for (int j = 0; j < GGML_F32_ARR; j++) { // ARM: 0 .. 4 AVX512: 0 .. 4
-            // convert GGML_F32_ARR X elements 
-            const int ib = (i + j*GGML_F32_EPR) / MXFP4;
-            const block_mxfp4 * GGML_RESTRICT x = &xx[ib];
-            union {
-                uint32_t as_bits;
-                float as_value;
-            } scale;
-            scale.as_bits = (((uint32_t)x->d) << 23);
-            scalev = GGML_F32_VEC_SET1(scale.as_value);
-            float xf[GGML_F32_EPR]= {0.f};
-            assert(((i+j*GGML_F32_EPR) % MXFP4)+GGML_F32_ARR < MXFP4 && "block overrun");
-            for (int qi = 0; qi < GGML_F32_EPR/2 ; ++qi) {
-                xf[qi*2] = mxfp4_table[(x->qs[((i+j*GGML_F32_EPR)%MXFP4)/2+qi] & 0xf)];
-                xf[qi*2+1] = mxfp4_table[(x->qs[((i+j*GGML_F32_EPR)%MXFP4)/2+qi] & 0xf0) >> 4];
-            }
-
-            ax[j] = GGML_F32_VEC_MUL(GGML_F32_VEC_LOAD(xf), scalev);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-    GGML_F32_VEC_REDUCE(sumf, sum);
-
-    // leftovers
-    for (int i = np; i < n; i+=2) {
-        const int ib = i / MXFP4;
-        const block_mxfp4 * GGML_RESTRICT x = &xx[ib];
-        union {
-            uint32_t as_bits;
-            float as_value;
-        } scale;
-        scale.as_bits = (((uint32_t)x->d) << 23);
-        sumf += y[i] * scale.as_value * mxfp4_table[(x->qs[(i%MXFP4)/2] & 0xf)];
-        sumf += y[i+1] * scale.as_value * mxfp4_table[(x->qs[(i%MXFP4)/2] & 0xf0) >> 4];
-    }
-
-
-#else // defined(GGML_SIMD)
-    const int nb = n / MXFP4;
-    assert(n % MXFP4 == 0);
-
-    int yi = 0;
-
-    const block_mxfp4 * GGML_RESTRICT xx = (const block_mxfp4 *) vx;
-
-    ggml_float sumf = 0.0;
-    for (int ib = 0; ib < nb; ++ib) {
-        const block_mxfp4 * GGML_RESTRICT x = &xx[ib + 0];
-        union {
-            uint32_t as_bits;
-            float as_value;
-        } scale;
-        scale.as_bits = (((uint32_t)x->d) << 23);
-        for (int i = 0; i < MXFP4/2; ++i) {
-            sumf += mxfp4_table[(x->qs[i] & 0xf)] * (ggml_float)(scale.as_value) * (ggml_float)(y[ib*MXFP4 + i*2]);
-            sumf += mxfp4_table[(x->qs[i] & 0xf0) >> 4] * (ggml_float)(scale.as_value) * (ggml_float)(y[ib*MXFP4 + i*2+1]);
-        }
-    }
-#endif
-
-    *s = sumf;
-}
--- a/ml/backend/ggml/ggml/src/ggml-cpu/vec.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/vec.h
@@ -42,8 +42,6 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
 void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);

-void ggml_vec_dot_mxfp4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
-
 void ggml_vec_silu_f32(const int n, float * y, const float * x);
 ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
 ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
@@ -362,26 +362,6 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #endif // FP16_AVAILABLE
 }

-// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
-template<bool norm>
-static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockIdx.x;
-    const int col = threadIdx.x;
-
-    float sum = 0.0f;
-    for (int i = col; i < ncols; i += blockDim.x) {
-        sum += x[row * ncols + i];
-    }
-
-    sum = warp_reduce_sum(sum);
-
-    if (col != 0) {
-        return;
-    }
-
-    dst[row] = norm ? sum / ncols : sum;
-}
-
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
--- a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu
@@ -571,82 +571,6 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
 }

-// MXFP4 dequantize derived from dequantize_block_q4_0
-template<typename dst_t>
-static __global__ void dequantize_block_mxfp4(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
-    const uint16_t dst_bias = 15;
-    const uint16_t dst_0p5 = 0x3800;
-    const uint16_t dst_m_bits = 10;
-    const int64_t i = blockIdx.x;
-
-    // assume 32 threads
-    const int64_t tid = threadIdx.x;
-    const int64_t il  = tid/8;
-    const int64_t ir  = tid%8;
-    const int64_t ib = 8*i + ir;
-    if (ib >= nb32) {
-        return;
-    }
-
-    const uint64_t offset = 256*i + MXFP4*ir + 8*il;
-    dst_t * y = yy + offset;
-
-    const block_mxfp4 * x = (const block_mxfp4 *)vx + ib;
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } scale;
-    scale.as_bits = (((uint32_t)x->d) << 23);
-
-    // offset within the block 1/4 chunks (8 items)
-    const uint8_t * q = x->qs + 4*il;
-
-    for (int l = 0; l < 4; ++l) {
-        uint16_t em0 = q[l] & 0x07;
-        uint16_t em1 = q[l] & 0x70;
-        // float16 values
-        iq1m_scale_t x0;
-        iq1m_scale_t x1;
-
-        x0.u16 = (em0 << (dst_m_bits - 1)) | ((q[l] & 0x08) << 12);
-        x1.u16 = (em1 << (dst_m_bits - 5)) | ((q[l] & 0x80) << 8);
-
-        // Three cases:
-        // x is normal and non-zero: Correct bias
-        if ((em0 & 0x06) != 0) {
-            x0.u16 = x0.u16 + ((dst_bias - 1) << dst_m_bits);
-        }
-        if ((em1 & 0x60) != 0) {
-            x1.u16 = x1.u16 + ((dst_bias - 1) << dst_m_bits);
-        }
-        // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
-        if (em0 == 0x01) {
-            x0.u16 = dst_0p5 | (x0.u16 & 0x8000);
-        }
-        if (em1 == 0x10) {
-            x1.u16 = dst_0p5 | (x1.u16 & 0x8000);
-        }
-        // x is zero, do nothing
-
-        // XXX it looks correct here - but mulmat still gives bad results...
-        // printf("i:%lld ir:%lld il:%lld l:%d y_offset:[%3lld +%d] = %f \n",
-        //     i, ir, il, l, 256*i + 32*ir + 4*il, l*2+ 0, scale * float(x0.f16));
-        // printf("i:%lld ir:%lld il:%lld l:%d y_offset:[%3lld +%d] = %f \n",
-        //     i, ir, il, l, 256*i + 32*ir + 4*il, l*2+ 1, scale * float(x1.f16));
-
-        y[l*2] = scale.as_value * float(x0.f16);
-        y[l*2+1] = scale.as_value * float(x1.f16);
-    }
-}
-
-// derived from dequantize_row_q4_0_cuda
-template<typename dst_t>
-static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
-    const int nb32 = k / 32;
-    const int nb = (k + 255) / 256;
-    dequantize_block_mxfp4<<<nb, 32, 0, stream>>>(vx, y, nb32);
-}
-
 template <typename src_t, typename dst_t>
 static __global__ void convert_unary(
        const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
@@ -740,8 +664,6 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
            return convert_unary_cont_cuda<float>;
        case GGML_TYPE_BF16:
            return convert_unary_cont_cuda<nv_bfloat16>;
-        case GGML_TYPE_MXFP4:
-            return dequantize_row_mxfp4_cuda;
        default:
            return nullptr;
    }
@@ -791,8 +713,6 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
            return convert_unary_cont_cuda<half>;
        case GGML_TYPE_BF16:
            return convert_unary_cont_cuda<nv_bfloat16>;
-        case GGML_TYPE_MXFP4:
-            return dequantize_row_mxfp4_cuda;
        default:
            return nullptr;
    }
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -21,7 +21,6 @@
 #include "ggml-cuda/im2col.cuh"
 #include "ggml-cuda/mmq.cuh"
 #include "ggml-cuda/mmv.cuh"
-#include "ggml-cuda/mmvmxfp4.cuh"
 #include "ggml-cuda/mmvq.cuh"
 #include "ggml-cuda/norm.cuh"
 #include "ggml-cuda/opt-step-adamw.cuh"
@@ -36,7 +35,6 @@
 #include "ggml-cuda/ssm-scan.cuh"
 #include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
-#include "ggml-cuda/mean.cuh"
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
@@ -1203,7 +1201,7 @@ static void ggml_cuda_op_mul_mat_cublas(

    const int cc = ggml_cuda_info().devices[id].cc;

-    const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT && src0->type != GGML_TYPE_MXFP4;
+    const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;

    if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
        ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
@@ -1925,11 +1923,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE
-        && src0->type != GGML_TYPE_MXFP4;
-    bool use_mul_mat_vec_mxfp4 = src0->type == GGML_TYPE_MXFP4
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
    bool use_mul_mat_q     = ggml_is_quantized(src0->type) && !bad_padding_clear
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

@@ -1983,8 +1977,6 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
    } else if (use_mul_mat_q) {
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
-    } else if (use_mul_mat_vec_mxfp4) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_mxfp4, nullptr);
    } else {
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
    }
@@ -2004,10 +1996,6 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;

    if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        if (ne2 == 1 && src0->type == GGML_TYPE_MXFP4) {
-            ggml_cuda_mul_mat_vec_mxfp4(ctx, src0, src1, ids, dst);
-            return;
-        }
        if (ne2 == 1) {
            if (ggml_is_quantized(src0->type)) {
                ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
@@ -2334,9 +2322,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_SUM_ROWS:
            ggml_cuda_op_sum_rows(ctx, dst);
            break;
-        case GGML_OP_MEAN:
-            ggml_cuda_op_mean(ctx, dst);
-            break;
        case GGML_OP_SSM_CONV:
            ggml_cuda_op_ssm_conv(ctx, dst);
            break;
@@ -2485,9 +2470,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
    // Loop over nodes in GGML graph to obtain info needed for CUDA graph
    cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();

-    const std::string gemma3n_per_layer_proj_src1_name   = " (reshaped)";
-    const std::string gemma3n_node_name                  = "node_";
-
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];

@@ -2509,6 +2491,15 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
        }

+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+            // disable CUDA graphs for batch size > 1 for now.
+            // Changes in batch size or context size can cause changes to the grid size of some kernels.
+            use_cuda_graph = false;
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+#endif
+        }
+
        if (node->op == GGML_OP_CPY) {

            // Store the pointers which are updated for each token, such that these can be sent
@@ -2893,7 +2884,7 @@ struct ggml_backend_cuda_device_context {
    int device;
    std::string name;
    std::string description;
-    std::string id;
+    std::string uuid;
 };

 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2906,9 +2897,9 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
    return ctx->description.c_str();
 }

-static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-    return ctx->id.c_str();
+    return ctx->uuid.c_str();
 }

 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
@@ -2925,7 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
    props->name        = ggml_backend_cuda_device_get_name(dev);
    props->description = ggml_backend_cuda_device_get_description(dev);
-    props->id          = ggml_backend_cuda_device_get_id(dev);
+    props->uuid        = ggml_backend_cuda_device_get_uuid(dev);
    props->type        = ggml_backend_cuda_device_get_type(dev);
    ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);

@@ -3053,7 +3044,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                    case GGML_TYPE_IQ4_NL:
                    case GGML_TYPE_IQ4_XS:
                    case GGML_TYPE_BF16:
-                    case GGML_TYPE_MXFP4:
 #ifdef GGML_USE_MUSA
                        if (a->type == GGML_TYPE_Q3_K) {
                            return false;
@@ -3221,7 +3211,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_POOL_2D:
        case GGML_OP_SUM:
        case GGML_OP_SUM_ROWS:
-        case GGML_OP_MEAN:
        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
            return true;
@@ -3477,8 +3466,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                dev_ctx->description = prop.name;

                #if !defined(GGML_USE_HIP)
-                char id[64];
-                snprintf(id, sizeof(id),
+                char uuid[64];
+                snprintf(uuid, sizeof(uuid),
                    "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
                    (unsigned char)prop.uuid.bytes[0],
                    (unsigned char)prop.uuid.bytes[1],
@@ -3497,15 +3486,9 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                    (unsigned char)prop.uuid.bytes[14],
                    (unsigned char)prop.uuid.bytes[15]
                  );
-                dev_ctx->id = id;
+                dev_ctx->uuid = uuid;
                #else
-                #ifdef _WIN32
-                char id[16];
-                snprintf(id, sizeof(id), "%d", i);
-                dev_ctx->id = id;
-                #else
-                dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
-                #endif
+                dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
                #endif

                ggml_backend_dev_t dev = new ggml_backend_device {
--- a/ml/backend/ggml/ggml/src/ggml-cuda/mean.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/mean.cu
@@ -1,19 +0,0 @@
-#include "mean.cuh"
-
-void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0   = dst->src[0];
-    const float *       src0_d = (const float *) src0->data;
-    float *             dst_d  = (float *) dst->data;
-    cudaStream_t        stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(nrows, 1, 1);
-    reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
-}
--- a/ml/backend/ggml/ggml/src/ggml-cuda/mean.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/mean.cuh
@@ -1,3 +0,0 @@
-#include "common.cuh"
-
-void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/mmvmxfp4.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvmxfp4.cu
@@ -1,307 +0,0 @@
-#include "ggml.h"
-#include "common.cuh"
-#include "mmvmxfp4.cuh"
-
-// MXFP4 implementation derived from mmv.cu float32 code paths
-typedef union {
-    half f16;
-    uint16_t  u16;
-} f16_t;
-
-template <typename type_acc, int block_size> // TODO type_acc unused - consider bf16 support
-static __global__ void mul_mat_vec_mxfp4(
-        const block_mxfp4 * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
-        const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row,
-        const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
-        const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) {
-    const int64_t row         = blockIdx.x;
-    const int64_t channel_dst = blockIdx.y;
-    const int64_t channel_x   = ids ? ids[channel_dst]          : channel_dst / channel_ratio;
-    const int64_t channel_y   = ids ? channel_dst % nchannels_y : channel_dst;
-    const int64_t sample_dst  = blockIdx.z;
-    const int64_t sample_x    = sample_dst / sample_ratio;
-    const int64_t sample_y    = sample_dst;
-    const int     tid         = threadIdx.x;
-    constexpr int warp_size   = ggml_cuda_get_physical_warp_size();
-
-    const uint16_t dst_bias = 15;
-    const uint16_t dst_0p5 = 0x3800;
-    const uint16_t dst_m_bits = 10;
-
-    x   += sample_x  *stride_sample_x   + channel_x  *stride_channel_x   + row*stride_row;
-    y   += sample_y  *stride_sample_y   + channel_y  *stride_channel_y;
-    dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst;
-    
-    const float2 * y2 = (const float2 *) y;
-
-    extern __shared__ char data_mmv[]; // allocated in GPU shared memory: warp_size*sizeof(float)
-    float * buf_iw = (float *) data_mmv;
-
-    if (block_size > warp_size) {
-        if (tid < warp_size) {
-            buf_iw[tid] = 0.0f;
-        }
-        __syncthreads();
-    }
-
-    float sumf = 0.0f;
-
-    for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
-        int offset0 = col2 / (MXFP4/2);
-        int i = col2 % (MXFP4/2);
-        const block_mxfp4 *x2 = x+offset0;
-
-        union {
-            uint32_t as_bits;
-            float as_value;
-        } scale;
-        scale.as_bits = (((uint32_t)x2->d) << 23);
-        uint16_t em0 = x2->qs[i] & 0x07;
-        uint16_t em1 = x2->qs[i] & 0x70;
-        // float16 values
-        f16_t x0;
-        f16_t x1;
-        x0.u16 = (em0 << (dst_m_bits - 1)) | ((x2->qs[i] & 0x08) << 12);
-        x1.u16 = (em1 << (dst_m_bits - 5)) | ((x2->qs[i] & 0x80) << 8);
-
-        // Three cases:
-        // x is normal and non-zero: Correct bias
-        if ((em0 & 0x06) != 0) {
-            x0.u16 = x0.u16 + ((dst_bias - 1) << dst_m_bits);
-        }
-        if ((em1 & 0x60) != 0) {
-            x1.u16 = x1.u16 + ((dst_bias - 1) << dst_m_bits);
-        }
-        // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
-        if (em0 == 0x01) {
-            x0.u16 = dst_0p5 | (x0.u16 & 0x8000);
-        }
-        if (em1 == 0x10) {
-            x1.u16 = dst_0p5 | (x1.u16 & 0x8000);
-        }
-        // x is zero, do nothing
-
-        if (isnan(scale.as_value)) {
-            sumf = scale.as_value;
-            break;
-        }
-
-        const float2 tmpx = {x0.f16, x1.f16};
-        const float2 tmpy = y2[col2];
-        sumf += tmpx.x*tmpy.x*scale.as_value;
-        sumf += tmpx.y*tmpy.y*scale.as_value;
-    }
-
-    sumf = warp_reduce_sum<warp_size>(sumf);
-
-    if (block_size > warp_size) {
-        buf_iw[tid/warp_size] = sumf;
-        __syncthreads();
-        if (tid >= warp_size) {
-            return;
-        }
-        sumf = buf_iw[tid];
-        sumf = warp_reduce_sum<warp_size>(sumf);
-    }
-
-    if (tid != 0) {
-        return;
-    }
-
-    dst[row] = sumf;
-}
-
-template <typename type_acc>
-static void launch_mul_mat_vec_cuda_mxfp4(
-        const block_mxfp4 * x, const float * y, const int32_t * ids, float * dst,
-        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
-        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
-        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
-        cudaStream_t stream) {
-    GGML_ASSERT(ncols      % 2 == 0);
-    // GGML_ASSERT(stride_row % 2 == 0); // TODO 
-    GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
-    GGML_ASSERT(       nsamples_dst  % nsamples_x  == 0);
-    const int64_t channel_ratio = nchannels_dst / nchannels_x;
-    const int64_t sample_ratio  = nsamples_dst  / nsamples_x;
-    int device;
-    int warp_size;
-
-    CUDA_CHECK(cudaGetDevice(&device));
-    warp_size = ggml_cuda_info().devices[device].warp_size;
-
-    int64_t block_size_best = warp_size;
-    int64_t niter_best      = (ncols + 2*warp_size - 1) / (2*warp_size);
-    int64_t max_block_size  = 256;
-    if(ggml_cuda_info().devices[device].cc > GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_info().devices[device].cc < GGML_CUDA_CC_RDNA1) {
-        max_block_size = 128;
-    }
-    for (int64_t block_size = 2*warp_size; block_size <= max_block_size; block_size += warp_size) {
-        const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
-        if (niter < niter_best) {
-            niter_best      = niter;
-            block_size_best = block_size;
-        }
-    }
-
-    const int smem = warp_size*sizeof(float);
-    const dim3 block_nums(nrows, nchannels_dst, nsamples_dst);
-    const dim3 block_dims(block_size_best, 1, 1);
-
-    switch (block_size_best) {
-        case   32: {
-            mul_mat_vec_mxfp4<type_acc,  32><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case   64: {
-            mul_mat_vec_mxfp4<type_acc,  64><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case   96: {
-            mul_mat_vec_mxfp4<type_acc,  96><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case  128: {
-            mul_mat_vec_mxfp4<type_acc, 128><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case  160: {
-            mul_mat_vec_mxfp4<type_acc, 160><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case  192: {
-            mul_mat_vec_mxfp4<type_acc, 192><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case  224: {
-            mul_mat_vec_mxfp4<type_acc, 224><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        case  256: {
-            mul_mat_vec_mxfp4<type_acc, 256><<<block_nums, block_dims, smem, stream>>>
-                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
-                 stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
-        } break;
-        default: {
-            GGML_ABORT("fatal error");
-        } break;
-    }
-}
-
-static void mul_mat_vec_cuda_mxfp4(
-        const block_mxfp4 * x, const float * y, const int32_t * ids, float * dst,
-        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
-        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
-        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
-        enum ggml_prec prec, cudaStream_t stream) {
-    launch_mul_mat_vec_cuda_mxfp4<float>
-        (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
-         stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
-}
-
-void ggml_cuda_mul_mat_vec_mxfp4(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
-    GGML_ASSERT(        src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(!ids ||  ids->type == GGML_TYPE_I32);
-    GGML_ASSERT(         dst->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const size_t ts_src0 = ggml_type_size(src0->type);
-    const size_t ts_src1 = ggml_type_size(src1->type);
-    const size_t ts_dst  = ggml_type_size(dst->type);
-
-    GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for  batch size 1.
-    GGML_ASSERT(ne13 == ne3);
-
-    // GGML_ASSERT(        nb00       == ts_src0); // TODO adjust for block sizing logic
-    GGML_ASSERT(        nb10       == ts_src1);
-    GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
-    GGML_ASSERT(        nb0        == ts_dst);
-
-    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
-
-    const float   * src1_d =       (const float   *) src1->data;
-    const int32_t *  ids_d = ids ? (const int32_t *)  ids->data : nullptr;
-    float         *  dst_d =       (float         *)  dst->data;
-
-    const int64_t stride_row = src0->nb[1] / ts_src0;
-    const int64_t s11 = src1->nb[1] / ts_src1;
-    const int64_t s1  =  dst->nb[1] / ts_dst;
-    const int64_t stride_channel_x = src0->nb[2] / ts_src0;
-    const int64_t s12 = src1->nb[2] / ts_src1;
-    const int64_t s2  =  dst->nb[2] / ts_dst;
-    const int64_t stride_sample_x = src0->nb[3] / ts_src0;
-    const int64_t stride_sample_y = src1->nb[3] / ts_src1;
-    const int64_t stride_sample_dst  =  dst->nb[3] / ts_dst;
-    const int64_t nsamples_dst = ne3;
-    const int64_t nsamples_x = ne03;
-    const int64_t nchannels_x = ne02;
-    const int64_t nrows = ne01;
-    const int64_t ncols = ne00;
-
-    // For MUL_MAT_ID the memory layout is different than for MUL_MAT:
-    const int64_t ncols_dst          = ids ? ne2  : ne1;
-    const int64_t nchannels_y        = ids ? ne11 : ne12;
-    const int64_t nchannels_dst      = ids ? ne1  : ne2;
-    const int64_t stride_channel_dst = ids ? s1   : s2;
-    const int64_t stride_channel_y   = ids ? s11  : s12;
-
-    GGML_ASSERT(ncols_dst == 1);
-
-    const block_mxfp4 * src0_d = (const block_mxfp4 *) src0->data;
-    mul_mat_vec_cuda_mxfp4(src0_d, src1_d, ids_d, dst_d, ncols, nrows, stride_row,
-        nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-        nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, ctx.stream());
-}
-
-void ggml_cuda_op_mul_mat_vec_mxfp4(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    GGML_ASSERT(src1_ncols == 1);
-
-    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
-
-    // ggml_cuda_op provides single, contiguous matrices
-    const int64_t stride_row         = ne00 / MXFP4; 
-    const int64_t nchannels_x        = 1;
-    const int64_t nchannels_y        = 1;
-    const int64_t nchannels_dst      = 1;
-    const int64_t stride_channel_x   = 0;
-    const int64_t stride_channel_y   = 0;
-    const int64_t stride_channel_dst = 0;
-    const int64_t nsamples_x         = 1;
-    const int64_t nsamples_dst       = 1;
-    const int64_t stride_sample_x    = 0;
-    const int64_t stride_sample_y    = 0;
-    const int64_t stride_sample_dst  = 0;
-
-    const block_mxfp4 * src0_d = (const block_mxfp4 *) src0_dd_i;
-    mul_mat_vec_cuda_mxfp4(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row,
-        nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
-        nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
-
-    GGML_UNUSED(ctx);
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_ddq_i);
-    GGML_UNUSED(src1_ncols);
-    GGML_UNUSED(src1_padded_row_size);
-}
--- a/ml/backend/ggml/ggml/src/ggml-cuda/mmvmxfp4.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvmxfp4.cuh
@@ -1,9 +0,0 @@
-#include "common.cuh"
-
-void ggml_cuda_mul_mat_vec_mxfp4(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
-
-void ggml_cuda_op_mul_mat_vec_mxfp4(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu
@@ -1,9 +1,25 @@
 #include "sumrows.cuh"

+static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { // dst[row] = sum of the ncols floats in row `row` of x
+    const int row = blockIdx.x;   // one block per row
+    const int col = threadIdx.x;  // first column visited by this thread
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += blockDim.x) {  // threads of the block step across the row blockDim.x apart
+        sum += x[(int64_t) row * ncols + i];  // 64-bit offset: row * ncols can exceed INT_MAX for large tensors
+    }
+
+    sum = warp_reduce_sum(sum);  // presumably adds the per-thread partials across the warp (helper defined outside this hunk)
+
+    if (col == 0) {  // only thread 0 of the block stores the row total
+        dst[row] = sum;
+    }
+}
+
 void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    const dim3 block_dims(WARP_SIZE, 1, 1);
    const dim3 block_nums(nrows, 1, 1);
-    reduce_rows_f32</*norm*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);  // nrows blocks of WARP_SIZE threads: one block per row, on `stream`
 }

 void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -19,8 +35,5 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(nrows, 1, 1);
-
-    reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+    sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
 }
--- a/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cuh
@@ -1,4 +1,5 @@
 #include "common.cuh"

 void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
+
 void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
@@ -421,13 +421,6 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");

-#define MXFP4 32
-typedef struct {
-    uint8_t d;              // scale E8M0 float 
-    uint8_t qs[MXFP4 / 2];  // (32) 4 bit elements E2M1 float
-} block_mxfp4;
-static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + MXFP4/2, "wrong mxfp4 block size/padding");
-
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL

@@ -1936,9 +1929,6 @@ GGML_TABLE_END()
 #define N_R0_IQ4_XS 2
 #define N_SG_IQ4_XS 2

-#define N_R0_MXFP4 4
-#define N_SG_MXFP4 2
-
 // kernel argument structs
 //
 // - element counters (e.g. ne00) typically use int32_t to reduce register usage
@@ -3444,61 +3434,31 @@ kernel void kernel_neg(
    dst[tpig] = -src0[tpig];
 }

-template <bool norm>
 kernel void kernel_sum_rows(
-        constant ggml_metal_kargs_sum_rows & args,
        device const float * src0,
        device       float * dst,
-        threadgroup  float * shmem_f32 [[threadgroup(0)]],
-        uint3   tgpig[[threadgroup_position_in_grid]],
-        ushort3 tpitg[[thread_position_in_threadgroup]],
-        ushort  sgitg[[simdgroup_index_in_threadgroup]],
-        ushort  tiisg[[thread_index_in_simdgroup]],
-        ushort3   ntg[[threads_per_threadgroup]]) {
-    int64_t i3 = tgpig.z;
-    int64_t i2 = tgpig.y;
-    int64_t i1 = tgpig.x;
+        constant ggml_metal_kargs_sum_rows & args,
+        uint3 tpig[[thread_position_in_grid]]) {  // rows are now selected per thread, not per threadgroup
+    int64_t i3 = tpig.z;
+    int64_t i2 = tpig.y;
+    int64_t i1 = tpig.x;  // (i1, i2, i3) index the row this thread sums

    if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) {
        return;
    }

-    if (sgitg == 0) {
-        shmem_f32[tiisg] = 0.0f;
-    }
-
    device const float * src_row = (device const float *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
    device       float * dst_row = (device       float *) ((device       char *) dst  + i1*args.nb1  + i2*args.nb2  + i3*args.nb3);

-    float sumf = 0;
+    float row_sum = 0;

-    for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
-        sumf += src_row[i0];
+    for (int64_t i0 = 0; i0 < args.ne00; i0++) {  // a single thread walks all ne00 elements of its row
+        row_sum += src_row[i0];
    }

-    sumf = simd_sum(sumf);
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    if (tiisg == 0) {
-        shmem_f32[sgitg] = sumf;
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    sumf = shmem_f32[tiisg];
-    sumf = simd_sum(sumf);
-
-    if (tpitg.x == 0) {
-        dst_row[0] = norm ? sumf / args.ne00 : sumf;
-    }
+    dst_row[0] = row_sum;  // plain sum; the removed `norm` variant divided by args.ne00
 }

-typedef decltype(kernel_sum_rows<false>) kernel_sum_rows_t;
-
-template [[host_name("kernel_sum_rows")]] kernel kernel_sum_rows_t kernel_sum_rows<false>;
-template [[host_name("kernel_mean")]]     kernel kernel_sum_rows_t kernel_sum_rows<true>;
-
 template<typename T>
 kernel void kernel_soft_max(
        device const  char * src0,
@@ -4390,16 +4350,16 @@ void mul_vec_q_n_f32_impl(
        device const char * src1,
        device       char * dst,
        threadgroup  char * shmem,
-        uint3  tgpig, // Threadgroup Position in Grid
-        ushort tiisg, // Thread Index in SIMD Group
-        ushort sgitg) { // SIMD Group Index in ThreadGroup
-    const int nb = args.ne00/QK4_0; // src0->ne[0] / 32
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+    const int nb = args.ne00/QK4_0;

    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;

-    const int first_row = (r0 * nsg + sgitg) * nr0; // nsg=2 nr0=4
+    const int first_row = (r0 * nsg + sgitg) * nr0;

    const uint i12 = im%args.ne12;
    const uint i13 = im/args.ne12;
@@ -9232,49 +9192,6 @@ kernel void kernel_mul_mm_id(
    }
 }

-template <typename type4x4>
-void dequantize_mxfp4(device const block_mxfp4 * xb, short il, thread type4x4 & reg) {
-    float4x4 reg_f;
-    const ushort dst_bias = 15;
-    const ushort dst_0p5 = 0x3800;
-    const ushort dst_m_bits = 10;
-    const half scale = (half)(as_type<float>(((uint32_t)xb->d) << 23));
-    // il:0 first 16, il:1 last 16
-    for (int i = 0; i < 8; i++) {
-        ushort em0 = xb->qs[il*8 + i] & 0x07;
-        ushort em1 = xb->qs[il*8 + i] & 0x70;
-        // float16 values
-        ushort x0 = (em0 << (dst_m_bits - 1)) | ((xb->qs[il*8 + i] & 0x08) << 12);
-        ushort x1 = (em1 << (dst_m_bits - 5)) | ((xb->qs[il*8 + i] & 0x80) << 8);
-
-        // Three cases:
-        // x is normal and non-zero: Correct bias
-        if ((em0 & 0x06) != 0) {
-            x0 = x0 + ((dst_bias - 1) << dst_m_bits);
-        }
-        if ((em1 & 0x60) != 0) {
-            x1 = x1 + ((dst_bias - 1) << dst_m_bits);
-        }
-        // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
-        if (em0 == 0x01) {
-            x0 = dst_0p5 | (x0 & 0x8000);
-        }
-        if (em1 == 0x10) {
-            x1 = dst_0p5 | (x1 & 0x8000);
-        }
-        // x is zero, do nothing
-
-        if (isnan(scale)) {
-            reg_f[i/2][2*(i%2) + 0] = scale;
-            reg_f[i/2][2*(i%2) + 1] = scale;
-        } else {
-            reg_f[i/2][2*(i%2) + 0] = scale * as_type<half>(x0);
-            reg_f[i/2][2*(i%2) + 1] = scale * as_type<half>(x1);
-        }
-    }
-    reg = (type4x4) reg_f;
-}
-
 #define QK_NL 16

 //
@@ -9342,8 +9259,6 @@ template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mul_mm_t kernel_mul_m
 template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
 template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;

-template [[host_name("kernel_mul_mm_mxfp4_f32")]]   kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_mxfp4,   2,     dequantize_mxfp4>;
-
 //
 // indirect matrix-matrix multiplication
 //
@@ -9375,8 +9290,6 @@ template [[host_name("kernel_mul_mm_id_iq1_m_f16")]]   kernel mul_mm_id kernel_m
 template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]]  kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
 template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]]  kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;

-template [[host_name("kernel_mul_mm_id_mxfp4_f16")]]   kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_mxfp4,    2,    dequantize_mxfp4>;
-

 //
 // matrix-vector multiplication
@@ -9493,120 +9406,6 @@ kernel void kernel_mul_mv_id(
        sgitg);
 }

-// MXFP32 implementation derived from mul_vec_q_n_f32_impl and block_q_n_dot_y
-void mul_mv_mxfp4_f32_impl(
-        ggml_metal_kargs_mul_mv args,
-        device const char * src0,
-        device const char * src1,
-        device       char * dst,
-        threadgroup  char * shmem,
-        uint3  tgpig,
-        ushort tiisg,
-        ushort sgitg) {
-    const ushort dst_bias = 15;
-    const ushort dst_0p5 = 0x3800;
-    const ushort dst_m_bits = 10;
-    const int nr0 = N_R0_MXFP4;
-    const int nsg = N_SG_MXFP4;
-    const int nw = N_SIMDWIDTH;
-    const int nb = args.ne00/MXFP4;
-
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * nsg + sgitg) * nr0;
-
-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
-
-    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
-
-    device const float       * y = (device const float       *) (src1 + offset1);
-
-    // pointers to src0 rows
-    device const block_mxfp4 * ax[nr0];
-    for (int row = 0; row < nr0; ++row) {
-        const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
-
-        ax[row] = (device const block_mxfp4 *) ((device char *) src0 + offset0);
-    }
-
-    float yl[16]; // src1 vector cache
-    float sumf[nr0] = {0.f};
-
-    const short ix = (tiisg/2);
-    const short il = (tiisg%2)*16;
-
-    device const float * yb = y + ix*MXFP4 + il;
-
-    // each thread in a SIMD group deals with half a block.
-    for (int ib = ix; ib < nb; ib += nw/2) {
-
-#pragma unroll
-        for (short row = 0; row < nr0; row++) {
-            // Processes 16 items
-            device const block_mxfp4 * qb_curr = ax[row] + ib;
-            float d = as_type<float>(((uint32_t)(ax[row] + ib)->d) << 23);
-            // il = 0 or 16
-            device const uint8_t *qs = ((device const uint8_t *) qb_curr + 1 + il/2);
-            for (int i = 0; i < 8; ++i) {
-                ushort em0 = qs[i] & 0x07;
-                ushort em1 = qs[i] & 0x70;
-                ushort x0 = (em0 << (dst_m_bits - 1)) | ((qs[i] & 0x08) << 12);
-                ushort x1 = (em1 << (dst_m_bits - 5)) | ((qs[i] & 0x80) << 8);
-                // Three cases:
-                // x is normal and non-zero: Correct bias
-                if ((em0 & 0x06) != 0) {
-                    x0 = x0 + ((dst_bias - 1) << dst_m_bits);
-                }
-                if ((em1 & 0x60) != 0) {
-                    x1 = x1 + ((dst_bias - 1) << dst_m_bits);
-                }
-                // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
-                if (em0 == 0x01) {
-                    x0 = dst_0p5 | (x0 & 0x8000);
-                }
-                if (em1 == 0x10) {
-                    x1 = dst_0p5 | (x1 & 0x8000);
-                }
-                // x is zero, do nothing
-                if (!isnan(d)) {
-                    sumf[row] += yb[i*2] * as_type<half>(x0) * d
-                        + yb[i*2+1] * as_type<half>(x1) * d;
-                } else {
-                    sumf[row] = d;
-                }
-            }
-        }
-
-        yb += MXFP4 * 16;
-    }
-
-    device float * dst_f32 = (device float *) dst + im*args.ne0*args.ne1 + r1*args.ne0;
-
-    for (int row = 0; row < nr0; ++row) {
-        const float tot = simd_sum(sumf[row]);
-
-        if (tiisg == 0 && first_row + row < args.ne01) {
-            dst_f32[first_row + row] = tot;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_mxfp4_f32")]]
-kernel void kernel_mul_mv_mxfp4_f32(
-        constant ggml_metal_kargs_mul_mv & args,
-        device const char * src0,
-        device const char * src1,
-        device       char * dst,
-        threadgroup  char * shmem [[threadgroup(0)]],
-        uint3  tgpig[[threadgroup_position_in_grid]],
-        ushort tiisg[[thread_index_in_simdgroup]],
-        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_mv_mxfp4_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
-}
-
 typedef decltype(kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>) kernel_mul_mv_id_t;

 template [[host_name("kernel_mul_mv_id_f32_f32")]]     kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>;
@@ -9636,8 +9435,6 @@ template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]   kernel kernel_mul_mv_id_t
 template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl <N_R0_IQ4_NL,  N_SG_IQ4_NL,  N_SIMDWIDTH>>>;
 template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl <N_R0_IQ4_XS,  N_SG_IQ4_XS,  N_SIMDWIDTH>>>;

-template [[host_name("kernel_mul_mv_id_mxfp4_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_mv_mxfp4_f32_impl>>;
-
 kernel void kernel_pool_2d_max_f32(
        device  const float * src0,
        device        float * dst,
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -65,9 +65,6 @@
 #define N_R0_IQ4_XS 2
 #define N_SG_IQ4_XS 2

-#define N_R0_MXFP4 4
-#define N_SG_MXFP4 2
-
 // kernel argument structs
 //
 // - element counters (e.g. ne00) typically use int32_t to reduce register usage
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
@@ -40,7 +40,6 @@ static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
 static struct ggml_backend_reg    g_ggml_backend_metal_reg;
 static struct ggml_backend_device g_ggml_backend_metal_device;

-
 // information about a Metal device
 // note: assumes single GPU device - the default one
 // TODO: support multiple GPU devices
@@ -90,11 +89,7 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
        ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];

 #if defined(GGML_METAL_USE_BF16)
-        if (@available(macOS 14.0, *)) {
-            ctx->use_bfloat = ctx->has_bfloat;
-        } else {
-            ctx->use_bfloat = false;
-        }
+        ctx->use_bfloat = ctx->has_bfloat;
 #else
        ctx->use_bfloat = false;
 #endif
@@ -210,7 +205,6 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,
    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,
    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,
@@ -290,7 +284,6 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32,
@@ -313,7 +306,6 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16,
@@ -338,7 +330,6 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16,
    GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,
    GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
    GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
@@ -498,7 +489,6 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_COS,
    GGML_METAL_KERNEL_TYPE_NEG,
    GGML_METAL_KERNEL_TYPE_SUM_ROWS,
-    GGML_METAL_KERNEL_TYPE_MEAN,
    GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
    GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
    GGML_METAL_KERNEL_TYPE_ARGMAX,
@@ -939,7 +929,7 @@ static id<MTLLibrary> ggml_metal_load_library(id<MTLDevice> device, bool use_bfl

            MTLCompileOptions * options = [MTLCompileOptions new];
            options.preprocessorMacros = prep;
-            
+
            //[options setFastMathEnabled:false];

            metal_library = [device newLibraryWithSource:src options:options error:&error];
@@ -1162,7 +1152,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,                 mul_mv_q5_0_f32,                 has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,                 mul_mv_q5_1_f32,                 has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,                 mul_mv_q8_0_f32,                 has_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32,                mul_mv_mxfp4_f32,                has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,         mul_mv_ext_f16_f32_r1_2,         has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,         mul_mv_ext_f16_f32_r1_3,         has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,         mul_mv_ext_f16_f32_r1_4,         has_simdgroup_reduction);
@@ -1242,7 +1231,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,             mul_mv_id_iq1_m_f32,             has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,            mul_mv_id_iq4_nl_f32,            has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,            mul_mv_id_iq4_xs_f32,            has_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32,             mul_mv_id_mxfp4_f32,             has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,                  mul_mm_f32_f32,                  has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,                  mul_mm_f16_f32,                  has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32,                 mul_mm_bf16_f32,                 has_simdgroup_mm && use_bfloat);
@@ -1265,7 +1253,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,                mul_mm_iq1_m_f32,                has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,               mul_mm_iq4_nl_f32,               has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,               mul_mm_iq4_xs_f32,               has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32,                mul_mm_mxfp4_f32,                has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16,              mul_mm_id_map0_f16,              has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32,              mul_mm_id_map1_f32,              has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16,               mul_mm_id_f32_f16,               has_simdgroup_mm);
@@ -1290,7 +1277,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16,             mul_mm_id_iq1_m_f16,             has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16,            mul_mm_id_iq4_nl_f16,            has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,            mul_mm_id_iq4_xs_f16,            has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16,             mul_mm_id_mxfp4_f16,             has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,                   rope_norm_f32,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,                   rope_norm_f16,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,                  rope_multi_f32,                  true);
@@ -1450,7 +1436,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,                             cos,                             true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG,                             neg,                             true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                        sum_rows,                        true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN,                            mean,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX,                          argmax,                          true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,                 pool_2d_avg_f32,                 true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,                 pool_2d_max_f32,                 true);
@@ -1649,7 +1634,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
        case GGML_OP_LOG:
            return false; // TODO: implement
        case GGML_OP_SUM_ROWS:
-        case GGML_OP_MEAN:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_GROUP_NORM:
            return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
@@ -2378,30 +2362,11 @@ static bool ggml_metal_encode_node(
                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
            } break;
        case GGML_OP_SUM_ROWS:
-        case GGML_OP_MEAN:
            {
                GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));

-                id<MTLComputePipelineState> pipeline = nil;
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;

-                switch (dst->op) {
-                    case GGML_OP_SUM_ROWS:
-                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
-                        break;
-                    case GGML_OP_MEAN:
-                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MEAN].pipeline;
-                        break;
-                    default:
-                        GGML_ABORT("fatal error");
-                }
-
-                int nth = 32; // SIMD width
-
-                while (nth < ne00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
-                    nth *= 2;
-                }
-
-                nth = MIN(nth, ne00);

                ggml_metal_kargs_sum_rows args = {
                   /*.ne00 =*/ ne00,
@@ -2431,12 +2396,11 @@ static bool ggml_metal_encode_node(
                };

                [encoder setComputePipelineState:pipeline];
-                [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&args length:sizeof(args) atIndex:2];

-                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
            } break;
        case GGML_OP_SOFT_MAX:
            {
@@ -3016,7 +2980,6 @@ static bool ggml_metal_encode_node(
                        case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32  ].pipeline; break;
                        case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
                        case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
-                        case GGML_TYPE_MXFP4:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32  ].pipeline; break;
                        default: GGML_ABORT("MUL MAT-MAT not implemented");
                    }

@@ -3222,12 +3185,6 @@ static bool ggml_metal_encode_node(
                                smem = 32*sizeof(float);
                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline;
                            } break;
-                        case GGML_TYPE_MXFP4:
-                            {
-                                nsg = N_SG_MXFP4;
-                                nr0 = N_R0_MXFP4;
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32].pipeline;
-                            } break;
                        default:
                            {
                                GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
@@ -3412,7 +3369,6 @@ static bool ggml_metal_encode_node(
                            case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16  ].pipeline; break;
                            case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16 ].pipeline; break;
                            case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16 ].pipeline; break;
-                            case GGML_TYPE_MXFP4:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16  ].pipeline; break;
                            default: GGML_ABORT("MUL_MAT_ID not implemented");
                        }

@@ -3624,12 +3580,6 @@ static bool ggml_metal_encode_node(
                                smem = 32*sizeof(float);
                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
                            } break;
-                        case GGML_TYPE_MXFP4:
-                            {
-                                nsg = N_SG_MXFP4;
-                                nr0 = N_R0_MXFP4;
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32].pipeline;
-                            } break;
                        default:
                            {
                                GGML_LOG_ERROR("Asserting on type %d\n", (int)src2t);
@@ -5753,7 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
 static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->name        = ggml_backend_metal_device_get_name(dev);
    props->description = ggml_backend_metal_device_get_description(dev);
-    props->id          = "0";
+    props->uuid        = "0";
    props->type        = ggml_backend_metal_device_get_type(dev);
    ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
    props->caps = (struct ggml_backend_dev_caps) {
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal
@@ -956,61 +956,31 @@ kernel void kernel_neg(
    dst[tpig] = -src0[tpig];
 }

-template <bool norm>
 kernel void kernel_sum_rows(
-        constant ggml_metal_kargs_sum_rows & args,
        device const float * src0,
        device       float * dst,
-        threadgroup  float * shmem_f32 [[threadgroup(0)]],
-        uint3   tgpig[[threadgroup_position_in_grid]],
-        ushort3 tpitg[[thread_position_in_threadgroup]],
-        ushort  sgitg[[simdgroup_index_in_threadgroup]],
-        ushort  tiisg[[thread_index_in_simdgroup]],
-        ushort3   ntg[[threads_per_threadgroup]]) {
-    int64_t i3 = tgpig.z;
-    int64_t i2 = tgpig.y;
-    int64_t i1 = tgpig.x;
+        constant ggml_metal_kargs_sum_rows & args,
+        uint3 tpig[[thread_position_in_grid]]) {
+    int64_t i3 = tpig.z;
+    int64_t i2 = tpig.y;
+    int64_t i1 = tpig.x;

    if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) {
        return;
    }

-    if (sgitg == 0) {
-        shmem_f32[tiisg] = 0.0f;
-    }
-
    device const float * src_row = (device const float *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
    device       float * dst_row = (device       float *) ((device       char *) dst  + i1*args.nb1  + i2*args.nb2  + i3*args.nb3);

-    float sumf = 0;
+    float row_sum = 0;

-    for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
-        sumf += src_row[i0];
+    for (int64_t i0 = 0; i0 < args.ne00; i0++) {
+        row_sum += src_row[i0];
    }

-    sumf = simd_sum(sumf);
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    if (tiisg == 0) {
-        shmem_f32[sgitg] = sumf;
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    sumf = shmem_f32[tiisg];
-    sumf = simd_sum(sumf);
-
-    if (tpitg.x == 0) {
-        dst_row[0] = norm ? sumf / args.ne00 : sumf;
-    }
+    dst_row[0] = row_sum;
 }

-typedef decltype(kernel_sum_rows<false>) kernel_sum_rows_t;
-
-template [[host_name("kernel_sum_rows")]] kernel kernel_sum_rows_t kernel_sum_rows<false>;
-template [[host_name("kernel_mean")]]     kernel kernel_sum_rows_t kernel_sum_rows<true>;
-
 template<typename T>
 kernel void kernel_soft_max(
        device const  char * src0,
@@ -1902,16 +1872,16 @@ void mul_vec_q_n_f32_impl(
        device const char * src1,
        device       char * dst,
        threadgroup  char * shmem,
-        uint3  tgpig, // Threadgroup Position in Grid
-        ushort tiisg, // Thread Index in SIMD Group
-        ushort sgitg) { // SIMD Group Index in ThreadGroup
-    const int nb = args.ne00/QK4_0; // src0->ne[0] / 32
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+    const int nb = args.ne00/QK4_0;

    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;

-    const int first_row = (r0 * nsg + sgitg) * nr0; // nsg=2 nr0=4
+    const int first_row = (r0 * nsg + sgitg) * nr0;

    const uint i12 = im%args.ne12;
    const uint i13 = im/args.ne12;
@@ -6744,49 +6714,6 @@ kernel void kernel_mul_mm_id(
    }
 }

-template <typename type4x4>
-void dequantize_mxfp4(device const block_mxfp4 * xb, short il, thread type4x4 & reg) {
-    float4x4 reg_f;
-    const ushort dst_bias = 15;
-    const ushort dst_0p5 = 0x3800;
-    const ushort dst_m_bits = 10;
-    const half scale = (half)(as_type<float>(((uint32_t)xb->d) << 23));
-    // il:0 first 16, il:1 last 16
-    for (int i = 0; i < 8; i++) {
-        ushort em0 = xb->qs[il*8 + i] & 0x07;
-        ushort em1 = xb->qs[il*8 + i] & 0x70;
-        // float16 values
-        ushort x0 = (em0 << (dst_m_bits - 1)) | ((xb->qs[il*8 + i] & 0x08) << 12);
-        ushort x1 = (em1 << (dst_m_bits - 5)) | ((xb->qs[il*8 + i] & 0x80) << 8);
-
-        // Three cases:
-        // x is normal and non-zero: Correct bias
-        if ((em0 & 0x06) != 0) {
-            x0 = x0 + ((dst_bias - 1) << dst_m_bits);
-        }
-        if ((em1 & 0x60) != 0) {
-            x1 = x1 + ((dst_bias - 1) << dst_m_bits);
-        }
-        // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
-        if (em0 == 0x01) {
-            x0 = dst_0p5 | (x0 & 0x8000);
-        }
-        if (em1 == 0x10) {
-            x1 = dst_0p5 | (x1 & 0x8000);
-        }
-        // x is zero, do nothing
-
-        if (isnan(scale)) {
-            reg_f[i/2][2*(i%2) + 0] = scale;
-            reg_f[i/2][2*(i%2) + 1] = scale;
-        } else {
-            reg_f[i/2][2*(i%2) + 0] = scale * as_type<half>(x0);
-            reg_f[i/2][2*(i%2) + 1] = scale * as_type<half>(x1);
-        }
-    }
-    reg = (type4x4) reg_f;
-}
-
 #define QK_NL 16

 //
@@ -6854,8 +6781,6 @@ template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mul_mm_t kernel_mul_m
 template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
 template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;

-template [[host_name("kernel_mul_mm_mxfp4_f32")]]   kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_mxfp4,   2,     dequantize_mxfp4>;
-
 //
 // indirect matrix-matrix multiplication
 //
@@ -6887,8 +6812,6 @@ template [[host_name("kernel_mul_mm_id_iq1_m_f16")]]   kernel mul_mm_id kernel_m
 template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]]  kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
 template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]]  kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;

-template [[host_name("kernel_mul_mm_id_mxfp4_f16")]]   kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_mxfp4,    2,    dequantize_mxfp4>;
-

 //
 // matrix-vector multiplication
@@ -7005,120 +6928,6 @@ kernel void kernel_mul_mv_id(
        sgitg);
 }

-// MXFP32 implementation derived from mul_vec_q_n_f32_impl and block_q_n_dot_y
-void mul_mv_mxfp4_f32_impl(
-        ggml_metal_kargs_mul_mv args,
-        device const char * src0,
-        device const char * src1,
-        device       char * dst,
-        threadgroup  char * shmem,
-        uint3  tgpig,
-        ushort tiisg,
-        ushort sgitg) {
-    const ushort dst_bias = 15;
-    const ushort dst_0p5 = 0x3800;
-    const ushort dst_m_bits = 10;
-    const int nr0 = N_R0_MXFP4;
-    const int nsg = N_SG_MXFP4;
-    const int nw = N_SIMDWIDTH;
-    const int nb = args.ne00/MXFP4;
-
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * nsg + sgitg) * nr0;
-
-    const uint i12 = im%args.ne12;
-    const uint i13 = im/args.ne12;
-
-    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
-
-    device const float       * y = (device const float       *) (src1 + offset1);
-
-    // pointers to src0 rows
-    device const block_mxfp4 * ax[nr0];
-    for (int row = 0; row < nr0; ++row) {
-        const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
-
-        ax[row] = (device const block_mxfp4 *) ((device char *) src0 + offset0);
-    }
-
-    float yl[16]; // src1 vector cache
-    float sumf[nr0] = {0.f};
-
-    const short ix = (tiisg/2);
-    const short il = (tiisg%2)*16;
-
-    device const float * yb = y + ix*MXFP4 + il;
-
-    // each thread in a SIMD group deals with half a block.
-    for (int ib = ix; ib < nb; ib += nw/2) {
-
-#pragma unroll
-        for (short row = 0; row < nr0; row++) {
-            // Processes 16 items
-            device const block_mxfp4 * qb_curr = ax[row] + ib;
-            float d = as_type<float>(((uint32_t)(ax[row] + ib)->d) << 23);
-            // il = 0 or 16
-            device const uint8_t *qs = ((device const uint8_t *) qb_curr + 1 + il/2);
-            for (int i = 0; i < 8; ++i) {
-                ushort em0 = qs[i] & 0x07;
-                ushort em1 = qs[i] & 0x70;
-                ushort x0 = (em0 << (dst_m_bits - 1)) | ((qs[i] & 0x08) << 12);
-                ushort x1 = (em1 << (dst_m_bits - 5)) | ((qs[i] & 0x80) << 8);
-                // Three cases:
-                // x is normal and non-zero: Correct bias
-                if ((em0 & 0x06) != 0) {
-                    x0 = x0 + ((dst_bias - 1) << dst_m_bits);
-                }
-                if ((em1 & 0x60) != 0) {
-                    x1 = x1 + ((dst_bias - 1) << dst_m_bits);
-                }
-                // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
-                if (em0 == 0x01) {
-                    x0 = dst_0p5 | (x0 & 0x8000);
-                }
-                if (em1 == 0x10) {
-                    x1 = dst_0p5 | (x1 & 0x8000);
-                }
-                // x is zero, do nothing
-                if (!isnan(d)) {
-                    sumf[row] += yb[i*2] * as_type<half>(x0) * d
-                        + yb[i*2+1] * as_type<half>(x1) * d;
-                } else {
-                    sumf[row] = d;
-                }
-            }
-        }
-
-        yb += MXFP4 * 16;
-    }
-
-    device float * dst_f32 = (device float *) dst + im*args.ne0*args.ne1 + r1*args.ne0;
-
-    for (int row = 0; row < nr0; ++row) {
-        const float tot = simd_sum(sumf[row]);
-
-        if (tiisg == 0 && first_row + row < args.ne01) {
-            dst_f32[first_row + row] = tot;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_mxfp4_f32")]]
-kernel void kernel_mul_mv_mxfp4_f32(
-        constant ggml_metal_kargs_mul_mv & args,
-        device const char * src0,
-        device const char * src1,
-        device       char * dst,
-        threadgroup  char * shmem [[threadgroup(0)]],
-        uint3  tgpig[[threadgroup_position_in_grid]],
-        ushort tiisg[[thread_index_in_simdgroup]],
-        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_mv_mxfp4_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
-}
-
 typedef decltype(kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>) kernel_mul_mv_id_t;

 template [[host_name("kernel_mul_mv_id_f32_f32")]]     kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>;
@@ -7148,8 +6957,6 @@ template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]   kernel kernel_mul_mv_id_t
 template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl <N_R0_IQ4_NL,  N_SG_IQ4_NL,  N_SIMDWIDTH>>>;
 template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl <N_R0_IQ4_XS,  N_SG_IQ4_XS,  N_SIMDWIDTH>>>;

-template [[host_name("kernel_mul_mv_id_mxfp4_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_mv_mxfp4_f32_impl>>;
-
 kernel void kernel_pool_2d_max_f32(
        device  const float * src0,
        device        float * dst,
--- a/ml/backend/ggml/ggml/src/ggml-metal/metal.go
+++ b/ml/backend/ggml/ggml/src/ggml-metal/metal.go
@@ -4,6 +4,6 @@ package metal

 //go:generate sh -c "{ echo // Code generated by 'go generate'. DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal"

-// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -DGGML_METAL_USE_BF16 -I.. -I../../include
+// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
 // #cgo LDFLAGS: -framework Metal -framework MetalKit
 import "C"
--- a/ml/backend/ggml/ggml/src/ggml-quants.c
+++ b/ml/backend/ggml/ggml/src/ggml-quants.c
@@ -4925,144 +4925,6 @@ void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RE
    quantize_iq2_s(x, y, 1, k, NULL);
 }

-// =============================== mxfp4 (de)-quantization
-
-void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k) {
-    static const int qk = MXFP4;
-    static const uint32_t E8_BIAS = 127;
-    static const uint32_t E2_BIAS = 1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-            }
-        }
-
-        const float dequant_scale  = amax / 6.0f;
-        uint32_t dequant_scale_exponent = 0;
-        memcpy(&dequant_scale_exponent, &dequant_scale, sizeof(dequant_scale_exponent));
-
-        // Rounding up
-        dequant_scale_exponent = (dequant_scale_exponent + 0x007FFFFF) & 0x7F800000;
-        // Rounding down
-        // dequant_scale_exponent = dequant_scale_exponent & 0x7F800000;
-
-        float dequant_scale_rounded = 0.0f;
-        memcpy(&dequant_scale_rounded, &dequant_scale_exponent, sizeof(dequant_scale_rounded));
-        float quant_scale = 0.0f;
-        if (dequant_scale_rounded != 0.0f) {
-            quant_scale = 1.0f / dequant_scale_rounded;
-        }
-
-        y[i].d = (uint8_t)(dequant_scale_exponent >> 23);
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = x[i*qk + j*2]*quant_scale;
-            const float x1 = x[i*qk + j*2+1]*quant_scale;
-
-            uint32_t xi0 = 0;
-            uint32_t xi1 = 0;
-            memcpy(&xi0, &x0, sizeof(xi0));
-            memcpy(&xi1, &x1, sizeof(xi1));
-
-            uint32_t s0 = xi0 & 0x80000000;
-            uint32_t s1 = xi1 & 0x80000000;
-            uint32_t e0 = (xi0 >> 23) & 0xFF;
-            uint32_t e1 = (xi1 >> 23) & 0xFF;
-            uint32_t m0 = (xi0 & 0x7FFFFF);
-            uint32_t m1 = (xi1 & 0x7FFFFF);
-
-            // 0.25 <= x < 0.75 maps to 0.5, a denormal number
-            // Move implicit bit 1 at the beginning to mantissa for denormals
-            // adjusted_exponents
-            uint32_t ae0 = E8_BIAS - (e0 + 1);
-            uint32_t ae1 = E8_BIAS - (e1 + 1);
-            if (e0 < E8_BIAS) {
-                m0 = (0x400000 | (m0 >> 1)) >> ae0;
-            }
-            if (e1 < E8_BIAS) {
-                m1 = (0x400000 | (m1 >> 1)) >> ae1;
-            }
-
-            // For normal numbers, we change the bias from 127 to 1, and for subnormals, we keep exponent as 0.
-            e0 = MAX(e0, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS);
-            e1 = MAX(e1, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS);
-
-            // Combine sign, exponent, and mantissa, while saturating
-            // rounding nearest with tie breaking up by adding +1 to one bit right of the LSB, then shift right
-            uint32_t tmp0 = MIN((((e0 << 2) | (m0 >> 21)) + 1) >> 1, 0x7);
-            uint32_t tmp1 = MIN((((e1 << 2) | (m1 >> 21)) + 1) >> 1, 0x7);
-            uint8_t v0 = (uint8_t)((s0 >> 28) | tmp0);
-            uint8_t v1 = (uint8_t)((s1 >> 28) | tmp1);           
-            y[i].qs[j]  = v0;
-            y[i].qs[j] |= v1 << 4;
-        }
-    }
-}
-
-void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
-    assert(k % MXFP4 == 0);
-
-    const int nb = k / MXFP4;
-    const uint16_t dst_bias = 15;
-    const uint16_t dst_0p5 = 0x3800;
-    const uint16_t dst_m_bits = 10;
-
-    for (int i = 0; i < nb; i++) {
-        union {
-            uint32_t as_bits;
-            float as_value;
-        } scale;
-        scale.as_bits = (((uint32_t)x[i].d) << 23);
-        for (int j = 0; j < MXFP4/2; ++j) {
-            uint16_t em0 = x[i].qs[j] & 0x07;
-            uint16_t em1 = x[i].qs[j] & 0x70;
-            // float16 values
-            uint16_t x0 = (em0 << (dst_m_bits - 1)) | ((x[i].qs[j] & 0x08) << 12);
-            uint16_t x1 = (em1 << (dst_m_bits - 5)) | ((x[i].qs[j] & 0x80) << 8);
-
-            // Three cases:
-            // x is normal and non-zero: Correct bias
-            if ((em0 & 0x06) != 0) {
-                x0 = x0 + ((dst_bias - 1) << dst_m_bits);
-            }
-            if ((em1 & 0x60) != 0) {
-                x1 = x1 + ((dst_bias - 1) << dst_m_bits);
-            }
-            // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
-            if (em0 == 0x01) {
-                x0 = dst_0p5 | (x0 & 0x8000);
-            }
-            if (em1 == 0x10) {
-                x1 = dst_0p5 | (x1 & 0x8000);
-            }
-            // x is zero, do nothing
-
-            if (isnan(scale.as_value)) {
-                y[i*MXFP4 + j*2] = scale.as_value;
-                y[i*MXFP4 + j*2+1] = scale.as_value;
-            } else {
-                y[i*MXFP4 + j*2] = GGML_FP16_TO_FP32(x0)*scale.as_value;
-                y[i*MXFP4 + j*2+1] = GGML_FP16_TO_FP32(x1)*scale.as_value;
-            }
-        }
-    }
-}
-
-
-size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-    quantize_row_mxfp4_ref(src, dst, (int64_t)nrow*n_per_row);
-    return nrow * ggml_row_size(GGML_TYPE_MXFP4, n_per_row);
-}
-
 // =============================== data validation

 static bool validate_float(float f, size_t i) {
@@ -5352,9 +5214,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
            {
                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
            } break;
-        case GGML_TYPE_MXFP4:
-            // TODO - anything to validate?
-            break;
+
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
        case GGML_TYPE_I32:
--- a/ml/backend/ggml/ggml/src/ggml-quants.h
+++ b/ml/backend/ggml/ggml/src/ggml-quants.h
@@ -37,8 +37,6 @@ GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_
 GGML_API void quantize_row_iq3_s_ref  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_iq2_s_ref  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int64_t k);

-GGML_API void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k);
-
 // Dequantization
 GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -67,8 +65,6 @@ GGML_API void dequantize_row_iq4_nl (const block_iq4_nl  * GGML_RESTRICT x, floa
 GGML_API void dequantize_row_iq4_xs (const block_iq4_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_iq3_s  (const block_iq3_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

-GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
 GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
@@ -94,8 +90,6 @@ GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTR
 GGML_API size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

-GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-
 GGML_API void iq2xs_init_impl(enum ggml_type type);
 GGML_API void iq2xs_free_impl(enum ggml_type type);
 GGML_API void iq3xs_init_impl(int grid_size);
--- a/ml/backend/ggml/ggml/src/ggml.c
+++ b/ml/backend/ggml/ggml/src/ggml.c
@@ -589,13 +589,11 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
        .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
        .from_float_ref           = (ggml_from_float_t) quantize_row_q4_1_ref,
    },
-    [GGML_TYPE_MXFP4] = { // formerly deprecated GGML_TYPE_Q4_2
-        .type_name                = "mxfp4",
-        .blck_size                = MXFP4,
-        .type_size                = sizeof(block_mxfp4),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_mxfp4,
-        .from_float_ref           = (ggml_from_float_t) quantize_row_mxfp4_ref,
+    [4] = { // GGML_TYPE_Q4_2
+        .type_name                = "DEPRECATED",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
    },
    [5] = { // GGML_TYPE_Q4_3
        .type_name                = "DEPRECATED",
@@ -6448,7 +6446,6 @@ size_t ggml_quantize_chunk(
        case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_MXFP4:   result = quantize_mxfp4  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_F16:
            {
                size_t elemsize = sizeof(ggml_fp16_t);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Bruce MacDonald	f2a4d058f9	gofmt	2025-06-16 16:34:46 -07:00
Bruce MacDonald	63e7634014	pr feedback	2025-06-16 16:08:38 -07:00
Bruce MacDonald	8d51d92f3b	server: cache gguf model capabilities rather than reading off disc	2025-06-16 15:17:36 -07:00
Bruce MacDonald	2348fef568	Revert "server: model info caching system for improved performance" This reverts commit 8ef643d4978168a8563ae24434a424358ce390e3.	2025-06-16 15:17:02 -07:00
Bruce MacDonald	883f655dd6	server: model info caching system for improved performance Implements an in-memory cache for loaded models with file modification time tracking to ensure cache validity. Models are now cached after first load and retrieved from cache on subsequent requests if the underlying manifest file hasn't changed. Key changes: - Add ModelCache with get/set methods and modification time validation - Cache models in GetModel() and check cache before disk load - Move capabilities calculation to model loading time and store in model - Update capability access to use cached field instead of runtime calculation - Add test coverage for cache behavior and model loading This reduces redundant model loading operations and improves response times for model access.	2025-06-16 15:16:58 -07:00