Compare commits

...

30 Commits

Author SHA1 Message Date
ParthSareen
a4265c278a wip 2025-03-24 11:02:02 -07:00
ParthSareen
ffd6428758 err handling + fixing scope issue 2025-02-11 19:29:48 -08:00
ParthSareen
aa6d5151df wip with json stuff and cleanup 2025-02-11 16:40:40 -08:00
ParthSareen
25edfa6fdb first pass so working 2025-02-04 22:09:22 -08:00
ParthSareen
cc2e44b885 Improved sampler interface 2025-02-03 16:05:15 -08:00
ParthSareen
d5f8670f0a WIP simple SO working 2025-02-03 11:40:06 -08:00
ParthSareen
524029cd6d tested with so 2025-01-31 17:12:39 -08:00
ParthSareen
b973dedb4b saved state 2025-01-30 15:17:42 -08:00
ParthSareen
c56a8b7749 wip 2025-01-30 15:05:25 -08:00
ParthSareen
198fde82aa Enable array type json 2025-01-29 14:54:39 -08:00
ParthSareen
77f709ebd5 cleanup state machine 2025-01-29 14:28:02 -08:00
ParthSareen
73098a2973 WIP working with other stuff 2025-01-29 10:28:22 -08:00
ParthSareen
e93db4d20e WIP 2025-01-27 16:33:55 -08:00
ParthSareen
a2a73ce5e0 wip! 2025-01-23 20:21:50 -08:00
ParthSareen
6ba557f25b checkpoint 2025-01-23 09:46:14 -08:00
ParthSareen
a7c8cc06da json checkpoint 2025-01-21 17:48:12 -08:00
Blake Mizerany
089bbb537d grammar: introduce new grammar package
This package provides a way to convert JSON schemas to equivalent EBNF.
It is intended to be a replacement for llama.cpp's schema_to_grammar (a short illustrative sketch follows the commit list below).

This is still an early version and does not yet support all JSON schema
features. The to-do list includes:

- minimum/maximum constraints on integer types
- minLength/maxLength constraints on string types
- defs and refs
2025-01-20 14:40:50 -08:00
ParthSareen
7cd9fbbbb1 improve temperature sampler 2025-01-20 13:53:07 -08:00
ParthSareen
b91487f289 Working json sampler 2025-01-15 14:08:54 -08:00
ParthSareen
5b19d4941a addressing comments + cleanup 2025-01-14 16:23:48 -08:00
ParthSareen
5e73f24e16 sampling package 2025-01-14 16:23:38 -08:00
Michael Yang
4aac178cac update ci 2025-01-14 14:25:56 -08:00
Michael Yang
800e21d060 sort devices by type 2025-01-14 14:25:56 -08:00
Michael Yang
5827999e9e next ollama runner
implement llama and mllama model architectures in go using ggml (through
cgo)
2025-01-14 14:25:56 -08:00
Jeffrey Morgan
61676fb506 llama: move grammar tests to llama_test.go (#8411) 2025-01-14 12:55:45 -08:00
Bruce MacDonald
f6f3713001 convert: qwen2 from safetensors (#8408)
Add native support for converting Qwen2 family models (including Qwen2.5)
from safetensors to gguf format so we can run it.
2025-01-14 10:34:37 -08:00
Steve Berdy
a30f347201 readme: add LangChain for .NET to community integrations (#8352) 2025-01-14 09:37:35 -08:00
Jeffrey Morgan
74ea4fb604 remove .prettierrc.json (#8413) 2025-01-14 09:30:34 -08:00
Jeffrey Morgan
6982e9cc96 readme: remove link to missing page 2025-01-13 18:56:31 -08:00
Patrick Devine
ab39872cb4 add new create api doc (#8388) 2025-01-13 17:30:24 -08:00
619 changed files with 13031 additions and 10930 deletions
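The grammar commit above (089bbb537d) describes converting JSON schemas to equivalent EBNF. As a rough, hypothetical illustration of that mapping, and not the package's actual API (which does not appear in this diff), here is a minimal Go sketch covering only primitive schema types:

```go
package main

import "fmt"

// toEBNF is a hypothetical helper, not the grammar package's real API:
// it maps a few primitive JSON-schema types to EBNF terminal rules to
// illustrate the schema-to-grammar idea described in the commit.
func toEBNF(schemaType string) (string, bool) {
	switch schemaType {
	case "boolean":
		return `bool ::= "true" | "false"`, true
	case "integer":
		return `int ::= "-"? [0-9]+`, true
	case "string":
		return `string ::= "\"" char* "\""`, true
	default:
		return "", false // objects, arrays, refs, etc. are the hard part
	}
}

func main() {
	for _, t := range []string{"boolean", "integer", "string"} {
		if rule, ok := toEBNF(t); ok {
			fmt.Println(rule)
		}
	}
}
```

Object and array schemas would compose such terminal rules into productions, which is where the commit's open to-do items (numeric bounds, string lengths, defs/refs) come in.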

9
.gitattributes vendored

@@ -7,5 +7,14 @@ llama/**/*.cuh linguist-vendored
llama/**/*.m linguist-vendored
llama/**/*.metal linguist-vendored
ml/backend/**/*.c linguist-vendored
ml/backend/**/*.h linguist-vendored
ml/backend/**/*.cpp linguist-vendored
ml/backend/**/*.hpp linguist-vendored
ml/backend/**/*.cu linguist-vendored
ml/backend/**/*.cuh linguist-vendored
ml/backend/**/*.m linguist-vendored
ml/backend/**/*.metal linguist-vendored
* text=auto
*.go text eol=lf

.github/workflows/test.yaml

@@ -1,11 +1,5 @@
name: test
env:
ROCM_WINDOWS_URL: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
MSYS2_URL: https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe
CUDA_12_WINDOWS_URL: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe
CUDA_12_WINDOWS_VER: 12.4
concurrency:
# For PRs, later CI runs preempt previous ones. e.g. a force push on a PR
# cancels running CI jobs and starts all new ones.
@@ -27,7 +21,7 @@ jobs:
changes:
runs-on: ubuntu-latest
outputs:
RUNNERS: ${{ steps.changes.outputs.RUNNERS }}
changed: ${{ steps.changes.outputs.changed }}
steps:
- uses: actions/checkout@v4
with:
@@ -35,309 +29,61 @@ jobs:
- id: changes
run: |
changed() {
git diff-tree -r --no-commit-id --name-only \
$(git merge-base ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }}) \
${{ github.event.pull_request.head.sha }} \
local BASE=${{ github.event.pull_request.base.sha }}
local HEAD=${{ github.event.pull_request.head.sha }}
local MERGE_BASE=$(git merge-base $BASE $HEAD)
git diff-tree -r --no-commit-id --name-only "$MERGE_BASE" "$HEAD" \
| xargs python3 -c "import sys; from pathlib import Path; print(any(Path(x).match(glob) for x in sys.argv[1:] for glob in '$*'.split(' ')))"
}
{
echo RUNNERS=$(changed 'llama/**')
} >>$GITHUB_OUTPUT
echo changed=$(changed 'llama/llama.cpp/**' 'ml/backend/ggml/ggml/**') | tee -a $GITHUB_OUTPUT
runners-linux-cuda:
linux:
needs: [changes]
if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
if: ${{ needs.changes.outputs.changed == 'True' }}
strategy:
matrix:
cuda-version:
- '11.8.0'
runs-on: linux
container: nvidia/cuda:${{ matrix.cuda-version }}-devel-ubuntu20.04
include:
- container: nvidia/cuda:11.8.0-devel-ubuntu22.04
preset: CUDA
- container: rocm/dev-ubuntu-22.04:6.1.2
preset: ROCm
extra-packages: rocm-libs
runs-on: ubuntu-latest
container: ${{ matrix.container }}
steps:
- uses: actions/checkout@v4
- run: |
apt-get update && apt-get install -y git build-essential curl
apt-get update
apt-get install -y cmake pkg-config ${{ matrix.extra-packages }}
env:
DEBIAN_FRONTEND: noninteractive
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version-file: go.mod
cache: true
- run: go get ./...
- run: |
git config --global --add safe.directory /__w/ollama/ollama
cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l)
make -j $cores cuda_v11
runners-linux-rocm:
needs: [changes]
if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
strategy:
matrix:
rocm-version:
- '6.1.2'
runs-on: linux
container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
steps:
- run: |
apt-get update && apt-get install -y git build-essential curl rocm-libs
env:
DEBIAN_FRONTEND: noninteractive
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version-file: go.mod
cache: true
- run: go get ./...
- run: |
git config --global --add safe.directory /__w/ollama/ollama
cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l)
make -j $cores rocm
cmake --preset ${{ matrix.preset }}
cmake --build --preset ${{ matrix.preset }} --parallel
# ROCm generation step
runners-windows-rocm:
needs: [changes]
if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
runs-on: windows
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: true
- name: Set make jobs default
run: |
echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
# ROCM installation steps
- name: 'Cache ROCm installer'
id: cache-rocm
uses: actions/cache@v4
with:
path: rocm-install.exe
key: ${{ env.ROCM_WINDOWS_URL }}
- name: 'Conditionally Download ROCm'
if: steps.cache-rocm.outputs.cache-hit != 'true'
run: |
$ErrorActionPreference = "Stop"
Invoke-WebRequest -Uri "${env:ROCM_WINDOWS_URL}" -OutFile "rocm-install.exe"
- name: 'Install ROCm'
run: |
Start-Process "rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
- name: 'Verify ROCm'
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path | select -first 1)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
- name: Add msys paths
run: |
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
- name: make rocm runner
run: |
import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
make -C llama print-HIP_PATH print-HIP_LIB_DIR
make rocm
# CUDA generation step
runners-windows-cuda:
needs: [changes]
if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
runs-on: windows
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: true
- name: Set make jobs default
run: |
echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
# CUDA installation steps
- name: 'Cache CUDA installer'
id: cache-cuda
uses: actions/cache@v4
with:
path: cuda-install.exe
key: ${{ env.CUDA_12_WINDOWS_URL }}
- name: 'Conditionally Download CUDA'
if: steps.cache-cuda.outputs.cache-hit != 'true'
run: |
$ErrorActionPreference = "Stop"
Invoke-WebRequest -Uri "${env:CUDA_12_WINDOWS_URL}" -OutFile "cuda-install.exe"
- name: 'Install CUDA'
run: |
$subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | foreach-object {"${_}_${{ env.CUDA_12_WINDOWS_VER }}"}
Start-Process "cuda-install.exe" -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
- name: 'Verify CUDA'
run: |
& (resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0] --version
$cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
$cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2'
echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
echo "CUDA_PATH_V${cudaVer}=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
- name: Add msys paths
run: |
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
- name: make cuda runner
run: |
import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
runners-cpu:
needs: [changes]
if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-2019]
arch: [amd64, arm64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-2019
arch: arm64
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
ARCH: ${{ matrix.arch }}
CGO_ENABLED: '1'
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: true
- name: Add msys paths
if: ${{ startsWith(matrix.os, 'windows-') }}
run: |
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
if: ${{ startsWith(matrix.os, 'windows-') }}
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
- name: 'Build Windows Go Runners'
if: ${{ startsWith(matrix.os, 'windows-') }}
run: |
$gopath=(get-command go).source | split-path -parent
$gccpath=(get-command gcc).source | split-path -parent
import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$gccpath;$env:PATH"
echo $env:PATH
if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
make -j 4
- name: 'Build Unix Go Runners'
if: ${{ ! startsWith(matrix.os, 'windows-') }}
run: make -j 4
- run: go build .
lint:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-2019]
arch: [amd64, arm64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-2019
arch: arm64
- os: macos-latest
arch: amd64
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: '1'
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Add msys paths
if: ${{ startsWith(matrix.os, 'windows-') }}
run: |
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
if: ${{ startsWith(matrix.os, 'windows-') }}
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: false
- run: |
case ${{ matrix.arch }} in
amd64) echo ARCH=x86_64 ;;
arm64) echo ARCH=arm64 ;;
esac >>$GITHUB_ENV
shell: bash
- uses: golangci/golangci-lint-action@v6
with:
args: --timeout 10m0s -v
test:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-2019]
arch: [amd64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-2019
arch: arm64
os: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: '1'
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Add msys paths
if: ${{ startsWith(matrix.os, 'windows-') }}
run: |
echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install msys2 tools
if: ${{ startsWith(matrix.os, 'windows-') }}
run: |
Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: true
- run: |
case ${{ matrix.arch }} in
amd64) echo ARCH=amd64 ;;
arm64) echo ARCH=arm64 ;;
esac >>$GITHUB_ENV
shell: bash
- uses: golangci/golangci-lint-action@v6
with:
args: --timeout 10m0s -v
- run: go test ./...
patches:
needs: [changes]
if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Verify patches carry all the changes
- name: Verify patches apply cleanly and do not change files
run: |
make apply-patches sync && git diff --compact-summary --exit-code llama
make -f Makefile2 clean checkout sync
git diff --compact-summary --exit-code
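The changed() helper near the top of this workflow gates the vendored-code jobs: it diffs the PR head against its merge base with the base branch and glob-matches the changed paths against patterns like 'ml/backend/ggml/ggml/**'. Below is a standalone Go sketch of that matching step, with simplified '**' handling (the workflow itself shells out to a Python pathlib one-liner, which remains the authoritative behavior):

```go
package main

import (
	"fmt"
	"path"
	"strings"
)

// anyMatch mirrors the workflow's changed() filter: does any changed file
// match any glob? path.Match has no "**" semantics, so "dir/**" is
// approximated with a prefix check here.
func anyMatch(files, globs []string) bool {
	for _, f := range files {
		for _, g := range globs {
			if dir, ok := strings.CutSuffix(g, "/**"); ok {
				if strings.HasPrefix(f, dir+"/") {
					return true
				}
				continue
			}
			if ok, _ := path.Match(g, f); ok {
				return true
			}
		}
	}
	return false
}

func main() {
	files := []string{"ml/backend/ggml/ggml/src/ggml.c", "docs/api.md"}
	globs := []string{"llama/llama.cpp/**", "ml/backend/ggml/ggml/**"}
	fmt.Println(anyMatch(files, globs)) // true: the ggml source file matches
}
```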

.prettierrc.json

@@ -1,10 +0,0 @@
{
"trailingComma": "es5",
"tabWidth": 2,
"useTabs": false,
"semi": false,
"singleQuote": true,
"jsxSingleQuote": true,
"printWidth": 120,
"arrowParens": "avoid"
}

45
CMakeLists.txt Normal file

@@ -0,0 +1,45 @@
cmake_minimum_required(VERSION 3.21)
project(Ollama C CXX)
include(CheckLanguage)
find_package(Threads REQUIRED)
set(CMAKE_BUILD_TYPE Release)
set(BUILD_SHARED_LIBS ON)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(GGML_BUILD ON)
set(GGML_SHARED ON)
set(GGML_CCACHE ON)
set(GGML_BACKEND_DL ON)
set(GGML_BACKEND_SHARED ON)
set(GGML_SCHED_MAX_COPIES 4)
set(GGML_CPU_ALL_VARIANTS ON)
set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
set(GGML_LLAMAFILE ON)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)
set(GGML_CPU ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
endif()
check_language(HIP)
if(CMAKE_HIP_COMPILER)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
endif()

96
CMakePresets.json Normal file

@@ -0,0 +1,96 @@
{
"version": 3,
"configurePresets": [
{
"name": "Default",
"binaryDir": "${sourceDir}/build",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release"
}
},
{
"name": "CUDA",
"inherits": [ "Default" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;62;70;72;75;80;86"
}
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "60;61;62;70;72;75;80;86;87;89;90;90a"
}
},
{
"name": "JetPack 5",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "72;87"
}
},
{
"name": "JetPack 6",
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "87"
}
},
{
"name": "ROCm",
"inherits": [ "Default" ]
},
{
"name": "ROCm 6",
"inherits": [ "ROCm" ],
"cacheVariables": {
"CMAKE_HIP_ARCHITECTURES": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
}
}
],
"buildPresets": [
{
"name": "Default",
"configurePreset": "Default"
},
{
"name": "CUDA",
"configurePreset": "CUDA",
"targets": [ "ggml-cuda" ]
},
{
"name": "CUDA 11",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 11"
},
{
"name": "CUDA 12",
"inherits": [ "CUDA" ],
"configurePreset": "CUDA 12"
},
{
"name": "JetPack 5",
"inherits": [ "CUDA" ],
"configurePreset": "JetPack 5"
},
{
"name": "JetPack 6",
"inherits": [ "CUDA" ],
"configurePreset": "JetPack 6"
},
{
"name": "ROCm",
"configurePreset": "ROCm",
"targets": [ "ggml-hip" ]
},
{
"name": "ROCm 6",
"inherits": [ "ROCm" ],
"configurePreset": "ROCm 6"
}
]
}

66
Dockerfile2 Normal file

@@ -0,0 +1,66 @@
ARG CUDA_11_VERSION=11.3
ARG CUDA_12_VERSION=12.4
ARG ROCM_VERSION=6.1.2
ARG JETPACK_5_VERSION=r35.4.1
ARG JETPACK_6_VERSION=r36.2.0
ARG CMAKE_VERSION=3.31.2
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS base
ARG CMAKE_VERSION
RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz | tar xz -C /usr --strip-components 1
RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \
&& yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
# FROM --platform=linux/arm64 rockylinux:8 AS base
# ARG CMAKE_VERSION
# RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
# RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
FROM base AS amd64
ARG CUDA_11_VERSION
ARG CUDA_12_VERSION
RUN yum install -y cuda-toolkit-${CUDA_11_VERSION//./-} \
&& yum install -y cuda-toolkit-${CUDA_12_VERSION//./-}
COPY CMakeLists.txt CMakeLists.txt
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
FROM --platform=linux/amd64 amd64 AS cuda_11
ENV PATH=/usr/local/cuda-${CUDA_11_VERSION}/bin:$PATH
RUN cmake -S . -B build -DCMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
RUN cmake --build build --target ggml-cuda -j
FROM --platform=linux/amd64 amd64 AS cuda_12
ENV PATH=/usr/local/cuda-${CUDA_12_VERSION}/bin:$PATH
RUN cmake -S . -B build -DCMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
RUN cmake --build build --target ggml-cuda -j
FROM --platform=linux/amd64 amd64 AS rocm
RUN cmake -S . -B build -DCMAKE_HIP_ARCHITECTURES="gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
RUN cmake --build build --target ggml-hip -j
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5_VERSION} AS jetpack_5
ARG CMAKE_VERSION
RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
COPY CMakeLists.txt .
COPY ml/backend/ggml/ggml .
RUN cmake -S . -B build \
-DCMAKE_CUDA_ARCHITECTURES="72;87"
RUN cmake --build build --target ggml-cuda
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6_VERSION} AS jetpack_6
ARG CMAKE_VERSION
RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
COPY CMakeLists.txt .
COPY ml/backend/ggml/ggml .
RUN cmake -S . -B build \
-DCMAKE_CUDA_ARCHITECTURES="87"
RUN cmake --build build --target ggml-cuda
FROM --platform=linux/amd64 golang:1.23
COPY --from=cuda_11 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-11.so
COPY --from=cuda_12 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-12.so
COPY --from=rocm build/ml/backend/ggml/ggml/src/ggml-hip/libggml-hip.so libggml-hip.so
# FROM --platform=linux/arm64 golang:1.23
# COPY --from=jetpack_5 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-jetpack-5.so
# COPY --from=jetpack_6 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-jetpack-6.so

103
Makefile

@@ -1,103 +0,0 @@
# top level makefile for Ollama
include make/common-defs.make
# Determine which if any GPU runners we should build
include make/cuda-v11-defs.make
include make/cuda-v12-defs.make
include make/rocm-defs.make
ifeq ($(CUSTOM_CPU_FLAGS),)
ifeq ($(ARCH),amd64)
RUNNER_TARGETS=cpu
endif
# Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present
ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
ifneq ($(CUDA_11_COMPILER),)
RUNNER_TARGETS += cuda_v11
endif
ifneq ($(CUDA_12_COMPILER),)
RUNNER_TARGETS += cuda_v12
endif
endif
else # CUSTOM_CPU_FLAGS is set, we'll build only the latest cuda version detected
ifneq ($(CUDA_12_COMPILER),)
RUNNER_TARGETS += cuda_v12
else ifneq ($(CUDA_11_COMPILER),)
RUNNER_TARGETS += cuda_v11
endif
endif
ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
ifneq ($(HIP_COMPILER),)
RUNNER_TARGETS += rocm
endif
endif
all: runners exe
dist: $(addprefix dist_, $(RUNNER_TARGETS)) dist_exe
dist_%:
@$(MAKE) --no-print-directory -f make/Makefile.$* dist
runners: $(RUNNER_TARGETS)
$(RUNNER_TARGETS):
@$(MAKE) --no-print-directory -f make/Makefile.$@
exe dist_exe:
@$(MAKE) --no-print-directory -f make/Makefile.ollama $@
help-sync apply-patches create-patches sync sync-clean:
@$(MAKE) --no-print-directory -f make/Makefile.sync $@
test integration lint:
@$(MAKE) --no-print-directory -f make/Makefile.test $@
clean:
rm -rf $(BUILD_DIR) $(DIST_LIB_DIR) $(OLLAMA_EXE) $(DIST_OLLAMA_EXE)
go clean -cache
help:
@echo "The following make targets will help you build Ollama"
@echo ""
@echo " make all # (default target) Build Ollama llm subprocess runners, and the primary ollama executable"
@echo " make runners # Build Ollama llm subprocess runners; after you may use 'go build .' to build the primary ollama exectuable"
@echo " make <runner> # Build specific runners. Enabled: '$(RUNNER_TARGETS)'"
@echo " make dist # Build the runners and primary ollama executable for distribution"
@echo " make help-sync # Help information on vendor update targets"
@echo " make help-runners # Help information on runner targets"
@echo ""
@echo "The following make targets will help you test Ollama"
@echo ""
@echo " make test # Run unit tests"
@echo " make integration # Run integration tests. You must 'make all' first"
@echo " make lint # Run lint and style tests"
@echo ""
@echo "For more information see 'docs/development.md'"
@echo ""
help-runners:
@echo "The following runners will be built based on discovered GPU libraries: '$(RUNNER_TARGETS)'"
@echo ""
@echo "GPU Runner CPU Flags: '$(GPU_RUNNER_CPU_FLAGS)' (Override with CUSTOM_CPU_FLAGS)"
@echo ""
@echo "# CUDA_PATH sets the location where CUDA toolkits are present"
@echo "CUDA_PATH=$(CUDA_PATH)"
@echo " CUDA_11_PATH=$(CUDA_11_PATH)"
@echo " CUDA_11_COMPILER=$(CUDA_11_COMPILER)"
@echo " CUDA_12_PATH=$(CUDA_12_PATH)"
@echo " CUDA_12_COMPILER=$(CUDA_12_COMPILER)"
@echo ""
@echo "# HIP_PATH sets the location where the ROCm toolkit is present"
@echo "HIP_PATH=$(HIP_PATH)"
@echo " HIP_COMPILER=$(HIP_COMPILER)"
.PHONY: all exe dist help help-sync help-runners test integration lint runners clean $(RUNNER_TARGETS)
# Handy debugging for make variables
print-%:
@echo '$*=$($*)'

46
Makefile2 Normal file

@@ -0,0 +1,46 @@
UPSTREAM=https://github.com/ggerganov/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=46e3556e01b824e52395fb050b29804b6cff2a7c
all: sync
.PHONY: sync
sync: llama/llama.cpp ml/backend/ggml/ggml
.PHONY: llama/llama.cpp
llama/llama.cpp: llama/vendor/ apply_patches
rsync -arvzc -f "merge $@/.rsync-filter" $< $@
.PHONY: ml/backend/ggml/ggml apply_patches
ml/backend/ggml/ggml: llama/vendor/ggml/ apply_patches
rsync -arvzc -f "merge $@/.rsync-filter" $< $@
PATCHES=$(wildcard llama/patches/*.patch)
.PHONY: apply_patches
.NOTPARALLEL:
apply_patches: $(addsuffix ed, $(PATCHES))
%.patched: %.patch
@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
.PHONY: checkout
checkout: $(WORKDIR)
git -C $(WORKDIR) fetch
git -C $(WORKDIR) checkout -f $(FETCH_HEAD)
$(WORKDIR):
git clone $(UPSTREAM) $(WORKDIR)
.PHONY: format_patches
format_patches: llama/patches
git -C $(WORKDIR) format-patch \
--no-signature \
--no-numbered \
--zero-commit \
-o $(realpath $<) \
$(FETCH_HEAD)
.PHONY: clean
clean: checkout
$(RM) $(addsuffix ed, $(PATCHES))

README.md

@@ -137,7 +137,7 @@ ollama run mario
Hello! It's your friend Mario.
```
For more examples, see the [examples](examples) directory. For more information on working with a Modelfile, see the [Modelfile](docs/modelfile.md) documentation.
For more information on working with a Modelfile, see the [Modelfile](docs/modelfile.md) documentation.
## CLI Reference
@@ -441,6 +441,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
- [LangChain for .NET](https://github.com/tryAGI/LangChain) with [example](https://github.com/tryAGI/LangChain/blob/main/examples/LangChain.Samples.OpenAI/Program.cs)
- [LLPhant](https://github.com/theodo-group/LLPhant?tab=readme-ov-file#ollama)
- [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/llm/ollama/) and [LlamaIndexTS](https://ts.llamaindex.ai/modules/llms/available_llms/ollama)
- [LiteLLM](https://github.com/BerriAI/litellm)

63
cache/cache.go vendored Normal file

@@ -0,0 +1,63 @@
package cache
import (
"github.com/ollama/ollama/ml"
)
type Options struct {
Position int
}
type Cache interface {
Sub(i int) Cache
Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor)
}
type Simple struct {
DType ml.DType
Capacity int
keys, values []ml.Tensor
}
func (c *Simple) Sub(i int) Cache {
if i >= len(c.keys) {
c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
}
return &Simple{
keys: c.keys[i : i+1],
values: c.values[i : i+1],
Capacity: c.Capacity,
DType: c.DType,
}
}
func (c *Simple) Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor) {
if c.keys[0] == nil || c.values[0] == nil {
c.keys[0] = ctx.Zeros(c.DType, int(key.Dim(0)*key.Dim(1))*c.Capacity)
c.values[0] = ctx.Zeros(c.DType, int(value.Dim(0)*value.Dim(1))*c.Capacity)
}
ctx.Forward(key.Copy(ctx, c.keys[0].View(ctx, int(key.Stride(2))*opts.Position, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
ctx.Forward(value.Copy(ctx, c.values[0].View(ctx, int(value.Stride(2))*opts.Position, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))
n := min(c.Capacity, int(key.Dim(2))+opts.Position)
key = c.keys[0].View(ctx, 0,
int(key.Dim(0)), int(key.Stride(1)),
int(key.Dim(1)), int(key.Stride(2)),
n,
)
value = c.values[0].View(ctx, 0,
int(value.Dim(0)), int(value.Stride(1)),
int(value.Dim(1)), int(value.Stride(2)),
n,
)
// TODO shift context if necessary
return key, value
}
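Simple.Sub above grows the keys and values slices on demand, then returns a cache whose slices are one-element windows into the parent's backing arrays, so writes made later in Put land in shared storage. A standalone sketch of just that growth-and-aliasing pattern, with ints standing in for ml.Tensor:

```go
package main

import "fmt"

// sub reproduces the growth-and-window pattern from Simple.Sub above,
// standalone: ensure index i exists, then hand back a one-element window
// that aliases the backing slice.
func sub(s []int, i int) (backing, window []int) {
	if i >= len(s) {
		s = append(s, make([]int, i-len(s)+1)...)
	}
	return s, s[i : i+1]
}

func main() {
	var keys []int
	keys, win := sub(keys, 3)       // grows keys to length 4
	win[0] = 42                     // writes through the window into keys
	fmt.Println(len(keys), keys[3]) // 4 42
}
```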

convert/convert.go

@@ -9,7 +9,7 @@ import (
"log/slog"
"strings"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)
type ModelParameters struct {
@@ -27,8 +27,8 @@ type AdapterParameters struct {
} `json:"lora_parameters"`
}
func (ModelParameters) KV(t *Tokenizer) llm.KV {
kv := llm.KV{
func (ModelParameters) KV(t *Tokenizer) ggml.KV {
kv := ggml.KV{
"general.file_type": uint32(1),
"general.quantization_version": uint32(2),
"tokenizer.ggml.pre": t.Pre,
@@ -54,7 +54,7 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
return kv
}
func (p AdapterParameters) KV() llm.KV {
func (p AdapterParameters) KV() ggml.KV {
var alpha float32
if p.LoraParameters.Alpha == 0 {
alpha = float32(p.Alpha)
@@ -62,7 +62,7 @@ func (p AdapterParameters) KV() llm.KV {
alpha = p.LoraParameters.Alpha
}
kv := llm.KV{
kv := ggml.KV{
"adapter.lora.alpha": alpha,
"adapter.type": "lora",
"general.file_type": uint32(1),
@@ -79,19 +79,19 @@ func (ModelParameters) specialTokenTypes() []string {
}
}
func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
return llm.WriteGGUF(ws, kv, ts)
func (ModelParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
return ggml.WriteGGUF(ws, kv, ts)
}
func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
return llm.WriteGGUF(ws, kv, ts)
func (AdapterParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
return ggml.WriteGGUF(ws, kv, ts)
}
type ModelConverter interface {
// KV maps parameters to LLM key-values
KV(*Tokenizer) llm.KV
KV(*Tokenizer) ggml.KV
// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
Tensors([]Tensor) []llm.Tensor
Tensors([]Tensor) []ggml.Tensor
// Replacements returns a list of string pairs to replace in tensor names.
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
Replacements() []string
@@ -99,7 +99,7 @@ type ModelConverter interface {
// specialTokenTypes returns any special token types the model uses
specialTokenTypes() []string
// writeFile writes the model to the provided io.WriteSeeker
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
}
type moreParser interface {
@@ -108,17 +108,17 @@ type moreParser interface {
type AdapterConverter interface {
// KV maps parameters to LLM key-values
KV(llm.KV) llm.KV
KV(ggml.KV) ggml.KV
// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
Tensors([]Tensor) []llm.Tensor
Tensors([]Tensor) []ggml.Tensor
// Replacements returns a list of string pairs to replace in tensor names.
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
Replacements() []string
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
}
func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
bts, err := fs.ReadFile(fsys, "adapter_config.json")
if err != nil {
return err
@@ -187,6 +187,8 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
conv = &gemma2Model{}
case "Phi3ForCausalLM":
conv = &phi3Model{}
case "Qwen2ForCausalLM":
conv = &qwen2Model{}
case "BertModel":
conv = &bertModel{}
default:

convert/convert_bert.go

@@ -8,7 +8,7 @@ import (
"slices"
"strings"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)
type bertModel struct {
@@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
return nil
}
func (p *bertModel) KV(t *Tokenizer) llm.KV {
func (p *bertModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "bert"
kv["bert.attention.causal"] = false
@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) llm.KV {
return kv
}
func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
for _, t := range ts {
if slices.Contains([]string{
"embeddings.position_ids",
@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
continue
}
out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),

convert/convert_gemma.go

@@ -6,7 +6,7 @@ import (
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)
type gemmaModel struct {
@@ -23,7 +23,7 @@ type gemmaModel struct {
var _ ModelConverter = (*gemmaModel)(nil)
func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma"
kv["gemma.context_length"] = p.MaxPositionEmbeddings
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
return kv
}
func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
for _, t := range ts {
if strings.HasSuffix(t.Name(), "_norm.weight") {
t.SetRepacker(p.addOne)
}
out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),

convert/convert_gemma2.go

@@ -1,8 +1,6 @@
package convert
import (
"github.com/ollama/ollama/llm"
)
import "github.com/ollama/ollama/fs/ggml"
type gemma2Model struct {
gemmaModel
@@ -11,7 +9,7 @@ type gemma2Model struct {
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
}
func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
func (p *gemma2Model) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma2"
kv["gemma2.context_length"] = p.MaxPositionEmbeddings

convert/convert_gemma2_adapter.go

@@ -6,7 +6,7 @@ import (
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)
type gemma2Adapter struct {
@@ -15,14 +15,14 @@ type gemma2Adapter struct {
var _ AdapterConverter = (*gemma2Adapter)(nil)
func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
kv := p.AdapterParameters.KV()
kv["general.architecture"] = "gemma2"
return kv
}
func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
for _, t := range ts {
shape := t.Shape()
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
t.SetRepacker(p.repack)
}
out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),

convert/convert_llama.go

@@ -9,7 +9,7 @@ import (
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)
type llamaModel struct {
@@ -46,7 +46,7 @@ type llamaModel struct {
var _ ModelConverter = (*llamaModel)(nil)
func (p *llamaModel) KV(t *Tokenizer) llm.KV {
func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "llama"
kv["llama.vocab_size"] = p.VocabSize
@@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
return kv
}
func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
if p.RopeScaling.factors != nil {
out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: "rope_freqs.weight",
Kind: 0,
Shape: []uint64{uint64(len(p.RopeScaling.factors))},
@@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
t.SetRepacker(p.repack)
}
out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),

convert/convert_llama_adapter.go

@@ -7,7 +7,7 @@ import (
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)
type llamaAdapter struct {
@@ -18,7 +18,7 @@ type llamaAdapter struct {
var _ AdapterConverter = (*llamaAdapter)(nil)
func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
kv := p.AdapterParameters.KV()
kv["general.architecture"] = "llama"
kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
return kv
}
func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
for _, t := range ts {
shape := t.Shape()
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
t.SetRepacker(p.repack)
}
out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: shape,

convert/convert_mixtral.go

@@ -6,7 +6,7 @@ import (
"slices"
"strings"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)
type mixtralModel struct {
@@ -15,7 +15,7 @@ type mixtralModel struct {
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
}
func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
kv := p.llamaModel.KV(t)
if p.NumLocalExperts > 0 {
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
return kv
}
func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
oldnew := []string{
"model.layers", "blk",
"w1", "ffn_gate_exps",
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
return true
})
var out []llm.Tensor
var out []ggml.Tensor
for n, e := range experts {
// TODO(mxyng): sanity check experts
out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: n,
Kind: e[0].Kind(),
Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),

convert/convert_phi3.go

@@ -8,7 +8,7 @@ import (
"strings"
"sync"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)
type phi3Model struct {
@@ -37,7 +37,7 @@ type phi3Model struct {
var _ ModelConverter = (*phi3Model)(nil)
func (p *phi3Model) KV(t *Tokenizer) llm.KV {
func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "phi3"
kv["phi3.context_length"] = p.MaxPositionEmbeddings
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) llm.KV {
return kv
}
func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
var addRopeFactors sync.Once
out := make([]llm.Tensor, 0, len(ts)+2)
out := make([]ggml.Tensor, 0, len(ts)+2)
for _, t := range ts {
if strings.HasPrefix(t.Name(), "blk.0.") {
addRopeFactors.Do(func() {
out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: "rope_factors_long.weight",
Kind: 0,
Shape: []uint64{uint64(len(p.RopeScaling.LongFactor))},
WriterTo: p.RopeScaling.LongFactor,
}, llm.Tensor{
}, ggml.Tensor{
Name: "rope_factors_short.weight",
Kind: 0,
Shape: []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
})
}
out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),

79
convert/convert_qwen2.go Normal file

@@ -0,0 +1,79 @@
package convert
import "github.com/ollama/ollama/fs/ggml"
type qwen2Model struct {
ModelParameters
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"`
HiddenLayers uint32 `json:"num_hidden_layers"`
IntermediateSize uint32 `json:"intermediate_size"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
RopeTheta float32 `json:"rope_theta"`
RopeScaling struct {
Type string `json:"type"`
Factor ropeFactor `json:"factor"`
OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
} `json:"rope_scaling"`
RMSNormEPS float32 `json:"rms_norm_eps"`
}
var _ ModelConverter = (*qwen2Model)(nil)
func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
kv := q.ModelParameters.KV(t)
kv["general.architecture"] = "qwen2"
kv["qwen2.block_count"] = q.HiddenLayers
kv["qwen2.context_length"] = q.MaxPositionEmbeddings
kv["qwen2.embedding_length"] = q.HiddenSize
kv["qwen2.feed_forward_length"] = q.IntermediateSize
kv["qwen2.attention.head_count"] = q.NumAttentionHeads
kv["qwen2.attention.head_count_kv"] = q.NumKeyValueHeads
kv["qwen2.rope.freq_base"] = q.RopeTheta
kv["qwen2.attention.layer_norm_rms_epsilon"] = q.RMSNormEPS
switch q.RopeScaling.Type {
case "":
// no scaling
case "yarn":
kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type
kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor
default:
panic("unknown rope scaling type")
}
return kv
}
func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
for _, t := range ts {
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
}
return out
}
func (p *qwen2Model) Replacements() []string {
return []string{
"lm_head", "output",
"model.embed_tokens", "token_embd",
"model.layers", "blk",
"input_layernorm", "attn_norm",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.q_proj", "attn_q",
"self_attn.o_proj", "attn_output",
"mlp.down_proj", "ffn_down",
"mlp.gate_proj", "ffn_gate",
"mlp.up_proj", "ffn_up",
"post_attention_layernorm", "ffn_norm",
"model.norm", "output_norm",
}
}
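With qwen2Model registered in ConvertModel's architecture switch (see the convert.go hunk above), converting a local safetensors checkout comes down to handing ConvertModel a directory FS and an output file. A usage sketch under that assumption, with hypothetical paths:

```go
package main

import (
	"log"
	"os"

	"github.com/ollama/ollama/convert"
)

func main() {
	// Hypothetical paths; ConvertModel's (fs.FS, io.WriteSeeker) signature is
	// taken from the convert.go hunk above. os.DirFS satisfies fs.FS and
	// *os.File satisfies io.WriteSeeker.
	out, err := os.Create("Qwen2.5-0.5B-Instruct-f16.gguf")
	if err != nil {
		log.Fatal(err)
	}
	defer out.Close()

	if err := convert.ConvertModel(os.DirFS("Qwen2.5-0.5B-Instruct"), out); err != nil {
		log.Fatal(err)
	}
}
```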

convert/convert_test.go

@@ -20,7 +20,7 @@ import (
"golang.org/x/exp/maps"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)
type tensorData struct {
@@ -29,7 +29,7 @@ type tensorData struct {
Shape []int `json:"shape"`
}
func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
t.Helper()
f, err := os.CreateTemp(t.TempDir(), "f16")
@@ -48,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
}
t.Cleanup(func() { r.Close() })
m, _, err := llm.DecodeGGML(r, math.MaxInt)
m, _, err := ggml.Decode(r, math.MaxInt)
if err != nil {
t.Fatal(err)
}
@@ -60,7 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
return r, m.KV(), m.Tensors()
}
func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string {
func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tensors) map[string]string {
actual := make(map[string]string)
for k, v := range kv {
if s, ok := v.(json.Marshaler); !ok {
@@ -75,7 +75,7 @@ func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tenso
}
}
for _, tensor := range tensors.Items {
for _, tensor := range tensors.Items() {
sha256sum := sha256.New()
sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
if _, err := io.Copy(sha256sum, sr); err != nil {
@@ -108,6 +108,7 @@ func TestConvertModel(t *testing.T) {
"Phi-3-mini-128k-instruct",
"all-MiniLM-L6-v2",
"gemma-2-9b-it",
"Qwen2.5-0.5B-Instruct",
}
for i := range cases {
@@ -330,7 +331,7 @@ func TestConvertAdapter(t *testing.T) {
}
defer r.Close()
m, _, err := llm.DecodeGGML(r, math.MaxInt)
m, _, err := ggml.Decode(r, math.MaxInt)
if err != nil {
t.Fatal(err)
}

convert/testdata/Qwen2.5-0.5B-Instruct.json

@@ -0,0 +1,314 @@
{
"general.architecture": "qwen2",
"general.file_type": "1",
"general.parameter_count": "494032768",
"general.quantization_version": "2",
"output_norm.weight": "93a01a6db3419e85320a244bbf8ae81c43033b1d10c342bea3797ff2ce348390",
"qwen2.attention.head_count": "14",
"qwen2.attention.head_count_kv": "2",
"qwen2.attention.layer_norm_rms_epsilon": "1e-06",
"qwen2.block_count": "24",
"qwen2.context_length": "32768",
"qwen2.embedding_length": "896",
"qwen2.feed_forward_length": "4864",
"qwen2.rope.freq_base": "1e+06",
"token_embd.weight": "d74257dc547b48be5ae7b93f1c9af072c0c42dbbb85503078e25c59cd09e68d0",
"tokenizer.ggml.add_eos_token": "false",
"tokenizer.ggml.add_padding_token": "false",
"tokenizer.ggml.eos_token_id": "151645",
"tokenizer.ggml.merges": "6b1b1c58f1223d74f9095929d3e6416cdd74784440221a5507b87b8197f2bfd2",
"tokenizer.ggml.model": "gpt2",
"tokenizer.ggml.padding_token_id": "151643",
"tokenizer.ggml.pre": "qwen2",
"tokenizer.ggml.scores": "94e247e531e8b0fa3d248f3de09c9beae0c87da8106208a8edfaac0b8ec4b53d",
"tokenizer.ggml.token_type": "b178dbc9d1b2e08f84d02918e00fc2de2619a250e6c188c91a6605f701860055",
"tokenizer.ggml.tokens": "1d93f6679b23a1152b725f7f473792d54d53c1040c5250d3e46b42f81e0a1a34",
"blk.0.attn_k.bias": "5ce6617845f66c34515978d23d52e729c298d8bffa28c356a0428bef17142cf1",
"blk.0.attn_k.weight": "a960832a9e0e83e4d95402e5d1a01cc74300fcca0c381237162126330e1a7af8",
"blk.0.attn_norm.weight": "32c7d51cd0958f1f1771174192db341f9770516d7595a2f0fd18a4d78bd5aba3",
"blk.0.attn_output.weight": "c67e6e7e868354a11bf9121c70ee56c140b20eec611a8955e7dfe54a21d40a98",
"blk.0.attn_q.bias": "3e9e994eb1f03bccfc82f8bb3c324c920d42d547e07de5be83be12c428645063",
"blk.0.attn_q.weight": "dc12132f789b97cfa1e3f5775ceb835247fa67aa47400fd09c8f9f3769208583",
"blk.0.attn_v.bias": "a3fd0757b31fdc78af5ec320332d239c1a79d34e8804df06c5454e86955e8cc9",
"blk.0.attn_v.weight": "f43094a2134c7ee2dcc52aac3c8b7d9d64fb0295a8adb94cabfd49213f017b84",
"blk.0.ffn_down.weight": "18c2aec92db14f21976838a8c35d5575f80d0e4b1e05ccc0d8388d5877e80147",
"blk.0.ffn_gate.weight": "a3a1c4ef38f8f750eabadfe3d83bbb0f77941eec1cc1a388e51852e99c8691f6",
"blk.0.ffn_norm.weight": "b59b779c42d44b5c4cec41e39b4eb61e0491a07c1b3e946ccb5b8d5c657eda3f",
"blk.0.ffn_up.weight": "db64f09987ea59449e90abae5a2ffcc20efd9203f0eebec77a6aacb5809d6cff",
"blk.1.attn_k.bias": "a5c8c5671703ec0aa0143ff70a20ffdd67b5d5790ca1dfa5bba4e87e4071ed9f",
"blk.1.attn_k.weight": "835c7c7cc95b3cb2e55bd9cac585aa0760a033896621d3e06421f3378c540f7d",
"blk.1.attn_norm.weight": "f4c36fb6c14fce721fab0de78cc118d6f66e3a3d3ea0017bb14aade24c3c5434",
"blk.1.attn_output.weight": "cc1e80310c97cef068e48e40b7096f32fa2138519d6209c6a1a9994985999016",
"blk.1.attn_q.bias": "bc332780e66b0aac80ec5e63ac32344919a840db2fcc8f87bcef16a43a54138e",
"blk.1.attn_q.weight": "d766f06c925cce38d4b31b2165b3448e1fb49a7d561985f95d9cd2fcba52367a",
"blk.1.attn_v.bias": "9f486626fb6ed9ac84970a71e9b9818dd2758501fd3f61bb1c08540dcc7a8631",
"blk.1.attn_v.weight": "e873d1e5bd4f4d6abfd47c0f55119c2c111105838753ee273a03c5ccea25ce5c",
"blk.1.ffn_down.weight": "b3ce82b093f187344de04284b1783a452de1b72640914609b8f830dc81580521",
"blk.1.ffn_gate.weight": "5cd44ad237edaca525a28a3ac13975d1b565f576d6a8003237a341ae0d156f2e",
"blk.1.ffn_norm.weight": "4ac774ee8afaee119610c46aa1ff89fc6c9084a29d226075bc4aa4d2f15f746c",
"blk.1.ffn_up.weight": "042d81ab5f1983d85c81213232f3bfc05a9302d9dfaa98d931ebba326b6058b8",
"blk.10.attn_k.bias": "767ecfeacd60a2c2221ac4d76c357190849dd9cdf64ced418d9d0c7949101401",
"blk.10.attn_k.weight": "a9f3df343227537636be8202303453086375091944e498bad11e0b91e45e8c71",
"blk.10.attn_norm.weight": "01acd0e7b3e363f873dbfde6f0995ffcce83f5aaa10ff91c31dbf775035f6d5a",
"blk.10.attn_output.weight": "a531fe660769604ab869f01b203eb115e025cad4c0baeacdd1bcca99cf6d0264",
"blk.10.attn_q.bias": "356a02c9163dd660c1340fbe1e049b335ac6178891e00996131bba9ab4cb3e59",
"blk.10.attn_q.weight": "81be0cfb227339d83f954cd8dcf35828441211c6e1d184060e3eb76085041e2f",
"blk.10.attn_v.bias": "ed0450653284b62f8bf2c2db19c0ff7a6cf3cda1324d0a044c5e3db7bb692bd3",
"blk.10.attn_v.weight": "c1247ff7092babd2ed979883095b9aa022b2996cab1c77fb9e6176ddc1498d16",
"blk.10.ffn_down.weight": "fda7544965dc9af874f1062c22151c6cefc8ba08cbe15dc67aa89979e77b2de4",
"blk.10.ffn_gate.weight": "9f2632b1dee7304d10c70bd38d85bb1f148a628a8468f894f57975b8a2f1d945",
"blk.10.ffn_norm.weight": "94f8cbd6b17a4d5aabd93fa32930a687db3b11f086142f1cd71c535c11adcad4",
"blk.10.ffn_up.weight": "8dc2f8db0474939a277a3d89db34c3bcc3381cfea57bd05a8426a164634d9112",
"blk.11.attn_k.bias": "3b8e5a662b19411e3f6530714b766aad2ee41eebc8161bec9db0bc82d383a6e0",
"blk.11.attn_k.weight": "2c29f1ed1ce53ce9604e9ea3663c2c373157e909a0d6064a8920005f6d15dad9",
"blk.11.attn_norm.weight": "48f68a99c3da4ab4c9e492677b606d1b8e0e3de1fdbf6a977523f97b8c21ec31",
"blk.11.attn_output.weight": "5859f3838a94898b020c23040941ed88f4fcb132db400d0849f30a01f62c0f1c",
"blk.11.attn_q.bias": "c5ad89a5628f2bd81252ef44ef6bbcbff15c33ad16fba66435509b959c2af6d3",
"blk.11.attn_q.weight": "d102104e5d61c1e3219564f1d0149fd593db6c6daa9f3872460c84403323cfef",
"blk.11.attn_v.bias": "8653f7d48c5f75a5b55630819f99ecf01c932f12d33fd1a3ee634613e70edde8",
"blk.11.attn_v.weight": "e0a7c7d89b9f2d0d781ce85330022229126e130a8600a09d4a5f920f0bbd50b2",
"blk.11.ffn_down.weight": "4a22b3361eba8bbe1d9a6fda1812618e894c49f13bcacb505defa9badb6b96a6",
"blk.11.ffn_gate.weight": "484698b206760d3fd8df68b252a3c5bae65c8bf6392fb53a5261b021b6f39144",
"blk.11.ffn_norm.weight": "da69e96338cbe30882cf5a9544004387f5bbc0bcb6038e61ba2baabbd2623bac",
"blk.11.ffn_up.weight": "26ec74f1f504d1281715680dfbcc321db4e9900c53932fa40955daceb891b9aa",
"blk.12.attn_k.bias": "f94b49ec3e498f14f6bc3ebefe1f82018935bbe594df03253bfffae36bc20751",
"blk.12.attn_k.weight": "ae6323d0bbcfcea01f598d308993d1a7530317e78c1f64923e36d4b1649e9e73",
"blk.12.attn_norm.weight": "3784536a7611a839a42a29a5cc538c74ee4f9793092e5efe1b227b48f8c4d37f",
"blk.12.attn_output.weight": "46826c00b066829355db78293ab216e890f5eaaed3a70499ee68785189a6b0d9",
"blk.12.attn_q.bias": "b14db2d327ce0deec97beda7d3965a56c43e1e63dc9181840fb176b114cf643a",
"blk.12.attn_q.weight": "30f67df52ced06f76b6c85531657584276a454d6ec9bb7d0c7d2ca8f067f5551",
"blk.12.attn_v.bias": "57ab4b7e43f4fc5853bca7bfbb2702f8c2c391a49252a760abbb7b26330dc4aa",
"blk.12.attn_v.weight": "3ccd9da0cfe241cd33a63310f3ca6d81c5bc5a50d200bfea6612ac376166aca2",
"blk.12.ffn_down.weight": "a095774413198a83c549ce132d7c9684c0baef33145eaa889be370ef9c881c81",
"blk.12.ffn_gate.weight": "bb3b2bbdfb065d2a0a795909c53beec327781a4a7e974bf9f99c436cea459991",
"blk.12.ffn_norm.weight": "3b486c6cd97eb4b17967d9d6c0cc3821a1a6ad73d96b4d8fbf980101b32b8dab",
"blk.12.ffn_up.weight": "d020b82dd39a5d5a9d3881397bf53a567790a07f395284e6eb0f5fe0fef53de3",
"blk.13.attn_k.bias": "69381f8254586eba3623eceb18697fe79f9b4d8f2c30136acb10d5926e3ba1d0",
"blk.13.attn_k.weight": "c4d7a31495d71269f81b586203a50abea3a9e2985667faf258c9306ec6030f1d",
"blk.13.attn_norm.weight": "907da11075d16eda668dabe548af3cfd794df26b8ab53939af1344d91bec6fba",
"blk.13.attn_output.weight": "ca01cf6d2b8ece2fb3b0f56f1eb76194471ac27b54fe264f99c909f5eb7fef4a",
"blk.13.attn_q.bias": "2f5ecebafe03b1d485b93c41cff756ca57fb65b02e9d8336f14a3d26ab5d159a",
"blk.13.attn_q.weight": "f557f8acad7f0fa62da06b5da134182fe04a5bed8bdb269e316f970c9cc440fb",
"blk.13.attn_v.bias": "a492a88ae131e95714b092545a8752eaea7c7d2f9cb77852628ca8296c415525",
"blk.13.attn_v.weight": "d1220b1fe9f1cc0a5a88ee239d65fec900f5eaf6c448b6c2cbe74c81e15ed333",
"blk.13.ffn_down.weight": "53184e33440b49848a896304eb16a983efbc6b8bee0b93de8c8de716e1585fcb",
"blk.13.ffn_gate.weight": "684bf8896f148c851506c62717e45c426921b93c10d536ecdeb0fb28259a106d",
"blk.13.ffn_norm.weight": "6cb4e547ad8665eb7c174855c08afe1e5490fece66122522c1e9e8132d9064eb",
"blk.13.ffn_up.weight": "c64107897e38c06727075aba4ea7940b2cdd0e278b5c555dffb2790ef553bb57",
"blk.14.attn_k.bias": "2814ca9b160b16ae39557c9b629482fbe3a7592d372c1e1bf1ac59a2d578fde1",
"blk.14.attn_k.weight": "3377177396463afba667742972920ebb45dfdc37e9950e1f0e1d60a2f936b27d",
"blk.14.attn_norm.weight": "5cae870477d51dd35a6d22aaeacfce4dff218ffba693820ede6a4e11f02afd6d",
"blk.14.attn_output.weight": "3cfe9ccf3d48ae9e95b93a132a1c6240189a277d764f58590fb36fdbb714cad0",
"blk.14.attn_q.bias": "6a75acc2f090b2e67bfc26f7fca080ae8bd7c7aa090ec252e694be66b8b8f038",
"blk.14.attn_q.weight": "5ef45c86d7dda1df585aa1b827b89823adf679a6bb9c164bd0f97b2aa6eb96f1",
"blk.14.attn_v.bias": "5534480443e10ed72c31a917f3d104b0f49df5e6dbfa58d0eb5e7318120e3aee",
"blk.14.attn_v.weight": "58f45cf3240c4623626ec415c7d5441eaa8d2fb184f101aba973f222989422d1",
"blk.14.ffn_down.weight": "2dc82a0f20c05b77512458738130d8d05ce150cc078680ae7ee6dd7ed68d955d",
"blk.14.ffn_gate.weight": "d4a6c6f0fcccddfd1fcaa074846622f4a74cb22b9a654ab497abdc1d0dde9450",
"blk.14.ffn_norm.weight": "777e444932a0212ff3feac98442444e17bd8a98cb758ea3356697d0846d12c56",
"blk.14.ffn_up.weight": "6b75f6bd00195198447b69a417ed9d98f8ca28b3cb8be82f4bad908be0777d57",
"blk.15.attn_k.bias": "2d07211a58e6c2f23aa3a6dc03c80a7d135dfb28726b60b0e0fdd0f35ea5c37b",
"blk.15.attn_k.weight": "e77f3c0075a1810e70df956cc51fd08612f576cc09b6de8708dcae5daedb0739",
"blk.15.attn_norm.weight": "379a10d90609a5d5ba67d633803eda1424fc61ba5cca8d3bffe70c8b18b58ebf",
"blk.15.attn_output.weight": "402751c12ee9dbc9db5e3bf66a7b23ebe7d36c0500e0be67be4c8b1c4357fa62",
"blk.15.attn_q.bias": "acb37fc409ee725ceedf7a3a41b40106086abc47b76780728f781942c5120208",
"blk.15.attn_q.weight": "89cd3047a09b46ed2bb57c69dd687f67a1f0235149b30376fa31b525898e4a55",
"blk.15.attn_v.bias": "f081a37289cbe811978feb4da3ef543bdeb7355414d476f44e09b498da10cb2c",
"blk.15.attn_v.weight": "8404f242a11e6d512c9ead9b2f083cda031e9b269f8a0a83f57ee4c56934764e",
"blk.15.ffn_down.weight": "93438f43ee8cc4f1a7fd3840a6afdd5f02123e76db4f0d9474430c0100d148fc",
"blk.15.ffn_gate.weight": "ff935a2698843e87fad9dbf7125f53e460190ec71ee128b650b3fc027fe37bfc",
"blk.15.ffn_norm.weight": "4be80f199841cba831982e988451e1833c3c938a4d6ca1169319087bf0bd723e",
"blk.15.ffn_up.weight": "ee9ba63c66d71053e33551ddd519878bb30b88eeb03cfe047119c5c4000fb0a6",
"blk.16.attn_k.bias": "3f5fbabed4510c620b99d9d542739295fa6a262a7157f3a00a4889253f8341b8",
"blk.16.attn_k.weight": "8ca6eb139b281c257324cddea97a8e9aa7c048b53075cf00153123b967c27ee5",
"blk.16.attn_norm.weight": "290157f005e5aa7dddf4bd60100e7ee7b0baa7f11ec5c2cea5e0ead2aad3a4c6",
"blk.16.attn_output.weight": "b1f4d80a7447f08f1c331712527f750d00147f35c042442ade96fd029dadc5a1",
"blk.16.attn_q.bias": "e3e4e442ad4416791b468cad8de0d0d2d68c7e7df8d06002f4d49b4da9cb25e4",
"blk.16.attn_q.weight": "cc7392fa5bb1107d3816e7e7363de252d37efd4165d065e258806291ce0a147b",
"blk.16.attn_v.bias": "a7629830f2f6293e018916849614636d40b1bcd11245f75dbc34d38abae8f324",
"blk.16.attn_v.weight": "b6c7856c7d594437630929c8cf3b31d476e817875daf1095334ec08e40c5e355",
"blk.16.ffn_down.weight": "f9c0a777a00170990a4982d5a06717511bf9b0dd08aeaab64d9040d59bcbebba",
"blk.16.ffn_gate.weight": "ed88f11bc3176c9f22004e3559ccb9830a278b75edd05e11971d51c014bd5cd2",
"blk.16.ffn_norm.weight": "ab24abdcc4957895e434c6bb3a5237a71ff5044efb9f76c1a9e76e280c128410",
"blk.16.ffn_up.weight": "99f594dc8db37f554efa606e71d215fbc3907aa464a54038d6e40e9229a547ff",
"blk.17.attn_k.bias": "f236625676f9b2faa6781c7184d12d84c089c130d2a9350a6cf70210990f6bf1",
"blk.17.attn_k.weight": "c2a4f20cd3e98538308a13afe9cc5880bdd90d543449c6072dedd694b511ee1a",
"blk.17.attn_norm.weight": "5a9da4ee168311f487a79fc9d065a035432c6cafa8adb963a84954cf32f57a2a",
"blk.17.attn_output.weight": "d5df7031e354186ce65dc09d6f8a92eb721c0319816f8596b0c8a5d148ed0a2a",
"blk.17.attn_q.bias": "3212d5eeaa7ed7fac93cc99e16544de93c01bb681ae9391256ed4a8671fc6b00",
"blk.17.attn_q.weight": "d18cd9aa7ee10c551cb705549fa1ae974aea233f86471c9a19022dc29b63d0d5",
"blk.17.attn_v.bias": "a74ad11a1f8357742f80e2a0c0b3a2578fc8bbaf14c8223000767e07a5d79703",
"blk.17.attn_v.weight": "da18ac0e90884436a1cb0ad6a067f97a37f321b03c70b8b03bf481339fef5c80",
"blk.17.ffn_down.weight": "81a8a5d7a194fb53d976558e0347efbe9fdb1effffde9634c70162e1a20eff51",
"blk.17.ffn_gate.weight": "72870d83ab62f2dcd45f593924e291a45e4ae1b87f804b5b88aa34cfd76dd15e",
"blk.17.ffn_norm.weight": "cae39ac69b9bdaeefab7533796fdf11dbb7a4bdbdeed601e20f209503aafe008",
"blk.17.ffn_up.weight": "e7cb40b0842468507cec0e502bbed8a86428b51d439e3466bc12f44b2754e28f",
"blk.18.attn_k.bias": "8bfc02b94f9587aa125e2d8bbc2b15f0a5eb8f378d8b3e64a8150ae0a8ca3df2",
"blk.18.attn_k.weight": "434bc3b3332ea48afee890aa689eb458a75c50bc783492b0cbf64d42db40e8ad",
"blk.18.attn_norm.weight": "d6ffc09396c42a70d1f0e97d81113eee704d3bfc9eeae2bed022075a5dd08075",
"blk.18.attn_output.weight": "133f001f81f3b082468a7de67cb2e7a76508fce34bcc4dee7f0858e06eee082c",
"blk.18.attn_q.bias": "758d0e28bf5e660b3090aafb70e2a3191b4f3bb218d65e9139a086ceacaf599f",
"blk.18.attn_q.weight": "12d7b86fc1b09b9fa7f8b7ed43d8a410892cec8672d0c752f8346f6193343696",
"blk.18.attn_v.bias": "9efd15bab0519462431d6c6e8a5b7dd4e151dc449468097ee0ddca369c0ecc2e",
"blk.18.attn_v.weight": "f631231a79d4a2e9730fb2e386d8c18621eb3fb7900fbfdff5e6d52cc42db122",
"blk.18.ffn_down.weight": "874a2dddf456f3ab56b958b0860d71c8c680a6f89322c9bf6b2f32a113592300",
"blk.18.ffn_gate.weight": "4549ef8976c345a511df4a7133bdaf6fe387335f52dfd8a4605a8ae3f728c403",
"blk.18.ffn_norm.weight": "80c258a2536a860e19bfcbd9f29afa13214fbb4c34bde0d4da51287d354e9a59",
"blk.18.ffn_up.weight": "8b03308a581457a3c038b7a086f3cdf14941d7ad4107c4bd6d9d6b062fd00d73",
"blk.19.attn_k.bias": "e77f7b0c8e3e0a9b0d61918cd88371047752a1b02b1576936f4ec807d4d870ee",
"blk.19.attn_k.weight": "a2a318e93355230c0d0f95c441b080bf9c4914507255f363fb67a5e771d4d1e6",
"blk.19.attn_norm.weight": "9a4bdeb3970be21ac74a94c2c81eb36986533db81b78db6edec48d9802910d59",
"blk.19.attn_output.weight": "2369b103dd3947e2cef02b2669b405af5957fb3a7f9d0ff40646078c4b4317ad",
"blk.19.attn_q.bias": "e20bf427bef69059ae84a5d9f98f7d688489627f198fb6153def018ff9fd2e34",
"blk.19.attn_q.weight": "45a3bb3bdfd2f29dd76e5f78ddae73678b9a2a85dfaf609e460240ef5b7be2ad",
"blk.19.attn_v.bias": "a441f58a3e02ed86ee1819eefc9bd4e8b70d11b864a929d58a2c2ac0aeb8203d",
"blk.19.attn_v.weight": "30b0b04480c510450a7abb2ce9fa05c65b150a3cc4dc76f8916bf8d013f1b6be",
"blk.19.ffn_down.weight": "eebb9ab8fdb6a6efcfff8cf383adac9ec2d64aeeff703d16ed60d3621f86c395",
"blk.19.ffn_gate.weight": "3fef1493029298378886586478410b3d2e4e879f6aa83c07e210a7ce6481817f",
"blk.19.ffn_norm.weight": "e1be99ea1e8fb9678f7b8ba200f3f37e03878f3574d65d57bcd3a9fd796e2112",
"blk.19.ffn_up.weight": "f07cf25e09394fb69fe3ef324bdc0df9a4cecf3dc53070b8acc39e6d1689bf82",
"blk.2.attn_k.bias": "b29baa8221f125eff6b8ac1a950fa1d7cfc1bce7bdc636bf3df7d4065ab6466c",
"blk.2.attn_k.weight": "4bd0c179bced8bc37a09f5748c394e0cf50273942fb38a866e5cf50b6c96c437",
"blk.2.attn_norm.weight": "07b3edc6a6325c3428aa12f29bcae0be0de363ce61a6af487bc5c93fb8c468d9",
"blk.2.attn_output.weight": "056b5b31dbc81087c81b9d41c25960aa66c7190004c842ba343979644d7f4d88",
"blk.2.attn_q.bias": "479b6212401e097767c9d52b12a1adb8961c0fce9fcaaab81f202a9d85744376",
"blk.2.attn_q.weight": "f89196076f446a6dd8a9eee017f303504f9c03094c326449cee5a7fc0a97fade",
"blk.2.attn_v.bias": "ef9b1b986dbd9d7291027a88b67dc31434435b20e76e4f1e9d6273ebd31224f0",
"blk.2.attn_v.weight": "9322f4f00e85f8c0936845c51ca64b202a93df104f36886986a8452a8e4967a5",
"blk.2.ffn_down.weight": "7beac0d2440dc49af33ededb85a6cc3ba23ab33ad3ffa5760714b2ef84d94f6e",
"blk.2.ffn_gate.weight": "818a93864a5890c1f4dc66429004fad07645a50142350e9bff9a68fe24608a52",
"blk.2.ffn_norm.weight": "152c924d5514942ad274aafb8cc91b35c1db3627c3d973d92f60ff75f3daf9ba",
"blk.2.ffn_up.weight": "9c9579e600f209546db6015c9acfeda4f51b6d3cca6e8db4d20a04285fe61a37",
"blk.20.attn_k.bias": "fd22bfeffb63d818ce2ff1ea2ace0db5d940f7a9489b6bfc1ec4a5398848d7fe",
"blk.20.attn_k.weight": "f74439bc74c2f9252130c9c28384fd7352368b58bb7ce3f2444cf0288dfff861",
"blk.20.attn_norm.weight": "5c15d2613df87be6495fb7546b7dcedd2801d12fa5ecc02c877df889330e8f37",
"blk.20.attn_output.weight": "6731a39286a67f6859832f96695732e579e14e0c36956eccd1edce3db11595b8",
"blk.20.attn_q.bias": "04466e5a3f454a19b9b433fc2585396feac780027ece7ccb4e4bb3e406fc14d8",
"blk.20.attn_q.weight": "ead4c71daaeb17bf20d014a34c88b97f238456488e815ae0f281a5daf6fc99b8",
"blk.20.attn_v.bias": "adcc848e043025de9bd55ccb14dd8fb6343e8b5185ed07e12964be41d0faf99f",
"blk.20.attn_v.weight": "81bfc23f83526386a4761c2c16b6a93cd0bbf9d846c1a51b82c71f1474a465f1",
"blk.20.ffn_down.weight": "9bf660af3bafad919d03173c89a65fc9c89440a76c42c9e55e4d171076f3c17f",
"blk.20.ffn_gate.weight": "c04b4f3ccce44917ee228b998e2c19dd702aef10a43413afb152e808b5ac5c42",
"blk.20.ffn_norm.weight": "3d5b555d7746a71220143c6b8fff5ce4eb63283d9d9c772f1233d848f69f4ff4",
"blk.20.ffn_up.weight": "d7a196505c39e5469dfc7c6958bdbb54e93629ac1a047a6663ed96b318753094",
"blk.21.attn_k.bias": "4db1f48e5c6a3bc5720a5da813bbef08283e6269e12d83f8a9c54e52715d8011",
"blk.21.attn_k.weight": "c687b2f0e132a5e220a2a059b61aa2a537f37d8a674d7709f87880637b263b31",
"blk.21.attn_norm.weight": "ec23b0ff847a4b45585ab8e04f10fc20bb1637c5f1fbcdc4d73f336bcb5d1bd0",
"blk.21.attn_output.weight": "01255390576316c1731ef201e32c6e934eba356c28438cd06d9027ac6a3ff84f",
"blk.21.attn_q.bias": "3098f37205a15418e1681e407c82b7ce7c6fda6c6826b0590a13e1b68a38a1ea",
"blk.21.attn_q.weight": "30ea62cbb702a5359229dc96819df17ee535e2e9988d044b005c73ea536e1005",
"blk.21.attn_v.bias": "7bbedb2c22a04737f21993115701d4a06b985b7ca3b64681f53cd1be8d7ea39e",
"blk.21.attn_v.weight": "e11905e63579e36fbee978062af7599339ae29633765a4835628d79a795ec8df",
"blk.21.ffn_down.weight": "84def2ffd8aca766f9ce12ed9ac76919ab81eb34bdeae44fa4224417c38af527",
"blk.21.ffn_gate.weight": "4e99f05377b4a0b8d875045530a5c59dee6a46ac8a45597f6579f6fdfa800787",
"blk.21.ffn_norm.weight": "af48f13d03fba38ff8794a5f5005e666e501f971ca2e30bbded2777a8096f37d",
"blk.21.ffn_up.weight": "a29541c39a6acbc364be86994632a5bf55d701027cb7f23320f8c6d55ee42c91",
"blk.22.attn_k.bias": "c97f84db6c75422df6ef5768676d4e9abefaa3b8337aa2730ff260f8fc350480",
"blk.22.attn_k.weight": "af9a0c56f68779513e95be11611b7be6175ddae27d48bee9dd72fdbf05f6cbfa",
"blk.22.attn_norm.weight": "1c7518eb5bcff4a202c6f4a2827f14abd76f9bcc64ce75fe9db60b69437a5c9c",
"blk.22.attn_output.weight": "1abcf1f3caa2f59dd018646b93f9cf8fd30d49e98a473e6a8704419a751be46f",
"blk.22.attn_q.bias": "7221e01cb692faf2f7f8c2eb6e2fac38a1b751a9c9fdb6a21a0a936eb0bf4b96",
"blk.22.attn_q.weight": "faaf8fb7b6c19f343d47f3ea6b57151fb46c787e0b3bd2c292fd327d3d4d8e35",
"blk.22.attn_v.bias": "3ec05942e82d735de99dfd0d8228d8425e63e2fc584da98b3326bdef89ecb2e5",
"blk.22.attn_v.weight": "42e7b0ad06db76227837da9d4e74b2db97f3df4050ecb3a87cb9b55e08dfcb42",
"blk.22.ffn_down.weight": "87ef98ad2d0e824b0fa5ad8aa18787162922e527c9b1b721a99bc07d3bf97c82",
"blk.22.ffn_gate.weight": "562d6e5a1654b03aaa0e33864d23c10297fd4bcaa72d30fac69fb771ee1df9d6",
"blk.22.ffn_norm.weight": "f8a405dee467749d59427ce05cdd4b9c11bb18934a89258ea461f013b7d251f5",
"blk.22.ffn_up.weight": "90e1f4ae4062649d4d838399eb353e8bb8d56a49982b6a7f64aa3945377f7187",
"blk.23.attn_k.bias": "9ad22178a85f3be7e25f5aff462f31627466364f2f5e92f265cc91db0da9a8a8",
"blk.23.attn_k.weight": "d813beffb10f03278f5b58eea0f9d73cdcb7b5b4045ae025c379592e854f7dfd",
"blk.23.attn_norm.weight": "f583c9836044bdb056d6f8911088ac28add68e500043ae1f97b5d9158fe3d769",
"blk.23.attn_output.weight": "02789911ac3b97f6b761e958b7dd6dc7da61a46a1be92bd0b346039ca7ecd2b2",
"blk.23.attn_q.bias": "38c4970fb9b4f7e4a139258a45639d848653814b4bc89ea9849709b13f16414b",
"blk.23.attn_q.weight": "eb694be9a5ab5858b8dab064ee4cce247dc757424e65282989bd4d015b8580ce",
"blk.23.attn_v.bias": "0a25f6533aa7e7a152a4b198cf6c411c2408a34afa4f161bb4d5ffba2f74e33f",
"blk.23.attn_v.weight": "187e1bac6b70f74e6364de226565aa8275ee2854d09cbe5895451a689596049e",
"blk.23.ffn_down.weight": "88880dd9ba7ee80ade972927f810b5d2c30a69520c615190b27f9daabc0a8c5a",
"blk.23.ffn_gate.weight": "5abec63197935ab3eb8e6de0a5307396ec46cdb1cc5de25d87c845f3c4a3e887",
"blk.23.ffn_norm.weight": "60e1f5e6310c3a531c554a6bb7cd883aed58db1e51853f739436ea461c1843d7",
"blk.23.ffn_up.weight": "3d7f502771743f4a634188dfcd8b8a384fb07467ca8528366aee59ddb25b7bce",
"blk.3.attn_k.bias": "0b6b442ebbac29c8c4b67e8e3876d0382dd2dc52efdf4ab0ebbc6f71b6252393",
"blk.3.attn_k.weight": "480f40584fbda692c26f2cee45f5923780b236f8b4e8ec7bbee0237777a0918d",
"blk.3.attn_norm.weight": "39872be2af31bc9cd6b583ebba6fb759f621d586d66e5a2fc0b85991615a8923",
"blk.3.attn_output.weight": "924b2c80d8513bf637f8ebb3756a340d9cf2243de723fd08d7f5dccd46b3f8b6",
"blk.3.attn_q.bias": "863c9d848156847a3fe9bbc44415a4395245b5d13e95673c014fdb71e494ab0a",
"blk.3.attn_q.weight": "bff73ee5de92fba8f6c089bbb19ce57e17ab3c9c29295712804bb752711b882e",
"blk.3.attn_v.bias": "e1b6fea126e86189112fcdfee79ffc66a087461527bc9c2dc52dc80f3b7de95e",
"blk.3.attn_v.weight": "7812b7f5133636f06cdbb4dcc48ef7803206538641b6c960777b37f60a8e6752",
"blk.3.ffn_down.weight": "00b393d6a7e3ad9b5224211ccdbc54a96aae151f24ed631764ac224972a6bc82",
"blk.3.ffn_gate.weight": "cfd63fa3a038af05dc53c6eeb3c192f1602f26ff24cb840bcf1510fcb37b5513",
"blk.3.ffn_norm.weight": "7389fc240a282949580ea2f5b0d7973ac79f32f76dc0155b537bb6b751f8e27a",
"blk.3.ffn_up.weight": "2a945f47090df9cb16f92f1f06c520f156f8e232182eaaed09f257b8947a2a62",
"blk.4.attn_k.bias": "62533c31f0de498187593f238c6597503fef2a92e920cd540a96bc5311b3b2a0",
"blk.4.attn_k.weight": "93e829868bffd980a8e589b9c4566cd81e6ce4296a5f357a2ae93febe1284156",
"blk.4.attn_norm.weight": "9e0aaa4bbdd1389890f8abec20533f3ab16d61b872b1a8dbd623023921c660a9",
"blk.4.attn_output.weight": "74467d6f44357d67f452ac49da861468b38e98057017bd38bc9a449f9d3538e6",
"blk.4.attn_q.bias": "8e6d9026fd69b314c1773c5946be2e11daf806ef22a5d91d744344fd30c58c59",
"blk.4.attn_q.weight": "e5bfbafd94a4d530f3769f5edbba8cc08d9b5bee8f66ebf4cb54e69bc0b7f63b",
"blk.4.attn_v.bias": "20c570f92022d9905eb85c0e41d1fdb30db22007a9628b51f512f8268d6c34a2",
"blk.4.attn_v.weight": "9638d459d61da03c9dd34dad985e03c43b4f8a5bc9701a82153478329b0517e0",
"blk.4.ffn_down.weight": "9d91b06e89d52f4365dece7eaeec50f81e52cb2407b333248a81e6e2f84c05b8",
"blk.4.ffn_gate.weight": "bf6350a79c6a6ee9146edfd788b88d4a4c2b54db1aa0adcc1464dbba8a84b646",
"blk.4.ffn_norm.weight": "11a70a6b9f7ce336292f4e3a2c6c92d366d4ee4306ad4fdb1870fde107e9cc31",
"blk.4.ffn_up.weight": "64f23f493d02b147a72a59605e6b7dd1c4c74f6813a38a2a60818bd66f697347",
"blk.5.attn_k.bias": "f6c2c279c0ed686f298ad1e5514b5cd882199341f896abbb2c2129d4c64ce9c5",
"blk.5.attn_k.weight": "0e682f75870abf9efaca10dac5f04c580f42820ecf4e234d43af967019acb86f",
"blk.5.attn_norm.weight": "01efae7653705e741932fcd79dff3be643d7e97f4b5719b887835dffe44b3a82",
"blk.5.attn_output.weight": "69e841d00d196acc489cd70bc5ffbbb63530ac5fabb169d40c4fb3a32ebb8ed8",
"blk.5.attn_q.bias": "f3304d76ccd44fed887565857c8e513b1211d89a5d3e81782de507ab3f6fc045",
"blk.5.attn_q.weight": "98612a6b7920a247853ada95c240807d4ca8e43604279e7a2fc9bb265ae40469",
"blk.5.attn_v.bias": "39940a9b353ceed3edfd4a39b985c9520490aa1b9f11749c94fdf6d879d1a259",
"blk.5.attn_v.weight": "839f84b828cf83aecf479a0dc7bc86cce05145ef77dcf29916dc3e0680f5b665",
"blk.5.ffn_down.weight": "1f48cbb0960f15e06ab8a3754ade792995a655856389ddbca629c07e89d1b114",
"blk.5.ffn_gate.weight": "33d8219fce3189e1aab376039896eebd4ad36ebd26a8278cd19b26e4357e4f81",
"blk.5.ffn_norm.weight": "0f4a0f83d37127fa4483f2905cb4f38ef6ddc71584b6cb05632c62a9af313dda",
"blk.5.ffn_up.weight": "22a64a11e5f0a1ff45ca327bf9e1efa258f085ff6a96edc398b7474f725b4514",
"blk.6.attn_k.bias": "baa91df99d4df2d25e8d590bca4e334b97f2d9aa3df8e748fedc8a6188499111",
"blk.6.attn_k.weight": "121f3b9f4b9491996499392e2688a929cafe102a67920b4cb2a039349c43d8eb",
"blk.6.attn_norm.weight": "b4cf987e923d71f2f84c58d20ea8af7576b225bf61952145b489fdd395e3d411",
"blk.6.attn_output.weight": "a112642150a138d54b2a4038042fd33619035a35694771e966f3575856c635d6",
"blk.6.attn_q.bias": "a97ea10469cdfa3fdddf8bad6de683ef99f6170eb8d29d15dcf6bf4bce37c5a3",
"blk.6.attn_q.weight": "d80c787019317a87361de6bbc7df6701357216bdd9b404522cede34a719a5500",
"blk.6.attn_v.bias": "d846269db9cd77ae28da26ba0914cace1b6754bd5301af9c44607085dfcbd2d7",
"blk.6.attn_v.weight": "06567c433e8a391647633291b50828a076ad7c2436106bb9278c60a3f8fccb3b",
"blk.6.ffn_down.weight": "f15f66f56b3c474eac8c6315c5fff07c3e29c6e483d7efd4d303c7f43814be91",
"blk.6.ffn_gate.weight": "47768f89c6da8eefb29adb766ff4eb38c9dfd79320bbc1386248319fcbcf567f",
"blk.6.ffn_norm.weight": "7f8195e6b148212967145fc9d86ce36b699cff0de026042245c2d344f1ef8510",
"blk.6.ffn_up.weight": "53d7707ae4347aadb445289f9f87a008b72df5cb855b00080a605442fdd8edf3",
"blk.7.attn_k.bias": "63e274df3217dde25b8369a383e480fe4f6b403a74385f15ac0b5db71dce2744",
"blk.7.attn_k.weight": "f6fce88602f5945eee09767acbcad387d132614e6da39ae359f2bbf380d94b1f",
"blk.7.attn_norm.weight": "bbf5dc7336c0f9a511afef6bf5efeffd78f1b83940850c3eb7eb20c621b75656",
"blk.7.attn_output.weight": "d9fb907a138396a859cecbfcb377927308dc93c24c7fb52dba5eb59265feadec",
"blk.7.attn_q.bias": "f02ba1318346af77e309f40aee716e2de7ee8cab67e67b17636db9bf40894fb0",
"blk.7.attn_q.weight": "54a691e824be287a61c35c172edc01922ed792d2addeee029afc17ba6c7e11b9",
"blk.7.attn_v.bias": "3a4f182f51e84ce862d558fb2751b91802b65d74596bb14d624808513a8a83ec",
"blk.7.attn_v.weight": "a142fe6e106d3ab484e2dc6f9c72b8fc0a385279dde08deb1ad1fd05ac25deb1",
"blk.7.ffn_down.weight": "8daf7e8c430d183a4d6ab3eb575fafa4b5e31689f68b290c8b370411ad9d0f12",
"blk.7.ffn_gate.weight": "a2a786b45eb660994254b48e2aaf22f3e9821cfb383dee0ba04cc4350a2f8e72",
"blk.7.ffn_norm.weight": "73828bbc8c9610cc139fcf03e96272648cdc291263251fe3a67367408deb69e1",
"blk.7.ffn_up.weight": "e85dd0f63fed449ce16893c5795ea6a050a2d7a66d9534410a227e22c905dafa",
"blk.8.attn_k.bias": "91a752a6e2c364e5ee6a015770fe289aece4911ae6c6bbfe74ac52f465465f93",
"blk.8.attn_k.weight": "99c069e92c43a2efb74e23188256b3cabbbe06399878e681ce203a05d5da378a",
"blk.8.attn_norm.weight": "c76d36d3cc06aa2a9edb1abf9f602bb7ed61ac9d61f8ef7ed736a1e619abe717",
"blk.8.attn_output.weight": "ee5ff156a2625e1f203f65e69b514f9df04bd9a5e82b28e3876e16cf1c6f65c5",
"blk.8.attn_q.bias": "8fbd868a93b330c8b0418b488c5301f42a7eb0c58445a4e515d56777f1d96ed5",
"blk.8.attn_q.weight": "9f20ef86e80098ba52a3a31ebcc315bea3a614dac9cba7ac1db02f156db9b577",
"blk.8.attn_v.bias": "c4813571d5d618742183a7890c0b89cd7f18e210c758f63aad564659bc38a26d",
"blk.8.attn_v.weight": "ea88e1a4cf8bd56e9a88ada427d2b0cd352234827640757ee2a9ed594fb67a53",
"blk.8.ffn_down.weight": "b0d1a7495811580b189aaa3e20ea871d6d01ed7b6c23e59825078ef786944ff2",
"blk.8.ffn_gate.weight": "0a17c0caa0b06721c49b59b2a63a5dcbf744dd1cffa55962b404ba910c658a62",
"blk.8.ffn_norm.weight": "f15f109d4a8e9d1ff7c71fa5bc6373df7ee80c5f7d1de3fa0d4849d747e36bcb",
"blk.8.ffn_up.weight": "bbf4c5c4c5c8a0f9ae8b88e3cc8b86f81b98148722d5a350995af176c0b774f2",
"blk.9.attn_k.bias": "a7f60d962686b8ca60f69643e0e0fa8614688be738fb0b1c6bd54de35c2beb5e",
"blk.9.attn_k.weight": "dd80ce4adb00e338fc04b307e4c18a27071f4ba4397184a24d765e6e4a268ef4",
"blk.9.attn_norm.weight": "721e6487547e2b3986ab4b4e2500ceade59d908bccf4436e1e8031f246deb2bd",
"blk.9.attn_output.weight": "5a800af39107b363861e5f5173483cdcd644d8ac3b0c8a443b9c759d71285db8",
"blk.9.attn_q.bias": "0a19b4925ea8ca8067acc909b058adc327de3874cfc94cc9eb4a106d3f370123",
"blk.9.attn_q.weight": "93e84906684c0c7ede79967236d9fc8344da84a9f1daa04e8295c2c9b6b26a24",
"blk.9.attn_v.bias": "615421f812f821e230ecde4e6da35d868823248355ce7e4e51e2d650ead565f9",
"blk.9.attn_v.weight": "7f4913e289aefd9ceecbdaf9767b1e95303f5d59dd67ecb2cc15768477f4d08e",
"blk.9.ffn_down.weight": "95d1b3933221e87dc4af70dd566daec9498bf358070b8d26f1fc70766a84a152",
"blk.9.ffn_gate.weight": "530f2d04f6a1fbffaaa5f2fbc3a328ebed7b330e3af14b4fc7d8a51b13ad8d42",
"blk.9.ffn_norm.weight": "28077de416217ea1df94b96017bef4cc562ab62e51b1a03a671c70abc29ce52a",
"blk.9.ffn_up.weight": "b87b6190778aaee4695938e24ac6c90dbbee6dce7c5c2ab5bc26ba4564581822"
}

View File

@@ -100,6 +100,8 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
t.Pre = "deepseek-llm"
case "21cde974d587f0d54dc8d56b183cc1e6239600172035c68fbd6d4b9f8da0576e":
t.Pre = "deepseek-coder"
case "1ff7f41064896984db5d1bb6ff64fa4bc29007d08c1b439e505b7392777a319e":
t.Pre = "qwen2"
case "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855":
// noop, empty pretokenizer
default:

View File

@@ -718,23 +718,18 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
func LibraryDirs() []string {
// dependencies can exist wherever we found the runners (e.g. build tree for developers) and relative to the executable
// This can be simplified once we no longer carry runners as payloads
paths := []string{}
appExe, err := os.Executable()
exe, err := os.Executable()
if err != nil {
slog.Warn("failed to lookup executable path", "error", err)
} else {
appRelative := filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe(), "lib", "ollama")
if _, err := os.Stat(appRelative); err == nil {
paths = append(paths, appRelative)
}
return nil
}
rDir := runners.Locate()
if err != nil {
slog.Warn("unable to locate gpu dependency libraries", "error", err)
} else {
paths = append(paths, filepath.Dir(rDir))
lib := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
if _, err := os.Stat(lib); err != nil {
return nil
}
return paths
return []string{lib}
}
func GetSystemInfo() SystemInfo {

View File

@@ -928,14 +928,25 @@ A single JSON object is returned:
POST /api/create
```
Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `modelfile` to the content of the Modelfile rather than just setting `path`. This is a requirement for remote create. Remote model creation must also create any file blobs for fields such as `FROM` and `ADAPTER` explicitly with the server using [Create a Blob](#create-a-blob), setting the field value to the path indicated in the response.
Create a model from:
* another model;
* a safetensors directory; or
* a GGUF file.
If you are creating a model from a safetensors directory or from a GGUF file, you must [create a blob](#push-a-blob) for each of the files and then use the file name and SHA256 digest associated with each blob in the `files` field.
### Parameters
- `model`: name of the model to create
- `modelfile` (optional): contents of the Modelfile
- `from`: (optional) name of an existing model to create the new model from
- `files`: (optional) a dictionary of file names to SHA256 digests of blobs to create the model from
- `adapters`: (optional) a dictionary of file names to SHA256 digests of blobs for LoRA adapters
- `template`: (optional) the prompt template for the model
- `license`: (optional) a string or list of strings containing the license or licenses for the model
- `system`: (optional) a string containing the system prompt for the model
- `parameters`: (optional) a dictionary of parameters for the model (see [Modelfile](./modelfile.md#valid-parameters-and-values) for a list of parameters)
- `messages`: (optional) a list of message objects used to create a conversation
- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
- `path` (optional): path to the Modelfile
- `quantize` (optional): quantize a non-quantized (e.g. float16) model
#### Quantization types
@@ -961,14 +972,15 @@ Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `m
#### Create a new model
Create a new model from a `Modelfile`.
Create a new model from an existing model.
##### Request
```shell
curl http://localhost:11434/api/create -d '{
"model": "mario",
"modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
"from": "llama3.2",
"system": "You are Mario from Super Mario Bros."
}'
```
@@ -999,7 +1011,7 @@ Quantize a non-quantized model.
```shell
curl http://localhost:11434/api/create -d '{
"model": "llama3.1:quantized",
"modelfile": "FROM llama3.1:8b-instruct-fp16",
"from": "llama3.1:8b-instruct-fp16",
"quantize": "q4_K_M"
}'
```
@@ -1019,52 +1031,112 @@ A stream of JSON objects is returned:
{"status":"success"}
```
#### Create a model from GGUF
### Check if a Blob Exists
Create a model from a GGUF file. The `files` parameter should be filled out with the file name and SHA256 digest of the GGUF file you wish to use. Use [/api/blobs/:digest](#push-a-blob) to push the GGUF file to the server before calling this API.
##### Request
```shell
curl http://localhost:11434/api/create -d '{
"model": "my-gguf-model",
"files": {
"test.gguf": "sha256:432f310a77f4650a88d0fd59ecdd7cebed8d684bafea53cbff0473542964f0c3"
}
}'
```
##### Response
A stream of JSON objects is returned:
```
{"status":"parsing GGUF"}
{"status":"using existing layer sha256:432f310a77f4650a88d0fd59ecdd7cebed8d684bafea53cbff0473542964f0c3"}
{"status":"writing manifest"}
{"status":"success"}
```
#### Create a model from a Safetensors directory
The `files` parameter should include a dictionary of the safetensors model's files, mapping each file name to its SHA256 digest. Use [/api/blobs/:digest](#push-a-blob) to push each of the files to the server before calling this API. Files remain in the cache until the Ollama server is restarted.
##### Request
```shell
curl http://localhost:11434/api/create -d '{
"model": "fred",
"files": {
"config.json": "sha256:dd3443e529fb2290423a0c65c2d633e67b419d273f170259e27297219828e389",
"generation_config.json": "sha256:88effbb63300dbbc7390143fbbdd9d9fa50587b37e8bfd16c8c90d4970a74a36",
"special_tokens_map.json": "sha256:b7455f0e8f00539108837bfa586c4fbf424e31f8717819a6798be74bef813d05",
"tokenizer.json": "sha256:bbc1904d35169c542dffbe1f7589a5994ec7426d9e5b609d07bab876f32e97ab",
"tokenizer_config.json": "sha256:24e8a6dc2547164b7002e3125f10b415105644fcf02bf9ad8b674c87b1eaaed6",
"model.safetensors": "sha256:1ff795ff6a07e6a68085d206fb84417da2f083f68391c2843cd2b8ac6df8538f"
}
}'
```
##### Response
A stream of JSON objects is returned:
```
{"status":"converting model"}
{"status":"creating new layer sha256:05ca5b813af4a53d2c2922933936e398958855c44ee534858fcfd830940618b6"}
{"status":"using autodetected template llama3-instruct"}
{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
{"status":"writing manifest"}
{"status":"success"}
```
## Check if a Blob Exists
```shell
HEAD /api/blobs/:digest
```
Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not ollama.com.
Ensures that the file blob (Binary Large Object) used when creating a model exists on the server. This checks your Ollama server, not ollama.com.
#### Query Parameters
### Query Parameters
- `digest`: the SHA256 digest of the blob
#### Examples
### Examples
##### Request
#### Request
```shell
curl -I http://localhost:11434/api/blobs/sha256:29fdb92e57cf0827ded04ae6461b5931d01fa595843f55d36f5b275a52087dd2
```
##### Response
#### Response
Return 200 OK if the blob exists, 404 Not Found if it does not.
### Create a Blob
## Push a Blob
```shell
POST /api/blobs/:digest
```
Create a blob from a file on the server. Returns the server file path.
Push a file to the Ollama server to create a "blob" (Binary Large Object).
#### Query Parameters
### Query Parameters
- `digest`: the expected SHA256 digest of the file
#### Examples
### Examples
##### Request
#### Request
```shell
curl -T model.bin -X POST http://localhost:11434/api/blobs/sha256:29fdb92e57cf0827ded04ae6461b5931d01fa595843f55d36f5b275a52087dd2
curl -T model.gguf -X POST http://localhost:11434/api/blobs/sha256:29fdb92e57cf0827ded04ae6461b5931d01fa595843f55d36f5b275a52087dd2
```
##### Response
#### Response
Return 201 Created if the blob was successfully created, 400 Bad Request if the digest used is not expected.
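Both endpoints are keyed by the file's SHA256 digest, which the client must compute before calling them. As a minimal sketch (not part of the API; the `digest` helper is hypothetical), the expected `sha256:<hex>` value can be produced with Go's standard library:
```go
package main

import (
	"crypto/sha256"
	"fmt"
	"io"
	"os"
)

// digest returns the "sha256:<hex>" string the blob endpoints
// expect as the :digest path parameter.
func digest(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	h := sha256.New()
	if _, err := io.Copy(h, f); err != nil {
		return "", err
	}
	return fmt.Sprintf("sha256:%x", h.Sum(nil)), nil
}

func main() {
	d, err := digest("model.gguf")
	if err != nil {
		panic(err)
	}
	fmt.Println(d) // e.g. sha256:29fdb92e57cf0827...
}
```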

View File

@@ -1,15 +1,15 @@
package llm
package ggml
import (
"encoding/binary"
"errors"
"fmt"
"io"
"log/slog"
"slices"
"strings"
"sync"
"github.com/ollama/ollama/util/bufioutil"
"github.com/ollama/ollama/fs/util/bufioutil"
)
type GGML struct {
@@ -19,145 +19,168 @@ type GGML struct {
type model interface {
KV() KV
Tensors() *Tensors
Tensors() Tensors
}
type KV map[string]any
func (kv KV) u64(key string) uint64 {
switch v := kv[key].(type) {
case uint64:
return v
case uint32:
return uint64(v)
case float64:
return uint64(v)
default:
return 0
}
}
func (kv KV) Architecture() string {
if s, ok := kv["general.architecture"].(string); ok {
return s
}
return "unknown"
return kv.String("general.architecture", "unknown")
}
func (kv KV) Kind() string {
if s, ok := kv["general.type"].(string); ok {
return s
}
return "unknown"
return kv.String("general.type", "unknown")
}
func (kv KV) ParameterCount() uint64 {
return kv.u64("general.parameter_count")
return keyValue(kv, "general.parameter_count", uint64(0))
}
func (kv KV) FileType() fileType {
if u64 := kv.u64("general.file_type"); u64 > 0 {
return fileType(uint32(u64))
if t := kv.Uint("general.file_type"); t > 0 {
return fileType(t)
}
return fileTypeUnknown
}
func (kv KV) BlockCount() uint64 {
return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
return uint64(kv.Uint("block_count"))
}
func (kv KV) EmbeddingLength() uint64 {
return uint64(kv.Uint("embedding_length"))
}
func (kv KV) HeadCount() uint64 {
return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
return uint64(kv.Uint("attention.head_count"))
}
func (kv KV) HeadCountKV() uint64 {
if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
return headCountKV
}
return 1
return uint64(kv.Uint("attention.head_count_kv", 1))
}
func (kv KV) EmbeddingHeadCount() uint64 {
if heads := kv.HeadCount(); heads > 0 {
return kv.EmbeddingLength() / kv.HeadCount()
return kv.EmbeddingLength() / heads
}
return 0
}
func (kv KV) EmbeddingHeadCountK() uint64 {
if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
return k
}
return kv.EmbeddingHeadCount()
return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
}
func (kv KV) EmbeddingHeadCountV() uint64 {
if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
return v
}
return kv.EmbeddingHeadCount()
return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
}
func (kv KV) GQA() uint64 {
return kv.HeadCount() / kv.HeadCountKV()
}
func (kv KV) EmbeddingLength() uint64 {
return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
}
func (kv KV) ContextLength() uint64 {
return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
return uint64(kv.Uint("context_length"))
}
func (kv KV) ChatTemplate() string {
s, _ := kv["tokenizer.chat_template"].(string)
return kv.String("tokenizer.chat_template")
}
func (kv KV) String(key string, defaultValue ...string) string {
return keyValue(kv, key, append(defaultValue, "")...)
}
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
return keyValue(kv, key, append(defaultValue, 0)...)
}
func (kv KV) Float(key string, defaultValue ...float32) float32 {
return keyValue(kv, key, append(defaultValue, 0)...)
}
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
r := keyValue(kv, key, &array{})
s := make([]string, r.size)
for i := range r.size {
s[i] = r.values[i].(string)
}
return s
}
type Tensors struct {
Items []*Tensor
Offset uint64
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
r := keyValue(kv, key, &array{})
s := make([]uint32, r.size)
for i := range r.size {
s[i] = uint32(r.values[i].(int32))
}
layers map[string]Layer
layersOnce sync.Once
return s
}
func (ts *Tensors) Layers() map[string]Layer {
ts.layersOnce.Do(func() {
ts.layers = make(map[string]Layer)
for _, t := range ts.Items {
parts := strings.Split(t.Name, ".")
if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
if len(parts) > index+2 {
// blk and mm should have a number after them, join it
parts = append(
[]string{strings.Join(parts[:index+2], ".")},
parts[index+2:]...)
}
}
func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, defaultValue ...T) T {
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
key = kv.Architecture() + "." + key
}
if _, ok := ts.layers[parts[0]]; !ok {
ts.layers[parts[0]] = make(Layer)
}
if val, ok := kv[key]; ok {
return val.(T)
}
ts.layers[parts[0]][strings.Join(parts[1:], ".")] = t
slog.Warn("key not found", "key", key, "default", defaultValue[0])
return defaultValue[0]
}
type Tensors struct {
items []*Tensor
Offset uint64
}
func (s Tensors) Items(prefix ...string) []*Tensor {
if len(prefix) == 0 {
return s.items
}
var items []*Tensor
for _, t := range s.items {
if strings.HasPrefix(t.Name, prefix[0]) {
items = append(items, t)
}
})
}
return ts.layers
return items
}
func (ts Tensors) Layers() map[string]Layer {
layers := make(map[string]Layer)
for _, t := range ts.items {
parts := strings.Split(t.Name, ".")
if i := slices.Index(parts, "blk"); i > 0 {
parts = append([]string{
strings.Join(parts[:i], "."),
strings.Join(parts[i:i+2], "."),
}, parts[i+2:]...)
} else if i == 0 {
parts = append([]string{
strings.Join(parts[i:i+2], "."),
}, parts[i+2:]...)
}
if _, ok := layers[parts[0]]; !ok {
layers[parts[0]] = make(Layer)
}
layers[parts[0]][strings.Join(parts[1:], ".")] = t
}
return layers
}
type Layer map[string]*Tensor
func (l Layer) size() (size uint64) {
func (l Layer) Size() (size uint64) {
for _, t := range l {
size += t.Size()
}
@@ -255,8 +278,6 @@ func (t Tensor) typeSize() uint64 {
return 8
case 29: // IQ1_M
return blockSize/8 + blockSize/16 + blockSize/32
case 30: // BF16
return 2
default:
return 0
}
@@ -295,7 +316,7 @@ const (
var ErrUnsupportedFormat = errors.New("unsupported model format")
func DetectGGMLType(b []byte) string {
func DetectContentType(b []byte) string {
switch binary.LittleEndian.Uint32(b[:4]) {
case FILE_MAGIC_GGML:
return "ggml"
@@ -312,12 +333,12 @@ func DetectGGMLType(b []byte) string {
}
}
// DecodeGGML decodes a GGML model from the given reader.
// Decode decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
if maxArraySize == 0 {
maxArraySize = 1024
}
@@ -331,10 +352,6 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
var c container
switch magic {
case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
return nil, 0, ErrUnsupportedFormat
case FILE_MAGIC_GGLA:
c = &containerGGLA{}
case FILE_MAGIC_GGUF_LE:
c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
case FILE_MAGIC_GGUF_BE:
@@ -530,21 +547,20 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
}
// SupportsKVCacheType checks if the requested cache type is supported
func (ggml GGML) SupportsKVCacheType(cacheType string) bool {
validKVCacheTypes := []string{"f16", "q8_0", "q4_0"}
return slices.Contains(validKVCacheTypes, cacheType)
func (llm GGML) SupportsKVCacheType(cacheType string) bool {
return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
}
// SupportsFlashAttention checks if the model supports flash attention
func (ggml GGML) SupportsFlashAttention() bool {
_, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]
func (llm GGML) SupportsFlashAttention() bool {
_, isEmbedding := llm.KV()[fmt.Sprintf("%s.pooling_type", llm.KV().Architecture())]
if isEmbedding {
return false
}
// Check head counts match and are non-zero
headCountK := ggml.KV().EmbeddingHeadCountK()
headCountV := ggml.KV().EmbeddingHeadCountV()
headCountK := llm.KV().EmbeddingHeadCountK()
headCountV := llm.KV().EmbeddingHeadCountV()
return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}
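For orientation, here is a minimal sketch of the renamed entry point: decode a GGUF file, then read metadata through the new `KV` helpers. The `fs/ggml` import path is an assumption inferred from the `fs/util/bufioutil` import above.
```go
package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/fs/ggml" // assumed path for the relocated package
)

func main() {
	f, err := os.Open("model.gguf")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// A maxArraySize of 0 selects the default cap of 1024 elements.
	g, _, err := ggml.Decode(f, 0)
	if err != nil {
		panic(err)
	}

	kv := g.KV()
	// Uint and friends prepend "<architecture>." to non-general,
	// non-tokenizer keys, so BlockCount reads e.g. "llama.block_count".
	fmt.Println(kv.Architecture(), kv.BlockCount(), kv.ContextLength())
}
```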

View File

@@ -1,4 +1,4 @@
package llm
package ggml
import (
"bytes"
@@ -8,10 +8,9 @@ import (
"fmt"
"io"
"log/slog"
"maps"
"slices"
"strings"
"golang.org/x/exp/maps"
)
type containerGGUF struct {
@@ -110,9 +109,9 @@ func (llm *gguf) KV() KV {
return llm.kv
}
func (llm *gguf) Tensors() *Tensors {
return &Tensors{
Items: llm.tensors,
func (llm *gguf) Tensors() Tensors {
return Tensors{
items: llm.tensors,
Offset: llm.tensorOffset,
}
}
@@ -523,7 +522,7 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
return err
}
keys := maps.Keys(kv)
keys := slices.Collect(maps.Keys(kv))
slices.Sort(keys)
for _, key := range keys {

View File

@@ -1,4 +1,4 @@
package llm
package ggml
import "fmt"
@@ -32,10 +32,9 @@ const (
fileTypeIQ1_S
fileTypeIQ4_NL
fileTypeIQ3_S
fileTypeIQ3_M
fileTypeIQ2_S
fileTypeIQ2_M
fileTypeIQ4_XS
fileTypeIQ2_M
fileTypeIQ1_M
fileTypeBF16
@@ -94,8 +93,6 @@ func ParseFileType(s string) (fileType, error) {
return fileTypeIQ4_NL, nil
case "IQ3_S":
return fileTypeIQ3_S, nil
case "IQ3_M":
return fileTypeIQ3_M, nil
case "IQ2_S":
return fileTypeIQ2_S, nil
case "IQ4_XS":
@@ -163,8 +160,6 @@ func (t fileType) String() string {
return "IQ4_NL"
case fileTypeIQ3_S:
return "IQ3_S"
case fileTypeIQ3_M:
return "IQ3_M"
case fileTypeIQ2_S:
return "IQ2_S"
case fileTypeIQ4_XS:

6
go.mod
View File

@@ -17,12 +17,15 @@ require (
require (
github.com/agnivade/levenshtein v1.1.1
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/dlclark/regexp2 v1.11.4
github.com/emirpasic/gods/v2 v2.0.0-alpha
github.com/google/go-cmp v0.6.0
github.com/mattn/go-runewidth v0.0.14
github.com/nlpodyssey/gopickle v0.3.0
github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
golang.org/x/image v0.22.0
golang.org/x/tools v0.28.0
gonum.org/v1/gonum v0.15.0
)
require (
@@ -42,7 +45,6 @@ require (
github.com/xtgo/set v1.0.0 // indirect
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
gonum.org/v1/gonum v0.15.0 // indirect
gorgonia.org/vecf32 v0.9.0 // indirect
gorgonia.org/vecf64 v0.9.0 // indirect
)
@@ -70,7 +72,7 @@ require (
golang.org/x/arch v0.8.0 // indirect
golang.org/x/crypto v0.31.0
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa
golang.org/x/net v0.25.0 // indirect
golang.org/x/net v0.32.0 // indirect
golang.org/x/sys v0.28.0
golang.org/x/term v0.27.0
golang.org/x/text v0.21.0

8
go.sum
View File

@@ -42,6 +42,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g=
github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
github.com/emirpasic/gods/v2 v2.0.0-alpha h1:dwFlh8pBg1VMOXWGipNMRt8v96dKAIvBehtCt6OtunU=
github.com/emirpasic/gods/v2 v2.0.0-alpha/go.mod h1:W0y4M2dtBB9U5z3YlghmpuUhiaZT2h6yoeE+C1sCp6A=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
@@ -255,8 +257,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI=
golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -307,6 +309,8 @@ golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapK
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
golang.org/x/tools v0.28.0 h1:WuB6qZ4RPCQo5aP3WdKZS7i595EdWqWR8vqJTlwTVK8=
golang.org/x/tools v0.28.0/go.mod h1:dcIOrVd3mfQKTgrDVQHqCPMWy6lnhfhtX3hLXYVLfRw=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

22
grammar/bench_test.go Normal file
View File

@@ -0,0 +1,22 @@
//go:build go1.24
package grammar
import "testing"
func BenchmarkFromSchema(b *testing.B) {
for tt := range testCases(b) {
b.Run("", func(b *testing.B) {
s := []byte(tt.schema)
b.ReportAllocs()
for b.Loop() {
_, err := FromSchema(nil, s)
if err != nil {
b.Fatalf("GrammarFromSchema: %v", err)
}
}
})
return
}
}

227
grammar/grammar.go Normal file
View File

@@ -0,0 +1,227 @@
package grammar
import (
"bytes"
"encoding/json"
"fmt"
"iter"
"strconv"
"github.com/ollama/ollama/grammar/jsonschema"
)
const jsonTerms = `
# Unicode
#
# Unicode characters can be specified directly in the grammar, for example
# hiragana ::= [ぁ-ゟ], or with escapes: 8-bit (\xXX), 16-bit (\uXXXX) or 32-bit
# (\UXXXXXXXX).
unicode ::= \x{hex}{2} | \u{hex}{4} | \U{hex}{8}
# JSON grammar from RFC 7159
null ::= "null"
object ::= "{" (kv ("," kv)*)? "}"
array ::= "[" (value ("," value)*)? "]"
kv ::= string ":" value
integer ::= "0" | [1-9] [0-9]*
number ::= "-"? integer frac? exp?
frac ::= "." [0-9]+
exp ::= ("e" | "E") ("+" | "-")? [0-9]+
string ::= "\"" char* "\""
escape ::= ["/" | "b" | "f" | "n" | "r" | "t" | unicode]
char ::= [^"\\] | escape
space ::= (" " | "\t" | "\n" | "\r")*
hex ::= [0-9] | [a-f] | [A-F]
boolean ::= "true" | "false"
value ::= object | array | string | number | boolean | "null"
# User-defined
`
// FromSchema generates a grammar from a JSON schema.
func FromSchema(buf []byte, jsonSchema []byte) ([]byte, error) {
var s *jsonschema.Schema
if err := json.Unmarshal(jsonSchema, &s); err != nil {
return nil, err
}
var g builder
// "root" is the only rule that is guaranteed to exist, so we start
// with its length for padding, and then adjust it as we go.
g.pad = len("root")
for id := range dependencies("root", s) {
g.pad = max(g.pad, len(id))
}
g.b.WriteString(jsonTerms)
ids := make(map[*jsonschema.Schema]string)
for id, s := range dependencies("root", s) {
ids[s] = id
g.define(id)
if err := fromSchema(&g, ids, s); err != nil {
return nil, err
}
}
g.define("root")
if err := fromSchema(&g, ids, s); err != nil {
return nil, err
}
g.define("") // finalize the last rule
return g.b.Bytes(), nil
}
func fromSchema(g *builder, ids map[*jsonschema.Schema]string, s *jsonschema.Schema) error {
switch typ := s.EffectiveType(); typ {
case "array":
if len(s.PrefixItems) == 0 && s.Items == nil {
g.u("array")
} else {
g.q("[")
for i, s := range s.PrefixItems {
if i > 0 {
g.q(",")
}
g.u(ids[s])
}
if s.Items != nil {
g.u("(")
if len(s.PrefixItems) > 0 {
g.q(",")
}
g.u(ids[s.Items])
g.u(")*")
}
g.q("]")
}
case "object":
if len(s.Properties) == 0 {
g.u("object")
} else {
g.q("{")
for i, p := range s.Properties {
name := ids[p]
if i > 0 {
g.q(",")
}
g.q(p.Name)
g.q(":")
g.u(name)
}
g.q("}")
}
case "number":
buildConstrainedNumber(g, s)
case "string":
if len(s.Enum) == 0 {
g.u("string")
} else {
g.u("(")
for i, e := range s.Enum {
if i > 0 {
g.q("|")
}
g.q(string(e))
}
g.u(")")
}
case "boolean", "value", "null", "integer":
g.u(typ)
default:
return fmt.Errorf("%s: unsupported type %q", s.Name, typ)
}
return nil
}
// dependencies returns a sequence of all child dependencies of the schema in
// post-order.
//
// The first value is the id/pointer to the dependency, and the second value
// is the schema.
func dependencies(id string, s *jsonschema.Schema) iter.Seq2[string, *jsonschema.Schema] {
return func(yield func(string, *jsonschema.Schema) bool) {
for i, p := range s.Properties {
id := fmt.Sprintf("%s_%d", id, i)
for did, d := range dependencies(id, p) {
if !yield(did, d) {
return
}
}
if !yield(id, p) {
return
}
}
for i, p := range s.PrefixItems {
id := fmt.Sprintf("tuple_%d", i)
for did, d := range dependencies(id, p) {
id := fmt.Sprintf("%s_%s", id, did)
if !yield(id, d) {
return
}
}
if !yield(id, p) {
return
}
}
if s.Items != nil {
id := fmt.Sprintf("%s_tuple_%d", id, len(s.PrefixItems))
for did, d := range dependencies(id, s.Items) {
if !yield(did, d) {
return
}
}
if !yield(id, s.Items) {
return
}
}
}
}
type builder struct {
b bytes.Buffer
pad int
rules int
items int
}
// define terminates the current rule, if any, and then either starts a new
// rule or does nothing else if the name is empty.
func (b *builder) define(name string) {
if b.rules > 0 {
b.b.WriteString(";\n")
}
if name == "" {
return
}
fmt.Fprintf(&b.b, "% -*s", b.pad, name)
b.b.WriteString(" ::=")
b.rules++
b.items = 0
}
// quote appends a terminal to the current rule.
func (b *builder) q(s string) {
if b.items > 0 {
b.b.WriteString(" ")
}
b.b.WriteString(" ")
b.b.WriteString(strconv.Quote(s))
}
// u appends a non-terminal to the current rule.
func (b *builder) u(s string) {
if b.items > 0 {
b.b.WriteString(" ")
}
b.b.WriteString(" ")
b.b.WriteString(s)
}
func buildConstrainedNumber(b *builder, s *jsonschema.Schema) {
if s.Minimum == 0 && s.Maximum == 0 {
b.u("number")
} else {
// TODO: constrain the emitted rule to [Minimum, Maximum].
b.u("TODO")
}
}
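A short usage sketch, passing `nil` for the first argument the same way the tests below do:
```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/grammar"
)

func main() {
	schema := []byte(`{
		"properties": {
			"answer": {"type": "string"},
			"score": {"type": "number"}
		}
	}`)

	g, err := grammar.FromSchema(nil, schema)
	if err != nil {
		panic(err)
	}
	// The output begins with the shared jsonTerms rules, followed by
	// one rule per property ("root_0", "root_1") and a final "root".
	fmt.Printf("%s", g)
}
```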

75
grammar/grammar_test.go Normal file
View File

@@ -0,0 +1,75 @@
package grammar
import (
"bufio"
"cmp"
"iter"
"strings"
"testing"
_ "embed"
"github.com/ollama/ollama/grammar/internal/diff"
)
func TestFromSchema(t *testing.T) {
for tt := range testCases(t) {
t.Run(tt.name, func(t *testing.T) {
g, err := FromSchema(nil, []byte(tt.schema))
if err != nil {
t.Fatalf("FromSchema: %v", err)
}
got := string(g)
got = strings.TrimPrefix(got, jsonTerms)
if got != tt.want {
t.Logf("schema:\n%s", tt.schema)
t.Fatal(string(diff.Diff("got", []byte(got), "want", []byte(tt.want))))
}
})
}
}
type testCase struct {
name string
schema string
want string
}
//go:embed testdata/schemas.txt
var tests string
func testCases(t testing.TB) iter.Seq[testCase] {
t.Helper()
return func(yield func(testCase) bool) {
t.Helper()
sc := bufio.NewScanner(strings.NewReader(tests))
name := ""
for sc.Scan() {
line := strings.TrimSpace(sc.Text())
if line == "" {
name = ""
continue
}
if line[0] == '#' {
name = cmp.Or(name, strings.TrimSpace(line[1:]))
continue
}
s := sc.Text()
g := ""
for sc.Scan() {
line = strings.TrimSpace(sc.Text())
if line == "" || line[0] == '#' {
break
}
g += sc.Text() + "\n"
}
if !yield(testCase{name, s, g}) {
return
}
name = strings.TrimSpace(strings.TrimPrefix(line, "#"))
}
if err := sc.Err(); err != nil {
t.Fatalf("error reading tests: %v", err)
}
}
}

View File

@@ -0,0 +1,261 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package diff
import (
"bytes"
"fmt"
"sort"
"strings"
)
// A pair is a pair of values tracked for both the x and y side of a diff.
// It is typically a pair of line indexes.
type pair struct{ x, y int }
// Diff returns an anchored diff of the two texts old and new
// in the “unified diff” format. If old and new are identical,
// Diff returns a nil slice (no output).
//
// Unix diff implementations typically look for a diff with
// the smallest number of lines inserted and removed,
// which can in the worst case take time quadratic in the
// number of lines in the texts. As a result, many implementations
// either can be made to run for a long time or cut off the search
// after a predetermined amount of work.
//
// In contrast, this implementation looks for a diff with the
// smallest number of “unique” lines inserted and removed,
// where unique means a line that appears just once in both old and new.
// We call this an “anchored diff” because the unique lines anchor
// the chosen matching regions. An anchored diff is usually clearer
// than a standard diff, because the algorithm does not try to
// reuse unrelated blank lines or closing braces.
// The algorithm also guarantees to run in O(n log n) time
// instead of the standard O(n²) time.
//
// Some systems call this approach a “patience diff,” named for
// the “patience sorting” algorithm, itself named for a solitaire card game.
// We avoid that name for two reasons. First, the name has been used
// for a few different variants of the algorithm, so it is imprecise.
// Second, the name is frequently interpreted as meaning that you have
// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm,
// when in fact the algorithm is faster than the standard one.
func Diff(oldName string, old []byte, newName string, new []byte) []byte {
if bytes.Equal(old, new) {
return nil
}
x := lines(old)
y := lines(new)
// Print diff header.
var out bytes.Buffer
fmt.Fprintf(&out, "diff %s %s\n", oldName, newName)
fmt.Fprintf(&out, "--- %s\n", oldName)
fmt.Fprintf(&out, "+++ %s\n", newName)
// Loop over matches to consider,
// expanding each match to include surrounding lines,
// and then printing diff chunks.
// To avoid setup/teardown cases outside the loop,
// tgs returns a leading {0,0} and trailing {len(x), len(y)} pair
// in the sequence of matches.
var (
done pair // printed up to x[:done.x] and y[:done.y]
chunk pair // start lines of current chunk
count pair // number of lines from each side in current chunk
ctext []string // lines for current chunk
)
for _, m := range tgs(x, y) {
if m.x < done.x {
// Already handled scanning forward from earlier match.
continue
}
// Expand matching lines as far as possible,
// establishing that x[start.x:end.x] == y[start.y:end.y].
// Note that on the first (or last) iteration we may (or definitely do)
// have an empty match: start.x==end.x and start.y==end.y.
start := m
for start.x > done.x && start.y > done.y && x[start.x-1] == y[start.y-1] {
start.x--
start.y--
}
end := m
for end.x < len(x) && end.y < len(y) && x[end.x] == y[end.y] {
end.x++
end.y++
}
// Emit the mismatched lines before start into this chunk.
// (No effect on first sentinel iteration, when start = {0,0}.)
for _, s := range x[done.x:start.x] {
ctext = append(ctext, "-"+s)
count.x++
}
for _, s := range y[done.y:start.y] {
ctext = append(ctext, "+"+s)
count.y++
}
// If we're not at EOF and have too few common lines,
// the chunk includes all the common lines and continues.
const C = 3 // number of context lines
if (end.x < len(x) || end.y < len(y)) &&
(end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) {
for _, s := range x[start.x:end.x] {
ctext = append(ctext, " "+s)
count.x++
count.y++
}
done = end
continue
}
// End chunk with common lines for context.
if len(ctext) > 0 {
n := end.x - start.x
if n > C {
n = C
}
for _, s := range x[start.x : start.x+n] {
ctext = append(ctext, " "+s)
count.x++
count.y++
}
done = pair{start.x + n, start.y + n}
// Format and emit chunk.
// Convert line numbers to 1-indexed.
// Special case: empty file shows up as 0,0 not 1,0.
if count.x > 0 {
chunk.x++
}
if count.y > 0 {
chunk.y++
}
fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y)
for _, s := range ctext {
out.WriteString(s)
}
count.x = 0
count.y = 0
ctext = ctext[:0]
}
// If we reached EOF, we're done.
if end.x >= len(x) && end.y >= len(y) {
break
}
// Otherwise start a new chunk.
chunk = pair{end.x - C, end.y - C}
for _, s := range x[chunk.x:end.x] {
ctext = append(ctext, " "+s)
count.x++
count.y++
}
done = end
}
return out.Bytes()
}
// lines returns the lines in the file x, including newlines.
// If the file does not end in a newline, one is supplied
// along with a warning about the missing newline.
func lines(x []byte) []string {
l := strings.SplitAfter(string(x), "\n")
if l[len(l)-1] == "" {
l = l[:len(l)-1]
} else {
// Treat last line as having a message about the missing newline attached,
// using the same text as BSD/GNU diff (including the leading backslash).
l[len(l)-1] += "\n\\ No newline at end of file\n"
}
return l
}
// tgs returns the pairs of indexes of the longest common subsequence
// of unique lines in x and y, where a unique line is one that appears
// once in x and once in y.
//
// The longest common subsequence algorithm is as described in
// Thomas G. Szymanski, “A Special Case of the Maximal Common
// Subsequence Problem,” Princeton TR #170 (January 1975),
// available at https://research.swtch.com/tgs170.pdf.
func tgs(x, y []string) []pair {
// Count the number of times each string appears in x and y.
// We only care about 0, 1, many, counted as 0, -1, -2
// for the x side and 0, -4, -8 for the y side.
// Using negative numbers now lets us distinguish positive line numbers later.
m := make(map[string]int)
for _, s := range x {
if c := m[s]; c > -2 {
m[s] = c - 1
}
}
for _, s := range y {
if c := m[s]; c > -8 {
m[s] = c - 4
}
}
// Now unique strings can be identified by m[s] = -1+-4.
//
// Gather the indexes of those strings in x and y, building:
// xi[i] = increasing indexes of unique strings in x.
// yi[i] = increasing indexes of unique strings in y.
// inv[i] = index j such that x[xi[i]] = y[yi[j]].
var xi, yi, inv []int
for i, s := range y {
if m[s] == -1+-4 {
m[s] = len(yi)
yi = append(yi, i)
}
}
for i, s := range x {
if j, ok := m[s]; ok && j >= 0 {
xi = append(xi, i)
inv = append(inv, j)
}
}
// Apply Algorithm A from Szymanski's paper.
// In those terms, A = J = inv and B = [0, n).
// We add sentinel pairs {0,0}, and {len(x),len(y)}
// to the returned sequence, to help the processing loop.
J := inv
n := len(xi)
T := make([]int, n)
L := make([]int, n)
for i := range T {
T[i] = n + 1
}
for i := range n {
k := sort.Search(n, func(k int) bool {
return T[k] >= J[i]
})
T[k] = J[i]
L[i] = k + 1
}
k := 0
for _, v := range L {
if k < v {
k = v
}
}
seq := make([]pair, 2+k)
seq[1+k] = pair{len(x), len(y)} // sentinel at end
lastj := n
for i := n - 1; i >= 0; i-- {
if L[i] == k && J[i] < lastj {
seq[k] = pair{xi[i], yi[J[i]]}
k--
}
}
seq[0] = pair{0, 0} // sentinel at start
return seq
}
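The package is internal to the module, so only ollama's own packages can import it; `grammar_test.go` above uses it for test failure output. A minimal sketch of that pattern (the `assertEqual` helper is hypothetical):
```go
package grammar

import (
	"testing"

	"github.com/ollama/ollama/grammar/internal/diff"
)

// assertEqual mirrors the comparison pattern in grammar_test.go:
// Diff returns nil when the inputs are byte-identical and a
// unified diff otherwise.
func assertEqual(t *testing.T, got, want []byte) {
	t.Helper()
	if d := diff.Diff("got", got, "want", want); d != nil {
		t.Fatal(string(d))
	}
}
```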

View File

@@ -0,0 +1,44 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package diff
import (
"bytes"
"path/filepath"
"testing"
"golang.org/x/tools/txtar"
)
func clean(text []byte) []byte {
text = bytes.ReplaceAll(text, []byte("$\n"), []byte("\n"))
text = bytes.TrimSuffix(text, []byte("^D\n"))
return text
}
func Test(t *testing.T) {
files, _ := filepath.Glob("testdata/*.txt")
if len(files) == 0 {
t.Fatalf("no testdata")
}
for _, file := range files {
t.Run(filepath.Base(file), func(t *testing.T) {
a, err := txtar.ParseFile(file)
if err != nil {
t.Fatal(err)
}
if len(a.Files) != 3 || a.Files[2].Name != "diff" {
t.Fatalf("%s: want three files, third named \"diff\"", file)
}
diffs := Diff(a.Files[0].Name, clean(a.Files[0].Data), a.Files[1].Name, clean(a.Files[1].Data))
want := clean(a.Files[2].Data)
if !bytes.Equal(diffs, want) {
t.Fatalf("%s: have:\n%s\nwant:\n%s\n%s", file,
diffs, want, Diff("have", diffs, "want", want))
}
})
}
}

View File

@@ -0,0 +1,13 @@
-- old --
-- new --
a
b
c
-- diff --
diff old new
--- old
+++ new
@@ -0,0 +1,3 @@
+a
+b
+c

View File

@@ -0,0 +1,13 @@
-- old --
a
b
c
-- new --
-- diff --
diff old new
--- old
+++ new
@@ -1,3 +0,0 @@
-a
-b
-c

View File

@@ -0,0 +1,35 @@
Example from Hunt and McIlroy, “An Algorithm for Differential File Comparison.”
https://www.cs.dartmouth.edu/~doug/diff.pdf
-- old --
a
b
c
d
e
f
g
-- new --
w
a
b
x
y
z
e
-- diff --
diff old new
--- old
+++ new
@@ -1,7 +1,7 @@
+w
a
b
-c
-d
+x
+y
+z
e
-f
-g

40
grammar/internal/diff/testdata/dups.txt vendored Normal file
View File

@@ -0,0 +1,40 @@
-- old --
a
b
c
d
e
f
-- new --
a
B
C
d
e
f
-- diff --
diff old new
--- old
+++ new
@@ -1,8 +1,8 @@
a
$
-b
-
-c
+B
+
+C
$
d
$

38
grammar/internal/diff/testdata/end.txt vendored Normal file
View File

@@ -0,0 +1,38 @@
-- old --
1
2
3
4
5
6
7
eight
nine
ten
eleven
-- new --
1
2
3
4
5
6
7
8
9
10
-- diff --
diff old new
--- old
+++ new
@@ -5,7 +5,6 @@
5
6
7
-eight
-nine
-ten
-eleven
+8
+9
+10

View File

@@ -0,0 +1,9 @@
-- old --
a
b
c^D
-- new --
a
b
c^D
-- diff --

18
grammar/internal/diff/testdata/eof1.txt vendored Normal file
View File

@@ -0,0 +1,18 @@
-- old --
a
b
c
-- new --
a
b
c^D
-- diff --
diff old new
--- old
+++ new
@@ -1,3 +1,3 @@
a
b
-c
+c
\ No newline at end of file

18
grammar/internal/diff/testdata/eof2.txt vendored Normal file
View File

@@ -0,0 +1,18 @@
-- old --
a
b
c^D
-- new --
a
b
c
-- diff --
diff old new
--- old
+++ new
@@ -1,3 +1,3 @@
a
b
-c
\ No newline at end of file
+c

62
grammar/internal/diff/testdata/long.txt vendored Normal file
View File

@@ -0,0 +1,62 @@
-- old --
1
2
3
4
5
6
7
8
9
10
11
12
13
14
14½
15
16
17
18
19
20
-- new --
1
2
3
4
5
6
8
9
10
11
12
13
14
17
18
19
20
-- diff --
diff old new
--- old
+++ new
@@ -4,7 +4,6 @@
4
5
6
-7
8
9
10
@@ -12,9 +11,6 @@
12
13
14
-14½
-15
-16
17
18
19

View File

@@ -0,0 +1,5 @@
-- old --
hello world
-- new --
hello world
-- diff --

View File

@@ -0,0 +1,34 @@
-- old --
e
pi
4
5
6
7
8
9
10
-- new --
1
2
3
4
5
6
7
8
9
10
-- diff --
diff old new
--- old
+++ new
@@ -1,5 +1,6 @@
-e
-pi
+1
+2
+3
4
5
6

40
grammar/internal/diff/testdata/triv.txt vendored Normal file
View File

@@ -0,0 +1,40 @@
Another example from Hunt and McIlroy,
“An Algorithm for Differential File Comparison.”
https://www.cs.dartmouth.edu/~doug/diff.pdf
Anchored diff gives up on finding anything,
since there are no unique lines.
-- old --
a
b
c
a
b
b
a
-- new --
c
a
b
a
b
c
-- diff --
diff old new
--- old
+++ new
@@ -1,7 +1,6 @@
-a
-b
-c
-a
-b
-b
-a
+c
+a
+b
+a
+b
+c

View File

@@ -0,0 +1,171 @@
package jsonschema
import (
"bytes"
"encoding/json"
"errors"
)
// Schema holds a JSON schema.
type Schema struct {
// Name is the name of the property. For the parent/root property, this
// is "root". For child properties, this is the name of the property.
Name string `json:"-"`
// Type is the type of the property.
//
// TODO: Union types (e.g. make this a []string).
Type string
// PrefixItems is a list of schemas for each item in a tuple. By
// default, the tuple is "closed" unless Items is set to true or a
// valid Schema.
PrefixItems []*Schema
// Items is the schema for each item in a list.
//
// If it is missing, or its JSON value is "null" or "false", it is nil.
// If the JSON value is "true", it is set to the empty Schema. If the
// JSON value is an object, it will be decoded as a Schema.
Items *Schema
// MinItems specifies the minimum number of items allowed in a list.
MinItems int
// MaxItems specifies the maximum number of items allowed in a list.
MaxItems int
// Properties is the schema for each property of an object.
Properties []*Schema
// Format is the format of the property. This is used to validate the
// property against a specific format.
//
// It is the caller's responsibility to validate the property against
// the format.
Format string
// Minimum specifies the minimum value for numeric properties.
Minimum float64
// Maximum specifies the maximum value for numeric properties.
Maximum float64
// Enum is a list of valid values for the property.
Enum []json.RawMessage
}
func (s *Schema) UnmarshalJSON(data []byte) error {
type S Schema
w := struct {
Properties props
Items items
*S
}{
S: (*S)(s),
}
if err := json.Unmarshal(data, &w); err != nil {
return err
}
if w.Items.set {
s.Items = &w.Items.Schema
}
s.Properties = w.Properties
return nil
}
type items struct {
Schema
set bool
}
func (s *items) UnmarshalJSON(data []byte) error {
switch b := data[0]; b {
case 't':
*s = items{set: true}
case '{':
type I items
if err := json.Unmarshal(data, (*I)(s)); err != nil {
return err
}
s.set = true
case 'n', 'f':
default:
return errors.New("invalid Items")
}
return nil
}
// EffectiveType returns the effective type of the schema. If the Type field is
// not empty, it is returned; otherwise:
//
// - If the schema has both Properties and Items, it returns an empty string.
// - If the schema has Properties, it returns "object".
// - If the schema has Items, it returns "array".
// - If the schema has neither Properties nor Items, it returns "value".
//
// The returned string is never empty.
func (d *Schema) EffectiveType() string {
if d.Type == "" {
if len(d.Properties) > 0 {
return "object"
}
if len(d.PrefixItems) > 0 || d.Items != nil {
return "array"
}
return "value"
}
return d.Type
}
// props is an ordered list of properties. The order of the properties
// is the order in which they were defined in the schema.
type props []*Schema
var _ json.Unmarshaler = (*props)(nil)
func (v *props) UnmarshalJSON(data []byte) error {
if len(data) == 0 {
return nil
}
if data[0] != '{' {
return errors.New("expected object")
}
d := json.NewDecoder(bytes.NewReader(data))
// TODO(bmizerany): Consider DisallowUnknownFields. Currently, we, like
// llama.cpp, ignore unknown fields, which could lead to unexpected
// behavior for clients of this package, since they may not be aware
// that "additionalFields", "itemsPrefix", etc., are being ignored.
//
// For now, just do what llama.cpp does.
t, err := d.Token()
if err != nil {
return err
}
if t != json.Delim('{') {
return errors.New("expected object")
}
for d.More() {
// Use the first token (map key) as the property name, then
// decode the rest of the object fields into a Schema and
// append.
t, err := d.Token()
if err != nil {
return err
}
if t == json.Delim('}') {
return nil
}
s := &Schema{
Name: t.(string),
}
if err := d.Decode(s); err != nil {
return err
}
*v = append(*v, s)
}
return nil
}
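
Taken together, the ordered `props` decoder and `EffectiveType` make it easy to traverse a schema in source order. As a minimal sketch — a hypothetical `Walk` helper in the same package, not part of the actual API:

```go
package jsonschema

import "fmt"

// Walk prints one line per schema node with its effective type, visiting
// properties in the order they were defined. Hypothetical helper for
// illustration; it is not part of this package.
func Walk(s *Schema, indent string) {
	name := s.Name
	if name == "" {
		name = "(anon)" // the root and tuple/array item schemas are unnamed
	}
	fmt.Printf("%s%s: %s\n", indent, name, s.EffectiveType())
	for _, p := range s.Properties {
		Walk(p, indent+"  ")
	}
	for _, p := range s.PrefixItems {
		Walk(p, indent+"  ")
	}
	if s.Items != nil {
		Walk(s.Items, indent+"  ")
	}
}
```

Calling `Walk(s, "")` on an unmarshaled schema lists properties in exactly the order they appeared in the JSON — the guarantee the token-level decoder above exists to provide.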


@@ -0,0 +1,104 @@
package jsonschema
import (
"encoding/json"
"reflect"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
)
const testSchemaBasic = `
{
"properties": {
"tupleClosedEmpty": { "prefixItems": [] },
"tupleClosedMissing": { "prefixItems": [{}] },
"tupleClosedNull": { "prefixItems": [{}], "items": null },
"tupleClosedFalse": { "prefixItems": [{}], "items": false },
"tupleOpenTrue": { "prefixItems": [{}], "items": true },
"tupleOpenEmpty": { "prefixItems": [{}], "items": {} },
"tupleOpenTyped": { "prefixItems": [{}], "items": {"type": "boolean"} },
"tupleOpenMax": { "prefixItems": [{}], "items": true, "maxItems": 3},
"array": { "items": {"type": "number"} },
"null": { "type": "null" },
"string": { "type": "string" },
"boolean": { "type": "boolean" }
}
}
`
func TestSchemaUnmarshal(t *testing.T) {
var got *Schema
if err := json.Unmarshal([]byte(testSchemaBasic), &got); err != nil {
t.Fatalf("Unmarshal: %v", err)
}
want := &Schema{
Properties: []*Schema{
{Name: "tupleClosedEmpty", PrefixItems: []*Schema{}, Items: nil},
{Name: "tupleClosedMissing", PrefixItems: []*Schema{{}}, Items: nil},
{Name: "tupleClosedNull", PrefixItems: []*Schema{{}}, Items: nil},
{Name: "tupleClosedFalse", PrefixItems: []*Schema{{}}, Items: nil},
{Name: "tupleOpenTrue", PrefixItems: []*Schema{{}}, Items: &Schema{}},
{Name: "tupleOpenEmpty", PrefixItems: []*Schema{{}}, Items: &Schema{}},
{Name: "tupleOpenTyped", PrefixItems: []*Schema{{}}, Items: &Schema{Type: "boolean"}},
{Name: "tupleOpenMax", PrefixItems: []*Schema{{}}, Items: &Schema{}, MaxItems: 3},
{Name: "array", Items: &Schema{Type: "number"}},
{Name: "null", Type: "null"},
{Name: "string", Type: "string"},
{Name: "boolean", Type: "boolean"},
},
}
if diff := cmp.Diff(want, got); diff != "" {
t.Errorf("(-want, +got)\n%s", diff)
}
}
func TestEffectiveType(t *testing.T) {
const schema = `
{"properties": {
"o": {"type": "object"},
"a": {"type": "array"},
"n": {"type": "number"},
"s": {"type": "string"},
"z": {"type": "null"},
"b": {"type": "boolean"},
"t0": {"prefixItems": [{}], "items": {"type": "number"}},
"t1": {"items": {"type": "number"}, "maxItems": 3},
"v": {"maxItems": 3}
}}
`
var s *Schema
if err := json.Unmarshal([]byte(schema), &s); err != nil {
t.Fatalf("json.Unmarshal: %v", err)
}
var got []string
for _, p := range s.Properties {
got = append(got, p.EffectiveType())
}
want := strings.Fields(`
object
array
number
string
null
boolean
array
array
value
`)
if !reflect.DeepEqual(want, got) {
t.Errorf("\ngot:\n\t%v\nwant:\n\t%v", got, want)
}
}

grammar/testdata/schemas.txt vendored Normal file

@@ -0,0 +1,76 @@
# This file holds tests for JSON schema to EBNF grammar conversions.
#
# Each test is a JSON schema followed by the expected EBNF grammar, and MAY be
# preceded by a comment that describes the test (e.g. the test name). If no
# comment is present, the test name is the test's number in the file (e.g.
# "#0", "#1", etc.).
#
# Blank lines separate tests. Comments can be added anywhere in the file, but
# they must start with a '#' character at the beginning of the line.
# default
{}
root ::= value;

{"properties": {}}
root ::= value;

# array
{"properties": {"a": {"type": "array", "items": {"type": "string"}}}}
root_0_tuple_0 ::= string;
root_0 ::= "[" ( root_0_tuple_0 )* "]";
root ::= "{" "a" ":" root_0 "}";

# array with nested array
{"type": "array", "items": {"type": "array", "items": {"type": "string"}}}
root_tuple_0_tuple_0 ::= string;
root_tuple_0 ::= "[" ( root_tuple_0_tuple_0 )* "]";
root ::= "[" ( root_tuple_0 )* "]";

# object
{"properties": {"e": {}}}
root_0 ::= value;
root ::= "{" "e" ":" root_0 "}";

# object with nested object
{"properties": {"o": {"type": "object", "properties": {"e": {}}}}}
root_0_0 ::= value;
root_0 ::= "{" "e" ":" root_0_0 "}";
root ::= "{" "o" ":" root_0 "}";

# boolean
{"type": "boolean"}
root ::= boolean;

# number
{"properties": {"n": {"type": "number", "minimum": 123, "maximum": 4567}}}
root_0 ::= number;
root ::= "{" "n" ":" root_0 "}";

# string
{"type": "string"}
root ::= string;

# string with enum
{"type": "string", "enum": ["a", "b", "c"]}
root ::= ( "\"a\"" "|" "\"b\"" "|" "\"c\"" );

# spaces in key
{"properties": {"a b": {}}}
root_0 ::= value;
root ::= "{" "a b" ":" root_0 "}";

# issue7978
{ "type": "object", "properties": { "steps": { "type": "array", "items": { "type": "object", "properties": { "explanation": { "type": "string" }, "output": { "type": "string" } }, "required": [ "explanation", "output" ], "additionalProperties": false } }, "final_answer": { "type": "string" } }, "required": [ "steps", "final_answer" ], "additionalProperties": false }
root_0_tuple_0_0 ::= string;
root_0_tuple_0_1 ::= string;
root_0_tuple_0 ::= "{" "explanation" ":" root_0_tuple_0_0 "," "output" ":" root_0_tuple_0_1 "}";
root_0 ::= "[" ( root_0_tuple_0 )* "]";
root_1 ::= string;
root ::= "{" "steps" ":" root_0 "," "final_answer" ":" root_1 "}";

# !! # special characters in key
# !! {"properties": {"a!b": {}}}
# !! !invalid character '!' in key
# !!
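
A sketch of how a harness could split this fixture into cases, going only by the format rules stated in the header (blank-line separators, '#' comments, one schema line, then grammar lines). This is illustrative and not the repository's actual test loader:

```go
package main

import (
	"fmt"
	"strings"
)

// testCase is one schema→grammar pair in the fixture format described
// in the file's header comment.
type testCase struct {
	name    string // comment text, or "#N" when no comment is given
	schema  string // first non-comment line of the block: the JSON schema
	grammar string // remaining lines: the expected EBNF grammar
}

func parseCases(fixture string) []testCase {
	var cases []testCase
	for i, block := range strings.Split(strings.TrimSpace(fixture), "\n\n") {
		var comments, body []string
		for _, line := range strings.Split(block, "\n") {
			if strings.HasPrefix(line, "#") {
				comments = append(comments, strings.TrimSpace(line[1:]))
			} else {
				body = append(body, line)
			}
		}
		if len(body) == 0 {
			continue // comment-only block (e.g. disabled "# !!" tests)
		}
		name := fmt.Sprintf("#%d", i)
		if len(comments) > 0 {
			name = strings.Join(comments, " ")
		}
		cases = append(cases, testCase{
			name:    name,
			schema:  body[0],
			grammar: strings.Join(body[1:], "\n"),
		})
	}
	return cases
}

func main() {
	const fixture = "# boolean\n{\"type\": \"boolean\"}\nroot ::= boolean;\n\n{}\nroot ::= value;"
	for _, c := range parseCases(fixture) {
		fmt.Printf("%s: %s => %q\n", c.name, c.schema, c.grammar)
	}
}
```

Running the `main` above prints the named `boolean` case followed by the unnamed one as `#1`.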


@@ -37,8 +37,7 @@ go build -tags avx .
```shell
# go doesn't recognize `-mfma` as a valid compiler flag
# see https://github.com/golang/go/issues/17895
go env -w "CGO_CFLAGS_ALLOW=-mfma|-mf16c"
go env -w "CGO_CXXFLAGS_ALLOW=-mfma|-mf16c"
go env -w "CGO_CPPFLAGS_ALLOW=-mfma|-mf16c"
go build -tags=avx,avx2 .
```

llama/amx.h vendored

@@ -1,34 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "ggml-backend.h"
#include "ggml-cpu-impl.h"
// GGML internal header
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
#endif

llama/ggml-blas.h vendored

@@ -1,51 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
// number of threads used for conversion to float
// for openblas and blis, this will also set the number of threads used for blas operations
GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
#ifdef __cplusplus
}
#endif


@@ -1,34 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include "ggml-cpu-traits.h"
#include "ggml.h"
// GGML internal header
ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);


@@ -1,64 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include "ggml-backend-impl.h"
#include "ggml-cpu-impl.h"
#include "ggml.h"
#ifdef __cplusplus
# include <vector>
extern "C" {
#endif
// return true if op part of extra "accelerator"
bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);
#ifdef __cplusplus
}
namespace ggml::cpu {
// register in tensor->extra
class tensor_traits {
public:
virtual ~tensor_traits();
virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
};
class extra_buffer_type {
public:
virtual ~extra_buffer_type();
virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
};
} // namespace ggml::cpu
// implemented in ggml-cpu.cpp.
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
#endif


@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_ACC_BLOCK_SIZE 256
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,60 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "arange.cuh"
static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
// blockIDx.x: idx of ne0 / BLOCK_SIZE
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
dst[nidx] = start + step * nidx;
}
static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
}
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(dst->type == GGML_TYPE_F32);
float start;
float stop;
float step;
memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
int64_t steps = (int64_t)ceil((stop - start) / step);
GGML_ASSERT(ggml_nelements(dst) == steps);
arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
}


@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_ARANGE_BLOCK_SIZE 256
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,29 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,29 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,35 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,60 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "clamp.cuh"
static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
}
static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
}
void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
float min;
float max;
memcpy(&min, dst->op_params, sizeof(float));
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
}


@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_CLAMP_BLOCK_SIZE 256
void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_CONCAT_BLOCK_SIZE 256
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256
void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,39 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
template<typename T>
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half> to_fp16_cuda_t;
to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);


@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_COUNT_EQUAL_CHUNK_SIZE 128
void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,35 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_CPY_BLOCK_SIZE 64
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);


@@ -1,33 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,29 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,29 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,29 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_GET_ROWS_BLOCK_SIZE 256
void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_IM2COL_BLOCK_SIZE 256
void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,38 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
#define MMV_MAX_ROWS 512
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_cuda_op_mul_mat_vec(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream);


@@ -1,35 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
void ggml_cuda_op_mul_mat_vec_q(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream);


@@ -1,33 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_OPT_STEP_ADAMW_BLOCK_SIZE 256
void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,29 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,32 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_PAD_BLOCK_SIZE 256
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_POOL2D_BLOCK_SIZE 256
void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -1,50 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include "common.cuh"
#include "mmq.cuh"
#include <cstdint>
#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128
static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access.");
static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");
typedef void (*quantize_cuda_t)(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
const ggml_type type_x, cudaStream_t stream);
void quantize_row_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
const ggml_type type_x, cudaStream_t stream);
void quantize_mmq_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
const ggml_type type_x, cudaStream_t stream);


@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_ROPE_BLOCK_SIZE 256
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -1,57 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "scale.cuh"
static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = scale * x[i];
}
static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
}
void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
float scale;
memcpy(&scale, dst->op_params, sizeof(float));
scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
}
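To see the ceiling-division launch above in isolation, here is a small standalone sketch (not part of the diff; the main harness, array size, and managed-memory setup are assumptions for illustration):

// Standalone illustration of the grid-sizing pattern used by scale_f32_cuda.
#include <cstdio>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256 // stands in for CUDA_SCALE_BLOCK_SIZE

__global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return; // threads in the last, partially filled block fall out here
    }
    dst[i] = scale * x[i];
}

int main() {
    const int k = 1000; // deliberately not a multiple of BLOCK_SIZE
    float *x, *dst;
    cudaMallocManaged(&x, k * sizeof(float));
    cudaMallocManaged(&dst, k * sizeof(float));
    for (int i = 0; i < k; ++i) x[i] = 3.0f;

    const int num_blocks = (k + BLOCK_SIZE - 1) / BLOCK_SIZE; // 4 blocks cover 1000 elements
    scale_f32<<<num_blocks, BLOCK_SIZE>>>(x, dst, 0.5f, k);
    cudaDeviceSynchronize();

    printf("dst[%d] = %.1f\n", k - 1, dst[k - 1]); // prints 1.5
    cudaFree(x);
    cudaFree(dst);
}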

@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_SCALE_BLOCK_SIZE 256
void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -1,65 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "sumrows.cuh"
static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
const int row = blockIdx.x;
const int col = threadIdx.x;
float sum = 0.0f;
for (int i = col; i < ncols; i += blockDim.x) {
sum += x[row * ncols + i];
}
sum = warp_reduce_sum(sum);
if (col == 0) {
dst[row] = sum;
}
}
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(nrows, 1, 1);
k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
}
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);
sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
}
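For reference, the kernel above computes exactly the following CPU loop for a contiguous row-major tensor (a sketch, assuming the layout that GGML_ASSERT(ggml_is_contiguous(src0)) enforces); on the GPU the inner loop is strided across WARP_SIZE threads and the partials are folded with warp_reduce_sum before lane 0 stores the result:

// CPU reference for sum_rows_f32_cuda (illustrative, not part of the diff).
#include <cstdint>

static void sum_rows_f32_ref(const float * x, float * dst, int64_t ncols, int64_t nrows) {
    for (int64_t r = 0; r < nrows; ++r) {
        float sum = 0.0f;
        for (int64_t c = 0; c < ncols; ++c) {
            sum += x[r * ncols + c]; // same indexing as x[row * ncols + i] in the kernel
        }
        dst[r] = sum; // the value lane 0 writes on the GPU
    }
}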

@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
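Each of the generated files that follow instantiates one (head size, K type, V type) combination of the vectorized flash-attention kernel. Keeping one instantiation per translation unit lets the build compile the cases in parallel and bounds nvcc's memory use per file. The real macro lives in fattn-vec-f16.cuh and is not shown in this diff; the sketch below only illustrates the general shape of such a per-case declaration macro, and the template name and signature are assumptions:

// Hypothetical expansion pattern for a DECL_..._CASE style macro; the actual
// definition in fattn-vec-f16.cuh may differ.
#define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V)            \
    template void flash_attn_vec_case<D, type_K, type_V>(     \
        ggml_backend_cuda_context & ctx, ggml_tensor * dst);

// Each generated .cu file then contains exactly one such line, e.g.:
// DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);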

@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);

@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);

@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);

@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);

@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);

@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);

@@ -1,31 +0,0 @@
/**
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);

Some files were not shown because too many files have changed in this diff.