Remove llama.cpp submodule and shift new build to top

Move Go code out of llm package
2026-01-15 02:50:25 -05:00 · 2024-10-23 22:06:01 -07:00 · 2024-10-23 12:38:11 -07:00
465 changed files with 42867 additions and 65887 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -1,9 +1,5 @@
 name: release

-env:
-  ROCM_WINDOWS_URL: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
-  MSYS2_URL: https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe
-
 on:
  push:
    tags:
@@ -12,7 +8,7 @@ on:
 jobs:
  # Full build of the Mac assets
  build-darwin:
-    runs-on: macos-13
+    runs-on: macos-12
    environment: release
    steps:
      - uses: actions/checkout@v4
@@ -43,8 +39,8 @@ jobs:
          APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
          APPLE_TEAM_ID: ${{ vars.APPLE_TEAM_ID }}
          APPLE_ID: ${{ vars.APPLE_ID }}
-          SDKROOT: /Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
-          DEVELOPER_DIR: /Applications/Xcode_14.1.0.app/Contents/Developer
+          SDKROOT: /Applications/Xcode_13.4.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
+          DEVELOPER_DIR: /Applications/Xcode_13.4.1.app/Contents/Developer
        run: |
          ./scripts/build_darwin.sh

@@ -64,33 +60,51 @@ jobs:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
      - uses: actions/checkout@v4
-      - name: Set make jobs default
-        run: |
-          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - name: Add msys paths
+      - uses: 'google-github-actions/auth@v2'
+        with:
+          project_id: 'ollama'
+          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
+      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
+      - name: install Windows SDK 8.1 to get signtool
        run: |
-          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-      - name: Install msys2 tools
+          $ErrorActionPreference = "Stop"
+          write-host "downloading SDK"
+          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
+          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
+          write-host "Win SDK 8.1 installed"
+          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
+      - name: install signing plugin
        run: |
-          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
+          $ErrorActionPreference = "Stop"
+          write-host "downloading plugin"
+          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
+          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
+          write-host "Installing plugin"
+          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
+          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
+      - run: go get ./...
      - run: |
+          $gopath=(get-command go).source | split-path -parent
          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
-          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make dist
+          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
+          $env:PATH="$gopath;$env:PATH"
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
+          make -j $cores
        name: make
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cpu
          path: |
+            build/**/*
+            build/**/*.a
            dist/windows-amd64/**

  # ROCm generation step
@@ -101,53 +115,63 @@ jobs:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
      - uses: actions/checkout@v4
-      - name: Set make jobs default
-        run: |
-          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - name: Add msys paths
+      - uses: 'google-github-actions/auth@v2'
+        with:
+          project_id: 'ollama'
+          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
+      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
+      - name: install Windows SDK 8.1 to get signtool
        run: |
-          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-      - name: Install msys2 tools
+          $ErrorActionPreference = "Stop"
+          write-host "downloading SDK"
+          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
+          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
+          write-host "Win SDK 8.1 installed"
+          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
+      - name: install signing plugin
        run: |
-          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
+          $ErrorActionPreference = "Stop"
+          write-host "downloading plugin"
+          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
+          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
+          write-host "Installing plugin"
+          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
+          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
-      # ROCM installation steps
-      - name: 'Cache ROCm installer'
-        id: cache-rocm
-        uses: actions/cache@v4
-        with:
-          path: rocm-install.exe
-          key: ${{ env.ROCM_WINDOWS_URL }}
-      - name: 'Conditionally Download ROCm'
-        if: steps.cache-rocm.outputs.cache-hit != 'true'
-        run: |
-          $ErrorActionPreference = "Stop"
-          Invoke-WebRequest -Uri "${env:ROCM_WINDOWS_URL}" -OutFile "rocm-install.exe"
      - name: 'Install ROCm'
        run: |
-          Start-Process "rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          $ErrorActionPreference = "Stop"
+          write-host "downloading AMD HIP Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP"
+          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          write-host "Completed AMD HIP"
      - name: 'Verify ROCm'
        run: |
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-          echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path | select -first 1)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
-      - name: make rocm runner
-        run: |
+      - run: go get ./...
+      - run: |
+          $gopath=(get-command go).source | split-path -parent
          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
-          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make help-runners
-          make dist_rocm
+          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
+          $env:PATH="$gopath;$env:PATH"
+          $env:OLLAMA_SKIP_CPU_GENERATE="1"
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
+          make -j $cores
+        name: make
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-rocm
          path: |
+            build/**/*
            dist/windows-amd64/**

  # CUDA generation step
@@ -157,78 +181,76 @@ jobs:
    strategy:
      matrix:
        cuda:
-          - version: "11.3"
-            url: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
-          - version: "12.4"
-            url: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe
+          - version: "11"
+            url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
+          - version: "12"
+            url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
    env:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
      - uses: actions/checkout@v4
-      - name: Set make jobs default
-        run: |
-          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - name: Install msys2
+      - uses: 'google-github-actions/auth@v2'
+        with:
+          project_id: 'ollama'
+          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
+      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
+      - name: install Windows SDK 8.1 to get signtool
        run: |
-          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
-          write-host "Downloading msys2"
-          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
-          write-host "Installing msys2"
-          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
-          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-      - name: Install msys2 tools
+          $ErrorActionPreference = "Stop"
+          write-host "downloading SDK"
+          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
+          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
+          write-host "Win SDK 8.1 installed"
+          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
+      - name: install signing plugin
        run: |
-          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
-          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-      - name: verify tools
-        run: |
-          get-command gcc
-          gcc --version
-          get-command make
-          make --version
+          $ErrorActionPreference = "Stop"
+          write-host "downloading plugin"
+          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
+          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
+          write-host "Installing plugin"
+          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
+          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
-      # CUDA installation steps
-      - name: 'Cache CUDA installer'
-        id: cache-cuda
-        uses: actions/cache@v4
-        with:
-          path: cuda-install.exe
-          key: ${{ matrix.cuda.url }}
-      - name: 'Conditionally Download CUDA'
-        if: steps.cache-cuda.outputs.cache-hit != 'true'
+      - name: 'Install CUDA ${{ matrix.cuda.version }}'
        run: |
          $ErrorActionPreference = "Stop"
-          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "cuda-install.exe"
-      - name: 'Install CUDA'
-        run: |
-          $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | foreach-object {"${_}_${{ matrix.cuda.version }}"}
-          Start-Process "cuda-install.exe" -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
-      - name: 'Verify CUDA'
-        run: |
-          & (resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0] --version
+          write-host "downloading CUDA Installer"
+          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
+          write-host "Installing CUDA"
+          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
+          write-host "Completed CUDA"
          $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
          $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' 
-          echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
-          echo "CUDA_PATH_V${cudaVer}=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
-          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
-
-      - name: make cuda runner
+          echo "$cudaPath\bin" >> $env:GITHUB_PATH
+          echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
+          echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
+          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
+      - name: 'Verify CUDA'
+        run: nvcc -V
+      - run: go get ./...
+      - name: make
        run: |
+          $gopath=(get-command go).source | split-path -parent
+          $cudabin=(get-command nvcc).source | split-path
          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
-          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make dist_cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
+          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
+          $env:PATH="$gopath;$cudabin;$env:PATH"
+          $env:OLLAMA_SKIP_CPU_GENERATE="1"
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
+          make -j $cores
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cuda-${{ matrix.cuda.version }}
          path: |
+            build/**/*
            dist/windows-amd64/**

  # windows arm64 generate, go build, and zip file (no installer)
@@ -371,7 +393,7 @@ jobs:
          $env:PATH="$gopath;$gccpath;$env:PATH"
          echo $env:PATH
          $env:ARCH="arm64"
-          .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies sign distZip
+          .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies distZip
        name: 'Windows Build'
      - uses: actions/upload-artifact@v4
        with:
@@ -421,24 +443,6 @@ jobs:
          write-host "Installing plugin"
          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
          write-host "plugin installed"
-      - name: Install msys2
-        run: |
-          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
-          write-host "Downloading msys2"
-          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
-          write-host "Installing msys2"
-          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
-          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-      - name: Install msys2 tools
-        run: |
-          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
-          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-      - name: verify tools
-        run: |
-          get-command gcc
-          gcc --version
-          get-command make
-          make --version
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
@@ -447,29 +451,28 @@ jobs:
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-cpu
-          path: dist/windows-amd64/
      - uses: actions/download-artifact@v4
        with:
-          name: generate-windows-cuda-11.3
-          path: dist/windows-amd64/
+          name: generate-windows-cuda-11
      - uses: actions/download-artifact@v4
        with:
-          name: generate-windows-cuda-12.4
-          path: dist/windows-amd64/
+          name: generate-windows-cuda-12
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-rocm
-          path: dist/windows-amd64/
      - uses: actions/download-artifact@v4
        with:
          name: windows-arm64
          path: dist
+      - run: dir build
      - run: |
+          $gopath=(get-command go).source | split-path -parent
          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
+          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
+          $env:PATH="$gopath;$env:PATH"
          $env:OLLAMA_SKIP_GENERATE="1"
          $env:ARCH="amd64"
-          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
          & .\scripts\build_windows.ps1
      - uses: actions/upload-artifact@v4
        with:
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -1,11 +1,5 @@
 name: test

-env:
-  ROCM_WINDOWS_URL: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
-  MSYS2_URL: https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe
-  CUDA_12_WINDOWS_URL: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe
-  CUDA_12_WINDOWS_VER: 12.4
-
 concurrency:
  # For PRs, later CI runs preempt previous ones. e.g. a force push on a PR
  # cancels running CI jobs and starts all new ones.
@@ -105,45 +99,30 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
-      - name: Set make jobs default
-        run: |
-          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
-
-      # ROCM installation steps
-      - name: 'Cache ROCm installer'
-        id: cache-rocm
-        uses: actions/cache@v4
-        with:
-          path: rocm-install.exe
-          key: ${{ env.ROCM_WINDOWS_URL }}
-      - name: 'Conditionally Download ROCm'
-        if: steps.cache-rocm.outputs.cache-hit != 'true'
-        run: |
-          $ErrorActionPreference = "Stop"
-          Invoke-WebRequest -Uri "${env:ROCM_WINDOWS_URL}" -OutFile "rocm-install.exe"
      - name: 'Install ROCm'
        run: |
-          Start-Process "rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          $ErrorActionPreference = "Stop"
+          write-host "downloading AMD HIP Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP"
+          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          write-host "Completed AMD HIP"
      - name: 'Verify ROCm'
        run: |
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-          echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path | select -first 1)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
-
-      - name: Add msys paths
-        run: |
-          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-      - name: Install msys2 tools
-        run: |
-          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
-
-      - name: make rocm runner
-        run: |
+      - run: go get ./...
+      - run: |
+          $gopath=(get-command go).source | split-path -parent
          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
-          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          $env:PATH="$gopath;$env:PATH"
+          $env:OLLAMA_SKIP_CPU_GENERATE="1"
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
+          write-host $env:HIP_PATH
          make -C llama print-HIP_PATH print-HIP_LIB_DIR
-          make rocm
+          make -j $cores rocm
+        name: make

  # CUDA generation step
  runners-windows-cuda:
@@ -156,49 +135,36 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
-      - name: Set make jobs default
-        run: |
-          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
-
-      # CUDA installation steps
-      - name: 'Cache CUDA installer'
-        id: cache-cuda
-        uses: actions/cache@v4
-        with:
-          path: cuda-install.exe
-          key: ${{ env.CUDA_12_WINDOWS_URL }}
-      - name: 'Conditionally Download CUDA'
-        if: steps.cache-cuda.outputs.cache-hit != 'true'
-        run: |
-          $ErrorActionPreference = "Stop"
-          Invoke-WebRequest -Uri "${env:CUDA_12_WINDOWS_URL}" -OutFile "cuda-install.exe"
      - name: 'Install CUDA'
        run: |
-          $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | foreach-object {"${_}_${{ env.CUDA_12_WINDOWS_VER }}"}
-          Start-Process "cuda-install.exe" -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
-      - name: 'Verify CUDA'
-        run: |
-          & (resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0] --version
+          $ErrorActionPreference = "Stop"
+          write-host "downloading CUDA Installer"
+          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
+          write-host "Installing CUDA"
+          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
+          write-host "Completed CUDA"
          $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
          $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' 
-          echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
-          echo "CUDA_PATH_V${cudaVer}=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
-          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
-
-      - name: Add msys paths
-        run: |
-          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-      - name: Install msys2 tools
-        run: |
-          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
-      - name: make cuda runner
+          echo "$cudaPath\bin" >> $env:GITHUB_PATH
+          echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
+          echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
+          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
+      - name: 'Verify CUDA'
+        run: nvcc -V
+      - run: go get ./...
+      - name: make
        run: |
+          $gopath=(get-command go).source | split-path -parent
+          $cudabin=(get-command nvcc).source | split-path
          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
-          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
+          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
+          $env:PATH="$gopath;$cudabin;$env:PATH"
+          $env:OLLAMA_SKIP_CPU_GENERATE="1"
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
+          make -j $cores cuda_v11
+        env:
+          OLLAMA_SKIP_CPU_GENERATE: '1'

  runners-cpu:
    needs: [changes]
@@ -223,15 +189,7 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
-      - name: Add msys paths
-        if: ${{ startsWith(matrix.os, 'windows-') }}
-        run: |
-          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-      - name: Install msys2 tools
-        if: ${{ startsWith(matrix.os, 'windows-') }}
-        run: |
-          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
+      - run: go get ./...
      - name: 'Build Windows Go Runners'
        if: ${{ startsWith(matrix.os, 'windows-') }}
        run: |
@@ -242,8 +200,7 @@ jobs:
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$gccpath;$env:PATH"
          echo $env:PATH
-          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make -j 4
+          make -j 4      
      - name: 'Build Unix Go Runners'
        if: ${{ ! startsWith(matrix.os, 'windows-') }}
        run: make -j 4
@@ -269,15 +226,6 @@ jobs:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
-      - name: Add msys paths
-        if: ${{ startsWith(matrix.os, 'windows-') }}
-        run: |
-          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-      - name: Install msys2 tools
-        if: ${{ startsWith(matrix.os, 'windows-') }}
-        run: |
-          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
@@ -290,7 +238,7 @@ jobs:
        shell: bash
      - uses: golangci/golangci-lint-action@v6
        with:
-          args: --timeout 10m0s -v
+          args: --timeout 8m0s -v
  test:
    strategy:
      matrix:
@@ -309,15 +257,6 @@ jobs:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
-      - name: Add msys paths
-        if: ${{ startsWith(matrix.os, 'windows-') }}
-        run: |
-          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-      - name: Install msys2 tools
-        if: ${{ startsWith(matrix.os, 'windows-') }}
-        run: |
-          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
@@ -328,7 +267,8 @@ jobs:
            arm64) echo ARCH=arm64 ;;
          esac >>$GITHUB_ENV
        shell: bash
-      - run: go test ./...
+      - run: go build
+      - run: go test -v ./...

  patches:
    needs: [changes]
@@ -340,4 +280,4 @@ jobs:
          submodules: recursive
      - name: Verify patches carry all the changes
        run: |
-          make apply-patches sync && git diff --compact-summary --exit-code llama
+          make apply-patches sync && git diff --compact-summary --exit-code llama
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,9 @@ ollama
 .idea
 test_data
 *.crt
+llm/build
+build/*/*/*
+!build/**/placeholder
 llama/build
 __debug_bin*
 llama/vendor
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -8,6 +8,8 @@ linters:
    - containedctx
    - contextcheck
    - errcheck
+    - exportloopref
+    - gci
    - gocheckcompilerdirectives
    - gofmt
    - gofumpt
@@ -28,6 +30,8 @@ linters:
    - wastedassign
    - whitespace
 linters-settings:
+  gci:
+    sections: [standard, default, localmodule]
  staticcheck:
    checks:
      - all
--- a/167
+++ b/167
@@ -1,9 +1,11 @@
-ARG GOLANG_VERSION=1.22.8
+# Note: once we have fully transitioned to the Go server, this will replace the old Dockerfile at the top of the tree
+ARG GOLANG_VERSION=1.22.5
+ARG CMAKE_VERSION=3.22.1
 ARG CUDA_VERSION_11=11.3.1
+ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
 ARG CUDA_VERSION_12=12.4.0
+ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2
-ARG JETPACK_6=r36.2.0
-ARG JETPACK_5=r35.4.1

 ### To create a local image for building linux binaries on mac or windows with efficient incremental builds
 #
@@ -12,22 +14,24 @@ ARG JETPACK_5=r35.4.1
 #
 ### Then incremental builds will be much faster in this container
 #
-# make -j 10 dist
+# make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
 #
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
+ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG CUDA_VERSION_11
 ARG CUDA_VERSION_12
 COPY ./scripts/rh_linux_deps.sh /
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
+ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
+RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
    dnf clean all && \
    dnf install -y \
    zsh \
-    cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-    cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
+    cuda-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
+    cuda-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
 # TODO intel oneapi goes here...
 ENV GOARCH amd64
 ENV CGO_ENABLED 1
@@ -41,11 +45,12 @@ ENTRYPOINT [ "zsh" ]
 # docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
 #
 FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
+ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG CUDA_VERSION_11
 ARG CUDA_VERSION_12
 COPY ./scripts/rh_linux_deps.sh /
-RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
+RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
    dnf config-manager --set-enabled appstream && \
    dnf clean all && \
@@ -56,84 +61,86 @@ RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/
 ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
 ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-ENV GOARCH arm64
+ENV GOARCH amd64
 ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/ollama/ollama/
 ENTRYPOINT [ "zsh" ]

-FROM --platform=linux/amd64 unified-builder-amd64 AS build-amd64
+FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64
 COPY . .
 ARG OLLAMA_SKIP_CUDA_GENERATE
+ARG OLLAMA_SKIP_CUDA_11_GENERATE
+ARG OLLAMA_SKIP_CUDA_12_GENERATE
 ARG OLLAMA_SKIP_ROCM_GENERATE
+ARG CUDA_V11_ARCHITECTURES
+ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
-ARG VERSION
 RUN --mount=type=cache,target=/root/.ccache \
    if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -j $(nproc) dist ; \
+        make -C llama -j $(expr $(nproc) / 2 ) ; \
    else \
-        make -j 5 dist ; \
+        make -C llama -j 5 ; \
    fi
+
+FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
+COPY . .
+ARG OLLAMA_SKIP_CUDA_GENERATE
+ARG OLLAMA_SKIP_CUDA_11_GENERATE
+ARG OLLAMA_SKIP_CUDA_12_GENERATE
+ARG CUDA_V11_ARCHITECTURES
+ARG CUDA_V12_ARCHITECTURES
+ARG OLLAMA_FAST_BUILD
+RUN --mount=type=cache,target=/root/.ccache \
+    make -C llama -j 8
+
+
+# Intermediate stages used for ./scripts/build_linux.sh
+FROM --platform=linux/amd64 centos:7 AS builder-amd64
+ARG CMAKE_VERSION
+ARG GOLANG_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+ENV CGO_ENABLED 1
+ENV GOARCH amd64
+WORKDIR /go/src/github.com/ollama/ollama
+
+FROM --platform=linux/amd64 builder-amd64 AS build-amd64
+COPY . .
+COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/build/ build/
+ARG GOFLAGS
+ARG CGO_CFLAGS
+ARG OLLAMA_SKIP_ROCM_GENERATE
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-amd64/bin/ollama .
 RUN cd dist/linux-$GOARCH && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
 RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
    cd dist/linux-$GOARCH-rocm && \
    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
    fi

-# Jetsons need to be built in discrete stages
-FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64
+FROM --platform=linux/arm64 rockylinux:8 AS builder-arm64
+ARG CMAKE_VERSION
 ARG GOLANG_VERSION
-RUN apt-get update && apt-get install -y git curl ccache && \
-    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
-    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
-    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-WORKDIR /go/src/github.com/ollama/ollama/
-COPY . .
-ARG CGO_CFLAGS
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+ENV CGO_ENABLED 1
 ENV GOARCH arm64
-ARG VERSION
-RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 dist_cuda_v11 \
-        CUDA_ARCHITECTURES="72;87" \
-        GPU_RUNNER_VARIANT=_jetpack5 \
-        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama \
-        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama/cuda_jetpack5
+WORKDIR /go/src/github.com/ollama/ollama

-FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS runners-jetpack6-arm64
-ARG GOLANG_VERSION
-RUN apt-get update && apt-get install -y git curl ccache && \
-    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
-    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
-    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-WORKDIR /go/src/github.com/ollama/ollama/
+FROM --platform=linux/arm64 builder-arm64 AS build-arm64
 COPY . .
+COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
+ARG GOFLAGS
 ARG CGO_CFLAGS
-ENV GOARCH arm64
-ARG VERSION
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 dist_cuda_v12 \
-        CUDA_ARCHITECTURES="87" \
-        GPU_RUNNER_VARIANT=_jetpack6 \
-        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama \
-        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama/cuda_jetpack6
-
-FROM --platform=linux/arm64 unified-builder-arm64 AS build-arm64
-COPY . .
-ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_FAST_BUILD
-ARG VERSION
-RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 dist
-COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+    go build -trimpath -o dist/linux-arm64/bin/ollama .
 RUN cd dist/linux-$GOARCH && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-RUN cd dist/linux-$GOARCH-jetpack5 && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz
-RUN cd dist/linux-$GOARCH-jetpack6 && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz

 FROM --platform=linux/amd64 scratch AS dist-amd64
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
@@ -142,13 +149,30 @@ COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz
 FROM dist-$TARGETARCH AS dist


+# Optimized container images do not carry nested payloads
+FROM --platform=linux/amd64 builder-amd64 AS container-build-amd64
+WORKDIR /go/src/github.com/ollama/ollama
+COPY . .
+ARG GOFLAGS
+ARG CGO_CFLAGS
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-amd64/bin/ollama .
+
+FROM --platform=linux/arm64 builder-arm64 AS container-build-arm64
+WORKDIR /go/src/github.com/ollama/ollama
+COPY . .
+ARG GOFLAGS
+ARG CGO_CFLAGS
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-arm64/bin/ollama .
+
 # For amd64 container images, filter out cuda/rocm to minimize size
-FROM build-amd64 AS runners-cuda-amd64
+FROM runners-amd64 AS runners-cuda-amd64
 RUN rm -rf \
    ./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
    ./dist/linux-amd64/lib/ollama/runners/rocm*

-FROM build-amd64 AS runners-rocm-amd64
+FROM runners-amd64 AS runners-rocm-amd64
 RUN rm -rf \
    ./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
    ./dist/linux-amd64/lib/ollama/libcu*.so* \
@@ -157,19 +181,16 @@ RUN rm -rf \
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+    rm -rf /var/lib/apt/lists/*
+COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/

 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/
-COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/
-
+    rm -rf /var/lib/apt/lists/*
+COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
+COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/

 # ROCm libraries larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
@@ -178,8 +199,8 @@ FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+    rm -rf /var/lib/apt/lists/*
+COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/

 EXPOSE 11434
--- a/107
+++ b/107
@@ -1,103 +1,4 @@
-# top level makefile for Ollama
-include make/common-defs.make
-
-
-# Determine which if any GPU runners we should build
-include make/cuda-v11-defs.make
-include make/cuda-v12-defs.make
-include make/rocm-defs.make
-
-ifeq ($(CUSTOM_CPU_FLAGS),)
-ifeq ($(ARCH),amd64)
-	RUNNER_TARGETS=cpu
-endif
-# Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present
-ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
-ifneq ($(CUDA_11_COMPILER),)
-	RUNNER_TARGETS += cuda_v11
-endif
-ifneq ($(CUDA_12_COMPILER),)
-	RUNNER_TARGETS += cuda_v12
-endif
-endif
-else # CUSTOM_CPU_FLAGS is set, we'll build only the latest cuda version detected
-ifneq ($(CUDA_12_COMPILER),)
-	RUNNER_TARGETS += cuda_v12
-else ifneq ($(CUDA_11_COMPILER),)
-	RUNNER_TARGETS += cuda_v11
-endif
-endif
-
-ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
-ifneq ($(HIP_COMPILER),)
-	RUNNER_TARGETS += rocm
-endif
-endif
-
-
-all: runners exe
-
-dist: $(addprefix dist_, $(RUNNER_TARGETS)) dist_exe
-
-dist_%:
-	@$(MAKE) --no-print-directory -f make/Makefile.$* dist
-
-runners: $(RUNNER_TARGETS)
-
-$(RUNNER_TARGETS):
-	@$(MAKE) --no-print-directory -f make/Makefile.$@
-
-exe dist_exe:
-	@$(MAKE) --no-print-directory -f make/Makefile.ollama $@
-
-help-sync apply-patches create-patches sync sync-clean:
-	@$(MAKE) --no-print-directory -f make/Makefile.sync $@
-
-test integration lint:
-	@$(MAKE) --no-print-directory -f make/Makefile.test $@
-
-clean:
-	rm -rf $(BUILD_DIR) $(DIST_LIB_DIR) $(OLLAMA_EXE) $(DIST_OLLAMA_EXE)
-	go clean -cache
-
-help:
-	@echo "The following make targets will help you build Ollama"
-	@echo ""
-	@echo "	make all   		# (default target) Build Ollama llm subprocess runners, and the primary ollama executable"
-	@echo "	make runners		# Build Ollama llm subprocess runners; after you may use 'go build .' to build the primary ollama exectuable"
-	@echo "	make <runner>		# Build specific runners. Enabled: '$(RUNNER_TARGETS)'"
-	@echo "	make dist		# Build the runners and primary ollama executable for distribution"
-	@echo "	make help-sync 		# Help information on vendor update targets"
-	@echo "	make help-runners 	# Help information on runner targets"
-	@echo ""
-	@echo "The following make targets will help you test Ollama"
-	@echo ""
-	@echo "	make test   		# Run unit tests"
-	@echo "	make integration	# Run integration tests.  You must 'make all' first"
-	@echo "	make lint   		# Run lint and style tests"
-	@echo ""
-	@echo "For more information see 'docs/development.md'"
-	@echo ""
-
-
-help-runners:
-	@echo "The following runners will be built based on discovered GPU libraries: '$(RUNNER_TARGETS)'"
-	@echo ""
-	@echo "GPU Runner CPU Flags: '$(GPU_RUNNER_CPU_FLAGS)'  (Override with CUSTOM_CPU_FLAGS)"
-	@echo ""
-	@echo "# CUDA_PATH sets the location where CUDA toolkits are present"
-	@echo "CUDA_PATH=$(CUDA_PATH)"
-	@echo "	CUDA_11_PATH=$(CUDA_11_PATH)"
-	@echo "	CUDA_11_COMPILER=$(CUDA_11_COMPILER)"
-	@echo "	CUDA_12_PATH=$(CUDA_12_PATH)"
-	@echo "	CUDA_12_COMPILER=$(CUDA_12_COMPILER)"
-	@echo ""
-	@echo "# HIP_PATH sets the location where the ROCm toolkit is present"
-	@echo "HIP_PATH=$(HIP_PATH)"
-	@echo "	HIP_COMPILER=$(HIP_COMPILER)"
-
-.PHONY: all exe dist help help-sync help-runners test integration lint runners clean $(RUNNER_TARGETS)
-
-# Handy debugging for make variables
-print-%:
-	@echo '$*=$($*)'
+GOALS := $(or $(MAKECMDGOALS),all)
+.PHONY: $(GOALS)
+$(GOALS):
+	$(MAKE) -C llama $@
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ Get up and running with large language models.

 [Download](https://ollama.com/download/Ollama-darwin.zip)

-### Windows
+### Windows preview

 [Download](https://ollama.com/download/OllamaSetup.exe)

@@ -47,28 +47,26 @@ Ollama supports a list of models available on [ollama.com/library](https://ollam

 Here are some example models that can be downloaded:

-| Model              | Parameters | Size  | Download                         |
-| ------------------ | ---------- | ----- | -------------------------------- |
-| Llama 3.3          | 70B        | 43GB  | `ollama run llama3.3`            |
-| Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`            |
-| Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`         |
-| Llama 3.2 Vision   | 11B        | 7.9GB | `ollama run llama3.2-vision`     |
-| Llama 3.2 Vision   | 90B        | 55GB  | `ollama run llama3.2-vision:90b` |
-| Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`            |
-| Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`       |
-| Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`                |
-| Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`         |
-| Gemma 2            | 2B         | 1.6GB | `ollama run gemma2:2b`           |
-| Gemma 2            | 9B         | 5.5GB | `ollama run gemma2`              |
-| Gemma 2            | 27B        | 16GB  | `ollama run gemma2:27b`          |
-| Mistral            | 7B         | 4.1GB | `ollama run mistral`             |
-| Moondream 2        | 1.4B       | 829MB | `ollama run moondream`           |
-| Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`         |
-| Starling           | 7B         | 4.1GB | `ollama run starling-lm`         |
-| Code Llama         | 7B         | 3.8GB | `ollama run codellama`           |
-| Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored`   |
-| LLaVA              | 7B         | 4.5GB | `ollama run llava`               |
-| Solar              | 10.7B      | 6.1GB | `ollama run solar`               |
+| Model              | Parameters | Size  | Download                       |
+| ------------------ | ---------- | ----- | ------------------------------ |
+| Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`          |
+| Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`       |
+| Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`          |
+| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`      |
+| Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`     |
+| Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`              |
+| Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`       |
+| Gemma 2            | 2B         | 1.6GB | `ollama run gemma2:2b`         |
+| Gemma 2            | 9B         | 5.5GB | `ollama run gemma2`            |
+| Gemma 2            | 27B        | 16GB  | `ollama run gemma2:27b`        |
+| Mistral            | 7B         | 4.1GB | `ollama run mistral`           |
+| Moondream 2        | 1.4B       | 829MB | `ollama run moondream`         |
+| Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`       |
+| Starling           | 7B         | 4.1GB | `ollama run starling-lm`       |
+| Code Llama         | 7B         | 3.8GB | `ollama run codellama`         |
+| Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored` |
+| LLaVA              | 7B         | 4.5GB | `ollama run llava`             |
+| Solar              | 10.7B      | 6.1GB | `ollama run solar`             |

 > [!NOTE]
 > You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -298,7 +296,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
 - [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
-- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories)
+- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Chat with Code Repository)
 - [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
 - [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
 - [RAGFlow](https://github.com/infiniflow/ragflow) (Open-source Retrieval-Augmented Generation engine based on deep document understanding)
@@ -308,17 +306,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG)
 - [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation)
 - [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends)
-- [RWKV-Runner](https://github.com/josStorer/RWKV-Runner) (RWKV offline LLM deployment tool, also usable as a client for ChatGPT and Ollama)
-- [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models)
 - [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
 - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
 - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
-- [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG)
 - [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord )
 - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
-- [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine)
-- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education)
-- [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application)
 - [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
 - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
 - [AI Studio](https://github.com/MindWorkAI/AI-Studio)
@@ -326,8 +318,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
 - [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
 - [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac)
- [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration)
 - [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
 - [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
 - [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
@@ -337,36 +327,10 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
 - [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
 - [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
- [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
 - [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
- [Local Multimodal AI Chat](https://github.com/Leon-Sander/Local-Multimodal-AI-Chat) (Ollama-based LLM Chat with support for multiple features, including PDF RAG, voice chat, image-based interactions, and integration with OpenAI.)
 - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
- [OrionChat](https://github.com/EliasPereirah/OrionChat) - OrionChat is a web interface for chatting with different AI providers
 - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
- [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page)
- [Promptery](https://github.com/promptery/promptery) (desktop client for Ollama.)
 - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
- [SpaceLlama](https://github.com/tcsenpai/spacellama) (Firefox and Chrome extension to quickly summarize web pages with ollama in a sidebar)
- [YouLama](https://github.com/tcsenpai/youlama) (Webapp to quickly summarize any YouTube video, supporting Invidious as well)
- [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface)
- [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol)
- [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app)
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings)
- [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
- [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI)
- [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app, with dynamic conversation routing. Supports local models via Ollama)
- [Nosia](https://github.com/nosia-ai/nosia) (Easy to install and use RAG platform based on Ollama)
- [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application available for Mac/Windows/Linux)
- [Abbey](https://github.com/US-Artificial-Intelligence/abbey) (A configurable AI interface server with notebooks, document storage, and YouTube support)
- [Minima](https://github.com/dmayboroda/minima) (RAG with on-premises or fully local workflow)
- [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)
-
-### Cloud
-
- [Google Cloud](https://cloud.google.com/run/docs/tutorials/gpu-gemma2-with-ollama)
- [Fly.io](https://fly.io/docs/python/do-more/add-ollama/)
- [Koyeb](https://www.koyeb.com/deploy/ollama)

 ### Terminal

@@ -382,7 +346,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Oatmeal](https://github.com/dustinblackman/oatmeal)
 - [cmdh](https://github.com/pgibler/cmdh)
 - [ooo](https://github.com/npahlfer/ooo)
-- [shell-pilot](https://github.com/reid41/shell-pilot)(Interact with models via pure shell scripts on Linux or macOS)
+- [shell-pilot](https://github.com/reid41/shell-pilot)
 - [tenere](https://github.com/pythops/tenere)
 - [llm-ollama](https://github.com/taketwo/llm-ollama) for [Datasette's LLM CLI](https://llm.datasette.io/en/stable/).
 - [typechat-cli](https://github.com/anaisbetts/typechat-cli)
@@ -390,28 +354,17 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [tlm](https://github.com/yusufcanb/tlm)
 - [podman-ollama](https://github.com/ericcurtin/podman-ollama)
 - [gollama](https://github.com/sammcj/gollama)
- [ParLlama](https://github.com/paulrobello/parllama)
 - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
 - [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
 - [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor
- [x-cmd ollama](https://x-cmd.com/mod/ollama)
- [bb7](https://github.com/drunkwcodes/bb7)
- [SwollamaCLI](https://github.com/marcusziade/Swollama) bundled with the Swollama Swift package. [Demo](https://github.com/marcusziade/Swollama?tab=readme-ov-file#cli-usage)
- [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more.
- [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama
- [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.

 ### Apple Vision Pro
-
 - [Enchanted](https://github.com/AugustDev/enchanted)

 ### Database

- [pgai](https://github.com/timescale/pgai) - PostgreSQL as a vector database (Create and search embeddings from Ollama models using pgvector)
-   - [Get started guide](https://github.com/timescale/pgai/blob/main/docs/vectorizer-quick-start.md)
 - [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
 - [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
- [Kangaroo](https://github.com/dbkangaroo/kangaroo) (AI-powered SQL client and admin tool for popular databases)

 ### Package managers

@@ -427,11 +380,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
 - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
 - [crewAI](https://github.com/crewAIInc/crewAI)
- [Spring AI](https://github.com/spring-projects/spring-ai) with [reference](https://docs.spring.io/spring-ai/reference/api/chat/ollama-chat.html) and [example](https://github.com/tzolov/ollama-tools)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
 - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
- [LLPhant](https://github.com/theodo-group/LLPhant?tab=readme-ov-file#ollama)
 - [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/llm/ollama/) and [LlamaIndexTS](https://ts.llamaindex.ai/modules/llms/available_llms/ollama)
 - [LiteLLM](https://github.com/BerriAI/litellm)
 - [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
@@ -456,20 +407,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
 - [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
 - [LlamaScript](https://github.com/Project-Llama/llamascript)
- [llm-axe](https://github.com/emirsahin1/llm-axe) (Python Toolkit for Building LLM Powered Apps)
 - [Gollm](https://docs.gollm.co/examples/ollama-example)
- [Gollama for Golang](https://github.com/jonathanhecl/gollama)
 - [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient)
 - [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun)
 - [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
 - [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)
- [Parakeet](https://github.com/parakeet-nest/parakeet) is a GoLang library, made to simplify the development of small generative AI applications with Ollama.
- [Haverscript](https://github.com/andygill/haverscript) with [examples](https://github.com/andygill/haverscript/tree/main/examples)
 - [Ollama for Swift](https://github.com/mattt/ollama-swift)
- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
- [GoLamify](https://github.com/prasad89/golamify)
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)

 ### Mobile

@@ -483,7 +426,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Raycast extension](https://github.com/MassimilianoPasquini97/raycast_ollama)
 - [Discollama](https://github.com/mxyng/discollama) (Discord bot inside the Ollama discord channel)
 - [Continue](https://github.com/continuedev/continue)
- [Vibe](https://github.com/thewh1teagle/vibe) (Transcribe and analyze meetings with Ollama)
 - [Obsidian Ollama plugin](https://github.com/hinterdupfinger/obsidian-ollama)
 - [Logseq Ollama plugin](https://github.com/omagdy7/ollama-logseq)
 - [NotesOllama](https://github.com/andersrex/notesollama) (Apple Notes Ollama plugin)
@@ -506,25 +448,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
 - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
- [ChatGPTBox: All in one browser extension](https://github.com/josStorer/chatGPTBox) with [Integrating Tutorial](https://github.com/josStorer/chatGPTBox/issues/616#issuecomment-1975186467)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
 - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.)
- [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama)
- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
- [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.)
+- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
 - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
 - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
 - [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
- [AI Summmary Helper plugin](https://github.com/philffm/ai-summary-helper)
- [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
- [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)

 ### Supported backends

 - [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.

-### Observability
-
- [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production. 
--- a/api/client.go
+++ b/api/client.go
@@ -55,7 +55,7 @@ func checkError(resp *http.Response, body []byte) error {

 // ClientFromEnvironment creates a new [Client] using configuration from the
 // environment variable OLLAMA_HOST, which points to the network host and
-// port on which the ollama service is listening. The format of this variable
+// port on which the ollama service is listening. The format of this variable
 // is:
 //
 //	<scheme>://<host>:<port>
--- a/api/types.go
+++ b/api/types.go
@@ -12,7 +12,7 @@ import (
 	"time"
 )

-// StatusError is an error with an HTTP status code and message.
+// StatusError is an error with an HTTP status code.
 type StatusError struct {
 	StatusCode   int
 	Status       string
@@ -57,7 +57,7 @@ type GenerateRequest struct {
 	Template string `json:"template"`

 	// Context is the context parameter returned from a previous call to
-	// [Client.Generate]. It can be used to keep a short conversational memory.
+	// Generate. It can be used to keep a short conversational memory.
 	Context []int `json:"context,omitempty"`

 	// Stream specifies whether the response is streaming; it is true by default.
@@ -67,7 +67,7 @@ type GenerateRequest struct {
 	Raw bool `json:"raw,omitempty"`

 	// Format specifies the format to return a response in.
-	Format json.RawMessage `json:"format,omitempty"`
+	Format string `json:"format"`

 	// KeepAlive controls how long the model will stay loaded in memory following
 	// this request.
@@ -90,31 +90,23 @@ type ChatRequest struct {
 	// Messages is the messages of the chat - can be used to keep a chat memory.
 	Messages []Message `json:"messages"`

-	// Stream enables streaming of returned responses; true by default.
+	// Stream enables streaming of the returned response; true by default.
 	Stream *bool `json:"stream,omitempty"`

 	// Format is the format to return the response in (e.g. "json").
-	Format json.RawMessage `json:"format,omitempty"`
+	Format string `json:"format"`

 	// KeepAlive controls how long the model will stay loaded into memory
-	// following the request.
+	// following the request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`

 	// Tools is an optional list of tools the model has access to.
 	Tools `json:"tools,omitempty"`

-	Debug *Debug `json:"debug,omitempty"`
-
-	Dry bool `json:"dry,omitempty"`
-
 	// Options lists model-specific options.
 	Options map[string]interface{} `json:"options"`
 }

-type Debug struct {
-	Include []string `json:"include,omitempty"`
-}
-
 type Tools []Tool

 func (t Tools) String() string {
@@ -154,7 +146,6 @@ type ToolCall struct {
 }

 type ToolCallFunction struct {
-	Index     int                       `json:"index,omitempty"`
 	Name      string                    `json:"name"`
 	Arguments ToolCallFunctionArguments `json:"arguments"`
 }
@@ -198,8 +189,6 @@ type ChatResponse struct {
 	Message    Message   `json:"message"`
 	DoneReason string    `json:"done_reason,omitempty"`

-	Debug map[string]any `json:"debug,omitempty"`
-
 	Done bool `json:"done"`

 	Metrics
@@ -214,8 +203,8 @@ type Metrics struct {
 	EvalDuration       time.Duration `json:"eval_duration,omitempty"`
 }

-// Options specified in [GenerateRequest].  If you add a new option here, also
-// add it to the API docs.
+// Options specified in [GenerateRequest]. If you add a new option here, add it
+// to the API docs as well.
 type Options struct {
 	Runner

@@ -226,6 +215,7 @@ type Options struct {
 	TopK             int      `json:"top_k,omitempty"`
 	TopP             float32  `json:"top_p,omitempty"`
 	MinP             float32  `json:"min_p,omitempty"`
+	TFSZ             float32  `json:"tfs_z,omitempty"`
 	TypicalP         float32  `json:"typical_p,omitempty"`
 	RepeatLastN      int      `json:"repeat_last_n,omitempty"`
 	Temperature      float32  `json:"temperature,omitempty"`
@@ -246,7 +236,7 @@ type Runner struct {
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
+	F16KV     bool  `json:"f16_kv,omitempty"`
 	LogitsAll bool  `json:"logits_all,omitempty"`
 	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
@@ -604,6 +594,7 @@ func DefaultOptions() Options {
 		Temperature:      0.8,
 		TopK:             40,
 		TopP:             0.9,
+		TFSZ:             1.0,
 		TypicalP:         1.0,
 		RepeatLastN:      64,
 		RepeatPenalty:    1.1,
@@ -622,6 +613,7 @@ func DefaultOptions() Options {
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
 			LowVRAM:   false,
+			F16KV:     true,
 			UseMLock:  false,
 			UseMMap:   nil,
 		},
--- a/app/lifecycle/lifecycle.go
+++ b/app/lifecycle/lifecycle.go
@@ -11,12 +11,10 @@ import (

 	"github.com/ollama/ollama/app/store"
 	"github.com/ollama/ollama/app/tray"
-	"github.com/ollama/ollama/envconfig"
 )

 func Run() {
 	InitLogging()
-	slog.Info("app config", "env", envconfig.Values())

 	ctx, cancel := context.WithCancel(context.Background())
 	var done chan int
--- a/app/lifecycle/paths.go
+++ b/app/lifecycle/paths.go
@@ -36,13 +36,8 @@ func init() {
 		ServerLogFile = filepath.Join(AppDataDir, "server.log")
 		UpgradeLogFile = filepath.Join(AppDataDir, "upgrade.log")

-		exe, err := os.Executable()
-		if err != nil {
-			slog.Warn("error discovering executable directory", "error", err)
-			AppDir = filepath.Join(localAppData, "Programs", "Ollama")
-		} else {
-			AppDir = filepath.Dir(exe)
-		}
+		// Executables are stored in APPDATA
+		AppDir = filepath.Join(localAppData, "Programs", "Ollama")

 		// Make sure we have PATH set correctly for any spawned children
 		paths := strings.Split(os.Getenv("PATH"), ";")
@@ -69,7 +64,7 @@ func init() {
 		}

 		// Make sure our logging dir exists
-		_, err = os.Stat(AppDataDir)
+		_, err := os.Stat(AppDataDir)
 		if errors.Is(err, os.ErrNotExist) {
 			if err := os.MkdirAll(AppDataDir, 0o755); err != nil {
 				slog.Error(fmt.Sprintf("create ollama dir %s: %v", AppDataDir, err))
--- a/app/lifecycle/server.go
+++ b/app/lifecycle/server.go
@@ -18,17 +18,11 @@ func getCLIFullPath(command string) string {
 	var cmdPath string
 	appExe, err := os.Executable()
 	if err == nil {
-		// Check both the same location as the tray app, as well as ./bin
 		cmdPath = filepath.Join(filepath.Dir(appExe), command)
 		_, err := os.Stat(cmdPath)
 		if err == nil {
 			return cmdPath
 		}
-		cmdPath = filepath.Join(filepath.Dir(appExe), "bin", command)
-		_, err = os.Stat(cmdPath)
-		if err == nil {
-			return cmdPath
-		}
 	}
 	cmdPath, err = exec.LookPath(command)
 	if err == nil {
--- a/app/lifecycle/updater_windows.go
+++ b/app/lifecycle/updater_windows.go
@@ -26,15 +26,19 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
 	slog.Info("starting upgrade with " + installerExe)
 	slog.Info("upgrade log file " + UpgradeLogFile)

-	// make the upgrade show progress, but non interactive
+	// When running in debug mode, we'll be "verbose" and let the installer pop up and prompt
 	installArgs := []string{
 		"/CLOSEAPPLICATIONS",                    // Quit the tray app if it's still running
 		"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
 		"/FORCECLOSEAPPLICATIONS",               // Force close the tray app - might be needed
-		"/SP",                                   // Skip the "This will install... Do you wish to continue" prompt
-		"/NOCANCEL",                             // Disable the ability to cancel upgrade mid-flight to avoid partially installed upgrades
-		"/SILENT",
 	}
+	// make the upgrade as quiet as possible (no GUI, no prompts)
+	installArgs = append(installArgs,
+		"/SP", // Skip the "This will install... Do you wish to continue" prompt
+		"/SUPPRESSMSGBOXES",
+		"/SILENT",
+		"/VERYSILENT",
+	)

 	// Safeguard in case we have requests in flight that need to drain...
 	slog.Info("Waiting for server to shutdown")
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -53,8 +53,8 @@ RestartIfNeededByRun=no
 ; https://jrsoftware.org/ishelp/index.php?topic=setup_wizardimagefile
 WizardSmallImageFile=.\assets\setup.bmp

-; Ollama requires Windows 10 22H2 or newer for proper unicode rendering
-; TODO: consider setting this to 10.0.19045
+; TODO verify actual min windows version...
+; OG Win 10
 MinVersion=10.0.10240

 ; First release that supports WinRT UI Composition for win32 apps
@@ -97,6 +97,7 @@ Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Chec
 Source: "..\dist\windows-arm64\vc_redist.arm64.exe"; DestDir: "{tmp}"; Check: IsArm64() and vc_redist_needed(); Flags: deleteafterinstall
 Source: "..\dist\windows-arm64-app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ;Check: IsArm64();  Flags: ignoreversion 64bit
 Source: "..\dist\windows-arm64\ollama.exe"; DestDir: "{app}"; Check: IsArm64(); Flags: ignoreversion 64bit
+Source: "..\dist\windows-arm64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Check: IsArm64(); Flags: ignoreversion 64bit recursesubdirs
 #endif

 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
@@ -135,7 +136,7 @@ Type: filesandordirs; Name: "{%TEMP}\ollama*"
 Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama"

 [Messages]
-WizardReady=Ollama
+WizardReady=Ollama Windows Preview
 ReadyLabel1=%nLet's get you up and running with your own large language models.
 SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or finish the other installer, then click OK to continue with this install, or Cancel to exit.

--- a/app/store/store.go
+++ b/app/store/store.go
@@ -64,7 +64,7 @@ func initStore() {
 		slog.Debug(fmt.Sprintf("unexpected error searching for store: %s", err))
 	}
 	slog.Debug("initializing new store")
-	store.ID = uuid.NewString()
+	store.ID = uuid.New().String()
 	writeStore(getStorePath())
 }

--- a/app/tray/wintray/eventloop.go
+++ b/app/tray/wintray/eventloop.go
@@ -98,7 +98,7 @@ func (t *winTray) wndProc(hWnd windows.Handle, message uint32, wParam, lParam ui
 		}
 		err = t.wcex.unregister()
 		if err != nil {
-			slog.Error(fmt.Sprintf("failed to unregister window %s", err))
+			slog.Error(fmt.Sprintf("failed to uregister windo %s", err))
 		}
 	case WM_DESTROY:
 		// same as WM_ENDSESSION, but throws 0 exit code after all
--- a/app/tray/wintray/menus.go
+++ b/app/tray/wintray/menus.go
@@ -11,13 +11,12 @@ import (
 )

 const (
-	_ = iota
-	updateAvailableMenuID
-	updateMenuID
-	separatorMenuID
-	diagLogsMenuID
-	diagSeparatorMenuID
-	quitMenuID
+	updateAvailableMenuID = 1
+	updateMenuID          = updateAvailableMenuID + 1
+	separatorMenuID       = updateMenuID + 1
+	diagLogsMenuID        = separatorMenuID + 1
+	diagSeparatorMenuID   = diagLogsMenuID + 1
+	quitMenuID            = diagSeparatorMenuID + 1
 )

 func (t *winTray) initMenus() error {
@@ -39,7 +38,7 @@ func (t *winTray) UpdateAvailable(ver string) error {
 		if err := t.addOrUpdateMenuItem(updateAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
 			return fmt.Errorf("unable to create menu entries %w", err)
 		}
-		if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenuTitle, false); err != nil {
+		if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
 			return fmt.Errorf("unable to create menu entries %w", err)
 		}
 		if err := t.addSeparatorMenuItem(separatorMenuID, 0); err != nil {
--- a/app/tray/wintray/messages.go
+++ b/app/tray/wintray/messages.go
@@ -10,6 +10,6 @@ const (

 	quitMenuTitle            = "Quit Ollama"
 	updateAvailableMenuTitle = "An update is available"
-	updateMenuTitle          = "Restart to update"
+	updateMenutTitle         = "Restart to update"
 	diagLogsMenuTitle        = "View logs"
 )
--- a/app/tray/wintray/tray.go
+++ b/app/tray/wintray/tray.go
@@ -361,7 +361,7 @@ func (t *winTray) showMenu() error {

 	boolRet, _, err = pTrackPopupMenu.Call(
 		uintptr(t.menus[0]),
-		TPM_BOTTOMALIGN|TPM_LEFTALIGN|TPM_RIGHTBUTTON,
+		TPM_BOTTOMALIGN|TPM_LEFTALIGN,
 		uintptr(p.X),
 		uintptr(p.Y),
 		0,
--- a/app/tray/wintray/w32api.go
+++ b/app/tray/wintray/w32api.go
@@ -67,7 +67,6 @@ const (
 	SW_HIDE             = 0
 	TPM_BOTTOMALIGN     = 0x0020
 	TPM_LEFTALIGN       = 0x0000
-	TPM_RIGHTBUTTON     = 0x0002
 	WM_CLOSE            = 0x0010
 	WM_USER             = 0x0400
 	WS_CAPTION          = 0x00C00000
--- a/build/darwin/amd64/placeholder
+++ b/build/darwin/amd64/placeholder
@@ -0,0 +1 @@
+This is here to make sure the build/ directory exists for the go:embed command
--- a/build/darwin/arm64/placeholder
+++ b/build/darwin/arm64/placeholder
@@ -0,0 +1 @@
+This is here to make sure the build/ directory exists for the go:embed command
--- a/build/embed_darwin_amd64.go
+++ b/build/embed_darwin_amd64.go
@@ -0,0 +1,8 @@
+package build
+
+import "embed"
+
+// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
+
+//go:embed darwin/amd64/*
+var EmbedFS embed.FS
--- a/build/embed_darwin_arm64.go
+++ b/build/embed_darwin_arm64.go
@@ -0,0 +1,8 @@
+package build
+
+import "embed"
+
+// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
+
+//go:embed darwin/arm64/*
+var EmbedFS embed.FS
--- a/build/embed_linux.go
+++ b/build/embed_linux.go
@@ -0,0 +1,6 @@
+package build
+
+import "embed"
+
+//go:embed linux/*
+var EmbedFS embed.FS
--- a/build/embed_unused.go
+++ b/build/embed_unused.go
@@ -0,0 +1,8 @@
+//go:build !linux && !darwin
+
+package build
+
+import "embed"
+
+// unused on windows
+var EmbedFS embed.FS
--- a/build/linux/amd64/placeholder
+++ b/build/linux/amd64/placeholder
@@ -0,0 +1 @@
+This is here to make sure the build/ directory exists for the go:embed command
--- a/build/linux/arm64/placeholder
+++ b/build/linux/arm64/placeholder
@@ -0,0 +1 @@
+This is here to make sure the build/ directory exists for the go:embed command
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -8,7 +8,6 @@ import (
 	"crypto/ed25519"
 	"crypto/rand"
 	"crypto/sha256"
-	"encoding/json"
 	"encoding/pem"
 	"errors"
 	"fmt"
@@ -20,6 +19,7 @@ import (
 	"os"
 	"os/signal"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"strconv"
 	"strings"
@@ -35,13 +35,13 @@ import (
 	"golang.org/x/term"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/auth"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/llama/runner"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/server"
+	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )
@@ -456,10 +456,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	if len(prompts) > 0 {
 		interactive = false
 	}
-	// Be quiet if we're redirecting to a pipe or file
-	if !term.IsTerminal(int(os.Stdout.Fd())) {
-		interactive = false
-	}

 	nowrap, err := cmd.Flags().GetBool("nowordwrap")
 	if err != nil {
@@ -516,6 +512,47 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	return generate(cmd, opts)
 }

+func errFromUnknownKey(unknownKeyErr error) error {
+	// find SSH public key in the error message
+	sshKeyPattern := `ssh-\w+ [^\s"]+`
+	re := regexp.MustCompile(sshKeyPattern)
+	matches := re.FindStringSubmatch(unknownKeyErr.Error())
+
+	if len(matches) > 0 {
+		serverPubKey := matches[0]
+
+		localPubKey, err := auth.GetPublicKey()
+		if err != nil {
+			return unknownKeyErr
+		}
+
+		if runtime.GOOS == "linux" && serverPubKey != localPubKey {
+			// try the ollama service public key
+			svcPubKey, err := os.ReadFile("/usr/share/ollama/.ollama/id_ed25519.pub")
+			if err != nil {
+				return unknownKeyErr
+			}
+			localPubKey = strings.TrimSpace(string(svcPubKey))
+		}
+
+		// check if the returned public key matches the local public key; this prevents adding a remote key to the user's account
+		if serverPubKey != localPubKey {
+			return unknownKeyErr
+		}
+
+		var msg strings.Builder
+		msg.WriteString(unknownKeyErr.Error())
+		msg.WriteString("\n\nYour ollama key is:\n")
+		msg.WriteString(localPubKey)
+		msg.WriteString("\nAdd your key at:\n")
+		msg.WriteString("https://ollama.com/settings/keys")
+
+		return errors.New(msg.String())
+	}
+
+	return unknownKeyErr
+}
+
 func PushHandler(cmd *cobra.Command, args []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
@@ -562,8 +599,6 @@ func PushHandler(cmd *cobra.Command, args []string) error {
 	}

 	request := api.PushRequest{Name: args[0], Insecure: insecure}
-
-	n := model.ParseName(args[0])
 	if err := client.Push(cmd.Context(), &request, fn); err != nil {
 		if spinner != nil {
 			spinner.Stop()
@@ -571,19 +606,18 @@ func PushHandler(cmd *cobra.Command, args []string) error {
 		if strings.Contains(err.Error(), "access denied") {
 			return errors.New("you are not authorized to push to this namespace, create the model under a namespace you own")
 		}
+		host := model.ParseName(args[0]).Host
+		isOllamaHost := strings.HasSuffix(host, ".ollama.ai") || strings.HasSuffix(host, ".ollama.com")
+		if strings.Contains(err.Error(), errtypes.UnknownOllamaKeyErrMsg) && isOllamaHost {
+			// the user has not added their ollama key to ollama.com
+			// re-throw an error with a more user-friendly message
+			return errFromUnknownKey(err)
+		}
+
 		return err
 	}

-	p.Stop()
 	spinner.Stop()
-
-	destination := n.String()
-	if strings.HasSuffix(n.Host, ".ollama.ai") || strings.HasSuffix(n.Host, ".ollama.com") {
-		destination = "https://ollama.com/" + strings.TrimSuffix(n.DisplayShortest(), ":latest")
-	}
-	fmt.Printf("\nYou can find your model at:\n\n")
-	fmt.Printf("\t%s\n", destination)
-
 	return nil
 }

@@ -601,7 +635,7 @@ func ListHandler(cmd *cobra.Command, args []string) error {
 	var data [][]string

 	for _, m := range models.Models {
-		if len(args) == 0 || strings.HasPrefix(strings.ToLower(m.Name), strings.ToLower(args[0])) {
+		if len(args) == 0 || strings.HasPrefix(m.Name, args[0]) {
 			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), format.HumanTime(m.ModifiedAt, "Never")})
 		}
 	}
@@ -766,9 +800,9 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		case "parameters":
 			fmt.Println(resp.Parameters)
 		case "system":
-			fmt.Print(resp.System)
+			fmt.Println(resp.System)
 		case "template":
-			fmt.Print(resp.Template)
+			fmt.Println(resp.Template)
 		}

 		return nil
@@ -1038,14 +1072,10 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		return nil
 	}

-	if opts.Format == "json" {
-		opts.Format = `"` + opts.Format + `"`
-	}
-
 	req := &api.ChatRequest{
 		Model:    opts.Model,
 		Messages: opts.Messages,
-		Format:   json.RawMessage(opts.Format),
+		Format:   opts.Format,
 		Options:  opts.Options,
 	}

@@ -1127,16 +1157,12 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		}
 	}

-	if opts.Format == "json" {
-		opts.Format = `"` + opts.Format + `"`
-	}
-
 	request := api.GenerateRequest{
 		Model:     opts.Model,
 		Prompt:    opts.Prompt,
 		Context:   generateContext,
 		Images:    opts.Images,
-		Format:    json.RawMessage(opts.Format),
+		Format:    opts.Format,
 		System:    opts.System,
 		Options:   opts.Options,
 		KeepAlive: opts.KeepAlive,
@@ -1292,7 +1318,7 @@ func NewCLI() *cobra.Command {
 	log.SetFlags(log.LstdFlags | log.Lshortfile)
 	cobra.EnableCommandSorting = false

-	if runtime.GOOS == "windows" && term.IsTerminal(int(os.Stdout.Fd())) {
+	if runtime.GOOS == "windows" {
 		console.ConsoleFromFile(os.Stdin) //nolint:errcheck
 	}

@@ -1422,19 +1448,6 @@ func NewCLI() *cobra.Command {
 		RunE:    DeleteHandler,
 	}

-	runnerCmd := &cobra.Command{
-		Use:    "runner",
-		Short:  llama.PrintSystemInfo(),
-		Hidden: true,
-		RunE: func(cmd *cobra.Command, args []string) error {
-			return runner.Execute(os.Args[1:])
-		},
-		FParseErrWhitelist: cobra.FParseErrWhitelist{UnknownFlags: true},
-	}
-	runnerCmd.SetHelpFunc(func(cmd *cobra.Command, args []string) {
-		_ = runner.Execute(args[1:])
-	})
-
 	envVars := envconfig.AsMap()

 	envs := []envconfig.EnvVar{envVars["OLLAMA_HOST"]}
@@ -1469,7 +1482,6 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_SCHED_SPREAD"],
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
-				envVars["OLLAMA_KV_CACHE_TYPE"],
 				envVars["OLLAMA_LLM_LIBRARY"],
 				envVars["OLLAMA_GPU_OVERHEAD"],
 				envVars["OLLAMA_LOAD_TIMEOUT"],
@@ -1491,7 +1503,6 @@ func NewCLI() *cobra.Command {
 		psCmd,
 		copyCmd,
 		deleteCmd,
-		runnerCmd,
 	)

 	return rootCmd
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -4,10 +4,10 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
-	"io"
 	"net/http"
 	"net/http/httptest"
 	"os"
+	"path/filepath"
 	"strings"
 	"testing"

@@ -179,14 +179,18 @@ Weigh anchor!

 	t.Run("license", func(t *testing.T) {
 		var b bytes.Buffer
-		license := "MIT License\nCopyright (c) Ollama\n"
+		license, err := os.ReadFile(filepath.Join("..", "LICENSE"))
+		if err != nil {
+			t.Fatal(err)
+		}
+
 		if err := showInfo(&api.ShowResponse{
 			Details: api.ModelDetails{
 				Family:            "test",
 				ParameterSize:     "7B",
 				QuantizationLevel: "FP16",
 			},
-			License: license,
+			License: string(license),
 		}, &b); err != nil {
 			t.Fatal(err)
 		}
@@ -365,127 +369,3 @@ func TestGetModelfileName(t *testing.T) {
 		})
 	}
 }
-
-func TestPushHandler(t *testing.T) {
-	tests := []struct {
-		name           string
-		modelName      string
-		serverResponse map[string]func(w http.ResponseWriter, r *http.Request)
-		expectedError  string
-		expectedOutput string
-	}{
-		{
-			name:      "successful push",
-			modelName: "test-model",
-			serverResponse: map[string]func(w http.ResponseWriter, r *http.Request){
-				"/api/push": func(w http.ResponseWriter, r *http.Request) {
-					if r.Method != http.MethodPost {
-						t.Errorf("expected POST request, got %s", r.Method)
-					}
-
-					var req api.PushRequest
-					if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-						http.Error(w, err.Error(), http.StatusBadRequest)
-						return
-					}
-
-					if req.Name != "test-model" {
-						t.Errorf("expected model name 'test-model', got %s", req.Name)
-					}
-
-					// Simulate progress updates
-					responses := []api.ProgressResponse{
-						{Status: "preparing manifest"},
-						{Digest: "sha256:abc123456789", Total: 100, Completed: 50},
-						{Digest: "sha256:abc123456789", Total: 100, Completed: 100},
-					}
-
-					for _, resp := range responses {
-						if err := json.NewEncoder(w).Encode(resp); err != nil {
-							http.Error(w, err.Error(), http.StatusInternalServerError)
-							return
-						}
-						w.(http.Flusher).Flush()
-					}
-				},
-			},
-			expectedOutput: "\nYou can find your model at:\n\n\thttps://ollama.com/test-model\n",
-		},
-		{
-			name:      "unauthorized push",
-			modelName: "unauthorized-model",
-			serverResponse: map[string]func(w http.ResponseWriter, r *http.Request){
-				"/api/push": func(w http.ResponseWriter, r *http.Request) {
-					w.Header().Set("Content-Type", "application/json")
-					w.WriteHeader(http.StatusUnauthorized)
-					err := json.NewEncoder(w).Encode(map[string]string{
-						"error": "access denied",
-					})
-					if err != nil {
-						t.Fatal(err)
-					}
-				},
-			},
-			expectedError: "you are not authorized to push to this namespace, create the model under a namespace you own",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				if handler, ok := tt.serverResponse[r.URL.Path]; ok {
-					handler(w, r)
-					return
-				}
-				http.Error(w, "not found", http.StatusNotFound)
-			}))
-			defer mockServer.Close()
-
-			t.Setenv("OLLAMA_HOST", mockServer.URL)
-
-			cmd := &cobra.Command{}
-			cmd.Flags().Bool("insecure", false, "")
-			cmd.SetContext(context.TODO())
-
-			// Redirect stderr to capture progress output
-			oldStderr := os.Stderr
-			r, w, _ := os.Pipe()
-			os.Stderr = w
-
-			// Capture stdout for the "Model pushed" message
-			oldStdout := os.Stdout
-			outR, outW, _ := os.Pipe()
-			os.Stdout = outW
-
-			err := PushHandler(cmd, []string{tt.modelName})
-
-			// Restore stderr
-			w.Close()
-			os.Stderr = oldStderr
-			// drain the pipe
-			if _, err := io.ReadAll(r); err != nil {
-				t.Fatal(err)
-			}
-
-			// Restore stdout and get output
-			outW.Close()
-			os.Stdout = oldStdout
-			stdout, _ := io.ReadAll(outR)
-
-			if tt.expectedError == "" {
-				if err != nil {
-					t.Errorf("expected no error, got %v", err)
-				}
-				if tt.expectedOutput != "" {
-					if got := string(stdout); got != tt.expectedOutput {
-						t.Errorf("expected output %q, got %q", tt.expectedOutput, got)
-					}
-				}
-			} else {
-				if err == nil || !strings.Contains(err.Error(), tt.expectedError) {
-					t.Errorf("expected error containing %q, got %v", tt.expectedError, err)
-				}
-			}
-		})
-	}
-}
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -319,6 +319,8 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 						opts.Messages = append(opts.Messages, newMessage)
 					}
 					fmt.Println("Set system message.")
+					sb.Reset()
+
 					sb.Reset()
 					continue
 				default:
@@ -514,7 +516,7 @@ func extractFileNames(input string) []string {
 	// Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
 	// and followed by more characters and a file extension
 	// This will capture non filename strings, but we'll check for file existence to remove mismatches
-	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
+	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`
 	re := regexp.MustCompile(regexPattern)

 	return re.FindAllString(input, -1)
--- a/cmd/interactive_test.go
+++ b/cmd/interactive_test.go
@@ -12,45 +12,44 @@ import (
 func TestExtractFilenames(t *testing.T) {
 	// Unix style paths
 	input := ` some preamble 
- ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg
-/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG`
+ ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2
+/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.svg`
 	res := extractFileNames(input)
 	assert.Len(t, res, 5)
 	assert.Contains(t, res[0], "one.png")
 	assert.Contains(t, res[1], "two.jpg")
 	assert.Contains(t, res[2], "three.jpeg")
 	assert.Contains(t, res[3], "four.png")
-	assert.Contains(t, res[4], "five.JPG")
+	assert.Contains(t, res[4], "five.svg")
 	assert.NotContains(t, res[4], '"')
-	assert.NotContains(t, res, "inbetween1")
-	assert.NotContains(t, res, "./1.svg")
+	assert.NotContains(t, res, "inbtween")

 	// Windows style paths
 	input = ` some preamble
 c:/users/jdoe/one.png inbetween1 c:/program files/someplace/two.jpg inbetween2 
 /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
-./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6
-d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8 
- d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending
+./relative\ path/five.svg inbetween5 "./relative with/spaces/six.png inbetween6
+d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8 
+ d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.svg some ending
 `
 	res = extractFileNames(input)
 	assert.Len(t, res, 10)
-	assert.NotContains(t, res, "inbetween2")
+	assert.NotContains(t, res, "inbtween")
 	assert.Contains(t, res[0], "one.png")
 	assert.Contains(t, res[0], "c:")
 	assert.Contains(t, res[1], "two.jpg")
 	assert.Contains(t, res[1], "c:")
 	assert.Contains(t, res[2], "three.jpeg")
 	assert.Contains(t, res[3], "four.png")
-	assert.Contains(t, res[4], "five.JPG")
+	assert.Contains(t, res[4], "five.svg")
 	assert.Contains(t, res[5], "six.png")
-	assert.Contains(t, res[6], "seven.JPEG")
+	assert.Contains(t, res[6], "seven.svg")
 	assert.Contains(t, res[6], "d:")
 	assert.Contains(t, res[7], "eight.png")
 	assert.Contains(t, res[7], "c:")
 	assert.Contains(t, res[8], "nine.png")
 	assert.Contains(t, res[8], "d:")
-	assert.Contains(t, res[9], "ten.PNG")
+	assert.Contains(t, res[9], "ten.svg")
 	assert.Contains(t, res[9], "E:")
 }

--- a/cmd/runner/main.go
+++ b/cmd/runner/main.go
@@ -1,15 +0,0 @@
-package main
-
-import (
-	"fmt"
-	"os"
-
-	"github.com/ollama/ollama/llama/runner"
-)
-
-func main() {
-	if err := runner.Execute(os.Args[1:]); err != nil {
-		fmt.Fprintf(os.Stderr, "error: %s\n", err)
-		os.Exit(1)
-	}
-}
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -9,7 +9,7 @@ import (
 	"log/slog"
 	"strings"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )

 type ModelParameters struct {
@@ -27,8 +27,8 @@ type AdapterParameters struct {
 	} `json:"lora_parameters"`
 }

-func (ModelParameters) KV(t *Tokenizer) llm.KV {
-	kv := llm.KV{
+func (ModelParameters) KV(t *Tokenizer) fileutils.KV {
+	kv := fileutils.KV{
 		"general.file_type":            uint32(1),
 		"general.quantization_version": uint32(2),
 		"tokenizer.ggml.pre":           t.Pre,
@@ -54,7 +54,7 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p AdapterParameters) KV() llm.KV {
+func (p AdapterParameters) KV() fileutils.KV {
 	var alpha float32
 	if p.LoraParameters.Alpha == 0 {
 		alpha = float32(p.Alpha)
@@ -62,7 +62,7 @@ func (p AdapterParameters) KV() llm.KV {
 		alpha = p.LoraParameters.Alpha
 	}

-	kv := llm.KV{
+	kv := fileutils.KV{
 		"adapter.lora.alpha": alpha,
 		"adapter.type":       "lora",
 		"general.file_type":  uint32(1),
@@ -79,19 +79,19 @@ func (ModelParameters) specialTokenTypes() []string {
 	}
 }

-func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
-	return llm.WriteGGUF(ws, kv, ts)
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv fileutils.KV, ts []fileutils.Tensor) error {
+	return fileutils.WriteGGUF(ws, kv, ts)
 }

-func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
-	return llm.WriteGGUF(ws, kv, ts)
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv fileutils.KV, ts []fileutils.Tensor) error {
+	return fileutils.WriteGGUF(ws, kv, ts)
 }

 type ModelConverter interface {
 	// KV maps parameters to LLM key-values
-	KV(*Tokenizer) llm.KV
+	KV(*Tokenizer) fileutils.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
-	Tensors([]Tensor) []llm.Tensor
+	Tensors([]Tensor) []fileutils.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
@@ -99,7 +99,7 @@ type ModelConverter interface {
 	// specialTokenTypes returns any special token types the model uses
 	specialTokenTypes() []string
 	// writeFile writes the model to the provided io.WriteSeeker
-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+	writeFile(io.WriteSeeker, fileutils.KV, []fileutils.Tensor) error
 }

 type moreParser interface {
@@ -108,17 +108,17 @@ type moreParser interface {

 type AdapterConverter interface {
 	// KV maps parameters to LLM key-values
-	KV(llm.KV) llm.KV
+	KV(fileutils.KV) fileutils.KV
 	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-	Tensors([]Tensor) []llm.Tensor
+	Tensors([]Tensor) []fileutils.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string

-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+	writeFile(io.WriteSeeker, fileutils.KV, []fileutils.Tensor) error
 }

-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV fileutils.KV) error {
 	bts, err := fs.ReadFile(fsys, "adapter_config.json")
 	if err != nil {
 		return err
--- a/convert/convert_bert.go
+++ b/convert/convert_bert.go
@@ -8,7 +8,7 @@ import (
 	"slices"
 	"strings"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )

 type bertModel struct {
@@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
 	return nil
 }

-func (p *bertModel) KV(t *Tokenizer) llm.KV {
+func (p *bertModel) KV(t *Tokenizer) fileutils.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "bert"
 	kv["bert.attention.causal"] = false
@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *bertModel) Tensors(ts []Tensor) []fileutils.Tensor {
+	var out []fileutils.Tensor
 	for _, t := range ts {
 		if slices.Contains([]string{
 			"embeddings.position_ids",
@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
 			continue
 		}

-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_gemma.go
+++ b/convert/convert_gemma.go
@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )

 type gemmaModel struct {
@@ -23,7 +23,7 @@ type gemmaModel struct {

 var _ ModelConverter = (*gemmaModel)(nil)

-func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
+func (p *gemmaModel) KV(t *Tokenizer) fileutils.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma"
 	kv["gemma.context_length"] = p.MaxPositionEmbeddings
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *gemmaModel) Tensors(ts []Tensor) []fileutils.Tensor {
+	var out []fileutils.Tensor
 	for _, t := range ts {
 		if strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}

-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_gemma2.go
+++ b/convert/convert_gemma2.go
@@ -1,7 +1,7 @@
 package convert

 import (
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )

 type gemma2Model struct {
@@ -11,7 +11,7 @@ type gemma2Model struct {
 	FinalLogitSoftcap     float32 `json:"final_logit_softcapping"`
 }

-func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
+func (p *gemma2Model) KV(t *Tokenizer) fileutils.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma2"
 	kv["gemma2.context_length"] = p.MaxPositionEmbeddings
--- a/convert/convert_gemma2_adapter.go
+++ b/convert/convert_gemma2_adapter.go
@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )

 type gemma2Adapter struct {
@@ -15,14 +15,14 @@ type gemma2Adapter struct {

 var _ AdapterConverter = (*gemma2Adapter)(nil)

-func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
+func (p *gemma2Adapter) KV(baseKV fileutils.KV) fileutils.KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "gemma2"
 	return kv
 }

-func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *gemma2Adapter) Tensors(ts []Tensor) []fileutils.Tensor {
+	var out []fileutils.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@@ -9,7 +9,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )

 type llamaModel struct {
@@ -46,7 +46,7 @@ type llamaModel struct {

 var _ ModelConverter = (*llamaModel)(nil)

-func (p *llamaModel) KV(t *Tokenizer) llm.KV {
+func (p *llamaModel) KV(t *Tokenizer) fileutils.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "llama"
 	kv["llama.vocab_size"] = p.VocabSize
@@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *llamaModel) Tensors(ts []Tensor) []fileutils.Tensor {
+	var out []fileutils.Tensor

 	if p.RopeScaling.factors != nil {
-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     "rope_freqs.weight",
 			Kind:     0,
 			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
@@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_llama_adapter.go
+++ b/convert/convert_llama_adapter.go
@@ -7,7 +7,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )

 type llamaAdapter struct {
@@ -18,7 +18,7 @@ type llamaAdapter struct {

 var _ AdapterConverter = (*llamaAdapter)(nil)

-func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
+func (p *llamaAdapter) KV(baseKV fileutils.KV) fileutils.KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "llama"
 	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
 	return kv
 }

-func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *llamaAdapter) Tensors(ts []Tensor) []fileutils.Tensor {
+	var out []fileutils.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
 			t.SetRepacker(p.repack)
 		}

-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    shape,
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@@ -6,7 +6,7 @@ import (
 	"slices"
 	"strings"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )

 type mixtralModel struct {
@@ -15,7 +15,7 @@ type mixtralModel struct {
 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }

-func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
+func (p *mixtralModel) KV(t *Tokenizer) fileutils.KV {
 	kv := p.llamaModel.KV(t)

 	if p.NumLocalExperts > 0 {
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []fileutils.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 		return true
 	})

-	var out []llm.Tensor
+	var out []fileutils.Tensor
 	for n, e := range experts {
 		// TODO(mxyng): sanity check experts
-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     n,
 			Kind:     e[0].Kind(),
 			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),
--- a/convert/convert_phi3.go
+++ b/convert/convert_phi3.go
@@ -8,7 +8,7 @@ import (
 	"strings"
 	"sync"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )

 type phi3Model struct {
@@ -37,7 +37,7 @@ type phi3Model struct {

 var _ ModelConverter = (*phi3Model)(nil)

-func (p *phi3Model) KV(t *Tokenizer) llm.KV {
+func (p *phi3Model) KV(t *Tokenizer) fileutils.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "phi3"
 	kv["phi3.context_length"] = p.MaxPositionEmbeddings
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []fileutils.Tensor {
 	var addRopeFactors sync.Once

-	out := make([]llm.Tensor, 0, len(ts)+2)
+	out := make([]fileutils.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
 		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
-				out = append(out, llm.Tensor{
+				out = append(out, fileutils.Tensor{
 					Name:     "rope_factors_long.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
 					WriterTo: p.RopeScaling.LongFactor,
-				}, llm.Tensor{
+				}, fileutils.Tensor{
 					Name:     "rope_factors_short.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
 			})
 		}

-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -20,7 +20,7 @@ import (

 	"golang.org/x/exp/maps"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )

 type tensorData struct {
@@ -29,7 +29,7 @@ type tensorData struct {
 	Shape   []int  `json:"shape"`
 }

-func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
+func convertFull(t *testing.T, fsys fs.FS) (*os.File, fileutils.KV, *fileutils.Tensors) {
 	t.Helper()

 	f, err := os.CreateTemp(t.TempDir(), "f16")
@@ -48,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 	}
 	t.Cleanup(func() { r.Close() })

-	m, _, err := llm.DecodeGGML(r, math.MaxInt)
+	m, _, err := fileutils.DecodeGGML(r, math.MaxInt)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -60,7 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 	return r, m.KV(), m.Tensors()
 }

-func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string {
+func generateResultsJSON(t *testing.T, f *os.File, kv fileutils.KV, tensors *fileutils.Tensors) map[string]string {
 	actual := make(map[string]string)
 	for k, v := range kv {
 		if s, ok := v.(json.Marshaler); !ok {
@@ -330,7 +330,7 @@ func TestConvertAdapter(t *testing.T) {
 			}
 			defer r.Close()

-			m, _, err := llm.DecodeGGML(r, math.MaxInt)
+			m, _, err := fileutils.DecodeGGML(r, math.MaxInt)
 			if err != nil {
 				t.Fatal(err)
 			}
--- a/convert/sentencepiece/sentencepiece_model.pb.go
+++ b/convert/sentencepiece/sentencepiece_model.pb.go
@@ -331,7 +331,7 @@ type TrainerSpec struct {
 	// Reserved special meta tokens.
 	// * -1 is not used.
 	// * unk_id must not be -1.
-	// Id must start with 0 and be contiguous.
+	// Id must start with 0 and be contiguous.
 	UnkId    *int32  `protobuf:"varint,40,opt,name=unk_id,json=unkId,def=0" json:"unk_id,omitempty"`  // <unk>
 	BosId    *int32  `protobuf:"varint,41,opt,name=bos_id,json=bosId,def=1" json:"bos_id,omitempty"`  // <s>
 	EosId    *int32  `protobuf:"varint,42,opt,name=eos_id,json=eosId,def=2" json:"eos_id,omitempty"`  // </s>
--- a/convert/sentencepiece_model.proto
+++ b/convert/sentencepiece_model.proto
@@ -213,7 +213,7 @@ message TrainerSpec {
  // Reserved special meta tokens.
  // * -1 is not used.
  // * unk_id must not be -1.
-  // Id must start with 0 and be contiguous.
+  // Id must start with 0 and be contiguous.
  optional int32 unk_id = 40 [default = 0];   // <unk>
  optional int32 bos_id = 41 [default = 1];   // <s>
  optional int32 eos_id = 42 [default = 2];   // </s>
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -10,7 +10,6 @@ import (
 	"log/slog"
 	"os"
 	"slices"
-	"strings"

 	"golang.org/x/exp/maps"
 )
@@ -61,25 +60,7 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 			addedTokens[t.Content] = t
 		}

-		if len(tt.Model.Merges) == 0 {
-			// noop; merges is empty
-		} else if err := json.Unmarshal(tt.Model.Merges, &t.Merges); err == nil {
-			// noop; merges is []string
-		} else if merges, err := func() ([][]string, error) {
-			var merges [][]string
-			if err := json.Unmarshal(tt.Model.Merges, &merges); err != nil {
-				return nil, err
-			}
-
-			return merges, nil
-		}(); err == nil {
-			t.Merges = make([]string, len(merges))
-			for i := range merges {
-				t.Merges[i] = strings.Join(merges[i], " ")
-			}
-		} else {
-			return nil, fmt.Errorf("could not parse tokenizer merges. expected []string or [][]string: %w", err)
-		}
+		t.Merges = tt.Model.Merges

 		sha256sum := sha256.New()
 		for _, pt := range tt.PreTokenizer.PreTokenizers {
@@ -175,9 +156,9 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 type tokenizer struct {
 	AddedTokens []token `json:"added_tokens"`
 	Model       struct {
-		Type   string          `json:"type"`
-		Vocab  map[string]int  `json:"vocab"`
-		Merges json.RawMessage `json:"merges"`
+		Type   string         `json:"type"`
+		Vocab  map[string]int `json:"vocab"`
+		Merges []string       `json:"merges"`
 	} `json:"model"`

 	PreTokenizer struct {
--- a/convert/tokenizer_test.go
+++ b/convert/tokenizer_test.go
@@ -191,62 +191,6 @@ func TestParseTokenizer(t *testing.T) {
 				Pre: "default",
 			},
 		},
-		{
-			name: "list string merges",
-			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
-				"tokenizer.json": strings.NewReader(`{
-					"model": {
-						"merges": [
-							"a b",
-							"c d",
-							"e f"
-						]
-					}
-				}`),
-			}),
-			want: &Tokenizer{
-				Vocabulary: &Vocabulary{
-					Model: "gpt2",
-				},
-				Merges: []string{
-					"a b",
-					"c d",
-					"e f",
-				},
-				Pre: "default",
-			},
-		},
-		{
-			name: "list list string merges",
-			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
-				"tokenizer.json": strings.NewReader(`{
-					"model": {
-						"merges": [
-							[
-								"a", "b"
-							],
-							[
-								"c", "d"
-							],
-							[
-								"e", "f"
-							]
-						]
-					}
-				}`),
-			}),
-			want: &Tokenizer{
-				Vocabulary: &Vocabulary{
-					Model: "gpt2",
-				},
-				Merges: []string{
-					"a b",
-					"c d",
-					"e f",
-				},
-				Pre: "default",
-			},
-		},
 	}

 	for _, tt := range cases {
--- a/discover/README.md
+++ b/discover/README.md
@@ -0,0 +1,3 @@
+# `discover`
+
+This package is responsible for discovering information about the system and its capability to run LLMs. This includes GPU and CPU discovery so that the optimal runner can be chosen for a given model. The ollama scheduler relies on up-to-date information about available memory, so this package also provides a way to refresh free-memory figures as efficiently as possible.
--- a/discover/amd_common.go
+++ b/discover/amd_common.go
@@ -37,6 +37,19 @@ func GetSupportedGFX(libDir string) ([]string, error) {
 	return ret, nil
 }

+func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
+	ids := []string{}
+	for _, info := range gpuInfo {
+		if info.Library != "rocm" {
+			// TODO shouldn't happen if things are wired correctly; non-rocm entries are logged and left out of the returned ID list
+			slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
+			continue
+		}
+		ids = append(ids, info.ID)
+	}
+	return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",") // (env var name, comma-separated rocm device IDs)
+}
+
 func commonAMDValidateLibDir() (string, error) {
 	// Favor our bundled version

--- a/discover/amd_hip_windows.go
+++ b/discover/amd_hip_windows.go
@@ -64,7 +64,7 @@ func NewHipLib() (*HipLib, error) {
 	return hl, nil
 }

-// The hip library only evaluates the ROCR_VISIBLE_DEVICES variable at startup
+// The hip library only evaluates the HIP_VISIBLE_DEVICES variable at startup
 // so we have to unload/reset the library after we do our initial discovery
 // to make sure our updates to that variable are processed by llama.cpp
 func (hl *HipLib) Release() {
--- a/discover/amd_linux.go
+++ b/discover/amd_linux.go
@@ -64,20 +64,22 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
 	var visibleDevices []string
 	hipVD := envconfig.HipVisibleDevices()   // zero based index only
-	rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID
+	rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID, but consumer cards seem to not support UUID
 	gpuDO := envconfig.GpuDeviceOrdinal()    // zero based index
 	switch {
-	case rocrVD != "":
-		visibleDevices = strings.Split(rocrVD, ",")
+	// TODO is this priority order right?
 	case hipVD != "":
 		visibleDevices = strings.Split(hipVD, ",")
+	case rocrVD != "":
+		visibleDevices = strings.Split(rocrVD, ",")
+		// TODO - since we don't yet support UUIDs, consider detecting and reporting here
+		// all our test systems show GPU-XX indicating UUID is not supported
 	case gpuDO != "":
 		visibleDevices = strings.Split(gpuDO, ",")
 	}

 	gfxOverride := envconfig.HsaOverrideGfxVersion()
 	var supported []string
-	depPaths := LibraryDirs()
 	libDir := ""

 	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
@@ -97,7 +99,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		}
 		return a < b
 	})
-	gpuCount := 0
+	cpuCount := 0
 	for _, match := range matches {
 		slog.Debug("evaluating amdgpu node " + match)
 		fp, err := os.Open(match)
@@ -106,6 +108,11 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			continue
 		}
 		defer fp.Close()
+		nodeID, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
+		if err != nil {
+			slog.Debug("failed to parse node ID", "error", err)
+			continue
+		}

 		scanner := bufio.NewScanner(fp)
 		isCPU := false
@@ -179,18 +186,19 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		// do reliably report VRAM usage.

 		if isCPU {
+			cpuCount++
 			continue
 		}

-		// Skip over any GPUs that are masked
-		if major == 0 && minor == 0 && patch == 0 {
-			slog.Debug("skipping gpu with gfx000")
-			continue
-		}
+		// CPUs are always first in the list
+		gpuID := nodeID - cpuCount

-		// Keep track of numeric IDs based on valid GPUs
-		gpuID := gpuCount
-		gpuCount += 1
+		// Shouldn't happen, but just in case...
+		if gpuID < 0 {
+			err := fmt.Errorf("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
+			slog.Error(err.Error())
+			return nil, err
+		}

 		// Look up the memory for the current node
 		totalMemory := uint64(0)
@@ -265,14 +273,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			name = fmt.Sprintf("%04x:%04x", vendor, device)
 		}

-		// Favor UUIDs if available to reduce possibility of getting the numeric IDs wrong
-		var ID string
-		if uniqueID != 0 {
-			ID = fmt.Sprintf("GPU-%016x", uniqueID)
-		} else {
-			ID = strconv.Itoa(gpuID)
-		}
-
 		gpuInfo := RocmGPUInfo{
 			GpuInfo: GpuInfo{
 				Library: "rocm",
@@ -280,7 +280,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 					TotalMemory: totalMemory,
 					FreeMemory:  (totalMemory - usedMemory),
 				},
-				ID:            ID,
+				ID:            strconv.Itoa(gpuID),
 				Name:          name,
 				Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
 				MinimumMemory: rocmMinimumMemory,
@@ -288,7 +288,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				DriverMinor:   driverMinor,
 			},
 			usedFilepath: usedFile,
-			index:        gpuID,
 		}

 		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
@@ -301,11 +300,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			})
 			continue
 		}
-		minVer, err := strconv.Atoi(RocmComputeMajorMin)
-		if err != nil {
-			slog.Error("invalid RocmComputeMajorMin setting", "value", RocmComputeMajorMin, "error", err)
-		}
-		if int(major) < minVer {
+
+		if int(major) < RocmComputeMin {
 			reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch)
 			slog.Warn(reason, "gpu", gpuID)
 			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
@@ -323,7 +319,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		if len(visibleDevices) > 0 {
 			include := false
 			for _, visible := range visibleDevices {
-				if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) {
+				if visible == gpuInfo.ID {
 					include = true
 					break
 				}
@@ -353,9 +349,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				})
 				return nil, err
 			}
-			depPaths = append(depPaths, libDir)
 		}
-		gpuInfo.DependencyPath = depPaths
+		gpuInfo.DependencyPath = libDir

 		if gfxOverride == "" {
 			// Only load supported list once
@@ -521,20 +516,3 @@ func verifyKFDDriverAccess() error {
 	fd.Close()
 	return nil
 }
-
-func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
-	ids := []string{}
-	for _, info := range gpuInfo {
-		if info.Library != "rocm" {
-			// TODO shouldn't happen if things are wired correctly...
-			slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
-			continue
-		}
-		ids = append(ids, info.ID)
-	}
-	// There are 3 potential env vars to use to select GPUs.
-	// ROCR_VISIBLE_DEVICES supports UUID or numeric so is our preferred on linux
-	// GPU_DEVICE_ORDINAL supports numeric IDs only
-	// HIP_VISIBLE_DEVICES supports numeric IDs only
-	return "ROCR_VISIBLE_DEVICES", strings.Join(ids, ",")
-}
--- a/discover/amd_windows.go
+++ b/discover/amd_windows.go
@@ -43,21 +43,19 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		slog.Debug("error looking up amd driver version", "error", err)
 	}

-	// Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
+	// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
 	count := hl.HipGetDeviceCount()
 	if count == 0 {
 		err := fmt.Errorf("no compatible amdgpu devices detected")
 		slog.Info(err.Error())
 		return nil, err
 	}
-	depPaths := LibraryDirs()
 	libDir, err := AMDValidateLibDir()
 	if err != nil {
 		err = fmt.Errorf("unable to verify rocm library: %w", err)
 		slog.Warn(err.Error())
 		return nil, err
 	}
-	depPaths = append(depPaths, libDir)

 	var supported []string
 	gfxOverride := envconfig.HsaOverrideGfxVersion()
@@ -113,7 +111,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				UnreliableFreeMemory: true,

 				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
-				DependencyPath: depPaths,
+				DependencyPath: libDir,
 				MinimumMemory:  rocmMinimumMemory,
 				Name:           name,
 				Compute:        gfx,
@@ -184,7 +182,7 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
 	hl, err := NewHipLib()
 	if err != nil {
 		slog.Debug(err.Error())
-		return err
+		return nil
 	}
 	defer hl.Release()

@@ -203,20 +201,3 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
 	}
 	return nil
 }
-
-func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
-	ids := []string{}
-	for _, info := range gpuInfo {
-		if info.Library != "rocm" {
-			// TODO shouldn't happen if things are wired correctly...
-			slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
-			continue
-		}
-		ids = append(ids, info.ID)
-	}
-	// There are 3 potential env vars to use to select GPUs.
-	// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
-	// HIP_VISIBLE_DEVICES supports numeric IDs only
-	// GPU_DEVICE_ORDINAL supports numeric IDs only
-	return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
-}
--- a/discover/cpu_common.go
+++ b/discover/cpu_common.go
@@ -5,8 +5,21 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
+
+	"golang.org/x/sys/cpu"
 )

+func GetCPUCapability() CPUCapability {
+	if cpu.X86.HasAVX2 { // AVX2 is checked before AVX so the higher capability is reported first
+		return CPUCapabilityAVX2
+	}
+	if cpu.X86.HasAVX {
+		return CPUCapabilityAVX
+	}
+	// else LCD (presumably "lowest common denominator"): neither AVX2 nor AVX was detected
+	return CPUCapabilityNone
+}
+
 func IsNUMA() bool {
 	if runtime.GOOS != "linux" {
 		// numa support in llama.cpp is linux only
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -16,14 +16,12 @@ import (
 	"os"
 	"path/filepath"
 	"runtime"
-	"strconv"
 	"strings"
 	"sync"
 	"unsafe"

 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/runners"
 )

 type cudaHandles struct {
@@ -47,6 +45,7 @@ const (
 var (
 	gpuMutex      sync.Mutex
 	bootstrapped  bool
+	cpuCapability CPUCapability
 	cpus          []CPUInfo
 	cudaGPUs      []CudaGPUInfo
 	nvcudaLibPath string
@@ -65,13 +64,9 @@ var (
 )

 // With our current CUDA compile flags, older than 5.0 will not work properly
-// (string values used to allow ldflags overrides at build time)
-var (
-	CudaComputeMajorMin = "5"
-	CudaComputeMinorMin = "0"
-)
+var CudaComputeMin = [2]C.int{5, 0}

-var RocmComputeMajorMin = "9"
+var RocmComputeMin = 9

 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
@@ -106,9 +101,9 @@ func initCudaHandles() *cudaHandles {
 		localAppData := os.Getenv("LOCALAPPDATA")
 		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
 	}
-	libDirs := LibraryDirs()
-	for _, d := range libDirs {
-		cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(d, CudartMgmtName))
+	libDir := LibraryDir()
+	if libDir != "" {
+		cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)}
 	}
 	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)

@@ -224,23 +219,16 @@ func GetGPUInfo() GpuInfoList {

 	if !bootstrapped {
 		slog.Info("looking for compatible GPUs")
-		cudaComputeMajorMin, err := strconv.Atoi(CudaComputeMajorMin)
-		if err != nil {
-			slog.Error("invalid CudaComputeMajorMin setting", "value", CudaComputeMajorMin, "error", err)
-		}
-		cudaComputeMinorMin, err := strconv.Atoi(CudaComputeMinorMin)
-		if err != nil {
-			slog.Error("invalid CudaComputeMinorMin setting", "value", CudaComputeMinorMin, "error", err)
-		}
 		bootstrapErrors = []error{}
 		needRefresh = false
+		cpuCapability = GetCPUCapability()
 		var memInfo C.mem_info_t

 		mem, err := GetCPUMem()
 		if err != nil {
 			slog.Warn("error looking up system memory", "error", err)
 		}
-		depPaths := LibraryDirs()
+		depPath := LibraryDir()
 		details, err := GetCPUDetails()
 		if err != nil {
 			slog.Warn("failed to lookup CPU details", "error", err)
@@ -250,14 +238,24 @@ func GetGPUInfo() GpuInfoList {
 				GpuInfo: GpuInfo{
 					memInfo:        mem,
 					Library:        "cpu",
-					Variant:        runners.GetCPUCapability().String(),
+					Variant:        cpuCapability.String(),
 					ID:             "0",
-					DependencyPath: depPaths,
+					DependencyPath: depPath,
 				},
 				CPUs: details,
 			},
 		}

+		// Fallback to CPU mode if we're lacking required vector extensions on x86
+		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
+			err := fmt.Errorf("CPU does not have minimum vector extensions, GPU inference disabled.  Required:%s  Detected:%s", GPURunnerCPUCapability, cpuCapability)
+			slog.Warn(err.Error())
+			bootstrapErrors = append(bootstrapErrors, err)
+			bootstrapped = true
+			// No need to do any GPU discovery, since we can't run on them
+			return GpuInfoList{cpus[0].GpuInfo}
+		}
+
 		// Load ALL libraries
 		cHandles = initCudaHandles()

@@ -294,23 +292,19 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
 				variant := cudaVariant(gpuInfo)
-				if depPaths != nil {
-					gpuInfo.DependencyPath = depPaths
+				if depPath != "" {
+					gpuInfo.DependencyPath = depPath
 					// Check for variant specific directory
 					if variant != "" {
-						for _, d := range depPaths {
-							if _, err := os.Stat(filepath.Join(d, "cuda_"+variant)); err == nil {
-								// Put the variant directory first in the search path to avoid runtime linking to the wrong library
-								gpuInfo.DependencyPath = append([]string{filepath.Join(d, "cuda_"+variant)}, gpuInfo.DependencyPath...)
-								break
-							}
+						if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
+							gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
 						}
 					}
 				}
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 				gpuInfo.Variant = variant

-				if int(memInfo.major) < cudaComputeMajorMin || (int(memInfo.major) == cudaComputeMajorMin && int(memInfo.minor) < cudaComputeMinorMin) {
+				if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
 					unsupportedGPUs = append(unsupportedGPUs,
 						UnsupportedGPUInfo{
 							GpuInfo: gpuInfo.GpuInfo,
@@ -322,9 +316,7 @@ func GetGPUInfo() GpuInfoList {
 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates
 				if cHandles.nvml != nil {
-					uuid := C.CString(gpuInfo.ID)
-					defer C.free(unsafe.Pointer(uuid))
-					C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
+					C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used)
 					if memInfo.err != nil {
 						slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 						C.free(unsafe.Pointer(memInfo.err))
@@ -376,7 +368,7 @@ func GetGPUInfo() GpuInfoList {
 						gpuInfo.FreeMemory = uint64(memInfo.free)
 						gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 						gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-						gpuInfo.DependencyPath = depPaths
+						gpuInfo.DependencyPath = depPath
 						oneapiGPUs = append(oneapiGPUs, gpuInfo)
 					}
 				}
@@ -391,8 +383,6 @@ func GetGPUInfo() GpuInfoList {
 		if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
 			slog.Info("no compatible GPUs were discovered")
 		}
-
-		// TODO verify we have runners for the discovered GPUs, filter out any that aren't supported with good error messages
 	}

 	// For detected GPUs, load library if not loaded
@@ -427,9 +417,7 @@ func GetGPUInfo() GpuInfoList {
 		}
 		for i, gpu := range cudaGPUs {
 			if cHandles.nvml != nil {
-				uuid := C.CString(gpu.ID)
-				defer C.free(unsafe.Pointer(uuid))
-				C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
+				C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
 			} else if cHandles.cudart != nil {
 				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
 			} else if cHandles.nvcuda != nil {
@@ -517,10 +505,7 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	slog.Debug("Searching for GPU library", "name", baseLibName)

 	// Start with our bundled libraries
-	patterns := []string{}
-	for _, d := range LibraryDirs() {
-		patterns = append(patterns, filepath.Join(d, baseLibName))
-	}
+	patterns := []string{filepath.Join(LibraryDir(), baseLibName)}

 	switch runtime.GOOS {
 	case "windows":
@@ -716,26 +701,32 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 	}
 }

-func LibraryDirs() []string {
-	// dependencies can exist wherever we found the runners (e.g. build tree for developers) and relative to the executable
-	// This can be simplified once we no longer carry runners as payloads
-	paths := []string{}
+func LibraryDir() string {
+	// On Windows/linux we bundle the dependencies at the same level as the executable
 	appExe, err := os.Executable()
 	if err != nil {
 		slog.Warn("failed to lookup executable path", "error", err)
-	} else {
-		appRelative := filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe(), "lib", "ollama")
-		if _, err := os.Stat(appRelative); err == nil {
-			paths = append(paths, appRelative)
+	}
+	cwd, err := os.Getwd()
+	if err != nil {
+		slog.Warn("failed to lookup working directory", "error", err)
+	}
+	// Scan for any of our dependencies, and pick first match
+	for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe()), cwd} {
+		libDep := filepath.Join("lib", "ollama")
+		if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
+			return filepath.Join(root, libDep)
+		}
+		// Developer mode, local build
+		if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)
+		}
+		if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)
 		}
 	}
-	rDir := runners.Locate()
-	if err != nil {
-		slog.Warn("unable to locate gpu dependency libraries", "error", err)
-	} else {
-		paths = append(paths, filepath.Dir(rDir))
-	}
-	return paths
+	slog.Warn("unable to locate gpu dependency libraries")
+	return ""
 }

 func GetSystemInfo() SystemInfo {
--- a/discover/gpu_darwin.go
+++ b/discover/gpu_darwin.go
@@ -15,7 +15,6 @@ import (
 	"syscall"

 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/runners"
 )

 const (
@@ -28,7 +27,7 @@ func GetGPUInfo() GpuInfoList {
 		return []GpuInfo{
 			{
 				Library: "cpu",
-				Variant: runners.GetCPUCapability().String(),
+				Variant: GetCPUCapability().String(),
 				memInfo: mem,
 			},
 		}
@@ -51,7 +50,7 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: runners.GetCPUCapability().String(),
+			Variant: GetCPUCapability().String(),
 			memInfo: mem,
 		},
 	}
--- a/discover/gpu_info_nvcuda.c
+++ b/discover/gpu_info_nvcuda.c
@@ -4,7 +4,6 @@
 #include "gpu_info_nvcuda.h"

 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
-  LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);
  CUresult ret;
  resp->err = NULL;
  resp->num_devices = 0;
@@ -58,10 +57,8 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
      resp->cudaErr = -1;
      return;
    }
-    LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
  }

-  LOG(resp->ch.verbose, "calling cuInit\n");
  ret = (*resp->ch.cuInit)(0);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
@@ -78,18 +75,15 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
  resp->ch.driver_minor = 0;

  // Report driver version if we're in verbose mode, ignore errors
-  LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
  ret = (*resp->ch.cuDriverGetVersion)(&version);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
  } else {
-    LOG(resp->ch.verbose, "raw version 0x%x\n", version);
    resp->ch.driver_major = version / 1000;
    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
    LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
  }

-  LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
  ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
@@ -100,7 +94,6 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
    resp->cudaErr = ret;
    return;
  }
-  LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
 }

 const int buflen = 256;
--- a/discover/gpu_info_nvml.c
+++ b/discover/gpu_info_nvml.c
@@ -17,7 +17,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
  } l[] = {
      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
-      {"nvmlDeviceGetHandleByUUID", (void *)&resp->ch.nvmlDeviceGetHandleByUUID},
+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
      {NULL, NULL},
  };
@@ -67,20 +67,20 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
 }


-void nvml_get_free(nvml_handle_t h, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used) {
+void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
    nvmlDevice_t device;
    nvmlMemory_t memInfo = {0};
    nvmlReturn_t ret;
-    ret = (*h.nvmlDeviceGetHandleByUUID)((const char *)(uuid), &device);
+    ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
    if (ret != NVML_SUCCESS) {
-        LOG(1, "unable to get device handle %s: %d", uuid, ret);
+        LOG(1, "unable to get device handle %d: %d", device_id, ret);
        *free = 0;
        return;
    }

    ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
    if (ret != NVML_SUCCESS) {
-        LOG(1, "device memory info lookup failure %s: %d", uuid, ret);
+        LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
        *free = 0;
        return;
    }
--- a/discover/gpu_info_nvml.h
+++ b/discover/gpu_info_nvml.h
@@ -25,7 +25,7 @@ typedef struct nvml_handle {
  uint16_t verbose;
  nvmlReturn_t (*nvmlInit_v2)(void);
  nvmlReturn_t (*nvmlShutdown)(void);
-  nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
 } nvml_handle_t;

@@ -41,7 +41,7 @@ typedef struct nvml_compute_capability {
 } nvml_compute_capability_t;

 void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
-void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
+void nvml_get_free(nvml_handle_t ch,  int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
 void nvml_release(nvml_handle_t ch);

 #endif  // __GPU_INFO_NVML_H__
--- a/discover/gpu_linux.go
+++ b/discover/gpu_linux.go
@@ -3,11 +3,9 @@ package discover
 import (
 	"bufio"
 	"fmt"
-	"io"
 	"os"
 	"reflect"
 	"regexp"
-	"sort"
 	"strings"

 	"github.com/ollama/ollama/format"
@@ -111,10 +109,6 @@ func GetCPUDetails() ([]CPU, error) {
 	if err != nil {
 		return nil, err
 	}
-	return linuxCPUDetails(file)
-}
-
-func linuxCPUDetails(file io.Reader) ([]CPU, error) {
 	reColumns := regexp.MustCompile("\t+: ")
 	scanner := bufio.NewScanner(file)
 	cpuInfos := []linuxCpuInfo{}
@@ -137,9 +131,6 @@ func linuxCPUDetails(file io.Reader) ([]CPU, error) {
 			cpu = &linuxCpuInfo{}
 		}
 	}
-	if cpu.ID != "" {
-		cpuInfos = append(cpuInfos, *cpu)
-	}

 	// Process the sockets/cores/threads
 	socketByID := map[string]*CPU{}
@@ -186,14 +177,10 @@ func linuxCPUDetails(file io.Reader) ([]CPU, error) {
 			s.EfficiencyCoreCount = efficiencyCoreCount
 		}
 	}
-	keys := make([]string, 0, len(socketByID))
-	result := make([]CPU, 0, len(socketByID))
-	for k := range socketByID {
-		keys = append(keys, k)
-	}
-	sort.Strings(keys)
-	for _, k := range keys {
-		result = append(result, *socketByID[k])
+
+	result := []CPU{}
+	for _, c := range socketByID {
+		result = append(result, *c)
 	}
 	return result, nil
 }
--- a/discover/gpu_linux_test.go
+++ b/discover/gpu_linux_test.go
--- a/discover/gpu_windows.go
+++ b/discover/gpu_windows.go
@@ -209,7 +209,7 @@ func processSystemLogicalProcessorInforationList(buf []byte) []*winPackage {
 		}
 	}

-	// Summarize the results
+	// Summarize the results
 	for i, pkg := range packages {
 		slog.Info("", "package", i, "cores", pkg.coreCount, "efficiency", pkg.efficiencyCoreCount, "threads", pkg.threadCount)
 	}
--- a/discover/types.go
+++ b/discover/types.go
@@ -5,7 +5,6 @@ import (
 	"log/slog"

 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/runners"
 )

 type memInfo struct {
@@ -26,7 +25,7 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	MinimumMemory uint64 `json:"-"`

 	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
-	DependencyPath []string `json:"lib_path,omitempty"`
+	DependencyPath string `json:"lib_path,omitempty"`

 	// Extra environment variables specific to the GPU as list of [key,value]
 	EnvWorkarounds [][2]string `json:"envs,omitempty"`
@@ -48,13 +47,6 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	// TODO other performance capability info to help in scheduling decisions
 }

-func (gpu GpuInfo) RunnerName() string {
-	if gpu.Variant != "" {
-		return gpu.Library + "_" + gpu.Variant
-	}
-	return gpu.Library
-}
-
 type CPUInfo struct {
 	GpuInfo
 	CPUs []CPU
@@ -107,7 +99,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != runners.CPUCapabilityNone.String() {
+		if info.Variant != CPUCapabilityNone.String() {
 			requested += "_" + info.Variant
 		}
 		for i, lib := range libs {
@@ -148,6 +140,29 @@ func (a ByFreeMemory) Len() int           { return len(a) }
 func (a ByFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }

+type CPUCapability uint32
+
+// Override at build time when building base GPU runners
+var GPURunnerCPUCapability = CPUCapabilityAVX
+
+const (
+	CPUCapabilityNone CPUCapability = iota
+	CPUCapabilityAVX
+	CPUCapabilityAVX2
+	// TODO AVX512
+)
+
+func (c CPUCapability) String() string {
+	switch c {
+	case CPUCapabilityAVX:
+		return "avx"
+	case CPUCapabilityAVX2:
+		return "avx2"
+	default:
+		return "no vector extensions"
+	}
+}
+
 type SystemInfo struct {
 	System          CPUInfo              `json:"system"`
 	GPUs            []GpuInfo            `json:"gpus"`
@@ -160,25 +175,6 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 	if len(si.System.CPUs) == 0 {
 		return 0
 	}
-
-	coreCount := 0
-	for _, c := range si.System.CPUs {
-		coreCount += c.CoreCount - c.EfficiencyCoreCount
-	}
-
-	return coreCount
-}
-
-// For each GPU, check if it does NOT support flash attention
-func (l GpuInfoList) FlashAttentionSupported() bool {
-	for _, gpu := range l {
-		supportsFA := gpu.Library == "metal" ||
-			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
-			gpu.Library == "rocm"
-
-		if !supportsFA {
-			return false
-		}
-	}
-	return true
+	// Allocate thread count matching the performance cores on a single socket
+	return si.System.CPUs[0].CoreCount - si.System.CPUs[0].EfficiencyCoreCount
 }
--- a/docs/api.md
+++ b/docs/api.md
@@ -45,18 +45,14 @@ Generate a response for a given prompt with a provided model. This is a streamin

 Advanced parameters (optional):

- `format`: the format to return a response in. Format can be `json` or a JSON schema
+- `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `system`: system message to (overrides what is defined in the `Modelfile`)
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
+- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
- `context` (deprecated): the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
-
-#### Structured outputs
-
-Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [structured outputs](#request-structured-outputs) example below.

 #### JSON mode

@@ -189,52 +185,6 @@ curl http://localhost:11434/api/generate -d '{
 }
 ```

-#### Request (Structured outputs)
-
-##### Request
-
-```shell
-curl -X POST http://localhost:11434/api/generate -H "Content-Type: application/json" -d '{
-  "model": "llama3.1:8b",
-  "prompt": "Ollama is 22 years old and is busy saving the world. Respond using JSON",
-  "stream": false,
-  "format": {
-    "type": "object",
-    "properties": {
-      "age": {
-        "type": "integer"
-      },
-      "available": {
-        "type": "boolean"
-      }
-    },
-    "required": [
-      "age",
-      "available"
-    ]
-  }
-}'
-```
-
-##### Response
-
-```json
-{
-  "model": "llama3.1:8b",
-  "created_at": "2024-12-06T00:48:09.983619Z",
-  "response": "{\n  \"age\": 22,\n  \"available\": true\n}",
-  "done": true,
-  "done_reason": "stop",
-  "context": [1, 2, 3],
-  "total_duration": 1075509083,
-  "load_duration": 567678166,
-  "prompt_eval_count": 28,
-  "prompt_eval_duration": 236000000,
-  "eval_count": 16,
-  "eval_duration": 269000000
-}
-```
-
 #### Request (JSON mode)

 > [!IMPORTANT]
@@ -387,6 +337,7 @@ curl http://localhost:11434/api/generate -d '{
    "top_k": 20,
    "top_p": 0.9,
    "min_p": 0.0,
+    "tfs_z": 0.5,
    "typical_p": 0.7,
    "repeat_last_n": 33,
    "temperature": 0.8,
@@ -404,6 +355,7 @@ curl http://localhost:11434/api/generate -d '{
    "num_gpu": 1,
    "main_gpu": 0,
    "low_vram": false,
+    "f16_kv": true,
    "vocab_only": false,
    "use_mmap": true,
    "use_mlock": false,
@@ -505,15 +457,11 @@ The `message` object has the following fields:

 Advanced parameters (optional):

- `format`: the format to return a response in. Format can be `json` or a JSON schema. 
+- `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)

-### Structured outputs
-
-Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.
-
 ### Examples

 #### Chat Request (Streaming)
@@ -604,54 +552,6 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```

-#### Chat request (Structured outputs)
-
-##### Request
-
-```shell
-curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
-  "model": "llama3.1",
-  "messages": [{"role": "user", "content": "Ollama is 22 years old and busy saving the world. Return a JSON object with the age and availability."}],
-  "stream": false,
-  "format": {
-    "type": "object",
-    "properties": {
-      "age": {
-        "type": "integer"
-      },
-      "available": {
-        "type": "boolean"
-      }
-    },
-    "required": [
-      "age",
-      "available"
-    ]
-  },
-  "options": {
-    "temperature": 0
-  }
-}'
-```
-
-##### Response
-
-```json
-{
-  "model": "llama3.1",
-  "created_at": "2024-12-06T00:46:58.265747Z",
-  "message": { "role": "assistant", "content": "{\"age\": 22, \"available\": false}" },
-  "done_reason": "stop",
-  "done": true,
-  "total_duration": 2254970291,
-  "load_duration": 574751416,
-  "prompt_eval_count": 34,
-  "prompt_eval_duration": 1502000000,
-  "eval_count": 12,
-  "eval_duration": 175000000
-}
-```
-
 #### Chat request (With History)

 Send a chat message with a conversation history. You can use this same approach to start the conversation using multi-shot or chain-of-thought prompting.
@@ -931,30 +831,10 @@ Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `m

 ### Parameters

- `model`: name of the model to create
+- `name`: name of the model to create
 - `modelfile` (optional): contents of the Modelfile
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 - `path` (optional): path to the Modelfile
- `quantize` (optional): quantize a non-quantized (e.g. float16) model
-
-#### Quantization types
-
-| Type | Recommended |
-| --- | :-: |
-| q2_K | |
-| q3_K_L | |
-| q3_K_M | |
-| q3_K_S | |
-| q4_0 | |
-| q4_1 | |
-| q4_K_M | * |
-| q4_K_S | |
-| q5_0 | |
-| q5_1 | |
-| q5_K_M | |
-| q5_K_S | |
-| q6_K | |
-| q8_0 | * |

 ### Examples

@@ -966,14 +846,14 @@ Create a new model from a `Modelfile`.

 ```shell
 curl http://localhost:11434/api/create -d '{
-  "model": "mario",
+  "name": "mario",
  "modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
 }'
 ```

 ##### Response

-A stream of JSON objects is returned:
+A stream of JSON objects. Notice that the final JSON object shows a `"status": "success"`.

 ```json
 {"status":"reading model metadata"}
@@ -989,43 +869,13 @@ A stream of JSON objects is returned:
 {"status":"success"}
 ```

-#### Quantize a model
-
-Quantize a non-quantized model.
-
-##### Request
-
-```shell
-curl http://localhost:11434/api/create -d '{
-  "model": "llama3.1:quantized",
-  "modelfile": "FROM llama3.1:8b-instruct-fp16",
-  "quantize": "q4_K_M"
-}'
-```
-
-##### Response
-
-A stream of JSON objects is returned:
-
-```
-{"status":"quantizing F16 model to Q4_K_M"}
-{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
-{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
-{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"}
-{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
-{"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"}
-{"status":"writing manifest"}
-{"status":"success"}
-```
-
-
 ### Check if a Blob Exists

 ```shell
 HEAD /api/blobs/:digest
 ```

-Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not ollama.com.
+Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not Ollama.ai.

 #### Query Parameters

@@ -1130,7 +980,7 @@ Show information about a model including details, modelfile, template, parameter

 ### Parameters

- `model`: name of the model to show
+- `name`: name of the model to show
 - `verbose`: (optional) if set to `true`, returns full data for verbose response fields

 ### Examples
@@ -1139,7 +989,7 @@ Show information about a model including details, modelfile, template, parameter

 ```shell
 curl http://localhost:11434/api/show -d '{
-  "model": "llama3.2"
+  "name": "llama3.2"
 }'
 ```

@@ -1219,7 +1069,7 @@ Delete a model and its data.

 ### Parameters

- `model`: model name to delete
+- `name`: model name to delete

 ### Examples

@@ -1227,7 +1077,7 @@ Delete a model and its data.

 ```shell
 curl -X DELETE http://localhost:11434/api/delete -d '{
-  "model": "llama3:13b"
+  "name": "llama3:13b"
 }'
 ```

@@ -1245,7 +1095,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where

 ### Parameters

- `model`: name of the model to pull
+- `name`: name of the model to pull
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects

@@ -1255,7 +1105,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where

 ```shell
 curl http://localhost:11434/api/pull -d '{
-  "model": "llama3.2"
+  "name": "llama3.2"
 }'
 ```

@@ -1317,7 +1167,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding

 ### Parameters

- `model`: name of the model to push in the form of `<namespace>/<model>:<tag>`
+- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects

@@ -1327,7 +1177,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding

 ```shell
 curl http://localhost:11434/api/push -d '{
-  "model": "mattw/pygmalion:latest"
+  "name": "mattw/pygmalion:latest"
 }'
 ```

--- a/docs/development.md
+++ b/docs/development.md
@@ -3,24 +3,35 @@
 Install required tools:

 - go version 1.22 or higher
- OS specific C/C++ compiler (see below)
- GNU Make
+- gcc version 11.4.0 or higher


-## Overview
-
-Ollama uses a mix of Go and C/C++ code to interface with GPUs.  The C/C++ code is compiled with both CGO and GPU library specific compilers.  A set of GNU Makefiles are used to compile the project.  GPU Libraries are auto-detected based on the typical environment variables used by the respective libraries, but can be overridden if necessary.  The default make target will build the runners and primary Go Ollama application that will run within the repo directory.  Throughout the examples below `-j 5` is suggested for 5 parallel jobs to speed up the build.  You can adjust the job count based on your CPU Core count to reduce build times.  If you want to relocate the built binaries, use the `dist` target and recursively copy the files in `./dist/$OS-$ARCH/` to your desired location. To learn more about the other make targets use `make help`
-
-Once you have built the GPU/CPU runners, you can compile the main application with `go build .` 
-
 ### MacOS

 [Download Go](https://go.dev/dl/)

+Optionally enable debugging and more verbose logging:
+
+```bash
+# At build time
+export CGO_CFLAGS="-g"
+
+# At runtime
+export OLLAMA_DEBUG=1
+```
+
+Get the required libraries and build the native LLM code:  (Adjust the job count based on your number of processors for a faster build)
+
 ```bash
 make -j 5
 ```

+Then build ollama:
+
+```bash
+go build .
+```
+
 Now you can run `ollama`:

 ```bash
@@ -40,42 +51,64 @@ _Your operating system distribution may already have packages for NVIDIA CUDA. D
 Install `make`, `gcc` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
 development and runtime packages.

-Typically the makefile will auto-detect CUDA, however, if your Linux distro
-or installation approach uses alternative paths, you can specify the location by
-overriding `CUDA_PATH` to the location of the CUDA toolkit. You can customize
-a set of target CUDA architectures by setting `CUDA_ARCHITECTURES` (e.g. `CUDA_ARCHITECTURES=50;60;70`)
+Typically the build scripts will auto-detect CUDA, however, if your Linux distro
+or installation approach uses unusual paths, you can specify the location by
+setting the environment variable `CUDA_LIB_DIR` to the location of the shared
+libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
+the set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70").
+
+Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)

 ```
 make -j 5
 ```

-If both v11 and v12 tookkits are detected, runners for both major versions will be built by default.  You can build just v12 with `make cuda_v12`
+Then build the binary:

-#### Older Linux CUDA (NVIDIA)
-
-To support older GPUs with Compute Capability 3.5 or 3.7, you will need to use an older version of the Driver from [Unix Driver Archive](https://www.nvidia.com/en-us/drivers/unix/) (tested with 470) and [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive) (tested with cuda V11).  When you build Ollama, you will need to set two make variable to adjust the minimum compute capability Ollama supports via `make -j 5 CUDA_ARCHITECTURES="35;37;50;52" EXTRA_GOLDFLAGS="\"-X=github.com/ollama/ollama/discover.CudaComputeMajorMin=3\" \"-X=github.com/ollama/ollama/discover.CudaComputeMinorMin=5\""`.  To find the Compute Capability of your older GPU, refer to [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus).
+```
+go build .
+```

 #### Linux ROCm (AMD)

-_Your operating system distribution may already have packages for AMD ROCm. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
+_Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_

-Install [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `make`, `gcc`, and `golang`.
+Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `make`, `gcc`, and `golang`.

 Typically the build scripts will auto-detect ROCm, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `HIP_PATH` to the location of the ROCm
-install (typically `/opt/rocm`). You can also customize
-the AMD GPU targets by setting HIP_ARCHS (e.g. `HIP_ARCHS=gfx1101;gfx1102`)
+setting the environment variable `ROCM_PATH` to the location of the ROCm
+install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
+CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
+the AMD GPU targets by setting `AMDGPU_TARGETS` (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`).
+
+Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)

 ```
 make -j 5
 ```

+Then build the binary:
+
+```
+go build .
+```
+
 ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.

+#### Advanced CPU Settings
+
+By default, running `make` will compile a few different variations
+of the LLM library based on common CPU families and vector math capabilities,
+including a lowest-common-denominator which should run on almost any 64-bit CPU
+somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
+load.
+
+Custom CPU settings are not currently supported in the new Go server build but will be added back after we complete the transition.
+
 #### Containerized Linux Build

-If you have Docker and buildx available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting artifacts are placed in `./dist`  and by default the script builds both arm64 and amd64 binaries.  If you want to build only amd64, you can build with `PLATFORM=linux/amd64 ./scripts/build_linux.sh`
+If you have Docker available, you can build Linux binaries with `OLLAMA_NEW_RUNNERS=1 ./scripts/build_linux.sh`, which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`.

 ### Windows

@@ -85,16 +118,17 @@ The following tools are required as a minimal development environment to build C
  - https://go.dev/dl/
 - Git
  - https://git-scm.com/download/win
- clang with gcc compat and Make.  There are multiple options on how to go about installing these tools on Windows.  We have verified the following, but others may work as well:  
+- GCC and Make.  There are multiple options on how to go about installing these tools on Windows.  We have verified the following, but others may work as well:  
  - [MSYS2](https://www.msys2.org/)
-    - After installing, from an MSYS2 terminal, run `pacman -S mingw-w64-clang-x86_64-gcc-compat mingw-w64-clang-x86_64-clang make` to install the required tools
-  - Assuming you used the default install prefix for msys2 above, add `C:\msys64\clang64\bin` and `c:\msys64\usr\bin` to your environment variable `PATH` where you will perform the build steps below (e.g. system-wide, account-level, powershell, cmd, etc.)
+    - After installing, from an MSYS2 terminal, run `pacman -S mingw-w64-ucrt-x86_64-gcc make` to install the required tools
+  - Assuming you used the default install prefix for msys2 above, add `c:\msys64\ucrt64\bin` and `c:\msys64\usr\bin` to your environment variable `PATH` where you will perform the build steps below (e.g. system-wide, account-level, powershell, cmd, etc.)

-> [!NOTE]  
-> Due to bugs in the GCC C++ library for unicode support, Ollama should be built with clang on windows.
+Then, build the `ollama` binary:

-```
-make -j 5
+```powershell
+$env:CGO_ENABLED="1"
+make -j 8
+go build .
 ```

 #### GPU Support
@@ -136,30 +170,3 @@ pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw
 ```

 You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`)
-
-
-## Advanced CPU Vector Settings
-
-On x86, running `make` will compile several CPU runners which can run on different CPU families. At runtime, Ollama will auto-detect the best variation to load.  If GPU libraries are present at build time, Ollama also compiles GPU runners with the `AVX` CPU vector feature enabled.  This provides a good performance balance when loading large models that split across GPU and CPU with broad compatibility.  Some users may prefer no vector extensions (e.g. older Xeon/Celeron processors, or hypervisors that mask the vector features) while other users may prefer turning on many more vector extensions to further improve performance for split model loads.
-
-To customize the set of CPU vector features enabled for a CPU runner and all GPU runners, use CUSTOM_CPU_FLAGS during the build.
-
-To build without any vector flags:
-
-```
-make CUSTOM_CPU_FLAGS=""
-```
-
-To build with both AVX and AVX2:
-```
-make CUSTOM_CPU_FLAGS=avx,avx2
-```
-
-To build with AVX512 features turned on:
-
-```
-make CUSTOM_CPU_FLAGS=avx,avx2,avx512,avx512vbmi,avx512vnni,avx512bf16
-```
-
-> [!NOTE]  
-> If you are experimenting with different flags, make sure to do a `make clean` between each change to ensure everything is rebuilt with the new compiler flags
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -50,9 +50,6 @@ sudo systemctl restart docker
 docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
 ```

-> [!NOTE]  
-> If you're running on an NVIDIA JetPack system, Ollama can't automatically discover the correct JetPack version. Pass the environment variable JETSON_JETPACK=5 or JETSON_JETPACK=6 to the container to select version 5 or 6.
-
 ### AMD GPU

 To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -151,7 +151,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e

 Ollama runs an HTTP server and can be exposed using a proxy server such as Nginx. To do so, configure the proxy to forward requests and optionally set required headers (if not exposing Ollama on the network). For example, with Nginx:

-```nginx
+```
 server {
    listen 80;
    server_name example.com;  # Replace with your domain or IP
@@ -285,28 +285,4 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit

 ## How does Ollama load models on multiple GPUs?

-When loading a new model, Ollama evaluates the required VRAM for the model against what is currently available.  If the model will entirely fit on any single GPU, Ollama will load the model on that GPU.  This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference.  If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
-
-## How can I enable Flash Attention?
-
-Flash Attention is a feature of most modern models that can significantly reduce memory usage as the context size grows.  To enable Flash Attention, set the `OLLAMA_FLASH_ATTENTION` environment variable to `1` when starting the Ollama server.
-
-## How can I set the quantization type for the K/V cache?
-
-The K/V context cache can be quantized to significantly reduce memory usage when Flash Attention is enabled.
-
-To use quantized K/V cache with Ollama you can set the following environment variable:
-
- `OLLAMA_KV_CACHE_TYPE` - The quantization type for the K/V cache.  Default is `f16`.
-
-> Note: Currently this is a global option - meaning all models will run with the specified quantization type.
-
-The currently available K/V cache quantization types are:
-
- `f16` - high precision and memory usage (default).
- `q8_0` - 8-bit quantization, uses approximately 1/2 the memory of `f16` with a very small loss in precision, this usually has no noticeable impact on the model's quality (recommended if not using f16).
- `q4_0` - 4-bit quantization, uses approximately 1/4 the memory of `f16` with a small-medium loss in precision that may be more noticeable at higher context sizes.
-
-How much the cache quantization impacts the model's response quality will depend on the model and the task.  Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.
-
-You may need to experiment with different quantization types to find the best balance between memory usage and quality.
+Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models.  When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available.  If the model will entirely fit on any single GPU, Ollama will load the model on that GPU.  This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference.  If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -28,7 +28,6 @@ Check your compute compatibility to see if your card is supported:
 | 5.0                | GeForce GTX         | `GTX 750 Ti` `GTX 750` `NVS 810`                                                                            |
 |                    | Quadro              | `K2200` `K1200` `K620` `M1200` `M520` `M5000M` `M4000M` `M3000M` `M2000M` `M1000M` `K620M` `M600M` `M500M`  |

-For building locally to support older GPUs, see [developer.md](./development.md#linux-cuda-nvidia)

 ### GPU Selection

@@ -75,10 +74,6 @@ would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the
 server.  If you have an unsupported AMD GPU you can experiment using the list of
 supported types below.

-If you have multiple GPUs with different GFX versions, append the numeric device
-number to the environment variable to set them individually.  For example,
-`HSA_OVERRIDE_GFX_VERSION_0=10.3.0` and  `HSA_OVERRIDE_GFX_VERSION_1=11.0.0`
-
 At this time, the known supported GPU types on linux are the following LLVM Targets.
 This table shows some example GPUs that map to these LLVM targets:
 | **LLVM Target** | **An Example GPU** |
@@ -104,10 +99,9 @@ Reach out on [Discord](https://discord.gg/ollama) or file an
 ### GPU Selection

 If you have multiple AMD GPUs in your system and want to limit Ollama to use a
-subset, you can set `ROCR_VISIBLE_DEVICES` to a comma separated list of GPUs.
+subset, you can set `HIP_VISIBLE_DEVICES` to a comma separated list of GPUs.
 You can see the list of devices with `rocminfo`.  If you want to ignore the GPUs
-and force CPU usage, use an invalid GPU ID (e.g., "-1").  When available, use the
-`Uuid` to uniquely identify the device instead of numeric value.
+and force CPU usage, use an invalid GPU ID (e.g., "-1").

 ### Container Permission

--- a/docs/import.md
+++ b/docs/import.md
@@ -32,7 +32,7 @@ ollama run my-model

 Ollama supports importing adapters based on several different model architectures including:

-  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
+  * Llama (including Llama 2, Llama 3, and Llama 3.1);
  * Mistral (including Mistral 1, Mistral 2, and Mixtral); and
  * Gemma (including Gemma 1 and Gemma 2)

@@ -67,12 +67,14 @@ ollama run my-model

 Ollama supports importing models for several different architectures including:

-  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
+  * Llama (including Llama 2, Llama 3, and Llama 3.1);
  * Mistral (including Mistral 1, Mistral 2, and Mixtral);
  * Gemma (including Gemma 1 and Gemma 2); and
  * Phi3

-This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model.
+This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model.
+
+
 ## Importing a GGUF based model or adapter

 If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:
@@ -81,7 +83,7 @@ If you have a GGUF based model or adapter it is possible to import it into Ollam
  * converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or
  * downloading a model or adapter from a place such as HuggingFace

-To import a GGUF model, create a `Modelfile` containing:
+To import a GGUF model, create a `Modelfile` containing:

 ```dockerfile
 FROM /path/to/file.gguf
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -10,9 +10,6 @@ curl -fsSL https://ollama.com/install.sh | sh

 ## Manual install

-> [!NOTE]
-> If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
-
 Download and extract the package:

 ```shell
@@ -115,21 +112,6 @@ sudo systemctl status ollama
 > https://www.amd.com/en/support/linux-drivers for best support of your Radeon
 > GPU.

-## Customizing
-
-To customize the installation of Ollama, you can edit the systemd service file or the environment variables by running:
-
-```
-sudo systemctl edit ollama
-```
-
-Alternatively, create an override file manually in `/etc/systemd/system/ollama.service.d/override.conf`:
-
-```ini
-[Service]
-Environment="OLLAMA_DEBUG=1"
-```
-
 ## Updating

 Update Ollama by running the install script again:
@@ -147,7 +129,7 @@ sudo tar -C /usr -xzf ollama-linux-amd64.tgz

 ## Installing specific versions

-Use `OLLAMA_VERSION` environment variable with the install script to install a specific version of Ollama, including pre-releases. You can find the version numbers in the [releases page](https://github.com/ollama/ollama/releases).
+Use `OLLAMA_VERSION` environment variable with the install script to install a specific version of Ollama, including pre-releases. You can find the version numbers in the [releases page](https://github.com/ollama/ollama/releases). 

 For example:

--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -63,7 +63,7 @@ SYSTEM You are Mario from super mario bros, acting as an assistant.
 To use this:

 1. Save it as a file (e.g. `Modelfile`)
-2. `ollama create choose-a-model-name -f <location of the file e.g. ./Modelfile>`
+2. `ollama create choose-a-model-name -f <location of the file e.g. ./Modelfile>`
 3. `ollama run choose-a-model-name`
 4. Start using the model!

@@ -120,7 +120,7 @@ FROM <model directory>
 The model directory should contain the Safetensors weights for a supported architecture.

 Currently supported model architectures:
-  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2)
+  * Llama (including Llama 2, Llama 3, and Llama 3.1)
  * Mistral (including Mistral 1, Mistral 2, and Mixtral)
  * Gemma (including Gemma 1 and Gemma 2)
  * Phi3
@@ -156,7 +156,7 @@ PARAMETER <parameter> <parametervalue>
 | seed           | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0)                                                                                       | int        | seed 42              |
 | stop           | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile.                                      | string     | stop "AI assistant:" |
 | tfs_z          | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1)                                               | float      | tfs_z 1              |
-| num_predict    | Maximum number of tokens to predict when generating text. (Default: -1, infinite generation)                                                                                                                                   | int        | num_predict 42       |
+| num_predict    | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)                                                                                                                                   | int        | num_predict 42       |
 | top_k          | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)                                                                        | int        | top_k 40             |
 | top_p          | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)                                                                 | float      | top_p 0.9            |
 | min_p          | Alternative to the top_p, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. (Default: 0.0) | float      | min_p 0.05            |
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -59,40 +59,6 @@ embeddings = client.embeddings.create(
    input=["why is the sky blue?", "why is the grass green?"],
 )
 ```
-#### Structured outputs
-```py
-from pydantic import BaseModel
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
-
-# Define the schema for the response
-class FriendInfo(BaseModel):
-    name: str
-    age: int 
-    is_available: bool
-
-class FriendList(BaseModel):
-    friends: list[FriendInfo]
-
-try:
-    completion = client.beta.chat.completions.parse(
-        temperature=0,
-        model="llama3.1:8b",
-        messages=[
-            {"role": "user", "content": "I have two friends. The first is Ollama 22 years old busy saving the world, and the second is Alonso 23 years old and wants to hang out. Return a list of friends in JSON format"}
-        ],
-        response_format=FriendList,
-    )
-
-    friends_response = completion.choices[0].message
-    if friends_response.parsed:
-        print(friends_response.parsed)
-    elif friends_response.refusal:
-        print(friends_response.refusal)
-except Exception as e:
-    print(f"Error: {e}")
-```

 ### OpenAI JavaScript library

@@ -215,7 +181,7 @@ curl http://localhost:11434/v1/embeddings \
 - [x] JSON mode
 - [x] Reproducible outputs
 - [x] Vision
- [x] Tools
+- [x] Tools (streaming support coming soon)
 - [ ] Logprobs

 #### Supported request fields
@@ -233,8 +199,6 @@ curl http://localhost:11434/v1/embeddings \
 - [x] `seed`
 - [x] `stop`
 - [x] `stream`
- [x] `stream_options`
-  - [x] `include_usage`
 - [x] `temperature`
 - [x] `top_p`
 - [x] `max_tokens`
@@ -263,8 +227,6 @@ curl http://localhost:11434/v1/embeddings \
 - [x] `seed`
 - [x] `stop`
 - [x] `stream`
- [x] `stream_options`
-  - [x] `include_usage`
 - [x] `temperature`
 - [x] `top_p`
 - [x] `max_tokens`
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -80,7 +80,7 @@ If you are using a container to run Ollama, make sure you've set up the containe

 Sometimes the Ollama can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem

- If you are using a container, is the container runtime working?  Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama won't be able to see your NVIDIA GPU.
+- If you are using a container, is the container runtime working?  Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama won't be able to see your NVIDIA GPU.
 - Is the uvm driver loaded? `sudo nvidia-modprobe -u`
 - Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
 - Try rebooting
@@ -95,21 +95,13 @@ If none of those resolve the problem, gather additional information and file an

 On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device.  If permissions are not set up correctly, Ollama will detect this and report an error in the server log.

-When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.   For example, in the following output `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44` 
-
-If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker.  Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration.
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.

 If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
 - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries.  This can help show more detailed error codes that can help troubleshoot problems
 - `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported
 - Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`

-## Multiple AMD GPUs
-
-If you experience gibberish responses when models load across multiple AMD GPUs on Linux, see the following guide.
-
- https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/mgpu.html#mgpu-known-issues-and-limitations
-
 ## Windows Terminal Errors

 Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly.  This can result in a long string of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect`  To resolve this problem, please update to Win 10 22H1 or newer.
--- a/docs/tutorials/fly-gpu.md
+++ b/docs/tutorials/fly-gpu.md
@@ -0,0 +1,83 @@
+# Running Ollama on Fly.io GPU Instances
+
+Ollama runs with little to no configuration on [Fly.io GPU instances](https://fly.io/docs/gpus/gpu-quickstart/). If you don't have access to GPUs yet, you'll need to [apply for access](https://fly.io/gpu/) on the waitlist. Once you're accepted, you'll get an email with instructions on how to get started.
+
+Create a new app with `fly apps create`:
+
+```bash
+fly apps create
+```
+
+Then create a `fly.toml` file in a new folder that looks like this:
+
+```toml
+app = "sparkling-violet-709"
+primary_region = "ord"
+vm.size = "a100-40gb" # see https://fly.io/docs/gpus/gpu-quickstart/ for more info
+
+[build]
+  image = "ollama/ollama"
+
+[http_service]
+  internal_port = 11434
+  force_https = false
+  auto_stop_machines = true
+  auto_start_machines = true
+  min_machines_running = 0
+  processes = ["app"]
+
+[mounts]
+  source = "models"
+  destination = "/root/.ollama"
+  initial_size = "100gb"
+```
+
+Then create a [new private IPv6 address](https://fly.io/docs/reference/private-networking/#flycast-private-load-balancing) for your app:
+
+```bash
+fly ips allocate-v6 --private
+```
+
+Then deploy your app:
+
+```bash
+fly deploy
+```
+
+And finally you can access it interactively with a new Fly.io Machine:
+
+```
+fly machine run -e OLLAMA_HOST=http://your-app-name.flycast --shell ollama/ollama
+```
+
+```bash
+$ ollama run openchat:7b-v3.5-fp16
+>>> How do I bake chocolate chip cookies?
+ To bake chocolate chip cookies, follow these steps:
+
+1. Preheat the oven to 375°F (190°C) and line a baking sheet with parchment paper or silicone baking mat.
+
+2. In a large bowl, mix together 1 cup of unsalted butter (softened), 3/4 cup granulated sugar, and 3/4
+cup packed brown sugar until light and fluffy.
+
+3. Add 2 large eggs, one at a time, to the butter mixture, beating well after each addition. Stir in 1
+teaspoon of pure vanilla extract.
+
+4. In a separate bowl, whisk together 2 cups all-purpose flour, 1/2 teaspoon baking soda, and 1/2 teaspoon
+salt. Gradually add the dry ingredients to the wet ingredients, stirring until just combined.
+
+5. Fold in 2 cups of chocolate chips (or chunks) into the dough.
+
+6. Drop rounded tablespoons of dough onto the prepared baking sheet, spacing them about 2 inches apart.
+
+7. Bake for 10-12 minutes, or until the edges are golden brown. The centers should still be slightly soft.
+
+8. Allow the cookies to cool on the baking sheet for a few minutes before transferring them to a wire rack
+to cool completely.
+
+Enjoy your homemade chocolate chip cookies!
+```
+
+When you set it up like this, it will automatically turn off when you're done using it. Then when you access it again, it will automatically turn back on. This is a great way to save money on GPU instances when you're not using them. If you want a persistent wake-on-use connection to your Ollama instance, you can set up a [connection to your Fly network using WireGuard](https://fly.io/docs/reference/private-networking/#discovering-apps-through-dns-on-a-wireguard-connection). Then you can access your Ollama instance at `http://your-app-name.flycast`.
+
+And that's it!
--- a/docs/tutorials/langchainjs.md
+++ b/docs/tutorials/langchainjs.md
@@ -0,0 +1,77 @@
+# Using LangChain with Ollama using JavaScript
+
+In this tutorial, we are going to use JavaScript with LangChain and Ollama to learn about something just a touch more recent. In August 2023, there was a series of wildfires on Maui. There is no way an LLM trained before that time can know about this, since their training data would not include anything as recent as that. So we can find the [Wikipedia article about the fires](https://en.wikipedia.org/wiki/2023_Hawaii_wildfires) and ask questions about the contents.
+
+To get started, let's just use **LangChain** to ask a simple question to a model. To do this with JavaScript, we need to install **LangChain**:
+
+```bash
+npm install @langchain/community
+```
+
+Now we can start building out our JavaScript:
+
+```javascript
+import { Ollama } from "@langchain/community/llms/ollama";
+
+const ollama = new Ollama({
+  baseUrl: "http://localhost:11434",
+  model: "llama3.2",
+});
+
+const answer = await ollama.invoke(`why is the sky blue?`);
+
+console.log(answer);
+```
+
+That will get us the same thing as if we ran `ollama run llama3.2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
+
+```bash
+npm install cheerio
+```
+
+```javascript
+import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio";
+
+const loader = new CheerioWebBaseLoader("https://en.wikipedia.org/wiki/2023_Hawaii_wildfires");
+const data = await loader.load();
+```
+
+That will load the document. Although this page is smaller than the Odyssey, it is certainly bigger than the context size for most LLMs. So we are going to need to split into smaller pieces, and then select just the pieces relevant to our question. This is a great use for a vector datastore. In this example, we will use the **MemoryVectorStore** that is part of **LangChain**. But there is one more thing we need to get the content into the datastore. We have to run an embeddings process that converts the tokens in the text into a series of vectors. And for that, we are going to use **Tensorflow**. There is a lot of stuff going on in this one. First, install the **Tensorflow** components that we need.
+
+```bash
+npm install @tensorflow/tfjs-core@3.6.0 @tensorflow/tfjs-converter@3.6.0 @tensorflow-models/universal-sentence-encoder@1.3.3 @tensorflow/tfjs-node@4.10.0
+```
+
+If you just install those components without the version numbers, it will install the latest versions, but there are conflicts within **Tensorflow**, so you need to install the compatible versions.
+
+```javascript
+import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
+import { MemoryVectorStore } from "langchain/vectorstores/memory";
+import "@tensorflow/tfjs-node";
+import { TensorFlowEmbeddings } from "langchain/embeddings/tensorflow";
+
+// Split the text into 500 character chunks. And overlap each chunk by 20 characters
+const textSplitter = new RecursiveCharacterTextSplitter({
+ chunkSize: 500,
+ chunkOverlap: 20
+});
+const splitDocs = await textSplitter.splitDocuments(data);
+
+// Then use the TensorFlow Embedding to store these chunks in the datastore
+const vectorStore = await MemoryVectorStore.fromDocuments(splitDocs, new TensorFlowEmbeddings());
+```
+
+To connect the datastore to a question asked to an LLM, we need to use the concept at the heart of **LangChain**: the chain. Chains are a way to connect a number of activities together to accomplish a particular task. There are a number of chain types available, but for this tutorial we are using the **RetrievalQAChain**.
+
+```javascript
+import { RetrievalQAChain } from "langchain/chains";
+
+const retriever = vectorStore.asRetriever();
+const chain = RetrievalQAChain.fromLLM(ollama, retriever);
+const result = await chain.call({query: "When was Hawaii's request for a major disaster declaration approved?"});
+console.log(result.text)
+```
+
+So we created a retriever, which is a way to return the chunks that match a query from a datastore. And then connect the retriever and the model via a chain. Finally, we send a query to the chain, which results in an answer using our document as a source. The answer it returned was correct, August 10, 2023.
+
+And that is a simple introduction to what you can do with **LangChain** and **Ollama.**
--- a/docs/tutorials/langchainpy.md
+++ b/docs/tutorials/langchainpy.md
@@ -0,0 +1,85 @@
+# Using LangChain with Ollama in Python
+
+Let's imagine we are studying the classics, such as **the Odyssey** by **Homer**. We might have a question about Neleus and his family. If you ask llama2 for that info, you may get something like:
+
+> I apologize, but I'm a large language model, I cannot provide information on individuals or families that do not exist in reality. Neleus is not a real person or character, and therefore does not have a family or any other personal details. My apologies for any confusion. Is there anything else I can help you with?
+
+This sounds like a typical censored response, but even llama2-uncensored gives a mediocre answer:
+
+> Neleus was a legendary king of Pylos and the father of Nestor, one of the Argonauts. His mother was Clymene, a sea nymph, while his father was Neptune, the god of the sea.
+
+So let's figure out how we can use **LangChain** with Ollama to ask our question to the actual document, the Odyssey by Homer, using Python.
+
+Let's start by asking a simple question that we can get an answer to from the **Llama2** model using **Ollama**. First, we need to install the **LangChain** package:
+
+`pip install langchain_community`
+
+Then we can create a model and ask the question:
+
+```python
+from langchain_community.llms import Ollama
+ollama = Ollama(
+    base_url='http://localhost:11434',
+    model="llama3"
+)
+print(ollama.invoke("why is the sky blue"))
+```
+
+Notice that we are defining the model and the base URL for Ollama.
+
+Now let's load a document to ask questions against. I'll load up the Odyssey by Homer, which you can find at Project Gutenberg. We will need **WebBaseLoader** which is part of **LangChain** and loads text from any webpage. On my machine, I also needed to install **bs4** to get that to work, so run `pip install bs4`.
+
+```python
+from langchain.document_loaders import WebBaseLoader
+loader = WebBaseLoader("https://www.gutenberg.org/files/1727/1727-h/1727-h.htm")
+data = loader.load()
+```
+
+This file is pretty big. Just the preface is 3000 tokens. Which means the full document won't fit into the context for the model. So we need to split it up into smaller pieces.
+
+```python
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
+all_splits = text_splitter.split_documents(data)
+```
+
+It's split up, but we have to find the relevant splits and then submit those to the model. We can do this by creating embeddings and storing them in a vector database. We can use Ollama directly to instantiate an embedding model. We will use ChromaDB in this example for a vector database. `pip install chromadb`
+We also need to pull an embedding model: `ollama pull nomic-embed-text`
+```python
+from langchain.embeddings import OllamaEmbeddings
+from langchain.vectorstores import Chroma
+oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="nomic-embed-text")
+vectorstore = Chroma.from_documents(documents=all_splits, embedding=oembed)
+```
+
+Now let's ask a question from the document. **Who was Neleus, and who is in his family?** Neleus is a character in the Odyssey, and the answer can be found in our text.
+
+```python
+question="Who is Neleus and who is in Neleus' family?"
+docs = vectorstore.similarity_search(question)
+len(docs)
+```
+
+This will output the number of matches for chunks of data similar to the search.
+
+The next thing is to send the question and the relevant parts of the docs to the model to see if we can get a good answer. But we are stitching two parts of the process together, and that is called a chain. This means we need to define a chain:
+
+```python
+from langchain.chains import RetrievalQA
+qachain=RetrievalQA.from_chain_type(ollama, retriever=vectorstore.as_retriever())
+res = qachain.invoke({"query": question})
+print(res['result'])
+```
+
+The answer received from this chain was:
+
+> Neleus is a character in Homer's "Odyssey" and is mentioned in the context of Penelope's suitors. Neleus is the father of Chloris, who is married to Neleus and bears him several children, including Nestor, Chromius, Periclymenus, and Pero. Amphinomus, the son of Nisus, is also mentioned as a suitor of Penelope and is known for his good natural disposition and agreeable conversation.
+
+It's not a perfect answer, as it implies Neleus married his daughter when actually Chloris "was the youngest daughter to Amphion son of Iasus and king of Minyan Orchomenus, and was Queen in Pylos".
+
+I updated the chunk_overlap for the text splitter to 20 and tried again and got a much better answer:
+
+> Neleus is a character in Homer's epic poem "The Odyssey." He is the husband of Chloris, who is the youngest daughter of Amphion son of Iasus and king of Minyan Orchomenus. Neleus has several children with Chloris, including Nestor, Chromius, Periclymenus, and Pero.
+
+And that is a much better answer.
--- a/docs/tutorials/nvidia-jetson.md
+++ b/docs/tutorials/nvidia-jetson.md
@@ -0,0 +1,15 @@
+# Running Ollama on NVIDIA Jetson Devices
+
+Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/) and should run out of the box with the standard installation instructions. 
+
+The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack), but should also work on JetPack 6.0.
+
+- Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.com/install.sh | sh`
+- Pull the model you want to use (e.g. mistral): `ollama pull mistral`
+- Start an interactive session: `ollama run mistral`
+
+And that's it!
+
+# Running Ollama in Docker
+
+When running GPU accelerated applications in Docker, it is highly recommended to use the [dusty-nv jetson-containers repo](https://github.com/dusty-nv/jetson-containers).
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -1,15 +1,22 @@
-# Ollama Windows
+# Ollama Windows Preview

-Welcome to Ollama for Windows.
+Welcome to the Ollama Windows preview.

 No more WSL required!

 Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
-After installing Ollama for Windows, Ollama will run in the background and
+After installing Ollama Windows Preview, Ollama will run in the background and
 the `ollama` command line is available in `cmd`, `powershell` or your favorite
 terminal application. As usual the Ollama [api](./api.md) will be served on
 `http://localhost:11434`.

+As this is a preview release, you should expect a few bugs here and there.  If
+you run into a problem you can reach out on
+[Discord](https://discord.gg/ollama), or file an
+[issue](https://github.com/ollama/ollama/issues).
+Logs will often be helpful in diagnosing the problem (see
+[Troubleshooting](#troubleshooting) below).
+
 ## System Requirements

 * Windows 10 22H2 or newer, Home or Pro
@@ -18,32 +25,6 @@ terminal application. As usual the Ollama [api](./api.md) will be served on

 Ollama uses unicode characters for progress indication, which may render as unknown squares in some older terminal fonts in Windows 10. If you see this, try changing your terminal font settings.

-## Filesystem Requirements
-
-The Ollama install does not require Administrator, and installs in your home directory by default.  You'll need at least 4GB of space for the binary install.  Once you've installed Ollama, you'll need additional space for storing the Large Language models, which can be tens to hundreds of GB in size.  If your home directory doesn't have enough space, you can change where the binaries are installed, and where the models are stored.
-
-### Changing Install Location
-
-To install the Ollama application in a location different than your home directory, start the installer with the following flag
-
-```powershell
-OllamaSetup.exe /DIR="d:\some\location"
-```
-
-### Changing Model Location
-
-To change where Ollama stores the downloaded models instead of using your home directory, set the environment variable `OLLAMA_MODELS` in your user account.
-
-1. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.
-
-2. Click on _Edit environment variables for your account_.
-
-3. Edit or create a new variable for your user account for `OLLAMA_MODELS` where you want the models stored
-
-4. Click OK/Apply to save.
-
-If Ollama is already running, Quit the tray application and relaunch it from the Start menu, or a new terminal started after you saved the environment variables.
-
 ## API Access

 Here's a quick example showing API access from `powershell`
@@ -53,6 +34,10 @@ Here's a quick example showing API access from `powershell`

 ## Troubleshooting

+While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
+a "view logs" menu item to the app, and increases logging for the GUI app and
+server.
+
 Ollama on Windows stores files in a few different locations.  You can view them in
 the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
@@ -67,10 +52,6 @@ the explorer window by hitting `<cmd>+R` and type in:

 The Ollama Windows installer registers an Uninstaller application.  Under `Add or remove programs` in Windows Settings, you can uninstall Ollama.

-> [!NOTE]
-> If you have [changed the OLLAMA_MODELS location](#changing-model-location), the installer will not remove your downloaded models
-
-
 ## Standalone CLI

 The easiest way to install Ollama on Windows is to use the `OllamaSetup.exe`
@@ -83,6 +64,3 @@ If you'd like to install or integrate Ollama as a service, a standalone
 and GPU library dependencies for Nvidia and AMD. This allows for embedding
 Ollama in existing applications, or running it as a system service via `ollama
 serve` with tools such as [NSSM](https://nssm.cc/).
-
-> [!NOTE]  
-> If you are upgrading from a prior version, you should remove the old directories first.
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -153,8 +153,6 @@ var (
 	Debug = Bool("OLLAMA_DEBUG")
 	// FlashAttention enables the experimental flash attention feature.
 	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
-	// KvCacheType is the quantization type for the K/V cache.
-	KvCacheType = String("OLLAMA_KV_CACHE_TYPE")
 	// NoHistory disables readline history.
 	NoHistory = Bool("OLLAMA_NOHISTORY")
 	// NoPrune disables pruning of model blobs on startup.
@@ -175,6 +173,7 @@ func String(s string) func() string {

 var (
 	LLMLibrary = String("OLLAMA_LLM_LIBRARY")
+	TmpDir     = String("OLLAMA_TMPDIR")

 	CudaVisibleDevices    = String("CUDA_VISIBLE_DEVICES")
 	HipVisibleDevices     = String("HIP_VISIBLE_DEVICES")
@@ -235,7 +234,6 @@ func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
-		"OLLAMA_KV_CACHE_TYPE":     {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
 		"OLLAMA_GPU_OVERHEAD":      {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
@@ -249,6 +247,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
+		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
 		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},

 		// Informational
@@ -266,9 +265,9 @@ func AsMap() map[string]EnvVar {

 	if runtime.GOOS != "darwin" {
 		ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
-		ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible by numeric ID"}
-		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible by UUID or numeric ID"}
-		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible by numeric ID"}
+		ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
+		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible"}
+		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible"}
 		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
 		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
 	}
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,14 +1,3 @@
 # Examples

 This directory contains different examples of using Ollama.
-
-## Python examples
-Ollama Python examples at [ollama-python/examples](https://github.com/ollama/ollama-python/tree/main/examples)
-
-
-## JavaScript examples
-Ollama JavaScript examples at [ollama-js/examples](https://github.com/ollama/ollama-js/tree/main/examples)
-
-
-## OpenAI compatibility examples
-Ollama OpenAI compatibility examples at [ollama/examples/openai](../docs/openai.md)
--- a/examples/langchain-python-simple/main.py
+++ b/examples/langchain-python-simple/main.py
@@ -1,6 +1,6 @@
 from langchain.llms import Ollama

-input = input("What is your question?\n> ")
+input = input("What is your question?")
 llm = Ollama(model="llama3.2")
-res = llm.invoke(input)
+res = llm.predict(input)
 print (res)
--- a/examples/python-grounded-factuality-rag-check/README.md
+++ b/examples/python-grounded-factuality-rag-check/README.md
@@ -1,6 +1,6 @@
 # RAG Hallucination Checker using Bespoke-Minicheck

-This example allows the user to ask questions related to a document, which can be specified via an article url. Relevant chunks are retrieved from the document and given to `llama3.2` as context to answer the question. Then each sentence in the answer is checked against the retrieved chunks using `bespoke-minicheck` to ensure that the answer does not contain hallucinations.
+This example allows the user to ask questions related to a document, which can be specified via an article url. Relevant chunks are retrieved from the document and given to `llama3.2` as context to answer the question. Then each sentence in the answer is checked against the retrieved chunks using `bespoke-minicheck` to ensure that the answer does not contain hallucinations.

 ## Running the Example

--- a/examples/python-grounded-factuality-rag-check/main.py
+++ b/examples/python-grounded-factuality-rag-check/main.py
@@ -115,7 +115,7 @@ if __name__ == "__main__":

        print(f"\nRetrieved chunks: \n{sourcetext}\n")

-        # Give the retrieved chunks and question to the chat model
+        # Give the retrieved chunks and question to the chat model
        system_prompt = f"Only use the following information to answer the question. Do not use anything else: {sourcetext}"

        ollama_response = ollama.generate(
--- a/fileutils/README.md
+++ b/fileutils/README.md
@@ -0,0 +1,3 @@
+# `fileutils`
+
+This package provides utilities for loading and inspecting model files.
--- a/fileutils/filetype.go
+++ b/fileutils/filetype.go
@@ -1,9 +1,11 @@
-package llm
+package fileutils

 import "fmt"

 type fileType uint32

+// TODO this should map over to the GGML CGO enum type
+
 const (
 	fileTypeF32 fileType = iota
 	fileTypeF16
@@ -32,10 +34,9 @@ const (
 	fileTypeIQ1_S
 	fileTypeIQ4_NL
 	fileTypeIQ3_S
-	fileTypeIQ3_M
 	fileTypeIQ2_S
-	fileTypeIQ2_M
 	fileTypeIQ4_XS
+	fileTypeIQ2_M
 	fileTypeIQ1_M
 	fileTypeBF16

@@ -94,8 +95,6 @@ func ParseFileType(s string) (fileType, error) {
 		return fileTypeIQ4_NL, nil
 	case "IQ3_S":
 		return fileTypeIQ3_S, nil
-	case "IQ3_M":
-		return fileTypeIQ3_M, nil
 	case "IQ2_S":
 		return fileTypeIQ2_S, nil
 	case "IQ4_XS":
@@ -163,8 +162,6 @@ func (t fileType) String() string {
 		return "IQ4_NL"
 	case fileTypeIQ3_S:
 		return "IQ3_S"
-	case fileTypeIQ3_M:
-		return "IQ3_M"
 	case fileTypeIQ2_S:
 		return "IQ2_S"
 	case fileTypeIQ4_XS:
--- a/fileutils/ggla.go
+++ b/fileutils/ggla.go
@@ -1,4 +1,4 @@
-package llm
+package fileutils

 import (
 	"encoding/binary"
--- a/fileutils/ggml.go
+++ b/fileutils/ggml.go
@@ -1,10 +1,11 @@
-package llm
+package fileutils

 import (
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
+	"os"
 	"slices"
 	"strings"
 	"sync"
@@ -360,7 +361,7 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	}, offset, nil
 }

-func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
+func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
 	embedding := llm.KV().EmbeddingLength()
 	heads := llm.KV().HeadCount()
 	headsKV := llm.KV().HeadCountKV()
@@ -368,13 +369,9 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia

 	embeddingHeads := llm.KV().EmbeddingHeadCount()
 	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
-	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()

 	layers := llm.Tensors().Layers()

-	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
-	kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
-
 	switch llm.KV().Architecture() {
 	case "llama":
 		fullOffload = max(
@@ -404,42 +401,6 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
 			)
 		}
-	case "mllama":
-		var visionTokens, tiles uint64 = 1601, 4
-
-		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
-			kv = headsKV *
-				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
-				(2* // sizeof(float16)
-					(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
-					context +
-					4* // sizeof(float32)
-						uint64(crossAttentionLayers.size)* // num cross attention layers
-						visionTokens*
-						tiles)
-		}
-
-		fullOffload = max(
-			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
-			// vocab graph
-			4*batch*(embedding+vocab),
-		)
-
-		var ropeFreqsCount uint64
-		if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
-			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
-				ropeFreqsCount = ropeFreqsWeights.parameters()
-			}
-		}
-
-		partialOffload = max(
-			4*(batch*
-				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
-				ropeFreqsCount+
-				embeddingHeadsK*context*headsKV),
-			// vocab graph
-			4*batch*(embedding+vocab)+embedding*vocab*105/128,
-		)
 	case "gemma", "gemma2":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
@@ -529,33 +490,22 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 	return
 }

-// SupportsKVCacheType checks if the requested cache type is supported
-func (ggml GGML) SupportsKVCacheType(cacheType string) bool {
-	validKVCacheTypes := []string{"f16", "q8_0", "q4_0"}
-	return slices.Contains(validKVCacheTypes, cacheType)
-}
-
-// SupportsFlashAttention checks if the model supports flash attention
-func (ggml GGML) SupportsFlashAttention() bool {
-	_, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]
-	if isEmbedding {
-		return false
+// LoadModel will load a model from disk. The model must be in the GGML format.
+//
+// It collects array values for arrays with a size less than or equal to
+// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
+// the maxArraySize is negative, all arrays are collected.
+func LoadModel(model string, maxArraySize int) (*GGML, error) {
+	if _, err := os.Stat(model); err != nil {
+		return nil, err
 	}

-	// Check head counts match and are non-zero
-	headCountK := ggml.KV().EmbeddingHeadCountK()
-	headCountV := ggml.KV().EmbeddingHeadCountV()
-	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
-}
-
-// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
-func kvCacheBytesPerElement(cacheType string) float64 {
-	switch cacheType {
-	case "q8_0":
-		return 1 // 1/2 of fp16
-	case "q4_0":
-		return 0.5 // 1/4 of fp16
-	default:
-		return 2 // f16 (default)
+	f, err := os.Open(model)
+	if err != nil {
+		return nil, err
 	}
+	defer f.Close()
+
+	ggml, _, err := DecodeGGML(f, maxArraySize)
+	return ggml, err
 }
--- a/fileutils/ggml_test.go
+++ b/fileutils/ggml_test.go
@@ -0,0 +1 @@
+package fileutils
--- a/fileutils/gguf.go
+++ b/fileutils/gguf.go
@@ -1,4 +1,4 @@
-package llm
+package fileutils

 import (
 	"bytes"
--- a/fileutils/memory.go
+++ b/fileutils/memory.go
@@ -1,4 +1,4 @@
-package llm
+package fileutils

 import (
 	"fmt"
@@ -123,23 +123,13 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		slog.Warn("model missing blk.0 layer size")
 	}

-	fa := envconfig.FlashAttention() &&
-		discover.GetGPUInfo().FlashAttentionSupported() &&
-		ggml.SupportsFlashAttention()
-
-	var kvct string
-	if fa {
-		requested := strings.ToLower(envconfig.KvCacheType())
-		if requested != "" && ggml.SupportsKVCacheType(requested) {
-			kvct = requested
-		}
-	}
-
-	kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
+	// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
+	var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()

 	// KV is proportional to the number of layers
 	layerSize += kv / ggml.KV().BlockCount()

+	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
@@ -182,7 +172,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 			gzo = gpuZeroOverhead
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
-		if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+		if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
 			slog.Debug("gpu has too little memory to allocate any layers",
 				"id", gpus[i].ID,
 				"library", gpus[i].Library,
@@ -228,7 +218,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[i%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > overhead+used+layerSize {
+			if (g.g.FreeMemory - overhead) > used+layerSize {
 				gpuAllocations[g.i] += layerSize
 				layerCounts[g.i]++
 				layerCount++
@@ -251,7 +241,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[layerCount%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > overhead+used+memoryLayerOutput {
+			if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
 				gpuAllocations[g.i] += memoryLayerOutput
 				layerCounts[g.i]++
 				layerCount++
@@ -339,7 +329,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 	return estimate
 }

-func (m MemoryEstimate) log() {
+func (m MemoryEstimate) Log() {
 	overhead := envconfig.GpuOverhead()

 	log := slog.With()
--- a/fileutils/memory_test.go
+++ b/fileutils/memory_test.go
@@ -1,4 +1,4 @@
-package llm
+package fileutils

 import (
 	"bytes"
@@ -15,7 +15,6 @@ import (

 func TestEstimateGPULayers(t *testing.T) {
 	t.Setenv("OLLAMA_DEBUG", "1")
-	t.Setenv("OLLAMA_KV_CACHE_TYPE", "") // Ensure default f16

 	modelName := "dummy"
 	f, err := os.CreateTemp(t.TempDir(), modelName)
@@ -71,7 +70,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	projectorSize := uint64(0)
 	memoryLayerOutput := uint64(4)

-	// Dual CUDA scenario with asymmetry
+	// Dual CUDA scenario with asymmetry
 	gpuMinimumMemory := uint64(2048)
 	gpus = []discover.GpuInfo{
 		{
--- a/go.mod
+++ b/go.mod
@@ -1,18 +1,18 @@
 module github.com/ollama/ollama

-go 1.23.4
+go 1.22.5

 require (
 	github.com/containerd/console v1.0.3
 	github.com/emirpasic/gods v1.18.1
 	github.com/gin-gonic/gin v1.10.0
 	github.com/golang/protobuf v1.5.4 // indirect
-	github.com/google/uuid v1.6.0
+	github.com/google/uuid v1.1.2
 	github.com/olekukonko/tablewriter v0.0.5
 	github.com/spf13/cobra v1.7.0
 	github.com/stretchr/testify v1.9.0
 	github.com/x448/float16 v0.8.4
-	golang.org/x/sync v0.9.0
+	golang.org/x/sync v0.3.0
 )

 require (
@@ -22,14 +22,14 @@ require (
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
-	golang.org/x/image v0.22.0
+	golang.org/x/image v0.14.0
 )

 require (
 	github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 // indirect
 	github.com/bytedance/sonic/loader v0.1.1 // indirect
 	github.com/chewxy/hm v1.0.0 // indirect
-	github.com/chewxy/math32 v1.11.0 // indirect
+	github.com/chewxy/math32 v1.10.1 // indirect
 	github.com/cloudwego/base64x v0.1.4 // indirect
 	github.com/cloudwego/iasm v0.2.0 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
@@ -73,7 +73,7 @@ require (
 	golang.org/x/net v0.25.0 // indirect
 	golang.org/x/sys v0.20.0
 	golang.org/x/term v0.20.0
-	golang.org/x/text v0.20.0
+	golang.org/x/text v0.15.0
 	google.golang.org/protobuf v1.34.1
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -21,8 +21,8 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA
 github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k=
 github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0=
 github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0=
-github.com/chewxy/math32 v1.11.0 h1:8sek2JWqeaKkVnHa7bPVqCEOUPbARo4SGxs6toKyAOo=
-github.com/chewxy/math32 v1.11.0/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
+github.com/chewxy/math32 v1.10.1 h1:LFpeY0SLJXeaiej/eIp2L40VYfscTvKh/FSEZ68uMkU=
+github.com/chewxy/math32 v1.10.1/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
 github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
 github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y=
 github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w=
@@ -113,9 +113,8 @@ github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
 github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
 github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y=
 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
-github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
 github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
@@ -231,8 +230,8 @@ golang.org/x/image v0.0.0-20200430140353-33d19683fad8/go.mod h1:FeLwcggjj3mMvU+o
 golang.org/x/image v0.0.0-20200618115811-c13761719519/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
 golang.org/x/image v0.0.0-20201208152932-35266b937fa6/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
 golang.org/x/image v0.0.0-20210216034530-4410531fe030/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
-golang.org/x/image v0.22.0 h1:UtK5yLUzilVrkjMAZAZ34DXGpASN8i8pj8g+O+yd10g=
-golang.org/x/image v0.22.0/go.mod h1:9hPFhljd4zZ1GNSIZJ49sqbp45GKK9t6w+iXvGqZUz4=
+golang.org/x/image v0.14.0 h1:tNgSxAFe3jC4uYqvZdTr84SZoM1KfwdC9SKIFrLjFn4=
+golang.org/x/image v0.14.0/go.mod h1:HUYqC05R2ZcZ3ejNQsIHQDQiwWM4JBqmm6MKANTp4LE=
 golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
 golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
 golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
@@ -266,8 +265,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
 golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ=
-golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
+golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
 golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -292,8 +291,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
-golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
+golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk=
+golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
 golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
--- a/integration/basic_test.go
+++ b/integration/basic_test.go
@@ -30,30 +30,7 @@ func TestOrcaMiniBlueSky(t *testing.T) {
 	GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
 }

-func TestUnicode(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
-	defer cancel()
-	// Set up the test data
-	req := api.GenerateRequest{
-		// DeepSeek has a Unicode tokenizer regex, making it a unicode torture test
-		Model:  "deepseek-coder-v2:16b-lite-instruct-q2_K",
-		Prompt: "天空为什么是蓝色的?",
-		Stream: &stream,
-		Options: map[string]interface{}{
-			"temperature": 0,
-			"seed":        123,
-			// Workaround deepseek context shifting bug
-			"num_ctx":     8192,
-			"num_predict": 2048,
-		},
-	}
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-	require.NoError(t, PullIfMissing(ctx, client, req.Model))
-	DoGenerate(ctx, t, client, req, []string{"散射", "频率"}, 120*time.Second, 120*time.Second)
-}
-
-func TestExtendedUnicodeOutput(t *testing.T) {
+func TestUnicodeOutput(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
 	// Set up the test data
@@ -66,10 +43,7 @@ func TestExtendedUnicodeOutput(t *testing.T) {
 			"seed":        123,
 		},
 	}
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-	require.NoError(t, PullIfMissing(ctx, client, req.Model))
-	DoGenerate(ctx, t, client, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"}, 120*time.Second, 120*time.Second)
+	GenerateTestHelper(ctx, t, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"})
 }

 func TestUnicodeModelDir(t *testing.T) {
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -60,8 +60,7 @@ func TestMultiModelConcurrency(t *testing.T) {
 	for i := 0; i < len(req); i++ {
 		go func(i int) {
 			defer wg.Done()
-			// Note: CPU based inference can crawl so don't give up too quickly
-			DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 30*time.Second)
+			DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
 		}(i)
 	}
 	wg.Wait()
@@ -207,7 +206,7 @@ func TestMultiModelStress(t *testing.T) {
 		chosenModels = mediumModels
 		// default:
 		// 	slog.Info("selecting large models")
-		// 	chosenModels = largeModels
+		// 	chosenModels = largeModels
 	}

 	req, resp := GenerateRequests()
@@ -232,7 +231,7 @@ func TestMultiModelStress(t *testing.T) {
 	var wg sync.WaitGroup
 	consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
 	for i := 0; i < len(req); i++ {
-		// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
+		// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
 		if i > 1 && consumed > maxVram {
 			slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
 			break
--- a/integration/context_test.go
+++ b/integration/context_test.go
@@ -10,38 +10,7 @@ import (
 	"github.com/ollama/ollama/api"
 )

-func TestLongInputContext(t *testing.T) {
-	// Setting NUM_PARALLEL to 1 ensures the allocated context is exactly what
-	// we asked for and there is nothing extra that we could spill over into
-	t.Setenv("OLLAMA_NUM_PARALLEL", "1")
-
-	// Longer needed for small footprint GPUs
-	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
-	defer cancel()
-	// Set up the test data
-	req := api.GenerateRequest{
-		Model:  "llama2",
-		Prompt: "Oh, don’t speak to me of Austria. Perhaps I don’t understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexander’s loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosíltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I don’t believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe! What country is this referring to?",
-		Stream: &stream,
-		Options: map[string]interface{}{
-			"temperature": 0,
-			"seed":        123,
-			"num_ctx":     128,
-		},
-	}
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-	if err := PullIfMissing(ctx, client, req.Model); err != nil {
-		t.Fatalf("PullIfMissing failed: %v", err)
-	}
-	DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia"}, 120*time.Second, 10*time.Second)
-}
-
 func TestContextExhaustion(t *testing.T) {
-	// Setting NUM_PARALLEL to 1 ensures the allocated context is exactly what
-	// we asked for and there is nothing extra that we could spill over into
-	t.Setenv("OLLAMA_NUM_PARALLEL", "1")
-
 	// Longer needed for small footprint GPUs
 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
 	defer cancel()
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@@ -12,7 +12,7 @@ import (
 	"github.com/stretchr/testify/require"
 )

-func TestIntegrationLlava(t *testing.T) {
+func TestIntegrationMultimodal(t *testing.T) {
 	image, err := base64.StdEncoding.DecodeString(imageEncoding)
 	require.NoError(t, err)
 	req := api.GenerateRequest{
@@ -39,33 +39,6 @@ func TestIntegrationLlava(t *testing.T) {
 	DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
 }

-func TestIntegrationMllama(t *testing.T) {
-	image, err := base64.StdEncoding.DecodeString(imageEncoding)
-	require.NoError(t, err)
-	req := api.GenerateRequest{
-		// TODO fix up once we publish the final image
-		Model:  "x/llama3.2-vision",
-		Prompt: "what does the text in this image say?",
-		Stream: &stream,
-		Options: map[string]interface{}{
-			"seed":        42,
-			"temperature": 0.0,
-		},
-		Images: []api.ImageData{
-			image,
-		},
-	}
-
-	resp := "the ollamas"
-	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
-	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-	require.NoError(t, PullIfMissing(ctx, client, req.Model))
-	// mllama models on CPU can be quite slow to start,
-	DoGenerate(ctx, t, client, req, []string{resp}, 240*time.Second, 30*time.Second)
-}
-
 const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb
 AAUAAAABAAAAUgEoAAMAAAABAAIAAIdpAAQAAAABAAAAWgAAAAAAAABIAAAAAQAAAEgAAAABAAOgAQADAAAAAQABAACgAgAEAAAAAQAAANKgAwAEAAAAAQAA
 AHgAAAAAXdsepgAAAAlwSFlzAAALEwAACxMBAJqcGAAAAVlpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6
--- a/integration/max_queue_test.go
+++ b/integration/max_queue_test.go
@@ -16,18 +16,23 @@ import (
 	"github.com/stretchr/testify/require"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 )

 func TestMaxQueue(t *testing.T) {
 	if os.Getenv("OLLAMA_TEST_EXISTING") != "" {
-		t.Skip("Max Queue test requires spawning a local server so we can adjust the queue size")
+		t.Skip("Max Queue test requires spawing a local server so we can adjust the queue size")
 		return
 	}

 	// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless your on GPU
 	// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
-	threadCount := 16
-	t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount))
+	threadCount := 32
+	if maxQueue := envconfig.MaxQueue(); maxQueue != 0 {
+		threadCount = int(maxQueue)
+	} else {
+		t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount))
+	}

 	req := api.GenerateRequest{
 		Model:  "orca-mini",
@@ -67,7 +72,7 @@ func TestMaxQueue(t *testing.T) {
 	busyCount := 0
 	resetByPeerCount := 0
 	canceledCount := 0
-	successCount := 0
+	succesCount := 0
 	counterMu := sync.Mutex{}
 	var embedwg sync.WaitGroup
 	for i := 0; i < threadCount; i++ {
@@ -88,7 +93,7 @@ func TestMaxQueue(t *testing.T) {
 			defer counterMu.Unlock()
 			switch {
 			case genErr == nil:
-				successCount++
+				succesCount++
 				require.Greater(t, len(resp.Embedding), 5) // somewhat arbitrary, but sufficient to be reasonable
 			case errors.Is(genErr, context.Canceled):
 				canceledCount++
@@ -107,7 +112,7 @@ func TestMaxQueue(t *testing.T) {
 	slog.Info("generate done, waiting for embeds")
 	embedwg.Wait()

-	slog.Info("embeds completed", "success", successCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
+	slog.Info("embeds completed", "success", succesCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
 	require.Equal(t, resetByPeerCount, 0, "Connections reset by peer, have you updated your fd and socket limits?")
 	require.True(t, busyCount > 0, "no requests hit busy error but some should have")
 	require.True(t, canceledCount == 0, "no requests should have been canceled due to timeout")
--- a/llama/Makefile
+++ b/llama/Makefile
@@ -0,0 +1,57 @@
+# top level makefile for Go server
+include make/common-defs.make
+
+RUNNER_TARGETS := default
+
+# Determine which if any GPU runners we should build
+ifeq ($(OS),windows)
+	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
+	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
+	CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
+	CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
+	HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
+else ifeq ($(OS),linux)
+	HIP_PATH?=/opt/rocm
+	HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
+	CUDA_PATH?=/usr/local/cuda
+	CUDA_11:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
+	CUDA_12:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
+endif
+
+ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
+ifneq ($(CUDA_11),)
+	RUNNER_TARGETS += cuda_v11
+endif
+ifneq ($(CUDA_12),)
+	RUNNER_TARGETS += cuda_v12
+endif
+endif
+ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
+ifneq ($(HIP_LIB_DIR),)
+	RUNNER_TARGETS += rocm
+endif
+endif
+
+
+all: clean-payload .WAIT runners
+
+runners: $(RUNNER_TARGETS)
+
+$(RUNNER_TARGETS):
+	$(MAKE) -f make/Makefile.$@
+
+help-sync apply-patches create-patches sync:
+	$(MAKE) -f make/Makefile.sync $@
+
+clean:
+	rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
+	go clean -cache
+
+clean-payload:
+	rm -rf $(addprefix $(RUNNERS_PAYLOAD_DIR)/, $(RUNNER_TARGETS) metal cpu cpu_avx cpu_avx2)
+
+.PHONY: all runners clean clean-payload $(RUNNER_TARGETS) .WAIT
+
+# Handy debugging for make variables
+print-%:
+	@echo '$*=$($*)'
--- a/llama/README.md
+++ b/llama/README.md
@@ -55,7 +55,7 @@ go build -tags avx,cuda .

 ### ROCm

-Install [ROCm](https://rocm.docs.amd.com/en/latest/).
+Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive):

 ```shell
 make ggml_hipblas.so
@@ -77,7 +77,7 @@ go build -tags avx,cuda .

 ### ROCm

-Install [ROCm](https://rocm.docs.amd.com/en/latest/).
+Install [ROCm 5.7.1](https://rocm.docs.amd.com/en/docs-5.7.1/).

 ```shell
 make ggml_hipblas.dll
@@ -93,7 +93,7 @@ make -j

 ## Vendoring

-Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we carry a small set of patches which are applied to the tracking commit. A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes.
+Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model.  While we generally strive to contribute changes back upstream to avoid drift, we carry a small set of patches which are applied to the tracking commit.  A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes.

 If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.

@@ -105,35 +105,35 @@ make apply-patches

 **Pin to new base commit**

-To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring`
+To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring.env`

 #### Applying patches

 When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.

-Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
+Start by applying the patches.  If any of the patches have conflicts, the `git am` will stop at the first failure.

 ```
 make apply-patches
 ```

-If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed. Save the file(s) and continue the patch series with `git am --continue` . If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
+If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed.  Save the file(s) and continue the patch series with `git am --continue` .  If any additional patches fail, follow the same pattern until the full patch series is applied.  Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.

 ```
 make create-patches sync
 ```

-Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo.
+Build and test Ollama, and make any necessary changes to the Go code based on the new base commit.  Submit your PR to the Ollama repo.

 ### Generating Patches

-When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied:
+When working on new fixes or features that impact vendored code, use the following model.  First get a clean tracking repo with all current patches applied:

 ```
 make apply-patches
 ```

-Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing. Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:
+Now edit the upstream native code in the `./vendor/` directory.  You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing.  Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:

 ```
 make sync
@@ -142,9 +142,9 @@ go build .
 ```

 > [!IMPORTANT]
-> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo. It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s).
+> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo.  It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s).

-Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with
+Iterate until you're ready to submit PRs.  Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with

 ```
 make create-patches
@@ -157,4 +157,4 @@ In your `./vendor/` directory, create a branch, and cherry-pick the new commit t

 Commit the changes in the ollama repo and submit a PR to Ollama, which will include the vendored code update with your change, along with the patches.

-After your PR upstream is merged, follow the **Updating Base Commit** instructions above, however first remove your patch before running `apply-patches` since the new base commit contains your change already.
+After your PR upstream is merged, follow the **Updating Base Commit** instructions above, however first remove your patch before running `apply-patches` since the new base commit contains your change already.
--- a/llama/amx.cpp
+++ b/llama/amx.cpp
@@ -1,246 +0,0 @@
-/**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "amx.h"
-#include "common.h"
-#include "mmq.h"
-#include "ggml-backend-impl.h"
-#include "ggml-backend.h"
-#include "ggml-impl.h"
-#include "ggml-cpu.h"
-#include "ggml-cpu-traits.h"
-
-#if defined(__gnu_linux__)
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-
-#include <cstdlib>
-#include <cstring>
-#include <memory>
-
-#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
-
-// AMX type_trais
-namespace ggml::cpu::amx {
-class tensor_traits : public ggml::cpu::tensor_traits {
-    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
-        size = ggml_backend_amx_desired_wsize(op);
-        return true;
-    }
-
-    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
-        if (op->op == GGML_OP_MUL_MAT) {
-            ggml_backend_amx_mul_mat(params, op);
-            return true;
-        }
-        return false;
-    }
-};
-
-static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
-    static tensor_traits traits;
-    return &traits;
-}
-}  // namespace ggml::cpu::amx
-
-// AMX buffer interface
-static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
-}
-
-static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return (void *) (buffer->context);
-}
-
-static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
-                                                  uint8_t value, size_t offset, size_t size) {
-    memset((char *) tensor->data + offset, value, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
-                                               const void * data, size_t offset, size_t size) {
-    if (qtype_has_amx_kernels(tensor->type)) {
-        GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
-        ggml_backend_amx_convert_weight(tensor, data, offset, size);
-    } else {
-        memcpy((char *) tensor->data + offset, data, size);
-    }
-
-    GGML_UNUSED(buffer);
-}
-
-/*
-// need to figure what we need to do with buffer->extra.
-static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
-    memcpy(data, (const char *)tensor->data + offset, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        if (qtype_has_amx_kernels(src->type)) {
-            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
-        } else {
-            memcpy(dst->data, src->data, ggml_nbytes(src));
-        }
-        return true;
-    }
-    return false;
-
-    GGML_UNUSED(buffer);
-}
-*/
-
-static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    memset(buffer->context, value, buffer->size);
-}
-
-static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
-    /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_amx_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_amx_buffer_init_tensor,
-    /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor,
-    /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor,
-    /* .get_tensor      = */ nullptr,
-    /* .cpy_tensor      = */ nullptr,
-    /* .clear           = */ ggml_backend_amx_buffer_clear,
-    /* .reset           = */ nullptr,
-};
-
-static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "AMX";
-
-    GGML_UNUSED(buft);
-}
-
-static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * data = ggml_aligned_malloc(size);
-    if (data == NULL) {
-        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
-        return NULL;
-    }
-
-    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
-}
-
-static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return TENSOR_ALIGNMENT;
-
-    GGML_UNUSED(buft);
-}
-
-namespace ggml::cpu::amx {
-class extra_buffer_type : ggml::cpu::extra_buffer_type {
-    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
-        // handle only 2d gemm for now
-        auto is_contiguous_2d = [](const struct ggml_tensor * t) {
-            return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
-        };
-
-        if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) &&  // src0 must be contiguous
-            is_contiguous_2d(op->src[1]) &&                               // src1 must be contiguous
-            op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
-            op->ne[0] % (TILE_N * 2) == 0 &&                              // out_features is 32x
-            (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
-            // src1 must be host buffer
-            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
-                return false;
-            }
-            // src1 must be float32
-            if (op->src[1]->type == GGML_TYPE_F32) {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
-        if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
-            op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
-            return (ggml::cpu::tensor_traits *) op->src[0]->extra;
-        }
-
-        return nullptr;
-    }
-};
-}  // namespace ggml::cpu::amx
-
-static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    return ggml_backend_amx_get_alloc_size(tensor);
-
-    GGML_UNUSED(buft);
-}
-
-#define ARCH_GET_XCOMP_PERM     0x1022
-#define ARCH_REQ_XCOMP_PERM     0x1023
-#define XFEATURE_XTILECFG       17
-#define XFEATURE_XTILEDATA      18
-
-static bool ggml_amx_init() {
-#if defined(__gnu_linux__)
-    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
-        fprintf(stderr, "AMX is not ready to be used!\n");
-        return false;
-    }
-    return true;
-#elif defined(_WIN32)
-    return true;
-#endif
-}
-
-ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
-        /* .iface = */ {
-                        /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
-                        /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
-                        /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
-                        /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
-                        /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
-                        /* .is_host          = */ nullptr,
-                        },
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context = */ new ggml::cpu::amx::extra_buffer_type(),
-    };
-
-    if (!ggml_amx_init()) {
-        return nullptr;
-    }
-
-    return &ggml_backend_buffer_type_amx;
-}
-
-#endif  // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Daniel Hiltgen	8de8729e35	Remove llama.cpp submodule and shift new build to top	2024-10-23 22:06:01 -07:00
Daniel Hiltgen	4e988ad5d6	Move Go code out of llm package	2024-10-23 12:38:11 -07:00
				`@@ -0,0 +1 @@`
				`This is here to make sure the build/ directory exists for the go:embed command`