Compare commits


1 Commit

Author          SHA1        Message                   Date
Jeffrey Morgan  bd933c24bc  testing new cmake script  2024-03-03 00:51:07 -08:00
136 changed files with 1646 additions and 45635 deletions


@@ -1,9 +1,8 @@
.vscode
ollama
app
macapp
dist
llm/llama.cpp
.env
.cache
test_data
test_data

.gitattributes (vendored)

@@ -1 +0,0 @@
llm/ext_server/* linguist-vendored


@@ -1,18 +0,0 @@
name: Model request
description: Request a new model for the library
labels: [mr]
body:
- type: markdown
attributes:
value: |
Please check if your Model request is [already available](https://ollama.com/search) or that you cannot [import it](https://github.com/ollama/ollama/blob/main/docs/import.md#import-a-model) yourself.
Tell us about which Model you'd like to see in the library!
- type: textarea
id: problem
attributes:
label: What model would you like?
description: Please provide a link to the model.
- type: markdown
attributes:
value: |
Thanks for filing a model request!


@@ -1,41 +0,0 @@
name: Feature request
description: Propose a new feature
labels: [needs-triage, fr]
body:
- type: markdown
attributes:
value: |
Please check if your feature request is [already filed](https://github.com/ollama/ollama/issues).
Tell us about your idea!
- type: textarea
id: problem
attributes:
label: What are you trying to do?
description: Tell us about the problem you're trying to solve.
validations:
required: false
- type: textarea
id: solution
attributes:
label: How should we solve this?
description: If you have an idea of how you'd like to see this feature work, let us know.
validations:
required: false
- type: textarea
id: alternative
attributes:
label: What is the impact of not solving this?
description: (How) Are you currently working around the issue?
validations:
required: false
- type: textarea
id: context
attributes:
label: Anything else?
description: Any additional context to share, e.g., links
validations:
required: false
- type: markdown
attributes:
value: |
Thanks for filing a feature request!


@@ -1,125 +0,0 @@
name: Bug report
description: File a bug report. If you need help, please join our Discord server.
labels: [needs-triage, bug]
body:
- type: markdown
attributes:
value: |
Please check if your bug is [already filed](https://github.com/ollama/ollama/issues) before filing a new one.
- type: textarea
id: what-happened
attributes:
label: What is the issue?
description: What happened? What did you expect to happen?
validations:
required: true
- type: textarea
id: what-was-expected
attributes:
label: What did you expect to see?
description: What did you expect to see/happen instead?
validations:
required: true
- type: textarea
id: steps
attributes:
label: Steps to reproduce
description: What are the steps you took that hit this issue?
validations:
required: false
- type: textarea
id: changes
attributes:
label: Are there any recent changes that introduced the issue?
description: If so, what are those changes?
validations:
required: false
- type: dropdown
id: os
attributes:
label: OS
description: What OS are you using? You may select more than one.
multiple: true
options:
- Linux
- macOS
- Windows
- Other
validations:
required: false
- type: dropdown
id: architecture
attributes:
label: Architecture
description: What architecture are you using? You may select more than one.
multiple: true
options:
- arm64
- amd64
- x86
- Other
- type: dropdown
id: platform
attributes:
label: Platform
description: What platform are you using? You may select more than one.
multiple: true
options:
- Docker
- WSL
- WSL2
validations:
required: false
- type: input
id: ollama-version
attributes:
label: Ollama version
description: What Ollama version are you using? (`ollama --version`)
placeholder: e.g., 1.14.4
validations:
required: false
- type: dropdown
id: gpu
attributes:
label: GPU
description: What GPU, if any, are you using? You may select more than one.
multiple: true
options:
- Nvidia
- AMD
- Intel
- Apple
- Other
validations:
required: false
- type: textarea
id: gpu-info
attributes:
label: GPU info
description: What GPU info do you have? (`nvidia-smi`, `rocminfo`, `system_profiler SPDisplaysDataType`, etc.)
validations:
required: false
- type: dropdown
id: cpu
attributes:
label: CPU
description: What CPU are you using? You may select more than one.
multiple: true
options:
- Intel
- AMD
- Apple
- Other
validations:
required: false
- type: textarea
id: other-software
attributes:
label: Other software
description: What other software are you using that might be related to this issue?
validations:
required: false
- type: markdown
attributes:
value: |
Thanks for filing a bug report!


@@ -1,8 +0,0 @@
blank_issues_enabled: true
contact_links:
- name: Help
url: https://discord.com/invite/ollama
about: Please join our Discord server for help using Ollama
- name: Troubleshooting
url: https://github.com/ollama/ollama/blob/main/docs/faq.md#faq
about: See the FAQ for common issues and solutions


@@ -1,465 +0,0 @@
name: release
on:
push:
tags:
- 'v*'
jobs:
# Full build of the Mac assets
build-darwin:
runs-on: macos-12
environment: release
steps:
- uses: actions/checkout@v4
- name: Set Version
shell: bash
run: |
echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
- name: key
env:
MACOS_SIGNING_KEY: ${{ secrets.MACOS_SIGNING_KEY }}
MACOS_SIGNING_KEY_PASSWORD: ${{ secrets.MACOS_SIGNING_KEY_PASSWORD }}
run: |
echo $MACOS_SIGNING_KEY | base64 --decode > certificate.p12
security create-keychain -p password build.keychain
security default-keychain -s build.keychain
security unlock-keychain -p password build.keychain
security import certificate.p12 -k build.keychain -P $MACOS_SIGNING_KEY_PASSWORD -T /usr/bin/codesign
security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k password build.keychain
- uses: actions/setup-go@v5
with:
go-version: '1.22'
cache: true
- name: Build Darwin
env:
APPLE_IDENTITY: ${{ secrets.APPLE_IDENTITY }}
APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
APPLE_TEAM_ID: ${{ vars.APPLE_TEAM_ID }}
APPLE_ID: ${{ vars.APPLE_ID }}
SDKROOT: /Applications/Xcode_13.4.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
DEVELOPER_DIR: /Applications/Xcode_13.4.1.app/Contents/Developer
run: |
./scripts/build_darwin.sh
- uses: actions/upload-artifact@v4
with:
name: dist-darwin
path: |
dist/*arwin*
!dist/*-cov
# Windows builds take a long time to both install the dependencies and build, so parallelize
# CPU generation step
generate-windows-cpu:
environment: release
runs-on: windows
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
- uses: actions/checkout@v4
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- uses: 'google-github-actions/auth@v2'
with:
project_id: 'ollama'
credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
- run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
- name: install Windows SDK 8.1 to get signtool
run: |
$ErrorActionPreference = "Stop"
write-host "downloading SDK"
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
write-host "Win SDK 8.1 installed"
gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
- name: install signing plugin
run: |
$ErrorActionPreference = "Stop"
write-host "downloading plugin"
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version: '1.22'
cache: true
- run: go get ./...
- run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH"
go generate -x ./...
name: go generate
- uses: actions/upload-artifact@v4
with:
name: generate-windows-cpu
path: llm/llama.cpp/build/**/lib/*
# ROCm generation step
generate-windows-rocm:
environment: release
runs-on: windows
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
- uses: actions/checkout@v4
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- uses: 'google-github-actions/auth@v2'
with:
project_id: 'ollama'
credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
- run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
- name: install Windows SDK 8.1 to get signtool
run: |
$ErrorActionPreference = "Stop"
write-host "downloading SDK"
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
write-host "Win SDK 8.1 installed"
gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
- name: install signing plugin
run: |
$ErrorActionPreference = "Stop"
write-host "downloading plugin"
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version: '1.22'
cache: true
- name: "Install ROCm"
run: |
$ErrorActionPreference = "Stop"
write-host "downloading AMD HIP Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP"
- name: "Verify ROCm"
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- run: go get ./...
- run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH"
$env:OLLAMA_SKIP_CPU_GENERATE="1"
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
go generate -x ./...
name: go generate
- name: "gather rocm dependencies"
run: |
$HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
md "dist\deps\bin\rocblas\library"
cp "${HIP_PATH}\bin\hipblas.dll" "dist\deps\bin\"
cp "${HIP_PATH}\bin\rocblas.dll" "dist\deps\bin\"
cp "${HIP_PATH}\bin\rocblas\library\*" "dist\deps\bin\rocblas\library\"
- uses: actions/upload-artifact@v4
with:
name: generate-windows-rocm
path: llm/llama.cpp/build/**/lib/*
- uses: actions/upload-artifact@v4
with:
name: windows-rocm-deps
path: dist/deps/*
# CUDA generation step
generate-windows-cuda:
environment: release
runs-on: windows
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
- uses: actions/checkout@v4
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- uses: 'google-github-actions/auth@v2'
with:
project_id: 'ollama'
credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
- run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
- name: install Windows SDK 8.1 to get signtool
run: |
$ErrorActionPreference = "Stop"
write-host "downloading SDK"
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
write-host "Win SDK 8.1 installed"
gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
- name: install signing plugin
run: |
$ErrorActionPreference = "Stop"
write-host "downloading plugin"
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version: '1.22'
cache: true
# TODO - consider replacing this action with a ps1 snippet to install
# This action seems to fail sometimes with "no tools in cache", but a re-run of the failed job clears it
# https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
- name: "Install CUDA"
uses: Jimver/cuda-toolkit@v0.2.14
id: cuda-toolkit
with:
cuda: '11.3.1'
- name: "Verify CUDA"
run: nvcc -V
- run: go get ./...
- name: go generate
run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH"
$env:OLLAMA_SKIP_CPU_GENERATE="1"
go generate -x ./...
- name: "gather cuda dependencies"
run: |
$NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0]
md "dist\deps"
cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\"
cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\"
cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
- uses: actions/upload-artifact@v4
with:
name: generate-windows-cuda
path: llm/llama.cpp/build/**/lib/*
- uses: actions/upload-artifact@v4
with:
name: windows-cuda-deps
path: dist/deps/*
# Import the prior generation steps and build the final windows assets
build-windows:
environment: release
runs-on: windows
needs:
- generate-windows-cuda
- generate-windows-rocm
- generate-windows-cpu
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- uses: 'google-github-actions/auth@v2'
with:
project_id: 'ollama'
credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
- run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
- name: install Windows SDK 8.1 to get signtool
run: |
$ErrorActionPreference = "Stop"
write-host "downloading SDK"
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
write-host "Win SDK 8.1 installed"
gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
- name: install signing plugin
run: |
$ErrorActionPreference = "Stop"
write-host "downloading plugin"
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version: '1.22'
cache: true
- run: go get
- uses: actions/download-artifact@v4
with:
name: generate-windows-cpu
path: llm/llama.cpp/build
- uses: actions/download-artifact@v4
with:
name: generate-windows-cuda
path: llm/llama.cpp/build
- uses: actions/download-artifact@v4
with:
name: windows-cuda-deps
path: dist/deps
- uses: actions/download-artifact@v4
with:
name: windows-rocm-deps
path: dist/deps
- uses: actions/download-artifact@v4
with:
name: generate-windows-rocm
path: llm/llama.cpp/build
- run: dir llm/llama.cpp/build
- run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH"
$env:OLLAMA_SKIP_GENERATE="1"
$env:NVIDIA_DIR=$(resolve-path ".\dist\deps")
$env:HIP_PATH=$(resolve-path ".\dist\deps")
& .\scripts\build_windows.ps1
- uses: actions/upload-artifact@v4
with:
name: dist-windows
path: dist/*.exe
# Linux x86 assets built using the container based build
build-linux-amd64:
environment: release
runs-on: linux
env:
OLLAMA_SKIP_MANIFEST_CREATE: "1"
BUILD_ARCH: amd64
PUSH: "1"
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- run: |
./scripts/build_linux.sh
./scripts/build_docker.sh
mv dist/deps/* dist/
- uses: actions/upload-artifact@v4
with:
name: dist-linux-amd64
path: |
dist/*linux*
!dist/*-cov
# Linux ARM assets built using the container based build
# (at present, docker isn't pre-installed on arm ubuntu images)
build-linux-arm64:
environment: release
runs-on: linux-arm64
env:
OLLAMA_SKIP_MANIFEST_CREATE: "1"
BUILD_ARCH: arm64
PUSH: "1"
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- name: "Install Docker"
run: |
# Add Docker's official GPG key:
env
uname -a
sudo apt-get update
sudo apt-get install -y ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
# Add the repository to Apt sources:
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
sudo usermod -aG docker $USER
sudo apt-get install acl
sudo setfacl --modify user:$USER:rw /var/run/docker.sock
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- run: |
./scripts/build_linux.sh
./scripts/build_docker.sh
- uses: actions/upload-artifact@v4
with:
name: dist-linux-arm64
path: |
dist/*linux*
!dist/*-cov
# Aggregate all the assets and ship a release
release:
needs:
- build-darwin
- build-windows
- build-linux-amd64
- build-linux-arm64
runs-on: linux
environment: release
permissions:
contents: write
env:
OLLAMA_SKIP_IMAGE_BUILD: "1"
PUSH: "1"
steps:
- uses: actions/checkout@v4
- name: Set Version
shell: bash
run: |
echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- run: ./scripts/build_docker.sh
- name: Retrieve built artifact
uses: actions/download-artifact@v4
with:
path: dist
pattern: dist-*
merge-multiple: true
- run: |
ls -lh dist/
(cd dist; sha256sum * > sha256sum.txt)
cat dist/sha256sum.txt
- uses: ncipollo/release-action@v1
with:
name: ${{ env.RELEASE_VERSION }}
allowUpdates: true
artifacts: "dist/*"
draft: true
prerelease: true
omitBodyDuringUpdate: true
generateReleaseNotes: true
omitDraftDuringUpdate: true
omitPrereleaseDuringUpdate: true
replacesArtifacts: true


@@ -2,46 +2,17 @@ name: test
on:
pull_request:
paths:
- '**/*'
- '!docs/**'
- '!examples/**'
- '!README.md'
jobs:
changes:
runs-on: ubuntu-latest
outputs:
GENERATE_CUDA: ${{ steps.changes.outputs.GENERATE_CUDA }}
GENERATE_ROCM: ${{ steps.changes.outputs.GENERATE_ROCM }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- id: changes
run: |
changed() {
git diff-tree -r --no-commit-id --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
| xargs python3 -c "import sys; print(any([x.startswith('$1') for x in sys.argv[1:]]))"
}
{
echo GENERATE=$(changed llm/)
echo GENERATE_CUDA=$(changed llm/)
echo GENERATE_ROCM=$(changed llm/)
} >>$GITHUB_OUTPUT
generate:
needs: [changes]
if: ${{ needs.changes.outputs.GENERATE == 'True' }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-2019]
os: [ubuntu-latest, macos-latest, windows-latest]
arch: [amd64, arm64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-2019
- os: windows-latest
arch: arm64
runs-on: ${{ matrix.os }}
env:
@@ -50,28 +21,15 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: '1.22'
go-version: '1.21'
cache: true
- run: go get ./...
- run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH"
go generate -x ./...
if: ${{ startsWith(matrix.os, 'windows-') }}
name: "Windows Go Generate"
- run: go generate -x ./...
if: ${{ ! startsWith(matrix.os, 'windows-') }}
name: "Unix Go Generate"
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
path: llm/llama.cpp/build/**/lib/*
generate-cuda:
needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
strategy:
matrix:
cuda-version:
@@ -88,7 +46,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: '1.22'
go-version: '1.21'
cache: true
- run: go get ./...
- run: |
@@ -101,11 +59,10 @@ jobs:
name: cuda-${{ matrix.cuda-version }}-libraries
path: llm/llama.cpp/build/**/lib/*
generate-rocm:
needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
strategy:
matrix:
rocm-version:
- '5.7.1'
- '6.0'
runs-on: linux
container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
@@ -119,7 +76,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: '1.22'
go-version: '1.21'
cache: true
- run: go get ./...
- run: |
@@ -134,89 +91,69 @@ jobs:
lint:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-2019]
os: [ubuntu-latest, macos-latest, windows-latest]
arch: [amd64, arm64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-2019
- os: windows-latest
arch: arm64
- os: macos-latest
arch: amd64
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: '1'
CGO_ENABLED: "1"
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-go@v5
with:
go-version: '1.22'
go-version: '1.21'
cache: false
- run: |
case ${{ matrix.arch }} in
amd64) echo ARCH=x86_64 ;;
arm64) echo ARCH=arm64 ;;
esac >>$GITHUB_ENV
shell: bash
- run: |
mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/
touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so
mkdir -p llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/
touch llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/stub.so
if: ${{ startsWith(matrix.os, 'ubuntu-') }}
- run: |
mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/
touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib
mkdir -p llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/
touch llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/stub.dylib
touch llm/llama.cpp/ggml-metal.metal
if: ${{ startsWith(matrix.os, 'macos-') }}
- run: |
mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/
touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll
mkdir -p llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/
touch llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/stub.dll
if: ${{ startsWith(matrix.os, 'windows-') }}
- uses: golangci/golangci-lint-action@v3
test:
needs: generate
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-2019]
os: [ubuntu-latest, macos-latest, windows-latest]
arch: [amd64]
exclude:
- os: ubuntu-latest
arch: arm64
- os: windows-2019
- os: windows-latest
arch: arm64
runs-on: ${{ matrix.os }}
env:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: '1'
CGO_ENABLED: "1"
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-go@v5
with:
go-version: '1.22'
go-version: '1.21'
cache: true
- run: go get
- run: |
case ${{ matrix.arch }} in
amd64) echo ARCH=x86_64 ;;
arm64) echo ARCH=arm64 ;;
esac >>$GITHUB_ENV
shell: bash
- run: |
mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/
touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so
if: ${{ startsWith(matrix.os, 'ubuntu-') }}
- run: |
mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/
touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib
touch llm/llama.cpp/ggml-metal.metal
if: ${{ startsWith(matrix.os, 'macos-') }}
- run: |
mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/
touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll
if: ${{ startsWith(matrix.os, 'windows-') }}
- uses: actions/download-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
path: llm/llama.cpp/build
- run: go build
- run: go test -v ./...
- uses: actions/upload-artifact@v4


@@ -24,4 +24,4 @@ linters-settings:
- (*os.File).Seek
- (*bufio.Writer).WriteString
- (*github.com/spf13/pflag.FlagSet).Set
- (*github.com/ollama/ollama/llm.readSeekOffset).Seek
- (*github.com/jmorganca/ollama/llm.readSeekOffset).Seek


@@ -1,8 +1,6 @@
ARG GOLANG_VERSION=1.22.1
ARG GOLANG_VERSION=1.21.3
ARG CMAKE_VERSION=3.22.1
# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
ARG CUDA_VERSION=11.3.1
ARG ROCM_VERSION=6.0
# Copy the minimal context we need to run the generate scripts
FROM scratch AS llm-code
@@ -15,8 +13,8 @@ ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
@@ -25,30 +23,34 @@ ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
ARG CGO_CFLAGS
ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
RUN mkdir /tmp/scratch && \
for dep in $(cat /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/x86_64/rocm*/lib/deps.txt) ; do \
cp ${dep} /tmp/scratch/ || exit 1 ; \
done && \
(cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \
(cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )
FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
ARG CGO_CFLAGS
ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
ARG CMAKE_VERSION
@@ -56,10 +58,10 @@ ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
@@ -74,8 +76,8 @@ ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
# Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
@@ -84,41 +86,40 @@ RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
# Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
ENV CGO_ENABLED 1
WORKDIR /go/src/github.com/ollama/ollama
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
COPY --from=cpu_avx-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-5-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-6-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build -trimpath .
RUN go build .
# Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
ENV CGO_ENABLED 1
ARG GOLANG_VERSION
WORKDIR /go/src/github.com/ollama/ollama
WORKDIR /go/src/github.com/jmorganca/ollama
COPY . .
COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
RUN mkdir -p /go/src/github.com/ollama/ollama/dist/deps/
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build -trimpath .
RUN go build .
# Runtime stages
FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
COPY --from=build-arm64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
# Radeon images are much larger so we keep it distinct from the CPU/CUDA image
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete as runtime-rocm
RUN update-pciids
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0


@@ -1,5 +1,5 @@
<div align="center">
<img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
<img alt="ollama" height="200px" src="https://github.com/jmorganca/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
</div>
# Ollama
@@ -22,7 +22,7 @@ Get up and running with large language models locally.
curl -fsSL https://ollama.com/install.sh | sh
```
[Manual install instructions](https://github.com/ollama/ollama/blob/main/docs/linux.md)
[Manual install instructions](https://github.com/jmorganca/ollama/blob/main/docs/linux.md)
### Docker
@@ -213,7 +213,7 @@ Then build the binary:
go build .
```
More detailed instructions can be found in the [developer guide](https://github.com/ollama/ollama/blob/main/docs/development.md)
More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md)
### Running local builds
@@ -259,11 +259,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Web & Desktop
- [LibreChat](https://github.com/danny-avila/LibreChat)
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Saddle](https://github.com/jikkuatwork/saddle)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
@@ -274,21 +272,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Amica](https://github.com/semperai/amica)
- [chatd](https://github.com/BruceMacD/chatd)
- [Ollama-SwiftUI](https://github.com/kghandour/Ollama-SwiftUI)
- [Dify.AI](https://github.com/langgenius/dify)
- [MindMac](https://mindmac.app)
- [NextJS Web Interface for Ollama](https://github.com/jakobhoeg/nextjs-ollama-llm-ui)
- [Msty](https://msty.app)
- [Chatbox](https://github.com/Bin-Huang/Chatbox)
- [WinForm Ollama Copilot](https://github.com/tgraupmann/WinForm_Ollama_Copilot)
- [NextChat](https://github.com/ChatGPTNextWeb/ChatGPT-Next-Web) with [Get Started Doc](https://docs.nextchat.dev/models/ollama)
- [Alpaca WebUI](https://github.com/mmo80/alpaca-webui)
- [OllamaGUI](https://github.com/enoch1118/ollamaGUI)
- [OpenAOE](https://github.com/InternLM/OpenAOE)
- [Odin Runes](https://github.com/leonid20000/OdinRunes)
- [LLM-X: Progressive Web App](https://github.com/mrdjohnson/llm-x)
- [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
- [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
### Terminal
@@ -297,18 +285,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Emacs client](https://github.com/zweifisch/ollama)
- [gen.nvim](https://github.com/David-Kunz/gen.nvim)
- [ollama.nvim](https://github.com/nomnivore/ollama.nvim)
- [ollero.nvim](https://github.com/marco-souza/ollero.nvim)
- [ollama-chat.nvim](https://github.com/gerazov/ollama-chat.nvim)
- [ogpt.nvim](https://github.com/huynle/ogpt.nvim)
- [gptel Emacs client](https://github.com/karthink/gptel)
- [Oatmeal](https://github.com/dustinblackman/oatmeal)
- [cmdh](https://github.com/pgibler/cmdh)
- [ooo](https://github.com/npahlfer/ooo)
- [tenere](https://github.com/pythops/tenere)
- [llm-ollama](https://github.com/taketwo/llm-ollama) for [Datasette's LLM CLI](https://llm.datasette.io/en/stable/).
- [typechat-cli](https://github.com/anaisbetts/typechat-cli)
- [ShellOracle](https://github.com/djcopley/ShellOracle)
- [tlm](https://github.com/yusufcanb/tlm)
### Database
@@ -325,6 +309,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
- [LangChain4j](https://github.com/langchain4j/langchain4j/tree/main/langchain4j-ollama)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
@@ -341,7 +326,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama for R - rollama](https://github.com/JBGruber/rollama)
- [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)
- [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)
- [Testcontainers](https://testcontainers.com/modules/ollama/)
### Mobile
@@ -355,7 +339,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Continue](https://github.com/continuedev/continue)
- [Obsidian Ollama plugin](https://github.com/hinterdupfinger/obsidian-ollama)
- [Logseq Ollama plugin](https://github.com/omagdy7/ollama-logseq)
- [NotesOllama](https://github.com/andersrex/notesollama) (Apple Notes Ollama plugin)
- [Dagger Chatbot](https://github.com/samalba/dagger-chatbot)
- [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot)
- [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
@@ -363,11 +346,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
- [Cliobot](https://github.com/herval/cliobot) (Telegram bot with Ollama support)
- [Copilot for Obsidian plugin](https://github.com/logancyang/obsidian-copilot)
- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)


@@ -15,8 +15,8 @@ import (
"runtime"
"strings"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/version"
"github.com/jmorganca/ollama/format"
"github.com/jmorganca/ollama/version"
)
type Client struct {


@@ -1,50 +0,0 @@
package api
import (
"encoding/json"
"math"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestKeepAliveParsingFromJSON(t *testing.T) {
tests := []struct {
name string
req string
exp *Duration
}{
{
name: "Positive Integer",
req: `{ "keep_alive": 42 }`,
exp: &Duration{42 * time.Second},
},
{
name: "Positive Integer String",
req: `{ "keep_alive": "42m" }`,
exp: &Duration{42 * time.Minute},
},
{
name: "Negative Integer",
req: `{ "keep_alive": -1 }`,
exp: &Duration{math.MaxInt64},
},
{
name: "Negative Integer String",
req: `{ "keep_alive": "-1m" }`,
exp: &Duration{math.MaxInt64},
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
var dec ChatRequest
err := json.Unmarshal([]byte(test.req), &dec)
require.NoError(t, err)
assert.Equal(t, test.exp, dec.KeepAlive)
})
}
}
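
The removed test above pins down how the `keep_alive` field is parsed: a bare JSON integer is read as seconds, a string is parsed with Go duration syntax (e.g. `"42m"`), and any negative value means "keep the model loaded indefinitely" (`math.MaxInt64`). A minimal, self-contained sketch of an unmarshaller that satisfies exactly those cases is shown below; the `Duration` name matches the API type the test exercises, but the body is illustrative and may differ from the actual `api.Duration` implementation.

```go
package main

import (
	"encoding/json"
	"fmt"
	"math"
	"time"
)

// Duration is a sketch of a keep_alive-style wrapper consistent with the
// removed TestKeepAliveParsingFromJSON cases; not necessarily api.Duration.
type Duration struct {
	time.Duration
}

func (d *Duration) UnmarshalJSON(b []byte) error {
	var v any
	if err := json.Unmarshal(b, &v); err != nil {
		return err
	}
	switch t := v.(type) {
	case float64: // bare JSON numbers are interpreted as seconds
		if t < 0 {
			d.Duration = time.Duration(math.MaxInt64) // negative => never unload
		} else {
			d.Duration = time.Duration(t) * time.Second
		}
	case string: // strings use Go duration syntax, e.g. "42m"
		dur, err := time.ParseDuration(t)
		if err != nil {
			return err
		}
		if dur < 0 {
			dur = time.Duration(math.MaxInt64)
		}
		d.Duration = dur
	default:
		return fmt.Errorf("unsupported keep_alive value %T", v)
	}
	return nil
}

func main() {
	var req struct {
		KeepAlive *Duration `json:"keep_alive"`
	}
	if err := json.Unmarshal([]byte(`{"keep_alive": "42m"}`), &req); err != nil {
		panic(err)
	}
	fmt.Println(req.KeepAlive.Duration) // 42m0s
}
```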


@@ -9,8 +9,8 @@ import (
"os/signal"
"syscall"
"github.com/ollama/ollama/app/store"
"github.com/ollama/ollama/app/tray"
"github.com/jmorganca/ollama/app/store"
"github.com/jmorganca/ollama/app/tray"
)
func Run() {


@@ -11,7 +11,7 @@ import (
"path/filepath"
"time"
"github.com/ollama/ollama/api"
"github.com/jmorganca/ollama/api"
)
func getCLIFullPath(command string) string {


@@ -18,8 +18,8 @@ import (
"strings"
"time"
"github.com/ollama/ollama/auth"
"github.com/ollama/ollama/version"
"github.com/jmorganca/ollama/auth"
"github.com/jmorganca/ollama/version"
)
var (


@@ -4,7 +4,7 @@ package main
// go build -ldflags="-H windowsgui" .
import (
"github.com/ollama/ollama/app/lifecycle"
"github.com/jmorganca/ollama/app/lifecycle"
)
func main() {


@@ -28,8 +28,8 @@ AppPublisher={#MyAppPublisher}
AppPublisherURL={#MyAppURL}
AppSupportURL={#MyAppURL}
AppUpdatesURL={#MyAppURL}
ArchitecturesAllowed=x64 arm64
ArchitecturesInstallIn64BitMode=x64 arm64
ArchitecturesAllowed=x64
ArchitecturesInstallIn64BitMode=x64
DefaultDirName={localappdata}\Programs\{#MyAppName}
DefaultGroupName={#MyAppName}
DisableProgramGroupPage=yes
@@ -91,14 +91,6 @@ Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windeps\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
; Assumes v5.7, may need adjustments for v6
#if GetEnv("HIP_PATH") != ""
Source: "{#GetEnv('HIP_PATH')}\bin\hipblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion
; amdhip64.dll dependency comes from the driver and must be installed already
Source: "{#GetEnv('HIP_PATH')}\bin\rocblas\library\*"; DestDir: "{app}\rocm\rocblas\library\"; Flags: ignoreversion
#endif
[Icons]
Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"


@@ -4,8 +4,8 @@ import (
"fmt"
"runtime"
"github.com/ollama/ollama/app/assets"
"github.com/ollama/ollama/app/tray/commontray"
"github.com/jmorganca/ollama/app/assets"
"github.com/jmorganca/ollama/app/tray/commontray"
)
func NewTray() (commontray.OllamaTray, error) {


@@ -5,7 +5,7 @@ package tray
import (
"fmt"
"github.com/ollama/ollama/app/tray/commontray"
"github.com/jmorganca/ollama/app/tray/commontray"
)
func InitPlatformTray(icon, updateIcon []byte) (commontray.OllamaTray, error) {


@@ -1,8 +1,8 @@
package tray
import (
"github.com/ollama/ollama/app/tray/commontray"
"github.com/ollama/ollama/app/tray/wintray"
"github.com/jmorganca/ollama/app/tray/commontray"
"github.com/jmorganca/ollama/app/tray/wintray"
)
func InitPlatformTray(icon, updateIcon []byte) (commontray.OllamaTray, error) {


@@ -13,7 +13,7 @@ import (
"sync"
"unsafe"
"github.com/ollama/ollama/app/tray/commontray"
"github.com/jmorganca/ollama/app/tray/commontray"
"golang.org/x/sys/windows"
)


@@ -1,7 +1,6 @@
package cmd
import (
"archive/zip"
"bytes"
"context"
"crypto/ed25519"
@@ -30,12 +29,12 @@ import (
"golang.org/x/exp/slices"
"golang.org/x/term"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/server"
"github.com/ollama/ollama/version"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/format"
"github.com/jmorganca/ollama/parser"
"github.com/jmorganca/ollama/progress"
"github.com/jmorganca/ollama/server"
"github.com/jmorganca/ollama/version"
)
func CreateHandler(cmd *cobra.Command, args []string) error {
@@ -88,82 +87,22 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
path = filepath.Join(filepath.Dir(filename), path)
}
fi, err := os.Stat(path)
bin, err := os.Open(path)
if errors.Is(err, os.ErrNotExist) && c.Name == "model" {
continue
} else if err != nil {
return err
}
defer bin.Close()
// TODO make this work w/ adapters
if fi.IsDir() {
tf, err := os.CreateTemp("", "ollama-tf")
if err != nil {
return err
}
defer os.RemoveAll(tf.Name())
zf := zip.NewWriter(tf)
files, err := filepath.Glob(filepath.Join(path, "model-*.safetensors"))
if err != nil {
return err
}
if len(files) == 0 {
return fmt.Errorf("no safetensors files were found in '%s'", path)
}
// add the safetensor config file + tokenizer
files = append(files, filepath.Join(path, "config.json"))
files = append(files, filepath.Join(path, "added_tokens.json"))
files = append(files, filepath.Join(path, "tokenizer.model"))
for _, fn := range files {
f, err := os.Open(fn)
if os.IsNotExist(err) && strings.HasSuffix(fn, "added_tokens.json") {
continue
} else if err != nil {
return err
}
fi, err := f.Stat()
if err != nil {
return err
}
h, err := zip.FileInfoHeader(fi)
if err != nil {
return err
}
h.Name = filepath.Base(fn)
h.Method = zip.Store
w, err := zf.CreateHeader(h)
if err != nil {
return err
}
_, err = io.Copy(w, f)
if err != nil {
return err
}
}
if err := zf.Close(); err != nil {
return err
}
if err := tf.Close(); err != nil {
return err
}
path = tf.Name()
hash := sha256.New()
if _, err := io.Copy(hash, bin); err != nil {
return err
}
bin.Seek(0, io.SeekStart)
digest, err := createBlob(cmd, client, path)
if err != nil {
digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
return err
}
@@ -202,26 +141,6 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
return nil
}
func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) {
bin, err := os.Open(path)
if err != nil {
return "", err
}
defer bin.Close()
hash := sha256.New()
if _, err := io.Copy(hash, bin); err != nil {
return "", err
}
bin.Seek(0, io.SeekStart)
digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
return "", err
}
return digest, nil
}
func RunHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
@@ -887,20 +806,13 @@ func versionHandler(cmd *cobra.Command, _ []string) {
}
}
func appendHostEnvDocs(cmd *cobra.Command) {
const hostEnvDocs = `
Environment Variables:
OLLAMA_HOST The host:port or base URL of the Ollama server (e.g. http://localhost:11434)
`
cmd.SetUsageTemplate(cmd.UsageTemplate() + hostEnvDocs)
}
func NewCLI() *cobra.Command {
log.SetFlags(log.LstdFlags | log.Lshortfile)
cobra.EnableCommandSorting = false
if runtime.GOOS == "windows" {
console.ConsoleFromFile(os.Stdin) //nolint:errcheck
// Enable colorful ANSI escape code in Windows terminal (disabled by default)
console.ConsoleFromFile(os.Stdout) //nolint:errcheck
}
rootCmd := &cobra.Command{
@@ -959,6 +871,7 @@ func NewCLI() *cobra.Command {
runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
runCmd.Flags().String("format", "", "Response format (e.g. json)")
serveCmd := &cobra.Command{
Use: "serve",
Aliases: []string{"start"},
@@ -966,14 +879,6 @@ func NewCLI() *cobra.Command {
Args: cobra.ExactArgs(0),
RunE: RunServer,
}
serveCmd.SetUsageTemplate(serveCmd.UsageTemplate() + `
Environment Variables:
OLLAMA_HOST The host:port to bind to (default "127.0.0.1:11434")
OLLAMA_ORIGINS A comma separated list of allowed origins.
OLLAMA_MODELS The path to the models directory (default is "~/.ollama/models")
OLLAMA_KEEP_ALIVE The duration that models stay loaded in memory (default is "5m")
`)
pullCmd := &cobra.Command{
Use: "pull MODEL",
@@ -1002,6 +907,7 @@ Environment Variables:
PreRunE: checkServerHeartbeat,
RunE: ListHandler,
}
copyCmd := &cobra.Command{
Use: "cp SOURCE TARGET",
Short: "Copy a model",
@@ -1018,19 +924,6 @@ Environment Variables:
RunE: DeleteHandler,
}
for _, cmd := range []*cobra.Command{
createCmd,
showCmd,
runCmd,
pullCmd,
pushCmd,
listCmd,
copyCmd,
deleteCmd,
} {
appendHostEnvDocs(cmd)
}
rootCmd.AddCommand(
serveCmd,
createCmd,
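
The cmd.go hunks above interleave two variants of the same blob-creation flow: a standalone `createBlob` helper on one side of the diff and an inlined form on the other. Stripped of the diff noise, both do the same thing: hash the file with SHA-256, rewind the reader, and upload it under a `sha256:<hex>` digest via `client.CreateBlob`. The sketch below restates that shared pattern in one place for readability; the signature is simplified to take a `context.Context` rather than a `*cobra.Command`, and the import path follows the `github.com/ollama/ollama` form, one of the two module paths appearing in this diff.

```go
package main

import (
	"context"
	"crypto/sha256"
	"fmt"
	"io"
	"os"

	"github.com/ollama/ollama/api"
)

// createBlob is a readability sketch of the digest-then-upload pattern from
// cmd/cmd.go; the helper in the diff takes a *cobra.Command instead of a context.
func createBlob(ctx context.Context, client *api.Client, path string) (string, error) {
	bin, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer bin.Close()

	hash := sha256.New()
	if _, err := io.Copy(hash, bin); err != nil {
		return "", err
	}
	// Rewind so CreateBlob streams the file from the beginning after hashing.
	if _, err := bin.Seek(0, io.SeekStart); err != nil {
		return "", err
	}

	digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
	if err := client.CreateBlob(ctx, digest, bin); err != nil {
		return "", err
	}
	return digest, nil
}

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		panic(err)
	}
	digest, err := createBlob(context.Background(), client, os.Args[1])
	if err != nil {
		panic(err)
	}
	fmt.Println(digest)
}
```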


@@ -14,9 +14,9 @@ import (
"github.com/spf13/cobra"
"golang.org/x/exp/slices"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/readline"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/progress"
"github.com/jmorganca/ollama/readline"
)
type MultilineState int


@@ -7,7 +7,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/ollama/ollama/api"
"github.com/jmorganca/ollama/api"
)
func TestExtractFilenames(t *testing.T) {


@@ -7,7 +7,7 @@ import (
"os/exec"
"strings"
"github.com/ollama/ollama/api"
"github.com/jmorganca/ollama/api"
)
func startApp(ctx context.Context, client *api.Client) error {


@@ -6,7 +6,7 @@ import (
"context"
"fmt"
"github.com/ollama/ollama/api"
"github.com/jmorganca/ollama/api"
)
func startApp(ctx context.Context, client *api.Client) error {


@@ -10,7 +10,7 @@ import (
"strings"
"syscall"
"github.com/ollama/ollama/api"
"github.com/jmorganca/ollama/api"
)
func startApp(ctx context.Context, client *api.Client) error {


@@ -1,331 +0,0 @@
package convert
import (
"bytes"
"cmp"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"regexp"
"slices"
"github.com/mitchellh/mapstructure"
"google.golang.org/protobuf/proto"
"github.com/ollama/ollama/convert/sentencepiece"
"github.com/ollama/ollama/llm"
)
type Params struct {
Architectures []string `json:"architectures"`
VocabSize int `json:"vocab_size"`
HiddenSize int `json:"hidden_size"` // n_embd
HiddenLayers int `json:"num_hidden_layers"` // n_layer
ContextSize int `json:"max_position_embeddings"`
IntermediateSize int `json:"intermediate_size"`
AttentionHeads int `json:"num_attention_heads"` // n_head
KeyValHeads int `json:"num_key_value_heads"`
NormEPS float64 `json:"rms_norm_eps"`
RopeFreqBase float64 `json:"rope_theta"`
BoSTokenID int `json:"bos_token_id"`
EoSTokenID int `json:"eos_token_id"`
}
type MetaData struct {
Type string `mapstructure:"dtype"`
Shape []int `mapstructure:"shape"`
Offsets []int `mapstructure:"data_offsets"`
}
func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
f, err := os.Open(fn)
if err != nil {
return []llm.Tensor{}, 0, err
}
defer f.Close()
var jsonSize uint64
binary.Read(f, binary.LittleEndian, &jsonSize)
buf := make([]byte, jsonSize)
_, err = io.ReadFull(f, buf)
if err != nil {
return []llm.Tensor{}, 0, err
}
d := json.NewDecoder(bytes.NewBuffer(buf))
d.UseNumber()
var parsed map[string]interface{}
if err = d.Decode(&parsed); err != nil {
return []llm.Tensor{}, 0, err
}
var keys []string
for k := range parsed {
keys = append(keys, k)
}
slices.Sort(keys)
slog.Info("converting layers")
var tensors []llm.Tensor
for _, k := range keys {
vals := parsed[k].(map[string]interface{})
var data MetaData
if err = mapstructure.Decode(vals, &data); err != nil {
return []llm.Tensor{}, 0, err
}
var size uint64
var kind uint32
switch len(data.Shape) {
case 0:
// metadata
continue
case 1:
// convert to float32
kind = 0
size = uint64(data.Shape[0] * 4)
case 2:
// convert to float16
kind = 1
size = uint64(data.Shape[0] * data.Shape[1] * 2)
}
ggufName, err := GetTensorName(k)
if err != nil {
slog.Error("%v", err)
return []llm.Tensor{}, 0, err
}
shape := []uint64{0, 0, 0, 0}
for i := range data.Shape {
shape[i] = uint64(data.Shape[i])
}
t := llm.Tensor{
Name: ggufName,
Kind: kind,
Offset: offset,
Shape: shape[:],
FileName: fn,
OffsetPadding: 8 + jsonSize,
FileOffsets: []uint64{uint64(data.Offsets[0]), uint64(data.Offsets[1])},
}
slog.Debug(fmt.Sprintf("%v", t))
tensors = append(tensors, t)
offset += size
}
return tensors, offset, nil
}
func GetSafeTensors(dirpath string) ([]llm.Tensor, error) {
var tensors []llm.Tensor
files, err := filepath.Glob(filepath.Join(dirpath, "/model-*.safetensors"))
if err != nil {
return []llm.Tensor{}, err
}
var offset uint64
for _, f := range files {
var t []llm.Tensor
var err error
t, offset, err = ReadSafeTensors(f, offset)
if err != nil {
slog.Error("%v", err)
return []llm.Tensor{}, err
}
tensors = append(tensors, t...)
}
return tensors, nil
}
func GetParams(dirpath string) (*Params, error) {
f, err := os.Open(filepath.Join(dirpath, "config.json"))
if err != nil {
return nil, err
}
defer f.Close()
var params Params
d := json.NewDecoder(f)
err = d.Decode(&params)
if err != nil {
return nil, err
}
return &params, nil
}
// Details on gguf's tokenizer can be found at:
// https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#tokenizer
type Vocab struct {
Tokens []string
Scores []float32
Types []int32
}
func LoadTokens(dirpath string) (*Vocab, error) {
slog.Info(fmt.Sprintf("reading vocab from %s", filepath.Join(dirpath, "tokenizer.model")))
in, err := os.ReadFile(filepath.Join(dirpath, "tokenizer.model"))
if err != nil {
return nil, err
}
// To regenerate sentencepiece from the protobufs use:
// protoc -I=./ --go_out=./ sentencepiece_model.proto
modelProto := &sentencepiece.ModelProto{}
if err := proto.Unmarshal(in, modelProto); err != nil {
return nil, err
}
v := &Vocab{
Tokens: make([]string, 0),
Scores: make([]float32, 0),
Types: make([]int32, 0),
}
pieces := modelProto.GetPieces()
for _, p := range pieces {
v.Tokens = append(v.Tokens, p.GetPiece())
v.Scores = append(v.Scores, p.GetScore())
t := p.GetType()
v.Types = append(v.Types, int32(t))
}
slog.Info(fmt.Sprintf("vocab size: %d", len(v.Tokens)))
// add any additional tokens
addIn, err := os.ReadFile(filepath.Join(dirpath, "added_tokens.json"))
if os.IsNotExist(err) {
return v, nil
} else if err != nil {
return nil, err
}
slog.Info("reading user defined tokens")
var extraTokenData map[string]int
if err := json.Unmarshal(addIn, &extraTokenData); err != nil {
return nil, err
}
type token struct {
key string
pos int
}
extraTokens := make([]token, 0)
for k, id := range extraTokenData {
extraTokens = append(extraTokens, token{k, id})
}
slices.SortFunc(extraTokens, func(a, b token) int {
return cmp.Compare(a.pos, b.pos)
})
numToks := len(v.Tokens)
for cnt, t := range extraTokens {
// the token id should match the specific index for the total number of tokens
if t.pos != cnt+numToks {
return nil, fmt.Errorf("token ID '%d' for '%s' doesn't match total token size", t.pos, t.key)
}
v.Tokens = append(v.Tokens, t.key)
v.Scores = append(v.Scores, -1000.0)
v.Types = append(v.Types, int32(llm.GGUFTokenUserDefined))
}
slog.Info(fmt.Sprintf("vocab size w/ extra tokens: %d", len(v.Tokens)))
return v, nil
}
func GetTensorName(n string) (string, error) {
tMap := map[string]string{
"model.embed_tokens.weight": "token_embd.weight",
"model.layers.(\\d+).input_layernorm.weight": "blk.$1.attn_norm.weight",
"model.layers.(\\d+).mlp.down_proj.weight": "blk.$1.ffn_down.weight",
"model.layers.(\\d+).mlp.gate_proj.weight": "blk.$1.ffn_gate.weight",
"model.layers.(\\d+).mlp.up_proj.weight": "blk.$1.ffn_up.weight",
"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
"model.layers.(\\d+).self_attn.k_proj.weight": "blk.$1.attn_k.weight",
"model.layers.(\\d+).self_attn.o_proj.weight": "blk.$1.attn_output.weight",
"model.layers.(\\d+).self_attn.q_proj.weight": "blk.$1.attn_q.weight",
"model.layers.(\\d+).self_attn.v_proj.weight": "blk.$1.attn_v.weight",
"lm_head.weight": "output.weight",
"model.norm.weight": "output_norm.weight",
}
v, ok := tMap[n]
if ok {
return v, nil
}
// quick hack to rename the layers to gguf format
for k, v := range tMap {
re := regexp.MustCompile(k)
newName := re.ReplaceAllString(n, v)
if newName != n {
return newName, nil
}
}
return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
}

func WriteGGUF(name string, tensors []llm.Tensor, params *Params, vocab *Vocab) (string, error) {
	c := llm.ContainerGGUF{
		ByteOrder: binary.LittleEndian,
	}

	m := llm.NewGGUFModel(&c)
	m.Tensors = tensors
	m.KV["general.architecture"] = "llama"
	m.KV["general.name"] = name
	m.KV["llama.context_length"] = uint32(params.ContextSize)
	m.KV["llama.embedding_length"] = uint32(params.HiddenSize)
	m.KV["llama.block_count"] = uint32(params.HiddenLayers)
	m.KV["llama.feed_forward_length"] = uint32(params.IntermediateSize)
	m.KV["llama.rope.dimension_count"] = uint32(128)
	m.KV["llama.attention.head_count"] = uint32(params.AttentionHeads)
	m.KV["llama.attention.head_count_kv"] = uint32(params.KeyValHeads)
	m.KV["llama.attention.layer_norm_rms_epsilon"] = float32(params.NormEPS)
	m.KV["llama.rope.freq_base"] = float32(params.RopeFreqBase)
	m.KV["general.file_type"] = uint32(1)
	m.KV["tokenizer.ggml.model"] = "llama"

	m.KV["tokenizer.ggml.tokens"] = vocab.Tokens
	m.KV["tokenizer.ggml.scores"] = vocab.Scores
	m.KV["tokenizer.ggml.token_type"] = vocab.Types

	m.KV["tokenizer.ggml.bos_token_id"] = uint32(params.BoSTokenID)
	m.KV["tokenizer.ggml.eos_token_id"] = uint32(params.EoSTokenID)
	m.KV["tokenizer.ggml.unknown_token_id"] = uint32(0)
	m.KV["tokenizer.ggml.add_bos_token"] = true
	m.KV["tokenizer.ggml.add_eos_token"] = false

	// llamacpp sets the chat template, however we don't need to set it since we pass it in through a layer
	// m.KV["tokenizer.chat_template"] = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" // XXX removeme

	c.V3.NumTensor = uint64(len(tensors))
	c.V3.NumKV = uint64(len(m.KV))

	f, err := os.CreateTemp("", "ollama-gguf")
	if err != nil {
		return "", err
	}
	defer f.Close()

	err = m.Encode(f)
	if err != nil {
		return "", err
	}

	return f.Name(), nil
}
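
As a quick illustration of the renaming scheme above, here is a small, self-contained Go sketch (not part of the diff) that applies the same capture-and-replace idea used by `GetTensorName` to a couple of the patterns from the map. The dots are escaped here for strictness, which the map keys above skip.

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// A subset of the rename rules: safetensors-style names on the left,
	// GGUF names on the right, with the layer index captured as $1.
	rules := map[string]string{
		`model\.layers\.(\d+)\.self_attn\.q_proj\.weight`: "blk.$1.attn_q.weight",
		`model\.layers\.(\d+)\.mlp\.gate_proj\.weight`:    "blk.$1.ffn_gate.weight",
	}

	name := "model.layers.17.self_attn.q_proj.weight"
	for pattern, repl := range rules {
		re := regexp.MustCompile(pattern)
		if renamed := re.ReplaceAllString(name, repl); renamed != name {
			fmt.Println(renamed) // blk.17.attn_q.weight
			break
		}
	}
}
```

The map-plus-regex approach keeps the conversion table declarative; supporting another tensor only means adding one entry.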

View File

File diff suppressed because it is too large

View File

@@ -1,333 +0,0 @@
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.!
syntax = "proto2";
// TODO(taku): Needs to use LITE RUNTIME in OSS release.
option optimize_for = LITE_RUNTIME;
option go_package = "./sentencepiece";
package sentencepiece;
// TrainerSpec encodes a various parameters for SentencePiece training.
// Next id: 55
message TrainerSpec {
///////////////////////////////////////////////////////////////////
// General parameters
//
// Input corpus files.
// Trainer accepts the following two formats:
// A) Monolingual: plain text, one sentence per line.
// B) Bilingual: TSV, source sentence <tab> target sentence
// When bilingual data is passed, shared vocabulary model is built.
// Note that the input file must be raw corpus, not a preprocessed corpus.
// Trainer only loads the first `input_sentence_size` sentences specified
// with this parameter.
repeated string input = 1;
// Input corpus format:
// "text": one-sentence-per-line text format (default)
// "tsv": sentence <tab> freq
optional string input_format = 7;
// Output model file prefix.
// <model_prefix>.model and <model_prefix>.vocab are generated.
optional string model_prefix = 2;
// Model type. only have UNIGRAM now.
enum ModelType {
UNIGRAM = 1; // Unigram language model with dynamic algorithm
BPE = 2; // Byte Pair Encoding
WORD = 3; // Delimitered by whitespace.
CHAR = 4; // tokenizes into character sequence
}
optional ModelType model_type = 3 [default = UNIGRAM];
// Vocabulary size. 8k is the default size.
optional int32 vocab_size = 4 [default = 8000];
// List of the languages this model can accept.
// Since the model is language-agnostic, this field is used as a reference.
repeated string accept_language = 5;
// Size of self-test samples, which are encoded in the model file.
optional int32 self_test_sample_size = 6 [default = 0];
// Whether to use DP version of sentencepiece. Use it with TSV input format
// (requires precomputed word tab counts to work).
optional bool enable_differential_privacy = 50 [default = false];
// Set these parameters if you need DP version of sentencepiece.
// std of noise to add.
optional float differential_privacy_noise_level = 51 [default = 0.0];
// Clipping threshold to apply after adding noise. All the words with
// frequency less than this value are dropped.
optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
///////////////////////////////////////////////////////////////////
// Training parameters.
//
// Uses characters which cover the corpus with the ratio of `chars_coverage`.
// This parameter determines the set of basic Alphabet of sentence piece.
// 1.0 - `chars_coverage` characters are treated as UNK.
// See also required_chars field.
optional float character_coverage = 10 [default = 0.9995];
// Maximum size of sentences the trainer loads from `input` parameter.
// Trainer simply loads the `input` files in sequence.
// It is better to shuffle the input corpus randomly.
optional uint64 input_sentence_size = 11 [default = 0];
optional bool shuffle_input_sentence = 19 [default = true];
// Maximum size of sentences to make seed sentence pieces.
// Extended suffix array is constructed to extract frequent
// sub-strings from the corpus. This uses 20N working space,
// where N is the size of corpus.
optional int32 mining_sentence_size = 12 [deprecated = true];
// Maximum size of sentences to train sentence pieces.
optional int32 training_sentence_size = 13 [deprecated = true];
// The size of seed sentencepieces.
// `seed_sentencepiece_size` must be larger than `vocab_size`.
optional int32 seed_sentencepiece_size = 14 [default = 1000000];
// In every EM sub-iterations, keeps top
// `shrinking_factor` * `current sentencepieces size` with respect to
// the loss of the sentence piece. This value should be smaller than 1.0.
optional float shrinking_factor = 15 [default = 0.75];
// The maximum sentence length in byte. The sentences with the length
// larger than `max_sentence_length` is simply ignored.
// Longer input tends to bring the following risks:
// * Overflow during EM training (unigram language model only)
// * Performance drop because of O(n log n) cost in BPE.
optional int32 max_sentence_length = 18 [default = 4192];
// Number of threads in the training.
optional int32 num_threads = 16 [default = 16];
// Number of EM sub iterations.
optional int32 num_sub_iterations = 17 [default = 2];
///////////////////////////////////////////////////////////////////
// SentencePiece parameters which control the shapes of sentence piece.
//
// Maximum length of sentencepiece.
optional int32 max_sentencepiece_length = 20 [default = 16];
// Uses Unicode script to split sentence pieces.
// When `split_by_unicode_script` is true, we do not allow sentence piece to
// include multiple Unicode scripts, e.g. "F1" is not a valid piece.
// Exception: CJ characters (Hiragana/Katakana/Han) are all handled
// as one script type, since Japanese word can consist of multiple scripts.
// This exception is always applied regardless of the accept-language
// parameter.
optional bool split_by_unicode_script = 21 [default = true];
// When `split_by_number` is true, put a boundary between number and
// non-number transition. If we want to treat "F1" is one token, set this flag
// to be false.
optional bool split_by_number = 23 [default = true];
// Use a white space to split sentence pieces.
// When `split_by_whitespace` is false, we may have the piece containing
// a white space in the middle. e.g., "in_the".
optional bool split_by_whitespace = 22 [default = true];
// Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello =>
// hello_. When `treat_whitespace_as_suffix` is true,
// NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
// of sentence.
optional bool treat_whitespace_as_suffix = 24 [default = false];
// Allows pieces that only contain whitespaces instead of appearing only as
// prefix or suffix of other pieces.
optional bool allow_whitespace_only_pieces = 26 [default = false];
// Split all digits (0-9) into separate pieces.
optional bool split_digits = 25 [default = false];
// Defines the pre-tokenization delimiter.
// When specified, no pieces crossing this delimiter is not included
// in the vocab. Then the delimiter string is virtually ignored
// during the training. This field can allows constraints on the vocabulary
// selection. Note that this field is available on unigram mode.
optional string pretokenization_delimiter = 53 [ default = ""];
///////////////////////////////////////////////////////////////////
// Vocabulary management
//
// Defines control symbols used as an indicator to
// change the behavior of the decoder. <s> and </s> are pre-defined.
// We can use this field to encode various meta information,
// including language indicator in multilingual model.
// These symbols are not visible to users, but visible to
// the decoder. Note that when the input sentence contains control symbols,
// they are not treated as one token, but segmented into normal pieces.
// Control symbols must be inserted independently from the segmentation.
repeated string control_symbols = 30;
// Defines user defined symbols.
// These symbols are added with extremely high score
// so they are always treated as one unique symbol in any context.
// Typical usage of user_defined_symbols is placeholder for named entities.
repeated string user_defined_symbols = 31;
// Defines required characters. Each UTF8 character in this string is included
// in the character set regardless of character_coverage value. Unlike
// user_defined_symbols, these characters have scores based on the frequency
// on input sentences, and the model can form subwords using characters
// in this field.
optional string required_chars = 36;
// Decomposes unknown pieces into UTF-8 bytes.
optional bool byte_fallback = 35 [default = false];
// When creating the vocabulary file, defines whether or not to additionally
// output the score for each piece.
optional bool vocabulary_output_piece_score = 32 [default = true];
// `vocab_size` is treated as hard limit. Crash if
// the model can not produce the vocab of size `vocab_size`,
// When `hard_vocab_limit` is false, vocab_size is treated
// as soft limit. Note that when model_type=char,
// always assumes hard_vocab_limit = false.
optional bool hard_vocab_limit = 33 [default = true];
// use all symbols for vocab extraction. This flag is valid
// if model type is either CHAR or WORD
optional bool use_all_vocab = 34 [default = false];
///////////////////////////////////////////////////////////////////
// Reserved special meta tokens.
// * -1 is not used.
// * unk_id must not be -1.
// Id must starts with 0 and be contigous.
optional int32 unk_id = 40 [default = 0]; // <unk>
optional int32 bos_id = 41 [default = 1]; // <s>
optional int32 eos_id = 42 [default = 2]; // </s>
optional int32 pad_id = 43 [default = -1]; // <pad> (padding)
optional string unk_piece = 45 [default = "<unk>"];
optional string bos_piece = 46 [default = "<s>"];
optional string eos_piece = 47 [default = "</s>"];
optional string pad_piece = 48 [default = "<pad>"];
// Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
// since this character can be useful both for user and
// developer. We can easily figure out that <unk> is emitted.
optional string unk_surface = 44 [default = " \xE2\x81\x87 "];
// Increase bit depth to allow unigram model training on large
// (>10M sentences) corpora. A Side-effect of enabling this flag
// is increased memory usage.
optional bool train_extremely_large_corpus = 49 [default = false];
// Path to a seed sentencepieces file, with one tab-separated
// seed sentencepiece <tab> frequency per line.
optional string seed_sentencepieces_file = 54 [default = ""];
// Customized extensions: the range of field numbers
// are open to third-party extensions.
extensions 200 to max;
}
// NormalizerSpec encodes a various parameters for string normalizaiton
message NormalizerSpec {
// name of normalization rule.
optional string name = 1;
// Pre-compiled normalization rule created by
// Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
// Usually this field is set by Builder::GetNormalizerSpec() method.
optional bytes precompiled_charsmap = 2;
// Adds dummy whitespace at the beginning of text in order to
// treat "world" in "world" and "hello world" in the same way.
optional bool add_dummy_prefix = 3 [default = true];
// Removes leading, trailing, and duplicate internal whitespace.
optional bool remove_extra_whitespaces = 4 [default = true];
// Replaces whitespace with meta symbol.
// This field must be true to train sentence piece model.
optional bool escape_whitespaces = 5 [default = true];
// Custom normalization rule file in TSV format.
// https://github.com/google/sentencepiece/blob/master/doc/normalization.md
// This field is only used in SentencePieceTrainer::Train() method, which
// compiles the rule into the binary rule stored in `precompiled_charsmap`.
optional string normalization_rule_tsv = 6;
// Customized extensions: the range of field numbers
// are open to third-party extensions.
extensions 200 to max;
}
// Proto to store samples for self-testing.
message SelfTestData {
message Sample {
optional string input = 1;
optional string expected = 2;
}
repeated Sample samples = 1;
// Customized extensions: the range of field numbers
// are open to third-party extensions.
extensions 200 to max;
}
// ModelProto stores model parameters.
// SentencePieceProcessor is supposed to be self-contained.
// All settings/parameters which may change the behavior must be encoded
// in ModelProto.
message ModelProto {
message SentencePiece {
enum Type {
NORMAL = 1; // normal symbol
UNKNOWN = 2; // unknown symbol. only <unk> for now.
CONTROL = 3; // control symbols. </s>, <s>, <2ja> etc.
USER_DEFINED = 4; // user defined symbols.
// Typical usage of USER_DEFINED symbol
// is placeholder.
BYTE = 6; // byte symbols. Used when `byte_fallback` is true.
UNUSED = 5; // this piece is not used.
}
optional string piece = 1; // piece must not be empty.
optional float score = 2;
optional Type type = 3 [default = NORMAL];
// Customized extensions: the range of field numbers
// are open to third-party extensions.
extensions 200 to max;
}
// Sentence pieces with scores.
repeated SentencePiece pieces = 1;
// Spec used to generate this model file.
optional TrainerSpec trainer_spec = 2;
// Spec for text normalization.
optional NormalizerSpec normalizer_spec = 3;
// Stores sample input and its expected segmentation to verify the model.
optional SelfTestData self_test_data = 4;
// Spec for text de-normalization.
optional NormalizerSpec denormalizer_spec = 5;
// Customized extensions: the range of field numbers
// are open to third-party extensions.
extensions 200 to max;
}

View File

@@ -1,21 +1,25 @@
# Documentation
### Getting Started
* [Quickstart](../README.md#quickstart)
* [Examples](../examples)
* [Importing models](./import.md)
* [Linux Documentation](./linux.md)
* [Windows Documentation](./windows.md)
* [Docker Documentation](https://hub.docker.com/r/ollama/ollama)
To get started, see the project's **[quickstart](../README.md#quickstart)**.
### Reference
Ollama is a tool for running AI models on your hardware. Many users will choose to use the Command Line Interface (CLI) to work with Ollama. Learn more about all the commands in the CLI in the **[Main Readme](../README.md)**.
* [API Reference](./api.md)
* [Modelfile Reference](./modelfile.md)
* [OpenAI Compatibility](./openai.md)
Use the RESTful API from any language, including Python, JavaScript, TypeScript, Go, Rust, and many more. Learn more about using the API in the **[API Documentation](./api.md)**.
### Resources
Create new models or modify models already in the library using the Modelfile. Learn more about the Modelfile syntax in the **[Modelfile Documentation](./modelfile.md)**.
* [Troubleshooting Guide](./troubleshooting.md)
* [FAQ](./faq.md)
* [Development guide](./development.md)
Import models using source model weights found on Hugging Face and similar sites by referring to the **[Import Documentation](./import.md)**.
Installing on Linux in most cases is easy using the script on [ollama.com/download](https://ollama.com/download). To get more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.
Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](https://hub.docker.com/r/ollama/ollama)**.
It is easy to install on Linux and Mac, but many users will choose to build Ollama on their own. To do this, refer to the **[Development Documentation](./development.md)**.
If you encounter a problem with Ollama, the best place to start is the logs. Find more information about them in the **[Troubleshooting Guide](./troubleshooting.md)**.
Finally, for all the questions that don't fit anywhere else, there is the **[FAQ](./faq.md)**.
[Tutorials](./tutorials.md) apply the documentation to tasks.
For working code examples of using Ollama, see [Examples](../examples).

View File

@@ -54,7 +54,7 @@ Advanced parameters (optional):
#### JSON mode
Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.
Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#generate-request-json-mode) below.
> Note: it's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts of whitespace.
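
For reference, here is a minimal Go sketch (not part of the documented API changes) of a JSON-mode request against the generate endpoint. The model name `mistral` and the default `localhost:11434` address are assumptions; any locally available model works.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Ask for JSON output and disable streaming so the reply is one object.
	reqBody, _ := json.Marshal(map[string]any{
		"model":  "mistral",
		"prompt": "List three primary colors. Respond using JSON.",
		"format": "json",
		"stream": false,
	})

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(reqBody))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out struct {
		Response string `json:"response"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.Response) // the model's reply, itself a JSON string
}
```

With `"stream": false` the server returns a single JSON object, so the reply can be decoded in one step instead of being read as a stream of chunks.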
@@ -256,9 +256,9 @@ For reproducible outputs, set `temperature` to 0 and `seed` to a number:
```shell
curl http://localhost:11434/api/generate -d '{
"model": "mistral",
"prompt": "Why is the sky blue?",
"prompt": "[INST] why is the sky blue? [/INST]",
"options": {
"seed": 123,
"seed": 101,
"temperature": 0
}
}'
@@ -1024,7 +1024,7 @@ Advanced parameters:
```shell
curl http://localhost:11434/api/embeddings -d '{
"model": "all-minilm",
"model": "llama2",
"prompt": "Here is an article about llamas..."
}'
```

View File

@@ -3,7 +3,7 @@
Install required tools:
- cmake version 3.24 or higher
- go version 1.22 or higher
- go version 1.21 or higher
- gcc version 11.4.0 or higher
```bash
@@ -42,15 +42,15 @@ Now you can run `ollama`:
#### Linux CUDA (NVIDIA)
_Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
development and runtime packages.
development and runtime packages.
Typically the build scripts will auto-detect CUDA, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
the set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
Then generate dependencies:
@@ -67,15 +67,15 @@ go build .
#### Linux ROCm (AMD)
_Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `cmake` and `golang`.
Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.
Typically the build scripts will auto-detect ROCm, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `ROCM_PATH` to the location of the ROCm
install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
```
@@ -88,17 +88,17 @@ Then build the binary:
go build .
```
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
#### Advanced CPU Settings
By default, running `go generate ./...` will compile a few different variations
of the LLM library based on common CPU families and vector math capabilities,
including a lowest-common-denominator which should run on almost any 64 bit CPU
somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
load. If you would like to build a CPU-based build customized for your
somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
load. If you would like to build a CPU-based build customized for your
processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
like to use. For example, to compile an optimized binary for an Intel i9-9880H,
like to use. For example, to compile an optimized binary for an Intel i9-9880H,
you might use:
```
@@ -108,7 +108,8 @@ go build .
#### Containerized Linux Build
If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
### Windows
@@ -117,29 +118,21 @@ Note: The windows build for Ollama is still under development.
Install required tools:
- MSVC toolchain - C/C++ and cmake as minimal requirements
- Go version 1.22 or higher
- go version 1.21 or higher
- MinGW (pick one variant) with GCC.
- [MinGW-w64](https://www.mingw-w64.org/)
- [MSYS2](https://www.msys2.org/)
- <https://www.mingw-w64.org/>
- <https://www.msys2.org/>
```powershell
$env:CGO_ENABLED="1"
go generate ./...
go build .
```
#### Windows CUDA (NVIDIA)
In addition to the common Windows development tools described above, install CUDA after installing MSVC.
In addition to the common Windows development tools described above, install:
- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
#### Windows ROCm (AMD Radeon)
In addition to the common Windows development tools described above, install AMDs HIP package after installing MSVC.
- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
- [Strawberry Perl](https://strawberryperl.com/)
Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).

View File

@@ -14,10 +14,6 @@ curl -fsSL https://ollama.com/install.sh | sh
Review the [Troubleshooting](./troubleshooting.md) docs for more about using logs.
## Is my GPU compatible with Ollama?
Please refer to the [GPU docs](./gpu.md).
## How can I specify the context window size?
By default, Ollama uses a context window size of 2048 tokens.
@@ -99,37 +95,6 @@ Ollama binds 127.0.0.1 port 11434 by default. Change the bind address with the `
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
## How can I use Ollama with a proxy server?
Ollama runs an HTTP server and can be exposed using a proxy server such as Nginx. To do so, configure the proxy to forward requests and optionally set required headers (if not exposing Ollama on the network). For example, with Nginx:
```
server {
listen 80;
server_name example.com; # Replace with your domain or IP
location / {
proxy_pass http://localhost:11434;
proxy_set_header Host localhost:11434;
}
}
```
## How can I use Ollama with ngrok?
Ollama can be accessed through a range of tunneling tools. For example, with ngrok:
```
ngrok http 11434 --host-header="localhost:11434"
```
## How can I use Ollama with Cloudflare Tunnel?
To use Ollama with Cloudflare Tunnel, use the `--url` and `--http-host-header` flags:
```
cloudflared tunnel --url http://localhost:11434 --http-host-header="localhost:11434"
```
## How can I allow additional web origins to access Ollama?
Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0` by default. Additional origins can be configured with `OLLAMA_ORIGINS`.
@@ -154,7 +119,7 @@ No. Ollama runs locally, and conversation data does not leave your machine.
## How can I use Ollama in Visual Studio Code?
There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/ollama/ollama#extensions--plugins) at the bottom of the main repository readme.
There is already a large collection of plugins available for VSCode as well as other editors that leverage Ollama. See the list of [extensions & plugins](https://github.com/jmorganca/ollama#extensions--plugins) at the bottom of the main repository readme.
## How do I use Ollama behind a proxy?

View File

@@ -1,102 +0,0 @@
# GPU
## Nvidia
Ollama supports Nvidia GPUs with compute capability 5.0+.
Check your compute compatibility to see if your card is supported:
[https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
| Compute Capability | Family | Cards |
| ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
| 9.0 | NVIDIA | `H100` |
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080` `RTX 4070 Ti` `RTX 4060 Ti` |
| | NVIDIA Professional | `L4` `L40` `RTX 6000` |
| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` |
| | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2` |
| 8.0 | NVIDIA | `A100` `A30` |
| 7.5 | GeForce GTX/RTX | `GTX 1650 Ti` `TITAN RTX` `RTX 2080 Ti` `RTX 2080` `RTX 2070` `RTX 2060` |
| | NVIDIA Professional | `T4` `RTX 5000` `RTX 4000` `RTX 3000` `T2000` `T1200` `T1000` `T600` `T500` |
| | Quadro | `RTX 8000` `RTX 6000` `RTX 5000` `RTX 4000` |
| 7.0 | NVIDIA | `TITAN V` `V100` `Quadro GV100` |
| 6.1 | NVIDIA TITAN | `TITAN Xp` `TITAN X` |
| | GeForce GTX | `GTX 1080 Ti` `GTX 1080` `GTX 1070 Ti` `GTX 1070` `GTX 1060` `GTX 1050` |
| | Quadro | `P6000` `P5200` `P4200` `P3200` `P5000` `P4000` `P3000` `P2200` `P2000` `P1000` `P620` `P600` `P500` `P520` |
| | Tesla | `P40` `P4` |
| 6.0 | NVIDIA | `Tesla P100` `Quadro GP100` |
| 5.2 | GeForce GTX | `GTX TITAN X` `GTX 980 Ti` `GTX 980` `GTX 970` `GTX 960` `GTX 950` |
| | Quadro | `M6000 24GB` `M6000` `M5000` `M5500M` `M4000` `M2200` `M2000` `M620` |
| | Tesla | `M60` `M40` |
| 5.0 | GeForce GTX | `GTX 750 Ti` `GTX 750` `NVS 810` |
| | Quadro | `K2200` `K1200` `K620` `M1200` `M520` `M5000M` `M4000M` `M3000M` `M2000M` `M1000M` `K620M` `M600M` `M500M` |
### GPU Selection
If you have multiple NVIDIA GPUs in your system and want to limit Ollama to use
a subset, you can set `CUDA_VISIBLE_DEVICES` to a comma separated list of GPUs.
Numeric IDs may be used, however ordering may vary, so UUIDs are more reliable.
You can discover the UUID of your GPUs by running `nvidia-smi -L`. If you want to
ignore the GPUs and force CPU usage, use an invalid GPU ID (e.g., "-1")
### Laptop Suspend Resume
On Linux, after a suspend/resume cycle, sometimes Ollama will fail to discover
your NVIDIA GPU, and fall back to running on the CPU. You can work around this
driver bug by reloading the NVIDIA UVM driver with `sudo rmmod nvidia_uvm &&
sudo modprobe nvidia_uvm`
## AMD Radeon
Ollama supports the following AMD GPUs:
| Family | Cards and accelerators |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56` |
| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `VII` `SSG` |
| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50` |
### Overrides
Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
some cases you can force the system to try to use a similar LLVM target that is
close. For example, the Radeon RX 5400 is `gfx1034` (also known as 10.3.4);
however, ROCm does not currently support this target. The closest support is
`gfx1030`. You can use the environment variable `HSA_OVERRIDE_GFX_VERSION` with
`x.y.z` syntax. So for example, to force the system to run on the RX 5400, you
would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the
server. If you have an unsupported AMD GPU you can experiment using the list of
supported types below.
At this time, the known supported GPU types are the following LLVM Targets.
This table shows some example GPUs that map to these LLVM targets:
| **LLVM Target** | **An Example GPU** |
|-----------------|---------------------|
| gfx900 | Radeon RX Vega 56 |
| gfx906 | Radeon Instinct MI50 |
| gfx908 | Radeon Instinct MI100 |
| gfx90a | Radeon Instinct MI210 |
| gfx940 | Radeon Instinct MI300 |
| gfx941 | |
| gfx942 | |
| gfx1030 | Radeon PRO V620 |
| gfx1100 | Radeon PRO W7900 |
| gfx1101 | Radeon PRO W7700 |
| gfx1102 | Radeon RX 7600 |
AMD is working on enhancing ROCm v6 to broaden support for additional families of GPUs in a
future release.
Reach out on [Discord](https://discord.gg/ollama) or file an
[issue](https://github.com/ollama/ollama/issues) for additional help.
### GPU Selection
If you have multiple AMD GPUs in your system and want to limit Ollama to use a
subset, you can set `HIP_VISIBLE_DEVICES` to a comma separated list of GPUs.
You can see the list of devices with `rocminfo`. If you want to ignore the GPUs
and force CPU usage, use an invalid GPU ID (e.g., "-1")
### Container Permission
In some Linux distributions, SELinux can prevent containers from
accessing the AMD GPU devices. On the host system you can run
`sudo setsebool container_use_devices=1` to allow containers to use devices.
### Metal (Apple GPUs)
Ollama supports GPU acceleration on Apple devices via the Metal API.

View File

@@ -10,14 +10,6 @@ Install Ollama running this one-liner:
curl -fsSL https://ollama.com/install.sh | sh
```
## AMD Radeon GPU support
While AMD has contributed the `amdgpu` driver upstream to the official linux
kernel source, the version is older and may not support all ROCm features. We
recommend you install the latest driver from
https://www.amd.com/en/support/linux-drivers for best support of your Radeon
GPU.
## Manual install
### Download the `ollama` binary
@@ -72,11 +64,6 @@ Verify that the drivers are installed by running the following command, which sh
nvidia-smi
```
### Install ROCm (optional - for Radeon GPUs)
[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html)
Make sure to install ROCm v6
### Start Ollama
Start Ollama using `systemd`:

View File

@@ -113,7 +113,7 @@ FROM llama2
```
A list of available base models:
<https://github.com/ollama/ollama#model-library>
<https://github.com/jmorganca/ollama#model-library>
#### Build from a `bin` file
@@ -131,7 +131,7 @@ The `PARAMETER` instruction defines a parameter that can be set when the model i
PARAMETER <parameter> <parametervalue>
```
#### Valid Parameters and Values
### Valid Parameters and Values
| Parameter | Description | Value Type | Example Usage |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
@@ -183,7 +183,7 @@ SYSTEM """<system message>"""
### ADAPTER
The `ADAPTER` instruction is an optional instruction that specifies any LoRA adapter that should apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
```modelfile
ADAPTER ./ollama-lora.bin
@@ -201,22 +201,7 @@ LICENSE """
### MESSAGE
The `MESSAGE` instruction allows you to specify a message history for the model to use when responding. Use multiple iterations of the MESSAGE command to build up a conversation which will guide the model to answer in a similar way.
```modelfile
MESSAGE <role> <message>
```
#### Valid roles
| Role | Description |
| --------- | ------------------------------------------------------------ |
| system | Alternate way of providing the SYSTEM message for the model. |
| user | An example message of what the user could have asked. |
| assistant | An example message of how the model should respond. |
#### Example conversation
The `MESSAGE` instruction allows you to specify a message history for the model to use when responding:
```modelfile
MESSAGE user Is Toronto in Canada?
@@ -227,7 +212,6 @@ MESSAGE user Is Ontario in Canada?
MESSAGE assistant yes
```
## Notes
- the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish it from arguments.

View File

@@ -1,6 +1,6 @@
# OpenAI compatibility
> **Note:** OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/ollama/ollama/blob/main/docs/api.md).
> **Note:** OpenAI compatibility is experimental and is subject to major adjustments including breaking changes. For fully-featured access to the Ollama API, see the Ollama [Python library](https://github.com/ollama/ollama-python), [JavaScript library](https://github.com/ollama/ollama-js) and [REST API](https://github.com/jmorganca/ollama/blob/main/docs/api.md).
Ollama provides experimental compatibility with parts of the [OpenAI API](https://platform.openai.com/docs/api-reference) to help connect existing applications to Ollama.
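
As a rough illustration, here is a minimal Go sketch (not taken from the docs) that calls the OpenAI-compatible chat completions endpoint with plain `net/http`. The model name `llama2` and the default `localhost:11434` address are assumptions.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	body, _ := json.Marshal(map[string]any{
		"model": "llama2",
		"messages": []map[string]string{
			{"role": "user", "content": "Why is the sky blue?"},
		},
	})

	resp, err := http.Post("http://localhost:11434/v1/chat/completions", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Decode only the fields we need from the OpenAI-style response.
	var out struct {
		Choices []struct {
			Message struct {
				Content string `json:"content"`
			} `json:"message"`
		} `json:"choices"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	if len(out.Choices) > 0 {
		fmt.Println(out.Choices[0].Message.Content)
	}
}
```

Existing OpenAI client libraries can generally be pointed at the same base URL instead of hand-rolling the request.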

View File

@@ -67,12 +67,6 @@ You can see what features your CPU has with the following.
cat /proc/cpuinfo | grep flags | head -1
```
## Installing older or pre-release versions on Linux
## Known issues
If you run into problems on Linux and want to install an older version, or you'd
like to try out a pre-release before it's officially released, you can tell the
install script which version to install.
```sh
curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
```
* N/A

View File

@@ -4,7 +4,7 @@ Welcome to the Ollama Windows preview.
No more WSL required!
Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
Ollama now runs as a native Windows application, including NVIDIA GPU support.
After installing Ollama Windows Preview, Ollama will run in the background and
the `ollama` command line is available in `cmd`, `powershell` or your favorite
terminal application. As usual the Ollama [api](./api.md) will be served on
@@ -21,7 +21,6 @@ Logs will often be helpful in diagnosing the problem (see
* Windows 10 or newer, Home or Pro
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
## API Access

View File

@@ -1,6 +1,6 @@
# PrivateGPT with Llama 2 uncensored
https://github.com/ollama/ollama/assets/3325447/20cf8ec6-ff25-42c6-bdd8-9be594e3ce1b
https://github.com/jmorganca/ollama/assets/3325447/20cf8ec6-ff25-42c6-bdd8-9be594e3ce1b
> Note: this example is a slightly modified version of PrivateGPT using models such as Llama 2 Uncensored. All credit for PrivateGPT goes to Iván Martínez who is the creator of it, and you can find his GitHub repo [here](https://github.com/imartinez/privateGPT).

View File

@@ -28,7 +28,7 @@ You are Mario from Super Mario Bros, acting as an assistant.
What if you want to change its behaviour?
- Try changing the prompt
- Try changing the parameters [Docs](https://github.com/ollama/ollama/blob/main/docs/modelfile.md)
- Try changing the parameters [Docs](https://github.com/jmorganca/ollama/blob/main/docs/modelfile.md)
- Try changing the model (e.g. use an uncensored model with `FROM wizard-vicuna`, the wizard-vicuna uncensored model)
Once the changes are made,

View File

@@ -0,0 +1,23 @@
# Example Modelfile - Tweetwriter
This simple example shows what you can do without any code, simply relying on a Modelfile. The file has two instructions:
1. FROM - The FROM instruction defines the parent model to use for this one. If you choose a model from the library, you can enter just the model name. For all other models, you need to specify the namespace as well. You could also use a local file. Just include the relative path to the converted, quantized model weights file. To learn more about creating that file, see the `import.md` file in the docs folder of this repository.
2. SYSTEM - This defines the system prompt for the model and overrides the system prompt from the parent model.
## Running the Example
1. Create the model:
```bash
ollama create tweetwriter
```
2. Enter a topic to generate a tweet about.
3. Show the Modelfile in the REPL.
```bash
/show modelfile
```
Notice that the FROM and SYSTEM match what was in the file. But there is also a TEMPLATE and PARAMETER. These are inherited from the parent model.

View File

@@ -0,0 +1,21 @@
# Ollama Chat App
Build a Llama2 chat app using Streamlit and Ollama.
## Running the Example
1. Ensure you have the `llama2` model installed:
```bash
ollama pull llama2
```
2. Install the Python Requirements.
```bash
pip install -r requirements.txt
```
3. Run the example:
```bash
python main.py
```

View File

@@ -1,6 +1,6 @@
# JSON Output Example
![llmjson 2023-11-10 15_31_31](https://github.com/ollama/ollama/assets/633681/e599d986-9b4a-4118-81a4-4cfe7e22da25)
![llmjson 2023-11-10 15_31_31](https://github.com/jmorganca/ollama/assets/633681/e599d986-9b4a-4118-81a4-4cfe7e22da25)
There are two python scripts in this example. `randomaddresses.py` generates random addresses from different countries. `predefinedschema.py` sets a template for the model to fill in.

View File

@@ -1,6 +1,6 @@
# Log Analysis example
![loganalyzer 2023-11-10 08_53_29](https://github.com/ollama/ollama/assets/633681/ad30f1fc-321f-4953-8914-e30e24db9921)
![loganalyzer 2023-11-10 08_53_29](https://github.com/jmorganca/ollama/assets/633681/ad30f1fc-321f-4953-8914-e30e24db9921)
This example shows one possible way to create a log file analyzer. It uses the model **mattw/loganalyzer** which is based on **codebooga**, a 34b parameter model.

View File

@@ -1,6 +1,6 @@
# Function calling
![function calling 2023-11-16 16_12_58](https://github.com/ollama/ollama/assets/633681/a0acc247-9746-45ab-b325-b65dfbbee4fb)
![function calling 2023-11-16 16_12_58](https://github.com/jmorganca/ollama/assets/633681/a0acc247-9746-45ab-b325-b65dfbbee4fb)
One of the features added to some models is 'function calling'. It's a bit of a confusing name. It's understandable if you think that means the model can call functions, but that's not what it means. Function calling simply means that the output of the model is formatted in JSON, using a preconfigured schema, and uses the expected types. Then your code can use the output of the model and call functions with it. Using the JSON format in Ollama, you can use any model for function calling.
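
To make that concrete, here is a small, self-contained Go sketch (not part of the example's code) that unmarshals a hypothetical JSON reply from the model and dispatches an ordinary function with it. The sample output string and the `getWeather` helper are invented for illustration.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// weatherCall mirrors the schema the model is prompted to follow.
type weatherCall struct {
	City string `json:"city"`
	Unit string `json:"unit"`
}

func getWeather(city, unit string) string {
	// A real implementation would query a weather service here.
	return fmt.Sprintf("22 degrees (%s) in %s", unit, city)
}

func main() {
	// Pretend this came back from the model after prompting it to reply
	// with JSON using the fields "city" and "unit".
	modelOutput := `{"city": "Toronto", "unit": "celsius"}`

	var call weatherCall
	if err := json.Unmarshal([]byte(modelOutput), &call); err != nil {
		panic(err)
	}
	fmt.Println(getWeather(call.City, call.Unit))
}
```

Because the schema is enforced only by the prompt (and the `format` parameter), validating the unmarshalled struct before acting on it is a sensible extra step.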

go.mod
View File

@@ -1,43 +1,23 @@
module github.com/ollama/ollama
module github.com/jmorganca/ollama
go 1.22
toolchain go1.22.0
go 1.21
require (
github.com/containerd/console v1.0.3
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/emirpasic/gods v1.18.1
github.com/gin-gonic/gin v1.9.1
github.com/golang/protobuf v1.5.0
github.com/google/uuid v1.0.0
github.com/mitchellh/mapstructure v1.5.0
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.8.4
github.com/x448/float16 v0.8.4
golang.org/x/sync v0.3.0
)
require github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9
require (
github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc // indirect
github.com/chewxy/hm v1.0.0 // indirect
github.com/chewxy/math32 v1.0.8 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/flatbuffers v1.12.0 // indirect
github.com/mattn/go-runewidth v0.0.14 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/xtgo/set v1.0.0 // indirect
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
gonum.org/v1/gonum v0.8.2 // indirect
gorgonia.org/vecf32 v0.9.0 // indirect
gorgonia.org/vecf64 v0.9.0 // indirect
)
require (
@@ -58,6 +38,7 @@ require (
github.com/mattn/go-isatty v0.0.19 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58
github.com/pelletier/go-toml/v2 v2.0.8 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
@@ -69,6 +50,6 @@ require (
golang.org/x/sys v0.13.0
golang.org/x/term v0.13.0
golang.org/x/text v0.13.0 // indirect
google.golang.org/protobuf v1.30.0
google.golang.org/protobuf v1.30.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

go.sum
View File

@@ -1,38 +1,18 @@
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw=
github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc h1:zvQ6w7KwtQWgMQiewOF9tFtundRMVZFSAksNV6ogzuY=
github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc/go.mod h1:c9sxoIT3YgLxH4UhLOCKaBlEojuMhVYpk4Ntv3opUTQ=
github.com/bytedance/sonic v1.5.0/go.mod h1:ED5hyg4y6t3/9Ku1R6dU/4KyJ48DZ4jPhfY1O2AihPM=
github.com/bytedance/sonic v1.9.1 h1:6iJ6NqdoxCDr6mbY8h18oSO+cShGSMRGCEo7F2h0x8s=
github.com/bytedance/sonic v1.9.1/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZXU064P/U=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k=
github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0=
github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0=
github.com/chewxy/math32 v1.0.8 h1:fU5E4Ec4Z+5RtRAi3TovSxUjQPkgRh+HbP7tKB2OFbM=
github.com/chewxy/math32 v1.0.8/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw=
github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 h1:cBzrdJPAFBsgCrDPnZxlp1dF2+k4r1kVpD7+1S1PVjY=
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1/go.mod h1:uw2gLcxEuYUlAd/EXyjc/v55nd3+47YAgWbSXVxPrNI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k=
github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
github.com/gin-contrib/cors v1.4.0 h1:oJ6gwtUl3lqV0WEIwM/LxPF1QZ5qe2lGWdY2+bz7y0g=
@@ -57,31 +37,7 @@ github.com/go-playground/validator/v10 v10.14.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QX
github.com/goccy/go-json v0.9.7/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw=
github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/golang/protobuf v1.5.0 h1:LUVKkCeviFUMKqHa4tXIIij/lbhnMbP7Fn5wKdKkRh4=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/flatbuffers v1.12.0 h1:/PtAHvnBY4Kqnx/xCQ3OIV9uYcSFGScBsWI3Oogeh6w=
github.com/google/flatbuffers v1.12.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
@@ -92,9 +48,6 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZXnvk=
github.com/klauspost/cpuid/v2 v2.2.4/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY=
@@ -115,8 +68,6 @@ github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D
github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -124,17 +75,14 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9 h1:DV4iXjNn6fGeDl1AkZ1I0QB/0DBjrc7kPpxHrmuDzW4=
github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9/go.mod h1:nR7l3gM6ubiOm+mCkmmUyIBUcBAyiUmW6dQrDZhugFE=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
@@ -148,8 +96,6 @@ github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.1.4/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.2.0/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
@@ -166,61 +112,19 @@ github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6
github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY=
github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU=
github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 h1:lGdhQUN/cnWdSH3291CUuxSEqc+AsGTiDxPP3r2J0l4=
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6/go.mod h1:FftLjUGFEDu5k8lt0ddY+HcrH/qU/0qk+H8j9/nTl3E=
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/arch v0.3.0 h1:02VY4/ZcO/gBOH6PUaoiptASxtXU10jazRCP865E97k=
golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc=
golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200904194848-62affa334b73/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200909081042-eff7692f9009/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
@@ -233,56 +137,12 @@ golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek=
golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo=
gonum.org/v1/gonum v0.8.2 h1:CCXrcPKiGGotvnN6jfUsKk4rRqm7q09/YbKb5xCEvtM=
gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0=
gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc=
gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f h1:Yv4xsIx7HZOoyUGSJ2ksDyWE2qIBXROsZKt2ny3hCGM=
google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no=
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
google.golang.org/grpc v1.32.0 h1:zWTV+LMdc3kaiJMSTOFz2UgSBgx8RNQoTGiZu3fR9S0=
google.golang.org/grpc v1.32.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak=
google.golang.org/grpc/cmd/protoc-gen-go-grpc v0.0.0-20200910201057-6591123024b3/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
@@ -297,10 +157,4 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gorgonia.org/vecf32 v0.9.0 h1:PClazic1r+JVJ1dEzRXgeiVl4g1/Hf/w+wUSqnco1Xg=
gorgonia.org/vecf32 v0.9.0/go.mod h1:NCc+5D2oxddRL11hd+pCB1PEyXWOyiQxfZ/1wwhOXCA=
gorgonia.org/vecf64 v0.9.0 h1:bgZDP5x0OzBF64PjMGC3EvTdOoMEcmfAh1VCUnZFm1A=
gorgonia.org/vecf64 v0.9.0/go.mod h1:hp7IOWCnRiVQKON73kkC/AUMtEXyf9kGlVrtPQ9ccVA=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=

101
gpu/amd.go Normal file
View File

@@ -0,0 +1,101 @@
package gpu
import (
"bufio"
"errors"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
)
// TODO - windows vs. non-windows vs darwin
// Discovery logic for AMD/ROCm GPUs
const (
DriverVersionFile = "/sys/module/amdgpu/version"
GPUPropertiesFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/properties"
// TODO probably break these down per GPU to make the logic simpler
GPUTotalMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/used_memory"
)
func AMDDetected() bool {
// Some driver versions (older?) don't have a version file, so just look up the parent dir
sysfsDir := filepath.Dir(DriverVersionFile)
_, err := os.Stat(sysfsDir)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("amd driver not detected " + sysfsDir)
return false
} else if err != nil {
slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err))
return false
}
return true
}
func AMDDriverVersion() (string, error) {
_, err := os.Stat(DriverVersionFile)
if err != nil {
return "", fmt.Errorf("amdgpu file stat error: %s %w", DriverVersionFile, err)
}
fp, err := os.Open(DriverVersionFile)
if err != nil {
return "", err
}
defer fp.Close()
verString, err := io.ReadAll(fp)
if err != nil {
return "", err
}
return strings.TrimSpace(string(verString)), nil
}
func AMDGFXVersions() []Version {
res := []Version{}
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
for _, match := range matches {
fp, err := os.Open(match)
if err != nil {
slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
continue
}
defer fp.Close()
scanner := bufio.NewScanner(fp)
// optionally, resize the scanner's capacity for lines over 64K
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "gfx_target_version") {
ver := strings.Fields(line)
if len(ver) != 2 || len(ver[1]) < 5 {
slog.Debug("malformed " + line)
continue
}
l := len(ver[1])
patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
if err1 != nil || err2 != nil || err3 != nil {
slog.Debug("malformed int " + line)
continue
}
res = append(res, Version{
Major: uint(major),
Minor: uint(minor),
Patch: uint(patch),
})
}
}
}
return res
}
func (v Version) ToGFXString() string {
return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
}
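
(Not part of the diff: a standalone sketch, for reference, of the decoding rule AMDGFXVersions applies above. The sysfs gfx_target_version value packs major*10000 + minor*100 + patch, so, for example, 100300 decodes to gfx1030 and 90006 to gfx906; the sample values below are illustrative.)

package main

import (
	"fmt"
	"strconv"
)

// decodeGFX splits a gfx_target_version string the same way AMDGFXVersions does:
// the last two digits are the patch, the two before that the minor, the rest the major.
func decodeGFX(raw string) (major, minor, patch uint64, err error) {
	if len(raw) < 5 {
		return 0, 0, 0, fmt.Errorf("malformed gfx_target_version %q", raw)
	}
	l := len(raw)
	if patch, err = strconv.ParseUint(raw[l-2:l], 10, 32); err != nil {
		return
	}
	if minor, err = strconv.ParseUint(raw[l-4:l-2], 10, 32); err != nil {
		return
	}
	major, err = strconv.ParseUint(raw[:l-4], 10, 32)
	return
}

func main() {
	for _, v := range []string{"100300", "90006"} {
		maj, min, pat, _ := decodeGFX(v)
		fmt.Printf("%s => gfx%d%d%d\n", v, maj, min, pat) // gfx1030, gfx906
	}
}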

View File

@@ -1,56 +0,0 @@
//go:build linux || windows
package gpu
import (
"fmt"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
)
// Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
func rocmLibUsable(libDir string) bool {
slog.Debug("evaluating potential rocm lib dir " + libDir)
for _, g := range ROCmLibGlobs {
res, _ := filepath.Glob(filepath.Join(libDir, g))
if len(res) == 0 {
return false
}
}
return true
}
func GetSupportedGFX(libDir string) ([]string, error) {
var ret []string
files, err := filepath.Glob(filepath.Join(libDir, "rocblas", "library", "TensileLibrary_lazy_gfx*.dat"))
if err != nil {
return nil, err
}
for _, file := range files {
ret = append(ret, strings.TrimSuffix(strings.TrimPrefix(filepath.Base(file), "TensileLibrary_lazy_"), ".dat"))
}
return ret, nil
}
func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
// Set the visible devices if not already set
// TODO - does sort order matter?
devices := []string{}
for i := range ids {
if _, skipped := skip[i]; skipped {
continue
}
devices = append(devices, strconv.Itoa(i))
}
val := strings.Join(devices, ",")
err := os.Setenv("HIP_VISIBLE_DEVICES", val)
if err != nil {
slog.Warn(fmt.Sprintf("failed to set env: %s", err))
} else {
slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
}
}
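
(A quick illustration, not from the diff, of the HIP_VISIBLE_DEVICES string amdSetVisibleDevices builds: with four detected devices and index 2 skipped, the variable ends up as "0,1,3". The inputs below are hypothetical.)

package main

import (
	"fmt"
	"strconv"
	"strings"
)

func main() {
	// Hypothetical: four GPUs detected, device 2 flagged as unsupported.
	ids := []int{0, 1, 2, 3}
	skip := map[int]interface{}{2: struct{}{}}

	devices := []string{}
	for i := range ids {
		if _, skipped := skip[i]; skipped {
			continue
		}
		devices = append(devices, strconv.Itoa(i))
	}
	fmt.Println("HIP_VISIBLE_DEVICES=" + strings.Join(devices, ",")) // HIP_VISIBLE_DEVICES=0,1,3
}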

View File

@@ -1,141 +0,0 @@
package gpu
import (
"fmt"
"log/slog"
"strconv"
"syscall"
"unsafe"
"golang.org/x/sys/windows"
)
const (
hipSuccess = 0
hipErrorNoDevice = 100
)
type hipDevicePropMinimal struct {
Name [256]byte
unused1 [140]byte
GcnArchName [256]byte // gfx####
iGPU int // Doesn't seem to actually report correctly
unused2 [128]byte
}
// Wrap the amdhip64.dll library for GPU discovery
type HipLib struct {
dll windows.Handle
hipGetDeviceCount uintptr
hipGetDeviceProperties uintptr
hipMemGetInfo uintptr
hipSetDevice uintptr
hipDriverGetVersion uintptr
}
func NewHipLib() (*HipLib, error) {
h, err := windows.LoadLibrary("amdhip64.dll")
if err != nil {
return nil, fmt.Errorf("unable to load amdhip64.dll: %w", err)
}
hl := &HipLib{}
hl.dll = h
hl.hipGetDeviceCount, err = windows.GetProcAddress(hl.dll, "hipGetDeviceCount")
if err != nil {
return nil, err
}
hl.hipGetDeviceProperties, err = windows.GetProcAddress(hl.dll, "hipGetDeviceProperties")
if err != nil {
return nil, err
}
hl.hipMemGetInfo, err = windows.GetProcAddress(hl.dll, "hipMemGetInfo")
if err != nil {
return nil, err
}
hl.hipSetDevice, err = windows.GetProcAddress(hl.dll, "hipSetDevice")
if err != nil {
return nil, err
}
hl.hipDriverGetVersion, err = windows.GetProcAddress(hl.dll, "hipDriverGetVersion")
if err != nil {
return nil, err
}
return hl, nil
}
// The hip library only evaluates the HIP_VISIBLE_DEVICES variable at startup
// so we have to unload/reset the library after we do our initial discovery
// to make sure our updates to that variable are processed by llama.cpp
func (hl *HipLib) Release() {
err := windows.FreeLibrary(hl.dll)
if err != nil {
slog.Warn(fmt.Sprintf("failed to unload amdhip64.dll: %s", err))
}
hl.dll = 0
}
func (hl *HipLib) AMDDriverVersion() (string, error) {
if hl.dll == 0 {
return "", fmt.Errorf("dll has been unloaded")
}
var version int
status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
if status != hipSuccess {
return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
}
return strconv.Itoa(version), nil
}
func (hl *HipLib) HipGetDeviceCount() int {
if hl.dll == 0 {
slog.Error("dll has been unloaded")
return 0
}
var count int
status, _, err := syscall.SyscallN(hl.hipGetDeviceCount, uintptr(unsafe.Pointer(&count)))
if status == hipErrorNoDevice {
slog.Info("AMD ROCm reports no devices found")
return 0
}
if status != hipSuccess {
slog.Warn(fmt.Sprintf("failed call to hipGetDeviceCount: %d %s", status, err))
}
return count
}
func (hl *HipLib) HipSetDevice(device int) error {
if hl.dll == 0 {
return fmt.Errorf("dll has been unloaded")
}
status, _, err := syscall.SyscallN(hl.hipSetDevice, uintptr(device))
if status != hipSuccess {
return fmt.Errorf("failed call to hipSetDevice: %d %s", status, err)
}
return nil
}
func (hl *HipLib) HipGetDeviceProperties(device int) (*hipDevicePropMinimal, error) {
if hl.dll == 0 {
return nil, fmt.Errorf("dll has been unloaded")
}
var props hipDevicePropMinimal
status, _, err := syscall.SyscallN(hl.hipGetDeviceProperties, uintptr(unsafe.Pointer(&props)), uintptr(device))
if status != hipSuccess {
return nil, fmt.Errorf("failed call to hipGetDeviceProperties: %d %s", status, err)
}
return &props, nil
}
// free, total, err
func (hl *HipLib) HipMemGetInfo() (uint64, uint64, error) {
if hl.dll == 0 {
return 0, 0, fmt.Errorf("dll has been unloaded")
}
var totalMemory uint64
var freeMemory uint64
status, _, err := syscall.SyscallN(hl.hipMemGetInfo, uintptr(unsafe.Pointer(&freeMemory)), uintptr(unsafe.Pointer(&totalMemory)))
if status != hipSuccess {
return 0, 0, fmt.Errorf("failed call to hipMemGetInfo: %d %s", status, err)
}
return freeMemory, totalMemory, nil
}
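
(A caller-side sketch, not part of the diff, of how the HipLib wrapper above is typically driven on Windows: load amdhip64.dll, enumerate devices, read free/total VRAM, then release the library so later HIP_VISIBLE_DEVICES changes take effect. It assumes it sits alongside the package gpu code above; sketchEnumerateHipDevices is a made-up name.)

package gpu

import "fmt"

func sketchEnumerateHipDevices() {
	hl, err := NewHipLib()
	if err != nil {
		return // amdhip64.dll not present: no ROCm runtime on this host
	}
	defer hl.Release()

	count := hl.HipGetDeviceCount()
	for i := 0; i < count; i++ {
		if err := hl.HipSetDevice(i); err != nil {
			continue
		}
		free, total, err := hl.HipMemGetInfo()
		if err != nil {
			continue
		}
		fmt.Printf("hip device %d: %d of %d bytes free\n", i, free, total)
	}
}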

View File

@@ -1,434 +0,0 @@
package gpu
import (
"bufio"
"errors"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"slices"
"strconv"
"strings"
)
// Discovery logic for AMD/ROCm GPUs
const (
DriverVersionFile = "/sys/module/amdgpu/version"
AMDNodesSysfsDir = "/sys/class/kfd/kfd/topology/nodes/"
GPUPropertiesFileGlob = AMDNodesSysfsDir + "*/properties"
// Prefix with the node dir
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
RocmStandardLocation = "/opt/rocm/lib"
// TODO find a better way to detect iGPU instead of minimum memory
IGPUMemLimit = 1024 * 1024 * 1024 // 512M is what they typically report, so anything less than 1G must be iGPU
)
var (
// Used to validate if the given ROCm lib is usable
ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
)
// Gather GPU information from the amdgpu driver if any supported GPUs are detected
// HIP_VISIBLE_DEVICES will be set if we detect a mix of unsupported and supported devices
// and the user hasn't already set this variable
func AMDGetGPUInfo(resp *GpuInfo) {
// TODO - DRY this out with windows
if !AMDDetected() {
return
}
skip := map[int]interface{}{}
// Opportunistic logging of driver version to aid in troubleshooting
ver, err := AMDDriverVersion()
if err == nil {
slog.Info("AMD Driver: " + ver)
} else {
// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fall back to CPU
slog.Warn(fmt.Sprintf("ollama recommends running the AMD driver from https://www.amd.com/en/support/linux-drivers: %s", err))
}
// If the user has specified exactly which GPUs to use, look up their memory
visibleDevices := os.Getenv("HIP_VISIBLE_DEVICES")
if visibleDevices != "" {
ids := []int{}
for _, idStr := range strings.Split(visibleDevices, ",") {
id, err := strconv.Atoi(idStr)
if err != nil {
slog.Warn(fmt.Sprintf("malformed HIP_VISIBLE_DEVICES=%s %s", visibleDevices, err))
} else {
ids = append(ids, id)
}
}
amdProcMemLookup(resp, nil, ids)
return
}
// Gather GFX version information from all detected cards
gfx := AMDGFXVersions()
verStrings := []string{}
for i, v := range gfx {
verStrings = append(verStrings, v.ToGFXString())
if v.Major == 0 {
// Silently skip CPUs
skip[i] = struct{}{}
continue
}
if v.Major < 9 {
// TODO consider this a build-time setting if we can support 8xx family GPUs
slog.Warn(fmt.Sprintf("amdgpu [%d] too old %s", i, v.ToGFXString()))
skip[i] = struct{}{}
}
}
slog.Info(fmt.Sprintf("detected amdgpu versions %v", verStrings))
// Abort if all GPUs are skipped
if len(skip) >= len(gfx) {
slog.Info("all detected amdgpus are skipped, falling back to CPU")
return
}
// If we got this far, then we have at least 1 GPU that's a ROCm candidate, so make sure we have a lib
libDir, err := AMDValidateLibDir()
if err != nil {
slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err))
return
}
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
if gfxOverride == "" {
supported, err := GetSupportedGFX(libDir)
if err != nil {
slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
return
}
slog.Debug(fmt.Sprintf("rocm supported GPU types %v", supported))
for i, v := range gfx {
if !slices.Contains[[]string, string](supported, v.ToGFXString()) {
slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, v.ToGFXString(), libDir, supported))
// TODO - consider discrete markdown just for ROCM troubleshooting?
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
skip[i] = struct{}{}
} else {
slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, v.ToGFXString()))
}
}
} else {
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
}
if len(skip) >= len(gfx) {
slog.Info("all detected amdgpus are skipped, falling back to CPU")
return
}
ids := make([]int, len(gfx))
i := 0
for k := range gfx {
ids[i] = k
i++
}
amdProcMemLookup(resp, skip, ids)
if resp.memInfo.DeviceCount == 0 {
return
}
if len(skip) > 0 {
amdSetVisibleDevices(ids, skip)
}
}
// Walk the sysfs nodes for the available GPUs and gather information from them
// skipping over any devices in the skip map
func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
resp.memInfo.DeviceCount = 0
resp.memInfo.TotalMemory = 0
resp.memInfo.FreeMemory = 0
slog.Debug("discovering VRAM for amdgpu devices")
if len(ids) == 0 {
entries, err := os.ReadDir(AMDNodesSysfsDir)
if err != nil {
slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
return
}
for _, node := range entries {
if !node.IsDir() {
continue
}
id, err := strconv.Atoi(node.Name())
if err != nil {
slog.Warn("malformed amdgpu sysfs node id " + node.Name())
continue
}
ids = append(ids, id)
}
}
slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))
for _, id := range ids {
if _, skipped := skip[id]; skipped {
continue
}
totalMemory := uint64(0)
usedMemory := uint64(0)
// Adjust for sysfs vs HIP ids
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
propFiles, err := filepath.Glob(propGlob)
if err != nil {
slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
}
// 1 or more memory banks - sum the values of all of them
for _, propFile := range propFiles {
fp, err := os.Open(propFile)
if err != nil {
slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", propFile, err))
continue
}
defer fp.Close()
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "size_in_bytes") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Warn("malformed " + line)
continue
}
bankSizeInBytes, err := strconv.ParseUint(ver[1], 10, 64)
if err != nil {
slog.Warn("malformed int " + line)
continue
}
totalMemory += bankSizeInBytes
}
}
}
if totalMemory == 0 {
slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id))
skip[id] = struct{}{}
continue
}
if totalMemory < IGPUMemLimit {
slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
skip[id] = struct{}{}
continue
}
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
usedFiles, err := filepath.Glob(usedGlob)
if err != nil {
slog.Warn(fmt.Sprintf("error looking up used GPU memory: %s %s", usedGlob, err))
continue
}
for _, usedFile := range usedFiles {
fp, err := os.Open(usedFile)
if err != nil {
slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", usedFile, err))
continue
}
defer fp.Close()
data, err := io.ReadAll(fp)
if err != nil {
slog.Warn(fmt.Sprintf("failed to read sysfs node file %s: %s", usedFile, err))
continue
}
used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
if err != nil {
slog.Warn(fmt.Sprintf("malformed used memory %s: %s", string(data), err))
continue
}
usedMemory += used
}
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %dM", id, (totalMemory-usedMemory)/1024/1024))
resp.memInfo.DeviceCount++
resp.memInfo.TotalMemory += totalMemory
resp.memInfo.FreeMemory += (totalMemory - usedMemory)
}
if resp.memInfo.DeviceCount > 0 {
resp.Library = "rocm"
}
}
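
(Illustrative numbers, not from the diff: per kfd node the loop above sums every mem_banks/*/properties size_in_bytes entry for the total, then subtracts the sum of the matching used_memory files to get the free figure.)

package main

import "fmt"

func main() {
	// Hypothetical readings for one node: two 8 GiB banks, with 1 GiB and 2 GiB in use.
	sizeInBytes := []uint64{8 << 30, 8 << 30}
	usedMemory := []uint64{1 << 30, 2 << 30}

	var total, used uint64
	for _, b := range sizeInBytes {
		total += b
	}
	for _, u := range usedMemory {
		used += u
	}
	fmt.Printf("totalMemory %dM freeMemory %dM\n", total/1024/1024, (total-used)/1024/1024) // 16384M / 13312M
}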
// Quick check for AMD driver so we can skip amdgpu discovery if not present
func AMDDetected() bool {
// Some driver versions (older?) don't have a version file, so just look up the parent dir
sysfsDir := filepath.Dir(DriverVersionFile)
_, err := os.Stat(sysfsDir)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("amdgpu driver not detected " + sysfsDir)
return false
} else if err != nil {
slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err))
return false
}
return true
}
func setupLink(source, target string) error {
if err := os.RemoveAll(target); err != nil {
return fmt.Errorf("failed to remove old rocm directory %s %w", target, err)
}
if err := os.Symlink(source, target); err != nil {
return fmt.Errorf("failed to create link %s => %s %w", source, target, err)
}
slog.Debug(fmt.Sprintf("host rocm linked %s => %s", source, target))
return nil
}
// Ensure the AMD rocm lib dir is wired up
// Prefer to use host installed ROCm, as long as it meets our minimum requirements
// failing that, tell the user how to download it on their own
func AMDValidateLibDir() (string, error) {
// We rely on the rpath compiled into our library to find rocm
// so we establish a symlink to wherever we find it on the system
// to <payloads>/rocm
payloadsDir, err := PayloadsDir()
if err != nil {
return "", err
}
// If we already have a rocm dependency wired, nothing more to do
rocmTargetDir := filepath.Clean(filepath.Join(payloadsDir, "..", "rocm"))
if rocmLibUsable(rocmTargetDir) {
return rocmTargetDir, nil
}
// next to the running binary
exe, err := os.Executable()
if err == nil {
peerDir := filepath.Dir(exe)
if rocmLibUsable(peerDir) {
slog.Debug("detected ROCM next to ollama executable " + peerDir)
return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
}
peerDir = filepath.Join(filepath.Dir(exe), "rocm")
if rocmLibUsable(peerDir) {
slog.Debug("detected ROCM next to ollama executable " + peerDir)
return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
}
}
// Well known ollama installer path
installedRocmDir := "/usr/share/ollama/lib/rocm"
if rocmLibUsable(installedRocmDir) {
return rocmTargetDir, setupLink(installedRocmDir, rocmTargetDir)
}
// Prefer explicit HIP env var
hipPath := os.Getenv("HIP_PATH")
if hipPath != "" {
hipLibDir := filepath.Join(hipPath, "lib")
if rocmLibUsable(hipLibDir) {
slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
return rocmTargetDir, setupLink(hipLibDir, rocmTargetDir)
}
}
// Scan the library path for potential matches
ldPaths := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
for _, ldPath := range ldPaths {
d, err := filepath.Abs(ldPath)
if err != nil {
continue
}
if rocmLibUsable(d) {
return rocmTargetDir, setupLink(d, rocmTargetDir)
}
}
// Well known location(s)
if rocmLibUsable("/opt/rocm/lib") {
return rocmTargetDir, setupLink("/opt/rocm/lib", rocmTargetDir)
}
// If we still haven't found a usable rocm, the user will have to install it on their own
slog.Warn("amdgpu detected, but no compatible rocm library found. Either install rocm v6, or follow manual install instructions at https://github.com/ollama/ollama/blob/main/docs/linux.md#manual-install")
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}
func AMDDriverVersion() (string, error) {
_, err := os.Stat(DriverVersionFile)
if err != nil {
return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
}
fp, err := os.Open(DriverVersionFile)
if err != nil {
return "", err
}
defer fp.Close()
verString, err := io.ReadAll(fp)
if err != nil {
return "", err
}
return strings.TrimSpace(string(verString)), nil
}
func AMDGFXVersions() map[int]Version {
// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
// from the other IDs to align with the HIP library's expectations (zero is the first GPU, not the CPU)
res := map[int]Version{}
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
for _, match := range matches {
fp, err := os.Open(match)
if err != nil {
slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
continue
}
defer fp.Close()
i, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
if err != nil {
slog.Debug(fmt.Sprintf("failed to parse node ID %s", err))
continue
}
if i == 0 {
// Skipping the CPU
continue
}
// Align with HIP IDs (zero is first GPU, not CPU)
i -= 1
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "gfx_target_version") {
ver := strings.Fields(line)
if len(ver) != 2 || len(ver[1]) < 5 {
if ver[1] != "0" {
slog.Debug("malformed " + line)
}
res[i] = Version{
Major: 0,
Minor: 0,
Patch: 0,
}
continue
}
l := len(ver[1])
patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
if err1 != nil || err2 != nil || err3 != nil {
slog.Debug("malformed int " + line)
continue
}
res[i] = Version{
Major: uint(major),
Minor: uint(minor),
Patch: uint(patch),
}
}
}
}
return res
}
func (v Version) ToGFXString() string {
return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
}
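
(To make the node-numbering comment in AMDGFXVersions above concrete, with a made-up topology: the kfd sysfs tree exposes the host CPU as node 0, so sysfs node N corresponds to HIP device N-1.)

package main

import "fmt"

func main() {
	// Hypothetical topology: node 0 is the CPU, nodes 1 and 2 are GPUs.
	for _, n := range []int{0, 1, 2} {
		if n == 0 {
			fmt.Println("sysfs node 0: host CPU, skipped")
			continue
		}
		fmt.Printf("sysfs node %d => HIP device %d\n", n, n-1)
	}
}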

View File

@@ -1,180 +0,0 @@
package gpu
import (
"bytes"
"fmt"
"log/slog"
"os"
"path/filepath"
"slices"
"strings"
)
const (
RocmStandardLocation = "C:\\Program Files\\AMD\\ROCm\\5.7\\bin" // TODO glob?
// TODO We're looking for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
iGPUName = "AMD Radeon(TM) Graphics"
)
var (
// Used to validate if the given ROCm lib is usable
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
)
func AMDGetGPUInfo(resp *GpuInfo) {
hl, err := NewHipLib()
if err != nil {
slog.Debug(err.Error())
return
}
defer hl.Release()
skip := map[int]interface{}{}
ids := []int{}
resp.memInfo.DeviceCount = 0
resp.memInfo.TotalMemory = 0
resp.memInfo.FreeMemory = 0
ver, err := hl.AMDDriverVersion()
if err == nil {
slog.Info("AMD Driver: " + ver)
} else {
// For now this is benign, but we may eventually need to fail compatibility checks
slog.Debug(fmt.Sprintf("error looking up amd driver version: %s", err))
}
// Note: the HIP library automatically handles HIP_VISIBLE_DEVICES
count := hl.HipGetDeviceCount()
if count == 0 {
return
}
libDir, err := AMDValidateLibDir()
if err != nil {
slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err))
return
}
var supported []string
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
if gfxOverride == "" {
supported, err = GetSupportedGFX(libDir)
if err != nil {
slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
return
}
} else {
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
}
slog.Info(fmt.Sprintf("detected %d hip devices", count))
for i := 0; i < count; i++ {
ids = append(ids, i)
err = hl.HipSetDevice(i)
if err != nil {
slog.Warn(fmt.Sprintf("[%d] %s", i, err))
skip[i] = struct{}{}
continue
}
props, err := hl.HipGetDeviceProperties(i)
if err != nil {
slog.Warn(fmt.Sprintf("[%d] %s", i, err))
skip[i] = struct{}{}
continue
}
n := bytes.IndexByte(props.Name[:], 0)
name := string(props.Name[:n])
slog.Info(fmt.Sprintf("[%d] Name: %s", i, name))
n = bytes.IndexByte(props.GcnArchName[:], 0)
gfx := string(props.GcnArchName[:n])
slog.Info(fmt.Sprintf("[%d] GcnArchName: %s", i, gfx))
//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
// TODO Why isn't props.iGPU accurate!?
if strings.EqualFold(name, iGPUName) {
slog.Info(fmt.Sprintf("iGPU detected [%d] skipping", i))
skip[i] = struct{}{}
continue
}
if gfxOverride == "" {
if !slices.Contains[[]string, string](supported, gfx) {
slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, gfx, libDir, supported))
// TODO - consider discrete markdown just for ROCM troubleshooting?
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
skip[i] = struct{}{}
continue
} else {
slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, gfx))
}
}
totalMemory, freeMemory, err := hl.HipMemGetInfo()
if err != nil {
slog.Warn(fmt.Sprintf("[%d] %s", i, err))
continue
}
// TODO according to docs, freeMem may lie on windows!
slog.Info(fmt.Sprintf("[%d] Total Mem: %d", i, totalMemory))
slog.Info(fmt.Sprintf("[%d] Free Mem: %d", i, freeMemory))
resp.memInfo.DeviceCount++
resp.memInfo.TotalMemory += totalMemory
resp.memInfo.FreeMemory += freeMemory
}
if resp.memInfo.DeviceCount > 0 {
resp.Library = "rocm"
}
// Abort if all GPUs are skipped
if len(skip) >= count {
slog.Info("all detected amdgpus are skipped, falling back to CPU")
return
}
if len(skip) > 0 {
amdSetVisibleDevices(ids, skip)
}
UpdatePath(libDir)
}
func AMDValidateLibDir() (string, error) {
// On windows non-admins typically can't create links
// so instead of trying to rely on rpath and a link in
// $LibDir/rocm, we instead rely on setting PATH to point
// to the location of the ROCm library
// Installer payload location if we're running the installed binary
exe, err := os.Executable()
if err == nil {
rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
return rocmTargetDir, nil
}
}
// Installer payload (if we're running from some other location)
localAppData := os.Getenv("LOCALAPPDATA")
appDir := filepath.Join(localAppData, "Programs", "Ollama")
rocmTargetDir := filepath.Join(appDir, "rocm")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
return rocmTargetDir, nil
}
// Prefer explicit HIP env var
hipPath := os.Getenv("HIP_PATH")
if hipPath != "" {
hipLibDir := filepath.Join(hipPath, "bin")
if rocmLibUsable(hipLibDir) {
slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
return hipLibDir, nil
}
}
// Well known location(s)
if rocmLibUsable(RocmStandardLocation) {
return RocmStandardLocation, nil
}
// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}

View File

@@ -1,112 +0,0 @@
package gpu
import (
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
)
var (
lock sync.Mutex
payloadsDir = ""
)
func PayloadsDir() (string, error) {
lock.Lock()
defer lock.Unlock()
if payloadsDir == "" {
cleanupTmpDirs()
tmpDir, err := os.MkdirTemp("", "ollama")
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
// Track our pid so we can clean up orphaned tmpdirs
pidFilePath := filepath.Join(tmpDir, "ollama.pid")
pidFile, err := os.OpenFile(pidFilePath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, os.ModePerm)
if err != nil {
return "", err
}
if _, err := pidFile.Write([]byte(fmt.Sprint(os.Getpid()))); err != nil {
return "", err
}
// We create a distinct subdirectory for payloads within the tmpdir
// This will typically look like /tmp/ollama3208993108/runners on linux
payloadsDir = filepath.Join(tmpDir, "runners")
}
return payloadsDir, nil
}
// Best effort to clean up prior tmpdirs
func cleanupTmpDirs() {
dirs, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*"))
if err != nil {
return
}
for _, d := range dirs {
info, err := os.Stat(d)
if err != nil || !info.IsDir() {
continue
}
raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
if err == nil {
pid, err := strconv.Atoi(string(raw))
if err == nil {
if proc, err := os.FindProcess(int(pid)); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
// Another running ollama, ignore this tmpdir
continue
}
}
} else {
slog.Debug("failed to open ollama.pid", "path", d, "error", err)
}
err = os.RemoveAll(d)
if err != nil {
slog.Debug(fmt.Sprintf("unable to cleanup stale tmpdir %s: %s", d, err))
}
}
}
func Cleanup() {
lock.Lock()
defer lock.Unlock()
if payloadsDir != "" {
// We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
slog.Debug("cleaning up", "dir", tmpDir)
err := os.RemoveAll(tmpDir)
if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
}
}
}
func UpdatePath(dir string) {
if runtime.GOOS == "windows" {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
os.Setenv("PATH", newPath)
}
// linux and darwin rely on rpath
}
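
(A simplified sketch of the idea behind UpdatePath above, using made-up paths: the payload runners dir is put first and stale entries under the same temp directory are dropped. Unlike the real function, this version does not early-return when dir is already on PATH.)

package main

import (
	"fmt"
	"strings"
)

func prependRunnersDir(dir, path string) string {
	tmpDir := strings.ToLower(dir[:strings.LastIndex(dir, `\`)])
	kept := []string{dir}
	for _, comp := range strings.Split(path, ";") {
		if strings.HasPrefix(strings.ToLower(comp), tmpDir) {
			continue // duplicate of dir or stale entry under our temp dir
		}
		kept = append(kept, comp)
	}
	return strings.Join(kept, ";")
}

func main() {
	dir := `C:\Temp\ollama123\runners`
	old := `C:\Temp\ollama123\old;C:\Windows\System32`
	fmt.Println(prependRunnersDir(dir, old)) // C:\Temp\ollama123\runners;C:\Windows\System32
}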

View File

@@ -23,8 +23,8 @@ import (
)
type handles struct {
nvml *C.nvml_handle_t
cudart *C.cudart_handle_t
cuda *C.cuda_handle_t
rocm *C.rocm_handle_t
}
var gpuMutex sync.Mutex
@@ -34,7 +34,7 @@ var gpuHandles *handles = nil
var CudaComputeMin = [2]C.int{5, 0}
// Possible locations for the nvidia-ml library
var NvmlLinuxGlobs = []string{
var CudaLinuxGlobs = []string{
"/usr/local/cuda/lib64/libnvidia-ml.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
@@ -42,98 +42,75 @@ var NvmlLinuxGlobs = []string{
"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
"/opt/cuda/lib64/libnvidia-ml.so*",
"/usr/lib*/libnvidia-ml.so*",
"/usr/local/lib*/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
"/usr/local/lib*/libnvidia-ml.so*",
// TODO: are these stubs ever valid?
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
}
var NvmlWindowsGlobs = []string{
var CudaWindowsGlobs = []string{
"c:\\Windows\\System32\\nvml.dll",
}
var CudartLinuxGlobs = []string{
"/usr/local/cuda/lib64/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
"/usr/lib/wsl/lib/libcudart.so*",
"/usr/lib/wsl/drivers/*/libcudart.so*",
"/opt/cuda/lib64/libcudart.so*",
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
"/usr/local/cuda/lib*/libcudart.so*",
"/usr/lib*/libcudart.so*",
"/usr/local/lib*/libcudart.so*",
var RocmLinuxGlobs = []string{
"/opt/rocm*/lib*/librocm_smi64.so*",
}
var CudartWindowsGlobs = []string{
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
var RocmWindowsGlobs = []string{
"c:\\Windows\\System32\\rocm_smi64.dll",
}
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
// Note: gpuMutex must already be held
func initGPUHandles() {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
gpuHandles = &handles{nil, nil}
var nvmlMgmtName string
var nvmlMgmtPatterns []string
var cudartMgmtName string
var cudartMgmtPatterns []string
tmpDir, _ := PayloadsDir()
var cudaMgmtName string
var cudaMgmtPatterns []string
var rocmMgmtName string
var rocmMgmtPatterns []string
switch runtime.GOOS {
case "windows":
nvmlMgmtName = "nvml.dll"
nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
cudartMgmtName = "cudart64_*.dll"
localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
cudaMgmtName = "nvml.dll"
cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
copy(cudaMgmtPatterns, CudaWindowsGlobs)
rocmMgmtName = "rocm_smi64.dll"
rocmMgmtPatterns = make([]string, len(RocmWindowsGlobs))
copy(rocmMgmtPatterns, RocmWindowsGlobs)
case "linux":
nvmlMgmtName = "libnvidia-ml.so"
nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
copy(nvmlMgmtPatterns, NvmlLinuxGlobs)
cudartMgmtName = "libcudart.so*"
if tmpDir != "" {
// TODO - add "payloads" for subprocess
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
cudaMgmtName = "libnvidia-ml.so"
cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
copy(cudaMgmtPatterns, CudaLinuxGlobs)
rocmMgmtName = "librocm_smi64.so"
rocmMgmtPatterns = make([]string, len(RocmLinuxGlobs))
copy(rocmMgmtPatterns, RocmLinuxGlobs)
default:
return
}
slog.Info("Detecting GPU type")
cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
if len(cudartLibPaths) > 0 {
cudart := LoadCUDARTMgmt(cudartLibPaths)
if cudart != nil {
slog.Info("Nvidia GPU detected via cudart")
gpuHandles.cudart = cudart
cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns)
if len(cudaLibPaths) > 0 {
cuda := LoadCUDAMgmt(cudaLibPaths)
if cuda != nil {
slog.Info("Nvidia GPU detected")
gpuHandles.cuda = cuda
return
}
}
// TODO once we build confidence, remove this and the gpu_info_nvml.[ch] files
nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
if len(nvmlLibPaths) > 0 {
nvml := LoadNVMLMgmt(nvmlLibPaths)
if nvml != nil {
slog.Info("Nvidia GPU detected via nvidia-ml")
gpuHandles.nvml = nvml
rocmLibPaths := FindGPULibs(rocmMgmtName, rocmMgmtPatterns)
if len(rocmLibPaths) > 0 {
rocm := LoadROCMMgmt(rocmLibPaths)
if rocm != nil {
slog.Info("Radeon GPU detected")
gpuHandles.rocm = rocm
return
}
}
}
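
(A stubbed summary, not from the diff, of the probe order initGPUHandles implements: the CUDA management library is tried first, then the ROCm one, and if neither loads the handles stay nil and the caller falls back to CPU. loadLib stands in for FindGPULibs plus the Load*Mgmt calls, which really go through cgo.)

package main

import "fmt"

func detectBackend(loadLib func(name string) bool) string {
	if loadLib("libnvidia-ml.so") {
		return "cuda"
	}
	if loadLib("librocm_smi64.so") {
		return "rocm"
	}
	return "cpu"
}

func main() {
	onlyRocm := func(name string) bool { return name == "librocm_smi64.so" }
	fmt.Println(detectBackend(onlyRocm)) // rocm
}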
func GetGPUInfo() GpuInfo {
@@ -153,48 +130,85 @@ func GetGPUInfo() GpuInfo {
var memInfo C.mem_info_t
resp := GpuInfo{}
if gpuHandles.nvml != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
C.nvml_check_vram(*gpuHandles.nvml, &memInfo)
if gpuHandles.cuda != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU memory: %s", C.GoString(memInfo.err)))
slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.count > 0 {
// Verify minimum compute capability
var cc C.nvml_compute_capability_t
C.nvml_compute_capability(*gpuHandles.nvml, &cc)
var cc C.cuda_compute_capability_t
C.cuda_compute_capability(*gpuHandles.cuda, &cc)
if cc.err != nil {
slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU compute capability: %s", C.GoString(cc.err)))
slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
C.free(unsafe.Pointer(cc.err))
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
} else {
slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
}
} else if gpuHandles.cudart != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
C.cudart_check_vram(*gpuHandles.cudart, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("[cudart] error looking up CUDART GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.count > 0 {
// Verify minimum compute capability
var cc C.cudart_compute_capability_t
C.cudart_compute_capability(*gpuHandles.cudart, &cc)
if cc.err != nil {
slog.Info(fmt.Sprintf("[cudart] error looking up CUDA compute capability: %s", C.GoString(cc.err)))
C.free(unsafe.Pointer(cc.err))
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
} else {
slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
} else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
ver, err := AMDDriverVersion()
if err == nil {
slog.Info("AMD Driver: " + ver)
} else {
// For now this is benign, but we may eventually need to fail compatibility checks
slog.Debug("error looking up amd driver version: %s", err)
}
} else {
AMDGetGPUInfo(&resp)
if resp.Library != "" {
return resp
gfx := AMDGFXVersions()
tooOld := false
for _, v := range gfx {
if v.Major < 9 {
slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
tooOld = true
break
}
// TODO - remap gfx strings for unsupported minor/patch versions to supported for the same major
// e.g. gfx1034 works if we map it to gfx1030 at runtime
}
if !tooOld {
// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
// Only one GPU detected and it appears to be an integrated GPU - skip it
slog.Info("ROCm unsupported integrated GPU detected")
} else if memInfo.count > 0 {
if memInfo.igpu_index >= 0 {
// We have multiple GPUs reported, and one of them is an integrated GPU
// so we have to set the env var to bypass it
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
val := os.Getenv("ROCR_VISIBLE_DEVICES")
if val == "" {
devices := []string{}
for i := 0; i < int(memInfo.count); i++ {
if i == int(memInfo.igpu_index) {
continue
}
devices = append(devices, strconv.Itoa(i))
}
val = strings.Join(devices, ",")
os.Setenv("ROCR_VISIBLE_DEVICES", val)
}
slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
}
resp.Library = "rocm"
var version C.rocm_version_resp_t
C.rocm_get_version(*gpuHandles.rocm, &version)
verString := C.GoString(version.str)
if version.status == 0 {
resp.Variant = "v" + verString
} else {
slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
}
C.free(unsafe.Pointer(version.str))
}
}
}
if resp.Library == "" {
@@ -228,15 +242,6 @@ func getCPUMem() (memInfo, error) {
}
func CheckVRAM() (int64, error) {
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
if userLimit != "" {
avail, err := strconv.ParseInt(userLimit, 10, 64)
if err != nil {
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
}
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
return avail, nil
}
gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
@@ -245,11 +250,6 @@ func CheckVRAM() (int64, error) {
if overhead < gpus*1024*1024*1024 {
overhead = gpus * 1024 * 1024 * 1024
}
// Assigning full reported free memory for Tegras due to OS controlled caching.
if CudaTegra != "" {
// Setting overhead for non-Tegra devices
overhead = 0
}
avail := int64(gpuInfo.FreeMemory - overhead)
slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
return avail, nil
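
(Worked numbers for the overhead rule described in the comment above, assuming the elided line computes overhead as 10% of free VRAM: the reservation is the larger of that 10% or 1 GiB per GPU, so a single card with 24 GiB free yields roughly 21.6 GiB usable, while one with 8 GiB free yields 7 GiB. The Tegra special case removed in this hunk is ignored.)

package main

import "fmt"

const gib = 1024 * 1024 * 1024

func availableVRAM(freeBytes, gpus int64) int64 {
	overhead := freeBytes / 10 // assumed: 10% per the comment above
	if overhead < gpus*gib {
		overhead = gpus * gib
	}
	return freeBytes - overhead
}

func main() {
	fmt.Println(availableVRAM(24*gib, 1) / gib) // 21 (10% exceeds the 1 GiB floor)
	fmt.Println(availableVRAM(8*gib, 1) / gib)  // 7  (1 GiB floor applies)
}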
@@ -312,15 +312,15 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
return gpuLibPaths
}
func LoadNVMLMgmt(nvmlLibPaths []string) *C.nvml_handle_t {
var resp C.nvml_init_resp_t
func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
var resp C.cuda_init_resp_t
resp.ch.verbose = getVerboseState()
for _, libPath := range nvmlLibPaths {
for _, libPath := range cudaLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.nvml_init(lib, &resp)
C.cuda_init(lib, &resp)
if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
slog.Info(fmt.Sprintf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err)))
C.free(unsafe.Pointer(resp.err))
} else {
return &resp.ch
@@ -329,18 +329,18 @@ func LoadNVMLMgmt(nvmlLibPaths []string) *C.nvml_handle_t {
return nil
}
func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
var resp C.cudart_init_resp_t
resp.ch.verbose = getVerboseState()
for _, libPath := range cudartLibPaths {
func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
var resp C.rocm_init_resp_t
resp.rh.verbose = getVerboseState()
for _, libPath := range rocmLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.cudart_init(lib, &resp)
C.rocm_init(lib, &resp)
if resp.err != nil {
slog.Info(fmt.Sprintf("Unable to load cudart CUDA management library %s: %s", libPath, C.GoString(resp.err)))
slog.Info(fmt.Sprintf("Unable to load ROCm management library %s: %s", libPath, C.GoString(resp.err)))
C.free(unsafe.Pointer(resp.err))
} else {
return &resp.ch
return &resp.rh
}
}
return nil

View File

@@ -1,7 +1,6 @@
//go:build darwin
package gpu
/*
#cgo CFLAGS: -x objective-c
#cgo LDFLAGS: -framework Foundation -framework CoreGraphics -framework Metal
@@ -9,25 +8,11 @@ package gpu
*/
import "C"
import (
"fmt"
"log/slog"
"os"
"runtime"
"strconv"
)
// CheckVRAM returns the free VRAM in bytes on macOS
func CheckVRAM() (int64, error) {
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
if userLimit != "" {
avail, err := strconv.ParseInt(userLimit, 10, 64)
if err != nil {
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
}
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
return avail, nil
}
if runtime.GOARCH == "amd64" {
// gpu not supported, this may not be metal
return 0, nil

View File

@@ -52,8 +52,8 @@ void cpu_check_ram(mem_info_t *resp);
}
#endif
#include "gpu_info_nvml.h"
#include "gpu_info_cudart.h"
#include "gpu_info_cuda.h"
#include "gpu_info_rocm.h"
#endif // __GPU_INFO_H__
#endif // __APPLE__

View File

@@ -1,10 +1,10 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include "gpu_info_cuda.h"
#include <string.h>
#include "gpu_info_nvml.h"
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
nvmlReturn_t ret;
resp->err = NULL;
const int buflen = 256;
@@ -30,20 +30,20 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
nvml_lib_path, msg);
cuda_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
@@ -82,7 +82,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
}
}
void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
resp->err = NULL;
nvmlDevice_t device;
nvmlMemory_t memInfo = {0};
@@ -92,7 +92,7 @@ void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
int i;
if (h.handle == NULL) {
resp->err = strdup("nvml handle isn't initialized");
resp->err = strdup("nvml handle sn't initialized");
return;
}
@@ -124,31 +124,31 @@ void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
// When in verbose mode, report more information about
// the card we discover, but don't fail on error
ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
if (ret != NVML_SUCCESS) {
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
if (ret != NVML_SUCCESS) {
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
if (ret != NVML_SUCCESS) {
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
if (ret != NVML_SUCCESS) {
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
}
ret = (*h.nvmlDeviceGetBrand)(device, &brand);
if (ret != NVML_SUCCESS) {
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
@@ -156,14 +156,14 @@ void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
}
LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
LOG(h.verbose, "[%d] CUDA freeMem %ld\n", i, memInfo.free);
LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.used);
resp->total += memInfo.total;
resp->free += memInfo.free;
}
}
void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
resp->err = NULL;
resp->major = 0;
resp->minor = 0;

View File

@@ -1,6 +1,6 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_NVML_H__
#define __GPU_INFO_NVML_H__
#ifndef __GPU_INFO_CUDA_H__
#define __GPU_INFO_CUDA_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
@@ -20,7 +20,7 @@ typedef enum nvmlBrandType_enum
NVML_BRAND_UNKNOWN = 0,
} nvmlBrandType_t;
typedef struct nvml_handle {
typedef struct cuda_handle {
void *handle;
uint16_t verbose;
nvmlReturn_t (*nvmlInit_v2)(void);
@@ -35,22 +35,22 @@ typedef struct nvml_handle {
nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
} nvml_handle_t;
} cuda_handle_t;
typedef struct nvml_init_resp {
typedef struct cuda_init_resp {
char *err; // If err is non-null handle is invalid
nvml_handle_t ch;
} nvml_init_resp_t;
cuda_handle_t ch;
} cuda_init_resp_t;
typedef struct nvml_compute_capability {
typedef struct cuda_compute_capability {
char *err;
int major;
int minor;
} nvml_compute_capability_t;
} cuda_compute_capability_t;
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
#endif // __GPU_INFO_NVML_H__
#endif // __GPU_INFO_CUDA_H__
#endif // __APPLE__

View File

@@ -1,190 +0,0 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include <string.h>
#include "gpu_info_cudart.h"
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
cudartReturn_t ret;
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[] = {
{"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
{"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
{"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
{"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
{"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
{"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
{"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
cudart_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "wiring cudart library functions in %s\n", cudart_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->ch.cudaSetDevice)(0);
if (ret != CUDART_SUCCESS) {
LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "cudart init failure: %d", ret);
resp->err = strdup(buf);
return;
}
int version = 0;
cudartDriverVersion_t driverVersion;
driverVersion.major = 0;
driverVersion.minor = 0;
// Report driver version if we're in verbose mode, ignore errors
ret = (*resp->ch.cudaDriverGetVersion)(&version);
if (ret != CUDART_SUCCESS) {
LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
} else {
driverVersion.major = version / 1000;
driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
}
}
void cudart_check_vram(cudart_handle_t h, mem_info_t *resp) {
resp->err = NULL;
cudartMemory_t memInfo = {0,0,0};
cudartReturn_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("cudart handle isn't initialized");
return;
}
// cudaGetDeviceCount takes int type, resp-> count is uint
int deviceCount;
ret = (*h.cudaGetDeviceCount)(&deviceCount);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
} else {
resp->count = (unsigned int)deviceCount;
}
resp->total = 0;
resp->free = 0;
for (i = 0; i < resp-> count; i++) {
ret = (*h.cudaSetDevice)(i);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "cudart device failed to initialize");
resp->err = strdup(buf);
return;
}
ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "[%d] CUDA totalMem %lu\n", i, memInfo.total);
LOG(h.verbose, "[%d] CUDA freeMem %lu\n", i, memInfo.free);
resp->total += memInfo.total;
resp->free += memInfo.free;
}
}
void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *resp) {
resp->err = NULL;
resp->major = 0;
resp->minor = 0;
int major = 0;
int minor = 0;
cudartReturn_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("cudart handle not initialized");
return;
}
int devices;
ret = (*h.cudaGetDeviceCount)(&devices);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "unable to get cudart device count: %d", ret);
resp->err = strdup(buf);
return;
}
for (i = 0; i < devices; i++) {
ret = (*h.cudaSetDevice)(i);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "cudart device failed to initialize");
resp->err = strdup(buf);
return;
}
ret = (*h.cudaDeviceGetAttribute)(&major, cudartDevAttrComputeCapabilityMajor, i);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
ret = (*h.cudaDeviceGetAttribute)(&minor, cudartDevAttrComputeCapabilityMinor, i);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
// Report the lowest major.minor we detect as that limits our compatibility
if (resp->major == 0 || resp->major > major ) {
resp->major = major;
resp->minor = minor;
} else if ( resp->major == major && resp->minor > minor ) {
resp->minor = minor;
}
}
}
#endif // __APPLE__

View File

@@ -1,59 +0,0 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_CUDART_H__
#define __GPU_INFO_CUDART_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum cudartReturn_enum {
CUDART_SUCCESS = 0,
CUDART_UNSUPPORTED = 1,
// Other values omitted for now...
} cudartReturn_t;
typedef enum cudartDeviceAttr_enum {
cudartDevAttrComputeCapabilityMajor = 75,
cudartDevAttrComputeCapabilityMinor = 76,
} cudartDeviceAttr_t;
typedef void *cudartDevice_t; // Opaque is sufficient
typedef struct cudartMemory_st {
size_t total;
size_t free;
size_t used;
} cudartMemory_t;
typedef struct cudartDriverVersion {
int major;
int minor;
} cudartDriverVersion_t;
typedef struct cudart_handle {
void *handle;
uint16_t verbose;
cudartReturn_t (*cudaSetDevice)(int device);
cudartReturn_t (*cudaDeviceSynchronize)(void);
cudartReturn_t (*cudaDeviceReset)(void);
cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);
cudartReturn_t (*cudaGetDeviceCount)(int *);
cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
} cudart_handle_t;
typedef struct cudart_init_resp {
char *err; // If err is non-null handle is invalid
cudart_handle_t ch;
} cudart_init_resp_t;
typedef struct cudart_compute_capability {
char *err;
int major;
int minor;
} cudart_compute_capability_t;
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);
#endif // __GPU_INFO_CUDART_H__
#endif // __APPLE__
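The header above is only a thin dlopen/dlsym surface; the (removed) gpu_info_cudart.c drives it through `cudart_init`, `cudart_check_vram`, and `cudart_compute_capability`. The following is a minimal, hypothetical sketch of such a caller, not part of this diff: it assumes a Linux build, a guessed libcudart path, and that `mem_info_t` (declared in gpu_info.h, not shown here) carries the `err`, `count`, `total`, and `free` fields used above.
```c
#include <stdio.h>
#include <stdlib.h>
#include "gpu_info_cudart.h"

int main(void) {
  // Hypothetical library location for illustration only.
  char libpath[] = "/usr/local/cuda/lib64/libcudart.so";

  cudart_init_resp_t init = {0};
  cudart_init(libpath, &init);             // dlopen + dlsym wiring shown above
  if (init.err != NULL) {
    fprintf(stderr, "cudart init failed: %s\n", init.err);
    free(init.err);
    return 1;
  }

  mem_info_t mem = {0};
  cudart_check_vram(init.ch, &mem);        // aggregates VRAM across devices
  if (mem.err != NULL) {
    fprintf(stderr, "VRAM query failed: %s\n", mem.err);
    free(mem.err);
    return 1;
  }
  printf("devices=%u total=%llu free=%llu\n", mem.count,
         (unsigned long long)mem.total, (unsigned long long)mem.free);

  cudart_compute_capability_t cc = {0};
  cudart_compute_capability(init.ch, &cc); // lowest major.minor across devices
  if (cc.err == NULL) {
    printf("compute capability %d.%d\n", cc.major, cc.minor);
  } else {
    free(cc.err);
  }
  return 0;
}
```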

198
gpu/gpu_info_rocm.c Normal file
View File

@@ -0,0 +1,198 @@
#ifndef __APPLE__
#include "gpu_info_rocm.h"
#include <string.h>
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
rsmi_status_t ret;
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[] = {
{"rsmi_init", (void *)&resp->rh.rsmi_init},
{"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
{"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
{"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
{"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
{"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
{"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
{"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
{"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
{"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
{"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
{"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
{NULL, NULL},
};
resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
if (!resp->rh.handle) {
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Radeon GPUs: %s\n",
rocm_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
if (!l[i].p) {
resp->rh.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->rh.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->rh.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->rh.rsmi_init)(0);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL;
snprintf(buf, buflen, "rocm vram init failure: %d", ret);
resp->err = strdup(buf);
}
return;
}
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
resp->err = NULL;
resp->igpu_index = -1;
uint64_t totalMem = 0;
uint64_t usedMem = 0;
rsmi_status_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("rocm handle not initialized");
return;
}
ret = (*h.rsmi_num_monitor_devices)(&resp->count);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
resp->total = 0;
resp->free = 0;
for (i = 0; i < resp->count; i++) {
if (h.verbose) {
// When in verbose mode, report more information about
// the card we discover, but don't fail on error
ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
}
ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
}
ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
}
ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
}
ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
}
ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
}
ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
if (ret != RSMI_STATUS_SUCCESS) {
LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
} else {
LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
}
}
// Get total memory - used memory for available memory
ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
if (totalMem < 1024 * 1024 * 1024) {
// Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory
LOG(h.verbose, "[%d] ROCm integrated GPU\n", i);
resp->igpu_index = i;
} else {
resp->total += totalMem;
resp->free += totalMem - usedMem;
}
}
}
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
const int buflen = 256;
char buf[buflen + 1];
if (h.handle == NULL) {
resp->str = strdup("rocm handle not initialized");
resp->status = 1;
return;
}
rsmi_version_t ver;
rsmi_status_t ret;
ret = h.rsmi_version_get(&ver);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
resp->status = 1;
} else {
snprintf(buf, buflen, "%d", ver.major);
resp->status = 0;
}
resp->str = strdup(buf);
}
#endif // __APPLE__

59
gpu/gpu_info_rocm.h Normal file
View File

@@ -0,0 +1,59 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_ROCM_H__
#define __GPU_INFO_ROCM_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum rsmi_status_return {
RSMI_STATUS_SUCCESS = 0,
// Other values omitted for now...
} rsmi_status_t;
typedef enum rsmi_memory_type {
RSMI_MEM_TYPE_VRAM = 0,
RSMI_MEM_TYPE_VIS_VRAM,
RSMI_MEM_TYPE_GTT,
} rsmi_memory_type_t;
typedef struct {
uint32_t major;
uint32_t minor;
uint32_t patch;
const char *build;
} rsmi_version_t;
typedef struct rocm_handle {
void *handle;
uint16_t verbose;
rsmi_status_t (*rsmi_init)(uint64_t);
rsmi_status_t (*rsmi_shut_down)(void);
rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*rsmi_version_get) (rsmi_version_t *version);
rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);
rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, uint32_t);
rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t);
} rocm_handle_t;
typedef struct rocm_init_resp {
char *err; // If err is non-null handle is invalid
rocm_handle_t rh;
} rocm_init_resp_t;
typedef struct rocm_version_resp {
rsmi_status_t status;
char *str; // Contains version or error string if status != 0
} rocm_version_resp_t;
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp);
void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
void rocm_get_version(rocm_handle_t rh, rocm_version_resp_t *resp);
#endif // __GPU_INFO_ROCM_H__
#endif // __APPLE__
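For the newly added ROCm path, a caller follows the same pattern: `rocm_init` wires up the ROCm SMI library via dlopen/dlsym, `rocm_check_vram` aggregates VRAM (skipping integrated GPUs, as in gpu_info_rocm.c above), and `rocm_get_version` reports the major driver version. A minimal, hypothetical sketch, not part of this diff and under the same assumptions (Linux, a guessed library path, and the `mem_info_t` fields used above), might look like:
```c
#include <stdio.h>
#include <stdlib.h>
#include "gpu_info_rocm.h"

int main(void) {
  char libpath[] = "/opt/rocm/lib/librocm_smi64.so";  // hypothetical path
  rocm_init_resp_t init = {0};
  rocm_init(libpath, &init);
  if (init.err != NULL) {
    fprintf(stderr, "rocm init failed: %s\n", init.err);
    free(init.err);
    return 1;
  }

  mem_info_t mem = {0};
  rocm_check_vram(init.rh, &mem);
  if (mem.err != NULL) {
    fprintf(stderr, "VRAM query failed: %s\n", mem.err);
    free(mem.err);
    return 1;
  }
  // Integrated GPUs are excluded from the totals; their index is reported instead.
  printf("devices=%u total=%llu free=%llu igpu=%d\n", mem.count,
         (unsigned long long)mem.total, (unsigned long long)mem.free,
         mem.igpu_index);

  rocm_version_resp_t ver = {0};
  rocm_get_version(init.rh, &ver);       // always fills ver.str (version or error)
  printf("rsmi major version: %s\n", ver.str);
  free(ver.str);

  (*init.rh.rsmi_shut_down)();           // release the SMI library state
  return 0;
}
```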

View File

@@ -1,11 +0,0 @@
# Integration Tests
This directory contains integration tests that exercise Ollama end-to-end to verify its behavior.
By default, these tests are disabled so `go test ./...` exercises only unit tests. To run the integration tests, pass the integration build tag: `go test -tags=integration ./...`
The integration tests have two modes of operation.
1. By default, they start the server on a random port, run the tests, and then shut the server down.
2. If `OLLAMA_TEST_EXISTING` is set to a non-empty string, the tests run against an existing, possibly remote, running server.

View File

@@ -1,28 +0,0 @@
//go:build integration
package integration
import (
"context"
"net/http"
"testing"
"time"
"github.com/ollama/ollama/api"
)
func TestOrcaMiniBlueSky(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{
Model: "orca-mini",
Prompt: "why is the sky blue?",
Stream: &stream,
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
},
}
GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"rayleigh"})
}

View File

@@ -1,73 +0,0 @@
//go:build integration
package integration
import (
"context"
"net/http"
"sync"
"testing"
"time"
"github.com/ollama/ollama/api"
)
// TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server
// package to avoid circular dependencies
// WARNING - these tests will fail on mac if you don't manually copy ggml-metal.metal to this dir (./server)
//
// TODO - Fix this ^^
var (
stream = false
req = [2]api.GenerateRequest{
{
Model: "orca-mini",
Prompt: "why is the ocean blue?",
Stream: &stream,
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "orca-mini",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
},
}
resp = [2][]string{
[]string{"sunlight"},
[]string{"england", "english", "massachusetts", "pilgrims"},
}
)
func TestIntegrationSimpleOrcaMini(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
defer cancel()
GenerateTestHelper(ctx, t, &http.Client{}, req[0], resp[0])
}
// TODO
// The server always loads a new runner and closes the old one, which forces serial execution
// At present this test case fails with concurrency problems. Eventually we should try to
// get true concurrency working with n_parallel support in the backend
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
var wg sync.WaitGroup
wg.Add(len(req))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
defer cancel()
for i := 0; i < len(req); i++ {
go func(i int) {
defer wg.Done()
GenerateTestHelper(ctx, t, &http.Client{}, req[i], resp[i])
}(i)
}
wg.Wait()
}
// TODO - create a parallel test with 2 different models once we support concurrency

View File

@@ -1,250 +0,0 @@
//go:build integration
package integration
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"math/rand"
"net"
"net/http"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"testing"
"time"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/app/lifecycle"
"github.com/stretchr/testify/assert"
)
func FindPort() string {
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
if l, err = net.ListenTCP("tcp", a); err == nil {
port = l.Addr().(*net.TCPAddr).Port
l.Close()
}
}
if port == 0 {
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
return strconv.Itoa(port)
}
func GetTestEndpoint() (string, string) {
defaultPort := "11434"
ollamaHost := os.Getenv("OLLAMA_HOST")
scheme, hostport, ok := strings.Cut(ollamaHost, "://")
if !ok {
scheme, hostport = "http", ollamaHost
}
// trim trailing slashes
hostport = strings.TrimRight(hostport, "/")
host, port, err := net.SplitHostPort(hostport)
if err != nil {
host, port = "127.0.0.1", defaultPort
if ip := net.ParseIP(strings.Trim(hostport, "[]")); ip != nil {
host = ip.String()
} else if hostport != "" {
host = hostport
}
}
if os.Getenv("OLLAMA_TEST_EXISTING") == "" && port == defaultPort {
port = FindPort()
}
url := fmt.Sprintf("%s:%s", host, port)
slog.Info("server connection", "url", url)
return scheme, url
}
// TODO make fancier, grab logs, etc.
var serverMutex sync.Mutex
var serverReady bool
func StartServer(ctx context.Context, ollamaHost string) error {
// Make sure the server has been built
CLIName, err := filepath.Abs("../ollama")
if err != nil {
return err
}
if runtime.GOOS == "windows" {
CLIName += ".exe"
}
_, err = os.Stat(CLIName)
if err != nil {
return fmt.Errorf("CLI missing, did you forget to build first? %w", err)
}
serverMutex.Lock()
defer serverMutex.Unlock()
if serverReady {
return nil
}
if tmp := os.Getenv("OLLAMA_HOST"); tmp != ollamaHost {
slog.Info("setting env", "OLLAMA_HOST", ollamaHost)
os.Setenv("OLLAMA_HOST", ollamaHost)
}
slog.Info("starting server", "url", ollamaHost)
done, err := lifecycle.SpawnServer(ctx, "../ollama")
if err != nil {
return fmt.Errorf("failed to start server: %w", err)
}
go func() {
<-ctx.Done()
serverMutex.Lock()
defer serverMutex.Unlock()
exitCode := <-done
if exitCode > 0 {
slog.Warn("server failure", "exit", exitCode)
}
serverReady = false
}()
// TODO wait only long enough for the server to be responsive...
time.Sleep(500 * time.Millisecond)
serverReady = true
return nil
}
func PullIfMissing(ctx context.Context, client *http.Client, scheme, testEndpoint, modelName string) error {
slog.Debug("checking status of model", "model", modelName)
showReq := &api.ShowRequest{Name: modelName}
requestJSON, err := json.Marshal(showReq)
if err != nil {
return err
}
req, err := http.NewRequest("POST", scheme+"://"+testEndpoint+"/api/show", bytes.NewReader(requestJSON))
if err != nil {
return err
}
// Make the request with the HTTP client
response, err := client.Do(req.WithContext(ctx))
if err != nil {
return err
}
defer response.Body.Close()
if response.StatusCode == 200 {
slog.Info("model already present", "model", modelName)
return nil
}
slog.Info("model missing", "status", response.StatusCode)
pullReq := &api.PullRequest{Name: modelName, Stream: &stream}
requestJSON, err = json.Marshal(pullReq)
if err != nil {
return err
}
req, err = http.NewRequest("POST", scheme+"://"+testEndpoint+"/api/pull", bytes.NewReader(requestJSON))
if err != nil {
return err
}
slog.Info("pulling", "model", modelName)
response, err = client.Do(req.WithContext(ctx))
if err != nil {
return err
}
defer response.Body.Close()
if response.StatusCode != 200 {
return fmt.Errorf("failed to pull model") // TODO more details perhaps
}
slog.Info("model pulled", "model", modelName)
return nil
}
func GenerateTestHelper(ctx context.Context, t *testing.T, client *http.Client, genReq api.GenerateRequest, anyResp []string) {
requestJSON, err := json.Marshal(genReq)
if err != nil {
t.Fatalf("Error serializing request: %v", err)
}
defer func() {
if t.Failed() && os.Getenv("OLLAMA_TEST_EXISTING") == "" {
// TODO
fp, err := os.Open(lifecycle.ServerLogFile)
if err != nil {
slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err)
return
}
data, err := io.ReadAll(fp)
if err != nil {
slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err)
return
}
slog.Warn("SERVER LOG FOLLOWS")
os.Stderr.Write(data)
slog.Warn("END OF SERVER")
}
err = os.Remove(lifecycle.ServerLogFile)
if err != nil && !os.IsNotExist(err) {
slog.Warn("failed to cleanup", "logfile", lifecycle.ServerLogFile, "error", err)
}
}()
scheme, testEndpoint := GetTestEndpoint()
if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
assert.NoError(t, StartServer(ctx, testEndpoint))
}
err = PullIfMissing(ctx, client, scheme, testEndpoint, genReq.Model)
if err != nil {
t.Fatalf("Error pulling model: %v", err)
}
// Make the request and get the response
req, err := http.NewRequest("POST", scheme+"://"+testEndpoint+"/api/generate", bytes.NewReader(requestJSON))
if err != nil {
t.Fatalf("Error creating request: %v", err)
}
// Set the content type for the request
req.Header.Set("Content-Type", "application/json")
// Make the request with the HTTP client
response, err := client.Do(req.WithContext(ctx))
if err != nil {
t.Fatalf("Error making request: %v", err)
}
defer response.Body.Close()
body, err := io.ReadAll(response.Body)
assert.NoError(t, err)
assert.Equal(t, response.StatusCode, 200, string(body))
// Verify the response is valid JSON
var payload api.GenerateResponse
err = json.Unmarshal(body, &payload)
if err != nil {
assert.NoError(t, err, body)
}
// Verify the response contains the expected data
atLeastOne := false
for _, resp := range anyResp {
if strings.Contains(strings.ToLower(payload.Response), resp) {
atLeastOne = true
break
}
}
assert.True(t, atLeastOne, "none of %v found in %s", anyResp, payload.Response)
}

View File

@@ -1,142 +0,0 @@
#include "dyn_ext_server.h"
#include <stdio.h>
#include <string.h>
#ifdef __linux__
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#elif _WIN32
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
#define LOAD_ERR() ({\
LPSTR messageBuffer = NULL; \
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
char *resp = strdup(messageBuffer); \
LocalFree(messageBuffer); \
resp; \
})
#else
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#endif
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err) {
int i = 0;
struct lookup {
char *s;
void **p;
} l[] = {
{"llama_server_init", (void *)&s->llama_server_init},
{"llama_server_start", (void *)&s->llama_server_start},
{"llama_server_stop", (void *)&s->llama_server_stop},
{"llama_server_completion", (void *)&s->llama_server_completion},
{"llama_server_completion_next_result",
(void *)&s->llama_server_completion_next_result},
{"llama_server_completion_cancel",
(void *)&s->llama_server_completion_cancel},
{"llama_server_release_task_result",
(void *)&s->llama_server_release_task_result},
{"llama_server_tokenize", (void *)&s->llama_server_tokenize},
{"llama_server_detokenize", (void *)&s->llama_server_detokenize},
{"llama_server_embedding", (void *)&s->llama_server_embedding},
{"llama_server_release_json_resp",
(void *)&s->llama_server_release_json_resp},
{"", NULL},
};
printf("loading library %s\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
if (!s->handle) {
err->id = -1;
char *msg = LOAD_ERR();
snprintf(err->msg, err->msg_len,
"Unable to load dynamic server library: %s", msg);
free(msg);
return;
}
for (i = 0; l[i].p != NULL; i++) {
*l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(s->handle);
err->id = -1;
char *msg = LOAD_ERR();
snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
l[i].s, msg);
free(msg);
return;
}
}
}
inline void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err) {
s.llama_server_init(sparams, err);
}
inline void dyn_llama_server_start(struct dynamic_llama_server s) {
s.llama_server_start();
}
inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
s.llama_server_stop();
}
inline void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp) {
s.llama_server_completion(json_req, resp);
}
inline void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result) {
s.llama_server_completion_next_result(task_id, result);
}
inline void dyn_llama_server_completion_cancel(
struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
s.llama_server_completion_cancel(task_id, err);
}
inline void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result) {
s.llama_server_release_task_result(result);
}
inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_tokenize(json_req, json_resp, err);
}
inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_detokenize(json_req, json_resp, err);
}
inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_embedding(json_req, json_resp, err);
}
inline void dyn_llama_server_release_json_resp(
struct dynamic_llama_server s, char **json_resp) {
s.llama_server_release_json_resp(json_resp);
}

View File

@@ -28,13 +28,13 @@ import (
"log/slog"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"time"
"unsafe"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/gpu"
"github.com/jmorganca/ollama/api"
)
type dynExtServer struct {
@@ -64,12 +64,15 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
return fmt.Errorf(C.GoString(resp.msg))
}
// Note: current implementation does not support concurrent instantiations
var llm *dynExtServer
func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if !mutex.TryLock() {
slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock()
}
gpu.UpdatePath(filepath.Dir(library))
updatePath(filepath.Dir(library))
libPath := C.CString(library)
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(512)
@@ -80,7 +83,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
mutex.Unlock()
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
}
llm := dynExtServer{
llm = &dynExtServer{
s: srv,
options: opts,
}
@@ -145,8 +148,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
}
slog.Info("Initializing llama server")
slog.Debug(fmt.Sprintf("server params: %+v", sparams))
initResp := newExtServerResp(512)
initResp := newExtServerResp(128)
defer freeExtServerResp(initResp)
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
if initResp.id < 0 {
@@ -158,7 +160,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
slog.Info("Starting llama main loop")
C.dyn_llama_server_start(llm.s)
return &llm, nil
return llm, nil
}
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
@@ -195,9 +197,6 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
if predict.Format == "json" {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
}
retryDelay := 100 * time.Microsecond
@@ -225,14 +224,17 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
}
retryNeeded := false
// keep track of the last token generated, this is used to abort if the model starts looping
var lastToken string
var tokenRepeat int
out:
for {
select {
case <-ctx.Done():
return cancelCompletion(llm, resp)
// This handles the request cancellation
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
default:
var result C.ext_server_task_result_t
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
@@ -255,20 +257,6 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
break out
}
switch {
case strings.TrimSpace(p.Content) == lastToken:
tokenRepeat++
default:
lastToken = strings.TrimSpace(p.Content)
tokenRepeat = 0
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
if tokenRepeat > 30 {
slog.Debug("prediction aborted, token repeat limit reached")
return cancelCompletion(llm, resp)
}
if p.Content != "" {
fn(PredictResult{
Content: p.Content,
@@ -296,15 +284,6 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
return fmt.Errorf("max retries exceeded")
}
func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
}
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil {
@@ -386,3 +365,25 @@ func (llm *dynExtServer) Close() {
C.dyn_llama_server_stop(llm.s)
mutex.Unlock()
}
func updatePath(dir string) {
if runtime.GOOS == "windows" {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
os.Setenv("PATH", newPath)
}
// linux and darwin rely on rpath
}

View File

@@ -1,74 +0,0 @@
#include <stdlib.h>
#include "ext_server.h"
#ifdef __cplusplus
extern "C" {
#endif
struct dynamic_llama_server {
void *handle;
void (*llama_server_init)(ext_server_params_t *sparams,
ext_server_resp_t *err);
void (*llama_server_start)();
void (*llama_server_stop)();
void (*llama_server_completion)(const char *json_req,
ext_server_resp_t *resp);
void (*llama_server_completion_next_result)(const int task_id,
ext_server_task_result_t *result);
void (*llama_server_completion_cancel)(const int task_id,
ext_server_resp_t *err);
void (*llama_server_release_task_result)(ext_server_task_result_t *result);
void (*llama_server_tokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_detokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_embedding)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_release_json_resp)(char **json_resp);
};
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection
void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err);
void dyn_llama_server_start(struct dynamic_llama_server s);
void dyn_llama_server_stop(struct dynamic_llama_server s);
void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp);
void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result);
void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
const int task_id,
ext_server_resp_t *err);
void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result);
void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
char **json_resp);
#ifdef __cplusplus
}
#endif

View File

@@ -1,21 +0,0 @@
set(TARGET ext_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
if (WIN32)
add_library(${TARGET} SHARED ext_server.cpp ../llama.cpp/llama.cpp)
else()
add_library(${TARGET} STATIC ext_server.cpp ../llama.cpp/llama.cpp)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
target_link_libraries(${TARGET} PRIVATE ggml llava common )
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
install(TARGETS ext_server LIBRARY)
if (CUDAToolkit_FOUND)
target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
if (WIN32)
target_link_libraries(${TARGET} PRIVATE nvml)
endif()
endif()

View File

@@ -1,18 +0,0 @@
# Extern C Server
This directory contains a thin facade layered on top of the llama.cpp server to
expose `extern C` interfaces so the functionality can be reached through direct,
in-process API calls. The llama.cpp code uses compile-time macros to configure the
GPU type along with other settings. During `go generate ./...`, the build produces
one or more copies of the llama.cpp `extern C` server based on which GPU libraries
are detected, covering multiple GPU types as well as CPU-only execution. The Ollama
Go build then embeds these different servers so that different GPUs and settings
can be supported at runtime.
If you are making changes to the code in this directory, disable build caching so
that your changes are picked up. A typical iteration cycle from the top of the
source tree looks like:
```
go generate ./... && go build -a .
```

View File

@@ -1,377 +0,0 @@
#include "ext_server.h"
#include <atomic>
// Necessary evil since the server types are not defined in a header
#include "server.cpp"
// Low level API access to verify GPU access
#if defined(GGML_USE_CUBLAS)
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define cudaGetDevice hipGetDevice
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#else
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif // defined(GGML_USE_HIPBLAS)
#endif // GGML_USE_CUBLAS
// Expose the llama server as a callable extern "C" API
llama_server_context *llama = NULL;
std::thread ext_server_thread;
bool shutting_down = false;
std::atomic_int recv_counter;
// RAII wrapper for tracking in-flight recv calls
class atomicRecv {
public:
atomicRecv(std::atomic<int> &atomic) : atomic(atomic) {
++this->atomic;
}
~atomicRecv() {
--this->atomic;
}
private:
std::atomic<int> &atomic;
};
void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
recv_counter = 0;
assert(err != NULL && sparams != NULL);
log_set_target(stderr);
if (!sparams->verbose_logging) {
server_verbose = true;
log_disable();
}
LOG_TEE("system info: %s\n", llama_print_system_info());
err->id = 0;
err->msg[0] = '\0';
try {
llama = new llama_server_context;
gpt_params params;
params.n_ctx = sparams->n_ctx;
params.n_batch = sparams->n_batch;
if (sparams->n_threads > 0) {
params.n_threads = sparams->n_threads;
}
params.n_parallel = sparams->n_parallel;
params.rope_freq_base = sparams->rope_freq_base;
params.rope_freq_scale = sparams->rope_freq_scale;
if (sparams->memory_f16) {
params.cache_type_k = "f16";
params.cache_type_v = "f16";
} else {
params.cache_type_k = "f32";
params.cache_type_v = "f32";
}
params.n_gpu_layers = sparams->n_gpu_layers;
params.main_gpu = sparams->main_gpu;
params.use_mlock = sparams->use_mlock;
params.use_mmap = sparams->use_mmap;
params.numa = (ggml_numa_strategy)sparams->numa;
params.embedding = sparams->embedding;
if (sparams->model != NULL) {
params.model = sparams->model;
}
if (sparams->lora_adapters != NULL) {
for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
la = la->next) {
params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
}
params.use_mmap = false;
}
if (sparams->mmproj != NULL) {
params.mmproj = std::string(sparams->mmproj);
}
#if defined(GGML_USE_CUBLAS)
// Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
LOG_TEE("Performing pre-initialization of GPU\n");
int id;
cudaError_t cudaErr = cudaGetDevice(&id);
if (cudaErr != cudaSuccess) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
return;
}
#endif
llama_backend_init();
llama_numa_init(params.numa);
if (!llama->load_model(params)) {
// an error occurred that was not thrown
err->id = -1;
snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
return;
}
llama->initialize();
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len,
"Unknown exception initializing llama server");
}
}
void llama_server_start() {
assert(llama != NULL);
// TODO mutex to protect thread creation
ext_server_thread = std::thread([&]() {
try {
LOG_TEE("llama server main loop starting\n");
ggml_time_init();
llama->queue_tasks.on_new_task(std::bind(
&llama_server_context::process_single_task, llama, std::placeholders::_1));
llama->queue_tasks.on_finish_multitask(std::bind(
&llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
llama->queue_tasks.on_run_slots(std::bind(
&llama_server_context::update_slots, llama));
llama->queue_results.on_multitask_update(std::bind(
&llama_server_queue::update_multitask,
&llama->queue_tasks,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3
));
llama->queue_tasks.start_loop();
} catch (std::exception &e) {
LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
} catch (...) {
LOG_TEE("caught unknown exception in llama server main loop\n");
}
LOG_TEE("\nllama server shutting down\n");
llama_backend_free();
});
}
void llama_server_stop() {
assert(llama != NULL);
// Shutdown any in-flight requests and block incoming requests.
LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
shutting_down = true;
while (recv_counter.load() > 0) {
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
// This may take a while for any pending tasks to drain
// TODO - consider a timeout to cancel tasks if it's taking too long
llama->queue_tasks.terminate();
ext_server_thread.join();
delete llama;
llama = NULL;
LOG_TEE("llama server shutdown complete\n");
shutting_down = false;
}
void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
assert(llama != NULL && json_req != NULL && resp != NULL);
resp->id = -1;
resp->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
json data = json::parse(json_req);
resp->id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(resp->id);
llama->request_completion(resp->id, data, false, false, -1);
} catch (std::exception &e) {
snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
} catch (...) {
snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
}
}
void llama_server_completion_next_result(const int task_id,
ext_server_task_result_t *resp) {
assert(llama != NULL && resp != NULL);
resp->id = -1;
resp->stop = false;
resp->error = false;
resp->json_resp = NULL;
std::string result_json;
try {
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
result_json =
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
resp->id = result.id;
resp->stop = result.stop;
resp->error = result.error;
if (result.error) {
LOG_TEE("next result cancel on error\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting tak ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (result.stop) {
LOG_TEE("next result cancel on stop\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting task ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (shutting_down) {
LOG_TEE("aborting completion due to shutdown %d\n", task_id);
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
resp->stop = true;
}
} catch (std::exception &e) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
LOG_TEE("llama server completion exception %s\n", e.what());
} catch (...) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"Unknown exception during completion\"}";
LOG_TEE("llama server completion unknown exception\n");
}
const std::string::size_type size = result_json.size() + 1;
resp->json_resp = new char[size];
snprintf(resp->json_resp, size, "%s", result_json.c_str());
}
void llama_server_release_task_result(ext_server_task_result_t *result) {
if (result == NULL || result->json_resp == NULL) {
return;
}
delete[] result->json_resp;
}
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
assert(llama != NULL && err != NULL);
err->id = 0;
err->msg[0] = '\0';
try {
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len,
"Unknown exception completion cancel in llama server");
}
}
void llama_server_tokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::vector<llama_token> tokens;
if (body.count("content") != 0) {
tokens = llama->tokenize(body["content"], false);
}
const json data = format_tokenizer_response(tokens);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
}
}
void llama_server_release_json_resp(char **json_resp) {
if (json_resp == NULL || *json_resp == NULL) {
return;
}
delete[] *json_resp;
}
void llama_server_detokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::string content;
if (body.count("tokens") != 0) {
const std::vector<llama_token> tokens = body["tokens"];
content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
}
const json data = format_detokenized_response(content);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
}
}
void llama_server_embedding(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
json prompt;
if (body.count("content") != 0) {
prompt = body["content"];
} else {
prompt = "";
}
const int task_id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(task_id);
llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
std::string result_json = result.result_json.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
}
}

View File

@@ -1,95 +0,0 @@
#if defined(LLAMA_SERVER_LIBRARY)
#ifndef LLAMA_SERVER_H
#define LLAMA_SERVER_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
int __main(int argc, char **argv);
// This exposes extern C entrypoints into the llama_server
// To enable the server compile with LLAMA_SERVER_LIBRARY
#ifdef __cplusplus
extern "C" {
#endif
typedef struct ext_server_resp {
int id; // < 0 on error
size_t msg_len; // caller must allocate msg and set msg_len
char *msg;
} ext_server_resp_t;
// Allocated and freed by caller
typedef struct ext_server_lora_adapter {
char *adapter;
float scale;
struct ext_server_lora_adapter *next;
} ext_server_lora_adapter_t;
// Allocated and freed by caller
typedef struct ext_server_params {
char *model;
uint32_t n_ctx; // token context window, 0 = from model
uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation
int32_t n_parallel; // number of parallel sequences to decode
float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
bool memory_f16; // use f16 instead of f32 for memory kv
int32_t n_gpu_layers; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu; // the GPU that is used for scratch and small tensors
bool use_mlock; // force system to keep model in RAM
bool use_mmap; // use mmap if possible
int numa; // attempt optimizations that help on some NUMA systems
bool embedding; // get only sentence embedding
ext_server_lora_adapter_t *lora_adapters;
char *mmproj;
bool verbose_logging; // Enable verbose logging of the server
} ext_server_params_t;
typedef struct ext_server_task_result {
int id;
bool stop;
bool error;
char *json_resp; // null terminated, memory managed by ext_server
} ext_server_task_result_t;
// Initialize the server once per process
// err->id = 0 for success and err->msg[0] = NULL
// err->id != 0 for failure, and err->msg contains error message
void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
// Run the main loop, called once per init
void llama_server_start();
// Stop the main loop and free up resources allocated in init and start. Init
// must be called again to reuse
void llama_server_stop();
// json_req null terminated string, memory managed by caller
// resp->id >= 0 on success (task ID)
// resp->id < 0 on error, and resp->msg contains error message
void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
// Caller must call llama_server_release_task_result to free resp->json_resp
void llama_server_completion_next_result(const int task_id,
ext_server_task_result_t *result);
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
void llama_server_release_task_result(ext_server_task_result_t *result);
// Caller must call llama_server_release_json_resp to free json_resp if err.id <
// 0
void llama_server_tokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_detokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_embedding(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_release_json_resp(char **json_resp);
#ifdef __cplusplus
}
#endif
#endif
#endif // LLAMA_SERVER_LIBRARY
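The comments above imply a strict lifecycle: init once per process, start the main loop, submit a completion and poll its results (releasing each one), then stop. The following is a hypothetical, minimal C caller honoring that contract; it is not part of this diff and assumes the generated ext_server library is linked in, that `LLAMA_SERVER_LIBRARY` is defined so the declarations above are visible, and a placeholder model path.
```c
#define LLAMA_SERVER_LIBRARY 1
#include <stdio.h>
#include "ext_server.h"

int main(void) {
  char msgbuf[512] = "";
  ext_server_resp_t err = {0, sizeof(msgbuf), msgbuf};   // caller allocates msg

  ext_server_params_t sparams = {0};
  char model[] = "/path/to/model.gguf";                  // placeholder path
  sparams.model = model;
  sparams.n_ctx = 2048;
  sparams.n_batch = 512;
  sparams.n_parallel = 1;
  sparams.n_gpu_layers = -1;                             // use default
  sparams.use_mmap = true;

  llama_server_init(&sparams, &err);                     // once per process
  if (err.id != 0) {
    fprintf(stderr, "init failed: %s\n", err.msg);
    return 1;
  }
  llama_server_start();                                  // run the main loop

  ext_server_resp_t resp = {0, sizeof(msgbuf), msgbuf};
  llama_server_completion("{\"prompt\":\"why is the sky blue?\"}", &resp);
  if (resp.id < 0) {
    fprintf(stderr, "completion failed: %s\n", resp.msg);
    llama_server_stop();
    return 1;
  }

  // Poll until the task reports stop or error; every result must be released.
  for (;;) {
    ext_server_task_result_t result;
    llama_server_completion_next_result(resp.id, &result);
    if (result.json_resp != NULL) {
      printf("%s\n", result.json_resp);
    }
    int done = result.stop || result.error;
    llama_server_release_task_result(&result);
    if (done) {
      break;
    }
  }

  llama_server_stop();                                   // free init/start resources
  return 0;
}
```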

View File

File diff suppressed because it is too large

View File

File diff suppressed because it is too large

View File

File diff suppressed because it is too large

View File

@@ -1,655 +0,0 @@
// MIT License
// Copyright (c) 2023 Georgi Gerganov
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include "json.hpp"
#include "../llava/clip.h"
using json = nlohmann::json;
extern bool server_verbose;
extern bool server_log_json;
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif
#if SERVER_VERBOSE != 1
#define LOG_VERBOSE(MSG, ...)
#else
#define LOG_VERBOSE(MSG, ...) \
do \
{ \
if (server_verbose) \
{ \
server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
} \
} while (0)
#endif
#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
SERVER_STATE_ERROR // An error occurred, load_model failed
};
enum task_type {
TASK_TYPE_COMPLETION,
TASK_TYPE_CANCEL,
TASK_TYPE_NEXT_RESPONSE,
TASK_TYPE_METRICS
};
struct task_server {
int id = -1; // to be filled by llama_server_queue
int target_id;
task_type type;
json data;
bool infill_mode = false;
bool embedding_mode = false;
int multitask_id = -1;
};
struct task_result {
int id;
int multitask_id = -1;
bool stop;
bool error;
json result_json;
};
struct task_multi {
int id;
std::set<int> subtasks_remaining{};
std::vector<task_result> results{};
};
// completion token output with probabilities
struct completion_token_output {
struct token_prob
{
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
llama_token tok;
std::string text_to_send;
};
struct token_translator {
llama_context * ctx;
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
};
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
std::stringstream ss_tid;
ss_tid << std::this_thread::get_id();
json log = nlohmann::ordered_json{
{"tid", ss_tid.str()},
{"timestamp", time(nullptr)},
};
if (server_log_json) {
log.merge_patch(
{
{"level", level},
{"function", function},
{"line", line},
{"msg", message},
});
if (!extra.empty()) {
log.merge_patch(extra);
}
std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
} else {
char buf[1024];
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
if (!extra.empty()) {
log.merge_patch(extra);
}
std::stringstream ss;
ss << buf << " |";
for (const auto& el : log.items())
{
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
ss << " " << el.key() << "=" << value;
}
const std::string str = ss.str();
printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout);
}
}
//
// server utils
//
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value) {
// Fallback null to default value
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
inline bool verify_custom_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
std::vector<char> buf(1);
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
return res >= 0;
}
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
size_t alloc_size = 0;
// vector holding all allocated strings to be passed to llama_chat_apply_template
std::vector<std::string> str(messages.size() * 2);
std::vector<llama_chat_message> chat(messages.size());
for (size_t i = 0; i < messages.size(); ++i) {
auto &curr_msg = messages[i];
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
alloc_size += str[i*2 + 1].length();
chat[i].role = str[i*2 + 0].c_str();
chat[i].content = str[i*2 + 1].c_str();
}
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
std::vector<char> buf(alloc_size * 2);
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
}
std::string formatted_chat(buf.data(), res);
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
return formatted_chat;
}
//
// work queue utils
//
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
bool running;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
std::vector<task_multi> queue_multitasks;
std::condition_variable condition_tasks;
// callback functions
std::function<void(task_server&)> callback_new_task;
std::function<void(task_multi&)> callback_finish_multitask;
std::function<void(void)> callback_run_slots;
// Add a new task to the end of the queue
int post(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (task.id == -1) {
task.id = id++;
LOG_VERBOSE("new task id", {{"new_id", task.id}});
}
queue_tasks.push_back(std::move(task));
condition_tasks.notify_one();
return task.id;
}
// Add a new task, but defer until one slot is available
void defer(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
queue_tasks_deferred.push_back(std::move(task));
}
// Get the next id for creating a new task
int get_new_id() {
std::unique_lock<std::mutex> lock(mutex_tasks);
int new_id = id++;
LOG_VERBOSE("new task id", {{"new_id", new_id}});
return new_id;
}
// Register function to process a new task
void on_new_task(std::function<void(task_server&)> callback) {
callback_new_task = callback;
}
// Register function to process a multitask when it is finished
void on_finish_multitask(std::function<void(task_multi&)> callback) {
callback_finish_multitask = callback;
}
// Register the function to be called when all slots data is ready to be processed
void on_run_slots(std::function<void(void)> callback) {
callback_run_slots = callback;
}
// Call when the state of one slot is changed
void notify_slot_changed() {
// move deferred tasks back to main loop
std::unique_lock<std::mutex> lock(mutex_tasks);
for (auto & task : queue_tasks_deferred) {
queue_tasks.push_back(std::move(task));
}
queue_tasks_deferred.clear();
}
// end the start_loop routine
void terminate() {
{
std::unique_lock<std::mutex> lock(mutex_tasks);
running = false;
}
condition_tasks.notify_all();
}
/**
* Main loop consists of these steps:
* - Wait until a new task arrives
* - Process the task (i.e. maybe copy data into slot)
* - Check if multitask is finished
* - Run all slots
*/
void start_loop() {
running = true;
while (true) {
LOG_VERBOSE("new task may arrive", {});
{
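// drain all pending tasks; the lock is released before each callback runs so
// handlers can post follow-up tasks without deadlocking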
while (true)
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
lock.unlock();
break;
}
task_server task = queue_tasks.front();
queue_tasks.erase(queue_tasks.begin());
lock.unlock();
LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
callback_new_task(task);
}
LOG_VERBOSE("update_multitasks", {});
// check if we have any finished multitasks
auto queue_iterator = queue_multitasks.begin();
while (queue_iterator != queue_multitasks.end())
{
if (queue_iterator->subtasks_remaining.empty())
{
// all subtasks done == multitask is done
task_multi current_multitask = *queue_iterator;
callback_finish_multitask(current_multitask);
// remove this multitask
queue_iterator = queue_multitasks.erase(queue_iterator);
}
else
{
++queue_iterator;
}
}
// all tasks in the current loop are processed, slot data is now ready
LOG_VERBOSE("callback_run_slots", {});
callback_run_slots();
}
LOG_VERBOSE("wait for new task", {});
// wait for new task
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
if (!running) {
LOG_VERBOSE("ending start_loop", {});
return;
}
condition_tasks.wait(lock, [&]{
return (!queue_tasks.empty() || !running);
});
}
}
}
}
//
// functions to manage multitasks
//
// add a multitask by specifying the ids of all its subtasks (a subtask is a task_server)
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
task_multi multi;
multi.id = multitask_id;
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
queue_multitasks.push_back(multi);
}
// update the remaining subtasks and append the result to the multitask
void update_multitask(int multitask_id, int subtask_id, task_result& result)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
for (auto& multitask : queue_multitasks)
{
if (multitask.id == multitask_id)
{
multitask.subtasks_remaining.erase(subtask_id);
multitask.results.push_back(result);
}
}
}
};
struct llama_server_response {
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
callback_multitask_t callback_update_multitask;
// for keeping track of all tasks waiting for the result
std::set<int> waiting_task_ids;
// the main result queue
std::vector<task_result> queue_results;
std::mutex mutex_results;
std::condition_variable condition_results;
// add the task_id to the list of tasks waiting for response
void add_waiting_task_id(int task_id) {
LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(task_id);
}
// when the request is finished, we can remove the task associated with it
void remove_waiting_task_id(int task_id) {
LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(task_id);
}
// This function blocks the thread until there is a response for this task_id
task_result recv(int task_id) {
while (true)
{
std::unique_lock<std::mutex> lock(mutex_results);
condition_results.wait(lock, [&]{
return !queue_results.empty();
});
for (int i = 0; i < (int) queue_results.size(); i++)
{
if (queue_results[i].id == task_id)
{
assert(queue_results[i].multitask_id == -1);
task_result res = queue_results[i];
queue_results.erase(queue_results.begin() + i);
return res;
}
}
}
// should never reach here
}
// Register the function to update multitask
void on_multitask_update(callback_multitask_t callback) {
callback_update_multitask = callback;
}
// Send a new result to a waiting task_id
void send(task_result result) {
std::unique_lock<std::mutex> lock(mutex_results);
LOG_VERBOSE("send new result", {{"task_id", result.id}});
for (auto& task_id : waiting_task_ids) {
// LOG_TEE("waiting task id %i \n", task_id);
// for now, tasks that have an associated parent multitask just get erased once the multitask picks up the result
if (result.multitask_id == task_id)
{
LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
callback_update_multitask(task_id, result.id, result);
continue;
}
if (result.id == task_id)
{
LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
queue_results.push_back(result);
condition_results.notify_all();
return;
}
}
}
};
//
// base64 utils (TODO: move to common in the future)
//
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
static inline bool is_base64(uint8_t c)
{
return (isalnum(c) || (c == '+') || (c == '/'));
}
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
{
int i = 0;
int j = 0;
int in_ = 0;
int in_len = encoded_string.size();
uint8_t char_array_4[4];
uint8_t char_array_3[3];
std::vector<uint8_t> ret;
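// every group of 4 base64 characters (6 bits each) decodes to 3 raw bytes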
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
{
char_array_4[i++] = encoded_string[in_]; in_++;
if (i == 4)
{
for (i = 0; i <4; i++)
{
char_array_4[i] = base64_chars.find(char_array_4[i]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (i = 0; (i < 3); i++)
{
ret.push_back(char_array_3[i]);
}
i = 0;
}
}
if (i)
{
for (j = i; j <4; j++)
{
char_array_4[j] = 0;
}
for (j = 0; j <4; j++)
{
char_array_4[j] = base64_chars.find(char_array_4[j]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (j = 0; (j < i - 1); j++)
{
ret.push_back(char_array_3[j]);
}
}
return ret;
}
//
// random string / id
//
static std::string random_string()
{
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
std::random_device rd;
std::mt19937 generator(rd());
std::string result(32, ' ');
for (int i = 0; i < 32; ++i) {
result[i] = str[generator() % str.size()];
}
return result;
}
static std::string gen_chatcmplid()
{
std::stringstream chatcmplid;
chatcmplid << "chatcmpl-" << random_string();
return chatcmplid.str();
}
//
// other common utils
//
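// length of the longest common prefix of two token sequences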
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
{
size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
{
}
return i;
}
static bool ends_with(const std::string &str, const std::string &suffix)
{
return str.size() >= suffix.size() &&
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
}
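// if the tail of `text` could be the start of `stop`, return the index in `text`
// where that partial match begins; otherwise return std::string::npos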
static size_t find_partial_stop_string(const std::string &stop,
const std::string &text)
{
if (!text.empty() && !stop.empty())
{
const char text_last_char = text.back();
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
{
if (stop[char_index] == text_last_char)
{
const std::string current_partial = stop.substr(0, char_index + 1);
if (ends_with(text, current_partial))
{
return text.size() - char_index - 1;
}
}
}
}
return std::string::npos;
}
// TODO: reuse llama_detokenize
template <class Iter>
static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
{
std::string ret;
for (; begin != end; ++begin)
{
ret += llama_token_to_piece(ctx, *begin);
}
return ret;
}
// format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
{
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
// if the size is 1 and the high bit is set, it's a partial multibyte character
// (size > 1 means it's already a known token)
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
{
std::stringstream ss;
ss << std::hex << (out[0] & 0xff);
std::string res(ss.str());
out = "byte: \\x" + res;
}
return out;
}
// convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
{
json out = json::array();
for (const auto &prob : probs)
{
json probs_for_token = json::array();
for (const auto &p : prob.probs)
{
std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
probs_for_token.push_back(json
{
{"tok_str", tok_str},
{"prob", p.prob},
});
}
std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
out.push_back(json{
{"content", tok_str},
{"probs", probs_for_token},
});
}
return out;
}
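Below is a hedged, illustrative usage sketch (not part of the deleted file) showing how the stop-word helpers above are meant to be combined during streaming: output is only flushed up to the point where it could not still be the beginning of a stop sequence.
// illustrative only - assumes it lives in the same translation unit as the helpers above
#include <iostream>
#include <string>
int main() {
    const std::string stop = "<|end|>";
    const std::string generated = "Hello, world<|en";
    const size_t pos = find_partial_stop_string(stop, generated);
    const std::string safe = (pos == std::string::npos) ? generated : generated.substr(0, pos);
    std::cout << safe << "\n"; // prints "Hello, world"; the trailing "<|en" is held back
    return 0;
}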


@@ -1,122 +0,0 @@
# common logic across linux and darwin
init_vars() {
case "${GOARCH}" in
"amd64")
ARCH="x86_64"
;;
"arm64")
ARCH="arm64"
;;
*)
ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
esac
LLAMACPP_DIR=../llama.cpp
CMAKE_DEFS=""
CMAKE_TARGETS="--target ext_server"
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
else
# TODO - add additional optimization flags...
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
fi
case $(uname -s) in
"Darwin")
LIB_EXT="dylib"
WHOLE_ARCHIVE="-Wl,-force_load"
NO_WHOLE_ARCHIVE=""
GCC_ARCH="-arch ${ARCH}"
;;
"Linux")
LIB_EXT="so"
WHOLE_ARCHIVE="-Wl,--whole-archive"
NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"
# Cross compiling not supported on linux - Use docker
GCC_ARCH=""
;;
*)
;;
esac
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
fi
}
git_module_setup() {
if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
echo "Skipping submodule initialization"
return
fi
# Make sure the tree is clean after the directory moves
if [ -d "${LLAMACPP_DIR}/gguf" ]; then
echo "Cleaning up old submodule"
rm -rf ${LLAMACPP_DIR}
fi
git submodule init
git submodule update --force ${LLAMACPP_DIR}
}
apply_patches() {
# Wire up our CMakefile
if ! grep ollama ${LLAMACPP_DIR}/CMakeLists.txt; then
echo 'add_subdirectory(../ext_server ext_server) # ollama' >>${LLAMACPP_DIR}/CMakeLists.txt
fi
if [ -n "$(ls -A ../patches/*.diff)" ]; then
# apply temporary patches until fix is upstream
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
done
for patch in ../patches/*.diff; do
(cd ${LLAMACPP_DIR} && git apply ${patch})
done
fi
}
build() {
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
mkdir -p ${BUILD_DIR}/lib/
ls ${BUILD_DIR}
g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
${GCC_ARCH} \
${WHOLE_ARCHIVE} ${BUILD_DIR}/ext_server/libext_server.a ${NO_WHOLE_ARCHIVE} \
${BUILD_DIR}/common/libcommon.a \
${BUILD_DIR}/libllama.a \
-Wl,-rpath,\$ORIGIN \
-lpthread -ldl -lm \
${EXTRA_LIBS}
}
compress_libs() {
echo "Compressing payloads to reduce overall binary size..."
pids=""
rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz
for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
gzip -n --best -f ${lib} &
pids+=" $!"
done
echo
for pid in ${pids}; do
wait $pid
done
echo "Finished compression"
}
# Keep the local tree clean after we're done with the build
cleanup() {
(cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt)
if [ -n "$(ls -A ../patches/*.diff)" ]; then
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
done
fi
}


@@ -1,77 +0,0 @@
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be ./llm/generate/
# TODO - add hardening to detect missing tools (cmake, etc.)
set -ex
set -o pipefail
echo "Starting darwin generate script"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches
sign() {
if [ -n "$APPLE_IDENTITY" ]; then
codesign -f --timestamp --deep --options=runtime --sign "$APPLE_IDENTITY" --identifier ai.ollama.ollama $1
fi
}
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"
case "${GOARCH}" in
"amd64")
COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib
compress_libs
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib
compress_libs
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib
compress_libs
;;
"arm64")
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
compress_libs
;;
*)
echo "GOARCH must be set"
echo "this script is meant to be run from within go generate"
exit 1
;;
esac
cleanup


@@ -1,223 +0,0 @@
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be llm/generate/
# First we build one or more CPU based LLM libraries
#
# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
# library dependencies
#
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. The ROCM
# libraries are quite large, and also dynamically load data files at runtime
# which in turn are large, so we don't attempt to carry them as payloads
set -ex
set -o pipefail
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
amdGPUs() {
if [ -n "${AMDGPU_TARGETS}" ]; then
echo "${AMDGPU_TARGETS}"
return
fi
GPU_LIST=(
"gfx900"
"gfx906:xnack-"
"gfx908:xnack-"
"gfx90a:xnack+"
"gfx90a:xnack-"
"gfx940"
"gfx941"
"gfx942"
"gfx1010"
"gfx1012"
"gfx1030"
"gfx1100"
"gfx1101"
"gfx1102"
)
(
IFS=$';'
echo "'${GPU_LIST[*]}'"
)
}
echo "Starting linux generate script"
if [ -z "${CUDACXX}" ]; then
if [ -x /usr/local/cuda/bin/nvcc ]; then
export CUDACXX=/usr/local/cuda/bin/nvcc
else
# Try the default location in case it exists
export CUDACXX=$(command -v nvcc)
fi
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# Users building from source can tune the exact flags we pass to cmake for configuring
# llama.cpp, and we'll build only 1 CPU variant in that case as the default.
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
echo "Building custom CPU"
build
compress_libs
else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
# Note: the following seem to yield slower results than AVX2 - ymmv
# -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
echo "Building LCD CPU"
build
compress_libs
fi
if [ "${ARCH}" == "x86_64" ]; then
#
# ARM chips in M1/M2/M3-based Macs and NVIDIA Tegra devices do not currently support AVX extensions.
#
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
compress_libs
fi
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
build
compress_libs
fi
fi
fi
else
echo "Skipping CPU generation step as requested"
fi
# If needed, look for the default CUDA toolkit location
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
CUDA_LIB_DIR=/usr/local/cuda/lib64
fi
# If needed, look for CUDA on Arch Linux
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
fi
# Allow override in case libcudart is in the wrong place
if [ -z "${CUDART_LIB_DIR}" ]; then
CUDART_LIB_DIR="${CUDA_LIB_DIR}"
fi
if [ -d "${CUDA_LIB_DIR}" ]; then
echo "CUDA libraries detected - building dynamic CUDA library"
init_vars
CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
if [ -n "${CUDA_MAJOR}" ]; then
CUDA_VARIANT=_v${CUDA_MAJOR}
fi
if [ "${ARCH}" == "arm64" ]; then
echo "ARM CPU detected - disabling unsupported AVX instructions"
# ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
#
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
# Disabling has minimal performance effect while maintaining compatibility.
ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
fi
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
build
# Carry the CUDA libs as payloads to help reduce dependency burden on users
#
# TODO - in the future we may shift to packaging these separately and conditionally
# downloading them in the install script.
DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
for lib in libcudart.so libcublas.so libcublasLt.so ; do
DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/"
else
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
fi
done
compress_libs
fi
if [ -z "${ROCM_PATH}" ]; then
# Try the default location in case it exists
ROCM_PATH=/opt/rocm
fi
if [ -z "${CLBlast_DIR}" ]; then
# Try the default location in case it exists
if [ -d /usr/lib/cmake/CLBlast ]; then
export CLBlast_DIR=/usr/lib/cmake/CLBlast
fi
fi
if [ -d "${ROCM_PATH}" ]; then
echo "ROCm libraries detected - building dynamic ROCm library"
if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
fi
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
build
# Record the ROCM dependencies
rm -f "${BUILD_DIR}/lib/deps.txt"
touch "${BUILD_DIR}/lib/deps.txt"
for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
done
# bomb out if for some reason we didn't get a few deps
if [ $(cat "${BUILD_DIR}/lib/deps.txt" | wc -l ) -lt 8 ] ; then
cat "${BUILD_DIR}/lib/deps.txt"
echo "ERROR: deps file short"
exit 1
fi
compress_libs
fi
cleanup


@@ -1,276 +0,0 @@
#!powershell
$ErrorActionPreference = "Stop"
function amdGPUs {
if ($env:AMDGPU_TARGETS) {
return $env:AMDGPU_TARGETS
}
# TODO - load from some common data file for linux + windows build consistency
$GPU_LIST = @(
"gfx900"
"gfx906:xnack-"
"gfx908:xnack-"
"gfx90a:xnack+"
"gfx90a:xnack-"
"gfx940"
"gfx941"
"gfx942"
"gfx1010"
"gfx1012"
"gfx1030"
"gfx1100"
"gfx1101"
"gfx1102"
)
$GPU_LIST -join ';'
}
function init_vars {
$script:SRC_DIR = $(resolve-path "..\..\")
$script:llamacppDir = "../llama.cpp"
$script:cmakeDefs = @(
"-DBUILD_SHARED_LIBS=on",
"-DLLAMA_NATIVE=off"
)
$script:cmakeTargets = @("ext_server")
$script:ARCH = "amd64" # arm not yet supported.
if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
$script:config = "RelWithDebInfo"
} else {
$script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off", "-DCMAKE_BUILD_TYPE=Release")
$script:config = "Release"
}
if ($null -ne $env:CMAKE_SYSTEM_VERSION) {
$script:cmakeDefs += @("-DCMAKE_SYSTEM_VERSION=${env:CMAKE_SYSTEM_VERSION}")
}
# Try to find the CUDA dir
if ($env:CUDA_LIB_DIR -eq $null) {
$d=(get-command -ea 'silentlycontinue' nvcc).path
if ($d -ne $null) {
$script:CUDA_LIB_DIR=($d| split-path -parent)
$script:CUDA_INCLUDE_DIR=($script:CUDA_LIB_DIR|split-path -parent)+"\include"
}
} else {
$script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
}
$script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
$script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
} else {
$script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
}
# Note: Windows Kits 10 signtool crashes with GCP's plugin
if ($null -eq $env:SIGN_TOOL) {
${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe"
} else {
${script:SignTool}=${env:SIGN_TOOL}
}
if ("${env:KEY_CONTAINER}") {
${script:OLLAMA_CERT}=$(resolve-path "${script:SRC_DIR}\ollama_inc.crt")
}
}
function git_module_setup {
# TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
& git submodule init
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& git submodule update --force "${script:llamacppDir}"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
function apply_patches {
# Wire up our CMakefile
if (!(Select-String -Path "${script:llamacppDir}/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "${script:llamacppDir}/CMakeLists.txt" -Value 'add_subdirectory(../ext_server ext_server) # ollama'
}
# Apply temporary patches until fix is upstream
$patches = Get-ChildItem "../patches/*.diff"
foreach ($patch in $patches) {
# Extract file paths from the patch file
$filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
$parts = $_ -split ' '
($parts[1] -split '/', 2)[1]
}
# Checkout each file
Set-Location -Path ${script:llamacppDir}
foreach ($file in $filePaths) {
git checkout $file
}
}
# Apply each patch
foreach ($patch in $patches) {
Set-Location -Path ${script:llamacppDir}
git apply $patch.FullName
}
}
function build {
write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
& cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
function install {
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
# Display the dll dependencies in the build log
if ($script:DUMPBIN -ne $null) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
}
}
function sign {
if ("${env:KEY_CONTAINER}") {
write-host "Signing ${script:buildDir}/lib/*.dll"
foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){
& "${script:SignTool}" sign /v /debug /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function compress_libs {
if ($script:GZIP -eq $null) {
write-host "gzip not installed, not compressing files"
return
}
write-host "Compressing dlls..."
$libs = dir "${script:buildDir}/lib/*.dll"
foreach ($file in $libs) {
& "$script:GZIP" --best -f $file
}
}
function cleanup {
$patches = Get-ChildItem "../patches/*.diff"
foreach ($patch in $patches) {
# Extract file paths from the patch file
$filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
$parts = $_ -split ' '
($parts[1] -split '/', 2)[1]
}
# Checkout each file
Set-Location -Path ${script:llamacppDir}
foreach ($file in $filePaths) {
git checkout $file
}
}
Set-Location "${script:llamacppDir}/"
git checkout CMakeLists.txt
}
init_vars
git_module_setup
apply_patches
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
write-host "Building LCD CPU"
build
install
sign
compress_libs
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
write-host "Building AVX CPU"
build
install
sign
compress_libs
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
write-host "Building AVX2 CPU"
build
install
sign
compress_libs
} else {
write-host "Skipping CPU generation step as requested"
}
if ($null -ne $script:CUDA_LIB_DIR) {
# Then build cuda as a dynamically loaded library
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
if ($null -ne $script:CUDA_VERSION) {
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
}
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
write-host "Building CUDA"
build
install
sign
compress_libs
}
if ($null -ne $env:HIP_PATH) {
$script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
if ($null -ne $script:ROCM_VERSION) {
$script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
}
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
$script:cmakeDefs += @(
"-G", "Ninja",
"-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe",
"-DLLAMA_HIPBLAS=on",
"-DLLAMA_AVX=on",
"-DLLAMA_AVX2=off",
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
"-DAMDGPU_TARGETS=$(amdGPUs)",
"-DGPU_TARGETS=$(amdGPUs)"
)
# Make sure the ROCm binary dir is first in the path
$env:PATH="$env:HIP_PATH\bin;$env:PATH"
# We have to clobber the LIB var from the developer shell for clang to work properly
$env:LIB=""
write-host "Building ROCm"
build
# Ninja doesn't prefix with config name
${script:config}=""
install
if ($null -ne $script:DUMPBIN) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
}
sign
compress_libs
}
cleanup
write-host "`ngo generate completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\llama.cpp\build\windows\${script:ARCH})"


@@ -1,3 +0,0 @@
package generate
//go:generate sh ./gen_darwin.sh


@@ -1,3 +0,0 @@
package generate
//go:generate bash ./gen_linux.sh


@@ -1,3 +0,0 @@
package generate
//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1


@@ -1,162 +0,0 @@
package llm
import (
"encoding/binary"
"errors"
"io"
"slices"
)
type ContainerGGLA struct {
version uint32
}
func (c *ContainerGGLA) Name() string {
return "ggla"
}
func (c *ContainerGGLA) Decode(rs io.ReadSeeker) (model, error) {
binary.Read(rs, binary.LittleEndian, &c.version)
switch c.version {
case 1:
default:
return nil, errors.New("invalid version")
}
model := newModelGGLA(c)
err := model.decode(rs)
return model, err
}
type ModelGGLA struct {
*ContainerGGLA
kv KV
tensors []Tensor
}
func newModelGGLA(container *ContainerGGLA) *ModelGGLA {
return &ModelGGLA{
ContainerGGLA: container,
kv: make(KV),
}
}
func (m *ModelGGLA) decode(rs io.ReadSeeker) error {
var r uint32
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
return err
}
m.kv["r"] = r
var alpha uint32
if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {
return err
}
m.kv["alpha"] = alpha
for {
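// tensor entries are read back to back; the loop ends when binary.Read hits
// io.EOF, which is returned to the caller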
var dims uint32
if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
return err
}
var namesize uint32
if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
return err
}
var t Tensor
if err := binary.Read(rs, binary.LittleEndian, &t.Kind); err != nil {
return err
}
t.Shape = make([]uint64, dims)
for i := 0; uint32(i) < dims; i++ {
var shape32 uint32
if err := binary.Read(rs, binary.LittleEndian, &shape32); err != nil {
return err
}
t.Shape[i] = uint64(shape32)
}
// ggla tensor shape is reversed
// ref: https://github.com/ggerganov/llama.cpp/blob/29ae62d2ae163e2b68aa0ad3bf2ab4636de0c957/convert-lora-to-ggml.py#L44
slices.Reverse(t.Shape)
name := make([]byte, namesize)
if err := binary.Read(rs, binary.LittleEndian, &name); err != nil {
return err
}
t.Name = string(name)
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
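// tensor data is 32-byte aligned: (offset+31)&-32 rounds the current offset up
// to the next multiple of 32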
if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
return err
}
offset, err = rs.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
t.Offset = uint64(offset)
if _, err := rs.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
return err
}
m.tensors = append(m.tensors, t)
}
}
func (m *ModelGGLA) KV() KV {
return m.kv
}
func (m *ModelGGLA) Tensor() []Tensor {
return m.tensors
}
func (*ModelGGLA) ModelFamily() string {
return "ggla"
}
func (*ModelGGLA) ModelType() string {
panic("not implemented")
}
func (*ModelGGLA) FileType() string {
panic("not implemented")
}
func (*ModelGGLA) NumLayers() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumGQA() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumEmbed() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumHead() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumHeadKv() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumCtx() uint32 {
panic("not implemented")
}


@@ -103,7 +103,33 @@ type model interface {
type container interface {
Name() string
Decode(io.ReadSeeker) (model, error)
Decode(*readSeekOffset) (model, error)
}
type containerLORA struct {
version uint32
}
func (c *containerLORA) Name() string {
return "ggla"
}
func (c *containerLORA) Decode(rso *readSeekOffset) (model, error) {
var version uint32
binary.Read(rso, binary.LittleEndian, &version)
switch version {
case 1:
default:
return nil, errors.New("invalid version")
}
c.version = version
// remaining file contents aren't decoded
rso.Seek(0, io.SeekEnd)
return nil, nil
}
const (
@@ -122,9 +148,11 @@ const (
var ErrUnsupportedFormat = errors.New("unsupported model format")
func DecodeGGML(rs io.ReadSeeker) (*GGML, error) {
func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
ro := readSeekOffset{ReadSeeker: r}
var magic uint32
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
if err := binary.Read(&ro, binary.LittleEndian, &magic); err != nil {
return nil, err
}
@@ -133,23 +161,16 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, error) {
case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
return nil, ErrUnsupportedFormat
case FILE_MAGIC_GGLA:
c = &ContainerGGLA{}
c = &containerLORA{}
case FILE_MAGIC_GGUF_LE:
c = &ContainerGGUF{ByteOrder: binary.LittleEndian}
c = &containerGGUF{bo: binary.LittleEndian}
case FILE_MAGIC_GGUF_BE:
c = &ContainerGGUF{ByteOrder: binary.BigEndian}
c = &containerGGUF{bo: binary.BigEndian}
default:
return nil, errors.New("invalid file magic")
}
model, err := c.Decode(rs)
if errors.Is(err, io.EOF) {
// noop
} else if err != nil {
return nil, err
}
offset, err := rs.Seek(0, io.SeekCurrent)
model, err := c.Decode(&ro)
if err != nil {
return nil, err
}
@@ -158,6 +179,27 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, error) {
return &GGML{
container: c,
model: model,
Size: offset,
Size: ro.offset,
}, nil
}
type readSeekOffset struct {
io.ReadSeeker
offset int64
}
func (rso *readSeekOffset) Seek(offset int64, whence int) (int64, error) {
offset, err := rso.ReadSeeker.Seek(offset, whence)
if err != nil {
return 0, err
}
rso.offset = offset
return offset, nil
}
func (rso *readSeekOffset) Read(p []byte) (int, error) {
n, err := rso.ReadSeeker.Read(p)
rso.offset += int64(n)
return n, err
}
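// Usage note (illustrative): DecodeGGML above wraps the caller's reader as
// ro := readSeekOffset{ReadSeeker: r}, hands &ro to the container's Decode, and
// then stores ro.offset - the reader's final offset - as GGML.Size, avoiding a
// separate Seek to measure it.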


File diff suppressed because it is too large

Submodule llm/llama.cpp deleted from ad3a0505e3


@@ -5,7 +5,7 @@ import (
"fmt"
"time"
"github.com/ollama/ollama/api"
"github.com/jmorganca/ollama/api"
)
const jsonGrammar = `


@@ -6,10 +6,10 @@ import (
"log/slog"
"os"
"runtime"
"slices"
"time"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/gpu"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/gpu"
)
type LLM interface {
@@ -20,11 +20,7 @@ type LLM interface {
Close()
}
var cpuOnlyFamilies = []string{
"mamba",
}
func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@@ -53,18 +49,13 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
size := ggml.Size
// fp16 k,v matrices require n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 (key and value)
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1))
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
// this amount is the overhead + tensors in memory
// TODO: get this from llama.cpp's graph calculations instead of
// estimating it as 1/6 * kv_cache_size * num_gqa
graph := int64(ggml.NumGQA()) * kv / 6
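// Hedged worked example (hypothetical 7B-class values, not taken from this diff):
// with NumCtx=2048, NumLayers()=32, NumEmbed()=4096, NumHead()=32, NumHeadKv()=32,
// kv = 2 * 2 * 2048 * 32 * 4096 * 32/32 = 1 GiB, and assuming NumGQA()=1 for this
// non-GQA configuration, graph ≈ kv/6 ≈ 179 MiB.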
// certain model architectures don't support gpu inference yet
if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) {
opts.NumGPU = 0
}
info := gpu.GetGPUInfo()
switch runtime.GOOS {
case "darwin":
@@ -73,7 +64,9 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
}
if size+kv+graph > vram {
slog.Info("not enough vram available, setting num_gpu=0")
slog.Info("not enough vram available, falling back to CPU only")
info.Library = "cpu"
info.Variant = gpu.GetCPUVariant()
opts.NumGPU = 0
break
}
@@ -128,15 +121,15 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0
return newLlmServer(info, model, adapters, projectors, opts)
return newLlmServer(info, workDir, model, adapters, projectors, opts)
}
// Give any native cgo implementations an opportunity to initialize
func Init() error {
return nativeInit()
func Init(workdir string) error {
return nativeInit(workdir)
}
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
dynLibs := getDynLibs(gpuInfo)
// Check to see if the user has requested a specific library instead of auto-detecting
@@ -155,7 +148,7 @@ func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []stri
_, err := os.Stat(dynLibs[0])
if err != nil {
slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
err = nativeInit()
err = nativeInit(workDir)
if err != nil {
return nil, err
}
@@ -173,3 +166,12 @@ func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []stri
return nil, err2
}
func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil {
panic(err)
}
return dur
}

14
llm/llm_darwin_amd64.go Normal file

@@ -0,0 +1,14 @@
//go:generate cmake -S server -B server/build/cpu -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off
//go:generate cmake -S server -B server/build/cpu_avx -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on
//go:generate cmake -S server -B server/build/cpu_avx2 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=on
//go:generate cmake --build server/build/cpu --target server -- -j4
//go:generate cmake --build server/build/cpu_avx --target server -- -j4
//go:generate cmake --build server/build/cpu_avx2 --target server -- -j4
package llm
import "embed"
//go:embed server/build/cpu/server
//go:embed server/build/cpu_avx/server
//go:embed server/build/cpu_avx2/server
var libEmbed embed.FS

8
llm/llm_darwin_arm64.go Normal file

@@ -0,0 +1,8 @@
//go:generate cmake -S server -B server/build/metal -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64
//go:generate cmake --build server/build/metal --target server -- -j4
package llm
import "embed"
//go:embed server/build/metal/ggml-metal.metal server/build/metal/server
var libEmbed embed.FS


@@ -1,44 +0,0 @@
diff --git a/llama.cpp b/llama.cpp
index 4225f955..7b762f86 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4756,7 +4756,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
}
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
- return -1;
+ throw;
}
return 0;
@@ -12102,16 +12102,22 @@ struct llama_model * llama_load_model_from_file(
};
}
- int status = llama_model_load(path_model, *model, params);
- GGML_ASSERT(status <= 0);
- if (status < 0) {
- if (status == -1) {
- LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
- } else if (status == -2) {
- LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+ try {
+ int status = llama_model_load(path_model, *model, params);
+ GGML_ASSERT(status <= 0);
+ if (status < 0) {
+ if (status == -1) {
+ LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+ } else if (status == -2) {
+ LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+ }
+ delete model;
+ return nullptr;
}
+ } catch (...) {
+ LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
delete model;
- return nullptr;
+ throw;
}
return model;


@@ -1,13 +0,0 @@
diff --git a/llama.cpp b/llama.cpp
index b27aa272..99372f9c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9360,7 +9360,7 @@ struct llm_tokenizer_wpm {
}
uint32_t to_lower(uint32_t code) {
- static const std::locale locale("en_US.UTF-8");
+ static const std::locale locale("");
#if defined(_WIN32)
if (code > 0xFFFF) {
return code;


@@ -11,12 +11,11 @@ import (
"path/filepath"
"runtime"
"strings"
"sync"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
"github.com/jmorganca/ollama/gpu"
)
// Libraries names may contain an optional variant separated by '_'
@@ -104,17 +103,24 @@ func rocmDynLibPresent() bool {
return false
}
func nativeInit() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
func nativeInit(workdir string) error {
slog.Info("Extracting dynamic libraries...")
if runtime.GOOS == "darwin" {
err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
if err != nil {
if err == payloadMissing {
// TODO perhaps consider this a hard failure on arm macs?
slog.Info("ggml-meta.metal payload missing")
return nil
}
return err
}
os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
}
slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir))
libs, err := extractDynamicLibs(payloadsDir, "llama.cpp/build/*/*/*/lib/*")
libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/*/lib/*")
if err != nil {
if errors.Is(err, payloadMissing) {
if err == payloadMissing {
slog.Info(fmt.Sprintf("%s", payloadMissing))
return nil
}
@@ -143,15 +149,18 @@ func nativeInit() error {
return nil
}
func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
func extractDynamicLibs(workDir, glob string) ([]string, error) {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return nil, payloadMissing
}
libs := []string{}
var mu sync.Mutex
var libs []string
var g errgroup.Group
// TODO consider making this idempotent with some sort of persistent directory (where we store models probably)
// and tracking by version so we don't reexpand the files every time
// Also maybe consider lazy loading only what is needed
g := new(errgroup.Group)
for _, file := range files {
pathComps := strings.Split(file, "/")
if len(pathComps) != pathComponentCount {
@@ -163,14 +172,14 @@ func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
g.Go(func() error {
// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
// Include the variant in the path to avoid conflicts between multiple server libs
targetDir := filepath.Join(payloadsDir, pathComps[pathComponentCount-3])
targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("create payload lib dir %s: %v", payloadsDir, err)
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
}
src := io.Reader(srcFile)
filename := file
@@ -184,29 +193,71 @@ func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
destFile := filepath.Join(targetDir, filepath.Base(filename))
if strings.Contains(destFile, "server") {
mu.Lock()
libs = append(libs, destFile)
mu.Unlock()
}
destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFp.Close()
if _, err := io.Copy(destFp, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", file, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return nil, err
return libs, g.Wait()
}
func extractPayloadFiles(workDir, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return payloadMissing
}
return libs, nil
for _, file := range files {
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(workDir, 0o755); err != nil {
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
}
src := io.Reader(srcFile)
filename := file
if strings.HasSuffix(file, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", file, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
destFile := filepath.Join(workDir, filepath.Base(filename))
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", file, err)
}
}
return nil
}
func verifyDriverAccess() error {


@@ -1,8 +0,0 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
var libEmbed embed.FS


@@ -1,8 +0,0 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
var libEmbed embed.FS

Some files were not shown because too many files have changed in this diff