Backport macOS SDK fix from main

Apply 01-cache.diff
2026-01-06 14:40:27 -05:00 · 2024-04-04 11:17:48 -07:00 · 2024-04-04 10:11:23 -07:00
102 changed files with 3263 additions and 5869 deletions
--- a/.github/ISSUE_TEMPLATE/90_bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/90_bug_report.yml
@@ -19,7 +19,7 @@ body:
      label: What did you expect to see?
      description: What did you expect to see/happen instead?
    validations:
-      required: false
+      required: true
  - type: textarea
    id: steps
    attributes:
--- a/.github/workflows/latest.yaml
+++ b/.github/workflows/latest.yaml
@@ -1,24 +0,0 @@
-name: latest
-
-on:
-  release:
-    types: [released]
-
-jobs:
-  update-latest:
-    environment: release
-    runs-on: linux
-    steps:
-      - uses: actions/checkout@v4
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USER }}
-          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
-      - name: Tag images as latest
-        env:
-          PUSH: "1"
-        shell: bash
-        run: |
-          export "VERSION=${GITHUB_REF_NAME#v}"
-          ./scripts/tag_latest.sh
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -30,7 +30,7 @@ jobs:
          security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k password build.keychain
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: true
      - name: Build Darwin
        env:
@@ -42,7 +42,7 @@ jobs:
          DEVELOPER_DIR: /Applications/Xcode_13.4.1.app/Contents/Developer
        run: |
          ./scripts/build_darwin.sh
-
+        
      - uses: actions/upload-artifact@v4
        with:
          name: dist-darwin
@@ -50,6 +50,7 @@ jobs:
            dist/*arwin*
            !dist/*-cov

+
  # Windows builds take a long time to both install the dependencies and build, so parallelize
  # CPU generation step
  generate-windows-cpu:
@@ -86,7 +87,7 @@ jobs:
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: true
      - run: go get ./...
      - run: |
@@ -100,9 +101,7 @@ jobs:
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cpu
-          path: |
-            llm/build/**/bin/*
-            llm/build/**/*.a
+          path: llm/llama.cpp/build/**/lib/*

  # ROCm generation step
  generate-windows-rocm:
@@ -139,9 +138,9 @@ jobs:
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: true
-      - name: 'Install ROCm'
+      - name: "Install ROCm"
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading AMD HIP Installer"
@@ -149,7 +148,7 @@ jobs:
          write-host "Installing AMD HIP"
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP"
-      - name: 'Verify ROCm'
+      - name: "Verify ROCm"
        run: |
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
      - run: go get ./...
@@ -163,7 +162,7 @@ jobs:
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          go generate -x ./...
        name: go generate
-      - name: 'gather rocm dependencies'
+      - name: "gather rocm dependencies"
        run: |
          $HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          md "dist\deps\bin\rocblas\library"
@@ -173,7 +172,7 @@ jobs:
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-rocm
-          path: llm/build/**/bin/*
+          path: llm/llama.cpp/build/**/lib/*
      - uses: actions/upload-artifact@v4
        with:
          name: windows-rocm-deps
@@ -214,36 +213,29 @@ jobs:
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: true
-      - name: 'Install CUDA'
-        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
-          write-host "Installing CUDA"
-          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
-          write-host "Completed CUDA"
-          $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
-          $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' 
-          echo "$cudaPath\bin" >> $env:GITHUB_PATH
-          echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
-      - name: 'Verify CUDA'
+      # TODO - consider replacing this action with a ps1 snippet to install
+      # This action seems to fail sometimes with "no tools in cache" but a re-run of the failed job clears it
+      # https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+      - name: "Install CUDA"
+        uses: Jimver/cuda-toolkit@v0.2.14
+        id: cuda-toolkit
+        with:
+          cuda: '11.3.1'      
+      - name: "Verify CUDA"
        run: nvcc -V
      - run: go get ./...
      - name: go generate
        run: |
          $gopath=(get-command go).source | split-path -parent
-          $cudabin=(get-command nvcc).source | split-path
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
          cd $env:GITHUB_WORKSPACE
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$cudabin;$env:PATH"
+          $env:PATH="$gopath;$env:PATH"
          $env:OLLAMA_SKIP_CPU_GENERATE="1"
          go generate -x ./...
-      - name: 'gather cuda dependencies'
+      - name: "gather cuda dependencies"
        run: |
          $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0]
          md "dist\deps"
@@ -253,7 +245,7 @@ jobs:
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cuda
-          path: llm/build/**/bin/*
+          path: llm/llama.cpp/build/**/lib/*
      - uses: actions/upload-artifact@v4
        with:
          name: windows-cuda-deps
@@ -300,17 +292,17 @@ jobs:
          write-host "plugin installed"
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: true
      - run: go get
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-cpu
-          path: llm/build
+          path: llm/llama.cpp/build
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-cuda
-          path: llm/build
+          path: llm/llama.cpp/build
      - uses: actions/download-artifact@v4
        with:
          name: windows-cuda-deps
@@ -322,8 +314,8 @@ jobs:
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-rocm
-          path: llm/build
-      - run: dir llm/build
+          path: llm/llama.cpp/build
+      - run: dir llm/llama.cpp/build
      - run: |
          $gopath=(get-command go).source | split-path -parent
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
@@ -339,14 +331,14 @@ jobs:
          name: dist-windows
          path: dist/*.exe

-  # Linux x86 assets built using the container based build
+  # Linux x86 assets built using the container based build 
  build-linux-amd64:
    environment: release
    runs-on: linux
    env:
-      OLLAMA_SKIP_MANIFEST_CREATE: '1'
+      OLLAMA_SKIP_MANIFEST_CREATE: "1"
      BUILD_ARCH: amd64
-      PUSH: '1'
+      PUSH: "1"
    steps:
      - uses: actions/checkout@v4
        with:
@@ -376,9 +368,9 @@ jobs:
    environment: release
    runs-on: linux-arm64
    env:
-      OLLAMA_SKIP_MANIFEST_CREATE: '1'
+      OLLAMA_SKIP_MANIFEST_CREATE: "1"
      BUILD_ARCH: arm64
-      PUSH: '1'
+      PUSH: "1"
    steps:
      - uses: actions/checkout@v4
        with:
@@ -386,7 +378,7 @@ jobs:
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - name: 'Install Docker'
+      - name: "Install Docker"
        run: |
          # Add Docker's official GPG key:
          env
@@ -423,7 +415,7 @@ jobs:
            !dist/*-cov

  # Aggregate all the assets and ship a release
-  release:
+  release: 
    needs:
      - build-darwin
      - build-windows
@@ -434,8 +426,8 @@ jobs:
    permissions:
      contents: write
    env:
-      OLLAMA_SKIP_IMAGE_BUILD: '1'
-      PUSH: '1'
+      OLLAMA_SKIP_IMAGE_BUILD: "1"
+      PUSH: "1"
    steps:
      - uses: actions/checkout@v4
      - name: Set Version
@@ -463,11 +455,11 @@ jobs:
        with:
          name: ${{ env.RELEASE_VERSION }}
          allowUpdates: true
-          artifacts: 'dist/*'
+          artifacts: "dist/*"
          draft: true
          prerelease: true
          omitBodyDuringUpdate: true
          generateReleaseNotes: true
          omitDraftDuringUpdate: true
          omitPrereleaseDuringUpdate: true
-          replacesArtifacts: true
+          replacesArtifacts: true
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -5,13 +5,13 @@ on:
    paths:
      - '**/*'
      - '!docs/**'
+      - '!examples/**'
      - '!README.md'

 jobs:
  changes:
    runs-on: ubuntu-latest
    outputs:
-      GENERATE: ${{ steps.changes.outputs.GENERATE }}
      GENERATE_CUDA: ${{ steps.changes.outputs.GENERATE_CUDA }}
      GENERATE_ROCM: ${{ steps.changes.outputs.GENERATE_ROCM }}
    steps:
@@ -50,29 +50,25 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: true
      - run: go get ./...
      - run: |
          $gopath=(get-command go).source | split-path -parent
-          $gccpath=(get-command gcc).source | split-path -parent
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
          cd $env:GITHUB_WORKSPACE
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$gccpath;$env:PATH"
-          echo $env:PATH
+          $env:PATH="$gopath;$env:PATH"
          go generate -x ./...
        if: ${{ startsWith(matrix.os, 'windows-') }}
-        name: 'Windows Go Generate'
+        name: "Windows Go Generate"
      - run: go generate -x ./...
        if: ${{ ! startsWith(matrix.os, 'windows-') }}
-        name: 'Unix Go Generate'
+        name: "Unix Go Generate"
      - uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-          path: |
-            llm/build/**/bin/*
-            llm/build/**/*.a
+          path: llm/llama.cpp/build/**/lib/*
  generate-cuda:
    needs: [changes]
    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
@@ -92,7 +88,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v4
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: true
      - run: go get ./...
      - run: |
@@ -103,14 +99,14 @@ jobs:
      - uses: actions/upload-artifact@v4
        with:
          name: cuda-${{ matrix.cuda-version }}-libraries
-          path: llm/build/**/bin/*
+          path: llm/llama.cpp/build/**/lib/*
  generate-rocm:
    needs: [changes]
    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
    strategy:
      matrix:
        rocm-version:
-          - '6.0.2'
+          - '6.0'
    runs-on: linux
    container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
    steps:
@@ -123,7 +119,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v4
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: true
      - run: go get ./...
      - run: |
@@ -134,87 +130,7 @@ jobs:
      - uses: actions/upload-artifact@v4
        with:
          name: rocm-${{ matrix.rocm-version }}-libraries
-          path: llm/build/**/bin/*
-
-  # ROCm generation step
-  generate-windows-rocm:
-    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
-    runs-on: windows
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-go@v5
-        with:
-          go-version-file: go.mod
-          cache: true
-      - name: 'Install ROCm'
-        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading AMD HIP Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP"
-      - name: 'Verify ROCm'
-        run: |
-          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-      - run: go get ./...
-      - run: |
-          $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          go generate -x ./...
-        name: go generate
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
-      # TODO - do we need any artifacts?
-
-  # CUDA generation step
-  generate-windows-cuda:
-    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
-    runs-on: windows
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-go@v5
-        with:
-          go-version-file: go.mod
-          cache: true
-      - name: 'Install CUDA'
-        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
-          write-host "Installing CUDA"
-          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
-          write-host "Completed CUDA"
-          $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
-          $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' 
-          echo "$cudaPath\bin" >> $env:GITHUB_PATH
-          echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
-      - name: 'Verify CUDA'
-        run: nvcc -V
-      - run: go get ./...
-      - name: go generate
-        run: |
-          $gopath=(get-command go).source | split-path -parent
-          $cudabin=(get-command nvcc).source | split-path
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$cudabin;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          go generate -x ./...
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
-      # TODO - do we need any artifacts?
-
+          path: llm/llama.cpp/build/**/lib/*
  lint:
    strategy:
      matrix:
@@ -237,7 +153,7 @@ jobs:
          submodules: recursive
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: false
      - run: |
          case ${{ matrix.arch }} in
@@ -246,21 +162,19 @@ jobs:
          esac >>$GITHUB_ENV
        shell: bash
      - run: |
-          mkdir -p llm/build/linux/$ARCH/stub/bin
-          touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
+          mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/
+          touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so
        if: ${{ startsWith(matrix.os, 'ubuntu-') }}
      - run: |
-          mkdir -p llm/build/darwin/$ARCH/stub/bin
-          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
+          mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/
+          touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib
+          touch llm/llama.cpp/ggml-metal.metal
        if: ${{ startsWith(matrix.os, 'macos-') }}
      - run: |
-          mkdir -p llm/build/windows/$ARCH/stub/bin
-          touch llm/build/windows/$ARCH/stub/bin/ollama_llama_server
+          mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/
+          touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll
        if: ${{ startsWith(matrix.os, 'windows-') }}
-        shell: bash
-      - uses: golangci/golangci-lint-action@v4
-        with:
-          args: --timeout 8m0s
+      - uses: golangci/golangci-lint-action@v3
  test:
    strategy:
      matrix:
@@ -275,14 +189,13 @@ jobs:
    env:
      GOARCH: ${{ matrix.arch }}
      CGO_ENABLED: '1'
-      OLLAMA_CPU_TARGET: 'static'
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-go@v5
        with:
-          go-version-file: go.mod
+          go-version: '1.22'
          cache: true
      - run: go get
      - run: |
@@ -292,19 +205,18 @@ jobs:
          esac >>$GITHUB_ENV
        shell: bash
      - run: |
-          mkdir -p llm/build/linux/$ARCH/stub/bin
-          touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
+          mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/
+          touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so
        if: ${{ startsWith(matrix.os, 'ubuntu-') }}
      - run: |
-          mkdir -p llm/build/darwin/$ARCH/stub/bin
-          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
+          mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/
+          touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib
+          touch llm/llama.cpp/ggml-metal.metal
        if: ${{ startsWith(matrix.os, 'macos-') }}
      - run: |
-          mkdir -p llm/build/windows/$ARCH/stub/bin
-          touch llm/build/windows/$ARCH/stub/bin/ollama_llama_server
+          mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/
+          touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll
        if: ${{ startsWith(matrix.os, 'windows-') }}
-        shell: bash
-      - run: go generate ./...
      - run: go build
      - run: go test -v ./...
      - uses: actions/upload-artifact@v4
--- a/.gitignore
+++ b/.gitignore
@@ -10,5 +10,4 @@ ggml-metal.metal
 *.exe
 .idea
 test_data
-*.crt
-llm/build
+*.crt
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -15,3 +15,13 @@ linters:
    - misspell
    - nilerr
    - unused
+linters-settings:
+  errcheck:
+    # exclude the following functions since we don't generally
+    # need to be concerned with the returned errors
+    exclude-functions:
+      - encoding/binary.Read
+      - (*os.File).Seek
+      - (*bufio.Writer).WriteString
+      - (*github.com/spf13/pflag.FlagSet).Set
+      - (*github.com/ollama/ollama/llm.readSeekOffset).Seek
--- a/29
+++ b/29
@@ -2,7 +2,7 @@ ARG GOLANG_VERSION=1.22.1
 ARG CMAKE_VERSION=3.22.1
 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md
 ARG CUDA_VERSION=11.3.1
-ARG ROCM_VERSION=6.0.2
+ARG ROCM_VERSION=6.0

 # Copy the minimal context we need to run the generate scripts
 FROM scratch AS llm-code
@@ -42,7 +42,7 @@ ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
 RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 RUN mkdir /tmp/scratch && \
-    for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \
+    for dep in $(cat /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/x86_64/rocm*/lib/deps.txt) ; do \
        cp ${dep} /tmp/scratch/ || exit 1 ; \
    done && \
    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
@@ -61,8 +61,6 @@ ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate

-FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
-RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
 RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
@@ -70,33 +68,28 @@ RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
 RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh

-FROM --platform=linux/arm64 centos:7 AS cpu-builder-arm64
+FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+# Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
-
-FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
-RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
-FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
 RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh

-
 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
 ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
 ARG GOFLAGS
 ARG CGO_CFLAGS
@@ -108,8 +101,8 @@ ENV CGO_ENABLED 1
 ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+RUN mkdir -p /go/src/github.com/ollama/ollama/dist/deps/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN go build -trimpath .
--- a/README.md
+++ b/README.md
@@ -64,7 +64,6 @@ Here are some example models that can be downloaded:
 | LLaVA              | 7B         | 4.5GB | `ollama run llava`             |
 | Gemma              | 2B         | 1.4GB | `ollama run gemma:2b`          |
 | Gemma              | 7B         | 4.8GB | `ollama run gemma:7b`          |
-| Solar              | 10.7B      | 6.1GB | `ollama run solar`             |

 > Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.

@@ -260,7 +259,6 @@ See the [API documentation](./docs/api.md) for all endpoints.

 ### Web & Desktop

- [Lollms-Webui](https://github.com/ParisNeo/lollms-webui)
 - [LibreChat](https://github.com/danny-avila/LibreChat)
 - [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
 - [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
@@ -291,9 +289,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
 - [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
- [ChatOllama: Open Source Chatbot based on Ollama with Knowledge Bases](https://github.com/sugarforever/chat-ollama)
- [CRAG Ollama Chat: Simple Web Search with Corrective RAG](https://github.com/Nagi-ovo/CRAG-Ollama-Chat)
- [RAGFlow: Open-source Retrieval-Augmented Generation engine based on deep document understanding](https://github.com/infiniflow/ragflow)

 ### Terminal

@@ -317,8 +312,7 @@ See the [API documentation](./docs/api.md) for all endpoints.

 ### Database

- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
- [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
+- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md)

 ### Package managers

@@ -377,4 +371,3 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
 - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
 - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
--- a/api/client.go
+++ b/api/client.go
@@ -1,9 +1,3 @@
-// Package api implements the client-side API for code wishing to interact
-// with the ollama service. The methods of the [Client] type correspond to
-// the ollama REST API as described in https://github.com/ollama/ollama/blob/main/docs/api.md
-//
-// The ollama command-line client itself uses this package to interact with
-// the backend service.
 package api

 import (
@@ -11,6 +5,7 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io"
 	"net"
@@ -24,8 +19,6 @@ import (
 	"github.com/ollama/ollama/version"
 )

-// Client encapsulates client state for interacting with the ollama
-// service. Use [ClientFromEnvironment] to create new Clients.
 type Client struct {
 	base *url.URL
 	http *http.Client
@@ -47,15 +40,6 @@ func checkError(resp *http.Response, body []byte) error {
 	return apiError
 }

-// ClientFromEnvironment creates a new [Client] using configuration from the
-// environment variable OLLAMA_HOST, which points to the network host and
-// port on which the ollama service is listenting. The format of this variable
-// is:
-//
-//	<scheme>://<host>:<port>
-//
-// If the variable is not specified, a default ollama host and port will be
-// used.
 func ClientFromEnvironment() (*Client, error) {
 	defaultPort := "11434"

@@ -207,14 +191,8 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	return nil
 }

-// GenerateResponseFunc is a function that [Client.Generate] invokes every time
-// a response is received from the service. If this function returns an error,
-// [Client.Generate] will stop generating and return this error.
 type GenerateResponseFunc func(GenerateResponse) error

-// Generate generates a response for a given prompt. The req parameter should
-// be populated with prompt details. fn is called for each response (there may
-// be multiple responses, e.g. in case streaming is enabled).
 func (c *Client) Generate(ctx context.Context, req *GenerateRequest, fn GenerateResponseFunc) error {
 	return c.stream(ctx, http.MethodPost, "/api/generate", req, func(bts []byte) error {
 		var resp GenerateResponse
@@ -226,15 +204,8 @@ func (c *Client) Generate(ctx context.Context, req *GenerateRequest, fn Generate
 	})
 }

-// ChatResponseFunc is a function that [Client.Chat] invokes every time
-// a response is received from the service. If this function returns an error,
-// [Client.Chat] will stop generating and return this error.
 type ChatResponseFunc func(ChatResponse) error

-// Chat generates the next message in a chat. [ChatRequest] may contain a
-// sequence of messages which can be used to maintain chat history with a model.
-// fn is called for each response (there may be multiple responses, e.g. if case
-// streaming is enabled).
 func (c *Client) Chat(ctx context.Context, req *ChatRequest, fn ChatResponseFunc) error {
 	return c.stream(ctx, http.MethodPost, "/api/chat", req, func(bts []byte) error {
 		var resp ChatResponse
@@ -246,14 +217,8 @@ func (c *Client) Chat(ctx context.Context, req *ChatRequest, fn ChatResponseFunc
 	})
 }

-// PullProgressFunc is a function that [Client.Pull] invokes every time there
-// is progress with a "pull" request sent to the service. If this function
-// returns an error, [Client.Pull] will stop the process and return this error.
 type PullProgressFunc func(ProgressResponse) error

-// Pull downloads a model from the ollama library. fn is called each time
-// progress is made on the request and can be used to display a progress bar,
-// etc.
 func (c *Client) Pull(ctx context.Context, req *PullRequest, fn PullProgressFunc) error {
 	return c.stream(ctx, http.MethodPost, "/api/pull", req, func(bts []byte) error {
 		var resp ProgressResponse
@@ -336,7 +301,18 @@ func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*Embedd
 }

 func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
-	return c.do(ctx, http.MethodPost, fmt.Sprintf("/api/blobs/%s", digest), r, nil)
+	if err := c.do(ctx, http.MethodHead, fmt.Sprintf("/api/blobs/%s", digest), nil, nil); err != nil {
+		var statusError StatusError
+		if !errors.As(err, &statusError) || statusError.StatusCode != http.StatusNotFound {
+			return err
+		}
+
+		if err := c.do(ctx, http.MethodPost, fmt.Sprintf("/api/blobs/%s", digest), r, nil); err != nil {
+			return err
+		}
+	}
+
+	return nil
 }

 func (c *Client) Version(ctx context.Context) (string, error) {
--- a/api/types.go
+++ b/api/types.go
@@ -33,46 +33,18 @@ func (e StatusError) Error() string {

 type ImageData []byte

-// GenerateRequest describes a request sent by [Client.Generate]. While you
-// have to specify the Model and Prompt fields, all the other fields have
-// reasonable defaults for basic uses.
 type GenerateRequest struct {
-	// Model is the model name; it should be a name familiar to Ollama from
-	// the library at https://ollama.com/library
-	Model string `json:"model"`
+	Model     string      `json:"model"`
+	Prompt    string      `json:"prompt"`
+	System    string      `json:"system"`
+	Template  string      `json:"template"`
+	Context   []int       `json:"context,omitempty"`
+	Stream    *bool       `json:"stream,omitempty"`
+	Raw       bool        `json:"raw,omitempty"`
+	Format    string      `json:"format"`
+	KeepAlive *Duration   `json:"keep_alive,omitempty"`
+	Images    []ImageData `json:"images,omitempty"`

-	// Prompt is the textual prompt to send to the model.
-	Prompt string `json:"prompt"`
-
-	// System overrides the model's default system message/prompt.
-	System string `json:"system"`
-
-	// Template overrides the model's default prompt template.
-	Template string `json:"template"`
-
-	// Context is the context parameter returned from a previous call to
-	// Generate call. It can be used to keep a short conversational memory.
-	Context []int `json:"context,omitempty"`
-
-	// Stream specifies whether the response is streaming; it is true by default.
-	Stream *bool `json:"stream,omitempty"`
-
-	// Raw set to true means that no formatting will be applied to the prompt.
-	Raw bool `json:"raw,omitempty"`
-
-	// Format specifies the format to return a response in.
-	Format string `json:"format"`
-
-	// KeepAlive controls how long the model will stay loaded in memory following
-	// this request.
-	KeepAlive *Duration `json:"keep_alive,omitempty"`
-
-	// Images is an optional list of base64-encoded images accompanying this
-	// request, for multimodal models.
-	Images []ImageData `json:"images,omitempty"`
-
-	// Options lists model-specific options. For example, temperature can be
-	// set through this field, if the model supports it.
 	Options map[string]interface{} `json:"options"`
 }

@@ -137,24 +109,21 @@ type Options struct {

 // Runner options which must be set when the model is loaded into memory
 type Runner struct {
-	UseNUMA   bool `json:"numa,omitempty"`
-	NumCtx    int  `json:"num_ctx,omitempty"`
-	NumBatch  int  `json:"num_batch,omitempty"`
-	NumGQA    int  `json:"num_gqa,omitempty"`
-	NumGPU    int  `json:"num_gpu,omitempty"`
-	MainGPU   int  `json:"main_gpu,omitempty"`
-	LowVRAM   bool `json:"low_vram,omitempty"`
-	F16KV     bool `json:"f16_kv,omitempty"`
-	LogitsAll bool `json:"logits_all,omitempty"`
-	VocabOnly bool `json:"vocab_only,omitempty"`
-	UseMMap   bool `json:"use_mmap,omitempty"`
-	UseMLock  bool `json:"use_mlock,omitempty"`
-	NumThread int  `json:"num_thread,omitempty"`
-
-	// Unused: RopeFrequencyBase is ignored. Instead the value in the model will be used
-	RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
-	// Unused: RopeFrequencyScale is ignored. Instead the value in the model will be used
+	UseNUMA            bool    `json:"numa,omitempty"`
+	NumCtx             int     `json:"num_ctx,omitempty"`
+	NumBatch           int     `json:"num_batch,omitempty"`
+	NumGQA             int     `json:"num_gqa,omitempty"`
+	NumGPU             int     `json:"num_gpu,omitempty"`
+	MainGPU            int     `json:"main_gpu,omitempty"`
+	LowVRAM            bool    `json:"low_vram,omitempty"`
+	F16KV              bool    `json:"f16_kv,omitempty"`
+	LogitsAll          bool    `json:"logits_all,omitempty"`
+	VocabOnly          bool    `json:"vocab_only,omitempty"`
+	UseMMap            bool    `json:"use_mmap,omitempty"`
+	UseMLock           bool    `json:"use_mlock,omitempty"`
+	RopeFrequencyBase  float32 `json:"rope_frequency_base,omitempty"`
 	RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
+	NumThread          int     `json:"num_thread,omitempty"`
 }

 type EmbeddingRequest struct {
@@ -170,11 +139,10 @@ type EmbeddingResponse struct {
 }

 type CreateRequest struct {
-	Model        string `json:"model"`
-	Path         string `json:"path"`
-	Modelfile    string `json:"modelfile"`
-	Stream       *bool  `json:"stream,omitempty"`
-	Quantization string `json:"quantization,omitempty"`
+	Model     string `json:"model"`
+	Path      string `json:"path"`
+	Modelfile string `json:"modelfile"`
+	Stream    *bool  `json:"stream,omitempty"`

 	// Name is deprecated, see Model
 	Name string `json:"name"`
@@ -414,16 +382,18 @@ func DefaultOptions() Options {

 		Runner: Runner{
 			// options set when the model is loaded
-			NumCtx:    2048,
-			NumBatch:  512,
-			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
-			NumGQA:    1,
-			NumThread: 0, // let the runtime decide
-			LowVRAM:   false,
-			F16KV:     true,
-			UseMLock:  false,
-			UseMMap:   true,
-			UseNUMA:   false,
+			NumCtx:             2048,
+			RopeFrequencyBase:  10000.0,
+			RopeFrequencyScale: 1.0,
+			NumBatch:           512,
+			NumGPU:             -1, // -1 here indicates that NumGPU should be set dynamically
+			NumGQA:             1,
+			NumThread:          0, // let the runtime decide
+			LowVRAM:            false,
+			F16KV:              true,
+			UseMLock:           false,
+			UseMMap:            true,
+			UseNUMA:            false,
 		},
 	}
 }
--- a/app/lifecycle/server.go
+++ b/app/lifecycle/server.go
@@ -83,38 +83,6 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
 		io.Copy(logFile, stderr) //nolint:errcheck
 	}()

-	// Re-wire context done behavior to attempt a graceful shutdown of the server
-	cmd.Cancel = func() error {
-		if cmd.Process != nil {
-			err := terminate(cmd)
-			if err != nil {
-				slog.Warn("error trying to gracefully terminate server", "err", err)
-				return cmd.Process.Kill()
-			}
-
-			tick := time.NewTicker(10 * time.Millisecond)
-			defer tick.Stop()
-
-			for {
-				select {
-				case <-tick.C:
-					exited, err := isProcessExited(cmd.Process.Pid)
-					if err != nil {
-						return err
-					}
-
-					if exited {
-						return nil
-					}
-				case <-time.After(5 * time.Second):
-					slog.Warn("graceful server shutdown timeout, killing", "pid", cmd.Process.Pid)
-					return cmd.Process.Kill()
-				}
-			}
-		}
-		return nil
-	}
-
 	// run the command and wait for it to finish
 	if err := cmd.Start(); err != nil {
 		return done, fmt.Errorf("failed to start server %w", err)
@@ -137,7 +105,7 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {

 			select {
 			case <-ctx.Done():
-				slog.Info(fmt.Sprintf("server shutdown with exit code %d", code))
+				slog.Debug(fmt.Sprintf("server shutdown with exit code %d", code))
 				done <- code
 				return
 			default:
--- a/app/lifecycle/server_unix.go
+++ b/app/lifecycle/server_unix.go
@@ -4,35 +4,9 @@ package lifecycle

 import (
 	"context"
-	"errors"
-	"fmt"
-	"os"
 	"os/exec"
-	"syscall"
 )

 func getCmd(ctx context.Context, cmd string) *exec.Cmd {
 	return exec.CommandContext(ctx, cmd, "serve")
 }
-
-func terminate(cmd *exec.Cmd) error {
-	return cmd.Process.Signal(os.Interrupt)
-}
-
-func isProcessExited(pid int) (bool, error) {
-	proc, err := os.FindProcess(pid)
-	if err != nil {
-		return false, fmt.Errorf("failed to find process: %v", err)
-	}
-
-	err = proc.Signal(syscall.Signal(0))
-	if err != nil {
-		if errors.Is(err, os.ErrProcessDone) || errors.Is(err, syscall.ESRCH) {
-			return true, nil
-		}
-
-		return false, fmt.Errorf("error signaling process: %v", err)
-	}
-
-	return false, nil
-}
--- a/app/lifecycle/server_windows.go
+++ b/app/lifecycle/server_windows.go
@@ -2,88 +2,12 @@ package lifecycle

 import (
 	"context"
-	"fmt"
 	"os/exec"
 	"syscall"
-
-	"golang.org/x/sys/windows"
 )

 func getCmd(ctx context.Context, exePath string) *exec.Cmd {
 	cmd := exec.CommandContext(ctx, exePath, "serve")
-	cmd.SysProcAttr = &syscall.SysProcAttr{
-		HideWindow:    true,
-		CreationFlags: windows.CREATE_NEW_PROCESS_GROUP,
-	}
-
+	cmd.SysProcAttr = &syscall.SysProcAttr{HideWindow: true, CreationFlags: 0x08000000}
 	return cmd
 }
-
-func terminate(cmd *exec.Cmd) error {
-	dll, err := windows.LoadDLL("kernel32.dll")
-	if err != nil {
-		return err
-	}
-	defer dll.Release() // nolint: errcheck
-
-	pid := cmd.Process.Pid
-
-	f, err := dll.FindProc("AttachConsole")
-	if err != nil {
-		return err
-	}
-
-	r1, _, err := f.Call(uintptr(pid))
-	if r1 == 0 && err != syscall.ERROR_ACCESS_DENIED {
-		return err
-	}
-
-	f, err = dll.FindProc("SetConsoleCtrlHandler")
-	if err != nil {
-		return err
-	}
-
-	r1, _, err = f.Call(0, 1)
-	if r1 == 0 {
-		return err
-	}
-
-	f, err = dll.FindProc("GenerateConsoleCtrlEvent")
-	if err != nil {
-		return err
-	}
-
-	r1, _, err = f.Call(windows.CTRL_BREAK_EVENT, uintptr(pid))
-	if r1 == 0 {
-		return err
-	}
-
-	r1, _, err = f.Call(windows.CTRL_C_EVENT, uintptr(pid))
-	if r1 == 0 {
-		return err
-	}
-
-	return nil
-}
-
-const STILL_ACTIVE = 259
-
-func isProcessExited(pid int) (bool, error) {
-	hProcess, err := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION, false, uint32(pid))
-	if err != nil {
-		return false, fmt.Errorf("failed to open process: %v", err)
-	}
-	defer windows.CloseHandle(hProcess) // nolint: errcheck
-
-	var exitCode uint32
-	err = windows.GetExitCodeProcess(hProcess, &exitCode)
-	if err != nil {
-		return false, fmt.Errorf("failed to get exit code: %v", err)
-	}
-
-	if exitCode == STILL_ACTIVE {
-		return false, nil
-	}
-
-	return true, nil
-}
--- a/app/tray/tray.go
+++ b/app/tray/tray.go
@@ -24,5 +24,10 @@ func NewTray() (commontray.OllamaTray, error) {
 		return nil, fmt.Errorf("failed to load icon %s: %w", iconName, err)
 	}

-	return InitPlatformTray(icon, updateIcon)
+	tray, err := InitPlatformTray(icon, updateIcon)
+	if err != nil {
+		return nil, err
+	}
+
+	return tray, nil
 }
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -105,48 +105,24 @@ func CreateHandler(cmd *cobra.Command, args []string) error {

 				zf := zip.NewWriter(tf)

-				files := []string{}
-
-				tfiles, err := filepath.Glob(filepath.Join(path, "pytorch_model-*.bin"))
+				files, err := filepath.Glob(filepath.Join(path, "model-*.safetensors"))
 				if err != nil {
 					return err
-				} else if len(tfiles) == 0 {
-					tfiles, err = filepath.Glob(filepath.Join(path, "model-*.safetensors"))
-					if err != nil {
-						return err
-					}
 				}

-				files = append(files, tfiles...)
-
 				if len(files) == 0 {
-					return fmt.Errorf("no models were found in '%s'", path)
+					return fmt.Errorf("no safetensors files were found in '%s'", path)
 				}

-				// add the safetensor/torch config file + tokenizer
+				// add the safetensor config file + tokenizer
 				files = append(files, filepath.Join(path, "config.json"))
-				files = append(files, filepath.Join(path, "params.json"))
 				files = append(files, filepath.Join(path, "added_tokens.json"))
 				files = append(files, filepath.Join(path, "tokenizer.model"))

 				for _, fn := range files {
 					f, err := os.Open(fn)
-
-					// just skip whatever files aren't there
-					if os.IsNotExist(err) {
-						if strings.HasSuffix(fn, "tokenizer.model") {
-							// try the parent dir before giving up
-							parentDir := filepath.Dir(path)
-							newFn := filepath.Join(parentDir, "tokenizer.model")
-							f, err = os.Open(newFn)
-							if os.IsNotExist(err) {
-								continue
-							} else if err != nil {
-								return err
-							}
-						} else {
-							continue
-						}
+					if os.IsNotExist(err) && strings.HasSuffix(fn, "added_tokens.json") {
+						continue
 					} else if err != nil {
 						return err
 					}
@@ -218,9 +194,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		return nil
 	}

-	quantization, _ := cmd.Flags().GetString("quantization")
-
-	request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile), Quantization: quantization}
+	request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile)}
 	if err := client.Create(cmd.Context(), &request, fn); err != nil {
 		return err
 	}
@@ -239,10 +213,7 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
 	if _, err := io.Copy(hash, bin); err != nil {
 		return "", err
 	}
-
-	if _, err := bin.Seek(0, io.SeekStart); err != nil {
-		return "", err
-	}
+	bin.Seek(0, io.SeekStart)

 	digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
 	if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
@@ -961,7 +932,6 @@ func NewCLI() *cobra.Command {
 	}

 	createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")")
-	createCmd.Flags().StringP("quantization", "q", "", "Quantization level.")

 	showCmd := &cobra.Command{
 		Use:     "show MODEL",
@@ -1003,7 +973,6 @@ Environment Variables:
    OLLAMA_ORIGINS      A comma separated list of allowed origins.
    OLLAMA_MODELS       The path to the models directory (default is "~/.ollama/models")
    OLLAMA_KEEP_ALIVE   The duration that models stay loaded in memory (default is "5m")
-    OLLAMA_DEBUG        Set to 1 to enable additional debug logging
 `)

 	pullCmd := &cobra.Command{
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -295,14 +295,10 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 					opts.WordWrap = false
 					fmt.Println("Set 'nowordwrap' mode.")
 				case "verbose":
-					if err := cmd.Flags().Set("verbose", "true"); err != nil {
-						return err
-					}
+					cmd.Flags().Set("verbose", "true")
 					fmt.Println("Set 'verbose' mode.")
 				case "quiet":
-					if err := cmd.Flags().Set("verbose", "false"); err != nil {
-						return err
-					}
+					cmd.Flags().Set("verbose", "false")
 					fmt.Println("Set 'quiet' mode.")
 				case "format":
 					if len(args) < 3 || args[2] != "json" {
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -1,16 +1,19 @@
 package convert

 import (
+	"bytes"
 	"cmp"
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
+	"io"
 	"log/slog"
 	"os"
 	"path/filepath"
+	"regexp"
 	"slices"
-	"strings"

+	"github.com/mitchellh/mapstructure"
 	"google.golang.org/protobuf/proto"

 	"github.com/ollama/ollama/convert/sentencepiece"
@@ -27,58 +30,137 @@ type Params struct {
 	AttentionHeads   int      `json:"num_attention_heads"` // n_head
 	KeyValHeads      int      `json:"num_key_value_heads"`
 	NormEPS          float64  `json:"rms_norm_eps"`
+	RopeFreqBase     float64  `json:"rope_theta"`
 	BoSTokenID       int      `json:"bos_token_id"`
 	EoSTokenID       int      `json:"eos_token_id"`
-	HeadDimension    int      `json:"head_dim"`
-	PaddingTokenID   int      `json:"pad_token_id"`
-
-	ByteOrder
 }

-type ByteOrder interface {
-	binary.ByteOrder
-	binary.AppendByteOrder
+type MetaData struct {
+	Type    string `mapstructure:"dtype"`
+	Shape   []int  `mapstructure:"shape"`
+	Offsets []int  `mapstructure:"data_offsets"`
 }

-type ModelArch interface {
-	GetTensors() error
-	LoadVocab() error
-	WriteGGUF() (string, error)
+func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
+	f, err := os.Open(fn)
+	if err != nil {
+		return []llm.Tensor{}, 0, err
+	}
+	defer f.Close()
+
+	var jsonSize uint64
+	binary.Read(f, binary.LittleEndian, &jsonSize)
+
+	buf := make([]byte, jsonSize)
+	_, err = io.ReadFull(f, buf)
+	if err != nil {
+		return []llm.Tensor{}, 0, err
+	}
+
+	d := json.NewDecoder(bytes.NewBuffer(buf))
+	d.UseNumber()
+	var parsed map[string]interface{}
+	if err = d.Decode(&parsed); err != nil {
+		return []llm.Tensor{}, 0, err
+	}
+
+	var keys []string
+	for k := range parsed {
+		keys = append(keys, k)
+	}
+
+	slices.Sort(keys)
+
+	slog.Info("converting layers")
+
+	var tensors []llm.Tensor
+	for _, k := range keys {
+		vals := parsed[k].(map[string]interface{})
+		var data MetaData
+		if err = mapstructure.Decode(vals, &data); err != nil {
+			return []llm.Tensor{}, 0, err
+		}
+
+		var size uint64
+		var kind uint32
+		switch len(data.Shape) {
+		case 0:
+			// metadata
+			continue
+		case 1:
+			// convert to float32
+			kind = 0
+			size = uint64(data.Shape[0] * 4)
+		case 2:
+			// convert to float16
+			kind = 1
+			size = uint64(data.Shape[0] * data.Shape[1] * 2)
+		}
+
+		ggufName, err := GetTensorName(k)
+		if err != nil {
+			slog.Error("%v", err)
+			return []llm.Tensor{}, 0, err
+		}
+
+		shape := []uint64{0, 0, 0, 0}
+		for i := range data.Shape {
+			shape[i] = uint64(data.Shape[i])
+		}
+
+		t := llm.Tensor{
+			Name:          ggufName,
+			Kind:          kind,
+			Offset:        offset,
+			Shape:         shape[:],
+			FileName:      fn,
+			OffsetPadding: 8 + jsonSize,
+			FileOffsets:   []uint64{uint64(data.Offsets[0]), uint64(data.Offsets[1])},
+		}
+		slog.Debug(fmt.Sprintf("%v", t))
+		tensors = append(tensors, t)
+		offset += size
+	}
+	return tensors, offset, nil
 }

-type ModelFormat interface {
-	GetLayerName(string) (string, error)
-	GetTensors(string, *Params) ([]llm.Tensor, error)
-	GetParams(string) (*Params, error)
-	GetModelArch(string, string, *Params) (ModelArch, error)
+func GetSafeTensors(dirpath string) ([]llm.Tensor, error) {
+	var tensors []llm.Tensor
+	files, err := filepath.Glob(filepath.Join(dirpath, "/model-*.safetensors"))
+	if err != nil {
+		return []llm.Tensor{}, err
+	}
+
+	var offset uint64
+	for _, f := range files {
+		var t []llm.Tensor
+		var err error
+		t, offset, err = ReadSafeTensors(f, offset)
+		if err != nil {
+			slog.Error("%v", err)
+			return []llm.Tensor{}, err
+		}
+		tensors = append(tensors, t...)
+	}
+	return tensors, nil
 }

-type ModelData struct {
-	Path    string
-	Name    string
-	Params  *Params
-	Vocab   *Vocab
-	Tensors []llm.Tensor
-	Format  ModelFormat
-}
+func GetParams(dirpath string) (*Params, error) {
+	f, err := os.Open(filepath.Join(dirpath, "config.json"))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()

-func GetModelFormat(dirname string) (ModelFormat, error) {
-	files, err := filepath.Glob(filepath.Join(dirname, "*"))
+	var params Params
+
+	d := json.NewDecoder(f)
+	err = d.Decode(&params)
 	if err != nil {
 		return nil, err
 	}

-	for _, fn := range files {
-		slog.Debug(fmt.Sprintf("file = %s", fn))
-		if strings.HasSuffix(fn, ".safetensors") {
-			return &SafetensorFormat{}, nil
-		} else if strings.HasSuffix(fn, ".bin") {
-			slog.Debug("model is torch")
-			return &TorchFormat{}, nil
-		}
-	}
-
-	return nil, fmt.Errorf("couldn't determine model format")
+	return &params, nil
 }

 // Details on gguf's tokenizer can be found at:
@@ -89,7 +171,7 @@ type Vocab struct {
 	Types  []int32
 }

-func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
+func LoadTokens(dirpath string) (*Vocab, error) {
 	slog.Info(fmt.Sprintf("reading vocab from %s", filepath.Join(dirpath, "tokenizer.model")))
 	in, err := os.ReadFile(filepath.Join(dirpath, "tokenizer.model"))
 	if err != nil {
@@ -114,14 +196,6 @@ func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
 		v.Tokens = append(v.Tokens, p.GetPiece())
 		v.Scores = append(v.Scores, p.GetScore())
 		t := p.GetType()
-		switch t {
-		case sentencepiece.ModelProto_SentencePiece_UNKNOWN:
-		case sentencepiece.ModelProto_SentencePiece_CONTROL:
-		case sentencepiece.ModelProto_SentencePiece_UNUSED:
-		case sentencepiece.ModelProto_SentencePiece_BYTE:
-		default:
-			t = sentencepiece.ModelProto_SentencePiece_NORMAL
-		}
 		v.Types = append(v.Types, int32(t))
 	}

@@ -169,15 +243,89 @@ func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
 	}
 	slog.Info(fmt.Sprintf("vocab size w/ extra tokens: %d", len(v.Tokens)))

-	if params.VocabSize > len(v.Tokens) {
-		missingTokens := params.VocabSize - len(v.Tokens)
-		slog.Warn(fmt.Sprintf("vocab is missing %d tokens", missingTokens))
-		for cnt := 0; cnt < missingTokens; cnt++ {
-			v.Tokens = append(v.Tokens, fmt.Sprintf("<dummy%05d>", cnt+1))
-			v.Scores = append(v.Scores, -1)
-			v.Types = append(v.Types, int32(llm.GGUFTokenUserDefined))
+	return v, nil
+}
+
+func GetTensorName(n string) (string, error) {
+	tMap := map[string]string{
+		"model.embed_tokens.weight":                           "token_embd.weight",
+		"model.layers.(\\d+).input_layernorm.weight":          "blk.$1.attn_norm.weight",
+		"model.layers.(\\d+).mlp.down_proj.weight":            "blk.$1.ffn_down.weight",
+		"model.layers.(\\d+).mlp.gate_proj.weight":            "blk.$1.ffn_gate.weight",
+		"model.layers.(\\d+).mlp.up_proj.weight":              "blk.$1.ffn_up.weight",
+		"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
+		"model.layers.(\\d+).self_attn.k_proj.weight":         "blk.$1.attn_k.weight",
+		"model.layers.(\\d+).self_attn.o_proj.weight":         "blk.$1.attn_output.weight",
+		"model.layers.(\\d+).self_attn.q_proj.weight":         "blk.$1.attn_q.weight",
+		"model.layers.(\\d+).self_attn.v_proj.weight":         "blk.$1.attn_v.weight",
+		"lm_head.weight":    "output.weight",
+		"model.norm.weight": "output_norm.weight",
+	}
+
+	v, ok := tMap[n]
+	if ok {
+		return v, nil
+	}
+
+	// quick hack to rename the layers to gguf format
+	for k, v := range tMap {
+		re := regexp.MustCompile(k)
+		newName := re.ReplaceAllString(n, v)
+		if newName != n {
+			return newName, nil
 		}
 	}

-	return v, nil
+	return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
+}
+
+func WriteGGUF(name string, tensors []llm.Tensor, params *Params, vocab *Vocab) (string, error) {
+	c := llm.ContainerGGUF{
+		ByteOrder: binary.LittleEndian,
+	}
+
+	m := llm.NewGGUFModel(&c)
+	m.Tensors = tensors
+	m.KV["general.architecture"] = "llama"
+	m.KV["general.name"] = name
+	m.KV["llama.context_length"] = uint32(params.ContextSize)
+	m.KV["llama.embedding_length"] = uint32(params.HiddenSize)
+	m.KV["llama.block_count"] = uint32(params.HiddenLayers)
+	m.KV["llama.feed_forward_length"] = uint32(params.IntermediateSize)
+	m.KV["llama.rope.dimension_count"] = uint32(128)
+	m.KV["llama.attention.head_count"] = uint32(params.AttentionHeads)
+	m.KV["llama.attention.head_count_kv"] = uint32(params.KeyValHeads)
+	m.KV["llama.attention.layer_norm_rms_epsilon"] = float32(params.NormEPS)
+	m.KV["llama.rope.freq_base"] = float32(params.RopeFreqBase)
+	m.KV["general.file_type"] = uint32(1)
+	m.KV["tokenizer.ggml.model"] = "llama"
+
+	m.KV["tokenizer.ggml.tokens"] = vocab.Tokens
+	m.KV["tokenizer.ggml.scores"] = vocab.Scores
+	m.KV["tokenizer.ggml.token_type"] = vocab.Types
+
+	m.KV["tokenizer.ggml.bos_token_id"] = uint32(params.BoSTokenID)
+	m.KV["tokenizer.ggml.eos_token_id"] = uint32(params.EoSTokenID)
+	m.KV["tokenizer.ggml.unknown_token_id"] = uint32(0)
+	m.KV["tokenizer.ggml.add_bos_token"] = true
+	m.KV["tokenizer.ggml.add_eos_token"] = false
+
+	// llamacpp sets the chat template, however we don't need to set it since we pass it in through a layer
+	// m.KV["tokenizer.chat_template"] = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" // XXX removeme
+
+	c.V3.NumTensor = uint64(len(tensors))
+	c.V3.NumKV = uint64(len(m.KV))
+
+	f, err := os.CreateTemp("", "ollama-gguf")
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+
+	err = m.Encode(f)
+	if err != nil {
+		return "", err
+	}
+
+	return f.Name(), nil
 }
--- a/convert/gemma.go
+++ b/convert/gemma.go
@@ -1,137 +0,0 @@
-package convert
-
-import (
-	"encoding/binary"
-	"fmt"
-	"io"
-	"log/slog"
-	"os"
-	"strings"
-
-	"github.com/d4l3k/go-bfloat16"
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-
-	"github.com/ollama/ollama/llm"
-)
-
-type GemmaModel struct {
-	ModelData
-}
-
-func gemmaLayerHandler(w io.Writer, r safetensorWriterTo, f *os.File) error {
-	slog.Debug(fmt.Sprintf("converting '%s'", r.t.Name))
-
-	data := make([]byte, r.end-r.start)
-	if err := binary.Read(f, r.bo, data); err != nil {
-		return err
-	}
-
-	tDataF32 := bfloat16.DecodeFloat32(data)
-
-	var err error
-	tDataF32, err = addOnes(tDataF32, int(r.t.Shape[0]))
-	if err != nil {
-		return err
-	}
-
-	if err := binary.Write(w, r.bo, tDataF32); err != nil {
-		return err
-	}
-	return nil
-}
-
-func addOnes(data []float32, vectorSize int) ([]float32, error) {
-	n := tensor.New(tensor.WithShape(vectorSize), tensor.WithBacking(data))
-	ones := tensor.Ones(tensor.Float32, vectorSize)
-
-	var err error
-	n, err = n.Add(ones)
-	if err != nil {
-		return []float32{}, err
-	}
-
-	newN, err := native.SelectF32(n, 0)
-	if err != nil {
-		return []float32{}, err
-	}
-
-	var fullTensor []float32
-	for _, v := range newN {
-		fullTensor = append(fullTensor, v...)
-	}
-
-	return fullTensor, nil
-}
-
-func (m *GemmaModel) GetTensors() error {
-	t, err := m.Format.GetTensors(m.Path, m.Params)
-	if err != nil {
-		return err
-	}
-
-	slog.Debug(fmt.Sprintf("Total tensors: %d", len(t)))
-
-	m.Tensors = []llm.Tensor{}
-	for _, l := range t {
-		if strings.HasSuffix(l.Name, "norm.weight") {
-			wt := l.WriterTo.(safetensorWriterTo)
-			wt.handler = gemmaLayerHandler
-			l.WriterTo = wt
-		}
-		m.Tensors = append(m.Tensors, l)
-	}
-
-	return nil
-}
-
-func (m *GemmaModel) LoadVocab() error {
-	v, err := LoadSentencePieceTokens(m.Path, m.Params)
-	if err != nil {
-		return err
-	}
-	m.Vocab = v
-	return nil
-}
-
-func (m *GemmaModel) WriteGGUF() (string, error) {
-	kv := llm.KV{
-		"general.architecture":                   "gemma",
-		"general.name":                           m.Name,
-		"gemma.context_length":                   uint32(m.Params.ContextSize),
-		"gemma.embedding_length":                 uint32(m.Params.HiddenSize),
-		"gemma.block_count":                      uint32(m.Params.HiddenLayers),
-		"gemma.feed_forward_length":              uint32(m.Params.IntermediateSize),
-		"gemma.attention.head_count":             uint32(m.Params.AttentionHeads),
-		"gemma.attention.head_count_kv":          uint32(m.Params.KeyValHeads),
-		"gemma.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
-		"gemma.attention.key_length":             uint32(m.Params.HeadDimension),
-		"gemma.attention.value_length":           uint32(m.Params.HeadDimension),
-		"general.file_type":                      uint32(1),
-		"tokenizer.ggml.model":                   "llama",
-
-		"tokenizer.ggml.tokens":     m.Vocab.Tokens,
-		"tokenizer.ggml.scores":     m.Vocab.Scores,
-		"tokenizer.ggml.token_type": m.Vocab.Types,
-
-		"tokenizer.ggml.bos_token_id":     uint32(m.Params.BoSTokenID),
-		"tokenizer.ggml.eos_token_id":     uint32(m.Params.EoSTokenID),
-		"tokenizer.ggml.padding_token_id": uint32(m.Params.PaddingTokenID),
-		"tokenizer.ggml.unknown_token_id": uint32(3),
-		"tokenizer.ggml.add_bos_token":    true,
-		"tokenizer.ggml.add_eos_token":    false,
-	}
-
-	f, err := os.CreateTemp("", "ollama-gguf")
-	if err != nil {
-		return "", err
-	}
-	defer f.Close()
-
-	mod := llm.NewGGUFV3(m.Params.ByteOrder)
-	if err := mod.Encode(f, kv, m.Tensors); err != nil {
-		return "", err
-	}
-
-	return f.Name(), nil
-}
--- a/convert/llama.go
+++ b/convert/llama.go
@@ -1,176 +0,0 @@
-package convert
-
-import (
-	"encoding/binary"
-	"fmt"
-	"io"
-	"log/slog"
-	"os"
-	"regexp"
-	"strings"
-
-	"github.com/nlpodyssey/gopickle/pytorch"
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-	"github.com/x448/float16"
-
-	"github.com/ollama/ollama/llm"
-)
-
-type LlamaModel struct {
-	ModelData
-}
-
-func llamaLayerHandler(w io.Writer, r torchWriterTo) error {
-	slog.Debug(fmt.Sprintf("repacking layer '%s'", r.t.Name))
-
-	data := r.storage.(*pytorch.HalfStorage).Data
-	tData := make([]uint16, len(data))
-	for cnt, v := range data {
-		tData[cnt] = uint16(float16.Fromfloat32(v))
-	}
-
-	var err error
-	var heads uint32
-	if strings.Contains(r.t.Name, "attn_q") {
-		heads = uint32(r.params.AttentionHeads)
-	} else if strings.Contains(r.t.Name, "attn_k") {
-		heads = uint32(r.params.KeyValHeads)
-		if heads == 0 {
-			heads = uint32(r.params.AttentionHeads)
-		}
-	} else {
-		return fmt.Errorf("unknown layer type")
-	}
-
-	slog.Debug(fmt.Sprintf("heads = %d", heads))
-
-	tData, err = llamaRepack(tData, int(heads), r.t.Shape)
-	if err != nil {
-		return err
-	}
-
-	if err = binary.Write(w, r.bo, tData); err != nil {
-		return err
-	}
-	return nil
-}
-
-func llamaRepack(data []uint16, heads int, shape []uint64) ([]uint16, error) {
-	n := tensor.New(tensor.WithShape(int(shape[0]), int(shape[1])), tensor.WithBacking(data))
-	origShape := n.Shape().Clone()
-
-	// reshape the tensor and swap axes 1 and 2 to unpack the layer for gguf
-	if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil {
-		return nil, err
-	}
-
-	if err := n.T(0, 2, 1, 3); err != nil {
-		return nil, err
-	}
-
-	if err := n.Reshape(origShape...); err != nil {
-		return nil, err
-	}
-
-	if err := n.Transpose(); err != nil {
-		return nil, err
-	}
-	newN, err := native.SelectU16(n, 1)
-	if err != nil {
-		return nil, err
-	}
-
-	var fullTensor []uint16
-	for _, v := range newN {
-		fullTensor = append(fullTensor, v...)
-	}
-	return fullTensor, nil
-}
-
-func (m *LlamaModel) GetTensors() error {
-	t, err := m.Format.GetTensors(m.Path, m.Params)
-	if err != nil {
-		return err
-	}
-
-	m.Tensors = []llm.Tensor{}
-
-	pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
-	re, err := regexp.Compile(pattern)
-	if err != nil {
-		return err
-	}
-
-	for _, l := range t {
-		matches := re.FindAllStringSubmatch(l.Name, -1)
-		if len(matches) > 0 {
-			slog.Debug(fmt.Sprintf("setting handler for: %s", l.Name))
-			wt := l.WriterTo.(torchWriterTo)
-			wt.handler = llamaLayerHandler
-			l.WriterTo = wt
-		}
-		m.Tensors = append(m.Tensors, l)
-	}
-
-	return nil
-}
-
-func (m *LlamaModel) LoadVocab() error {
-	var v *Vocab
-	var err error
-
-	slog.Debug("loading vocab")
-	v, err = LoadSentencePieceTokens(m.Path, m.Params)
-	if err != nil {
-		return err
-	}
-
-	slog.Debug("vocab loaded")
-
-	m.Vocab = v
-	return nil
-}
-
-func (m *LlamaModel) WriteGGUF() (string, error) {
-	kv := llm.KV{
-		"general.architecture":                   "llama",
-		"general.name":                           m.Name,
-		"llama.vocab_size":                       uint32(len(m.Vocab.Tokens)),
-		"llama.context_length":                   uint32(m.Params.ContextSize),
-		"llama.embedding_length":                 uint32(m.Params.HiddenSize),
-		"llama.block_count":                      uint32(m.Params.HiddenLayers),
-		"llama.feed_forward_length":              uint32(m.Params.IntermediateSize),
-		"llama.rope.dimension_count":             uint32(m.Params.HiddenSize / m.Params.AttentionHeads),
-		"llama.attention.head_count":             uint32(m.Params.AttentionHeads),
-		"llama.attention.head_count_kv":          uint32(m.Params.KeyValHeads),
-		"llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
-		"general.file_type":                      uint32(1),
-		"tokenizer.ggml.model":                   "llama",
-
-		"tokenizer.ggml.tokens":     m.Vocab.Tokens,
-		"tokenizer.ggml.scores":     m.Vocab.Scores,
-		"tokenizer.ggml.token_type": m.Vocab.Types,
-
-		"tokenizer.ggml.bos_token_id":     uint32(m.Params.BoSTokenID),
-		"tokenizer.ggml.eos_token_id":     uint32(m.Params.EoSTokenID),
-		"tokenizer.ggml.unknown_token_id": uint32(0),
-		"tokenizer.ggml.add_bos_token":    true,
-		"tokenizer.ggml.add_eos_token":    false,
-	}
-
-	f, err := os.CreateTemp("", "ollama-gguf")
-	if err != nil {
-		return "", err
-	}
-	defer f.Close()
-
-	mod := llm.NewGGUFV3(m.Params.ByteOrder)
-	if err := mod.Encode(f, kv, m.Tensors); err != nil {
-		return "", err
-	}
-
-	slog.Debug(fmt.Sprintf("gguf file = %s", f.Name()))
-
-	return f.Name(), nil
-}
--- a/convert/mistral.go
+++ b/convert/mistral.go
@@ -1,173 +0,0 @@
-package convert
-
-import (
-	"encoding/binary"
-	"fmt"
-	"io"
-	"os"
-	"regexp"
-	"strings"
-
-	"github.com/d4l3k/go-bfloat16"
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-	"github.com/x448/float16"
-
-	"github.com/ollama/ollama/llm"
-)
-
-type MistralModel struct {
-	ModelData
-}
-
-func mistralLayerHandler(w io.Writer, r safetensorWriterTo, f *os.File) error {
-	layerSize := r.end - r.start
-
-	var err error
-	tData := make([]uint16, layerSize/2)
-	if err = binary.Read(f, r.bo, tData); err != nil {
-		return err
-	}
-
-	var heads uint32
-	if strings.Contains(r.t.Name, "attn_q") {
-		heads = uint32(r.params.AttentionHeads)
-	} else if strings.Contains(r.t.Name, "attn_k") {
-		heads = uint32(r.params.KeyValHeads)
-		if heads == 0 {
-			heads = uint32(r.params.AttentionHeads)
-		}
-	} else {
-		return fmt.Errorf("unknown layer type")
-	}
-
-	tData, err = repack(tData, int(heads), r.t.Shape)
-	if err != nil {
-		return err
-	}
-
-	var buf []byte
-	for _, n := range tData {
-		buf = r.bo.AppendUint16(buf, n)
-	}
-
-	tempBuf := make([]uint16, len(tData))
-	tDataF32 := bfloat16.DecodeFloat32(buf)
-	for cnt, v := range tDataF32 {
-		tDataF16 := float16.Fromfloat32(v)
-		tempBuf[cnt] = uint16(tDataF16)
-	}
-
-	if err = binary.Write(w, r.bo, tempBuf); err != nil {
-		return err
-	}
-	return nil
-}
-
-func repack(data []uint16, heads int, shape []uint64) ([]uint16, error) {
-	n := tensor.New(tensor.WithShape(int(shape[0]), int(shape[1])), tensor.WithBacking(data))
-	origShape := n.Shape().Clone()
-
-	// reshape the tensor and swap axes 1 and 2 to unpack the layer for gguf
-	if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil {
-		return nil, err
-	}
-
-	if err := n.T(0, 2, 1, 3); err != nil {
-		return nil, err
-	}
-
-	if err := n.Reshape(origShape...); err != nil {
-		return nil, err
-	}
-
-	if err := n.Transpose(); err != nil {
-		return nil, err
-	}
-	newN, err := native.SelectU16(n, 1)
-	if err != nil {
-		return nil, err
-	}
-
-	var fullTensor []uint16
-	for _, v := range newN {
-		fullTensor = append(fullTensor, v...)
-	}
-	return fullTensor, nil
-}
-
-func (m *MistralModel) GetTensors() error {
-	t, err := m.Format.GetTensors(m.Path, m.Params)
-	if err != nil {
-		return err
-	}
-
-	m.Tensors = []llm.Tensor{}
-
-	pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
-	re, err := regexp.Compile(pattern)
-	if err != nil {
-		return err
-	}
-
-	for _, l := range t {
-		matches := re.FindAllStringSubmatch(l.Name, -1)
-		if len(matches) > 0 {
-			wt := l.WriterTo.(safetensorWriterTo)
-			wt.handler = mistralLayerHandler
-			l.WriterTo = wt
-		}
-		m.Tensors = append(m.Tensors, l)
-	}
-
-	return nil
-}
-
-func (m *MistralModel) LoadVocab() error {
-	v, err := LoadSentencePieceTokens(m.Path, m.Params)
-	if err != nil {
-		return err
-	}
-	m.Vocab = v
-	return nil
-}
-
-func (m *MistralModel) WriteGGUF() (string, error) {
-	kv := llm.KV{
-		"general.architecture":                   "llama",
-		"general.name":                           m.Name,
-		"llama.context_length":                   uint32(m.Params.ContextSize),
-		"llama.embedding_length":                 uint32(m.Params.HiddenSize),
-		"llama.block_count":                      uint32(m.Params.HiddenLayers),
-		"llama.feed_forward_length":              uint32(m.Params.IntermediateSize),
-		"llama.rope.dimension_count":             uint32(m.Params.HiddenSize / m.Params.AttentionHeads),
-		"llama.attention.head_count":             uint32(m.Params.AttentionHeads),
-		"llama.attention.head_count_kv":          uint32(m.Params.KeyValHeads),
-		"llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
-		"general.file_type":                      uint32(1),
-		"tokenizer.ggml.model":                   "llama",
-
-		"tokenizer.ggml.tokens":     m.Vocab.Tokens,
-		"tokenizer.ggml.scores":     m.Vocab.Scores,
-		"tokenizer.ggml.token_type": m.Vocab.Types,
-
-		"tokenizer.ggml.bos_token_id":     uint32(m.Params.BoSTokenID),
-		"tokenizer.ggml.eos_token_id":     uint32(m.Params.EoSTokenID),
-		"tokenizer.ggml.add_bos_token":    true,
-		"tokenizer.ggml.add_eos_token":    false,
-		"tokenizer.ggml.unknown_token_id": uint32(0),
-	}
-
-	f, err := os.CreateTemp("", "ollama-gguf")
-	if err != nil {
-		return "", err
-	}
-	defer f.Close()
-
-	mod := llm.NewGGUFV3(m.Params.ByteOrder)
-	if err := mod.Encode(f, kv, m.Tensors); err != nil {
-		return "", err
-	}
-
-	return f.Name(), nil
-}
--- a/convert/safetensors.go
+++ b/convert/safetensors.go
@@ -1,304 +0,0 @@
-package convert
-
-import (
-	"bytes"
-	"encoding/binary"
-	"encoding/json"
-	"fmt"
-	"io"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"regexp"
-	"slices"
-
-	"github.com/d4l3k/go-bfloat16"
-	"github.com/mitchellh/mapstructure"
-	"github.com/x448/float16"
-
-	"github.com/ollama/ollama/llm"
-)
-
-type safetensorWriterTo struct {
-	t *llm.Tensor
-
-	params *Params
-	bo     ByteOrder
-
-	filename string
-
-	start, end, padding uint64
-	handler             func(w io.Writer, r safetensorWriterTo, f *os.File) error
-}
-
-type tensorMetaData struct {
-	Type    string `mapstructure:"dtype"`
-	Shape   []int  `mapstructure:"shape"`
-	Offsets []int  `mapstructure:"data_offsets"`
-}
-
-type SafetensorFormat struct{}
-
-func (m *SafetensorFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, error) {
-	slog.Debug("getting tensor data")
-	var tensors []llm.Tensor
-	files, err := filepath.Glob(filepath.Join(dirpath, "/model-*.safetensors"))
-	if err != nil {
-		return nil, err
-	}
-
-	var offset uint64
-	for _, f := range files {
-		var t []llm.Tensor
-		var err error
-		t, offset, err = m.readTensors(f, offset, params)
-		if err != nil {
-			slog.Error("%v", err)
-			return nil, err
-		}
-		tensors = append(tensors, t...)
-	}
-	slog.Debug(fmt.Sprintf("all tensors = %d", len(tensors)))
-	return tensors, nil
-}
-
-func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params) ([]llm.Tensor, uint64, error) {
-	f, err := os.Open(fn)
-	if err != nil {
-		return nil, 0, err
-	}
-	defer f.Close()
-
-	var jsonSize uint64
-	if err := binary.Read(f, binary.LittleEndian, &jsonSize); err != nil {
-		return nil, 0, err
-	}
-
-	buf := make([]byte, jsonSize)
-	_, err = io.ReadFull(f, buf)
-	if err != nil {
-		return nil, 0, err
-	}
-
-	d := json.NewDecoder(bytes.NewBuffer(buf))
-	d.UseNumber()
-	var parsed map[string]interface{}
-	if err = d.Decode(&parsed); err != nil {
-		return nil, 0, err
-	}
-
-	var keys []string
-	for k := range parsed {
-		keys = append(keys, k)
-	}
-
-	slices.Sort(keys)
-
-	slog.Info("converting layers")
-
-	var tensors []llm.Tensor
-	for _, k := range keys {
-		vals := parsed[k].(map[string]interface{})
-		var data tensorMetaData
-		if err = mapstructure.Decode(vals, &data); err != nil {
-			slog.Error("couldn't decode properly")
-			return nil, 0, err
-		}
-
-		slog.Debug(fmt.Sprintf("metadata = %#v", data))
-		var size uint64
-		var kind uint32
-		switch len(data.Shape) {
-		case 0:
-			// metadata
-			continue
-		case 1:
-			// convert to float32
-			kind = 0
-			size = uint64(data.Shape[0] * 4)
-		case 2:
-			// convert to float16
-			kind = 1
-			size = uint64(data.Shape[0] * data.Shape[1] * 2)
-		}
-
-		ggufName, err := m.GetLayerName(k)
-		if err != nil {
-			slog.Error("%v", err)
-			return nil, 0, err
-		}
-
-		shape := []uint64{0, 0, 0, 0}
-		for i := range data.Shape {
-			shape[i] = uint64(data.Shape[i])
-		}
-
-		t := llm.Tensor{
-			Name:   ggufName,
-			Kind:   kind,
-			Offset: offset,
-			Shape:  shape[:],
-		}
-
-		t.WriterTo = safetensorWriterTo{
-			t:        &t,
-			params:   params,
-			bo:       params.ByteOrder,
-			filename: fn,
-			start:    uint64(data.Offsets[0]),
-			end:      uint64(data.Offsets[1]),
-			padding:  8 + jsonSize,
-		}
-
-		tensors = append(tensors, t)
-		offset += size
-	}
-	slog.Debug(fmt.Sprintf("total tensors for file = %d", len(tensors)))
-	slog.Debug(fmt.Sprintf("offset = %d", offset))
-	return tensors, offset, nil
-}
-
-func (m *SafetensorFormat) GetParams(dirpath string) (*Params, error) {
-	f, err := os.Open(filepath.Join(dirpath, "config.json"))
-	if err != nil {
-		return nil, err
-	}
-	defer f.Close()
-
-	var params Params
-
-	d := json.NewDecoder(f)
-	err = d.Decode(&params)
-	if err != nil {
-		return nil, err
-	}
-
-	params.ByteOrder = binary.LittleEndian
-	return &params, nil
-}
-
-func (m *SafetensorFormat) GetLayerName(n string) (string, error) {
-	directMap := map[string]string{
-		"model.embed_tokens.weight": "token_embd.weight",
-		"lm_head.weight":            "output.weight",
-		"model.norm.weight":         "output_norm.weight",
-	}
-
-	tMap := map[string]string{
-		"model.layers.(\\d+).input_layernorm.weight":          "blk.$1.attn_norm.weight",
-		"model.layers.(\\d+).mlp.down_proj.weight":            "blk.$1.ffn_down.weight",
-		"model.layers.(\\d+).mlp.gate_proj.weight":            "blk.$1.ffn_gate.weight",
-		"model.layers.(\\d+).mlp.up_proj.weight":              "blk.$1.ffn_up.weight",
-		"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
-		"model.layers.(\\d+).self_attn.k_proj.weight":         "blk.$1.attn_k.weight",
-		"model.layers.(\\d+).self_attn.o_proj.weight":         "blk.$1.attn_output.weight",
-		"model.layers.(\\d+).self_attn.q_proj.weight":         "blk.$1.attn_q.weight",
-		"model.layers.(\\d+).self_attn.v_proj.weight":         "blk.$1.attn_v.weight",
-	}
-
-	v, ok := directMap[n]
-	if ok {
-		return v, nil
-	}
-
-	// quick hack to rename the layers to gguf format
-	for k, v := range tMap {
-		re := regexp.MustCompile(k)
-		newName := re.ReplaceAllString(n, v)
-		if newName != n {
-			return newName, nil
-		}
-	}
-
-	return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
-}
-
-func (r safetensorWriterTo) WriteTo(w io.Writer) (n int64, err error) {
-	f, err := os.Open(r.filename)
-	if err != nil {
-		return 0, err
-	}
-	defer f.Close()
-
-	if _, err = f.Seek(int64(r.padding+r.start), 0); err != nil {
-		return 0, err
-	}
-
-	// use the handler if one is present
-	if r.handler != nil {
-		return 0, r.handler(w, r, f)
-	}
-
-	remaining := r.end - r.start
-
-	bufSize := uint64(10240)
-	var finished bool
-	for {
-		data := make([]byte, min(bufSize, remaining))
-
-		b, err := io.ReadFull(f, data)
-		remaining -= uint64(b)
-
-		if err == io.EOF || remaining <= 0 {
-			finished = true
-		} else if err != nil {
-			return 0, err
-		}
-
-		// convert bfloat16 -> ieee float32
-		tDataF32 := bfloat16.DecodeFloat32(data)
-
-		switch r.t.Kind {
-		case 0:
-			if err := binary.Write(w, r.bo, tDataF32); err != nil {
-				return 0, err
-			}
-		case 1:
-			// convert float32 -> float16
-			tempBuf := make([]uint16, len(data)/2)
-			for cnt, v := range tDataF32 {
-				tDataF16 := float16.Fromfloat32(v)
-				tempBuf[cnt] = uint16(tDataF16)
-			}
-			if err := binary.Write(w, r.bo, tempBuf); err != nil {
-				return 0, err
-			}
-		}
-		if finished {
-			break
-		}
-	}
-	return 0, nil
-}
-
-func (m *SafetensorFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) {
-	switch len(params.Architectures) {
-	case 0:
-		return nil, fmt.Errorf("No architecture specified to convert")
-	case 1:
-		switch params.Architectures[0] {
-		case "MistralForCausalLM":
-			return &MistralModel{
-				ModelData{
-					Name:   name,
-					Path:   dirPath,
-					Params: params,
-					Format: m,
-				},
-			}, nil
-		case "GemmaForCausalLM":
-			return &GemmaModel{
-				ModelData{
-					Name:   name,
-					Path:   dirPath,
-					Params: params,
-					Format: m,
-				},
-			}, nil
-		default:
-			return nil, fmt.Errorf("Models based on '%s' are not yet supported", params.Architectures[0])
-		}
-	}
-
-	return nil, fmt.Errorf("Unknown error")
-}
--- a/convert/torch.go
+++ b/convert/torch.go
@@ -1,286 +0,0 @@
-package convert
-
-import (
-	"encoding/binary"
-	"encoding/json"
-	"fmt"
-	"io"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"regexp"
-	"strings"
-
-	"github.com/nlpodyssey/gopickle/pytorch"
-	"github.com/nlpodyssey/gopickle/types"
-	"github.com/x448/float16"
-
-	"github.com/ollama/ollama/llm"
-)
-
-type torchWriterTo struct {
-	t *llm.Tensor
-
-	params *Params
-	bo     ByteOrder
-
-	storage pytorch.StorageInterface
-	handler func(w io.Writer, r torchWriterTo) error
-}
-
-type TorchFormat struct{}
-
-func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, error) {
-	slog.Debug("getting torch tensors")
-
-	files, err := filepath.Glob(filepath.Join(dirpath, "pytorch_model-*.bin"))
-	if err != nil {
-		slog.Error("didn't find any torch files")
-		return nil, err
-	}
-
-	var offset uint64
-
-	var tensors []llm.Tensor
-	for _, fn := range files {
-		m, err := pytorch.Load(fn)
-		if err != nil {
-			slog.Error(fmt.Sprintf("error unpickling: %q", err))
-			return []llm.Tensor{}, err
-		}
-
-		for _, k := range m.(*types.Dict).Keys() {
-			if strings.HasSuffix(k.(string), "self_attn.rotary_emb.inv_freq") {
-				continue
-			}
-
-			t, _ := m.(*types.Dict).Get(k)
-			tshape := t.(*pytorch.Tensor).Size
-
-			var size uint64
-			var kind uint32
-			switch len(tshape) {
-			case 0:
-				continue
-			case 1:
-				// convert to float32
-				kind = 0
-				size = uint64(tshape[0] * 4)
-			case 2:
-				// convert to float16
-				kind = 1
-				size = uint64(tshape[0] * tshape[1] * 2)
-			}
-
-			ggufName, err := tf.GetLayerName(k.(string))
-			if err != nil {
-				slog.Error("%v", err)
-				return nil, err
-			}
-			slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", k.(string), ggufName))
-
-			shape := []uint64{0, 0, 0, 0}
-			for i := range tshape {
-				shape[i] = uint64(tshape[i])
-			}
-
-			tensor := llm.Tensor{
-				Name:   ggufName,
-				Kind:   kind,
-				Offset: offset, // calculate the offset
-				Shape:  shape[:],
-			}
-
-			tensor.WriterTo = torchWriterTo{
-				t:       &tensor,
-				params:  params,
-				bo:      params.ByteOrder,
-				storage: t.(*pytorch.Tensor).Source,
-			}
-
-			tensors = append(tensors, tensor)
-			offset += size
-		}
-	}
-
-	return tensors, nil
-
-}
-
-func getAltParams(dirpath string) (*Params, error) {
-	f, err := os.Open(filepath.Join(dirpath, "params.json"))
-	if err != nil {
-		slog.Error("no params.json")
-		return nil, err
-	}
-	defer f.Close()
-
-	type TorchParams struct {
-		HiddenSize     int     `json:"dim"`
-		AttentionHeads int     `json:"n_heads"`
-		KeyValHeads    int     `json:"n_kv_heads"`
-		HiddenLayers   int     `json:"n_layers"`
-		RopeTheta      int     `json:"rope_theta"`
-		NormEPS        float64 `json:"norm_eps"`
-	}
-
-	var tparams TorchParams
-
-	d := json.NewDecoder(f)
-	err = d.Decode(&tparams)
-	if err != nil {
-		return nil, err
-	}
-
-	params := &Params{
-		HiddenSize:     tparams.HiddenSize,
-		AttentionHeads: tparams.AttentionHeads,
-		KeyValHeads:    tparams.KeyValHeads,
-		HiddenLayers:   tparams.HiddenLayers,
-		NormEPS:        tparams.NormEPS,
-	}
-
-	switch {
-	case tparams.RopeTheta == 1000000:
-		// Codellama
-		params.ContextSize = 16384
-	case tparams.NormEPS == 1e-06:
-		// llama2
-		slog.Debug("Found llama2 - setting context size to 4096")
-		params.ContextSize = 4096
-	default:
-		params.ContextSize = 2048
-	}
-
-	params.ByteOrder = binary.LittleEndian
-	return params, nil
-}
-
-func (m *TorchFormat) GetParams(dirpath string) (*Params, error) {
-	f, err := os.Open(filepath.Join(dirpath, "config.json"))
-	if err != nil {
-		if os.IsNotExist(err) {
-			// try params.json instead
-			return getAltParams(dirpath)
-		} else {
-			return nil, err
-		}
-	}
-
-	var params Params
-	d := json.NewDecoder(f)
-	err = d.Decode(&params)
-	if err != nil {
-		return nil, err
-	}
-
-	params.ByteOrder = binary.LittleEndian
-	return &params, nil
-}
-
-func (m *TorchFormat) GetLayerName(n string) (string, error) {
-	directMap := map[string]string{
-		"tok_embeddings.weight":     "token_embd.weight",
-		"output.weight":             "output.weight",
-		"norm.weight":               "output_norm.weight",
-		"rope.freqs":                "rope_freqs.weight",
-		"model.embed_tokens.weight": "token_embd.weight",
-		"lm_head.weight":            "output.weight",
-		"model.norm.weight":         "output_norm.weight",
-	}
-
-	lMap := map[string]string{
-		"layers.(\\d+).attention_norm.weight":                 "blk.$1.attn_norm.weight",
-		"layers.(\\d+).attention_output_norm.weight":          "blk.$1.attn_norm.weight",
-		"layers.(\\d+).feed_forward.w2.weight":                "blk.$1.ffn_down.weight",
-		"layers.(\\d+).feed_forward.w1.weight":                "blk.$1.ffn_gate.weight",
-		"layers.(\\d+).feed_forward.w3.weight":                "blk.$1.ffn_up.weight",
-		"layers.(\\d+).ffn_norm.weight":                       "blk.$1.ffn_norm.weight",
-		"layers.(\\d+).attention.wk.weight":                   "blk.$1.attn_k.weight",
-		"layers.(\\d+).attention.wo.weight":                   "blk.$1.attn_output.weight",
-		"layers.(\\d+).attention.wq.weight":                   "blk.$1.attn_q.weight",
-		"layers.(\\d+).attention.wv.weight":                   "blk.$1.attn_v.weight",
-		"model.layers.(\\d+).input_layernorm.weight":          "blk.$1.attn_norm.weight",
-		"model.layers.(\\d+).mlp.down_proj.weight":            "blk.$1.ffn_down.weight",
-		"model.layers.(\\d+).mlp.gate_proj.weight":            "blk.$1.ffn_gate.weight",
-		"model.layers.(\\d+).mlp.up_proj.weight":              "blk.$1.ffn_up.weight",
-		"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
-		"model.layers.(\\d+).self_attn.k_proj.weight":         "blk.$1.attn_k.weight",
-		"model.layers.(\\d+).self_attn.o_proj.weight":         "blk.$1.attn_output.weight",
-		"model.layers.(\\d+).self_attn.q_proj.weight":         "blk.$1.attn_q.weight",
-		"model.layers.(\\d+).self_attn.v_proj.weight":         "blk.$1.attn_v.weight",
-	}
-
-	v, ok := directMap[n]
-	if ok {
-		return v, nil
-	}
-
-	// quick hack to rename the layers to gguf format
-	for k, v := range lMap {
-		re := regexp.MustCompile(k)
-		newName := re.ReplaceAllString(n, v)
-		if newName != n {
-			return newName, nil
-		}
-	}
-
-	return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
-}
-
-func (r torchWriterTo) WriteTo(w io.Writer) (n int64, err error) {
-	// use the handler if one is present
-	if r.handler != nil {
-		return 0, r.handler(w, r)
-	}
-
-	switch r.storage.(type) {
-	case *pytorch.FloatStorage:
-		slog.Warn(fmt.Sprintf("unexpected storage found for layer '%s'; skipping", r.t.Name))
-		return 0, nil
-	case *pytorch.HalfStorage:
-		switch r.t.Kind {
-		case 0:
-			data := r.storage.(*pytorch.HalfStorage).Data
-			slog.Debug(fmt.Sprintf("%35s F32 (%d)", r.t.Name, len(data)))
-			if err := binary.Write(w, r.bo, data); err != nil {
-				return 0, err
-			}
-		case 1:
-			data := r.storage.(*pytorch.HalfStorage).Data
-			tData := make([]uint16, len(data))
-			for cnt, v := range data {
-				tData[cnt] = uint16(float16.Fromfloat32(v))
-			}
-			slog.Debug(fmt.Sprintf("%35s F16 (%d)", r.t.Name, len(tData)))
-			if err := binary.Write(w, r.bo, tData); err != nil {
-				return 0, err
-			}
-		}
-	}
-
-	return 0, nil
-}
-
-func (m *TorchFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) {
-	switch len(params.Architectures) {
-	case 0:
-		return nil, fmt.Errorf("No architecture specified to convert")
-	case 1:
-		switch params.Architectures[0] {
-		case "LlamaForCausalLM":
-			return &LlamaModel{
-				ModelData{
-					Name:   name,
-					Path:   dirPath,
-					Params: params,
-					Format: m,
-				},
-			}, nil
-		default:
-			return nil, fmt.Errorf("Models based on '%s' are not yet supported", params.Architectures[0])
-		}
-	}
-
-	return nil, fmt.Errorf("Unknown error")
-}
--- a/docs/api.md
+++ b/docs/api.md
@@ -394,6 +394,7 @@ Advanced parameters (optional):

 - `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
+- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)

--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -139,6 +139,9 @@ PARAMETER <parameter> <parametervalue>
 | mirostat_eta   | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1)                        | float      | mirostat_eta 0.1     |
 | mirostat_tau   | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0)                                                                                                         | float      | mirostat_tau 5.0     |
 | num_ctx        | Sets the size of the context window used to generate the next token. (Default: 2048)                                                                                                                                                                    | int        | num_ctx 4096         |
+| num_gqa        | The number of GQA groups in the transformer layer. Required for some models, for example it is 8 for llama2:70b                                                                                                                                         | int        | num_gqa 1            |
+| num_gpu        | The number of layers to send to the GPU(s). On macOS it defaults to 1 to enable metal support, 0 to disable.                                                                                                                                            | int        | num_gpu 50           |
+| num_thread     | Sets the number of threads to use during computation. By default, Ollama will detect this for optimal performance. It is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). | int        | num_thread 8         |
 | repeat_last_n  | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)                                                                                                                                           | int        | repeat_last_n 64     |
 | repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)                                                                     | float      | repeat_penalty 1.1   |
 | temperature    | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)                                                                                                                                     | float      | temperature 0.7      |
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -76,10 +76,3 @@ install script which version to install.
 ```sh
 curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
 ```
-
-## Linux tmp noexec 
-
-If your system is configured with the "noexec" flag where Ollama stores its
-temporary executable files, you can specify an alternate location by setting
-OLLAMA_TMPDIR to a location writable by the user ollama runs as.  For example
-OLLAMA_TMPDIR=/usr/share/ollama/
--- a/docs/tutorials/langchainjs.md
+++ b/docs/tutorials/langchainjs.md
@@ -18,7 +18,7 @@ const ollama = new Ollama({
  model: "llama2",
 });

-const answer = await ollama.invoke(`why is the sky blue?`);
+const answer = await ollama.call(`why is the sky blue?`);

 console.log(answer);
 ```
--- a/examples/go-chat/main.go
+++ b/examples/go-chat/main.go
@@ -1,51 +0,0 @@
-package main
-
-import (
-	"context"
-	"fmt"
-	"log"
-
-	"github.com/ollama/ollama/api"
-)
-
-func main() {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	messages := []api.Message{
-		api.Message{
-			Role:    "system",
-			Content: "Provide very brief, concise responses",
-		},
-		api.Message{
-			Role:    "user",
-			Content: "Name some unusual animals",
-		},
-		api.Message{
-			Role:    "assistant",
-			Content: "Monotreme, platypus, echidna",
-		},
-		api.Message{
-			Role:    "user",
-			Content: "which of these is the most dangerous?",
-		},
-	}
-
-	ctx := context.Background()
-	req := &api.ChatRequest{
-		Model:    "llama2",
-		Messages: messages,
-	}
-
-	respFunc := func(resp api.ChatResponse) error {
-		fmt.Print(resp.Message.Content)
-		return nil
-	}
-
-	err = client.Chat(ctx, req, respFunc)
-	if err != nil {
-		log.Fatal(err)
-	}
-}
--- a/examples/go-generate-streaming/main.go
+++ b/examples/go-generate-streaming/main.go
@@ -1,40 +0,0 @@
-package main
-
-import (
-	"context"
-	"fmt"
-	"log"
-
-	"github.com/ollama/ollama/api"
-)
-
-func main() {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	// By default, GenerateRequest is streaming.
-	req := &api.GenerateRequest{
-		Model:  "gemma",
-		Prompt: "how many planets are there?",
-	}
-
-	ctx := context.Background()
-	respFunc := func(resp api.GenerateResponse) error {
-		// Only print the response here; GenerateResponse has a number of other
-		// interesting fields you want to examine.
-
-		// In streaming mode, responses are partial so we call fmt.Print (and not
-		// Println) in order to avoid spurious newlines being introduced. The
-		// model will insert its own newlines if it wants.
-		fmt.Print(resp.Response)
-		return nil
-	}
-
-	err = client.Generate(ctx, req, respFunc)
-	if err != nil {
-		log.Fatal(err)
-	}
-	fmt.Println()
-}
--- a/examples/go-generate/main.go
+++ b/examples/go-generate/main.go
@@ -1,37 +0,0 @@
-package main
-
-import (
-	"context"
-	"fmt"
-	"log"
-
-	"github.com/ollama/ollama/api"
-)
-
-func main() {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	req := &api.GenerateRequest{
-		Model:  "gemma",
-		Prompt: "how many planets are there?",
-
-		// set streaming to false
-		Stream: new(bool),
-	}
-
-	ctx := context.Background()
-	respFunc := func(resp api.GenerateResponse) error {
-		// Only print the response here; GenerateResponse has a number of other
-		// interesting fields you want to examine.
-		fmt.Println(resp.Response)
-		return nil
-	}
-
-	err = client.Generate(ctx, req, respFunc)
-	if err != nil {
-		log.Fatal(err)
-	}
-}
--- a/examples/go-multimodal/main.go
+++ b/examples/go-multimodal/main.go
@@ -1,47 +0,0 @@
-package main
-
-import (
-	"context"
-	"fmt"
-	"log"
-	"os"
-
-	"github.com/ollama/ollama/api"
-)
-
-func main() {
-	if len(os.Args) <= 1 {
-		log.Fatal("usage: <image name>")
-	}
-
-	imgData, err := os.ReadFile(os.Args[1])
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	req := &api.GenerateRequest{
-		Model:  "llava",
-		Prompt: "describe this image",
-		Images: []api.ImageData{imgData},
-	}
-
-	ctx := context.Background()
-	respFunc := func(resp api.GenerateResponse) error {
-		// In streaming mode, responses are partial so we call fmt.Print (and not
-		// Println) in order to avoid spurious newlines being introduced. The
-		// model will insert its own newlines if it wants.
-		fmt.Print(resp.Response)
-		return nil
-	}
-
-	err = client.Generate(ctx, req, respFunc)
-	if err != nil {
-		log.Fatal(err)
-	}
-	fmt.Println()
-}
--- a/examples/go-pull-progress/main.go
+++ b/examples/go-pull-progress/main.go
@@ -1,31 +0,0 @@
-package main
-
-import (
-	"context"
-	"fmt"
-	"log"
-
-	"github.com/ollama/ollama/api"
-)
-
-func main() {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	ctx := context.Background()
-
-	req := &api.PullRequest{
-		Model: "mistral",
-	}
-	progressFunc := func(resp api.ProgressResponse) error {
-		fmt.Printf("Progress: status=%v, total=%v, completed=%v\n", resp.Status, resp.Total, resp.Completed)
-		return nil
-	}
-
-	err = client.Pull(ctx, req, progressFunc)
-	if err != nil {
-		log.Fatal(err)
-	}
-}
--- a/examples/golang-simplegenerate/README.md
+++ b/examples/golang-simplegenerate/README.md
--- a/examples/golang-simplegenerate/main.go
+++ b/examples/golang-simplegenerate/main.go
--- a/format/bytes.go
+++ b/format/bytes.go
@@ -6,15 +6,11 @@ import (
 )

 const (
-	Byte = 1
-
+	Byte     = 1
 	KiloByte = Byte * 1000
 	MegaByte = KiloByte * 1000
 	GigaByte = MegaByte * 1000
 	TeraByte = GigaByte * 1000
-
-	KibiByte = Byte * 1024
-	MebiByte = KibiByte * 1024
 )

 func HumanBytes(b int64) string {
@@ -49,14 +45,3 @@ func HumanBytes(b int64) string {
 		return fmt.Sprintf("%d %s", int(value), unit)
 	}
 }
-
-func HumanBytes2(b uint64) string {
-	switch {
-	case b >= MebiByte:
-		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
-	case b >= KibiByte:
-		return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
-	default:
-		return fmt.Sprintf("%d B", b)
-	}
-}
--- a/go.mod
+++ b/go.mod
@@ -9,7 +9,7 @@ require (
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/emirpasic/gods v1.18.1
 	github.com/gin-gonic/gin v1.9.1
-	github.com/golang/protobuf v1.5.0 // indirect
+	github.com/golang/protobuf v1.5.0
 	github.com/google/uuid v1.0.0
 	github.com/mitchellh/mapstructure v1.5.0
 	github.com/olekukonko/tablewriter v0.0.5
@@ -19,10 +19,7 @@ require (
 	golang.org/x/sync v0.3.0
 )

-require (
-	github.com/nlpodyssey/gopickle v0.3.0
-	github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9
-)
+require github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9

 require (
 	github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc // indirect
@@ -71,7 +68,7 @@ require (
 	golang.org/x/net v0.17.0 // indirect
 	golang.org/x/sys v0.13.0
 	golang.org/x/term v0.13.0
-	golang.org/x/text v0.14.0 // indirect
+	golang.org/x/text v0.13.0 // indirect
 	google.golang.org/protobuf v1.30.0
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -122,8 +122,6 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
-github.com/nlpodyssey/gopickle v0.3.0 h1:BLUE5gxFLyyNOPzlXxt6GoHEMMxD0qhsE4p0CIQyoLw=
-github.com/nlpodyssey/gopickle v0.3.0/go.mod h1:f070HJ/yR+eLi5WmM1OXJEGaTpuJEUiib19olXgYha0=
 github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
 github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
 github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9 h1:DV4iXjNn6fGeDl1AkZ1I0QB/0DBjrc7kPpxHrmuDzW4=
@@ -238,8 +236,8 @@ golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
-golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
 golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -100,8 +100,6 @@ func AMDGetGPUInfo(resp *GpuInfo) {
 		return
 	}

-	updateLibPath(libDir)
-
 	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
 	if gfxOverride == "" {
 		supported, err := GetSupportedGFX(libDir)
@@ -115,7 +113,7 @@ func AMDGetGPUInfo(resp *GpuInfo) {
 			if !slices.Contains[[]string, string](supported, v.ToGFXString()) {
 				slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, v.ToGFXString(), libDir, supported))
 				// TODO - consider discrete markdown just for ROCM troubleshooting?
-				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
+				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
 				skip[i] = struct{}{}
 			} else {
 				slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, v.ToGFXString()))
@@ -145,21 +143,6 @@ func AMDGetGPUInfo(resp *GpuInfo) {
 	}
 }

-func updateLibPath(libDir string) {
-	ldPaths := []string{}
-	if val, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
-		ldPaths = strings.Split(val, ":")
-	}
-	for _, d := range ldPaths {
-		if d == libDir {
-			return
-		}
-	}
-	val := strings.Join(append(ldPaths, libDir), ":")
-	slog.Debug("updated lib path", "LD_LIBRARY_PATH", val)
-	os.Setenv("LD_LIBRARY_PATH", val)
-}
-
 // Walk the sysfs nodes for the available GPUs and gather information from them
 // skipping over any devices in the skip map
 func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
--- a/gpu/assets.go
+++ b/gpu/assets.go
@@ -11,7 +11,6 @@ import (
 	"strings"
 	"sync"
 	"syscall"
-	"time"
 )

 var (
@@ -22,20 +21,11 @@ var (
 func PayloadsDir() (string, error) {
 	lock.Lock()
 	defer lock.Unlock()
-	var err error
 	if payloadsDir == "" {
 		cleanupTmpDirs()
-		tmpDir := os.Getenv("OLLAMA_TMPDIR")
-		if tmpDir == "" {
-			tmpDir, err = os.MkdirTemp("", "ollama")
-			if err != nil {
-				return "", fmt.Errorf("failed to generate tmp dir: %w", err)
-			}
-		} else {
-			err = os.MkdirAll(tmpDir, 0755)
-			if err != nil {
-				return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
-			}
+		tmpDir, err := os.MkdirTemp("", "ollama")
+		if err != nil {
+			return "", fmt.Errorf("failed to generate tmp dir: %w", err)
 		}

 		// Track our pid so we can clean up orphaned tmpdirs
@@ -94,12 +84,7 @@ func Cleanup() {
 		slog.Debug("cleaning up", "dir", tmpDir)
 		err := os.RemoveAll(tmpDir)
 		if err != nil {
-			// On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
-			time.Sleep(1000 * time.Millisecond)
-			err = os.RemoveAll(tmpDir)
-			if err != nil {
-				slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
-			}
+			slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
 		}
 	}
 }
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -20,8 +20,6 @@ import (
 	"strings"
 	"sync"
 	"unsafe"
-
-	"github.com/ollama/ollama/format"
 )

 type handles struct {
@@ -29,12 +27,8 @@ type handles struct {
 	cudart *C.cudart_handle_t
 }

-const (
-	cudaMinimumMemory = 457 * format.MebiByte
-	rocmMinimumMemory = 457 * format.MebiByte
-)
-
 var gpuMutex sync.Mutex
+var gpuHandles *handles = nil

 // With our current CUDA compile flags, older than 5.0 will not work properly
 var CudaComputeMin = [2]C.int{5, 0}
@@ -84,11 +78,11 @@ var CudartWindowsGlobs = []string{
 var CudaTegra string = os.Getenv("JETSON_JETPACK")

 // Note: gpuMutex must already be held
-func initGPUHandles() *handles {
+func initGPUHandles() {

 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing

-	gpuHandles := &handles{nil, nil}
+	gpuHandles = &handles{nil, nil}
 	var nvmlMgmtName string
 	var nvmlMgmtPatterns []string
 	var cudartMgmtName string
@@ -115,7 +109,7 @@ func initGPUHandles() *handles {
 		}
 		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
 	default:
-		return gpuHandles
+		return
 	}

 	slog.Info("Detecting GPU type")
@@ -125,7 +119,7 @@ func initGPUHandles() *handles {
 		if cudart != nil {
 			slog.Info("Nvidia GPU detected via cudart")
 			gpuHandles.cudart = cudart
-			return gpuHandles
+			return
 		}
 	}

@@ -136,10 +130,10 @@ func initGPUHandles() *handles {
 		if nvml != nil {
 			slog.Info("Nvidia GPU detected via nvidia-ml")
 			gpuHandles.nvml = nvml
-			return gpuHandles
+			return
 		}
 	}
-	return gpuHandles
+
 }

 func GetGPUInfo() GpuInfo {
@@ -147,16 +141,9 @@ func GetGPUInfo() GpuInfo {
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
 	gpuMutex.Lock()
 	defer gpuMutex.Unlock()
-
-	gpuHandles := initGPUHandles()
-	defer func() {
-		if gpuHandles.nvml != nil {
-			C.nvml_release(*gpuHandles.nvml)
-		}
-		if gpuHandles.cudart != nil {
-			C.cudart_release(*gpuHandles.cudart)
-		}
-	}()
+	if gpuHandles == nil {
+		initGPUHandles()
+	}

 	// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
 	cpuVariant := GetCPUVariant()
@@ -181,7 +168,6 @@ func GetGPUInfo() GpuInfo {
 			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 				slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 				resp.Library = "cuda"
-				resp.MinimumMemory = cudaMinimumMemory
 			} else {
 				slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
@@ -201,7 +187,6 @@ func GetGPUInfo() GpuInfo {
 			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 				slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 				resp.Library = "cuda"
-				resp.MinimumMemory = cudaMinimumMemory
 			} else {
 				slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
@@ -209,7 +194,6 @@ func GetGPUInfo() GpuInfo {
 	} else {
 		AMDGetGPUInfo(&resp)
 		if resp.Library != "" {
-			resp.MinimumMemory = rocmMinimumMemory
 			return resp
 		}
 	}
@@ -243,7 +227,7 @@ func getCPUMem() (memInfo, error) {
 	return ret, nil
 }

-func CheckVRAM() (uint64, error) {
+func CheckVRAM() (int64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -251,11 +235,24 @@ func CheckVRAM() (uint64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return uint64(avail), nil
+		return avail, nil
 	}
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		return gpuInfo.FreeMemory, nil
+		// Leave 10% of free VRAM, or 1024MiB per GPU if that is larger, to handle unaccounted for overhead
+		overhead := gpuInfo.FreeMemory / 10
+		if minReserve := uint64(gpuInfo.DeviceCount) * 1024 * 1024 * 1024; overhead < minReserve {
+			overhead = minReserve
+		}
+		// Tegras get the full reported free memory (OS controlled caching); otherwise cap the reserve at what is free so the unsigned subtraction below cannot wrap
+		if CudaTegra != "" {
+			overhead = 0
+		} else if overhead > gpuInfo.FreeMemory {
+			overhead = gpuInfo.FreeMemory
+		}
+		avail := int64(gpuInfo.FreeMemory - overhead)
+		slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
+		return avail, nil
 	}

 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -17,7 +17,7 @@ import (
 )

 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
-func CheckVRAM() (uint64, error) {
+func CheckVRAM() (int64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -25,15 +25,15 @@ func CheckVRAM() (uint64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return uint64(avail), nil
+		return avail, nil
 	}

 	if runtime.GOARCH == "amd64" {
 		// gpu not supported, this may not be metal
 		return 0, nil
 	}
-
-	return uint64(C.getRecommendedMaxVRAM()), nil
+	recommendedMaxVRAM := int64(C.getRecommendedMaxVRAM())
+	return recommendedMaxVRAM, nil
 }

 func GetGPUInfo() GpuInfo {
@@ -53,7 +53,7 @@ func GetGPUInfo() GpuInfo {

 func getCPUMem() (memInfo, error) {
 	return memInfo{
-		TotalMemory: uint64(C.getPhysicalMemory()),
+		TotalMemory: 0,
 		FreeMemory:  0,
 		DeviceCount: 0,
 	}, nil
--- a/gpu/gpu_info_cudart.c
+++ b/gpu/gpu_info_cudart.c
@@ -62,10 +62,6 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
    LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
-    if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
-      resp->err = strdup("your nvidia driver is too old or missing, please upgrade to run ollama");
-      return;
-    }
    snprintf(buf, buflen, "cudart init failure: %d", ret);
    resp->err = strdup(buf);
    return;
@@ -191,10 +187,4 @@ void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *r
  }
 }

-void cudart_release(cudart_handle_t h) {
-  LOG(h.verbose, "releasing cudart library\n");
-  UNLOAD_LIBRARY(h.handle);
-  h.handle = NULL;
-}
-
 #endif  // __APPLE__
--- a/gpu/gpu_info_cudart.h
+++ b/gpu/gpu_info_cudart.h
@@ -7,7 +7,6 @@
 typedef enum cudartReturn_enum {
  CUDART_SUCCESS = 0,
  CUDART_UNSUPPORTED = 1,
-  CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
  // Other values omitted for now...
 } cudartReturn_t;

@@ -55,7 +54,6 @@ typedef struct cudart_compute_capability {
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
 void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
 void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);
-void cudart_release(cudart_handle_t ch);

 #endif  // __GPU_INFO_CUDART_H__
 #endif  // __APPLE__
--- a/gpu/gpu_info_darwin.h
+++ b/gpu/gpu_info_darwin.h
@@ -1,4 +1,3 @@
 #import <Metal/Metal.h>
 #include <stdint.h>
 uint64_t getRecommendedMaxVRAM();
-uint64_t getPhysicalMemory();
--- a/gpu/gpu_info_darwin.m
+++ b/gpu/gpu_info_darwin.m
@@ -1,13 +1,11 @@
-// go:build darwin
+//go:build darwin
 #include "gpu_info_darwin.h"

-uint64_t getRecommendedMaxVRAM() {
-  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-  uint64_t result = device.recommendedMaxWorkingSetSize;
-  CFRelease(device);
-  return result;
+uint64_t getRecommendedMaxVRAM()
+{
+	id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+	uint64_t result = device.recommendedMaxWorkingSetSize;
+	CFRelease(device);
+	return result;
 }

-uint64_t getPhysicalMemory() {
-  return [[NSProcessInfo processInfo] physicalMemory];
-}
--- a/gpu/gpu_info_nvml.c
+++ b/gpu/gpu_info_nvml.c
@@ -211,11 +211,4 @@ void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
    }
  }
 }
-
-void nvml_release(nvml_handle_t h) {
-  LOG(h.verbose, "releasing nvml library\n");
-  UNLOAD_LIBRARY(h.handle);
-  h.handle = NULL;
-}
-
 #endif  // __APPLE__
--- a/gpu/gpu_info_nvml.h
+++ b/gpu/gpu_info_nvml.h
@@ -51,7 +51,6 @@ typedef struct nvml_compute_capability {
 void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
 void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
 void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
-void nvml_release(nvml_handle_t ch);

 #endif  // __GPU_INFO_NVML_H__
 #endif  // __APPLE__
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -14,9 +14,6 @@ type GpuInfo struct {
 	// Optional variant to select (e.g. versions, cpu feature flags)
 	Variant string `json:"variant,omitempty"`

-	// MinimumMemory represents the minimum memory required to use the GPU
-	MinimumMemory uint64 `json:"-"`
-
 	// TODO add other useful attributes about the card here for discovery information
 }

--- a/integration/basic_test.go
+++ b/integration/basic_test.go
@@ -24,5 +24,5 @@ func TestOrcaMiniBlueSky(t *testing.T) {
 			"seed":        123,
 		},
 	}
-	GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"rayleigh", "scattering"})
+	GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"rayleigh"})
 }
--- a/integration/context_test.go
+++ b/integration/context_test.go
@@ -1,29 +0,0 @@
-//go:build integration
-
-package integration
-
-import (
-	"context"
-	"net/http"
-	"testing"
-	"time"
-
-	"github.com/ollama/ollama/api"
-)
-
-func TestContextExhaustion(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
-	defer cancel()
-	// Set up the test data
-	req := api.GenerateRequest{
-		Model:  "llama2",
-		Prompt: "Write me a story with a ton of emojis?",
-		Stream: &stream,
-		Options: map[string]interface{}{
-			"temperature": 0,
-			"seed":        123,
-			"num_ctx":     128,
-		},
-	}
-	GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"once", "upon", "lived"})
-}
--- a/integration/llm_test.go
+++ b/integration/llm_test.go
@@ -15,6 +15,10 @@ import (
 // TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server
 //        package to avoid circular dependencies

+// WARNING - these tests will fail on mac if you don't manually copy ggml-metal.metal to this dir (./server)
+//
+// TODO - Fix this ^^
+
 var (
 	stream = false
 	req    = [2]api.GenerateRequest{
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -126,7 +126,7 @@ func StartServer(ctx context.Context, ollamaHost string) error {
 }

 func PullIfMissing(ctx context.Context, client *http.Client, scheme, testEndpoint, modelName string) error {
-	slog.Info("checking status of model", "model", modelName)
+	slog.Debug("checking status of model", "model", modelName)
 	showReq := &api.ShowRequest{Name: modelName}
 	requestJSON, err := json.Marshal(showReq)
 	if err != nil {
@@ -174,51 +174,36 @@ func PullIfMissing(ctx context.Context, client *http.Client, scheme, testEndpoin
 	return nil
 }

-var serverProcMutex sync.Mutex
-
 func GenerateTestHelper(ctx context.Context, t *testing.T, client *http.Client, genReq api.GenerateRequest, anyResp []string) {
-
-	// TODO maybe stuff in an init routine?
-	lifecycle.InitLogging()
-
 	requestJSON, err := json.Marshal(genReq)
 	if err != nil {
 		t.Fatalf("Error serializing request: %v", err)
 	}
 	defer func() {
-		if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
-			defer serverProcMutex.Unlock()
-			if t.Failed() {
-				fp, err := os.Open(lifecycle.ServerLogFile)
-				if err != nil {
-					slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err)
-					return
-				}
-				data, err := io.ReadAll(fp)
-				if err != nil {
-					slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err)
-					return
-				}
-				slog.Warn("SERVER LOG FOLLOWS")
-				os.Stderr.Write(data)
-				slog.Warn("END OF SERVER")
+		if t.Failed() && os.Getenv("OLLAMA_TEST_EXISTING") == "" {
+			// TODO
+			fp, err := os.Open(lifecycle.ServerLogFile)
+			if err != nil {
+				slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err)
+				return
 			}
-			err = os.Remove(lifecycle.ServerLogFile)
-			if err != nil && !os.IsNotExist(err) {
-				slog.Warn("failed to cleanup", "logfile", lifecycle.ServerLogFile, "error", err)
+			data, err := io.ReadAll(fp)
+			if err != nil {
+				slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err)
+				return
 			}
+			slog.Warn("SERVER LOG FOLLOWS")
+			os.Stderr.Write(data)
+			slog.Warn("END OF SERVER")
+		}
+		err = os.Remove(lifecycle.ServerLogFile)
+		if err != nil && !os.IsNotExist(err) {
+			slog.Warn("failed to cleanup", "logfile", lifecycle.ServerLogFile, "error", err)
 		}
 	}()
 	scheme, testEndpoint := GetTestEndpoint()

 	if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
-		serverProcMutex.Lock()
-		fp, err := os.CreateTemp("", "ollama-server-*.log")
-		if err != nil {
-			t.Fatalf("failed to generate log file: %s", err)
-		}
-		lifecycle.ServerLogFile = fp.Name()
-		fp.Close()
 		assert.NoError(t, StartServer(ctx, testEndpoint))
 	}

--- a/llm/dyn_ext_server.c
+++ b/llm/dyn_ext_server.c
@@ -0,0 +1,142 @@
+#include "dyn_ext_server.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#ifdef __linux__
+#include <dlfcn.h>
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
+#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
+#define LOAD_ERR() strdup(dlerror())
+#define UNLOAD_LIBRARY(handle) dlclose(handle)
+#elif _WIN32
+#include <windows.h>
+#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
+#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
+#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
+#define LOAD_ERR() ({\
+  LPSTR messageBuffer = NULL; \
+  size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
+                                 NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
+  char *resp = strdup(messageBuffer); \
+  LocalFree(messageBuffer); \
+  resp; \
+})
+#else
+#include <dlfcn.h>
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
+#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
+#define LOAD_ERR() strdup(dlerror())
+#define UNLOAD_LIBRARY(handle) dlclose(handle)
+#endif
+
+// dyn_init loads libPath and binds each llama_server_* export into s; on failure err->id is -1 and err->msg says why.
+void dyn_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err) {
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"llama_server_init", (void *)&s->llama_server_init},
+      {"llama_server_start", (void *)&s->llama_server_start},
+      {"llama_server_stop", (void *)&s->llama_server_stop},
+      {"llama_server_completion", (void *)&s->llama_server_completion},
+      {"llama_server_completion_next_result",
+       (void *)&s->llama_server_completion_next_result},
+      {"llama_server_completion_cancel",
+       (void *)&s->llama_server_completion_cancel},
+      {"llama_server_release_task_result",
+       (void *)&s->llama_server_release_task_result},
+      {"llama_server_tokenize", (void *)&s->llama_server_tokenize},
+      {"llama_server_detokenize", (void *)&s->llama_server_detokenize},
+      {"llama_server_embedding", (void *)&s->llama_server_embedding},
+      {"llama_server_release_json_resp",
+       (void *)&s->llama_server_release_json_resp},
+      {"", NULL},
+  };
+
+  printf("loading library %s\n", libPath);
+  s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
+  if (!s->handle) {
+    err->id = -1;
+    char *msg = LOAD_ERR();
+    snprintf(err->msg, err->msg_len,
+             "Unable to load dynamic server library: %s", msg);
+    free(msg);
+    return;
+  }
+
+  for (int i = 0; l[i].p != NULL; i++) {
+    *l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
+    if (!*l[i].p) {  // test the resolved symbol; the slot address l[i].p is never NULL inside this loop
+      char *msg = LOAD_ERR();  // read the loader error before unloading, which may overwrite it
+      UNLOAD_LIBRARY(s->handle);
+      err->id = -1;
+      snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
+               l[i].s, msg);
+      free(msg);
+      return;
+    }
+  }
+}
+
+inline void dyn_llama_server_init(struct dynamic_llama_server s,
+                                           ext_server_params_t *sparams,
+                                           ext_server_resp_t *err) {
+  s.llama_server_init(sparams, err);
+}
+
+inline void dyn_llama_server_start(struct dynamic_llama_server s) {
+  s.llama_server_start();
+}
+
+inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
+  s.llama_server_stop();
+}
+
+inline void dyn_llama_server_completion(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 ext_server_resp_t *resp) {
+  s.llama_server_completion(json_req, resp);
+}
+
+inline void dyn_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
+    ext_server_task_result_t *result) {
+  s.llama_server_completion_next_result(task_id, result);
+}
+
+inline void dyn_llama_server_completion_cancel(
+    struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
+  s.llama_server_completion_cancel(task_id, err);
+}
+inline void dyn_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result) {
+  s.llama_server_release_task_result(result);
+}
+
+inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
+                                               const char *json_req,
+                                               char **json_resp,
+                                               ext_server_resp_t *err) {
+  s.llama_server_tokenize(json_req, json_resp, err);
+}
+
+inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 char **json_resp,
+                                                 ext_server_resp_t *err) {
+  s.llama_server_detokenize(json_req, json_resp, err);
+}
+
+inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
+                                                const char *json_req,
+                                                char **json_resp,
+                                                ext_server_resp_t *err) {
+  s.llama_server_embedding(json_req, json_resp, err);
+}
+
+inline void dyn_llama_server_release_json_resp(
+    struct dynamic_llama_server s, char **json_resp) {
+  s.llama_server_release_json_resp(json_resp);
+}
--- a/llm/dyn_ext_server.go
+++ b/llm/dyn_ext_server.go
@@ -0,0 +1,388 @@
+package llm
+
+/*
+#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
+#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
+#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
+#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations
+#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
+#cgo darwin CPPFLAGS:  -DGGML_USE_ACCELERATE
+#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
+#cgo darwin LDFLAGS: -lc++ -framework Accelerate
+#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+#cgo linux CFLAGS: -D_GNU_SOURCE
+#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
+#cgo linux windows LDFLAGS: -lpthread
+
+#include <stdlib.h>
+#include "dyn_ext_server.h"
+
+*/
+import "C"
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"time"
+	"unsafe"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/gpu"
+)
+
+type dynExtServer struct {
+	s       C.struct_dynamic_llama_server
+	options api.Options
+}
+
+// Note: current implementation does not support concurrent instantiations
+var mutex sync.Mutex
+
+func newExtServerResp(len C.size_t) C.ext_server_resp_t {
+	var resp C.ext_server_resp_t
+	resp.msg_len = len
+	bytes := make([]byte, len)
+	resp.msg = (*C.char)(C.CBytes(bytes))
+	return resp
+}
+
+func freeExtServerResp(resp C.ext_server_resp_t) {
+	if resp.msg_len == 0 {
+		return
+	}
+	C.free(unsafe.Pointer(resp.msg))
+}
+
+func extServerResponseToErr(resp C.ext_server_resp_t) error {
+	return fmt.Errorf("%s", C.GoString(resp.msg)) // msg is data from the C side, not a format string: a '%' in it must not be read as a verb
+}
+
+func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+	if !mutex.TryLock() {
+		slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
+		mutex.Lock()
+	}
+	gpu.UpdatePath(filepath.Dir(library))
+	libPath := C.CString(library)
+	defer C.free(unsafe.Pointer(libPath))
+	resp := newExtServerResp(512)
+	defer freeExtServerResp(resp)
+	var srv C.struct_dynamic_llama_server
+	C.dyn_init(libPath, &srv, &resp)
+	if resp.id < 0 {
+		mutex.Unlock()
+		return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
+	}
+	llm := dynExtServer{
+		s:       srv,
+		options: opts,
+	}
+	slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
+
+	var sparams C.ext_server_params_t
+	sparams.model = C.CString(model)
+	defer C.free(unsafe.Pointer(sparams.model))
+
+	sparams.embedding = true
+	sparams.n_ctx = C.uint(opts.NumCtx)
+	sparams.n_batch = C.uint(opts.NumBatch)
+	sparams.n_gpu_layers = C.int(opts.NumGPU)
+	sparams.main_gpu = C.int(opts.MainGPU)
+	sparams.n_parallel = 1 // TODO - wire up concurrency
+
+	// Always use the value encoded in the model
+	sparams.rope_freq_base = 0.0
+	sparams.rope_freq_scale = 0.0
+	sparams.memory_f16 = C.bool(opts.F16KV)
+	sparams.use_mlock = C.bool(opts.UseMLock)
+	sparams.use_mmap = C.bool(opts.UseMMap)
+
+	if opts.UseNUMA {
+		sparams.numa = C.int(1)
+	} else {
+		sparams.numa = C.int(0)
+	}
+
+	sparams.lora_adapters = nil
+	for i := 0; i < len(adapters); i++ {
+		la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
+		defer C.free(unsafe.Pointer(la))
+		la.adapter = C.CString(adapters[i])
+		defer C.free(unsafe.Pointer(la.adapter))
+		la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
+		la.next = nil
+		if i == 0 {
+			sparams.lora_adapters = la
+		} else {
+			tmp := sparams.lora_adapters
+			for ; tmp.next != nil; tmp = tmp.next {
+			}
+			tmp.next = la
+		}
+	}
+
+	if len(projectors) > 0 {
+		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
+		sparams.mmproj = C.CString(projectors[0])
+		defer C.free(unsafe.Pointer(sparams.mmproj))
+	} else {
+		sparams.mmproj = nil
+	}
+
+	sparams.n_threads = C.uint(opts.NumThread)
+
+	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+		sparams.verbose_logging = C.bool(true)
+	} else {
+		sparams.verbose_logging = C.bool(false)
+	}
+
+	slog.Info("Initializing llama server")
+	slog.Debug(fmt.Sprintf("server params: %+v", sparams))
+	initResp := newExtServerResp(512)
+	defer freeExtServerResp(initResp)
+	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
+	if initResp.id < 0 {
+		mutex.Unlock()
+		err := extServerResponseToErr(initResp)
+		slog.Debug(fmt.Sprintf("failure during initialization: %s", err))
+		return nil, err
+	}
+
+	slog.Info("Starting llama main loop")
+	C.dyn_llama_server_start(llm.s)
+	return &llm, nil
+}
+
+func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+
+	if len(predict.Images) > 0 {
+		slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
+	}
+
+	request := map[string]any{
+		"prompt":            predict.Prompt,
+		"stream":            true,
+		"n_predict":         predict.Options.NumPredict,
+		"n_keep":            predict.Options.NumKeep,
+		"temperature":       predict.Options.Temperature,
+		"top_k":             predict.Options.TopK,
+		"top_p":             predict.Options.TopP,
+		"tfs_z":             predict.Options.TFSZ,
+		"typical_p":         predict.Options.TypicalP,
+		"repeat_last_n":     predict.Options.RepeatLastN,
+		"repeat_penalty":    predict.Options.RepeatPenalty,
+		"presence_penalty":  predict.Options.PresencePenalty,
+		"frequency_penalty": predict.Options.FrequencyPenalty,
+		"mirostat":          predict.Options.Mirostat,
+		"mirostat_tau":      predict.Options.MirostatTau,
+		"mirostat_eta":      predict.Options.MirostatEta,
+		"penalize_nl":       predict.Options.PenalizeNewline,
+		"seed":              predict.Options.Seed,
+		"stop":              predict.Options.Stop,
+		"image_data":        predict.Images,
+		"cache_prompt":      true,
+	}
+
+	if predict.Format == "json" {
+		request["grammar"] = jsonGrammar
+		if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
+			slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
+		}
+	}
+
+	retryDelay := 100 * time.Microsecond
+	for retries := 0; retries < maxRetries; retries++ {
+		if retries > 0 {
+			time.Sleep(retryDelay) // wait before retrying
+			retryDelay *= 2        // exponential backoff
+		}
+
+		// Handling JSON marshaling with special characters unescaped.
+		buffer := &bytes.Buffer{}
+		enc := json.NewEncoder(buffer)
+		enc.SetEscapeHTML(false)
+
+		if err := enc.Encode(request); err != nil {
+			return fmt.Errorf("failed to marshal data: %w", err)
+		}
+
+		req := C.CString(buffer.String())
+		defer C.free(unsafe.Pointer(req))
+
+		C.dyn_llama_server_completion(llm.s, req, &resp)
+		if resp.id < 0 {
+			return extServerResponseToErr(resp)
+		}
+
+		retryNeeded := false
+		// keep track of the last token generated, this is used to abort if the model starts looping
+		var lastToken string
+		var tokenRepeat int
+	out:
+		for {
+			select {
+			case <-ctx.Done():
+				return cancelCompletion(llm, resp)
+			default:
+				var result C.ext_server_task_result_t
+				C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
+				json_resp := C.GoString(result.json_resp)
+				C.dyn_llama_server_release_task_result(llm.s, &result)
+
+				var p prediction
+				if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
+					C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
+					if resp.id < 0 {
+						return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
+					} else {
+						return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
+					}
+				}
+
+				if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
+					retryNeeded = true
+					// task will already be canceled
+					break out
+				}
+
+				switch {
+				case strings.TrimSpace(p.Content) == lastToken:
+					tokenRepeat++
+				default:
+					lastToken = strings.TrimSpace(p.Content)
+					tokenRepeat = 0
+				}
+
+				// 30 picked as an arbitrary max token repeat limit, modify as needed
+				if tokenRepeat > 30 {
+					slog.Debug("prediction aborted, token repeat limit reached")
+					return cancelCompletion(llm, resp)
+				}
+
+				if p.Content != "" {
+					fn(PredictResult{
+						Content: p.Content,
+					})
+				}
+
+				if p.Stop || bool(result.stop) {
+					fn(PredictResult{
+						Done:               true,
+						PromptEvalCount:    p.Timings.PromptN,
+						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
+						EvalCount:          p.Timings.PredictedN,
+						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
+					})
+					return nil
+				}
+			}
+		}
+		if !retryNeeded {
+			return nil // success
+		}
+	}
+
+	// should never reach here ideally
+	return fmt.Errorf("max retries exceeded")
+}
+
+func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
+	C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
+	if resp.id < 0 {
+		return extServerResponseToErr(resp)
+	} else {
+		return nil
+	}
+}
+
+func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
+	data, err := json.Marshal(TokenizeRequest{Content: prompt})
+	if err != nil {
+		return nil, fmt.Errorf("marshaling encode data: %w", err)
+	}
+	req := C.CString(string(data))
+	defer C.free(unsafe.Pointer(req))
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
+	if resp.id < 0 {
+		return nil, extServerResponseToErr(resp)
+	}
+	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
+
+	var encoded TokenizeResponse
+	if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
+		return nil, fmt.Errorf("unmarshal encode response: %w", err2)
+	}
+
+	return encoded.Tokens, err
+}
+
+// Decode asks the loaded server library to detokenize tokens and returns the text; an empty token list yields "".
+func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
+	if len(tokens) == 0 {
+		return "", nil
+	}
+	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
+	if err != nil {
+		return "", fmt.Errorf("marshaling decode data: %w", err)
+	}
+	req := C.CString(string(data))
+	defer C.free(unsafe.Pointer(req))
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
+	if resp.id < 0 {
+		return "", extServerResponseToErr(resp)
+	}
+	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
+
+	var decoded DetokenizeResponse
+	if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
+		return "", fmt.Errorf("unmarshal decode response: %w", err2)
+	}
+
+	return decoded.Content, nil
+}
+
+// Embedding asks the loaded server library for the embedding of input and returns the decoded vector.
+func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
+	data, err := json.Marshal(TokenizeRequest{Content: input})
+	if err != nil {
+		return nil, fmt.Errorf("error marshaling embed data: %w", err)
+	}
+	req := C.CString(string(data))
+	defer C.free(unsafe.Pointer(req))
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
+	if resp.id < 0 {
+		return nil, extServerResponseToErr(resp)
+	}
+	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
+
+	var embedding EmbeddingResponse
+	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
+		return nil, fmt.Errorf("unmarshal embedding response: %w", err)
+	}
+
+	return embedding.Embedding, nil
+}
+
+func (llm *dynExtServer) Close() {
+	C.dyn_llama_server_stop(llm.s)
+	mutex.Unlock()
+}
--- a/llm/dyn_ext_server.h
+++ b/llm/dyn_ext_server.h
@@ -0,0 +1,74 @@
+#include <stdlib.h>
+
+#include "ext_server.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Table of function pointers for one llama server library.  Each member
+// mirrors the llama_server_* entry point of the same name declared in
+// ext_server.h.
+struct dynamic_llama_server {
+  // Opaque library handle.
+  // NOTE(review): presumably the handle returned by the platform's dynamic
+  // loader in dyn_init — confirm against the implementation.
+  void *handle;
+  void (*llama_server_init)(ext_server_params_t *sparams,
+                            ext_server_resp_t *err);
+  void (*llama_server_start)();
+  void (*llama_server_stop)();
+  void (*llama_server_completion)(const char *json_req,
+                                  ext_server_resp_t *resp);
+  void (*llama_server_completion_next_result)(const int task_id,
+                                              ext_server_task_result_t *result);
+  void (*llama_server_completion_cancel)(const int task_id,
+                                         ext_server_resp_t *err);
+  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
+  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
+                                ext_server_resp_t *err);
+  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
+                                  ext_server_resp_t *err);
+  void (*llama_server_embedding)(const char *json_req, char **json_resp,
+                                 ext_server_resp_t *err);
+  void (*llama_server_release_json_resp)(char **json_resp);
+};
+
+// Populates *s for the library at libPath; failures are reported through err.
+// NOTE(review): the implementation is not visible here — confirm the exact
+// loading behaviour.
+void dyn_init(const char *libPath, struct dynamic_llama_server *s,
+              ext_server_resp_t *err);
+
+// No good way to call C function pointers from Go so inline the indirection.
+// Each wrapper below takes the function table by value and corresponds to the
+// member of the same name (without the dyn_ prefix).
+void dyn_llama_server_init(struct dynamic_llama_server s,
+                           ext_server_params_t *sparams,
+                           ext_server_resp_t *err);
+
+void dyn_llama_server_start(struct dynamic_llama_server s);
+
+void dyn_llama_server_stop(struct dynamic_llama_server s);
+
+void dyn_llama_server_completion(struct dynamic_llama_server s,
+                                 const char *json_req,
+                                 ext_server_resp_t *resp);
+
+void dyn_llama_server_completion_next_result(struct dynamic_llama_server s,
+                                             const int task_id,
+                                             ext_server_task_result_t *result);
+
+void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
+                                        const int task_id,
+                                        ext_server_resp_t *err);
+
+void dyn_llama_server_release_task_result(struct dynamic_llama_server s,
+                                          ext_server_task_result_t *result);
+
+void dyn_llama_server_tokenize(struct dynamic_llama_server s,
+                               const char *json_req, char **json_resp,
+                               ext_server_resp_t *err);
+
+void dyn_llama_server_detokenize(struct dynamic_llama_server s,
+                                 const char *json_req, char **json_resp,
+                                 ext_server_resp_t *err);
+
+void dyn_llama_server_embedding(struct dynamic_llama_server s,
+                                const char *json_req, char **json_resp,
+                                ext_server_resp_t *err);
+
+void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
+                                        char **json_resp);
+
+#ifdef __cplusplus
+}
+#endif
--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@@ -1,14 +1,21 @@

-set(TARGET ollama_llama_server)
+set(TARGET ext_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
-install(TARGETS ${TARGET} RUNTIME)
-target_compile_definitions(${TARGET} PRIVATE
-    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
-)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
-    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+    add_library(${TARGET} SHARED ext_server.cpp ../llama.cpp/llama.cpp)
+else()
+    add_library(${TARGET} STATIC ext_server.cpp ../llama.cpp/llama.cpp)
 endif()
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
+target_link_libraries(${TARGET} PRIVATE ggml llava common )
+set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
+install(TARGETS ext_server LIBRARY)
+
+if (CUDAToolkit_FOUND)
+    target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    if (WIN32)
+        target_link_libraries(${TARGET} PRIVATE nvml)
+    endif()
+endif()
--- a/llm/ext_server/README.md
+++ b/llm/ext_server/README.md
@@ -0,0 +1,18 @@
+# Extern C Server
+
+This directory contains a thin facade we layer on top of the llama.cpp server to
+expose `extern C` interfaces to access the functionality through direct API
+calls in-process.  The llama.cpp code uses compile-time macros to configure the
+GPU type along with other settings.  During the `go generate ./...` execution,
+the build generates one or more copies of the llama.cpp `extern C` server,
+depending on which GPU libraries are detected, so that multiple GPU types as
+well as CPU-only operation are supported. The Ollama Go build then embeds these
+different servers to support different GPUs and settings at runtime.
+
+If you are making changes to the code in this directory, make sure to disable
+caching during your go build to ensure you pick up your changes.  A typical
+iteration cycle from the top of the source tree looks like:
+
+```
+go generate ./... && go build -a .
+```
--- a/llm/ext_server/ext_server.cpp
+++ b/llm/ext_server/ext_server.cpp
@@ -0,0 +1,377 @@
+#include "ext_server.h"
+#include <atomic>
+
+// Necessary evil since the server types are not defined in a header
+#include "server.cpp"
+
+// Low level API access to verify GPU access
+#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif // __HIP_PLATFORM_AMD__
+#define cudaGetDevice hipGetDevice
+#define cudaError_t hipError_t
+#define cudaSuccess hipSuccess
+#define cudaGetErrorString hipGetErrorString
+#else
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+#endif // defined(GGML_USE_HIPBLAS)
+#endif // GGML_USE_CUBLAS
+
+// Expose the llama server as a callable extern "C" API
+// Server context used by every entry point in this file; allocated by
+// llama_server_init and deleted (and reset to NULL) by llama_server_stop.
+llama_server_context *llama = NULL;
+// Thread running the task-queue loop; created by llama_server_start and joined
+// by llama_server_stop.
+std::thread ext_server_thread;
+// True while llama_server_stop is draining.  Atomic because it is written by
+// the stopping caller while request entry points may be reading it from other
+// threads; a plain bool would be a data race.
+std::atomic<bool> shutting_down{false};
+// Number of calls currently inside queue_results.recv(); llama_server_stop
+// waits for this to reach zero before terminating the queue.
+std::atomic_int recv_counter;
+
+// Scope guard for in-flight recv calls: bumps the referenced counter on
+// construction and drops it again on destruction.
+class atomicRecv {
+  public:
+    atomicRecv(std::atomic<int> &atomic) : counter_(atomic) {
+      counter_.fetch_add(1);
+    }
+    ~atomicRecv() {
+      counter_.fetch_sub(1);
+    }
+  private:
+    // Counter owned by the caller; must outlive this guard.
+    std::atomic<int> &counter_;
+};
+ 
+// Create the global server context from sparams and load the model.
+//
+// On success err->id is 0 and err->msg is empty.  On failure err->id is -1,
+// err->msg describes the problem, and the partially constructed context is
+// deleted so the global `llama` pointer is NULL rather than pointing at an
+// unusable, leaked object.
+void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
+  recv_counter = 0;
+  assert(err != NULL && sparams != NULL);
+  log_set_target(stderr);
+  if (!sparams->verbose_logging) {
+    // NOTE(review): server_verbose is switched ON when verbose logging was NOT
+    // requested, which looks inverted — confirm the intent before changing.
+    server_verbose = true;
+    log_disable();
+  }
+
+  LOG_TEE("system info: %s\n", llama_print_system_info());
+  err->id = 0;
+  err->msg[0] = '\0';
+  // Tracks the context allocated by this call so the catch handlers only ever
+  // delete an object created here (never a stale global from an earlier init).
+  llama_server_context *created = NULL;
+  try {
+    created = new llama_server_context;
+    llama = created;
+    gpt_params params;
+    params.n_ctx = sparams->n_ctx;
+    params.n_batch = sparams->n_batch;
+    if (sparams->n_threads > 0) {
+      params.n_threads = sparams->n_threads;
+    }
+    params.n_parallel = sparams->n_parallel;
+    params.rope_freq_base = sparams->rope_freq_base;
+    params.rope_freq_scale = sparams->rope_freq_scale;
+
+    if (sparams->memory_f16) {
+      params.cache_type_k = "f16";
+      params.cache_type_v = "f16";
+    } else {
+      params.cache_type_k = "f32";
+      params.cache_type_v = "f32";
+    }
+
+    params.n_gpu_layers = sparams->n_gpu_layers;
+    params.main_gpu = sparams->main_gpu;
+    params.use_mlock = sparams->use_mlock;
+    params.use_mmap = sparams->use_mmap;
+    params.numa = (ggml_numa_strategy)sparams->numa;
+    params.embedding = sparams->embedding;
+    if (sparams->model != NULL) {
+      params.model = sparams->model;
+    }
+
+    if (sparams->lora_adapters != NULL) {
+      for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
+          la = la->next) {
+        params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
+      }
+
+      // mmap is turned off whenever at least one LoRA adapter is supplied.
+      params.use_mmap = false;
+    }
+
+    if (sparams->mmproj != NULL) {
+      params.mmproj = std::string(sparams->mmproj);
+    }
+
+#if defined(GGML_USE_CUBLAS)
+    // Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
+    LOG_TEE("Performing pre-initialization of GPU\n");
+    int id;
+    cudaError_t cudaErr = cudaGetDevice(&id);
+    if (cudaErr != cudaSuccess) {
+      err->id = -1;
+      snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
+      delete created;
+      llama = NULL;
+      return;
+    }
+#endif
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    if (!llama->load_model(params)) {
+      // an error occurred that was not thrown
+      err->id = -1;
+      snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
+      delete created;
+      llama = NULL;
+      return;
+    }
+
+    llama->initialize();
+  } catch (std::exception &e) {
+    err->id = -1;
+    snprintf(err->msg, err->msg_len, "exception %s", e.what());
+    if (created != NULL) {
+      delete created;
+      llama = NULL;
+    }
+  } catch (...) {
+    err->id = -1;
+    snprintf(err->msg, err->msg_len,
+             "Unknown exception initializing llama server");
+    if (created != NULL) {
+      delete created;
+      llama = NULL;
+    }
+  }
+}
+
+// Launch the server's task loop on ext_server_thread.  Requires that
+// llama_server_init has already populated the global context.
+void llama_server_start() {
+  assert(llama != NULL);
+  // TODO mutex to protect thread creation
+  ext_server_thread = std::thread([&]() {
+    try {
+      LOG_TEE("llama server main loop starting\n");
+      ggml_time_init();
+      // Wire the task queue and result queue callbacks to the server context
+      // before entering the loop.
+      llama->queue_tasks.on_new_task(std::bind(
+        &llama_server_context::process_single_task, llama, std::placeholders::_1));
+      llama->queue_tasks.on_finish_multitask(std::bind(
+        &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
+      llama->queue_tasks.on_run_slots(std::bind(
+        &llama_server_context::update_slots, llama));
+      llama->queue_results.on_multitask_update(std::bind(
+          &llama_server_queue::update_multitask,
+          &llama->queue_tasks,
+          std::placeholders::_1,
+          std::placeholders::_2,
+          std::placeholders::_3
+        ));
+      // NOTE(review): presumably blocks until queue_tasks.terminate() is
+      // called from llama_server_stop — confirm against server.cpp.
+      llama->queue_tasks.start_loop();
+    } catch (std::exception &e) {
+      LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
+    } catch (...) {
+      LOG_TEE("caught unknown exception in llama server main loop\n");
+    }
+    // Reached on normal loop exit and after a caught exception alike.
+    LOG_TEE("\nllama server shutting down\n");
+    llama_backend_free();
+  });
+}
+
+// Shut the server down: flag shutdown, wait for in-flight recv calls to
+// finish, terminate the task queue, join the loop thread and delete the
+// context.  The global context is NULL afterwards, so llama_server_init must
+// run again before any other entry point is used.
+void llama_server_stop() {
+  assert(llama != NULL);
+  // Shutdown any in-flight requests and block incoming requests.
+  LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
+  shutting_down = true;
+
+  // Poll until no caller is still inside queue_results.recv().
+  while (recv_counter.load() > 0) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+  }
+
+  // This may take a while for any pending tasks to drain
+  // TODO - consider a timeout to cancel tasks if it's taking too long
+  llama->queue_tasks.terminate();
+  ext_server_thread.join();
+  delete llama;
+  llama = NULL;
+  LOG_TEE("llama server shutdown complete\n");
+  // Cleared last so a later init/start cycle accepts requests again.
+  shutting_down = false;
+}
+
+// Queue a completion request described by the JSON string json_req.
+//
+// On success resp->id is the new task ID (>= 0) and resp->msg is empty.  On
+// any failure resp->id is -1 and resp->msg holds the error text.
+void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
+  assert(llama != NULL && json_req != NULL && resp != NULL);
+  resp->id = -1;
+  resp->msg[0] = '\0';
+  try {
+    if (shutting_down) {
+      throw std::runtime_error("server shutting down");
+    }
+    json data = json::parse(json_req);
+    const int task_id = llama->queue_tasks.get_new_id();
+    llama->queue_results.add_waiting_task_id(task_id);
+    llama->request_completion(task_id, data, false, false, -1);
+    // Publish the ID only once the task is queued: callers detect failure via
+    // resp->id < 0, so an exception above must not leave a valid-looking ID.
+    resp->id = task_id;
+  } catch (std::exception &e) {
+    resp->id = -1;
+    snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
+  } catch (...) {
+    resp->id = -1;
+    snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
+  }
+}
+
+// Fetch the next result for task_id and copy it into resp.
+//
+// resp->json_resp is always set to a newly allocated, NUL-terminated JSON
+// string that the caller must free with llama_server_release_task_result.
+// When the result is an error, is the final one, or the server is shutting
+// down, the task is cancelled and removed from the waiting set.  If an
+// exception is caught, resp->error is true, resp->id is -1 and json_resp
+// carries an {"error": ...} object.
+void llama_server_completion_next_result(const int task_id,
+                                         ext_server_task_result_t *resp) {
+  assert(llama != NULL && resp != NULL);
+  resp->id = -1;
+  resp->stop = false;
+  resp->error = false;
+  resp->json_resp = NULL;
+  std::string result_json;
+  try {
+    // Counted so llama_server_stop can wait for this recv to return.
+    atomicRecv ar(recv_counter);
+    task_result result = llama->queue_results.recv(task_id);
+    result_json =
+        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+    resp->id = result.id;
+    resp->stop = result.stop;
+    resp->error = result.error;
+    if (result.error) {
+      LOG_TEE("next result cancel on error\n");
+      llama->request_cancel(task_id);
+      LOG_TEE("next result removing waiting task ID: %d\n", task_id);
+      llama->queue_results.remove_waiting_task_id(task_id);
+    } else if (result.stop) {
+      LOG_TEE("next result cancel on stop\n");
+      llama->request_cancel(task_id);
+      LOG_TEE("next result removing waiting task ID: %d\n", task_id);
+      llama->queue_results.remove_waiting_task_id(task_id);
+    } else if (shutting_down) {
+      LOG_TEE("aborting completion due to shutdown %d\n", task_id);
+      llama->request_cancel(task_id);
+      llama->queue_results.remove_waiting_task_id(task_id);
+      resp->stop = true;
+    }
+  } catch (std::exception &e) {
+    resp->error = true;
+    resp->id = -1;
+    // Serialize through the JSON library so quotes or backslashes in the
+    // exception text cannot produce malformed JSON.
+    json error_body;
+    error_body["error"] = "exception " + std::string(e.what());
+    result_json =
+        error_body.dump(-1, ' ', false, json::error_handler_t::replace);
+    LOG_TEE("llama server completion exception %s\n", e.what());
+  } catch (...) {
+    resp->error = true;
+    resp->id = -1;
+    result_json = "{\"error\":\"Unknown exception during completion\"}";
+    LOG_TEE("llama server completion unknown exception\n");
+  }
+  const std::string::size_type size = result_json.size() + 1;
+  resp->json_resp = new char[size];
+  snprintf(resp->json_resp, size, "%s", result_json.c_str());
+}
+
+// Free the JSON buffer allocated by llama_server_completion_next_result.
+// A NULL result or NULL buffer is ignored.  The pointer is cleared afterwards
+// so releasing the same result twice is harmless instead of a double free.
+void llama_server_release_task_result(ext_server_task_result_t *result) {
+  if (result == NULL || result->json_resp == NULL) {
+    return;
+  }
+  delete[] result->json_resp;
+  result->json_resp = NULL;
+}
+
+// Cancel task_id and drop it from the set of tasks awaiting results.
+// err->id is 0 with an empty message on success; on failure err->id is -1 and
+// err->msg describes the exception.
+void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
+  assert(llama != NULL && err != NULL);
+  err->msg[0] = '\0';
+  err->id = 0;
+  try {
+    llama->request_cancel(task_id);
+    llama->queue_results.remove_waiting_task_id(task_id);
+    return;
+  } catch (std::exception &e) {
+    snprintf(err->msg, err->msg_len, "exception %s", e.what());
+  } catch (...) {
+    snprintf(err->msg, err->msg_len,
+             "Unknown exception completion cancel in llama server");
+  }
+  // Only reached when one of the handlers above ran.
+  err->id = -1;
+}
+
+// Tokenize the "content" field of the JSON request and return the formatted
+// tokenizer response in *json_resp (allocated here; release with
+// llama_server_release_json_resp).  A request without "content" yields an
+// empty token list.  On failure *json_resp stays NULL, err->id is -1 and
+// err->msg holds the reason.
+void llama_server_tokenize(const char *json_req, char **json_resp,
+                           ext_server_resp_t *err) {
+  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+  *json_resp = NULL;
+  err->id = 0;
+  err->msg[0] = '\0';
+  try {
+    if (shutting_down) {
+      throw std::runtime_error("server shutting down");
+    }
+    const json request = json::parse(json_req);
+    std::vector<llama_token> ids;
+    if (request.find("content") != request.end()) {
+      ids = llama->tokenize(request["content"], false);
+    }
+    const json formatted = format_tokenizer_response(ids);
+    const std::string payload = formatted.dump();
+    const std::string::size_type bytes = payload.size() + 1;
+    char *buffer = new char[bytes];
+    snprintf(buffer, bytes, "%s", payload.c_str());
+    *json_resp = buffer;
+    return;
+  } catch (std::exception &e) {
+    snprintf(err->msg, err->msg_len, "exception %s", e.what());
+  } catch (...) {
+    snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
+  }
+  // Only reached when one of the handlers above ran.
+  err->id = -1;
+}
+
+// Free a buffer returned through json_resp by the tokenize, detokenize or
+// embedding entry points.  NULL arguments are ignored.  The caller's pointer
+// is cleared afterwards so a repeated release is harmless instead of a double
+// free.
+void llama_server_release_json_resp(char **json_resp) {
+  if (json_resp == NULL || *json_resp == NULL) {
+    return;
+  }
+  delete[] *json_resp;
+  *json_resp = NULL;
+}
+
+// Convert the "tokens" array of the JSON request back to text and return the
+// formatted detokenize response in *json_resp (allocated here; release with
+// llama_server_release_json_resp).  A request without "tokens" yields empty
+// content.  On failure *json_resp stays NULL, err->id is -1 and err->msg
+// holds the reason.
+void llama_server_detokenize(const char *json_req, char **json_resp,
+                             ext_server_resp_t *err) {
+  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+  *json_resp = NULL;
+  err->id = 0;
+  err->msg[0] = '\0';
+  try {
+    if (shutting_down) {
+      throw std::runtime_error("server shutting down");
+    }
+    const json request = json::parse(json_req);
+    std::string text;
+    if (request.find("tokens") != request.end()) {
+      const std::vector<llama_token> ids = request["tokens"];
+      text = tokens_to_str(llama->ctx, ids.cbegin(), ids.cend());
+    }
+    const json formatted = format_detokenized_response(text);
+    const std::string payload = formatted.dump();
+    const std::string::size_type bytes = payload.size() + 1;
+    char *buffer = new char[bytes];
+    snprintf(buffer, bytes, "%s", payload.c_str());
+    *json_resp = buffer;
+    return;
+  } catch (std::exception &e) {
+    snprintf(err->msg, err->msg_len, "exception %s", e.what());
+  } catch (...) {
+    snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
+  }
+  // Only reached when one of the handlers above ran.
+  err->id = -1;
+}
+
+// Compute the embedding for the "content" field of the JSON request (an empty
+// prompt is used when the field is absent) and return the result JSON in
+// *json_resp (allocated here; release with llama_server_release_json_resp).
+//
+// On failure err->id is -1, err->msg holds the reason, *json_resp stays NULL,
+// and the task ID registered for this call is removed from the waiting set so
+// it does not accumulate there.
+void llama_server_embedding(const char *json_req, char **json_resp,
+                            ext_server_resp_t *err) {
+  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+  *json_resp = NULL;
+  err->id = 0;
+  err->msg[0] = '\0';
+  int task_id = -1;
+  // True while task_id is registered in the waiting set and not yet removed.
+  bool waiting = false;
+  try {
+    if (shutting_down) {
+      throw std::runtime_error("server shutting down");
+    }
+    const json body = json::parse(json_req);
+    json prompt;
+    if (body.count("content") != 0) {
+      prompt = body["content"];
+    } else {
+      prompt = "";
+    }
+    task_id = llama->queue_tasks.get_new_id();
+    llama->queue_results.add_waiting_task_id(task_id);
+    waiting = true;
+    llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
+    // Counted so llama_server_stop can wait for this recv to return.
+    atomicRecv ar(recv_counter);
+    task_result result = llama->queue_results.recv(task_id);
+    std::string result_json = result.result_json.dump();
+    // Unregister before allocating the response so an exception here cannot
+    // leave the caller with an error and an unreleased buffer.
+    waiting = false;
+    llama->queue_results.remove_waiting_task_id(task_id);
+    const std::string::size_type size = result_json.size() + 1;
+    *json_resp = new char[size];
+    snprintf(*json_resp, size, "%s", result_json.c_str());
+  } catch (std::exception &e) {
+    err->id = -1;
+    snprintf(err->msg, err->msg_len, "exception %s", e.what());
+  } catch (...) {
+    err->id = -1;
+    snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
+  }
+  if (waiting) {
+    // Best-effort cleanup after a failure; the original error is what the
+    // caller needs to see, so a secondary failure here is ignored.
+    try {
+      llama->queue_results.remove_waiting_task_id(task_id);
+    } catch (...) {
+    }
+  }
+}
--- a/llm/ext_server/ext_server.h
+++ b/llm/ext_server/ext_server.h
@@ -0,0 +1,95 @@
+#if defined(LLAMA_SERVER_LIBRARY)
+#ifndef LLAMA_SERVER_H
+#define LLAMA_SERVER_H
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+// Entry point of the original server program, renamed from main() in
+// server.cpp so it does not clash with the host program's main().
+int _main(int argc, char **argv);
+
+// This exposes extern C entrypoints into the llama_server
+// To enable the server compile with LLAMA_SERVER_LIBRARY
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Status/result record filled in by most entry points.
+typedef struct ext_server_resp {
+  int id;          // < 0 on error
+  size_t msg_len;  // caller must allocate msg and set msg_len
+  char *msg;       // error text written by the server, at most msg_len bytes
+} ext_server_resp_t;
+
+// Singly linked list node describing one LoRA adapter.
+// Allocated and freed by caller
+typedef struct ext_server_lora_adapter {
+  char *adapter;
+  float scale;
+  struct ext_server_lora_adapter *next;  // NULL terminates the list
+} ext_server_lora_adapter_t;
+
+// Parameters consumed by llama_server_init.
+// Allocated and freed by caller
+typedef struct ext_server_params {
+  char *model;
+  uint32_t n_ctx;         // token context window, 0 = from model
+  uint32_t n_batch;       // prompt processing maximum batch size
+  uint32_t n_threads;     // number of threads to use for generation
+  int32_t n_parallel;     // number of parallel sequences to decode
+  float rope_freq_base;   // RoPE base frequency, 0 = from model
+  float rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+  bool memory_f16;        // use f16 instead of f32 for memory kv
+  int32_t n_gpu_layers;  // number of layers to store in VRAM (-1 - use default)
+  int32_t main_gpu;      // the GPU that is used for scratch and small tensors
+  bool use_mlock;        // force system to keep model in RAM
+  bool use_mmap;         // use mmap if possible
+  int numa;              // attempt optimizations that help on some NUMA systems
+  bool embedding;        // get only sentence embedding
+  ext_server_lora_adapter_t *lora_adapters;  // optional list, NULL for none
+  char *mmproj;                              // optional, NULL for none
+  bool verbose_logging;  // Enable verbose logging of the server
+} ext_server_params_t;
+
+// One result produced by llama_server_completion_next_result.
+typedef struct ext_server_task_result {
+  int id;
+  bool stop;
+  bool error;
+  char *json_resp;  // null terminated, memory managed by ext_server
+} ext_server_task_result_t;
+
+// Initialize the server once per process
+// err->id = 0 for success and err->msg[0] = '\0'
+// err->id != 0 for failure, and err->msg contains error message
+void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
+
+// Run the main loop, called once per init
+void llama_server_start();
+// Stop the main loop and free up resources allocated in init and start.  Init
+// must be called again to reuse
+void llama_server_stop();
+
+// json_req null terminated string, memory managed by caller
+// resp->id >= 0 on success (task ID)
+// resp->id < 0 on error, and resp->msg contains error message
+void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
+
+// Caller must call llama_server_release_task_result to free resp->json_resp
+void llama_server_completion_next_result(const int task_id,
+                                         ext_server_task_result_t *result);
+void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
+void llama_server_release_task_result(ext_server_task_result_t *result);
+
+// On success (err->id >= 0) the caller must call
+// llama_server_release_json_resp to free *json_resp.
+void llama_server_tokenize(const char *json_req, char **json_resp,
+                           ext_server_resp_t *err);
+void llama_server_detokenize(const char *json_req, char **json_resp,
+                             ext_server_resp_t *err);
+void llama_server_embedding(const char *json_req, char **json_resp,
+                            ext_server_resp_t *err);
+void llama_server_release_json_resp(char **json_resp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+#endif  // LLAMA_SERVER_LIBRARY
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -2770,7 +2770,7 @@ inline void signal_handler(int signal) {
    shutdown_handler(signal);
 }

-int main(int argc, char **argv)
+int _main(int argc, char **argv)
 {
 #if SERVER_VERBOSE != 1
    log_disable();
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -14,7 +14,7 @@ init_vars() {

    LLAMACPP_DIR=../llama.cpp
    CMAKE_DEFS=""
-    CMAKE_TARGETS="--target ollama_llama_server"
+    CMAKE_TARGETS="--target ext_server"
    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
    else
@@ -81,24 +81,27 @@ apply_patches() {
 build() {
    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
+    mkdir -p ${BUILD_DIR}/lib/
+    ls ${BUILD_DIR}
+    g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
+        ${GCC_ARCH} \
+        ${WHOLE_ARCHIVE} ${BUILD_DIR}/ext_server/libext_server.a ${NO_WHOLE_ARCHIVE} \
+        ${BUILD_DIR}/common/libcommon.a \
+        ${BUILD_DIR}/libllama.a \
+        -Wl,-rpath,\$ORIGIN \
+        -lpthread -ldl -lm \
+        ${EXTRA_LIBS}
 }

-compress() {
+compress_libs() {
    echo "Compressing payloads to reduce overall binary size..."
    pids=""
-    rm -rf ${BUILD_DIR}/bin/*.gz
-    for f in ${BUILD_DIR}/bin/* ; do
-        gzip -n --best -f ${f} &
+    rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz
+    for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
+        gzip -n --best -f ${lib} &
        pids+=" $!"
    done
-    # check for lib directory
-    if [ -d ${BUILD_DIR}/lib ]; then
-        for f in ${BUILD_DIR}/lib/* ; do
-            gzip -n --best -f ${f} &
-            pids+=" $!"
-        done
-    fi
-    echo
+    echo 
    for pid in ${pids}; do
        wait $pid
    done
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -18,31 +18,21 @@ sign() {
    fi
 }

-COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
+COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"

 case "${GOARCH}" in
 "amd64")
    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"

-    # Static build for linking into the Go binary
-    init_vars
-    CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}_static"
-    echo "Building static library"
-    build
-
-
    #
    # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
    #
-    init_vars
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/cpu"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
    echo "Building LCD CPU"
    build
-    sign ${BUILD_DIR}/bin/ollama_llama_server
-    compress
+    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib
+    compress_libs

    #
    # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
@@ -50,11 +40,11 @@ case "${GOARCH}" in
    #
    init_vars
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
    echo "Building AVX CPU"
    build
-    sign ${BUILD_DIR}/bin/ollama_llama_server
-    compress
+    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib
+    compress_libs

    #
    # ~2013 CPU Dynamic library
@@ -62,30 +52,20 @@ case "${GOARCH}" in
    #
    init_vars
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
    echo "Building AVX2 CPU"
    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
    build
-    sign ${BUILD_DIR}/bin/ollama_llama_server
-    compress
+    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib
+    compress_libs
    ;;
 "arm64")
-
-    # Static build for linking into the Go binary
-    init_vars
-    CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}_static"
-    echo "Building static library"
-    build
-
-    init_vars
-    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/metal"
+    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
    build
-    sign ${BUILD_DIR}/bin/ollama_llama_server
-    compress
+    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
+    compress_libs
    ;;
 *)
    echo "GOARCH must be set"
@@ -95,4 +75,3 @@ case "${GOARCH}" in
 esac

 cleanup
-echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -57,31 +57,16 @@ init_vars
 git_module_setup
 apply_patches

-
-init_vars
 if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
-
-    if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
-        # Static build for linking into the Go binary
-        init_vars
-        CMAKE_TARGETS="--target llama --target ggml"
-        CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="../build/linux/${ARCH}_static"
-        echo "Building static library"
-        build
-    fi
-
-
    # Users building from source can tune the exact flags we pass to cmake for configuring
    # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
-        init_vars
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
-        BUILD_DIR="../build/linux/${ARCH}/cpu"
+        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
        echo "Building custom CPU"
        build
-        compress
+        compress_libs
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
        # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
@@ -98,12 +83,11 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
            #
            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
            #
-            init_vars
            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-            BUILD_DIR="../build/linux/${ARCH}/cpu"
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
            echo "Building LCD CPU"
            build
-            compress
+            compress_libs
        fi

        if [ "${ARCH}" == "x86_64" ]; then
@@ -117,10 +101,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                #
                init_vars
                CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-                BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
+                BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
                echo "Building AVX CPU"
                build
-                compress
+                compress_libs
            fi

            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
@@ -130,10 +114,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                #
                init_vars
                CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
-                BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
+                BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
                echo "Building AVX2 CPU"
                build
-                compress
+                compress_libs
            fi
        fi
    fi
@@ -172,8 +156,8 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
        # Disabling has minimal performance effect while maintaining compatibility. 
        ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
    fi
-    CMAKE_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
-    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    build

@@ -181,20 +165,20 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
    #
    # TODO - in the future we may shift to packaging these separately and conditionally
    #        downloading them in the install script.
-    DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
+    DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
    for lib in libcudart.so libcublas.so libcublasLt.so ; do
        DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
        if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
-            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
+            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
-            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
+            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
        elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
-            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/"
+            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/"
        else
-            cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/"
+            cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
        fi
    done
-    compress
+    compress_libs

 fi

@@ -217,24 +201,23 @@ if [ -d "${ROCM_PATH}" ]; then
    fi
    init_vars
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
-    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build

    # Record the ROCM dependencies
-    rm -f "${BUILD_DIR}/bin/deps.txt"
-    touch "${BUILD_DIR}/bin/deps.txt"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
-        echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
+    rm -f "${BUILD_DIR}/lib/deps.txt"
+    touch "${BUILD_DIR}/lib/deps.txt"
+    for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
+        echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
    done
    # bomb out if for some reason we didn't get a few deps
-    if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
-        cat "${BUILD_DIR}/bin/deps.txt"
+    if [ $(cat "${BUILD_DIR}/lib/deps.txt" | wc -l ) -lt 8 ] ; then
+        cat "${BUILD_DIR}/lib/deps.txt"
        echo "ERROR: deps file short"
        exit 1
    fi
-    compress
+    compress_libs
 fi

 cleanup
-echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -33,7 +33,7 @@ function init_vars {
        "-DBUILD_SHARED_LIBS=on",
        "-DLLAMA_NATIVE=off"
        )
-    $script:cmakeTargets = @("ollama_llama_server")
+    $script:cmakeTargets = @("ext_server")
    $script:ARCH = "amd64" # arm not yet supported.
    if ($env:CGO_CFLAGS -contains "-g") {
        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
@@ -97,14 +97,16 @@ function apply_patches {
        }

        # Checkout each file
+        Set-Location -Path ${script:llamacppDir}
        foreach ($file in $filePaths) {
-            git -C "${script:llamacppDir}" checkout $file
+            git checkout $file
        }
    }

    # Apply each patch
    foreach ($patch in $patches) {
-        git -C "${script:llamacppDir}" apply $patch.FullName
+        Set-Location -Path ${script:llamacppDir}
+        git apply $patch.FullName
    }
 }

@@ -113,41 +115,41 @@ function build {
    & cmake --version
    & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
+    write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    # Rearrange output to be consistent between different generators
-    if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
-        mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
-        remove-item "${script:buildDir}/bin/${script:config}"
+}
+
+function install {
+    rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
+    md "${script:buildDir}/lib" -ea 0 > $null
+    cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
+    cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
+    # Display the dll dependencies in the build log
+    if ($script:DUMPBIN -ne $null) {
+        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
    }
 }

 function sign {
    if ("${env:KEY_CONTAINER}") {
-        write-host "Signing ${script:buildDir}/bin/*.exe  ${script:buildDir}/bin/*.dll"
-        foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
-            & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
+        write-host "Signing ${script:buildDir}/lib/*.dll"
+        foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){
+            & "${script:SignTool}" sign /v /debug /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
        }
    }
 }

-function compress {
+function compress_libs {
    if ($script:GZIP -eq $null) {
        write-host "gzip not installed, not compressing files"
        return
    }
-    write-host "Compressing binaries..."
-    $binaries = dir "${script:buildDir}/bin/*.exe"
-    foreach ($file in $binaries) {
-        & "$script:GZIP" --best -f $file
-    }
-
    write-host "Compressing dlls..."
-    $dlls = dir "${script:buildDir}/bin/*.dll"
-    foreach ($file in $dlls) {
+    $libs = dir "${script:buildDir}/lib/*.dll"
+    foreach ($file in $libs) {
        & "$script:GZIP" --best -f $file
    }
 }
@@ -162,11 +164,14 @@ function cleanup {
        }

        # Checkout each file
+        Set-Location -Path ${script:llamacppDir}
        foreach ($file in $filePaths) {            
-            git -C "${script:llamacppDir}" checkout $file
+            git checkout $file
        }
-        git -C "${script:llamacppDir}" checkout CMakeLists.txt
    }
+    Set-Location "${script:llamacppDir}/"
+    git checkout CMakeLists.txt
+
 }

 init_vars
@@ -174,6 +179,7 @@ git_module_setup
 apply_patches

 # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
+# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
 # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
 # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver

@@ -181,54 +187,32 @@ $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")

 if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {

-# GCC build for direct linking into the Go binary
-init_vars
-# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
-# as we need this to be compiled by gcc for golang to be able to link with itx
-write-host "Checking for MinGW..."
-# error action ensures we exit on failure
-get-command gcc
-get-command mingw32-make
-$script:cmakeTargets = @("llama", "ggml")
-$script:cmakeDefs = @(
-    "-G", "MinGW Makefiles"
-    "-DCMAKE_C_COMPILER=gcc.exe",
-    "-DCMAKE_CXX_COMPILER=g++.exe",
-    "-DBUILD_SHARED_LIBS=off",
-    "-DLLAMA_NATIVE=off",
-    "-DLLAMA_AVX=off",
-    "-DLLAMA_AVX2=off",
-    "-DLLAMA_AVX512=off",
-    "-DLLAMA_F16C=off",
-    "-DLLAMA_FMA=off")
-$script:buildDir="../build/windows/${script:ARCH}_static"
-write-host "Building static library"
-build
-
-# remaining llama.cpp builds use MSVC 
    init_vars
    $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
-    $script:buildDir="../build/windows/${script:ARCH}/cpu"
+    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
    write-host "Building LCD CPU"
    build
+    install
    sign
-    compress
+    compress_libs

    init_vars
    $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
-    $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
+    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
    write-host "Building AVX CPU"
    build
+    install
    sign
-    compress
+    compress_libs

    init_vars
    $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
-    $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
+    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
    write-host "Building AVX2 CPU"
    build
+    install
    sign
-    compress
+    compress_libs
 } else {
    write-host "Skipping CPU generation step as requested"
 }
@@ -241,11 +225,13 @@ if ($null -ne $script:CUDA_LIB_DIR) {
        $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
    }
    init_vars
-    $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
-    $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
+    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
+    $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
+    write-host "Building CUDA"
    build
+    install
    sign
-    compress
+    compress_libs
 }

 if ($null -ne $env:HIP_PATH) {
@@ -255,13 +241,12 @@ if ($null -ne $env:HIP_PATH) {
    }

    init_vars
-    $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
+    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
    $script:cmakeDefs += @(
        "-G", "Ninja", 
        "-DCMAKE_C_COMPILER=clang.exe",
        "-DCMAKE_CXX_COMPILER=clang++.exe",
        "-DLLAMA_HIPBLAS=on",
-        "-DHIP_PLATFORM=amd",
        "-DLLAMA_AVX=on",
        "-DLLAMA_AVX2=off",
        "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
@@ -279,13 +264,13 @@ if ($null -ne $env:HIP_PATH) {
    build
    # Ninja doesn't prefix with config name
    ${script:config}=""
+    install
    if ($null -ne $script:DUMPBIN) {
-        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
+        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
    }
    sign
-    compress
+    compress_libs
 }

-
 cleanup
-write-host "`ngo generate completed.  LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\build\windows\${script:ARCH})"
+write-host "`ngo generate completed.  LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\llama.cpp\build\windows\${script:ARCH})"
--- a/llm/generate/generate_darwin.go
+++ b/llm/generate/generate_darwin.go
@@ -1,3 +1,3 @@
 package generate

-//go:generate bash ./gen_darwin.sh
+//go:generate sh ./gen_darwin.sh
--- a/llm/ggla.go
+++ b/llm/ggla.go
@@ -7,18 +7,16 @@ import (
 	"slices"
 )

-type containerGGLA struct {
+type ContainerGGLA struct {
 	version uint32
 }

-func (c *containerGGLA) Name() string {
+func (c *ContainerGGLA) Name() string {
 	return "ggla"
 }

-func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {
-	if err := binary.Read(rs, binary.LittleEndian, &c.version); err != nil {
-		return nil, err
-	}
+func (c *ContainerGGLA) Decode(rs io.ReadSeeker) (model, error) {
+	binary.Read(rs, binary.LittleEndian, &c.version)

 	switch c.version {
 	case 1:
@@ -26,45 +24,37 @@ func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {
 		return nil, errors.New("invalid version")
 	}

-	model := newGGLA(c)
+	model := newModelGGLA(c)
 	err := model.decode(rs)
 	return model, err
 }

-type ggla struct {
-	*containerGGLA
+type ModelGGLA struct {
+	*ContainerGGLA

 	kv      KV
-	tensors []*Tensor
+	tensors []Tensor
 }

-func newGGLA(container *containerGGLA) *ggla {
-	return &ggla{
-		containerGGLA: container,
+func newModelGGLA(container *ContainerGGLA) *ModelGGLA {
+	return &ModelGGLA{
+		ContainerGGLA: container,
 		kv:            make(KV),
 	}
 }

-func (llm *ggla) KV() KV {
-	return llm.kv
-}
-
-func (llm *ggla) Tensors() Tensors {
-	return llm.tensors
-}
-
-func (llm *ggla) decode(rs io.ReadSeeker) error {
+func (m *ModelGGLA) decode(rs io.ReadSeeker) error {
 	var r uint32
 	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
 		return err
 	}
-	llm.kv["r"] = r
+	m.kv["r"] = r

 	var alpha uint32
 	if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {
 		return err
 	}
-	llm.kv["alpha"] = alpha
+	m.kv["alpha"] = alpha

 	for {
 		var dims uint32
@@ -119,10 +109,54 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {

 		t.Offset = uint64(offset)

-		if _, err := rs.Seek(int64(t.size()), io.SeekCurrent); err != nil {
+		if _, err := rs.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
 			return err
 		}

-		llm.tensors = append(llm.tensors, &t)
+		m.tensors = append(m.tensors, t)
 	}
 }
+
+func (m *ModelGGLA) KV() KV {
+	return m.kv
+}
+
+func (m *ModelGGLA) Tensor() []Tensor {
+	return m.tensors
+}
+
+func (*ModelGGLA) ModelFamily() string {
+	return "ggla"
+}
+
+func (*ModelGGLA) ModelType() string {
+	panic("not implemented")
+}
+
+func (*ModelGGLA) FileType() string {
+	panic("not implemented")
+}
+
+func (*ModelGGLA) NumLayers() uint32 {
+	panic("not implemented")
+}
+
+func (*ModelGGLA) NumGQA() uint32 {
+	panic("not implemented")
+}
+
+func (*ModelGGLA) NumEmbed() uint32 {
+	panic("not implemented")
+}
+
+func (*ModelGGLA) NumHead() uint32 {
+	panic("not implemented")
+}
+
+func (*ModelGGLA) NumHeadKv() uint32 {
+	panic("not implemented")
+}
+
+func (*ModelGGLA) NumCtx() uint32 {
+	panic("not implemented")
+}
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -3,14 +3,14 @@ package llm
 import (
 	"encoding/binary"
 	"errors"
-	"fmt"
 	"io"
-	"strings"
 )

 type GGML struct {
 	container
 	model
+
+	Size int64
 }

 const (
@@ -90,178 +90,15 @@ func fileType(fileType uint32) string {
 }

 type model interface {
-	KV() KV
-	Tensors() Tensors
-}
-
-type KV map[string]any
-
-func (kv KV) u64(key string) uint64 {
-	switch v := kv[key].(type) {
-	case uint64:
-		return v
-	case uint32:
-		return uint64(v)
-	case float64:
-		return uint64(v)
-	default:
-		return 0
-	}
-}
-
-func (kv KV) Architecture() string {
-	if s, ok := kv["general.architecture"].(string); ok {
-		return s
-	}
-
-	return "unknown"
-}
-
-func (kv KV) ParameterCount() uint64 {
-	return kv.u64("general.parameter_count")
-}
-
-func (kv KV) FileType() string {
-	if u64 := kv.u64("general.file_type"); u64 > 0 {
-		return fileType(uint32(u64))
-	}
-
-	return "unknown"
-}
-
-func (kv KV) BlockCount() uint64 {
-	return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
-}
-
-func (kv KV) HeadCount() uint64 {
-	return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
-}
-
-func (kv KV) HeadCountKV() uint64 {
-	if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
-		return headCountKV
-	}
-
-	return 1
-}
-
-func (kv KV) GQA() uint64 {
-	return kv.HeadCount() / kv.HeadCountKV()
-}
-
-func (kv KV) EmbeddingLength() uint64 {
-	return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
-}
-
-func (kv KV) ContextLength() uint64 {
-	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
-}
-
-type Tensors []*Tensor
-
-func (ts Tensors) Layers() map[string]Layer {
-	layers := make(map[string]Layer)
-	for _, t := range ts {
-		parts := strings.Split(t.Name, ".")
-		if parts[0] == "blk" {
-			parts = parts[1:]
-		}
-
-		if _, ok := layers[parts[0]]; !ok {
-			layers[parts[0]] = make(Layer)
-		}
-
-		layers[parts[0]][strings.Join(parts[1:], ".")] = t
-	}
-
-	return layers
-}
-
-type Layer map[string]*Tensor
-
-func (l Layer) size() (size uint64) {
-	for _, t := range l {
-		size += t.size()
-	}
-
-	return size
-}
-
-type Tensor struct {
-	Name   string `json:"name"`
-	Kind   uint32 `json:"kind"`
-	Offset uint64 `json:"-"`
-
-	// Shape is the number of elements in each dimension
-	Shape []uint64 `json:"shape"`
-
-	io.WriterTo `json:"-"`
-}
-
-func (t Tensor) blockSize() uint64 {
-	switch {
-	case t.Kind < 2:
-		return 1
-	case t.Kind < 10:
-		return 32
-	default:
-		return 256
-	}
-}
-
-func (t Tensor) typeSize() uint64 {
-	blockSize := t.blockSize()
-
-	switch t.Kind {
-	case 0: // FP32
-		return 4
-	case 1: // FP16
-		return 2
-	case 2: // Q4_0
-		return 2 + blockSize/2
-	case 3: // Q4_1
-		return 2 + 2 + blockSize/2
-	case 6: // Q5_0
-		return 2 + 4 + blockSize/2
-	case 7: // Q5_1
-		return 2 + 2 + 4 + blockSize/2
-	case 8: // Q8_0
-		return 2 + blockSize
-	case 9: // Q8_1
-		return 4 + 4 + blockSize
-	case 10: // Q2_K
-		return blockSize/16 + blockSize/4 + 2 + 2
-	case 11: // Q3_K
-		return blockSize/8 + blockSize/4 + 12 + 2
-	case 12: // Q4_K
-		return 2 + 2 + 12 + blockSize/2
-	case 13: // Q5_K
-		return 2 + 2 + 12 + blockSize/8 + blockSize/2
-	case 14: // Q6_K
-		return blockSize/2 + blockSize/4 + blockSize/16 + 2
-	case 15: // Q8_K
-		return 2 + blockSize + 2*blockSize/16
-	case 16: // IQ2_XXS
-		return 2 + 2*blockSize/8
-	case 17: // IQ2_XS
-		return 2 + 2*blockSize/8 + blockSize/32
-	case 18: // IQ3_XXS
-		return 2 + 3*blockSize/8
-	default:
-		return 0
-	}
-}
-
-func (t Tensor) parameters() uint64 {
-	var count uint64 = 1
-	for _, n := range t.Shape {
-		count *= n
-	}
-	return count
-}
-
-func (t Tensor) size() uint64 {
-	return t.parameters() * t.typeSize() / t.blockSize()
+	ModelFamily() string
+	ModelType() string
+	FileType() string
+	NumLayers() uint32
+	NumGQA() uint32
+	NumEmbed() uint32
+	NumHead() uint32
+	NumHeadKv() uint32
+	NumCtx() uint32
 }

 type container interface {
@@ -285,102 +122,42 @@ const (

 var ErrUnsupportedFormat = errors.New("unsupported model format")

-func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
+func DecodeGGML(rs io.ReadSeeker) (*GGML, error) {
 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
-		return nil, 0, err
+		return nil, err
 	}

 	var c container
 	switch magic {
 	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
-		return nil, 0, ErrUnsupportedFormat
+		return nil, ErrUnsupportedFormat
 	case FILE_MAGIC_GGLA:
-		c = &containerGGLA{}
+		c = &ContainerGGLA{}
 	case FILE_MAGIC_GGUF_LE:
-		c = &containerGGUF{ByteOrder: binary.LittleEndian}
+		c = &ContainerGGUF{ByteOrder: binary.LittleEndian}
 	case FILE_MAGIC_GGUF_BE:
-		c = &containerGGUF{ByteOrder: binary.BigEndian}
+		c = &ContainerGGUF{ByteOrder: binary.BigEndian}
 	default:
-		return nil, 0, errors.New("invalid file magic")
+		return nil, errors.New("invalid file magic")
 	}

 	model, err := c.Decode(rs)
 	if errors.Is(err, io.EOF) {
 		// noop
 	} else if err != nil {
-		return nil, 0, err
+		return nil, err
 	}

 	offset, err := rs.Seek(0, io.SeekCurrent)
 	if err != nil {
-		return nil, 0, err
+		return nil, err
 	}

 	// final model type
 	return &GGML{
 		container: c,
 		model:     model,
-	}, offset, nil
-}
-
-func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
-	embedding := llm.KV().EmbeddingLength()
-	heads := llm.KV().HeadCount()
-	headsKV := llm.KV().HeadCountKV()
-	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
-
-	layers := llm.Tensors().Layers()
-
-	switch llm.KV().Architecture() {
-	case "llama":
-		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
-
-		partialOffload = 4 * batch * embedding
-		partialOffload += max(
-			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
-			4*batch*(embedding+vocab)+embedding*vocab*105/128,
-		)
-
-		if ffnGateWeight, ok := layers["0"]["ffn_gate.0.weight"]; ok {
-			ffnGateWeight1 := ffnGateWeight.Shape[1]
-			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
-			partialOffload = max(
-				4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
-				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
-			)
-		}
-	case "gemma":
-		fullOffload = 4 * batch * (embedding + vocab)
-		partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
-	case "command-r":
-		fullOffload = max(
-			4*batch*(embedding+vocab),
-			4*batch*(2+4*embedding+context*(1+heads)),
-		)
-
-		partialOffload = max(
-			4*batch*(embedding+vocab)+embedding*vocab*105/128,
-			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
-		)
-	case "qwen2":
-		fullOffload = max(
-			4*batch*(embedding+vocab),
-			4*batch*(1+2*embedding+context+context*heads),
-		)
-
-		partialOffload = max(
-			4*batch*(embedding+vocab)+embedding*vocab*105/128,
-			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
-		)
-	case "phi2":
-		fullOffload = max(
-			4*batch*(embedding+vocab),
-			4*batch*(1+4*embedding+context+context*heads),
-		)
-
-		partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
-	}
-
-	return
+		Size:      offset,
+	}, nil
 }
--- a/llm/gguf.go
+++ b/llm/gguf.go
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -0,0 +1,100 @@
+package llm
+
+import (
+	_ "embed"
+	"fmt"
+	"time"
+
+	"github.com/ollama/ollama/api"
+)
+
+const jsonGrammar = `
+root   ::= object
+value  ::= object | array | string | number | ("true" | "false" | "null") ws
+
+object ::=
+  "{" ws (
+            string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+
+array  ::=
+  "[" ws (
+            value
+    ("," ws value)*
+  )? "]" ws
+
+string ::=
+  "\"" (
+    [^"\\] |
+    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+  )* "\"" ws
+
+number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+
+# Optional space: by convention, applied in this grammar after literal chars when allowed
+ws ::= ([ \t\n] ws)?
+`
+
+type ImageData struct {
+	Data []byte `json:"data"`
+	ID   int    `json:"id"`
+}
+
+var payloadMissing = fmt.Errorf("expected dynamic library payloads not included in this build of ollama")
+
+type prediction struct {
+	Content string `json:"content"`
+	Model   string `json:"model"`
+	Prompt  string `json:"prompt"`
+	Stop    bool   `json:"stop"`
+
+	Timings struct {
+		PredictedN  int     `json:"predicted_n"`
+		PredictedMS float64 `json:"predicted_ms"`
+		PromptN     int     `json:"prompt_n"`
+		PromptMS    float64 `json:"prompt_ms"`
+	}
+}
+
+const maxRetries = 3
+
+type PredictOpts struct {
+	Prompt  string
+	Format  string
+	Images  []ImageData
+	Options api.Options
+}
+
+type PredictResult struct {
+	Content            string
+	Done               bool
+	PromptEvalCount    int
+	PromptEvalDuration time.Duration
+	EvalCount          int
+	EvalDuration       time.Duration
+}
+
+type TokenizeRequest struct {
+	Content string `json:"content"`
+}
+
+type TokenizeResponse struct {
+	Tokens []int `json:"tokens"`
+}
+
+type DetokenizeRequest struct {
+	Tokens []int `json:"tokens"`
+}
+
+type DetokenizeResponse struct {
+	Content string `json:"content"`
+}
+
+type EmbeddingRequest struct {
+	Content string `json:"content"`
+}
+
+type EmbeddingResponse struct {
+	Embedding []float64 `json:"embedding"`
+}
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -1,86 +1,175 @@
 package llm

-// #cgo CFLAGS: -Illama.cpp
-// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
-// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
-// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
-// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
-// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
-// #include <stdlib.h>
-// #include "llama.h"
-import "C"
 import (
+	"context"
 	"fmt"
-	"unsafe"
+	"log/slog"
+	"os"
+	"runtime"
+	"slices"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/gpu"
 )

-// SystemInfo is an unused example of calling llama.cpp functions using CGo
-func SystemInfo() string {
-	return C.GoString(C.llama_print_system_info())
+type LLM interface {
+	Predict(context.Context, PredictOpts, func(PredictResult)) error
+	Embedding(context.Context, string) ([]float64, error)
+	Encode(context.Context, string) ([]int, error)
+	Decode(context.Context, []int) (string, error)
+	Close()
 }

-func Quantize(infile, outfile, filetype string) error {
-	cinfile := C.CString(infile)
-	defer C.free(unsafe.Pointer(cinfile))
+var cpuOnlyFamilies = []string{
+	"mamba",
+}

-	coutfile := C.CString(outfile)
-	defer C.free(unsafe.Pointer(coutfile))
+func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+	if _, err := os.Stat(model); err != nil {
+		return nil, err
+	}

-	params := C.llama_model_quantize_default_params()
-	params.nthread = -1
+	f, err := os.Open(model)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()

-	switch filetype {
-	case "F32":
-		params.ftype = fileTypeF32
-	case "F16":
-		params.ftype = fileTypeF16
-	case "Q4_0":
-		params.ftype = fileTypeQ4_0
-	case "Q4_1":
-		params.ftype = fileTypeQ4_1
-	case "Q4_1_F16":
-		params.ftype = fileTypeQ4_1_F16
-	case "Q8_0":
-		params.ftype = fileTypeQ8_0
-	case "Q5_0":
-		params.ftype = fileTypeQ5_0
-	case "Q5_1":
-		params.ftype = fileTypeQ5_1
-	case "Q2_K":
-		params.ftype = fileTypeQ2_K
-	case "Q3_K_S":
-		params.ftype = fileTypeQ3_K_S
-	case "Q3_K_M":
-		params.ftype = fileTypeQ3_K_M
-	case "Q3_K_L":
-		params.ftype = fileTypeQ3_K_L
-	case "Q4_K_S":
-		params.ftype = fileTypeQ4_K_S
-	case "Q4_K_M":
-		params.ftype = fileTypeQ4_K_M
-	case "Q5_K_S":
-		params.ftype = fileTypeQ5_K_S
-	case "Q5_K_M":
-		params.ftype = fileTypeQ5_K_M
-	case "Q6_K":
-		params.ftype = fileTypeQ6_K
-	case "IQ2_XXS":
-		params.ftype = fileTypeIQ2_XXS
-	case "IQ2_XS":
-		params.ftype = fileTypeIQ2_XS
-	case "Q2_K_S":
-		params.ftype = fileTypeQ2_K_S
-	case "Q3_K_XS":
-		params.ftype = fileTypeQ3_K_XS
-	case "IQ3_XXS":
-		params.ftype = fileTypeIQ3_XXS
+	ggml, err := DecodeGGML(f)
+	if err != nil {
+		return nil, err
+	}
+
+	if opts.NumCtx > int(ggml.NumCtx()) {
+		slog.Warn(fmt.Sprintf("requested context length is greater than model's max context length (%d > %d), using %d instead", opts.NumCtx, ggml.NumCtx(), ggml.NumCtx()))
+		opts.NumCtx = int(ggml.NumCtx())
+	}
+
+	if opts.NumCtx < 4 {
+		opts.NumCtx = 4
+	}
+
+	vram, _ := gpu.CheckVRAM()
+	size := ggml.Size
+
+	// fp16 k,v matrices require n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 (key and value)
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1))
+
+	// this amount is the overhead + tensors in memory
+	// TODO: get this from the llama.cpp's graph calculations instead of
+	// estimating it's 1/6 * kv_cache_size * num_gqa
+	graph := int64(ggml.NumGQA()) * kv / 6
+
+	// certain model architectures don't support gpu inference yet
+	if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) {
+		opts.NumGPU = 0
+	}
+
+	info := gpu.GetGPUInfo()
+	switch runtime.GOOS {
+	case "darwin":
+		if opts.NumGPU == 0 {
+			break
+		}
+
+		if size+kv+graph > vram {
+			slog.Info("not enough vram available, setting num_gpu=0")
+			opts.NumGPU = 0
+			break
+		}
+
+		// TODO: implement layer splitting on macOS
+		opts.NumGPU = 999
 	default:
-		return fmt.Errorf("unknown filetype: %s", filetype)
+		if info.Library == "cpu" {
+			slog.Info("GPU not available, falling back to CPU")
+			opts.NumGPU = 0
+			break
+		}
+
+		// don't use GPU at all if no layers are loaded
+		if opts.NumGPU == 0 {
+			info.Library = "cpu"
+			info.Variant = gpu.GetCPUVariant()
+			break
+		}
+
+		// user-defined GPU count
+		if opts.NumGPU != -1 {
+			break
+		}
+
+		// the "main" GPU needs the most memory and determines the limit
+		// of how many layers can be loaded. It needs to fit:
+		// 1. the full compute graph allocation for all devices (graph)
+		// 2. the proportional kv cache for all devices (kv * % layers)
+		// 3. the proportional model (size * % layers / # devices)
+		// This estimates the number of layers
+		maxlayers := int64(ggml.NumLayers()) + 1
+		devices := int64(info.DeviceCount)
+		avg := vram / devices
+		layers := maxlayers * (avg - graph) / (kv + size/devices)
+		if layers > maxlayers {
+			layers = maxlayers
+		}
+
+		// 1 + 2 must fit on the main gpu
+		min := graph + kv*layers/maxlayers
+		if layers <= 0 || min > avg {
+			slog.Info("not enough vram available, falling back to CPU only")
+			info.Library = "cpu"
+			info.Variant = gpu.GetCPUVariant()
+			opts.NumGPU = 0
+			break
+		}
+
+		opts.NumGPU = int(layers)
 	}

-	if retval := C.llama_model_quantize(cinfile, coutfile, &params); retval != 0 {
-		return fmt.Errorf("llama_model_quantize: %d", retval)
-	}
-
-	return nil
+	opts.RopeFrequencyBase = 0.0
+	opts.RopeFrequencyScale = 0.0
+	return newLlmServer(info, model, adapters, projectors, opts)
+}
+
+// Give any native cgo implementations an opportunity to initialize
+func Init() error {
+	return nativeInit()
+}
+
+func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+	dynLibs := getDynLibs(gpuInfo)
+
+	// Check to see if the user has requested a specific library instead of auto-detecting
+	demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
+	if demandLib != "" {
+		libPath := availableDynLibs[demandLib]
+		if libPath == "" {
+			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
+		} else {
+			slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
+			dynLibs = []string{libPath}
+		}
+	}
+
+	// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
+	_, err := os.Stat(dynLibs[0])
+	if err != nil {
+		slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
+		err = nativeInit()
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	err2 := fmt.Errorf("unable to locate suitable llm library")
+	for _, dynLib := range dynLibs {
+		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
+		if err == nil {
+			return srv, nil
+		}
+		slog.Warn(fmt.Sprintf("Failed to load dynamic library %s  %s", dynLib, err))
+		err2 = err
+	}
+
+	return nil, err2
 }
--- a/llm/llm_linux.go
+++ b/llm/llm_linux.go
@@ -1,6 +0,0 @@
-package llm
-
-import "embed"
-
-//go:embed build/linux/*/*/bin/*
-var libEmbed embed.FS
--- a/llm/llm_windows.go
+++ b/llm/llm_windows.go
@@ -1,6 +0,0 @@
-package llm
-
-import "embed"
-
-//go:embed build/windows/*/*/bin/*
-var libEmbed embed.FS
--- a/llm/patches/04-locale.diff
+++ b/llm/patches/04-locale.diff
@@ -0,0 +1,13 @@
+diff --git a/llama.cpp b/llama.cpp
+index b27aa272..99372f9c 100644
+--- a/llama.cpp
+++ b/llama.cpp
+@@ -9360,7 +9360,7 @@ struct llm_tokenizer_wpm {
+     }
+ 
+     uint32_t to_lower(uint32_t code) {
+-        static const std::locale locale("en_US.UTF-8");
++        static const std::locale locale("");
+ #if defined(_WIN32)
+         if (code > 0xFFFF) {
+             return code;
--- a/llm/payload.go
+++ b/llm/payload.go
@@ -1,211 +0,0 @@
-package llm
-
-import (
-	"compress/gzip"
-	"errors"
-	"fmt"
-	"io"
-	"io/fs"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"strings"
-
-	"golang.org/x/exp/slices"
-	"golang.org/x/sync/errgroup"
-
-	"github.com/ollama/ollama/gpu"
-)
-
-var errPayloadMissing = fmt.Errorf("expected payloads not included in this build of ollama")
-
-func Init() error {
-	payloadsDir, err := gpu.PayloadsDir()
-	if err != nil {
-		return err
-	}
-
-	slog.Info("extracting embedded files", "dir", payloadsDir)
-	binGlob := "build/*/*/*/bin/*"
-
-	// extract server libraries
-	err = extractFiles(payloadsDir, binGlob)
-	if err != nil {
-		return fmt.Errorf("extract binaries: %v", err)
-	}
-
-	var variants []string
-	for v := range availableServers() {
-		variants = append(variants, v)
-	}
-	slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
-	slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
-
-	return nil
-}
-
-// binary names may contain an optional variant separated by '_'
-// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
-// Any library without a variant is the lowest common denominator
-func availableServers() map[string]string {
-	payloadsDir, err := gpu.PayloadsDir()
-	if err != nil {
-		slog.Error("payload lookup error", "error", err)
-		return nil
-	}
-
-	// glob payloadsDir for files that start with ollama_
-	pattern := filepath.Join(payloadsDir, "*")
-
-	files, err := filepath.Glob(pattern)
-	if err != nil {
-		slog.Debug("could not glob", "pattern", pattern, "error", err)
-		return nil
-	}
-
-	servers := make(map[string]string)
-	for _, file := range files {
-		slog.Debug("availableServers : found", "file", file)
-		servers[filepath.Base(file)] = file
-	}
-
-	return servers
-}
-
-// serversForGpu returns a list of compatible servers give the provided GPU
-// info, ordered by performance. assumes Init() has been called
-// TODO - switch to metadata based mapping
-func serversForGpu(info gpu.GpuInfo) []string {
-	// glob workDir for files that start with ollama_
-	availableServers := availableServers()
-	requested := info.Library
-	if info.Variant != "" {
-		requested += "_" + info.Variant
-	}
-
-	servers := []string{}
-
-	// exact match first
-	for a := range availableServers {
-		if a == requested {
-			servers = []string{a}
-
-			if a == "metal" {
-				return servers
-			}
-
-			break
-		}
-	}
-
-	alt := []string{}
-
-	// Then for GPUs load alternates and sort the list for consistent load ordering
-	if info.Library != "cpu" {
-		for a := range availableServers {
-			if info.Library == strings.Split(a, "_")[0] && a != requested {
-				alt = append(alt, a)
-			}
-		}
-
-		slices.Sort(alt)
-		servers = append(servers, alt...)
-	}
-
-	// Load up the best CPU variant if not primary requested
-	if info.Library != "cpu" {
-		variant := gpu.GetCPUVariant()
-		// If no variant, then we fall back to default
-		// If we have a variant, try that if we find an exact match
-		// Attempting to run the wrong CPU instructions will panic the
-		// process
-		if variant != "" {
-			for cmp := range availableServers {
-				if cmp == "cpu_"+variant {
-					servers = append(servers, cmp)
-					break
-				}
-			}
-		} else {
-			servers = append(servers, "cpu")
-		}
-	}
-
-	if len(servers) == 0 {
-		servers = []string{"cpu"}
-	}
-
-	return servers
-}
-
-// extract extracts the embedded files to the target directory
-func extractFiles(targetDir string, glob string) error {
-	files, err := fs.Glob(libEmbed, glob)
-	if err != nil || len(files) == 0 {
-		return errPayloadMissing
-	}
-
-	if err := os.MkdirAll(targetDir, 0o755); err != nil {
-		return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
-	}
-
-	g := new(errgroup.Group)
-
-	// build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
-	for _, file := range files {
-		filename := file
-
-		variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
-
-		slog.Debug("extracting", "variant", variant, "file", filename)
-
-		g.Go(func() error {
-			srcf, err := libEmbed.Open(filename)
-			if err != nil {
-				return err
-			}
-			defer srcf.Close()
-
-			src := io.Reader(srcf)
-			if strings.HasSuffix(filename, ".gz") {
-				src, err = gzip.NewReader(src)
-				if err != nil {
-					return fmt.Errorf("decompress payload %s: %v", filename, err)
-				}
-				filename = strings.TrimSuffix(filename, ".gz")
-			}
-
-			variantDir := filepath.Join(targetDir, variant)
-			if err := os.MkdirAll(variantDir, 0o755); err != nil {
-				return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err)
-			}
-
-			base := filepath.Base(filename)
-			destFilename := filepath.Join(variantDir, base)
-
-			_, err = os.Stat(destFilename)
-			switch {
-			case errors.Is(err, os.ErrNotExist):
-				destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-				if err != nil {
-					return fmt.Errorf("write payload %s: %v", filename, err)
-				}
-				defer destFile.Close()
-				if _, err := io.Copy(destFile, src); err != nil {
-					return fmt.Errorf("copy payload %s: %v", filename, err)
-				}
-			case err != nil:
-				return fmt.Errorf("stat payload %s: %v", filename, err)
-			}
-			return nil
-		})
-	}
-
-	err = g.Wait()
-	if err != nil {
-		// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
-		gpu.Cleanup()
-		return err
-	}
-	return nil
-}
--- a/llm/payload_common.go
+++ b/llm/payload_common.go
@@ -0,0 +1,233 @@
+package llm
+
+import (
+	"compress/gzip"
+	"errors"
+	"fmt"
+	"io"
+	"io/fs"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"sync"
+
+	"golang.org/x/exp/slices"
+	"golang.org/x/sync/errgroup"
+
+	"github.com/ollama/ollama/gpu"
+)
+
+// availableDynLibs maps library names to library locations. Names may contain an optional variant separated by '_'
+// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
+// Any library without a variant is the lowest common denominator
+var availableDynLibs = map[string]string{}
+
+const pathComponentCount = 7 // number of "/"-separated parts in llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
+
+// getDynLibs returns an ordered list of LLM libraries (values of availableDynLibs, or the literal "default") to try, starting with the best
+func getDynLibs(gpuInfo gpu.GpuInfo) []string {
+	// Short circuit if we know we're using the default built-in (darwin only)
+	if gpuInfo.Library == "default" {
+		return []string{"default"}
+	}
+	// TODO - temporary until we have multiple CPU variations for Darwin
+	// Short circuit on darwin with metal only: "metal" is the single available entry
+	if len(availableDynLibs) == 1 {
+		if _, onlyMetal := availableDynLibs["metal"]; onlyMetal {
+			return []string{availableDynLibs["metal"]}
+		}
+	}
+
+	exactMatch := ""
+	dynLibs := []string{}
+	altDynLibs := []string{}
+	requested := gpuInfo.Library
+	if gpuInfo.Variant != "" {
+		requested += "_" + gpuInfo.Variant
+	}
+	// Try to find an exact match for "<library>" or "<library>_<variant>"; it goes first in the result
+	for cmp := range availableDynLibs {
+		if requested == cmp {
+			exactMatch = cmp
+			dynLibs = []string{availableDynLibs[cmp]}
+			break
+		}
+	}
+	// Then for GPUs load alternates (same library prefix before '_') and sort the list for consistent load ordering
+	if gpuInfo.Library != "cpu" {
+		for cmp := range availableDynLibs {
+			if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
+				altDynLibs = append(altDynLibs, cmp)
+			}
+		}
+		slices.Sort(altDynLibs)
+		for _, altDynLib := range altDynLibs {
+			dynLibs = append(dynLibs, availableDynLibs[altDynLib])
+		}
+	}
+
+	// Load up the best CPU variant as the last fallback if CPU was not the primary request
+	if gpuInfo.Library != "cpu" {
+		variant := gpu.GetCPUVariant()
+		// If no variant, then we fall back to default
+		// If we have a variant, try that if we find an exact match
+		// Attempting to run the wrong CPU instructions will panic the
+		// process
+		if variant != "" {
+			for cmp := range availableDynLibs {
+				if cmp == "cpu_"+variant {
+					dynLibs = append(dynLibs, availableDynLibs[cmp])
+					break
+				}
+			}
+		} else {
+			dynLibs = append(dynLibs, availableDynLibs["cpu"]) // NOTE(review): appends "" when no "cpu" entry exists
+		}
+	}
+
+	// Finally, if we didn't find any matches, LCD CPU FTW
+	if len(dynLibs) == 0 {
+		dynLibs = []string{availableDynLibs["cpu"]} // NOTE(review): yields [""] when no "cpu" entry exists
+	}
+	slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
+	return dynLibs
+}
+
+func rocmDynLibPresent() bool { // reports whether any availableDynLibs name starts with "rocm"
+	for dynLibName := range availableDynLibs {
+		if strings.HasPrefix(dynLibName, "rocm") {
+			return true
+		}
+	}
+	return false
+}
+
+func nativeInit() error { // extracts the embedded libraries, registers them in availableDynLibs, then checks driver access
+	payloadsDir, err := gpu.PayloadsDir()
+	if err != nil {
+		return err
+	}
+
+	slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir))
+
+	libs, err := extractDynamicLibs(payloadsDir, "llama.cpp/build/*/*/*/lib/*")
+	if err != nil {
+		if errors.Is(err, payloadMissing) { // missing payloads are logged and treated as success
+			slog.Info(fmt.Sprintf("%s", payloadMissing))
+			return nil
+		}
+		return err
+	}
+	for _, lib := range libs {
+		// The last dir component is the variant name, which becomes the availableDynLibs key
+		variant := filepath.Base(filepath.Dir(lib))
+		availableDynLibs[variant] = lib
+	}
+
+	if err := verifyDriverAccess(); err != nil {
+		return err
+	}
+
+	// Report which dynamic libraries we have registered to assist troubleshooting
+	variants := make([]string, len(availableDynLibs))
+	i := 0
+	for variant := range availableDynLibs {
+		variants[i] = variant
+		i++
+	}
+	slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
+	slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
+
+	return nil
+}
+
+func extractDynamicLibs(payloadsDir, glob string) ([]string, error) { // extracts embedded files matching glob under payloadsDir; returns the extracted "server" library paths
+	files, err := fs.Glob(libEmbed, glob)
+	if err != nil || len(files) == 0 {
+		return nil, payloadMissing
+	}
+
+	var mu sync.Mutex // guards libs, which is appended to from the extraction goroutines
+	var libs []string
+	var g errgroup.Group
+	for _, file := range files {
+		pathComps := strings.Split(file, "/")
+		if len(pathComps) != pathComponentCount {
+			slog.Error(fmt.Sprintf("unexpected payload components: %v", pathComps))
+			continue
+		}
+
+		file := file
+		g.Go(func() error {
+			// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
+			// Include the variant in the path to avoid conflicts between multiple server libs
+			targetDir := filepath.Join(payloadsDir, pathComps[pathComponentCount-3])
+			srcFile, err := libEmbed.Open(file)
+			if err != nil {
+				return fmt.Errorf("read payload %s: %v", file, err)
+			}
+			defer srcFile.Close()
+			if err := os.MkdirAll(targetDir, 0o755); err != nil {
+				return fmt.Errorf("create payload lib dir %s: %v", targetDir, err) // report the directory that actually failed
+			}
+			src := io.Reader(srcFile)
+			filename := file
+			if strings.HasSuffix(file, ".gz") { // gzip payloads are decompressed and lose the .gz suffix
+				src, err = gzip.NewReader(src)
+				if err != nil {
+					return fmt.Errorf("decompress payload %s: %v", file, err)
+				}
+				filename = strings.TrimSuffix(filename, ".gz")
+			}
+
+			destFile := filepath.Join(targetDir, filepath.Base(filename))
+			if strings.Contains(filepath.Base(destFile), "server") { // match the file name only, never a directory component
+				mu.Lock()
+				libs = append(libs, destFile)
+				mu.Unlock()
+			}
+
+			destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+			if err != nil {
+				return fmt.Errorf("write payload %s: %v", file, err)
+			}
+			defer destFp.Close()
+			if _, err := io.Copy(destFp, src); err != nil {
+				return fmt.Errorf("copy payload %s: %v", file, err)
+			}
+			return nil
+		})
+	}
+	err = g.Wait()
+	if err != nil {
+		// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
+		gpu.Cleanup()
+		return nil, err
+	}
+	return libs, nil
+}
+
+func verifyDriverAccess() error { // on linux with a rocm library present, checks that /dev/kfd can be opened read-write
+	if runtime.GOOS != "linux" {
+		return nil
+	}
+	// Only check ROCm access if a rocm dynamic lib is registered in availableDynLibs
+	if rocmDynLibPresent() {
+		// Verify we have permissions - either running as root, or we have group access to the driver
+		fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
+		if err != nil {
+			if errors.Is(err, fs.ErrPermission) {
+				return fmt.Errorf("Radeon card detected, but permissions not set up properly.  Either run ollama as root, or add you user account to the render group.")
+			} else if errors.Is(err, fs.ErrNotExist) {
+				// expected behavior without a radeon card: a missing device node is not an error
+				return nil
+			}
+
+			return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
+		}
+		fd.Close() // the open was only a permission probe
+	}
+	return nil
+}
--- a/llm/payload_darwin_amd64.go
+++ b/llm/payload_darwin_amd64.go
@@ -0,0 +1,8 @@
+package llm
+
+import (
+	"embed"
+)
+
+//go:embed llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
+var libEmbed embed.FS
--- a/llm/payload_darwin_arm64.go
+++ b/llm/payload_darwin_arm64.go
@@ -0,0 +1,8 @@
+package llm
+
+import (
+	"embed"
+)
+
+//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
+var libEmbed embed.FS
--- a/llm/llm_darwin_amd64.go
+++ b/llm/llm_darwin_amd64.go
@@ -4,5 +4,5 @@ import (
 	"embed"
 )

-//go:embed build/darwin/x86_64/*/bin/*
+//go:embed llama.cpp/build/linux/*/*/lib/*
 var libEmbed embed.FS
--- a/llm/payload_test.go
+++ b/llm/payload_test.go
@@ -0,0 +1,58 @@
+package llm
+
+import (
+	"testing"
+
+	"github.com/ollama/ollama/gpu"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestGetDynLibs(t *testing.T) { // exercises getDynLibs ordering and rocmDynLibPresent against hand-built availableDynLibs maps
+	availableDynLibs = map[string]string{
+		"cpu": "X_cpu",
+	}
+	assert.Equal(t, false, rocmDynLibPresent())
+	res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
+	assert.Len(t, res, 1)
+	assert.Equal(t, availableDynLibs["cpu"], res[0])
+
+	variant := gpu.GetCPUVariant()
+	if variant != "" {
+		variant = "_" + variant // so "cpu"+variant is the key getDynLibs looks up on this host
+	}
+	availableDynLibs = map[string]string{
+		"rocm_v5":       "X_rocm_v5",
+		"rocm_v6":       "X_rocm_v6",
+		"cpu" + variant: "X_cpu",
+	}
+	assert.Equal(t, true, rocmDynLibPresent())
+	res = getDynLibs(gpu.GpuInfo{Library: "rocm"}) // no variant: alternates in sorted order, then cpu
+	assert.Len(t, res, 3)
+	assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
+	assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
+	assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
+
+	res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) // exact match first, then alternates, then cpu
+	assert.Len(t, res, 3)
+	assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
+	assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
+	assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
+
+	res = getDynLibs(gpu.GpuInfo{Library: "cuda"}) // no cuda entries: only the cpu fallback
+	assert.Len(t, res, 1)
+	assert.Equal(t, availableDynLibs["cpu"+variant], res[0])
+
+	res = getDynLibs(gpu.GpuInfo{Library: "default"})
+	assert.Len(t, res, 1)
+	assert.Equal(t, "default", res[0])
+
+	availableDynLibs = map[string]string{
+		"rocm":          "X_rocm_v5",
+		"cpu" + variant: "X_cpu",
+	}
+	assert.Equal(t, true, rocmDynLibPresent())
+	res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) // no exact "rocm_v6": the bare "rocm" entry is used as an alternate
+	assert.Len(t, res, 2)
+	assert.Equal(t, availableDynLibs["rocm"], res[0])
+	assert.Equal(t, availableDynLibs["cpu"+variant], res[1])
+}
--- a/llm/llm_darwin_arm64.go
+++ b/llm/llm_darwin_arm64.go
@@ -4,5 +4,5 @@ import (
 	"embed"
 )

-//go:embed build/darwin/arm64/*/bin/*
+//go:embed llama.cpp/build/windows/*/*/lib/*.dll*
 var libEmbed embed.FS
--- a/llm/server.go
+++ b/llm/server.go
@@ -1,843 +0,0 @@
-package llm
-
-import (
-	"bufio"
-	"bytes"
-	"context"
-	"encoding/json"
-	"errors"
-	"fmt"
-	"io"
-	"log"
-	"log/slog"
-	"math/rand"
-	"net"
-	"net/http"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"runtime"
-	"strconv"
-	"strings"
-	"time"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
-)
-
-// LlamaServer is an instance of the llama.cpp server
-type LlamaServer struct {
-	port    int
-	cmd     *exec.Cmd
-	done    chan error // Channel to signal when the process exits
-	status  *StatusWriter
-	options api.Options
-}
-
-func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
-	f, err := os.Open(model)
-	if err != nil {
-		return nil, err
-	}
-	defer f.Close()
-
-	ggml, _, err := DecodeGGML(f)
-	if err != nil {
-		return nil, err
-	}
-
-	if opts.NumCtx > int(ggml.KV().ContextLength()) {
-		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
-		opts.NumCtx = int(ggml.KV().ContextLength())
-	}
-
-	if opts.NumCtx < 4 {
-		opts.NumCtx = 4
-	}
-
-	memoryAvailable, _ := gpu.CheckVRAM()
-	info := gpu.GetGPUInfo()
-
-	memoryMinimum := info.MinimumMemory
-	for _, projector := range projectors {
-		memoryMinimum += projectorMemoryRequirements(projector)
-
-		// multimodal models require at least 2048 context
-		opts.NumCtx = max(opts.NumCtx, 2048)
-	}
-
-	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
-	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
-
-	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
-	if graphPartialOffload == 0 {
-		graphPartialOffload = ggml.KV().GQA() * kv / 6
-	}
-
-	if graphFullOffload == 0 {
-		graphFullOffload = graphPartialOffload
-	}
-
-	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload
-
-	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload
-
-	if info.Library != "metal" {
-		if memoryRequiredPartial > memoryAvailable {
-			info.Library = "cpu"
-		}
-	}
-
-	var layerCount int
-	layers := ggml.Tensors().Layers()
-	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		memoryLayer := layers[fmt.Sprintf("%d", i)].size()
-
-		// KV is proportional to the number of layers
-		memoryLayer += kv / ggml.KV().BlockCount()
-
-		memoryRequiredTotal += memoryLayer
-		if memoryAvailable > memoryRequiredPartial+memoryLayer {
-			memoryRequiredPartial += memoryLayer
-			layerCount++
-		}
-	}
-
-	memoryLayerOutput := layers["output"].size()
-	memoryRequiredTotal += memoryLayerOutput
-
-	if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
-		// disable partial offloading when model is greater than total system memory
-		opts.NumGPU = 0
-	} else if memoryAvailable > memoryRequiredTotal {
-		layerCount = int(ggml.KV().BlockCount()) + 1
-		memoryRequiredPartial = memoryRequiredTotal
-	}
-
-	if opts.NumGPU < 0 {
-		opts.NumGPU = layerCount
-	}
-
-	slog.Info(
-		"offload to gpu",
-		"reallayers", opts.NumGPU,
-		"layers", layerCount,
-		"required", format.HumanBytes2(memoryRequiredTotal),
-		"used", format.HumanBytes2(memoryRequiredPartial),
-		"available", format.HumanBytes2(memoryAvailable),
-		"kv", format.HumanBytes2(kv),
-		"fulloffload", format.HumanBytes2(graphFullOffload),
-		"partialoffload", format.HumanBytes2(graphPartialOffload),
-	)
-
-	if len(adapters) > 1 {
-		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
-	}
-
-	availableServers := availableServers()
-	servers := serversForGpu(info)
-
-	demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
-	if demandLib != "" {
-		serverPath := availableServers[demandLib]
-		if serverPath == "" {
-			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
-		} else {
-			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
-			servers = []string{demandLib}
-		}
-	}
-
-	if len(servers) == 0 {
-		return nil, fmt.Errorf("no servers found for %v", info)
-	}
-
-	params := []string{
-		"--model", model,
-		"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
-		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
-		"--embedding",
-	}
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
-		params = append(params, "--log-format", "json")
-	} else {
-		params = append(params, "--log-disable")
-	}
-
-	if opts.NumGPU >= 0 {
-		params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
-	}
-
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
-		params = append(params, "--verbose")
-	}
-
-	if opts.MainGPU > 0 {
-		params = append(params, "--main-gpu", fmt.Sprintf("%d", opts.MainGPU))
-	}
-
-	if len(adapters) > 0 {
-		// TODO: applying multiple adapters is not supported by the llama.cpp server yet
-		params = append(params, "--lora", adapters[0])
-	}
-
-	if len(projectors) > 0 {
-		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
-		params = append(params, "--mmproj", projectors[0])
-	}
-
-	if opts.NumThread > 0 {
-		params = append(params, "--threads", fmt.Sprintf("%d", opts.NumThread))
-	}
-
-	if !opts.F16KV {
-		params = append(params, "--memory-f32")
-	}
-
-	if opts.UseMLock {
-		params = append(params, "--mlock")
-	}
-
-	if !opts.UseMMap {
-		params = append(params, "--no-mmap")
-	}
-
-	if opts.UseNUMA {
-		params = append(params, "--numa")
-	}
-
-	// Loop through potential servers
-	var finalErr error
-	for i := 0; i < len(servers); i++ {
-		dir := availableServers[servers[i]]
-
-		// Find an availableServers  port, retry on each iterration in case the failure was a port conflict race
-		port := 0
-		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
-			var l *net.TCPListener
-			if l, err = net.ListenTCP("tcp", a); err == nil {
-				port = l.Addr().(*net.TCPAddr).Port
-				l.Close()
-			}
-		}
-		if port == 0 {
-			slog.Debug("ResolveTCPAddr failed ", "error", err)
-			port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
-		}
-		finalParams := append(params, "--port", strconv.Itoa(port))
-
-		pathEnv := "LD_LIBRARY_PATH"
-		if runtime.GOOS == "windows" {
-			pathEnv = "PATH"
-		}
-		// append the server directory to LD_LIBRARY_PATH/PATH
-		libraryPaths := []string{dir}
-		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-			// Append our runner directory to the path
-			// This will favor system libraries over our bundled library dependencies
-			libraryPaths = append(filepath.SplitList(libraryPath), libraryPaths...)
-		}
-
-		server := filepath.Join(dir, "ollama_llama_server")
-		if runtime.GOOS == "windows" {
-			server = server + ".exe"
-		}
-
-		s := &LlamaServer{
-			port:    port,
-			cmd:     exec.Command(server, finalParams...),
-			status:  NewStatusWriter(os.Stderr),
-			options: opts,
-		}
-		libEnv := fmt.Sprintf("%s=%s", pathEnv, strings.Join(libraryPaths, string(filepath.ListSeparator)))
-		slog.Debug(libEnv)
-		s.cmd.Env = append(os.Environ(), libEnv)
-		s.cmd.Stdout = os.Stdout
-		s.cmd.Stderr = s.status
-
-		slog.Info("starting llama server", "cmd", s.cmd.String())
-
-		if err = s.cmd.Start(); err != nil {
-			msg := ""
-			if s.status != nil && s.status.LastErrMsg != "" {
-				msg = s.status.LastErrMsg
-			}
-			err = fmt.Errorf("error starting the external llama server: %v %s", err, msg)
-			finalErr = err
-			continue
-		}
-
-		// reap subprocess when it exits
-		go func() {
-			// Exit status managed via getServerStatus
-			_ = s.cmd.Wait()
-		}()
-
-		return s, nil
-	}
-
-	slog.Error("unable to load any llama server", "error", finalErr)
-	return nil, finalErr
-}
-
-func projectorMemoryRequirements(filename string) uint64 {
-	file, err := os.Open(filename)
-	if err != nil {
-		return 0
-	}
-	defer file.Close()
-
-	ggml, _, err := DecodeGGML(file)
-	if err != nil {
-		return 0
-	}
-
-	var mem uint64
-	for _, layer := range ggml.Tensors().Layers() {
-		mem += layer.size()
-	}
-
-	return mem
-}
-
-type ServerStatus int
-
-const ( // iota is reset to 0
-	ServerStatusReady ServerStatus = iota
-	ServerStatusNoSlotsAvaialble
-	ServerStatusLoadingModel
-	ServerStatusNotResponding
-	ServerStatusError
-)
-
-type ServerStatusResp struct {
-	Status          string `json:"status"`
-	SlotsIdle       int    `json:"slots_idle"`
-	SlotsProcessing int    `json:"slots_processing"`
-	Error           string `json:"error"`
-}
-
-func (s *LlamaServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
-	// Fail fast if its exited
-	if s.cmd.ProcessState != nil {
-		msg := ""
-		if s.status != nil && s.status.LastErrMsg != "" {
-			msg = s.status.LastErrMsg
-		}
-		return ServerStatusError, fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
-	}
-
-	req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/health", s.port), nil)
-	if err != nil {
-		return ServerStatusError, fmt.Errorf("error creating GET request: %v", err)
-	}
-	req.Header.Set("Content-Type", "application/json")
-
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		if errors.Is(err, context.DeadlineExceeded) {
-			return ServerStatusNotResponding, fmt.Errorf("server not responding")
-		}
-		return ServerStatusError, fmt.Errorf("health resp: %w", err)
-	}
-	defer resp.Body.Close()
-
-	body, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return ServerStatusError, fmt.Errorf("read health request: %w", err)
-	}
-
-	var status ServerStatusResp
-	if err := json.Unmarshal(body, &status); err != nil {
-		return ServerStatusError, fmt.Errorf("health unmarshal encode response: %w", err)
-	}
-
-	switch status.Status {
-	case "ok":
-		return ServerStatusReady, nil
-	case "no slot available":
-		return ServerStatusNoSlotsAvaialble, nil
-	case "loading model":
-		return ServerStatusLoadingModel, nil
-	default:
-		return ServerStatusError, fmt.Errorf("server error: %+v", status)
-	}
-}
-
-func (s *LlamaServer) Ping(ctx context.Context) error {
-	_, err := s.getServerStatus(ctx)
-	if err != nil {
-		slog.Debug("server unhealthy", "error", err)
-		return err
-	}
-	return nil
-}
-
-func (s *LlamaServer) WaitUntilRunning() error {
-	start := time.Now()
-	// TODO we need to wire up a better way to detect hangs during model load and startup of the server
-	expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load
-	ticker := time.NewTicker(50 * time.Millisecond)
-	defer ticker.Stop()
-
-	slog.Info("waiting for llama runner to start responding")
-	var lastStatus ServerStatus = -1
-	for {
-		select {
-		case err := <-s.done:
-			msg := ""
-			if s.status != nil && s.status.LastErrMsg != "" {
-				msg = s.status.LastErrMsg
-			}
-			return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
-		case <-ticker.C:
-			if time.Now().After(expiresAt) {
-				// timeout
-				msg := ""
-				if s.status != nil && s.status.LastErrMsg != "" {
-					msg = s.status.LastErrMsg
-				}
-				return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
-			}
-			if s.cmd.ProcessState != nil {
-				msg := ""
-				if s.status != nil && s.status.LastErrMsg != "" {
-					msg = s.status.LastErrMsg
-				}
-				return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
-			}
-
-			ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
-			defer cancel()
-			status, err := s.getServerStatus(ctx)
-			if err != nil && lastStatus != status {
-				slog.Debug("server not yet available", "error", err)
-				lastStatus = status
-				continue
-			}
-
-			switch status {
-			case ServerStatusLoadingModel:
-				// TODO - this state never seems to happen with the current server.cpp code (bug?)
-				// it doesn't respond to the health endpoint until after the model is loaded
-				slog.Debug("loading model")
-			case ServerStatusReady:
-				slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds()))
-				return nil
-			}
-		}
-	}
-}
-
-const jsonGrammar = `
-root   ::= object
-value  ::= object | array | string | number | ("true" | "false" | "null") ws
-
-object ::=
-  "{" ws (
-            string ":" ws value
-    ("," ws string ":" ws value)*
-  )? "}" ws
-
-array  ::=
-  "[" ws (
-            value
-    ("," ws value)*
-  )? "]" ws
-
-string ::=
-  "\"" (
-    [^"\\] |
-    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
-  )* "\"" ws
-
-number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
-
-# Optional space: by convention, applied in this grammar after literal chars when allowed
-ws ::= ([ \t\n] ws)?
-`
-
-const maxBufferSize = 512 * format.KiloByte
-const maxRetries = 3
-
-type ImageData struct {
-	Data []byte `json:"data"`
-	ID   int    `json:"id"`
-}
-
-type completion struct {
-	Content string `json:"content"`
-	Model   string `json:"model"`
-	Prompt  string `json:"prompt"`
-	Stop    bool   `json:"stop"`
-
-	Timings struct {
-		PredictedN  int     `json:"predicted_n"`
-		PredictedMS float64 `json:"predicted_ms"`
-		PromptN     int     `json:"prompt_n"`
-		PromptMS    float64 `json:"prompt_ms"`
-	}
-}
-
-type CompletionRequest struct {
-	Prompt  string
-	Format  string
-	Images  []ImageData
-	Options api.Options
-}
-
-type CompletionResponse struct {
-	Content            string
-	Done               bool
-	PromptEvalCount    int
-	PromptEvalDuration time.Duration
-	EvalCount          int
-	EvalDuration       time.Duration
-}
-
-func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
-	request := map[string]any{
-		"prompt":            req.Prompt,
-		"stream":            true,
-		"n_predict":         req.Options.NumPredict,
-		"n_keep":            req.Options.NumKeep,
-		"main_gpu":          req.Options.MainGPU,
-		"temperature":       req.Options.Temperature,
-		"top_k":             req.Options.TopK,
-		"top_p":             req.Options.TopP,
-		"tfs_z":             req.Options.TFSZ,
-		"typical_p":         req.Options.TypicalP,
-		"repeat_last_n":     req.Options.RepeatLastN,
-		"repeat_penalty":    req.Options.RepeatPenalty,
-		"presence_penalty":  req.Options.PresencePenalty,
-		"frequency_penalty": req.Options.FrequencyPenalty,
-		"mirostat":          req.Options.Mirostat,
-		"mirostat_tau":      req.Options.MirostatTau,
-		"mirostat_eta":      req.Options.MirostatEta,
-		"penalize_nl":       req.Options.PenalizeNewline,
-		"seed":              req.Options.Seed,
-		"stop":              req.Options.Stop,
-		"image_data":        req.Images,
-		"cache_prompt":      true,
-	}
-
-	// Make sure the server is ready
-	status, err := s.getServerStatus(ctx)
-	if err != nil {
-		return err
-	} else if status != ServerStatusReady {
-		return fmt.Errorf("unexpected server status: %d", status)
-	}
-
-	if req.Format == "json" {
-		request["grammar"] = jsonGrammar
-		if !strings.Contains(strings.ToLower(req.Prompt), "json") {
-			slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
-		}
-	}
-
-	retryDelay := 100 * time.Microsecond
-	for retries := 0; retries < maxRetries; retries++ {
-		if retries > 0 {
-			time.Sleep(retryDelay) // wait before retrying
-			retryDelay *= 2        // exponential backoff
-		}
-
-		// Handling JSON marshaling with special characters unescaped.
-		buffer := &bytes.Buffer{}
-		enc := json.NewEncoder(buffer)
-		enc.SetEscapeHTML(false)
-
-		if err := enc.Encode(request); err != nil {
-			return fmt.Errorf("failed to marshal data: %v", err)
-		}
-
-		endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port)
-		req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
-		if err != nil {
-			return fmt.Errorf("error creating POST request: %v", err)
-		}
-		req.Header.Set("Content-Type", "application/json")
-
-		resp, err := http.DefaultClient.Do(req)
-		if err != nil {
-			return fmt.Errorf("POST predict: %v", err)
-		}
-		defer resp.Body.Close()
-
-		if resp.StatusCode >= 400 {
-			bodyBytes, err := io.ReadAll(resp.Body)
-			if err != nil {
-				return fmt.Errorf("failed reading llm error response: %w", err)
-			}
-			log.Printf("llm predict error: %s", bodyBytes)
-			return fmt.Errorf("%s", bodyBytes)
-		}
-
-		scanner := bufio.NewScanner(resp.Body)
-		buf := make([]byte, 0, maxBufferSize)
-		scanner.Buffer(buf, maxBufferSize)
-
-		retryNeeded := false
-		// keep track of the last token generated, this is used to abort if the model starts looping
-		var lastToken string
-		var tokenRepeat int
-
-		for scanner.Scan() {
-			select {
-			case <-ctx.Done():
-				// This handles the request cancellation
-				return ctx.Err()
-			default:
-				line := scanner.Bytes()
-				if len(line) == 0 {
-					continue
-				}
-
-				// try again on slot unavailable
-				if bytes.Contains(line, []byte("slot unavailable")) {
-					retryNeeded = true
-					break
-				}
-
-				evt, ok := bytes.CutPrefix(line, []byte("data: "))
-				if !ok {
-					return fmt.Errorf("error parsing llm response stream: %s", line)
-				}
-
-				var c completion
-				if err := json.Unmarshal(evt, &c); err != nil {
-					return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
-				}
-
-				switch {
-				case strings.TrimSpace(c.Content) == lastToken:
-					tokenRepeat++
-				default:
-					lastToken = strings.TrimSpace(c.Content)
-					tokenRepeat = 0
-				}
-
-				// 30 picked as an arbitrary max token repeat limit, modify as needed
-				if tokenRepeat > 30 {
-					slog.Debug("prediction aborted, token repeat limit reached")
-					return ctx.Err()
-				}
-
-				if c.Content != "" {
-					fn(CompletionResponse{
-						Content: c.Content,
-					})
-				}
-
-				if c.Stop {
-					fn(CompletionResponse{
-						Done:               true,
-						PromptEvalCount:    c.Timings.PromptN,
-						PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
-						EvalCount:          c.Timings.PredictedN,
-						EvalDuration:       parseDurationMs(c.Timings.PredictedMS),
-					})
-					return nil
-				}
-			}
-		}
-
-		if err := scanner.Err(); err != nil {
-			if strings.Contains(err.Error(), "unexpected EOF") {
-				s.Close()
-				msg := ""
-				if s.status != nil && s.status.LastErrMsg != "" {
-					msg = s.status.LastErrMsg
-				}
-
-				return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
-			}
-			return fmt.Errorf("error reading llm response: %v", err)
-		}
-
-		if !retryNeeded {
-			return nil // success
-		}
-	}
-
-	// should never reach here ideally
-	return fmt.Errorf("max retries exceeded")
-}
-
-type EmbeddingRequest struct {
-	Content string `json:"content"`
-}
-
-type EmbeddingResponse struct {
-	Embedding []float64 `json:"embedding"`
-}
-
-func (s *LlamaServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
-	// Make sure the server is ready
-	status, err := s.getServerStatus(ctx)
-	if err != nil {
-		return nil, err
-	} else if status != ServerStatusReady {
-		return nil, fmt.Errorf("unexpected server status: %d", status)
-	}
-
-	data, err := json.Marshal(TokenizeRequest{Content: prompt})
-	if err != nil {
-		return nil, fmt.Errorf("error marshaling embed data: %w", err)
-	}
-
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data))
-	if err != nil {
-		return nil, fmt.Errorf("error creating embed request: %w", err)
-	}
-	req.Header.Set("Content-Type", "application/json")
-
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		return nil, fmt.Errorf("do embedding request: %w", err)
-	}
-	defer resp.Body.Close()
-
-	body, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return nil, fmt.Errorf("error reading embed response: %w", err)
-	}
-
-	if resp.StatusCode >= 400 {
-		log.Printf("llm encode error: %s", body)
-		return nil, fmt.Errorf("%s", body)
-	}
-
-	var embedding EmbeddingResponse
-	if err := json.Unmarshal(body, &embedding); err != nil {
-		return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
-	}
-
-	return embedding.Embedding, nil
-}
-
-type TokenizeRequest struct {
-	Content string `json:"content"`
-}
-
-type TokenizeResponse struct {
-	Tokens []int `json:"tokens"`
-}
-
-func (s *LlamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
-	// Make sure the server is ready
-	status, err := s.getServerStatus(ctx)
-	if err != nil {
-		return nil, err
-	} else if status != ServerStatusReady {
-		return nil, fmt.Errorf("unexpected server status: %d", status)
-	}
-
-	data, err := json.Marshal(TokenizeRequest{Content: content})
-	if err != nil {
-		return nil, fmt.Errorf("marshaling encode data: %w", err)
-	}
-
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data))
-	if err != nil {
-		return nil, fmt.Errorf("encode request: %w", err)
-	}
-	req.Header.Set("Content-Type", "application/json")
-
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		return nil, fmt.Errorf("do encode request: %w", err)
-	}
-	defer resp.Body.Close()
-
-	body, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return nil, fmt.Errorf("read encode request: %w", err)
-	}
-
-	if resp.StatusCode >= 400 {
-		log.Printf("llm encode error: %s", body)
-		return nil, fmt.Errorf("%s", body)
-	}
-
-	var encoded TokenizeResponse
-	if err := json.Unmarshal(body, &encoded); err != nil {
-		return nil, fmt.Errorf("unmarshal encode response: %w", err)
-	}
-
-	return encoded.Tokens, nil
-}
-
-type DetokenizeRequest struct {
-	Tokens []int `json:"tokens"`
-}
-
-type DetokenizeResponse struct {
-	Content string `json:"content"`
-}
-
-func (s *LlamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
-	// Make sure the server is ready
-	status, err := s.getServerStatus(ctx)
-	if err != nil {
-		return "", err
-	} else if status != ServerStatusReady {
-		return "", fmt.Errorf("unexpected server status: %d", status)
-	}
-
-	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
-	if err != nil {
-		return "", fmt.Errorf("marshaling decode data: %w", err)
-	}
-
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data))
-	if err != nil {
-		return "", fmt.Errorf("decode request: %w", err)
-	}
-	req.Header.Set("Content-Type", "application/json")
-
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		return "", fmt.Errorf("do decode request: %w", err)
-	}
-	defer resp.Body.Close()
-
-	body, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return "", fmt.Errorf("read decode request: %w", err)
-	}
-
-	if resp.StatusCode >= 400 {
-		log.Printf("llm decode error: %s", body)
-		return "", fmt.Errorf("%s", body)
-	}
-
-	var decoded DetokenizeResponse
-	if err := json.Unmarshal(body, &decoded); err != nil {
-		return "", fmt.Errorf("unmarshal encode response: %w", err)
-	}
-
-	return decoded.Content, nil
-}
-
-func (s *LlamaServer) Close() error {
-	if s.cmd != nil {
-		slog.Debug("stopping llama server")
-		return s.cmd.Process.Kill()
-	}
-
-	return nil
-}
-
-func parseDurationMs(ms float64) time.Duration {
-	dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
-	if err != nil {
-		panic(err)
-	}
-
-	return dur
-}
--- a/llm/status.go
+++ b/llm/status.go
@@ -1,42 +0,0 @@
-package llm
-
-import (
-	"bytes"
-	"os"
-)
-
-// StatusWriter is a writer that captures error messages from the llama runner process
-type StatusWriter struct {
-	LastErrMsg string
-	out        *os.File
-}
-
-func NewStatusWriter(out *os.File) *StatusWriter {
-	return &StatusWriter{
-		out: out,
-	}
-}
-
-// TODO - regex matching to detect errors like
-// libcublasLt.so.11: cannot open shared object file: No such file or directory
-
-var errorPrefixes = []string{
-	"error:",
-	"CUDA error",
-	"cudaMalloc failed",
-	"\"ERR\"",
-}
-
-func (w *StatusWriter) Write(b []byte) (int, error) {
-	var errMsg string
-	for _, prefix := range errorPrefixes {
-		if _, after, ok := bytes.Cut(b, []byte(prefix)); ok {
-			errMsg = prefix + string(bytes.TrimSpace(after))
-		}
-	}
-	if errMsg != "" {
-		w.LastErrMsg = errMsg
-	}
-
-	return w.out.Write(b)
-}
--- a/llm/utils.go
+++ b/llm/utils.go
@@ -0,0 +1,15 @@
+package llm
+
+import (
+	"fmt"
+	"time"
+)
+
+func parseDurationMs(ms float64) time.Duration {
+	dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
+	if err != nil {
+		panic(err)
+	}
+
+	return dur
+}
--- a/macapp/README.md
+++ b/macapp/README.md
@@ -14,7 +14,7 @@ go build .
 Then run the desktop app with `npm start`:

 ```
-cd macapp
+cd app
 npm install
 npm start
 ```
--- a/readline/history.go
+++ b/readline/history.go
@@ -142,9 +142,7 @@ func (h *History) Save() error {
 	for cnt := 0; cnt < h.Size(); cnt++ {
 		v, _ := h.Buf.Get(cnt)
 		line, _ := v.([]rune)
-		if _, err := buf.WriteString(string(line) + "\n"); err != nil {
-			return err
-		}
+		buf.WriteString(string(line) + "\n")
 	}
 	buf.Flush()
 	f.Close()
--- a/scripts/build_docker.sh
+++ b/scripts/build_docker.sh
@@ -10,7 +10,7 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$V
 # For developers, you can override the DOCKER_ORG to generate multiarch manifests
 #  DOCKER_ORG=jdoe PUSH=1 ./scripts/build_docker.sh
 DOCKER_ORG=${DOCKER_ORG:-"ollama"}
-RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
+ARCH_IMAGE_REPO=${ARCH_IMAGE_REPO:-"${DOCKER_ORG}/release"}
 FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}

 BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
@@ -25,7 +25,7 @@ OLLAMA_SKIP_IMAGE_BUILD=${OLLAMA_SKIP_IMAGE_BUILD:-""}
 if [ -z "${PUSH}" ] ; then
    LOAD_OR_PUSH="--load"
 else
-    echo "Will be pushing ${RELEASE_IMAGE_REPO}:$VERSION for ${BUILD_ARCH}"
+    echo "Will be pushing ${ARCH_IMAGE_REPO}:$VERSION for ${BUILD_ARCH}"
    LOAD_OR_PUSH="--push"
 fi

@@ -37,7 +37,7 @@ if [ -z "${OLLAMA_SKIP_IMAGE_BUILD}" ]; then
            --build-arg=VERSION \
            --build-arg=GOFLAGS \
            -f Dockerfile \
-            -t ${RELEASE_IMAGE_REPO}:$VERSION-${TARGETARCH} \
+            -t ${ARCH_IMAGE_REPO}:$VERSION-${TARGETARCH} \
            .
    done

@@ -49,7 +49,7 @@ if [ -z "${OLLAMA_SKIP_IMAGE_BUILD}" ]; then
            --build-arg=GOFLAGS \
            --target runtime-rocm \
            -f Dockerfile \
-            -t ${RELEASE_IMAGE_REPO}:$VERSION-rocm \
+            -t ${ARCH_IMAGE_REPO}:$VERSION-rocm \
            .
    fi
 fi
@@ -57,21 +57,21 @@ fi
 if [ -z "${OLLAMA_SKIP_MANIFEST_CREATE}" ]; then
    if [ -n "${PUSH}" ]; then
        docker manifest create ${FINAL_IMAGE_REPO}:$VERSION \
-            ${RELEASE_IMAGE_REPO}:$VERSION-amd64 \
-            ${RELEASE_IMAGE_REPO}:$VERSION-arm64
+            ${ARCH_IMAGE_REPO}:$VERSION-amd64 \
+            ${ARCH_IMAGE_REPO}:$VERSION-arm64
        docker manifest push ${FINAL_IMAGE_REPO}:$VERSION

        # For symmetry, tag/push the rocm image
-        if [ "${RELEASE_IMAGE_REPO}" != "${FINAL_IMAGE_REPO}" ]; then
+        if [ "${ARCH_IMAGE_REPO}" != "${FINAL_IMAGE_REPO}" ]; then
            echo "Tagging and pushing rocm image"
-            docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm
-            docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:$VERSION-rocm
+            docker pull ${ARCH_IMAGE_REPO}:$VERSION-rocm
+            docker tag ${ARCH_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:$VERSION-rocm
            docker push ${FINAL_IMAGE_REPO}:$VERSION-rocm
        fi
    else
        echo "Skipping manifest generation when not pushing images are available locally as "
-        echo "  ${RELEASE_IMAGE_REPO}:$VERSION-amd64"
-        echo "  ${RELEASE_IMAGE_REPO}:$VERSION-arm64"
-        echo "  ${RELEASE_IMAGE_REPO}:$VERSION-rocm"
+        echo "  ${ARCH_IMAGE_REPO}:$VERSION-amd64"
+        echo "  ${ARCH_IMAGE_REPO}:$VERSION-arm64"
+        echo "  ${ARCH_IMAGE_REPO}:$VERSION-rocm"
    fi
 fi
--- a/scripts/tag_latest.sh
+++ b/scripts/tag_latest.sh
@@ -1,33 +0,0 @@
-#!/bin/sh
-
-set -eu
-
-# We use 2 different image repositories to handle combining architecture images into multiarch manifest
-# (The ROCm image is x86 only and is not a multiarch manifest)
-# For developers, you can override the DOCKER_ORG to generate multiarch manifests
-#  DOCKER_ORG=jdoe VERSION=0.1.30 PUSH=1 ./scripts/tag_latest.sh
-DOCKER_ORG=${DOCKER_ORG:-"ollama"}
-RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
-FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
-
-# Set PUSH to a non-empty string to trigger push instead of load
-PUSH=${PUSH:-""}
-
-echo "Assembling manifest and tagging latest"
-docker manifest rm ${FINAL_IMAGE_REPO}:latest || true
-docker manifest create ${FINAL_IMAGE_REPO}:latest \
-    ${RELEASE_IMAGE_REPO}:$VERSION-amd64 \
-    ${RELEASE_IMAGE_REPO}:$VERSION-arm64
-
-docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm
-docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:rocm
-
-if [ -n "${PUSH}" ]; then
-    echo "Pushing latest tags up..."
-    docker manifest push ${FINAL_IMAGE_REPO}:latest
-    docker push ${FINAL_IMAGE_REPO}:rocm
-else
-    echo "Not pushing ${FINAL_IMAGE_REPO}:latest and ${FINAL_IMAGE_REPO}:rocm"
-fi
-
-
--- a/server/download.go
+++ b/server/download.go
@@ -247,8 +247,7 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
 				}

 				if !part.lastUpdated.IsZero() && time.Since(part.lastUpdated) > 5*time.Second {
-					const msg = "%s part %d stalled; retrying. If this persists, press ctrl-c to exit, then 'ollama pull' to find a faster connection."
-					slog.Info(fmt.Sprintf(msg, b.Digest[7:19], part.N))
+					slog.Info(fmt.Sprintf("%s part %d stalled; retrying", b.Digest[7:19], part.N))
 					// reset last updated
 					part.lastUpdated = time.Time{}
 					return errPartStalled
--- a/server/images.go
+++ b/server/images.go
@@ -26,7 +26,6 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/convert"
-	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/version"
@@ -284,7 +283,7 @@ func realpath(mfDir, from string) string {
 	return abspath
 }

-func CreateModel(ctx context.Context, name, modelFileDir, quantization string, commands []parser.Command, fn func(resp api.ProgressResponse)) error {
+func CreateModel(ctx context.Context, name, modelFileDir string, commands []parser.Command, fn func(resp api.ProgressResponse)) error {
 	deleteMap := make(map[string]struct{})
 	if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil {
 		for _, layer := range append(manifest.Layers, manifest.Config) {
@@ -322,7 +321,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c

 			pathName := realpath(modelFileDir, c.Args)

-			ggufName, err := convertModel(name, pathName, fn)
+			ggufName, err := convertSafetensors(name, pathName)
 			if err != nil {
 				var pathErr *fs.PathError
 				switch {
@@ -338,26 +337,6 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
 			if ggufName != "" {
 				pathName = ggufName
 				defer os.RemoveAll(ggufName)
-
-				if quantization != "" {
-					quantization = strings.ToUpper(quantization)
-					fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", "F16", quantization)})
-					tempfile, err := os.CreateTemp(filepath.Dir(ggufName), quantization)
-					if err != nil {
-						return err
-					}
-					defer os.RemoveAll(tempfile.Name())
-
-					if err := llm.Quantize(ggufName, tempfile.Name(), quantization); err != nil {
-						return err
-					}
-
-					if err := tempfile.Close(); err != nil {
-						return err
-					}
-
-					pathName = tempfile.Name()
-				}
 			}

 			bin, err := os.Open(pathName)
@@ -440,32 +419,34 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
 			defer bin.Close()

 			var offset int64
+		CREATE:
 			for {
 				fn(api.ProgressResponse{Status: "creating model layer"})
-				if _, err := bin.Seek(offset, io.SeekStart); err != nil {
-					return err
-				}

-				ggml, size, err := llm.DecodeGGML(bin)
-				if errors.Is(err, io.EOF) {
-					break
-				} else if errors.Is(err, llm.ErrUnsupportedFormat) {
-					return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
-				} else if err != nil {
-					return err
+				bin.Seek(offset, io.SeekStart)
+				ggml, err := llm.DecodeGGML(bin)
+				if err != nil {
+					switch {
+					case errors.Is(err, io.EOF):
+						break CREATE
+					case errors.Is(err, llm.ErrUnsupportedFormat):
+						return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
+					default:
+						return err
+					}
 				}

 				config.SetModelFormat(ggml.Name())
-				config.SetModelFamily(ggml.KV().Architecture())
-				config.SetModelType(format.HumanNumber(ggml.KV().ParameterCount()))
-				config.SetFileType(ggml.KV().FileType())
+				config.SetModelFamily(ggml.ModelFamily())
+				config.SetModelType(ggml.ModelType())
+				config.SetFileType(ggml.FileType())

 				mediatype := mediatype
-				if ggml.KV().Architecture() == "clip" {
+				if ggml.ModelFamily() == "clip" {
 					mediatype = "application/vnd.ollama.image.projector"
 				}

-				sr := io.NewSectionReader(bin, offset, size)
+				sr := io.NewSectionReader(bin, offset, ggml.Size)
 				layer, err := NewLayer(sr, mediatype)
 				if err != nil {
 					return err
@@ -473,7 +454,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c

 				layers.Add(layer)

-				offset += size
+				offset += ggml.Size
 			}
 		case "adapter":
 			if strings.HasPrefix(c.Args, "@") {
@@ -492,12 +473,12 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
 			}
 			defer bin.Close()

-			_, size, err := llm.DecodeGGML(bin)
+			ggml, err := llm.DecodeGGML(bin)
 			if err != nil {
 				return err
 			}

-			sr := io.NewSectionReader(bin, 0, size)
+			sr := io.NewSectionReader(bin, 0, ggml.Size)
 			layer, err := NewLayer(sr, mediatype)
 			if err != nil {
 				return err
@@ -569,6 +550,13 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
 			}
 		}

+		// xxx - can this be removed?
+		if config.ModelType == "65B" {
+			if gqa, ok := formattedParams["gqa"].(int); ok && gqa == 8 {
+				config.ModelType = "70B"
+			}
+		}
+
 		var b bytes.Buffer
 		if err := json.NewEncoder(&b).Encode(formattedParams); err != nil {
 			return err
@@ -633,8 +621,8 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
 	return nil
 }

-func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string, error) {
-	r, err := zip.OpenReader(path)
+func convertSafetensors(name, fn string) (string, error) {
+	r, err := zip.OpenReader(fn)
 	if err != nil {
 		return "", err
 	}
@@ -646,7 +634,6 @@ func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string
 	}
 	defer os.RemoveAll(tempDir)

-	fn(api.ProgressResponse{Status: "unpacking model metadata"})
 	for _, f := range r.File {
 		fpath := filepath.Join(tempDir, f.Name)
 		outFile, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode())
@@ -668,37 +655,37 @@ func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string
 		rc.Close()
 	}

-	mf, err := convert.GetModelFormat(tempDir)
+	params, err := convert.GetParams(tempDir)
 	if err != nil {
 		return "", err
 	}

-	params, err := mf.GetParams(tempDir)
+	SupportedArchs := []string{
+		"MistralForCausalLM",
+	}
+
+	for _, arch := range params.Architectures {
+		if !slices.Contains(SupportedArchs, arch) {
+			return "", fmt.Errorf("this safetensors model is not yet supported")
+		}
+	}
+
+	t, err := convert.GetSafeTensors(tempDir)
 	if err != nil {
 		return "", err
 	}

-	mArch, err := mf.GetModelArch(name, tempDir, params)
+	vocab, err := convert.LoadTokens(tempDir)
 	if err != nil {
 		return "", err
 	}

-	fn(api.ProgressResponse{Status: "processing tensors"})
-	if err := mArch.GetTensors(); err != nil {
-		return "", err
-	}
-
-	if err := mArch.LoadVocab(); err != nil {
-		return "", err
-	}
-
-	fn(api.ProgressResponse{Status: "converting model"})
-	path, err = mArch.WriteGGUF()
+	fn, err = convert.WriteGGUF(name, t, params, vocab)
 	if err != nil {
 		return "", err
 	}

-	return path, nil
+	return fn, nil
 }

 func CopyModel(src, dest string) error {
--- a/server/routes.go
+++ b/server/routes.go
@@ -56,49 +56,34 @@ func init() {
 var loaded struct {
 	mu sync.Mutex

-	llama *llm.LlamaServer
+	runner llm.LLM

+	expireAt    time.Time
 	expireTimer *time.Timer

-	model      string
-	adapters   []string
-	projectors []string
+	*Model
 	*api.Options
 }

 var defaultSessionDuration = 5 * time.Minute

-func unload() {
-	if loaded.llama != nil {
-		loaded.llama.Close()
-	}
-
-	loaded.llama = nil
-	loaded.model = ""
-	loaded.adapters = nil
-	loaded.projectors = nil
-	loaded.Options = nil
-}
-
 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
 func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
-	ctx, cancel := context.WithTimeout(c, 10*time.Second)
-	defer cancel()
-
-	needLoad := loaded.llama == nil || // is there a model loaded?
-		loaded.model != model.ModelPath || // has the base model changed?
-		!reflect.DeepEqual(loaded.adapters, model.AdapterPaths) || // have the adapters changed?
-		!reflect.DeepEqual(loaded.projectors, model.ProjectorPaths) || // have the adapters changed?
-		!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) || // have the runner options changed?
-		loaded.llama.Ping(ctx) != nil
+	needLoad := loaded.runner == nil || // is there a model loaded?
+		loaded.ModelPath != model.ModelPath || // has the base model changed?
+		!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
+		!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) // have the runner options changed?

 	if needLoad {
-		if loaded.llama != nil {
+		if loaded.runner != nil {
 			slog.Info("changing loaded model")
-			unload()
+			loaded.runner.Close()
+			loaded.runner = nil
+			loaded.Model = nil
+			loaded.Options = nil
 		}

-		llama, err := llm.NewLlamaServer(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
+		llmRunner, err := llm.New(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
 		if err != nil {
 			// some older models are not compatible with newer versions of llama.cpp
 			// show a generalized compatibility error until there is a better way to
@@ -110,24 +95,29 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D
 			return err
 		}

-		loaded.model = model.ModelPath
-		loaded.adapters = model.AdapterPaths
-		loaded.projectors = model.ProjectorPaths
-		loaded.llama = llama
+		loaded.Model = model
+		loaded.runner = llmRunner
 		loaded.Options = &opts
-
-		if err = llama.WaitUntilRunning(); err != nil {
-			slog.Error("error loading llama server", "error", err)
-			unload()
-			return err
-		}
 	}

+	loaded.expireAt = time.Now().Add(sessionDuration)
+
 	if loaded.expireTimer == nil {
 		loaded.expireTimer = time.AfterFunc(sessionDuration, func() {
 			loaded.mu.Lock()
 			defer loaded.mu.Unlock()
-			unload()
+
+			if time.Now().Before(loaded.expireAt) {
+				return
+			}
+
+			if loaded.runner != nil {
+				loaded.runner.Close()
+			}
+
+			loaded.runner = nil
+			loaded.Model = nil
+			loaded.Options = nil
 		})
 	}

@@ -275,7 +265,7 @@ func GenerateHandler(c *gin.Context) {

 		sb.Reset()
 		if req.Context != nil {
-			prev, err := loaded.llama.Detokenize(c.Request.Context(), req.Context)
+			prev, err := loaded.runner.Decode(c.Request.Context(), req.Context)
 			if err != nil {
 				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 				return
@@ -296,8 +286,9 @@ func GenerateHandler(c *gin.Context) {
 	go func() {
 		defer close(ch)

-		fn := func(r llm.CompletionResponse) {
+		fn := func(r llm.PredictResult) {
 			// Update model expiration
+			loaded.expireAt = time.Now().Add(sessionDuration)
 			loaded.expireTimer.Reset(sessionDuration)

 			// Build up the full response
@@ -331,7 +322,7 @@ func GenerateHandler(c *gin.Context) {
 					}

 					// TODO (jmorganca): encode() should not strip special tokens
-					tokens, err := loaded.llama.Tokenize(c.Request.Context(), p)
+					tokens, err := loaded.runner.Encode(c.Request.Context(), p)
 					if err != nil {
 						ch <- gin.H{"error": err.Error()}
 						return
@@ -353,13 +344,13 @@ func GenerateHandler(c *gin.Context) {
 		}

 		// Start prediction
-		req := llm.CompletionRequest{
+		predictReq := llm.PredictOpts{
 			Prompt:  prompt,
 			Format:  req.Format,
 			Images:  images,
 			Options: opts,
 		}
-		if err := loaded.llama.Completion(c.Request.Context(), req, fn); err != nil {
+		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -480,7 +471,7 @@ func EmbeddingsHandler(c *gin.Context) {
 		return
 	}

-	embedding, err := loaded.llama.Embedding(c.Request.Context(), req.Prompt)
+	embedding, err := loaded.runner.Embedding(c.Request.Context(), req.Prompt)
 	if err != nil {
 		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
 		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
@@ -651,7 +642,7 @@ func CreateModelHandler(c *gin.Context) {
 		ctx, cancel := context.WithCancel(c.Request.Context())
 		defer cancel()

-		if err := CreateModel(ctx, model, filepath.Dir(req.Path), req.Quantization, commands, fn); err != nil {
+		if err := CreateModel(ctx, model, filepath.Dir(req.Path), commands, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -917,24 +908,6 @@ func HeadBlobHandler(c *gin.Context) {
 }

 func CreateBlobHandler(c *gin.Context) {
-	path, err := GetBlobsPath(c.Param("digest"))
-	if err != nil {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-		return
-	}
-
-	_, err = os.Stat(path)
-	switch {
-	case errors.Is(err, os.ErrNotExist):
-		// noop
-	case err != nil:
-		c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		return
-	default:
-		c.Status(http.StatusOK)
-		return
-	}
-
 	layer, err := NewLayer(c.Request.Body, "")
 	if err != nil {
 		c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -1040,14 +1013,16 @@ func allowedHostsMiddleware(addr net.Addr) gin.HandlerFunc {
 }

 func (s *Server) GenerateRoutes() http.Handler {
+	var origins []string
+	if o := os.Getenv("OLLAMA_ORIGINS"); o != "" {
+		origins = strings.Split(o, ",")
+	}
+
 	config := cors.DefaultConfig()
 	config.AllowWildcard = true
 	config.AllowBrowserExtensions = true

-	if allowedOrigins := strings.Trim(os.Getenv("OLLAMA_ORIGINS"), "\"'"); allowedOrigins != "" {
-		config.AllowOrigins = strings.Split(allowedOrigins, ",")
-	}
-
+	config.AllowOrigins = origins
 	for _, allowOrigin := range defaultAllowOrigins {
 		config.AllowOrigins = append(config.AllowOrigins,
 			fmt.Sprintf("http://%s", allowOrigin),
@@ -1150,7 +1125,9 @@ func Serve(ln net.Listener) error {
 	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
 	go func() {
 		<-signals
-		unload()
+		if loaded.runner != nil {
+			loaded.runner.Close()
+		}
 		gpu.Cleanup()
 		os.Exit(0)
 	}()
@@ -1221,7 +1198,7 @@ func streamResponse(c *gin.Context, ch chan any) {
 // ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
 func chatPrompt(ctx context.Context, template string, messages []api.Message, numCtx int) (string, error) {
 	encode := func(s string) ([]int, error) {
-		return loaded.llama.Tokenize(ctx, s)
+		return loaded.runner.Encode(ctx, s)
 	}

 	prompt, err := ChatPrompt(template, messages, numCtx, encode)
@@ -1351,8 +1328,9 @@ func ChatHandler(c *gin.Context) {
 	go func() {
 		defer close(ch)

-		fn := func(r llm.CompletionResponse) {
+		fn := func(r llm.PredictResult) {
 			// Update model expiration
+			loaded.expireAt = time.Now().Add(sessionDuration)
 			loaded.expireTimer.Reset(sessionDuration)

 			resp := api.ChatResponse{
@@ -1376,12 +1354,14 @@ func ChatHandler(c *gin.Context) {
 			ch <- resp
 		}

-		if err := loaded.llama.Completion(c.Request.Context(), llm.CompletionRequest{
+		// Start prediction
+		predictReq := llm.PredictOpts{
 			Prompt:  prompt,
 			Format:  req.Format,
 			Images:  images,
 			Options: opts,
-		}, fn); err != nil {
+		}
+		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -3,7 +3,6 @@ package server
 import (
 	"bytes"
 	"context"
-	"encoding/binary"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -17,6 +16,7 @@ import (
 	"github.com/stretchr/testify/assert"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/version"
 )
@@ -31,22 +31,13 @@ func Test_Routes(t *testing.T) {
 	}

 	createTestFile := func(t *testing.T, name string) string {
-		t.Helper()
-
 		f, err := os.CreateTemp(t.TempDir(), name)
 		assert.Nil(t, err)
 		defer f.Close()

-		err = binary.Write(f, binary.LittleEndian, []byte("GGUF"))
+		_, err = f.Write([]byte("GGUF"))
 		assert.Nil(t, err)
-
-		err = binary.Write(f, binary.LittleEndian, uint32(3))
-		assert.Nil(t, err)
-
-		err = binary.Write(f, binary.LittleEndian, uint64(0))
-		assert.Nil(t, err)
-
-		err = binary.Write(f, binary.LittleEndian, uint64(0))
+		_, err = f.Write([]byte{0x2, 0})
 		assert.Nil(t, err)

 		return f.Name()
@@ -61,7 +52,7 @@ func Test_Routes(t *testing.T) {
 		fn := func(resp api.ProgressResponse) {
 			t.Logf("Status: %s", resp.Status)
 		}
-		err = CreateModel(context.TODO(), name, "", "", commands, fn)
+		err = CreateModel(context.TODO(), name, "", commands, fn)
 		assert.Nil(t, err)
 	}

@@ -210,7 +201,7 @@ func Test_Routes(t *testing.T) {
 		},
 	}

-	s := &Server{}
+	s := Server{}
 	router := s.GenerateRoutes()

 	httpSrv := httptest.NewServer(router)
@@ -241,3 +232,27 @@ func Test_Routes(t *testing.T) {

 	}
 }
+
+type MockLLM struct {
+	encoding []int
+}
+
+func (llm *MockLLM) Predict(ctx context.Context, pred llm.PredictOpts, fn func(llm.PredictResult)) error {
+	return nil
+}
+
+func (llm *MockLLM) Encode(ctx context.Context, prompt string) ([]int, error) {
+	return llm.encoding, nil
+}
+
+func (llm *MockLLM) Decode(ctx context.Context, tokens []int) (string, error) {
+	return "", nil
+}
+
+func (llm *MockLLM) Embedding(ctx context.Context, input string) ([]float64, error) {
+	return []float64{}, nil
+}
+
+func (llm *MockLLM) Close() {
+	// do nothing
+}
--- a/types/model/digest.go
+++ b/types/model/digest.go
@@ -1,83 +0,0 @@
-package model
-
-import (
-	"fmt"
-	"log/slog"
-	"strings"
-	"unicode"
-)
-
-// Digest represents a digest of a model Manifest. It is a comparable value
-// type and is immutable.
-//
-// The zero Digest is not a valid digest.
-type Digest struct {
-	s string
-}
-
-// Type returns the digest type of the digest.
-//
-// Example:
-//
-//	ParseDigest("sha256-1234").Type() // returns "sha256"
-func (d Digest) Type() string {
-	typ, _, _ := strings.Cut(d.s, "-")
-	return typ
-}
-
-// String returns the digest in the form of "<digest-type>-<digest>", or the
-// empty string if the digest is invalid.
-func (d Digest) String() string { return d.s }
-
-// IsValid returns true if the digest is valid (not zero).
-//
-// A valid digest may be created only by ParseDigest, or
-// ParseName(name).Digest().
-func (d Digest) IsValid() bool { return d.s != "" }
-
-// LogValue implements slog.Value.
-func (d Digest) LogValue() slog.Value {
-	return slog.StringValue(d.String())
-}
-
-var (
-	_ slog.LogValuer = Digest{}
-)
-
-// ParseDigest parses a string in the form of "<digest-type>-<digest>" into a
-// Digest.
-func ParseDigest(s string) Digest {
-	typ, digest, ok := strings.Cut(s, "-")
-	if !ok {
-		typ, digest, ok = strings.Cut(s, ":")
-	}
-	if ok && isValidDigestType(typ) && isValidHex(digest) {
-		return Digest{s: fmt.Sprintf("%s-%s", typ, digest)}
-	}
-	return Digest{}
-}
-
-func isValidDigestType(s string) bool {
-	if len(s) == 0 {
-		return false
-	}
-	for _, r := range s {
-		if !unicode.IsLower(r) && !unicode.IsDigit(r) {
-			return false
-		}
-	}
-	return true
-}
-
-func isValidHex(s string) bool {
-	if len(s) == 0 {
-		return false
-	}
-	for i := range s {
-		c := s[i]
-		if c < '0' || c > '9' && c < 'a' || c > 'f' {
-			return false
-		}
-	}
-	return true
-}
--- a/types/model/digest_test.go
+++ b/types/model/digest_test.go
@@ -1,46 +0,0 @@
-package model
-
-import "testing"
-
-var testDigests = map[string]Digest{
-	"":                 {},
-	"sha256-1234":      {s: "sha256-1234"},
-	"sha256-5678":      {s: "sha256-5678"},
-	"blake2-9abc":      {s: "blake2-9abc"},
-	"-1234":            {},
-	"sha256-":          {},
-	"sha256-1234-5678": {},
-	"sha256-P":         {}, //         invalid  hex
-	"sha256-1234P":     {},
-	"---":              {},
-}
-
-func TestDigestParse(t *testing.T) {
-	// Test cases.
-	for s, want := range testDigests {
-		got := ParseDigest(s)
-		t.Logf("ParseDigest(%q) = %#v", s, got)
-		if got != want {
-			t.Errorf("ParseDigest(%q) = %q; want %q", s, got, want)
-		}
-	}
-}
-
-func TestDigestString(t *testing.T) {
-	// Test cases.
-	for s, d := range testDigests {
-		want := s
-		if !d.IsValid() {
-			want = ""
-		}
-		got := d.String()
-		if got != want {
-			t.Errorf("ParseDigest(%q).String() = %q; want %q", s, got, want)
-		}
-
-		got = ParseDigest(s).String()
-		if got != want {
-			t.Errorf("roundtrip ParseDigest(%q).String() = %q; want %q", s, got, want)
-		}
-	}
-}
--- a/types/model/name.go
+++ b/types/model/name.go
@@ -1,688 +0,0 @@
-package model
-
-import (
-	"cmp"
-	"errors"
-	"fmt"
-	"hash/maphash"
-	"io"
-	"log/slog"
-	"path/filepath"
-	"slices"
-	"strings"
-	"sync"
-
-	"github.com/ollama/ollama/types/structs"
-)
-
-// Errors
-var (
-	// ErrInvalidName, ErrIncompleteName, and ErrInvalidDigest are not
-	// used by this package, but are exported so that other packages can
-	// use them, instead of defining their own errors for them.
-	ErrInvalidName    = errors.New("invalid model name")
-	ErrIncompleteName = errors.New("incomplete model name")
-	ErrInvalidDigest  = errors.New("invalid digest")
-)
-
-// Defaults
-const (
-	// MaskDefault is the default mask used by [Name.DisplayShortest].
-	MaskDefault = "registry.ollama.ai/library/?:latest"
-
-	// MaskNothing is a mask that masks nothing.
-	MaskNothing = "?/?/?:?"
-
-	// DefaultFill is the default fill used by [ParseName].
-	FillDefault = "registry.ollama.ai/library/?:latest+Q4_0"
-
-	// FillNothing is a fill that fills nothing.
-	FillNothing = "?/?/?:?+?"
-)
-
-const MaxNamePartLen = 128
-
-type PartKind int
-
-// Levels of concreteness
-const (
-	// Each value aligns with its index in the Name.parts array.
-
-	PartHost PartKind = iota
-	PartNamespace
-	PartModel
-	PartTag
-	PartBuild
-	PartDigest
-
-	// NumParts is the number of parts in a Name. In this list, it must
-	// follow the final part.
-	NumParts
-
-	PartExtraneous = -1
-)
-
-var kindNames = map[PartKind]string{
-	PartHost:      "Host",
-	PartNamespace: "Namespace",
-	PartModel:     "Name",
-	PartTag:       "Tag",
-	PartBuild:     "Build",
-	PartDigest:    "Digest",
-}
-
-func (k PartKind) String() string {
-	return cmp.Or(kindNames[k], "Unknown")
-}
-
-// Name is an opaque reference to a model. It holds the parts of a model
-// with the case preserved, but is not directly comparable with other Names
-// since model names can be represented with different casing depending on
-// the use case. For instance, "Mistral" and "mistral" are the same model
-// but each version may have come from different sources (e.g. copied from a
-// Web page, or from a file path).
-//
-// Valid Names can ONLY be constructed by calling [ParseName].
-//
-// A Name is valid if and only if is have a valid Model part. The other parts
-// are optional.
-//
-// A Name is considered "complete" if it has all parts present. To check if a
-// Name is complete, use [Name.IsComplete].
-//
-// To compare two names in a case-insensitive manner, use [Name.EqualFold].
-//
-// The parts of a Name are:
-//
-//   - Host: the domain of the model (optional)
-//   - Namespace: the namespace of the model (optional)
-//   - Model: the name of the model (required)
-//   - Tag: the tag of the model (optional)
-//   - Build: the build of the model; usually the quantization or "file type" (optional)
-//
-// The parts can be obtained in their original form by calling [Name.Parts].
-//
-// To check if a Name has at minimum a valid model part, use [Name.IsValid].
-type Name struct {
-	_     structs.Incomparable
-	parts [NumParts]string // host, namespace, model, tag, build, digest
-
-	// TODO(bmizerany): track offsets and hold s (raw string) here? We
-	// could pack the offsets all into a single uint64 since the first
-	// parts take less bits since their max offset is less than the max
-	// offset of the next part. This would save a ton of bytes per Name
-	// and mean zero allocations for String.
-}
-
-// ParseName parses s into a Name, and returns the result of filling it with
-// defaults. The input string must be a valid string
-// representation of a model name in the form:
-//
-//	[host/][namespace/]<model>[:tag][+build][@<digest-type>-<digest>]
-//
-// The name part is required, all others are optional. If a part is missing,
-// it is left empty in the returned Name. If a part is invalid, the zero Ref
-// value is returned.
-//
-// The build part is normalized to uppercase.
-//
-// Examples of valid paths:
-//
-//	"example.com/library/mistral:7b+x"
-//	"example.com/eva/mistral:7b+Q4_0"
-//	"mistral:7b+x"
-//	"example.com/mike/mistral:latest+Q4_0"
-//	"example.com/bruce/mistral:latest"
-//	"example.com/pdevine/thisisfine:7b+Q4_0@sha256-1234567890abcdef"
-//
-// Examples of invalid paths:
-//
-//	"example.com/mistral:7b+"
-//	"example.com/mistral:7b+Q4_0+"
-//	"x/y/z/z:8n+I"
-//	""
-//
-// It returns the zero value if any part is invalid.
-//
-// # Fills
-//
-// For any valid s, the fill string is used to fill in missing parts of the
-// Name. The fill string must be a valid Name with the exception that any part
-// may be the string ("?"), which will not be considered for filling.
-func ParseName(s, fill string) Name {
-	var r Name
-	parts(s)(func(kind PartKind, part string) bool {
-		if kind == PartDigest && !ParseDigest(part).IsValid() {
-			r = Name{}
-			return false
-		}
-		if kind == PartExtraneous || !isValidPart(kind, part) {
-			r = Name{}
-			return false
-		}
-		r.parts[kind] = part
-		return true
-	})
-	if r.IsValid() || r.IsResolved() {
-		return fillName(r, fill)
-	}
-	return Name{}
-}
-
-func parseMask(s string) Name {
-	var r Name
-	parts(s)(func(kind PartKind, part string) bool {
-		if part == "?" {
-			// mask part; treat as empty but valid
-			return true
-		}
-		if !isValidPart(kind, part) {
-			panic(fmt.Errorf("invalid mask part %s: %q", kind, part))
-		}
-		r.parts[kind] = part
-		return true
-	})
-	return r
-}
-
-func MustParseName(s, fill string) Name {
-	r := ParseName(s, fill)
-	if !r.IsValid() {
-		panic("invalid Name: " + s)
-	}
-	return r
-}
-
-// fillName fills in the missing parts of dst with the parts of src.
-//
-// The returned Name will only be valid if dst is valid.
-//
-// It skipps fill parts that are "?".
-func fillName(r Name, fill string) Name {
-	fill = cmp.Or(fill, FillDefault)
-	f := parseMask(fill)
-	if fill != FillNothing && f.IsZero() {
-		panic("invalid fill")
-	}
-	for i := range r.parts {
-		if f.parts[i] == "?" {
-			continue
-		}
-		r.parts[i] = cmp.Or(r.parts[i], f.parts[i])
-	}
-	return r
-}
-
-// WithBuild returns a copy of r with the build set to the given string.
-func (r Name) WithBuild(build string) Name {
-	r.parts[PartBuild] = build
-	return r
-}
-
-func (r Name) WithDigest(digest Digest) Name {
-	r.parts[PartDigest] = digest.String()
-	return r
-}
-
-var mapHashSeed = maphash.MakeSeed()
-
-// MapHash returns a case insensitive hash for use in maps and equality
-// checks. For a convenient way to compare names, use [Name.EqualFold].
-//
-//nolint:errcheck
-func (r Name) MapHash() uint64 {
-	// correctly hash the parts with case insensitive comparison
-	var h maphash.Hash
-	h.SetSeed(mapHashSeed)
-	for _, part := range r.parts {
-		// downcase the part for hashing
-		for i := range part {
-			c := part[i]
-			if c >= 'A' && c <= 'Z' {
-				c = c - 'A' + 'a'
-			}
-			h.WriteByte(c)
-		}
-	}
-	return h.Sum64()
-}
-
-func (r Name) slice(from, to PartKind) Name {
-	var v Name
-	copy(v.parts[from:to+1], r.parts[from:to+1])
-	return v
-}
-
-// DisplayShortest returns the shortest possible, masked display string in form:
-//
-//	[host/][<namespace>/]<model>[:<tag>]
-//
-// # Masks
-//
-// The mask is a string that specifies which parts of the name to omit based
-// on case-insensitive comparison. [Name.DisplayShortest] omits parts of the name
-// that are the same as the mask, moving from left to right until the first
-// unequal part is found. It then moves right to left until the first unequal
-// part is found. The result is the shortest possible display string.
-//
-// Unlike a [Name] the mask can contain "?" characters which are treated as
-// wildcards. A "?" will never match a part of the name, since a valid name
-// can never contain a "?" character.
-//
-// For example: Given a Name ("registry.ollama.ai/library/mistral:latest") masked
-// with ("registry.ollama.ai/library/?:latest") will produce the display string
-// ("mistral").
-//
-// If mask is the empty string, then [MaskDefault] is used.
-//
-// DisplayShortest panics if the mask is not the empty string, MaskNothing, and
-// invalid.
-//
-// # Builds
-//
-// For now, DisplayShortest does consider the build or return one in the
-// result. We can lift this restriction when needed.
-func (r Name) DisplayShortest(mask string) string {
-	mask = cmp.Or(mask, MaskDefault)
-	d := parseMask(mask)
-	if mask != MaskNothing && r.IsZero() {
-		panic("invalid Name")
-	}
-	for i := range PartTag {
-		if !strings.EqualFold(r.parts[i], d.parts[i]) {
-			break
-		}
-		r.parts[i] = ""
-	}
-	for i := PartTag; i >= 0; i-- {
-		if !strings.EqualFold(r.parts[i], d.parts[i]) {
-			break
-		}
-		r.parts[i] = ""
-	}
-	return r.slice(PartHost, PartTag).DisplayLong()
-}
-
-// DisplayLongest returns the result of r.DisplayShortest(MaskNothing).
-func (r Name) DisplayLongest() string {
-	return r.DisplayShortest(MaskNothing)
-}
-
-var seps = [...]string{
-	PartHost:      "/",
-	PartNamespace: "/",
-	PartModel:     ":",
-	PartTag:       "+",
-	PartBuild:     "@",
-	PartDigest:    "",
-}
-
-// WriteTo implements io.WriterTo. It writes the fullest possible display
-// string in form:
-//
-//	<host>/<namespace>/<model>:<tag>+<build>@<digest-type>-<digest>
-//
-// Missing parts and their separators are not written.
-//
-// The full digest is always prefixed with "@". That is if [Name.IsValid]
-// reports false and [Name.IsResolved] reports true, then the string is
-// returned as "@<digest-type>-<digest>".
-func (r Name) writeTo(w io.StringWriter) error {
-	var partsWritten int
-	for i := range r.parts {
-		if r.parts[i] == "" {
-			continue
-		}
-		if partsWritten > 0 || i == int(PartDigest) {
-			if _, err := w.WriteString(seps[i-1]); err != nil {
-				return err
-			}
-		}
-		if _, err := w.WriteString(r.parts[i]); err != nil {
-			return err
-		}
-		partsWritten++
-	}
-	return nil
-}
-
-var builderPool = sync.Pool{
-	New: func() interface{} {
-		return &strings.Builder{}
-	},
-}
-
-// DisplayLong returns the fullest possible display string in form:
-//
-//	<host>/<namespace>/<model>:<tag>+<build>
-//
-// If any part is missing, it is omitted from the display string.
-func (r Name) DisplayLong() string {
-	b := builderPool.Get().(*strings.Builder)
-	defer builderPool.Put(b)
-	b.Reset()
-	b.Grow(50) // arbitrarily long enough for most names
-	_ = r.writeTo(b)
-	return b.String()
-}
-
-// GoString implements fmt.GoStringer. It returns a string suitable for
-// debugging and logging. It is similar to [Name.DisplayLong] but it always
-// returns a string that includes all parts of the Name, with missing parts
-// replaced with a ("?").
-func (r Name) GoString() string {
-	for i := range r.parts {
-		r.parts[i] = cmp.Or(r.parts[i], "?")
-	}
-	return r.DisplayLong()
-}
-
-// LogValue implements slog.Valuer.
-func (r Name) LogValue() slog.Value {
-	return slog.StringValue(r.GoString())
-}
-
-// IsComplete reports whether the Name is fully qualified. That is it has a
-// domain, namespace, name, tag, and build.
-func (r Name) IsComplete() bool {
-	return !slices.Contains(r.parts[:PartDigest], "")
-}
-
-// IsCompleteNoBuild is like [Name.IsComplete] but it does not require the
-// build part to be present.
-func (r Name) IsCompleteNoBuild() bool {
-	return !slices.Contains(r.parts[:PartBuild], "")
-}
-
-// IsResolved reports true if the Name has a valid digest.
-//
-// It is possible to have a valid Name, or a complete Name that is not
-// resolved.
-func (r Name) IsResolved() bool {
-	return r.Digest().IsValid()
-}
-
-// Digest returns the digest part of the Name, if any.
-//
-// If Digest returns a non-empty string, then [Name.IsResolved] will return
-// true, and digest is considered valid.
-func (r Name) Digest() Digest {
-	// This was already validated by ParseName, so we can just return it.
-	return Digest{r.parts[PartDigest]}
-}
-
-// EqualFold reports whether r and o are equivalent model names, ignoring
-// case.
-func (r Name) EqualFold(o Name) bool {
-	return r.CompareFold(o) == 0
-}
-
-// CompareFold performs a case-insensitive cmp.Compare on r and o.
-//
-// This can be used with [slices.SortFunc].
-//
-// For simple equality checks, use [Name.EqualFold].
-func (r Name) CompareFold(o Name) int {
-	return slices.CompareFunc(r.parts[:], o.parts[:], compareFold)
-}
-
-func compareFold(a, b string) int {
-	return slices.CompareFunc([]rune(a), []rune(b), func(a, b rune) int {
-		return cmp.Compare(downcase(a), downcase(b))
-	})
-}
-
-func downcase(r rune) rune {
-	if r >= 'A' && r <= 'Z' {
-		return r - 'A' + 'a'
-	}
-	return r
-}
-
-func (r Name) Host() string      { return r.parts[PartHost] }
-func (r Name) Namespace() string { return r.parts[PartNamespace] }
-func (r Name) Model() string     { return r.parts[PartModel] }
-func (r Name) Build() string     { return r.parts[PartBuild] }
-func (r Name) Tag() string       { return r.parts[PartTag] }
-
-// iter_Seq2 is a iter.Seq2 defined here to avoid the current build
-// restrictions in the go1.22 iter package requiring the
-// goexperiment.rangefunc tag to be set via the GOEXPERIMENT=rangefunc flag,
-// which we are not yet ready to support.
-//
-// Once we are ready to support rangefunc, this can be removed and replaced
-// with the iter.Seq2 type.
-type iter_Seq2[A, B any] func(func(A, B) bool)
-
-// Parts returns a sequence of the parts of a Name string from most specific
-// to least specific.
-//
-// It normalizes the input string by removing "http://" and "https://" only.
-// No other normalizations are performed.
-func parts(s string) iter_Seq2[PartKind, string] {
-	return func(yield func(PartKind, string) bool) {
-		if strings.HasPrefix(s, "http://") {
-			s = strings.TrimPrefix(s, "http://")
-		} else {
-			s = strings.TrimPrefix(s, "https://")
-		}
-
-		if len(s) > MaxNamePartLen || len(s) == 0 {
-			return
-		}
-
-		numConsecutiveDots := 0
-		partLen := 0
-		state, j := PartDigest, len(s)
-		for i := len(s) - 1; i >= 0; i-- {
-			if partLen++; partLen > MaxNamePartLen {
-				// catch a part that is too long early, so
-				// we don't keep spinning on it, waiting for
-				// an isInValidPart check which would scan
-				// over it again.
-				yield(state, s[i+1:j])
-				return
-			}
-
-			switch s[i] {
-			case '@':
-				switch state {
-				case PartDigest:
-					if !yield(PartDigest, s[i+1:j]) {
-						return
-					}
-					if i == 0 {
-						// This is the form
-						// "@<digest>" which is valid.
-						//
-						// We're done.
-						return
-					}
-					state, j, partLen = PartBuild, i, 0
-				default:
-					yield(PartExtraneous, s[i+1:j])
-					return
-				}
-			case '+':
-				switch state {
-				case PartBuild, PartDigest:
-					if !yield(PartBuild, s[i+1:j]) {
-						return
-					}
-					state, j, partLen = PartTag, i, 0
-				default:
-					yield(PartExtraneous, s[i+1:j])
-					return
-				}
-			case ':':
-				switch state {
-				case PartTag, PartBuild, PartDigest:
-					if !yield(PartTag, s[i+1:j]) {
-						return
-					}
-					state, j, partLen = PartModel, i, 0
-				default:
-					yield(PartExtraneous, s[i+1:j])
-					return
-				}
-			case '/':
-				switch state {
-				case PartModel, PartTag, PartBuild, PartDigest:
-					if !yield(PartModel, s[i+1:j]) {
-						return
-					}
-					state, j = PartNamespace, i
-				case PartNamespace:
-					if !yield(PartNamespace, s[i+1:j]) {
-						return
-					}
-					state, j, partLen = PartHost, i, 0
-				default:
-					yield(PartExtraneous, s[i+1:j])
-					return
-				}
-			default:
-				if s[i] == '.' {
-					if numConsecutiveDots++; numConsecutiveDots > 1 {
-						yield(state, "")
-						return
-					}
-				} else {
-					numConsecutiveDots = 0
-				}
-			}
-		}
-
-		if state <= PartNamespace {
-			yield(state, s[:j])
-		} else {
-			yield(PartModel, s[:j])
-		}
-	}
-}
-
-func (r Name) IsZero() bool {
-	return r.parts == [NumParts]string{}
-}
-
-// IsValid reports if a model has at minimum a valid model part.
-func (r Name) IsValid() bool {
-	// Parts ensures we only have valid parts, so no need to validate
-	// them here, only check if we have a name or not.
-	return r.parts[PartModel] != ""
-}
-
-// ParseNameFromURLPath parses forms of a URL path into a Name. Specifically,
-// it trims any leading "/" and then calls [ParseName] with fill.
-func ParseNameFromURLPath(s, fill string) Name {
-	s = strings.TrimPrefix(s, "/")
-	return ParseName(s, fill)
-}
-
-// URLPath returns a complete, canonicalized, relative URL path using the parts of a
-// complete Name.
-//
-// The parts maintain their original case.
-//
-// Example:
-//
-//	ParseName("example.com/namespace/model:tag+build").URLPath() // returns "/example.com/namespace/model:tag"
-func (r Name) URLPath() string {
-	return r.DisplayShortest(MaskNothing)
-}
-
-// ParseNameFromFilepath parses a file path into a Name. The input string must be a
-// valid file path representation of a model name in the form:
-//
-//	host/namespace/model/tag/build
-//
-// The zero valid is returned if s does not contain all path elements
-// leading up to the model part, or if any path element is an invalid part
-// for the its corresponding part kind.
-//
-// The fill string is used to fill in missing parts of any constructed Name.
-// See [ParseName] for more information on the fill string.
-func ParseNameFromFilepath(s, fill string) Name {
-	var r Name
-	for i := range PartBuild + 1 {
-		part, rest, _ := strings.Cut(s, string(filepath.Separator))
-		if !isValidPart(i, part) {
-			return Name{}
-		}
-		r.parts[i] = part
-		s = rest
-		if s == "" {
-			break
-		}
-	}
-	if s != "" {
-		return Name{}
-	}
-	if !r.IsValid() {
-		return Name{}
-	}
-	return fillName(r, fill)
-}
-
-// Filepath returns a complete, canonicalized, relative file path using the
-// parts of a complete Name.
-//
-// Each parts is downcased, except for the build part which is upcased.
-//
-// Example:
-//
-//	ParseName("example.com/namespace/model:tag+build").Filepath() // returns "example.com/namespace/model/tag/BUILD"
-func (r Name) Filepath() string {
-	for i := range r.parts {
-		if PartKind(i) == PartBuild {
-			r.parts[i] = strings.ToUpper(r.parts[i])
-		} else {
-			r.parts[i] = strings.ToLower(r.parts[i])
-		}
-	}
-	return filepath.Join(r.parts[:]...)
-}
-
-// FilepathNoBuild returns a complete, canonicalized, relative file path using
-// the parts of a complete Name, but without the build part.
-func (r Name) FilepathNoBuild() string {
-	for i := range PartBuild {
-		r.parts[i] = strings.ToLower(r.parts[i])
-	}
-	return filepath.Join(r.parts[:PartBuild]...)
-}
-
-// isValidPart reports if s contains all valid characters for the given
-// part kind.
-func isValidPart(kind PartKind, s string) bool {
-	if s == "" {
-		return false
-	}
-	var consecutiveDots int
-	for _, c := range []byte(s) {
-		if c == '.' {
-			if consecutiveDots++; consecutiveDots >= 2 {
-				return false
-			}
-		} else {
-			consecutiveDots = 0
-		}
-		if !isValidByteFor(kind, c) {
-			return false
-		}
-	}
-	return true
-}
-
-func isValidByteFor(kind PartKind, c byte) bool {
-	if kind == PartNamespace && c == '.' {
-		return false
-	}
-	if c == '.' || c == '-' {
-		return true
-	}
-	if c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9' || c == '_' {
-		return true
-	}
-	return false
-}
--- a/types/model/name_test.go
+++ b/types/model/name_test.go
@@ -1,708 +0,0 @@
-package model
-
-import (
-	"bytes"
-	"cmp"
-	"fmt"
-	"log/slog"
-	"path/filepath"
-	"slices"
-	"strings"
-	"testing"
-)
-
-type fields struct {
-	host, namespace, model, tag, build string
-	digest                             string
-}
-
-func fieldsFromName(p Name) fields {
-	return fields{
-		host:      p.parts[PartHost],
-		namespace: p.parts[PartNamespace],
-		model:     p.parts[PartModel],
-		tag:       p.parts[PartTag],
-		build:     p.parts[PartBuild],
-		digest:    p.parts[PartDigest],
-	}
-}
-
-var testNames = map[string]fields{
-	"mistral:latest":                 {model: "mistral", tag: "latest"},
-	"mistral":                        {model: "mistral"},
-	"mistral:30B":                    {model: "mistral", tag: "30B"},
-	"mistral:7b":                     {model: "mistral", tag: "7b"},
-	"mistral:7b+Q4_0":                {model: "mistral", tag: "7b", build: "Q4_0"},
-	"mistral+KQED":                   {model: "mistral", build: "KQED"},
-	"mistral.x-3:7b+Q4_0":            {model: "mistral.x-3", tag: "7b", build: "Q4_0"},
-	"mistral:7b+q4_0":                {model: "mistral", tag: "7b", build: "q4_0"},
-	"llama2":                         {model: "llama2"},
-	"user/model":                     {namespace: "user", model: "model"},
-	"example.com/ns/mistral:7b+Q4_0": {host: "example.com", namespace: "ns", model: "mistral", tag: "7b", build: "Q4_0"},
-	"example.com/ns/mistral:7b+X":    {host: "example.com", namespace: "ns", model: "mistral", tag: "7b", build: "X"},
-
-	// invalid digest
-	"mistral:latest@invalid256-": {},
-	"mistral:latest@-123":        {},
-	"mistral:latest@!-123":       {},
-	"mistral:latest@1-!":         {},
-	"mistral:latest@":            {},
-
-	// resolved
-	"x@sha123-1": {model: "x", digest: "sha123-1"},
-	"@sha456-2":  {digest: "sha456-2"},
-
-	"@@sha123-1": {},
-
-	// preserves case for build
-	"x+b": {model: "x", build: "b"},
-
-	// invalid (includes fuzzing trophies)
-	" / / : + ": {},
-	" / : + ":   {},
-	" : + ":     {},
-	" + ":       {},
-	" : ":       {},
-	" / ":       {},
-	" /":        {},
-	"/ ":        {},
-	"/":         {},
-	":":         {},
-	"+":         {},
-
-	// (".") in namepsace is not allowed
-	"invalid.com/7b+x": {},
-
-	"invalid:7b+Q4_0:latest": {},
-	"in valid":               {},
-	"invalid/y/z/foo":        {},
-	"/0":                     {},
-	"0 /0":                   {},
-	"0 /":                    {},
-	"0/":                     {},
-	":/0":                    {},
-	"+0/00000":               {},
-	"0+.\xf2\x80\xf6\x9d00000\xe5\x99\xe6\xd900\xd90\xa60\x91\xdc0\xff\xbf\x99\xe800\xb9\xdc\xd6\xc300\x970\xfb\xfd0\xe0\x8a\xe1\xad\xd40\x9700\xa80\x980\xdd0000\xb00\x91000\xfe0\x89\x9b\x90\x93\x9f0\xe60\xf7\x84\xb0\x87\xa5\xff0\xa000\x9a\x85\xf6\x85\xfe\xa9\xf9\xe9\xde00\xf4\xe0\x8f\x81\xad\xde00\xd700\xaa\xe000000\xb1\xee0\x91": {},
-	"0//0":                        {},
-	"m+^^^":                       {},
-	"file:///etc/passwd":          {},
-	"file:///etc/passwd:latest":   {},
-	"file:///etc/passwd:latest+u": {},
-
-	":x": {},
-	"+x": {},
-	"x+": {},
-
-	// Disallow ("\.+") in any part to prevent path traversal anywhere
-	// we convert the name to a path.
-	"../etc/passwd":  {},
-	".../etc/passwd": {},
-	"./../passwd":    {},
-	"./0+..":         {},
-
-	strings.Repeat("a", MaxNamePartLen):   {model: strings.Repeat("a", MaxNamePartLen)},
-	strings.Repeat("a", MaxNamePartLen+1): {},
-}
-
-// TestConsecutiveDots tests that consecutive dots are not allowed in any
-// part, to avoid path traversal. There also are some tests in testNames, but
-// this test is more exhaustive and exists to emphasize the importance of
-// preventing path traversal.
-func TestNameConsecutiveDots(t *testing.T) {
-	for i := 1; i < 10; i++ {
-		s := strings.Repeat(".", i)
-		if i > 1 {
-			if g := ParseName(s, FillNothing).DisplayLong(); g != "" {
-				t.Errorf("ParseName(%q) = %q; want empty string", s, g)
-			}
-		} else {
-			if g := ParseName(s, FillNothing).DisplayLong(); g != s {
-				t.Errorf("ParseName(%q) = %q; want %q", s, g, s)
-			}
-		}
-	}
-}
-
-func TestNameParts(t *testing.T) {
-	var p Name
-	if w, g := int(NumParts), len(p.parts); w != g {
-		t.Errorf("Parts() = %d; want %d", g, w)
-	}
-}
-
-func TestNamePartString(t *testing.T) {
-	if g := PartKind(-2).String(); g != "Unknown" {
-		t.Errorf("Unknown part = %q; want %q", g, "Unknown")
-	}
-	for kind, name := range kindNames {
-		if g := kind.String(); g != name {
-			t.Errorf("%s = %q; want %q", kind, g, name)
-		}
-	}
-}
-
-func TestParseName(t *testing.T) {
-	for baseName, want := range testNames {
-		for _, prefix := range []string{"", "https://", "http://"} {
-			// We should get the same results with or without the
-			// http(s) prefixes
-			s := prefix + baseName
-
-			t.Run(s, func(t *testing.T) {
-				name := ParseName(s, FillNothing)
-				got := fieldsFromName(name)
-				if got != want {
-					t.Errorf("ParseName(%q) = %q; want %q", s, got, want)
-				}
-
-				// test round-trip
-				if !ParseName(name.DisplayLong(), FillNothing).EqualFold(name) {
-					t.Errorf("ParseName(%q).String() = %s; want %s", s, name.DisplayLong(), baseName)
-				}
-			})
-		}
-	}
-}
-
-func TestParseNameFill(t *testing.T) {
-	cases := []struct {
-		in   string
-		fill string
-		want string
-	}{
-		{"mistral", "example.com/library/?:latest+Q4_0", "example.com/library/mistral:latest+Q4_0"},
-		{"mistral", "example.com/library/?:latest", "example.com/library/mistral:latest"},
-		{"llama2:x", "example.com/library/?:latest+Q4_0", "example.com/library/llama2:x+Q4_0"},
-
-		// Invalid
-		{"", "example.com/library/?:latest+Q4_0", ""},
-		{"llama2:?", "example.com/library/?:latest+Q4_0", ""},
-	}
-
-	for _, tt := range cases {
-		t.Run(tt.in, func(t *testing.T) {
-			name := ParseName(tt.in, tt.fill)
-			if g := name.DisplayLong(); g != tt.want {
-				t.Errorf("ParseName(%q, %q) = %q; want %q", tt.in, tt.fill, g, tt.want)
-			}
-		})
-	}
-
-	t.Run("invalid fill", func(t *testing.T) {
-		defer func() {
-			if recover() == nil {
-				t.Fatal("expected panic")
-			}
-		}()
-		ParseName("x", "^")
-	})
-}
-
-func TestParseNameHTTPDoublePrefixStrip(t *testing.T) {
-	cases := []string{
-		"http://https://valid.com/valid/valid:latest",
-		"https://http://valid.com/valid/valid:latest",
-	}
-	for _, s := range cases {
-		t.Run(s, func(t *testing.T) {
-			name := ParseName(s, FillNothing)
-			if name.IsValid() {
-				t.Errorf("expected invalid path; got %#v", name)
-			}
-		})
-	}
-
-}
-
-func TestCompleteWithAndWithoutBuild(t *testing.T) {
-	cases := []struct {
-		in              string
-		complete        bool
-		completeNoBuild bool
-	}{
-		{"", false, false},
-		{"incomplete/mistral:7b+x", false, false},
-		{"incomplete/mistral:7b+Q4_0", false, false},
-		{"incomplete:7b+x", false, false},
-		{"complete.com/x/mistral:latest+Q4_0", true, true},
-		{"complete.com/x/mistral:latest", false, true},
-	}
-
-	for _, tt := range cases {
-		t.Run(tt.in, func(t *testing.T) {
-			p := ParseName(tt.in, FillNothing)
-			t.Logf("ParseName(%q) = %#v", tt.in, p)
-			if g := p.IsComplete(); g != tt.complete {
-				t.Errorf("Complete(%q) = %v; want %v", tt.in, g, tt.complete)
-			}
-			if g := p.IsCompleteNoBuild(); g != tt.completeNoBuild {
-				t.Errorf("CompleteNoBuild(%q) = %v; want %v", tt.in, g, tt.completeNoBuild)
-			}
-		})
-	}
-
-	// Complete uses Parts which returns a slice, but it should be
-	// inlined when used in Complete, preventing any allocations or
-	// escaping to the heap.
-	allocs := testing.AllocsPerRun(1000, func() {
-		keep(ParseName("complete.com/x/mistral:latest+Q4_0", FillNothing).IsComplete())
-	})
-	if allocs > 0 {
-		t.Errorf("Complete allocs = %v; want 0", allocs)
-	}
-}
-
-func TestNameLogValue(t *testing.T) {
-	cases := []string{
-		"example.com/library/mistral:latest+Q4_0",
-		"mistral:latest",
-		"mistral:7b+Q4_0",
-	}
-	for _, s := range cases {
-		t.Run(s, func(t *testing.T) {
-			var b bytes.Buffer
-			log := slog.New(slog.NewTextHandler(&b, nil))
-			name := ParseName(s, FillNothing)
-			log.Info("", "name", name)
-			want := fmt.Sprintf("name=%s", name.GoString())
-			got := b.String()
-			if !strings.Contains(got, want) {
-				t.Errorf("expected log output to contain %q; got %q", want, got)
-			}
-		})
-	}
-}
-
-func TestNameGoString(t *testing.T) {
-	cases := []struct {
-		name         string
-		in           string
-		wantString   string
-		wantGoString string // default is tt.in
-	}{
-		{
-			name:         "Complete Name",
-			in:           "example.com/library/mistral:latest+Q4_0",
-			wantGoString: "example.com/library/mistral:latest+Q4_0@?",
-		},
-		{
-			name:         "Short Name",
-			in:           "mistral:latest",
-			wantGoString: "?/?/mistral:latest+?@?",
-		},
-		{
-			name:         "Long Name",
-			in:           "library/mistral:latest",
-			wantGoString: "?/library/mistral:latest+?@?",
-		},
-		{
-			name:         "Case Preserved",
-			in:           "Library/Mistral:Latest",
-			wantGoString: "?/Library/Mistral:Latest+?@?",
-		},
-		{
-			name:         "With digest",
-			in:           "Library/Mistral:Latest@sha256-123456",
-			wantGoString: "?/Library/Mistral:Latest+?@sha256-123456",
-		},
-	}
-
-	for _, tt := range cases {
-		t.Run(tt.name, func(t *testing.T) {
-			p := ParseName(tt.in, FillNothing)
-			tt.wantGoString = cmp.Or(tt.wantGoString, tt.in)
-			if g := fmt.Sprintf("%#v", p); g != tt.wantGoString {
-				t.Errorf("GoString() = %q; want %q", g, tt.wantGoString)
-			}
-		})
-	}
-}
-
-func TestDisplayLongest(t *testing.T) {
-	g := ParseName("example.com/library/mistral:latest+Q4_0", FillNothing).DisplayLongest()
-	if g != "example.com/library/mistral:latest" {
-		t.Errorf("got = %q; want %q", g, "example.com/library/mistral:latest")
-	}
-}
-
-func TestDisplayShortest(t *testing.T) {
-	cases := []struct {
-		in        string
-		mask      string
-		want      string
-		wantPanic bool
-	}{
-		{"example.com/library/mistral:latest+Q4_0", "example.com/library/_:latest", "mistral", false},
-		{"example.com/library/mistral:latest+Q4_0", "example.com/_/_:latest", "library/mistral", false},
-		{"example.com/library/mistral:latest+Q4_0", "", "example.com/library/mistral", false},
-		{"example.com/library/mistral:latest+Q4_0", "", "example.com/library/mistral", false},
-
-		// case-insensitive
-		{"Example.com/library/mistral:latest+Q4_0", "example.com/library/_:latest", "mistral", false},
-		{"example.com/Library/mistral:latest+Q4_0", "example.com/library/_:latest", "mistral", false},
-		{"example.com/library/Mistral:latest+Q4_0", "example.com/library/_:latest", "Mistral", false},
-		{"example.com/library/mistral:Latest+Q4_0", "example.com/library/_:latest", "mistral", false},
-		{"example.com/library/mistral:Latest+q4_0", "example.com/library/_:latest", "mistral", false},
-
-		// zero value
-		{"", MaskDefault, "", true},
-
-		// invalid mask
-		{"example.com/library/mistral:latest+Q4_0", "example.com/mistral", "", true},
-
-		// DefaultMask
-		{"registry.ollama.ai/library/mistral:latest+Q4_0", MaskDefault, "mistral", false},
-
-		// Auto-Fill
-		{"x", "example.com/library/_:latest", "x", false},
-		{"x", "example.com/library/_:latest+Q4_0", "x", false},
-		{"x/y:z", "a.com/library/_:latest+Q4_0", "x/y:z", false},
-		{"x/y:z", "a.com/library/_:latest+Q4_0", "x/y:z", false},
-	}
-
-	for _, tt := range cases {
-		t.Run("", func(t *testing.T) {
-			defer func() {
-				if tt.wantPanic {
-					if recover() == nil {
-						t.Errorf("expected panic")
-					}
-				}
-			}()
-
-			p := ParseName(tt.in, FillNothing)
-			t.Logf("ParseName(%q) = %#v", tt.in, p)
-			if g := p.DisplayShortest(tt.mask); g != tt.want {
-				t.Errorf("got = %q; want %q", g, tt.want)
-			}
-		})
-	}
-}
-
-func TestParseNameAllocs(t *testing.T) {
-	allocs := testing.AllocsPerRun(1000, func() {
-		keep(ParseName("example.com/mistral:7b+Q4_0", FillNothing))
-	})
-	if allocs > 0 {
-		t.Errorf("ParseName allocs = %v; want 0", allocs)
-	}
-}
-
-func BenchmarkParseName(b *testing.B) {
-	b.ReportAllocs()
-
-	for range b.N {
-		keep(ParseName("example.com/mistral:7b+Q4_0", FillNothing))
-	}
-}
-
-func FuzzParseNameFromFilepath(f *testing.F) {
-	f.Add("example.com/library/mistral/7b/Q4_0")
-	f.Add("example.com/../mistral/7b/Q4_0")
-	f.Add("example.com/x/../7b/Q4_0")
-	f.Add("example.com/x/../7b")
-	f.Fuzz(func(t *testing.T, s string) {
-		name := ParseNameFromFilepath(s, FillNothing)
-		if strings.Contains(s, "..") && !name.IsZero() {
-			t.Fatalf("non-zero value for path with '..': %q", s)
-		}
-		if name.IsValid() == name.IsZero() {
-			t.Errorf("expected valid path to be non-zero value; got %#v", name)
-		}
-	})
-}
-
-func FuzzParseName(f *testing.F) {
-	f.Add("example.com/mistral:7b+Q4_0")
-	f.Add("example.com/mistral:7b+q4_0")
-	f.Add("example.com/mistral:7b+x")
-	f.Add("x/y/z:8n+I")
-	f.Add(":x")
-	f.Add("@sha256-123456")
-	f.Add("example.com/mistral:latest+Q4_0@sha256-123456")
-	f.Add(":@!@")
-	f.Add("...")
-	f.Fuzz(func(t *testing.T, s string) {
-		r0 := ParseName(s, FillNothing)
-
-		if strings.Contains(s, "..") && !r0.IsZero() {
-			t.Fatalf("non-zero value for path with '..': %q", s)
-		}
-
-		if !r0.IsValid() && !r0.IsResolved() {
-			if !r0.EqualFold(Name{}) {
-				t.Errorf("expected invalid path to be zero value; got %#v", r0)
-			}
-			t.Skipf("invalid path: %q", s)
-		}
-
-		for _, p := range r0.parts {
-			if len(p) > MaxNamePartLen {
-				t.Errorf("part too long: %q", p)
-			}
-		}
-
-		if !strings.EqualFold(r0.DisplayLong(), s) {
-			t.Errorf("String() did not round-trip with case insensitivity: %q\ngot  = %q\nwant = %q", s, r0.DisplayLong(), s)
-		}
-
-		r1 := ParseName(r0.DisplayLong(), FillNothing)
-		if !r0.EqualFold(r1) {
-			t.Errorf("round-trip mismatch: %+v != %+v", r0, r1)
-		}
-	})
-}
-
-func TestNameStringAllocs(t *testing.T) {
-	name := ParseName("example.com/ns/mistral:latest+Q4_0", FillNothing)
-	allocs := testing.AllocsPerRun(1000, func() {
-		keep(name.DisplayLong())
-	})
-	if allocs > 1 {
-		t.Errorf("String allocs = %v; want 0", allocs)
-	}
-}
-
-func TestNamePath(t *testing.T) {
-	cases := []struct {
-		in   string
-		want string
-	}{
-		{"example.com/library/mistral:latest+Q4_0", "example.com/library/mistral:latest"},
-
-		// incomplete
-		{"example.com/library/mistral:latest", "example.com/library/mistral:latest"},
-		{"", ""},
-	}
-	for _, tt := range cases {
-		t.Run(tt.in, func(t *testing.T) {
-			p := ParseName(tt.in, FillNothing)
-			t.Logf("ParseName(%q) = %#v", tt.in, p)
-			if g := p.URLPath(); g != tt.want {
-				t.Errorf("got = %q; want %q", g, tt.want)
-			}
-		})
-	}
-}
-
-func TestNameFilepath(t *testing.T) {
-	cases := []struct {
-		in          string
-		want        string
-		wantNoBuild string
-	}{
-		{
-			in:          "example.com/library/mistral:latest+Q4_0",
-			want:        "example.com/library/mistral/latest/Q4_0",
-			wantNoBuild: "example.com/library/mistral/latest",
-		},
-		{
-			in:          "Example.Com/Library/Mistral:Latest+Q4_0",
-			want:        "example.com/library/mistral/latest/Q4_0",
-			wantNoBuild: "example.com/library/mistral/latest",
-		},
-		{
-			in:          "Example.Com/Library/Mistral:Latest+Q4_0",
-			want:        "example.com/library/mistral/latest/Q4_0",
-			wantNoBuild: "example.com/library/mistral/latest",
-		},
-		{
-			in:          "example.com/library/mistral:latest",
-			want:        "example.com/library/mistral/latest",
-			wantNoBuild: "example.com/library/mistral/latest",
-		},
-		{
-			in:          "",
-			want:        "",
-			wantNoBuild: "",
-		},
-	}
-	for _, tt := range cases {
-		t.Run(tt.in, func(t *testing.T) {
-			p := ParseName(tt.in, FillNothing)
-			t.Logf("ParseName(%q) = %#v", tt.in, p)
-			g := p.Filepath()
-			g = filepath.ToSlash(g)
-			if g != tt.want {
-				t.Errorf("got = %q; want %q", g, tt.want)
-			}
-			g = p.FilepathNoBuild()
-			g = filepath.ToSlash(g)
-			if g != tt.wantNoBuild {
-				t.Errorf("got = %q; want %q", g, tt.wantNoBuild)
-			}
-		})
-	}
-}
-
-func TestParseNameFilepath(t *testing.T) {
-	cases := []struct {
-		in   string
-		fill string // default is FillNothing
-		want string
-	}{
-		{
-			in:   "example.com/library/mistral/latest/Q4_0",
-			want: "example.com/library/mistral:latest+Q4_0",
-		},
-		{
-			in:   "example.com/library/mistral/latest",
-			fill: "?/?/?:latest+Q4_0",
-			want: "example.com/library/mistral:latest+Q4_0",
-		},
-		{
-			in:   "example.com/library/mistral",
-			fill: "?/?/?:latest+Q4_0",
-			want: "example.com/library/mistral:latest+Q4_0",
-		},
-		{
-			in:   "example.com/library",
-			want: "",
-		},
-		{
-			in:   "example.com/",
-			want: "",
-		},
-		{
-			in:   "example.com/^/mistral/latest/Q4_0",
-			want: "",
-		},
-		{
-			in:   "example.com/library/mistral/../Q4_0",
-			want: "",
-		},
-		{
-			in:   "example.com/library/mistral/latest/Q4_0/extra",
-			want: "",
-		},
-	}
-	for _, tt := range cases {
-		t.Run(tt.in, func(t *testing.T) {
-			in := strings.ReplaceAll(tt.in, "/", string(filepath.Separator))
-			fill := cmp.Or(tt.fill, FillNothing)
-			want := ParseName(tt.want, fill)
-			if g := ParseNameFromFilepath(in, fill); !g.EqualFold(want) {
-				t.Errorf("got = %q; want %q", g.DisplayLong(), tt.want)
-			}
-		})
-	}
-}
-
-func TestParseNameFromPath(t *testing.T) {
-	cases := []struct {
-		in   string
-		want string
-		fill string // default is FillNothing
-	}{
-		{
-			in:   "example.com/library/mistral:latest+Q4_0",
-			want: "example.com/library/mistral:latest+Q4_0",
-		},
-		{
-			in:   "/example.com/library/mistral:latest+Q4_0",
-			want: "example.com/library/mistral:latest+Q4_0",
-		},
-		{
-			in:   "/example.com/library/mistral",
-			want: "example.com/library/mistral",
-		},
-		{
-			in:   "/example.com/library/mistral",
-			fill: "?/?/?:latest+Q4_0",
-			want: "example.com/library/mistral:latest+Q4_0",
-		},
-		{
-			in:   "/example.com/library",
-			want: "",
-		},
-		{
-			in:   "/example.com/",
-			want: "",
-		},
-		{
-			in:   "/example.com/^/mistral/latest",
-			want: "",
-		},
-	}
-	for _, tt := range cases {
-		t.Run(tt.in, func(t *testing.T) {
-			fill := cmp.Or(tt.fill, FillNothing)
-			if g := ParseNameFromURLPath(tt.in, fill); g.DisplayLong() != tt.want {
-				t.Errorf("got = %q; want %q", g.DisplayLong(), tt.want)
-			}
-		})
-	}
-}
-
-func ExampleName_MapHash() {
-	m := map[uint64]bool{}
-
-	// key 1
-	m[ParseName("mistral:latest+q4", FillNothing).MapHash()] = true
-	m[ParseName("miSTRal:latest+Q4", FillNothing).MapHash()] = true
-	m[ParseName("mistral:LATest+Q4", FillNothing).MapHash()] = true
-
-	// key 2
-	m[ParseName("mistral:LATest", FillNothing).MapHash()] = true
-
-	fmt.Println(len(m))
-	// Output:
-	// 2
-}
-
-func ExampleName_CompareFold_sort() {
-	names := []Name{
-		ParseName("mistral:latest", FillNothing),
-		ParseName("mistRal:7b+q4", FillNothing),
-		ParseName("MIstral:7b", FillNothing),
-	}
-
-	slices.SortFunc(names, Name.CompareFold)
-
-	for _, n := range names {
-		fmt.Println(n.DisplayLong())
-	}
-
-	// Output:
-	// MIstral:7b
-	// mistRal:7b+q4
-	// mistral:latest
-}
-
-func ExampleName_completeAndResolved() {
-	for _, s := range []string{
-		"x/y/z:latest+q4_0@sha123-1",
-		"x/y/z:latest+q4_0",
-		"@sha123-1",
-	} {
-		name := ParseName(s, FillNothing)
-		fmt.Printf("complete:%v resolved:%v  digest:%s\n", name.IsComplete(), name.IsResolved(), name.Digest())
-	}
-
-	// Output:
-	// complete:true resolved:true  digest:sha123-1
-	// complete:true resolved:false  digest:
-	// complete:false resolved:true  digest:sha123-1
-}
-
-func ExampleName_DisplayShortest() {
-	name := ParseName("example.com/jmorganca/mistral:latest+Q4_0", FillNothing)
-
-	fmt.Println(name.DisplayShortest("example.com/jmorganca/_:latest"))
-	fmt.Println(name.DisplayShortest("example.com/_/_:latest"))
-	fmt.Println(name.DisplayShortest("example.com/_/_:_"))
-	fmt.Println(name.DisplayShortest("_/_/_:_"))
-
-	// Default
-	name = ParseName("registry.ollama.ai/library/mistral:latest+Q4_0", FillNothing)
-	fmt.Println(name.DisplayShortest(""))
-
-	// Output:
-	// mistral
-	// jmorganca/mistral
-	// jmorganca/mistral:latest
-	// example.com/jmorganca/mistral:latest
-	// mistral
-}
-
-func keep[T any](v T) T { return v }
--- a/types/model/testdata/fuzz/FuzzParseRef/1d43ee52085cb4aa
+++ b/types/model/testdata/fuzz/FuzzParseRef/1d43ee52085cb4aa
@@ -1,2 +0,0 @@
-go test fuzz v1
-string("/0")
--- a/types/model/testdata/fuzz/FuzzParseRef/27fd759314f0e6d6
+++ b/types/model/testdata/fuzz/FuzzParseRef/27fd759314f0e6d6
@@ -1,2 +0,0 @@
-go test fuzz v1
-string("0//0")
--- a/types/model/testdata/fuzz/FuzzParseRef/3e3b70dba384074d
+++ b/types/model/testdata/fuzz/FuzzParseRef/3e3b70dba384074d
@@ -1,2 +0,0 @@
-go test fuzz v1
-string("0 /0")
--- a/types/model/testdata/fuzz/FuzzParseRef/71f1fdff711b6dab
+++ b/types/model/testdata/fuzz/FuzzParseRef/71f1fdff711b6dab
@@ -1,2 +0,0 @@
-go test fuzz v1
-string("+0/00000")
--- a/types/model/testdata/fuzz/FuzzParseRef/82c2975c430ac608
+++ b/types/model/testdata/fuzz/FuzzParseRef/82c2975c430ac608
@@ -1,2 +0,0 @@
-go test fuzz v1
-string(":")
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Daniel Hiltgen	dc011d16b9	Backport MacOS SDK fix from main	2024-04-04 11:17:48 -07:00
Daniel Hiltgen	8d08676a99	Apply 01-cache.diff	2024-04-04 10:11:23 -07:00