fix ref

more comments
comments
2026-01-03 13:10:17 -05:00 · 2024-08-26 19:59:33 -07:00 · 2024-08-26 19:56:45 -07:00 · 2024-08-26 19:54:06 -07:00 · 2024-08-26 18:09:21 -07:00 · 2024-08-26 18:07:59 -07:00
124 changed files with 26352 additions and 3503 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -7,5 +7,3 @@ llm/llama.cpp
 .env
 .cache
 test_data
-llm/build
-llama/build
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -102,8 +102,7 @@ jobs:
        with:
          name: generate-windows-cpu
          path: |
-            build/**/*
-            build/**/*.a
+            llm/build/**/bin/*
            llm/build/**/*.a
            dist/windows-amd64/**

@@ -177,7 +176,7 @@ jobs:
        with:
          name: generate-windows-rocm
          path: |
-            build/**/*
+            llm/build/**/bin/*
            dist/windows-amd64/**
      - uses: actions/upload-artifact@v4
        with:
@@ -266,7 +265,7 @@ jobs:
        with:
          name: generate-windows-cuda-${{ matrix.cuda.version }}
          path: |
-            build/**/*
+            llm/build/**/bin/*
            dist/windows-amd64/**
      - uses: actions/upload-artifact@v4
        with:
@@ -339,7 +338,7 @@ jobs:
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-rocm
-      - run: dir build
+      - run: dir llm/build
      - run: |
          $gopath=(get-command go).source | split-path -parent
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
@@ -360,7 +359,9 @@ jobs:
    environment: release
    runs-on: linux
    env:
-      PLATFORM: linux/amd64
+      OLLAMA_SKIP_MANIFEST_CREATE: '1'
+      BUILD_ARCH: amd64
+      PUSH: '1'
    steps:
      - uses: actions/checkout@v4
        with:
@@ -368,8 +369,14 @@ jobs:
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKER_USER }}
+          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
      - run: |
          ./scripts/build_linux.sh
+          ./scripts/build_docker.sh
      - uses: actions/upload-artifact@v4
        with:
          name: dist-linux-amd64
@@ -383,7 +390,9 @@ jobs:
    environment: release
    runs-on: linux-arm64
    env:
-      PLATFORM: linux/arm64
+      OLLAMA_SKIP_MANIFEST_CREATE: '1'
+      BUILD_ARCH: arm64
+      PUSH: '1'
    steps:
      - uses: actions/checkout@v4
        with:
@@ -412,8 +421,14 @@ jobs:
          sudo usermod -aG docker $USER
          sudo apt-get install acl
          sudo setfacl --modify user:$USER:rw /var/run/docker.sock
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKER_USER }}
+          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
      - run: |
          ./scripts/build_linux.sh
+          ./scripts/build_docker.sh
      - uses: actions/upload-artifact@v4
        with:
          name: dist-linux-arm64
@@ -421,178 +436,6 @@ jobs:
            dist/*linux*
            !dist/*-cov

-  # Container image build
-  build-container-image:
-    environment: release
-    strategy:
-      matrix:
-        runner:
-          - linux
-          - linux-arm64
-    runs-on: ${{ matrix.runner }}
-    env:
-      FINAL_IMAGE_REPO: ollama/ollama
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-      - name: 'Install Docker'
-        if: ${{ startsWith(matrix.runner, 'linux-arm64') }}
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y ca-certificates curl
-          sudo install -m 0755 -d /etc/apt/keyrings
-          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
-          sudo chmod a+r /etc/apt/keyrings/docker.asc
-          echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
-            $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
-            sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
-          sudo apt-get update
-          sudo apt-get install -y docker-ce docker-ce-cli containerd.io
-          sudo usermod -aG docker $USER
-          sudo apt-get install acl
-          sudo setfacl --modify user:$USER:rw /var/run/docker.sock
-      - name: Docker meta
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.FINAL_IMAGE_REPO }}
-          flavor: |
-            latest=false
-          tags: |
-            type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
-            type=semver,pattern={{version}}
-      - name: Set Version
-        shell: bash
-        run: |
-          machine=$(uname -m)
-          case ${machine} in
-            x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
-            aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
-          esac >>$GITHUB_ENV
-          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USER }}
-          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
-      - name: Build and push by digest
-        id: build
-        uses: docker/build-push-action@v6
-        with:
-          context: "."
-          platforms: linux/${{ env.ARCH }}
-          build-args: |
-            GOFLAGS
-          outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true
-      - name: Export digest
-        run: |
-          mkdir -p /tmp/digests
-          digest="${{ steps.build.outputs.digest }}"
-          touch "/tmp/digests/${digest#sha256:}"
-      - name: Upload digest
-        uses: actions/upload-artifact@v4
-        with:
-          name: digests-${{ env.PLATFORM_PAIR }}
-          path: /tmp/digests/*
-          if-no-files-found: error
-          retention-days: 1
-  merge:
-    environment: release
-    runs-on: linux
-    needs:
-      - build-container-image
-    env:
-      FINAL_IMAGE_REPO: ollama/ollama
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-      - name: Download digests
-        uses: actions/download-artifact@v4
-        with:
-          path: /tmp/digests
-          pattern: digests-*
-          merge-multiple: true
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Docker meta
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.FINAL_IMAGE_REPO }}
-          flavor: |
-            latest=false
-          tags: |
-            type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
-            type=semver,pattern={{version}}
-      - name: Set Version
-        shell: bash
-        run: |
-          machine=$(uname -m)
-          case ${machine} in
-            x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
-            aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
-          esac >>$GITHUB_ENV
-          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USER }}
-          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
-      - name: Create manifest list and push
-        working-directory: /tmp/digests
-        run: |
-          docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
-            $(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *)
-      - name: Inspect image
-        run: |
-          docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }}          
-  build-container-image-rocm:
-    environment: release
-    runs-on: linux
-    env:
-      FINAL_IMAGE_REPO: ollama/ollama
-      ARCH: amd64
-      PLATFORM_PAIR: linux-amd64
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-      - name: Docker meta
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.FINAL_IMAGE_REPO }}
-          flavor: |
-            latest=false
-          tags: |
-            type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
-            type=semver,pattern={{version}}
-      - name: Set Version
-        shell: bash
-        run: |
-          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USER }}
-          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
-      - name: Build and push by digest
-        id: build
-        uses: docker/build-push-action@v6
-        with:
-          context: "."
-          target: runtime-rocm
-          build-args: |
-            GOFLAGS
-          tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm
-          push: true
-
  # Aggregate all the assets and ship a release
  release:
    needs:
@@ -605,6 +448,8 @@ jobs:
    permissions:
      contents: write
    env:
+      OLLAMA_SKIP_IMAGE_BUILD: '1'
+      PUSH: '1'
      GH_TOKEN: ${{ github.token }}
    steps:
      - uses: actions/checkout@v4
@@ -613,6 +458,12 @@ jobs:
        run: |
          echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
          echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKER_USER }}
+          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+      - run: ./scripts/build_docker.sh
      - name: Retrieve built artifact
        uses: actions/download-artifact@v4
        with:
@@ -623,6 +474,8 @@ jobs:
          ls -lh dist/
          (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
          mv sha256sum.txt dist/
+          mv dist/linux-???64 .
+          mv dist/linux-amd64-rocm .
          cat dist/sha256sum.txt
      - name: Create or update Release
        run: |
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -81,6 +81,12 @@ jobs:
        if: ${{ ! startsWith(matrix.os, 'windows-') }}
        name: 'Unix Go Generate'
      - run: go build .
+      - uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
+          path: |
+            llm/build/**/bin/*
+            llm/build/**/*.a
  generate-cuda:
    needs: [changes]
    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
@@ -108,6 +114,12 @@ jobs:
          go generate -x ./...
        env:
          OLLAMA_SKIP_CPU_GENERATE: '1'
+      - uses: actions/upload-artifact@v4
+        with:
+          name: cuda-${{ matrix.cuda-version }}-libraries
+          path: |
+            llm/build/**/bin/*
+            dist/windows-amd64/**
  generate-rocm:
    needs: [changes]
    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
@@ -135,6 +147,12 @@ jobs:
          go generate -x ./...
        env:
          OLLAMA_SKIP_CPU_GENERATE: '1'
+      - uses: actions/upload-artifact@v4
+        with:
+          name: rocm-${{ matrix.rocm-version }}-libraries
+          path: |
+            llm/build/**/bin/*
+            dist/windows-amd64/**

  # ROCm generation step
  generate-windows-rocm:
@@ -171,6 +189,7 @@ jobs:
        name: go generate
        env:
          OLLAMA_SKIP_CPU_GENERATE: '1'
+      # TODO - do we need any artifacts?

  # CUDA generation step
  generate-windows-cuda:
@@ -212,6 +231,7 @@ jobs:
          go generate -x ./...
        env:
          OLLAMA_SKIP_CPU_GENERATE: '1'
+      # TODO - do we need any artifacts?

  lint:
    strategy:
@@ -243,6 +263,14 @@ jobs:
            arm64) echo ARCH=arm64 ;;
          esac >>$GITHUB_ENV
        shell: bash
+      - run: |
+          mkdir -p llm/build/linux/$ARCH/stub/bin
+          touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
+        if: ${{ startsWith(matrix.os, 'ubuntu-') }}
+      - run: |
+          mkdir -p llm/build/darwin/$ARCH/stub/bin
+          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
+        if: ${{ startsWith(matrix.os, 'macos-') }}
      - uses: golangci/golangci-lint-action@v6
        with:
          args: --timeout 8m0s -v
@@ -273,10 +301,23 @@ jobs:
          cache: true
      - run: |
          case ${{ matrix.arch }} in
-            amd64) echo ARCH=amd64 ;;
+            amd64) echo ARCH=x86_64 ;;
            arm64) echo ARCH=arm64 ;;
          esac >>$GITHUB_ENV
        shell: bash
+      - run: |
+          mkdir -p llm/build/linux/$ARCH/stub/bin
+          touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
+        if: ${{ startsWith(matrix.os, 'ubuntu-') }}
+      - run: |
+          mkdir -p llm/build/darwin/$ARCH/stub/bin
+          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
+        if: ${{ startsWith(matrix.os, 'macos-') }}
+        shell: bash
      - run: go generate ./...
      - run: go build
      - run: go test -v ./...
+      - uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.os }}-binaries
+          path: ollama
--- a/.gitignore
+++ b/.gitignore
@@ -12,7 +12,4 @@ ggml-metal.metal
 test_data
 *.crt
 llm/build
-build/*/*/*
-!build/**/placeholder
-llama/build
 __debug_bin*
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -32,10 +32,6 @@ linters:
 linters-settings:
  gci:
    sections: [standard, default, localmodule]
-  staticcheck:
-    checks:
-      - all
-      - -SA1019 # omit Deprecated check
 severity:
  default-severity: error
  rules:
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -18,7 +18,7 @@ See the [development documentation](./docs/development.md) for instructions on h

 * New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future.
 * Refactoring: large code improvements are important, but can be harder or take longer to review and merge.
-* Documentation: small updates to fill in or correct missing documentation is helpful, however large documentation additions can be hard to maintain over time.
+* Documentation: small updates to fill in or correct missing documentation is helpful, however large documentation additions can be hard to maintain over time.

 ### Issues that may not be accepted

--- a/147
+++ b/147
@@ -16,12 +16,12 @@ FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-1
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG CUDA_V11_ARCHITECTURES
-ENV GOARCH=amd64
+ENV GOARCH amd64 
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
@@ -33,12 +33,12 @@ FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-1
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG CUDA_V12_ARCHITECTURES
-ENV GOARCH=amd64
+ENV GOARCH amd64 
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
@@ -47,32 +47,32 @@ RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
    bash gen_linux.sh

-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-runner-arm64
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG CUDA_V11_ARCHITECTURES
-ENV GOARCH=arm64
+ENV GOARCH arm64 
 RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
    CUDA_VARIANT="_v11" \
    bash gen_linux.sh

-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-runner-arm64
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG CUDA_V12_ARCHITECTURES
-ENV GOARCH=arm64
+ENV GOARCH arm64 
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
@@ -86,13 +86,13 @@ FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-b
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
-ENV LIBRARY_PATH=/opt/amdgpu/lib64
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+ENV LIBRARY_PATH /opt/amdgpu/lib64
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
-ENV GOARCH=amd64
+ENV GOARCH amd64 
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
 RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \
@@ -103,11 +103,11 @@ ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
-ENV GOARCH=amd64
+ENV GOARCH amd64 
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate

 FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
@@ -128,11 +128,11 @@ ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
-ENV GOARCH=arm64
+ENV GOARCH arm64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate

 FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
@@ -143,112 +143,71 @@ RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh


-# Intermediate stages used for ./scripts/build_linux.sh
+# Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
-ENV CGO_ENABLED=1
+ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
    go build -trimpath -o dist/linux-amd64/bin/ollama .
-RUN cd dist/linux-$GOARCH && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-RUN cd dist/linux-$GOARCH-rocm && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz

+# Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
-ENV CGO_ENABLED=1
+ENV CGO_ENABLED 1
 ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
-COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
-COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-arm64/bin/ollama .
-RUN cd dist/linux-$GOARCH && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-
-FROM --platform=linux/amd64 scratch AS dist-amd64
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM --platform=linux/arm64 scratch AS dist-arm64
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM dist-$TARGETARCH as dist
-
-
-# Optimized container images do not cary nested payloads
-FROM --platform=linux/amd64 static-build-amd64 AS container-build-amd64
-WORKDIR /go/src/github.com/ollama/ollama
-COPY . .
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-amd64/bin/ollama .
-
-FROM --platform=linux/arm64 static-build-arm64 AS container-build-arm64
-WORKDIR /go/src/github.com/ollama/ollama
-COPY . .
+COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
    go build -trimpath -o dist/linux-arm64/bin/ollama .

-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+# Strip out ROCm dependencies to keep the primary image lean
+FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/
+RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa* 

-FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+# Runtime stages
+FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
+COPY --from=amd64-libs-without-rocm /scratch/ /lib/
+RUN apt-get update && apt-get install -y ca-certificates
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/

-# ROCm libraries larger so we keep it distinct from the CPU/CUDA image
-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
-# Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
-# across releases
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+RUN apt-get update && apt-get install -y ca-certificates
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
+
+# Radeon images are much larger, so we keep them distinct from the CPU/CUDA image
+FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm
+RUN update-pciids
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+RUN ln -s /opt/rocm/lib /lib/ollama
 EXPOSE 11434
-ENV OLLAMA_HOST=0.0.0.0
+ENV OLLAMA_HOST 0.0.0.0

 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]

 FROM runtime-$TARGETARCH
 EXPOSE 11434
-ENV OLLAMA_HOST=0.0.0.0
+ENV OLLAMA_HOST 0.0.0.0
 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
--- a/README.md
+++ b/README.md
@@ -295,24 +295,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
 - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
 - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
-- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord )
 - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
-- [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
 - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
 - [AI Studio](https://github.com/MindWorkAI/AI-Studio)
 - [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
 - [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
 - [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
 - [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
-- [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
-- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
-- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
-- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
-- [Claude Dev](https://github.com/saoudrizwan/claude-dev) - VSCode extension for multi-file/whole-repo coding
-- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
-- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
-- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
-- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)

 ### Terminal

@@ -337,11 +326,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [podman-ollama](https://github.com/ericcurtin/podman-ollama)
 - [gollama](https://github.com/sammcj/gollama)
 - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
-- [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
-- [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor
-
-### Apple Vision Pro
-- [Enchanted](https://github.com/AugustDev/enchanted)

 ### Database

@@ -351,28 +335,23 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Package managers

 - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
- [Gentoo](https://github.com/gentoo/guru/tree/master/app-misc/ollama)
 - [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
 - [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
- [Nix package](https://search.nixos.org/packages?channel=24.05&show=ollama&from=0&size=50&sort=relevance&type=packages&query=ollama)
- [Flox](https://flox.dev/blog/ollama-part-one)

 ### Libraries

 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
 - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
- [crewAI](https://github.com/crewAIInc/crewAI)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
 - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
 - [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
 - [LiteLLM](https://github.com/BerriAI/litellm)
- [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
 - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
 - [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
 - [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
 - [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp)
- [Ollama4j for Java](https://github.com/ollama4j/ollama4j)
+- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
 - [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
 - [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
 - [Ollama for Dart](https://github.com/breitburg/dart-ollama)
@@ -389,17 +368,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
 - [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
 - [LlamaScript](https://github.com/Project-Llama/llamascript)
- [Gollm](https://docs.gollm.co/examples/ollama-example)
- [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient)
- [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun)
- [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
- [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)

 ### Mobile

 - [Enchanted](https://github.com/AugustDev/enchanted)
 - [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)

 ### Extensions & Plugins

@@ -424,16 +397,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
 - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
 - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
- [Plasmoid Ollama Control](https://github.com/imoize/plasmoid-ollamacontrol) (KDE Plasma extension that allows you to quickly manage/control Ollama model)
 - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
 - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in Python. Uses Ollama to create personalities.
 - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depend on an Ollama server)
- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
- [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)

 ### Supported backends

--- a/api/types.go
+++ b/api/types.go
@@ -296,17 +296,15 @@ type EmbeddingResponse struct {
 // CreateRequest is the request passed to [Client.Create].
 type CreateRequest struct {
 	Model     string `json:"model"`
+	Path      string `json:"path"`
 	Modelfile string `json:"modelfile"`
 	Stream    *bool  `json:"stream,omitempty"`
 	Quantize  string `json:"quantize,omitempty"`

-	// Deprecated: set the model name with Model instead
+	// Name is deprecated, see Model
 	Name string `json:"name"`

-	// Deprecated: set the file content with Modelfile instead
-	Path string `json:"path"`
-
-	// Deprecated: use Quantize instead
+	// Quantization is deprecated, see Quantize
 	Quantization string `json:"quantization,omitempty"`
 }

@@ -314,7 +312,7 @@ type CreateRequest struct {
 type DeleteRequest struct {
 	Model string `json:"model"`

-	// Deprecated: set the model name with Model instead
+	// Name is deprecated, see Model
 	Name string `json:"name"`
 }

@@ -329,7 +327,7 @@ type ShowRequest struct {

 	Options map[string]interface{} `json:"options"`

-	// Deprecated: set the model name with Model instead
+	// Name is deprecated, see Model
 	Name string `json:"name"`
 }

@@ -361,7 +359,7 @@ type PullRequest struct {
 	Password string `json:"password"`
 	Stream   *bool  `json:"stream,omitempty"`

-	// Deprecated: set the model name with Model instead
+	// Name is deprecated, see Model
 	Name string `json:"name"`
 }

@@ -382,7 +380,7 @@ type PushRequest struct {
 	Password string `json:"password"`
 	Stream   *bool  `json:"stream,omitempty"`

-	// Deprecated: set the model name with Model instead
+	// Name is deprecated, see Model
 	Name string `json:"name"`
 }

--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -87,7 +87,7 @@ DialogFontSize=12

 [Files]
 Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
-Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
+Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit
 Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
@@ -99,7 +99,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen
 Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"

 [Run]
-Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
+Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden

 [UninstallRun]
 ; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden
@@ -134,8 +134,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi

 [Registry]
 Root: HKCU; Subkey: "Environment"; \
-    ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \
-    Check: NeedsAddPath('{app}')
+    ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \
+    Check: NeedsAddPath('{app}\bin')

 [Code]

--- a/build/darwin/amd64/placeholder
+++ b/build/darwin/amd64/placeholder
@@ -1 +0,0 @@
-This is here to make sure the build/ directory exists for the go:embed command
--- a/build/darwin/arm64/placeholder
+++ b/build/darwin/arm64/placeholder
@@ -1 +0,0 @@
-This is here to make sure the build/ directory exists for the go:embed command
--- a/build/embed_darwin_amd64.go
+++ b/build/embed_darwin_amd64.go
@@ -1,8 +0,0 @@
-package build
-
-import "embed"
-
-// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
-
-//go:embed darwin/amd64/*
-var EmbedFS embed.FS
--- a/build/embed_darwin_arm64.go
+++ b/build/embed_darwin_arm64.go
@@ -1,8 +0,0 @@
-package build
-
-import "embed"
-
-// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
-
-//go:embed darwin/arm64/*
-var EmbedFS embed.FS
--- a/build/embed_linux.go
+++ b/build/embed_linux.go
@@ -1,6 +0,0 @@
-package build
-
-import "embed"
-
-//go:embed linux/*
-var EmbedFS embed.FS
--- a/build/embed_unused.go
+++ b/build/embed_unused.go
@@ -1,8 +0,0 @@
-//go:build !linux && !darwin
-
-package build
-
-import "embed"
-
-// unused on windows
-var EmbedFS embed.FS
--- a/build/linux/amd64/placeholder
+++ b/build/linux/amd64/placeholder
@@ -1 +0,0 @@
-This is here to make sure the build/ directory exists for the go:embed command
--- a/build/linux/arm64/placeholder
+++ b/build/linux/arm64/placeholder
@@ -1 +0,0 @@
-This is here to make sure the build/ directory exists for the go:embed command
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -2,7 +2,6 @@ package cmd

 import (
 	"archive/zip"
-	"bufio"
 	"bytes"
 	"context"
 	"crypto/ed25519"
@@ -22,7 +21,6 @@ import (
 	"regexp"
 	"runtime"
 	"slices"
-	"strconv"
 	"strings"
 	"sync/atomic"
 	"syscall"
@@ -346,39 +344,6 @@ func (w *progressWriter) Write(p []byte) (n int, err error) {
 	return len(p), nil
 }

-func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
-	p := progress.NewProgress(os.Stderr)
-	defer p.StopAndClear()
-
-	spinner := progress.NewSpinner("")
-	p.Add("", spinner)
-
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return err
-	}
-
-	req := &api.GenerateRequest{
-		Model:     opts.Model,
-		KeepAlive: opts.KeepAlive,
-	}
-
-	return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
-}
-
-func StopHandler(cmd *cobra.Command, args []string) error {
-	opts := &runOptions{
-		Model:     args[0],
-		KeepAlive: &api.Duration{Duration: 0},
-	}
-	if err := loadOrUnloadModel(cmd, opts); err != nil {
-		if strings.Contains(err.Error(), "not found") {
-			return fmt.Errorf("couldn't find model \"%s\" to stop", args[0])
-		}
-	}
-	return nil
-}
-
 func RunHandler(cmd *cobra.Command, args []string) error {
 	interactive := true

@@ -457,7 +422,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	opts.ParentModel = info.Details.ParentModel

 	if interactive {
-		if err := loadOrUnloadModel(cmd, &opts); err != nil {
+		if err := loadModel(cmd, &opts); err != nil {
 			return err
 		}

@@ -613,7 +578,7 @@ func ListHandler(cmd *cobra.Command, args []string) error {
 	table.SetHeaderLine(false)
 	table.SetBorder(false)
 	table.SetNoWhiteSpace(true)
-	table.SetTablePadding("    ")
+	table.SetTablePadding("\t")
 	table.AppendBulk(data)
 	table.Render()

@@ -648,15 +613,7 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
 				cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
 				procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent))
 			}
-
-			var until string
-			delta := time.Since(m.ExpiresAt)
-			if delta > 0 {
-				until = "Stopping..."
-			} else {
-				until = format.HumanTime(m.ExpiresAt, "Never")
-			}
-			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until})
+			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")})
 		}
 	}

@@ -667,7 +624,7 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
 	table.SetHeaderLine(false)
 	table.SetBorder(false)
 	table.SetNoWhiteSpace(true)
-	table.SetTablePadding("    ")
+	table.SetTablePadding("\t")
 	table.AppendBulk(data)
 	table.Render()

@@ -763,89 +720,122 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		return nil
 	}

-	return showInfo(resp, os.Stdout)
+	showInfo(resp)
+
+	return nil
 }

-func showInfo(resp *api.ShowResponse, w io.Writer) error {
-	tableRender := func(header string, rows func() [][]string) {
-		fmt.Fprintln(w, " ", header)
-		table := tablewriter.NewWriter(w)
-		table.SetAlignment(tablewriter.ALIGN_LEFT)
-		table.SetBorder(false)
-		table.SetNoWhiteSpace(true)
-		table.SetTablePadding("    ")
+func showInfo(resp *api.ShowResponse) {
+	arch := resp.ModelInfo["general.architecture"].(string)

-		switch header {
-		case "Template", "System", "License":
-			table.SetColWidth(100)
-		}
-
-		table.AppendBulk(rows())
-		table.Render()
-		fmt.Fprintln(w)
+	modelData := [][]string{
+		{"arch", arch},
+		{"parameters", resp.Details.ParameterSize},
+		{"quantization", resp.Details.QuantizationLevel},
+		{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
+		{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
 	}

-	tableRender("Model", func() (rows [][]string) {
-		if resp.ModelInfo != nil {
-			arch := resp.ModelInfo["general.architecture"].(string)
-			rows = append(rows, []string{"", "architecture", arch})
-			rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ModelInfo["general.parameter_count"].(float64)))})
-			rows = append(rows, []string{"", "context length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64), 'f', -1, 64)})
-			rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64), 'f', -1, 64)})
-		} else {
-			rows = append(rows, []string{"", "architecture", resp.Details.Family})
-			rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize})
-		}
-		rows = append(rows, []string{"", "quantization", resp.Details.QuantizationLevel})
-		return
-	})
+	mainTableData := [][]string{
+		{"Model"},
+		{renderSubTable(modelData, false)},
+	}

 	if resp.ProjectorInfo != nil {
-		tableRender("Projector", func() (rows [][]string) {
-			arch := resp.ProjectorInfo["general.architecture"].(string)
-			rows = append(rows, []string{"", "architecture", arch})
-			rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))})
-			rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.embedding_length", arch)].(float64), 'f', -1, 64)})
-			rows = append(rows, []string{"", "dimensions", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.projection_dim", arch)].(float64), 'f', -1, 64)})
-			return
-		})
+		projectorData := [][]string{
+			{"arch", "clip"},
+			{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
+		}
+
+		if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
+			projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
+		}
+
+		projectorData = append(projectorData,
+			[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
+			[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
+		)
+
+		mainTableData = append(mainTableData,
+			[]string{"Projector"},
+			[]string{renderSubTable(projectorData, false)},
+		)
 	}

 	if resp.Parameters != "" {
-		tableRender("Parameters", func() (rows [][]string) {
-			scanner := bufio.NewScanner(strings.NewReader(resp.Parameters))
-			for scanner.Scan() {
-				if text := scanner.Text(); text != "" {
-					rows = append(rows, append([]string{""}, strings.Fields(text)...))
-				}
-			}
-			return
-		})
-	}
-
-	head := func(s string, n int) (rows [][]string) {
-		scanner := bufio.NewScanner(strings.NewReader(s))
-		for scanner.Scan() && (len(rows) < n || n < 0) {
-			if text := scanner.Text(); text != "" {
-				rows = append(rows, []string{"", strings.TrimSpace(text)})
-			}
-		}
-		return
+		mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
 	}

 	if resp.System != "" {
-		tableRender("System", func() [][]string {
-			return head(resp.System, 2)
-		})
+		mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
 	}

 	if resp.License != "" {
-		tableRender("License", func() [][]string {
-			return head(resp.License, 2)
-		})
+		mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
 	}

-	return nil
+	table := tablewriter.NewWriter(os.Stdout)
+	table.SetAutoWrapText(false)
+	table.SetBorder(false)
+	table.SetAlignment(tablewriter.ALIGN_LEFT)
+
+	for _, v := range mainTableData {
+		table.Append(v)
+	}
+
+	table.Render()
+}
+
+// renderSubTable renders data as a borderless, left-aligned, tab-padded table
+// and returns the rendered text with a tab prepended to every line. Cell text
+// is auto-wrapped unless file is true.
+func renderSubTable(data [][]string, file bool) string {
+	var out bytes.Buffer
+
+	tw := tablewriter.NewWriter(&out)
+	tw.SetAutoWrapText(!file)
+	tw.SetBorder(false)
+	tw.SetNoWhiteSpace(true)
+	tw.SetTablePadding("\t")
+	tw.SetAlignment(tablewriter.ALIGN_LEFT)
+	tw.AppendBulk(data)
+	tw.Render()
+
+	// Indent the first line, then every line that follows a newline. This
+	// also indents the empty segment after a final newline, if any.
+	return "\t" + strings.ReplaceAll(out.String(), "\n", "\n\t")
+}
+
+// twoLines returns the first two non-blank lines of s, each trimmed of
+// surrounding whitespace and wrapped in its own single-column row. Fewer rows
+// are returned when s has fewer than two non-blank lines; the result is never
+// nil.
+func twoLines(s string) [][]string {
+	const limit = 2
+
+	rows := [][]string{}
+	for _, raw := range strings.Split(s, "\n") {
+		trimmed := strings.TrimSpace(raw)
+		if trimmed == "" {
+			continue
+		}
+
+		rows = append(rows, []string{trimmed})
+		if len(rows) == limit {
+			break
+		}
+	}
+	return rows
+}
+
+// formatParams renders the newline-separated parameter text s as an indented
+// sub-table: one row per line, with the line's whitespace-separated fields as
+// columns. Blank or whitespace-only lines are skipped, so a leading or
+// trailing newline in s does not hand zero-column rows to the table writer.
+func formatParams(s string) string {
+	lines := strings.Split(s, "\n")
+	table := make([][]string, 0, len(lines))
+
+	for _, line := range lines {
+		if fields := strings.Fields(line); len(fields) > 0 {
+			table = append(table, fields)
+		}
+	}
+	return renderSubTable(table, false)
 }

 func CopyHandler(cmd *cobra.Command, args []string) error {
@@ -1335,15 +1325,6 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
 	runCmd.Flags().String("format", "", "Response format (e.g. json)")
-
-	stopCmd := &cobra.Command{
-		Use:     "stop MODEL",
-		Short:   "Stop a running model",
-		Args:    cobra.ExactArgs(1),
-		PreRunE: checkServerHeartbeat,
-		RunE:    StopHandler,
-	}
-
 	serveCmd := &cobra.Command{
 		Use:     "serve",
 		Aliases: []string{"start"},
@@ -1411,7 +1392,6 @@ func NewCLI() *cobra.Command {
 		createCmd,
 		showCmd,
 		runCmd,
-		stopCmd,
 		pullCmd,
 		pushCmd,
 		listCmd,
@@ -1438,8 +1418,6 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
-				envVars["OLLAMA_GPU_OVERHEAD"],
-				envVars["OLLAMA_LOAD_TIMEOUT"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
@@ -1451,7 +1429,6 @@ func NewCLI() *cobra.Command {
 		createCmd,
 		showCmd,
 		runCmd,
-		stopCmd,
 		pullCmd,
 		pushCmd,
 		listCmd,
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -1,206 +0,0 @@
-package cmd
-
-import (
-	"bytes"
-	"os"
-	"path/filepath"
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-
-	"github.com/ollama/ollama/api"
-)
-
-func TestShowInfo(t *testing.T) {
-	t.Run("bare details", func(t *testing.T) {
-		var b bytes.Buffer
-		if err := showInfo(&api.ShowResponse{
-			Details: api.ModelDetails{
-				Family:            "test",
-				ParameterSize:     "7B",
-				QuantizationLevel: "FP16",
-			},
-		}, &b); err != nil {
-			t.Fatal(err)
-		}
-
-		expect := `  Model
-    architecture    test    
-    parameters      7B      
-    quantization    FP16    
-
-`
-
-		if diff := cmp.Diff(expect, b.String()); diff != "" {
-			t.Errorf("unexpected output (-want +got):\n%s", diff)
-		}
-	})
-
-	t.Run("bare model info", func(t *testing.T) {
-		var b bytes.Buffer
-		if err := showInfo(&api.ShowResponse{
-			ModelInfo: map[string]any{
-				"general.architecture":    "test",
-				"general.parameter_count": float64(7_000_000_000),
-				"test.context_length":     float64(0),
-				"test.embedding_length":   float64(0),
-			},
-			Details: api.ModelDetails{
-				Family:            "test",
-				ParameterSize:     "7B",
-				QuantizationLevel: "FP16",
-			},
-		}, &b); err != nil {
-			t.Fatal(err)
-		}
-
-		expect := `  Model
-    architecture        test    
-    parameters          7B      
-    context length      0       
-    embedding length    0       
-    quantization        FP16    
-
-`
-		if diff := cmp.Diff(expect, b.String()); diff != "" {
-			t.Errorf("unexpected output (-want +got):\n%s", diff)
-		}
-	})
-
-	t.Run("parameters", func(t *testing.T) {
-		var b bytes.Buffer
-		if err := showInfo(&api.ShowResponse{
-			Details: api.ModelDetails{
-				Family:            "test",
-				ParameterSize:     "7B",
-				QuantizationLevel: "FP16",
-			},
-			Parameters: `
-			stop never
-			stop gonna
-			stop give
-			stop you
-			stop up
-			temperature 99`,
-		}, &b); err != nil {
-			t.Fatal(err)
-		}
-
-		expect := `  Model
-    architecture    test    
-    parameters      7B      
-    quantization    FP16    
-
-  Parameters
-    stop           never    
-    stop           gonna    
-    stop           give     
-    stop           you      
-    stop           up       
-    temperature    99       
-
-`
-		if diff := cmp.Diff(expect, b.String()); diff != "" {
-			t.Errorf("unexpected output (-want +got):\n%s", diff)
-		}
-	})
-
-	t.Run("project info", func(t *testing.T) {
-		var b bytes.Buffer
-		if err := showInfo(&api.ShowResponse{
-			Details: api.ModelDetails{
-				Family:            "test",
-				ParameterSize:     "7B",
-				QuantizationLevel: "FP16",
-			},
-			ProjectorInfo: map[string]any{
-				"general.architecture":         "clip",
-				"general.parameter_count":      float64(133_700_000),
-				"clip.vision.embedding_length": float64(0),
-				"clip.vision.projection_dim":   float64(0),
-			},
-		}, &b); err != nil {
-			t.Fatal(err)
-		}
-
-		expect := `  Model
-    architecture    test    
-    parameters      7B      
-    quantization    FP16    
-
-  Projector
-    architecture        clip       
-    parameters          133.70M    
-    embedding length    0          
-    dimensions          0          
-
-`
-		if diff := cmp.Diff(expect, b.String()); diff != "" {
-			t.Errorf("unexpected output (-want +got):\n%s", diff)
-		}
-	})
-
-	t.Run("system", func(t *testing.T) {
-		var b bytes.Buffer
-		if err := showInfo(&api.ShowResponse{
-			Details: api.ModelDetails{
-				Family:            "test",
-				ParameterSize:     "7B",
-				QuantizationLevel: "FP16",
-			},
-			System: `You are a pirate!
-Ahoy, matey!
-Weigh anchor!
-			`,
-		}, &b); err != nil {
-			t.Fatal(err)
-		}
-
-		expect := `  Model
-    architecture    test    
-    parameters      7B      
-    quantization    FP16    
-
-  System
-    You are a pirate!    
-    Ahoy, matey!         
-
-`
-		if diff := cmp.Diff(expect, b.String()); diff != "" {
-			t.Errorf("unexpected output (-want +got):\n%s", diff)
-		}
-	})
-
-	t.Run("license", func(t *testing.T) {
-		var b bytes.Buffer
-		license, err := os.ReadFile(filepath.Join("..", "LICENSE"))
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		if err := showInfo(&api.ShowResponse{
-			Details: api.ModelDetails{
-				Family:            "test",
-				ParameterSize:     "7B",
-				QuantizationLevel: "FP16",
-			},
-			License: string(license),
-		}, &b); err != nil {
-			t.Fatal(err)
-		}
-
-		expect := `  Model
-    architecture    test    
-    parameters      7B      
-    quantization    FP16    
-
-  License
-    MIT License             
-    Copyright (c) Ollama    
-
-`
-		if diff := cmp.Diff(expect, b.String()); diff != "" {
-			t.Errorf("unexpected output (-want +got):\n%s", diff)
-		}
-	})
-}
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -18,6 +18,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/parser"
+	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/readline"
 	"github.com/ollama/ollama/types/errtypes"
 )
@@ -30,6 +31,26 @@ const (
 	MultilineSystem
 )

+// loadModel sends a chat request carrying only the model name and keep-alive
+// setting from opts and discards every response, showing a spinner on stderr
+// until the call returns. It returns the error from creating the client or
+// from the chat call.
+// NOTE(review): presumably a request without messages makes the server load
+// the model — confirm against the server's chat handler.
+func loadModel(cmd *cobra.Command, opts *runOptions) error {
+	bar := progress.NewProgress(os.Stderr)
+	defer bar.StopAndClear()
+	bar.Add("", progress.NewSpinner(""))
+
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return err
+	}
+
+	discard := func(api.ChatResponse) error { return nil }
+	return client.Chat(cmd.Context(), &api.ChatRequest{
+		Model:     opts.Model,
+		KeepAlive: opts.KeepAlive,
+	}, discard)
+}
+
 func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 	usage := func() {
 		fmt.Fprintln(os.Stderr, "Available Commands:")
@@ -196,7 +217,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			opts.Model = args[1]
 			opts.Messages = []api.Message{}
 			fmt.Printf("Loading model '%s'\n", opts.Model)
-			if err := loadOrUnloadModel(cmd, &opts); err != nil {
+			if err := loadModel(cmd, &opts); err != nil {
 				return err
 			}
 			continue
@@ -350,7 +371,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {

 				switch args[1] {
 				case "info":
-					_ = showInfo(resp, os.Stderr)
+					showInfo(resp)
 				case "license":
 					if resp.License == "" {
 						fmt.Println("No license was specified for this model.")
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -208,18 +208,14 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 		return err
 	}

-	vocabSize := int(p.VocabSize)
-	switch {
-	case vocabSize > len(t.Vocabulary.Tokens):
-		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
+	if vocabSize := int(p.VocabSize); vocabSize > len(t.Vocabulary.Tokens) {
+		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", p.VocabSize, "actual", len(t.Vocabulary.Tokens))
 		for i := range vocabSize - len(t.Vocabulary.Tokens) {
 			t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
 			t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
 			t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
 		}
-	case vocabSize < len(t.Vocabulary.Tokens):
-		return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
-	default:
+	} else {
 		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
 	}

--- a/convert/convert_gemma2.go
+++ b/convert/convert_gemma2.go
@@ -34,20 +34,10 @@ func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
 }

 func (p *gemma2Model) Replacements() []string {
-	return []string{
-		"model.embed_tokens", "token_embd",
-		"model.norm", "output_norm",
-		"model.layers", "blk",
-		"input_layernorm", "attn_norm",
-		"self_attn.q_proj", "attn_q",
-		"self_attn.k_proj", "attn_k",
-		"self_attn.v_proj", "attn_v",
-		"self_attn.o_proj", "attn_output",
-		"mlp.gate_proj", "ffn_gate",
-		"mlp.down_proj", "ffn_down",
-		"mlp.up_proj", "ffn_up",
+	return append(
+		p.gemmaModel.Replacements(),
 		"post_attention_layernorm", "post_attention_norm",
 		"pre_feedforward_layernorm", "ffn_norm",
 		"post_feedforward_layernorm", "post_ffw_norm",
-	}
+	)
 }
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -15,7 +15,6 @@ import (
 	"os"
 	"path/filepath"
 	"slices"
-	"strings"
 	"testing"

 	"golang.org/x/exp/maps"
@@ -23,12 +22,6 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type tensorData struct {
-	Offsets []int  `json:"data_offsets"`
-	Type    string `json:"dtype"`
-	Shape   []int  `json:"shape"`
-}
-
 func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	t.Helper()

@@ -96,14 +89,13 @@ func TestMain(m *testing.M) {
 	os.Exit(m.Run())
 }

-func TestConvertModel(t *testing.T) {
+func TestConvertFull(t *testing.T) {
 	cases := []string{
 		"Meta-Llama-3-8B-Instruct",
 		"Meta-Llama-3.1-8B-Instruct",
 		"Mistral-7B-Instruct-v0.2",
 		"Mixtral-8x7B-Instruct-v0.1",
 		"gemma-2b-it",
-		"gemma-2-2b-it",
 		// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
 		"Phi-3-mini-128k-instruct",
 		"all-MiniLM-L6-v2",
@@ -148,132 +140,6 @@ func TestConvertModel(t *testing.T) {
 	}
 }

-func TestConvertInvalidTensorNames(t *testing.T) {
-	f, err := os.CreateTemp(t.TempDir(), "testmodel")
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	tempDir := t.TempDir()
-
-	td := map[string]*tensorData{}
-	offset := 4096
-
-	td["model.layers.0.self_attn.q_proj.weight"] = &tensorData{
-		Offsets: []int{0, offset},
-		Type:    "F32",
-		Shape:   []int{4096, 4096},
-	}
-	td["blk.0.attn_q.weight"] = &tensorData{
-		Offsets: []int{offset, offset * 2},
-		Type:    "F32",
-		Shape:   []int{4096, 4096},
-	}
-	generateSafetensorTestData(t, tempDir, td)
-
-	err = ConvertModel(os.DirFS(tempDir), f)
-	if err == nil || !strings.HasPrefix(err.Error(), "duplicate tensor name") {
-		t.Errorf("expected error but didn't get one")
-	}
-}
-
-func TestConvertInvalidDatatype(t *testing.T) {
-	f, err := os.CreateTemp(t.TempDir(), "testmodel")
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	tempDir := t.TempDir()
-
-	td := map[string]*tensorData{}
-	offset := 4096 * 14336
-
-	td["model.layers.0.mlp.down_proj.weight"] = &tensorData{
-		Offsets: []int{0, offset},
-		Type:    "I8",
-		Shape:   []int{4096, 14336},
-	}
-	td["model.layers.0.mlp.down_proj.weight_format"] = &tensorData{
-		Offsets: []int{offset, offset},
-		Type:    "U8",
-		Shape:   []int{},
-	}
-	generateSafetensorTestData(t, tempDir, td)
-
-	err = ConvertModel(os.DirFS(tempDir), f)
-	if err == nil || err.Error() != "unsupported safetensors model" {
-		t.Errorf("expected error but didn't get one")
-	}
-}
-
-func generateSafetensorTestData(t *testing.T, tempDir string, tensorData map[string]*tensorData) {
-	data, err := json.Marshal(tensorData)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var buf bytes.Buffer
-
-	l := int64(len(data))
-	err = binary.Write(&buf, binary.LittleEndian, l)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	_, err = buf.Write(data)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	fdata, err := os.Create(filepath.Join(tempDir, "model-00001-of-00001.safetensors"))
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer fdata.Close()
-
-	_, err = fdata.Write(buf.Bytes())
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	configData := `
-{
-  "architectures": [
-    "LlamaForCausalLM"
-  ]
-}
-`
-
-	f, err := os.Create(filepath.Join(tempDir, "config.json"))
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	_, err = f.WriteString(configData)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	tokenizerData := `
-{
-}
-`
-
-	f, err = os.Create(filepath.Join(tempDir, "tokenizer.json"))
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	_, err = f.WriteString(tokenizerData)
-	if err != nil {
-		t.Fatal(err)
-	}
-}
-
 func TestConvertAdapter(t *testing.T) {
 	type AdapterCase struct {
 		Name     string
@@ -355,6 +221,11 @@ func TestConvertAdapter(t *testing.T) {
 }

 func generateLoraTestData(t *testing.T, tempDir string) {
+	type tensorData struct {
+		Offsets []int  `json:"data_offsets"`
+		Type    string `json:"dtype"`
+		Shape   []int  `json:"shape"`
+	}
 	offset := 4096 * 8 * 4

 	td := map[string]*tensorData{"__metadata__": nil}
--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@@ -4,7 +4,6 @@ import (
 	"bytes"
 	"encoding/binary"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"io"
 	"io/fs"
@@ -49,19 +48,8 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
 		keys := maps.Keys(headers)
 		slices.Sort(keys)

-		names := make(map[string]struct{}, len(keys))
-
 		for _, key := range keys {
 			if value := headers[key]; value.Type != "" {
-				// bitsandbytes quantized models are unsupported
-				if len(value.Shape) == 0 {
-					return nil, errors.New("unsupported safetensors model")
-				}
-				ggufName := replacer.Replace(key)
-				if _, ok := names[ggufName]; ok {
-					return nil, fmt.Errorf("duplicate tensor name '%s' was found for this model", ggufName)
-				}
-				names[ggufName] = struct{}{}
 				ts = append(ts, safetensor{
 					fs:     fsys,
 					path:   p,
@@ -69,7 +57,7 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
 					offset: safetensorsPad(n, value.Offsets[0]),
 					size:   safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
 					tensorBase: &tensorBase{
-						name:  ggufName,
+						name:  replacer.Replace(key),
 						shape: value.Shape,
 					},
 				})
--- a/convert/testdata/gemma-2-2b-it.json
+++ b/convert/testdata/gemma-2-2b-it.json
@@ -1,312 +0,0 @@
-{
-  "general.architecture": "gemma2",
-  "general.file_type": "1",
-  "general.quantization_version": "2",
-  "gemma2.block_count": "26",
-  "gemma2.context_length": "8192",
-  "gemma2.embedding_length": "2304",
-  "gemma2.feed_forward_length": "9216",
-  "gemma2.attention.head_count": "8",
-  "gemma2.attention.head_count_kv": "4",
-  "gemma2.attention.key_length": "256",
-  "gemma2.attention.value_length": "256",
-  "gemma2.attention.layer_norm_rms_epsilon": "1e-06",
-  "tokenizer.ggml.model": "llama",
-  "tokenizer.ggml.add_bos_token": "true",
-  "tokenizer.ggml.add_eos_token": "false",
-  "tokenizer.ggml.bos_token_id": "2",
-  "tokenizer.ggml.eos_token_id": "1",
-  "tokenizer.ggml.padding_token_id": "0",
-  "tokenizer.ggml.unknown_token_id": "3",
-  "tokenizer.ggml.scores": "0872465d173867d755d3ee728f882b9dc2057a0bfd596fe1e3d131522f1250d8",
-  "tokenizer.ggml.token_type": "8d40143b3477df77beea4139420335ede458bf5e14102f01b0170197b55da8d8",
-  "tokenizer.ggml.tokens": "c6e66de1841f04de8b8d236d461ab720a4c9b9b5414dc293a09c6e10eab45fda",
-  "token_embd.weight": "64a9d30707e659e2e673656d71f5aef7a9fb9fd83bb9a77558dfc5abbe218a05",
-  "blk.0.attn_k.weight": "d8b4437c5edb3cddf6af9987038e1bb2b191c4f0fce0e160d2abace717f5d5d7",
-  "blk.0.attn_norm.weight": "1eb73e3f7aa8e502f6ca31cd19efbb8e4fd9a89692e13e48ac8205545a7fa7e8",
-  "blk.0.attn_output.weight": "39e7b78e57d356a22dd89ce1c4d7163b970712ba756545e1703f97866cd2192e",
-  "blk.0.attn_q.weight": "795058e23b6109febd9d55c89e1eebe6af0714ec8c56fd86a160876a6135ffe8",
-  "blk.0.attn_v.weight": "0cd6e583d1887c020472e961bbb113fe5a0d23ae2f1c2c876fc366cdb7692b52",
-  "blk.0.ffn_down.weight": "51eb4d962189e945a84e94e0dc1aad3f8f90cc1a11e18029670afcd0ea0acb1b",
-  "blk.0.ffn_gate.weight": "9811a29b8ad48432925897ab21dfcb13c5cbd372aeccbbefca9b7866883b4ce3",
-  "blk.0.ffn_norm.weight": "92cbf4652ef503c1de5b10f2be00b3fcf00100980cb3baa8f3013a8d8bf3d851",
-  "blk.0.ffn_up.weight": "af87de21746879483ed1b374cdd76b19ba11ca2b6dbb1beba98efdf3be3e8077",
-  "blk.0.post_attention_norm.weight": "32e135f1f258ffe407018899e39af1725d59d66d60022b9a21575ba160e0357a",
-  "blk.0.post_ffw_norm.weight": "ba286f5ac11b07fbc986173708c66f1920427be5a6d108af38fa0a837c1c8eb6",
-  "blk.1.attn_k.weight": "51584435552051f7fade76beca582b3f7190cf7fc07adcf527c2774d4b1c3901",
-  "blk.1.attn_norm.weight": "6833104c7fbf35a7e799ae56c262b97fffa14789642aee14381b25acd21ed80a",
-  "blk.1.attn_output.weight": "14c39481369087bf292ac9a3ab2ef166f9fe376a9f90c246653213ef264febdc",
-  "blk.1.attn_q.weight": "443f64ae2229f857c69d6bebb7800b685786cb77884c3ae19d4286aeed081325",
-  "blk.1.attn_v.weight": "0df482de2038f1e4c8a7733ac0ddb69ad90759dab5968b942af0155588de4c4a",
-  "blk.1.ffn_down.weight": "66f30763a8bbbcaea609a0087ed75fadb5e771c06378dd2cea94cf17e492e8cf",
-  "blk.1.ffn_gate.weight": "a7151bff00a545fa18b2c92dcd2a14572ccf9beb957a6c494f1374e8ebe174c9",
-  "blk.1.ffn_norm.weight": "e197d71ea11b5276bc0167d2663b88089b3ff42b47ba91e85f6c5d95f6306435",
-  "blk.1.ffn_up.weight": "57c182e0b14cccd1350d388f0c616991702e74281db54637451b70f4ccc24f9b",
-  "blk.1.post_attention_norm.weight": "3c56f837168d784c2d8bac247c130bdca6610c095c8da4558c536ccad7605609",
-  "blk.1.post_ffw_norm.weight": "d2a51d320fd01069dd7ccaa7082f16a7faeb671885607d7900b10a89c354d0fa",
-  "blk.2.attn_k.weight": "bc103c818192de7ce36caaf89dc117be4df13fb902e0bd9a23c64edace5df9b6",
-  "blk.2.attn_norm.weight": "0f2503aa126083a5d6ac72481be1ef66c6014705b573682b35bd864e4749a3d5",
-  "blk.2.attn_output.weight": "05fcd4a1226e482f91803a266f72caca887a93e63c2d2ba5611ab3c68d38743a",
-  "blk.2.attn_q.weight": "6a10b5c2fd423d1e4c4fd60fa8c154a0159b6b2501ea79cae2ef19f45a674e5e",
-  "blk.2.attn_v.weight": "3cf891945a1f8ae7cc908a5c6b729ff5b70f4436c5ffdbf245cc0ed4cc19cd1b",
-  "blk.2.ffn_down.weight": "ea204fd04e0d2fc728a9861a459216bbfec629c152004ba625f52cd8837bd51e",
-  "blk.2.ffn_gate.weight": "3a3518729f1b8b64a82b8792f33987db5418fdb094be0263c68f146a5c38de54",
-  "blk.2.ffn_norm.weight": "754ede678b725de41a34b82f0edf7688b5c065be7c0d46df6f7ad9430d986884",
-  "blk.2.ffn_up.weight": "ffdcb88439f5828ffbd9fc844b03ff91637b790b9838097258cc3ae75935720c",
-  "blk.2.post_attention_norm.weight": "4b3f53b7ba26e8c36b2dfda3b7e5fc4b1065257cefdea235fc7df9af130ac2fd",
-  "blk.2.post_ffw_norm.weight": "e550369e26b8485e2b54ad34b34bc98af5494287dcc513c2c39cf1eaa5b89d07",
-  "blk.3.attn_k.weight": "89f24ea450e37d9e95757651a83205c085d81b354ee9489dd6310a391d8409f3",
-  "blk.3.attn_norm.weight": "24e2ea662b7cb822b4ca5cd61bc17f2709f406d990ec3b4a0dac1cc112db45cf",
-  "blk.3.attn_output.weight": "ac4dad69473c6e3fac56669212cadd8c34ecc5973d945972e974d94805334967",
-  "blk.3.attn_q.weight": "b6a9c9a7d4722b9096631c65de62228dfddca6e26edfe6af7fce01e116ef0f4c",
-  "blk.3.attn_v.weight": "f272a960a40093942309bc342a379984cbacec2d7bc64428db3f64e6b1887ed4",
-  "blk.3.ffn_down.weight": "c0188ba50d8228805982029c277fc0e87aa57473b8363037c648f6d006ff828a",
-  "blk.3.ffn_gate.weight": "a04aec1561ee6c0fbb18c3db49dc62fb533619cf697fd548cbf2279761aaec3b",
-  "blk.3.ffn_norm.weight": "bc053837d44087ec05eb5d9458357b2a5be787789b19cdbbdc694b57697f99a6",
-  "blk.3.ffn_up.weight": "b3ce8b274f20796d3b1a7c08ba27a919066f9de89a782faa544c4a8d6bea1382",
-  "blk.3.post_attention_norm.weight": "9c922dee7a7df5667289e2788e60170238239cee2dfdbbd9e435763f9f416718",
-  "blk.3.post_ffw_norm.weight": "b682544ac953ad2e0b49027ed8916f2e9d1aba5d1587bb4127ac703570c7a03a",
-  "blk.4.attn_k.weight": "143b0cbb4b787b95c2b6212374410e32173ccef2adb914908a2f89a7916de512",
-  "blk.4.attn_norm.weight": "5668f60491b780273745192662d02c9a92a4f692b29d16aa0bbc7413fec4f85b",
-  "blk.4.attn_output.weight": "b9f2bdb68be1e0cf66dd19f8fa2afb105910ad2ef394864cb32cea8f8944e0d5",
-  "blk.4.attn_q.weight": "ddcf1343dafbc2dfcd0b8741225af22fe4b54b2becce29240bd01c34265d126c",
-  "blk.4.attn_v.weight": "6dc7074366e7ed52d9f48c594dcc85bef738e096276cb99d28228c89eecc5b9c",
-  "blk.4.ffn_down.weight": "30334ffc59ce343cf2a1b973174acb7722823463adc07e19a99bd0f404bc9906",
-  "blk.4.ffn_gate.weight": "890f7c8af208d63b28db52c4b8c16c2288a382d87ff5a6a6d6b0a5b3bf27e6cd",
-  "blk.4.ffn_norm.weight": "ff0316cc7847221eb86a90c1ab441d4ee61553d410c66414a7755021b3b12448",
-  "blk.4.ffn_up.weight": "6af97d113f91564c636734f215e25ee602d48eb045458f300b3ec7582be0f41d",
-  "blk.4.post_attention_norm.weight": "69438f231e105e68216b078bdeb35a7cdc8b12c4e2845e18ecf4c8d361d6a321",
-  "blk.4.post_ffw_norm.weight": "0fd535da78bcf2b32c95b05b2b83dc49817393765be90d8cc1ed3d56f47b68ec",
-  "blk.5.attn_k.weight": "0166eb3c6d20dcf3d3c169e94caa8dee057535bb525e29f698fb6f8844f18a6c",
-  "blk.5.attn_norm.weight": "a7808f27f164023d5cde2be00fc23cac6c71aa0ddeb60bc23e12411b80087672",
-  "blk.5.attn_output.weight": "8b65b2027a0842b68c5308f91d6a31de9599d794157d77df8418b19f9e0d9334",
-  "blk.5.attn_q.weight": "966bc626ef2c2394d872087a41c126bb1b67d1d5f6de920204ef5e5b16c34003",
-  "blk.5.attn_v.weight": "9a362aef3f4437fbf0ef6e1ba785f3329c3db2960f93fe36547d2795e9c254ea",
-  "blk.5.ffn_down.weight": "63e53541d34197720c06f297aa8142ac6b6eec002c7987b296f26e8b1400f931",
-  "blk.5.ffn_gate.weight": "d9591fdd32f783e0fc26e20d5d587ee8971ac8ae2e4c818c6eac1c125c7c7f37",
-  "blk.5.ffn_norm.weight": "677334cc60ecce3a7f4ab3acda15d359353d7358872f614ad8914e3780e9fc6e",
-  "blk.5.ffn_up.weight": "a63764110e1c655ffbd55af0669b2dfe4cc29d0e198d33a8e5426461b08a85f7",
-  "blk.5.post_attention_norm.weight": "c55499f859b2c0a7f5cabceaae47309a5ad38bc29d0f4a8db81f1357023162a9",
-  "blk.5.post_ffw_norm.weight": "82752754665f842418f3e302cb5f43d1e0504dcd124c4b8ddb77018b2c793837",
-  "blk.6.attn_k.weight": "e20a5f0d6c807273c8d491439566b428497ac02097cf0aa55e33748c28e14be6",
-  "blk.6.attn_norm.weight": "2c6ba42fd3c73d72073ced03a32dd28d70a89ed9bbbc8fea1ba03a7ade951e6c",
-  "blk.6.attn_output.weight": "4de7c5c2f4a133a266e17ed8c14c52959466b54cc7ab9e19f789a33b4850f284",
-  "blk.6.attn_q.weight": "56462d921800e6b8cd2213fef04c4ff16d728905cb2f4c58e966d0a053a3b0ae",
-  "blk.6.attn_v.weight": "b758dcbff769d6240c2245ede1dbc62c4170a67c77458e866312589220fe29af",
-  "blk.6.ffn_down.weight": "582247fb3c2bf687cbe9413fe18d18ad47bef4b65df7d78905e10335c6134764",
-  "blk.6.ffn_gate.weight": "3035444d5286aefb7a6d04e55bc27e1fac7cf895cd5be02319a431b8e047b4ae",
-  "blk.6.ffn_norm.weight": "e582d24c66e01b96faa20ce6adfda3d8583b11e809bff89969927398175e369a",
-  "blk.6.ffn_up.weight": "6f4b7bbfedeacf61a4866ae0616c4ba6c9e856662e8f00ae6aaec7f52c53e7b4",
-  "blk.6.post_attention_norm.weight": "8fe51b50bd677d21586aecab0b565c4bf9fa68ad50bfe366f45e8fea3c657ca8",
-  "blk.6.post_ffw_norm.weight": "81ba3cb4c2bf5c546b86855b7a885d3fafededc67eb3a35cd3598b03c9e26e65",
-  "blk.7.attn_k.weight": "2e044179cdcae0946708c86bfea7aa0391e1f7e2a09b33fca035d384cc3ca758",
-  "blk.7.attn_norm.weight": "94b48c546b046803c60e75a3acb17a356b710735989938021b565f68df9b4985",
-  "blk.7.attn_output.weight": "65709b4ad7a581f4d75793d39d4032a359f6bcc0c3835205242a0b99e5b66824",
-  "blk.7.attn_q.weight": "8ded993c95d1f7caf201ceb6fa035cd6ed6d351b50b999fa9355dfee9486cb5b",
-  "blk.7.attn_v.weight": "c92d5e2d2d48397542bc03bea25bf39154075e66c5bb1ead85188505aa04ae91",
-  "blk.7.ffn_down.weight": "e8ba8fb57208805ef1dc23cd7c86e9a2d1fb7c52c3940d292cd5bb2eb24b3fac",
-  "blk.7.ffn_gate.weight": "f0f06d6a2e06c5ac252083bc61d05c814e6289d3f4e4a87d2f06918254c02c36",
-  "blk.7.ffn_norm.weight": "ebf8ef775f72624148e09d68a4332187a7a5020c521fe0623da1cd3485ad33e0",
-  "blk.7.ffn_up.weight": "a554adc4fc7122c247c77670e169916ba1794c787b5be30a2b36705138f1f746",
-  "blk.7.post_attention_norm.weight": "3aa6bc21d85c3a0c12b964e82b12feaedfdd13130c3cd2229228e24e0967ebdf",
-  "blk.7.post_ffw_norm.weight": "508bc7b19ee8ff08f0007c890133a462fc57c7e72b16ee8f6dd64def264ef876",
-  "blk.8.attn_k.weight": "363c8e74056642fe9e7c2f3f9769d57319cd3fa0a6022810189ab8d894322885",
-  "blk.8.attn_norm.weight": "685b49a1f1acb169f4df0bdd8e3de6943f3033cebad14b898a72000595610d92",
-  "blk.8.attn_output.weight": "7bde571e4efef1c6a6143f0526721dfb59e0a0ea0e1a3616a322b2eb937efa48",
-  "blk.8.attn_q.weight": "fc993dbc1074c28a0e1d85e5ab2f4ea6a9c6c1affe7ee56027000a275daed9b6",
-  "blk.8.attn_v.weight": "281e8791d3aef9b3864f1cb054da0ae0c2fef4ce0a58b1bad8bc136b2fa0f62b",
-  "blk.8.ffn_down.weight": "b1164a2578a7f87ed99c2bbc76c5dfbbbc6a1a803605391acc3f320fc989ffd7",
-  "blk.8.ffn_gate.weight": "6b39a3b3aaaa79aee61416b54d62160b9258042650e61c6b47bc77c2dd17daf3",
-  "blk.8.ffn_norm.weight": "17ea1362c72da27f12bc936500492035bdef3fd8f940cb12b57f37d42ba8ecb1",
-  "blk.8.ffn_up.weight": "bc3a7c47afc440d2bdf8fbe9ddf2c9220467472c60c8b4ded8c0f181470ec96c",
-  "blk.8.post_attention_norm.weight": "5c506204e00411ef9c8b4134d40eedcc19fffe68dd0af7d7cc49dcabf2dfac7e",
-  "blk.8.post_ffw_norm.weight": "002faec235c3678864e2901eed275ce4e9dc229164a91c9cd4c965142ba62305",
-  "blk.9.attn_k.weight": "0bab39d8c237f1b6d0010db40467142625a9e6f2e0e4c49a56c12b41e4e0b1fa",
-  "blk.9.attn_norm.weight": "de5f38e873b17f07aa7598831b89cc1cae2c9bc3eb2e042ee9af059d2563e84e",
-  "blk.9.attn_output.weight": "8a8184702c25a62df9ff309c0c7badc8587208523b2be3e8fa90ce7080573e6f",
-  "blk.9.attn_q.weight": "7c961b2431b09ddf95377acd07201cb91bf13d9cd3ae0f2c25c7d6a0358d9f50",
-  "blk.9.attn_v.weight": "e22d240cb4743067033e659cbf210ebe2ebbab3e1dea6ccbe5eaa982382ca038",
-  "blk.9.ffn_down.weight": "a426f81210f03d6ad53277416e1fdcdf37d8065e4817613edaf6c67a343426be",
-  "blk.9.ffn_gate.weight": "a82eba825cb77b8e64f85ff99ede2fc71bc9b01751eeb17e9e6c246ee12ea62e",
-  "blk.9.ffn_norm.weight": "1a97f9b1302a3a326d534c5c3fed2db6db0ae45fd0edd381a3e4fc1c75d81030",
-  "blk.9.ffn_up.weight": "5f20bac2bbf03bb42adb92fbf99561651e1edda57e0b61935ac7f6c08c0ed7cb",
-  "blk.9.post_attention_norm.weight": "9f9866d13988e1946b1e1c80d9374a92a6e3be33748f8eaed3e126d1e1a4c796",
-  "blk.9.post_ffw_norm.weight": "a6896dbf698db4dbbe5dbf12417d4fd80e9cad0c539c858892ec0aa5b046bb58",
-  "blk.10.attn_k.weight": "ca8446e5d21ecd4e6a70dca8d321be480be4fba94d70cba065205436feb44270",
-  "blk.10.attn_norm.weight": "4f41fe290e8f21f63b82151b6cce94bf7318d121468816b0c58af0ff7c1658ab",
-  "blk.10.attn_output.weight": "c626d2e9681c5c941bbde43dddfae1a8d4986bf2be4470857bc8e8bd7f869044",
-  "blk.10.attn_q.weight": "1e61b210a13a429977325cf15d781ab77d604cfa862f4270329cbd94237d5835",
-  "blk.10.attn_v.weight": "8ff8d3e3f058ec3b35ada1057f2ed59c06494d0e0be6a8dc3ff9edf9f0e1a115",
-  "blk.10.ffn_down.weight": "bcebc04219f8081a5f483e58103c0ddbbbc631a0a54fd6dd9d55778e041f70ee",
-  "blk.10.ffn_gate.weight": "7a23a1e620ef871384ddf9611ccdcfb893fbf013cc203ac8e72f745420f1eea0",
-  "blk.10.ffn_norm.weight": "e3a375e43c349a1c6c66c22328e513cc1af3137fe839e43dc8e9be2f65914fd7",
-  "blk.10.ffn_up.weight": "5d182e7c94369194fca5f19cbbe668a999911e57f3d363bc7fb6088428700cb9",
-  "blk.10.post_attention_norm.weight": "b841c6308296e8984f3c5f549c6e3a242f4b3e19141e1f54cc08de9c46759c09",
-  "blk.10.post_ffw_norm.weight": "9d66fa05b5c940208f634f5053d809094c99a2a10a1d1e8847c8281fbd99fb49",
-  "blk.11.attn_k.weight": "14adf24ebb2bb17b336ca81cec3e690fd854782f4440ca6c66cc1d7e7bf1c850",
-  "blk.11.attn_norm.weight": "2d2213f311f50414702b5b34f22aafb9d9a0b6787243e7578562583dc40ad195",
-  "blk.11.attn_output.weight": "de1f14cc2a7fff00cf11b229f0576999205f17b9536e97abc9d6de3cc79a7884",
-  "blk.11.attn_q.weight": "2bcc5c147524003109ece0be08b89ac8b25baa71416ffa76573c6c052ffc6eea",
-  "blk.11.attn_v.weight": "2e6ab8573070c22dc1e0d7aebe4d52123226dacf7822dcce06fadbb38fb036a4",
-  "blk.11.ffn_down.weight": "1b86902f4e36868421e5228b9445051f8290b292df22a6d1af836dcecc1f25c3",
-  "blk.11.ffn_gate.weight": "e756e8081bd0a16aea4a9ef5076ad102113524f7a3d50a3a77aaa7f7938b63e8",
-  "blk.11.ffn_norm.weight": "6913887267be227cf9d1991a3dd8db2e7e74bb9b5fbdfcb9ac954fd7d7b95b3b",
-  "blk.11.ffn_up.weight": "619a3ac0609ebdf42c3fb2b6e4b1db48df79e6dd8418d7ab8f1bbff13d8a6a50",
-  "blk.11.post_attention_norm.weight": "e4b4ba92cef7b6a78407e8ab1b0307d47dac6c3df7b6817e28038317ff662d7e",
-  "blk.11.post_ffw_norm.weight": "40aceeec58cb855f0c158c9cc217168fcd5d0e735567d587217b1d78df17bc5f",
-  "blk.12.attn_k.weight": "c54c5a4d4892522022d1aa2204cfc624f0b4042caa536e678967316293fe5cb1",
-  "blk.12.attn_norm.weight": "7cd2ef58298569ffdf244d9b390f3917245276c8206e5780af5f96d8c0bbb446",
-  "blk.12.attn_output.weight": "85495ef9cc8b3deb21f741bde463ff6493acae2be51f02ecdeef952cbdec3375",
-  "blk.12.attn_q.weight": "d19383f83fd119bfb8c0280c9515705c11d8e7d502019fcf8f49efeef0d106d0",
-  "blk.12.attn_v.weight": "869ac669ba49531d9128892a0e27cef15de508ff40cdf80cc1681dde50d09204",
-  "blk.12.ffn_down.weight": "578f39f8f9fc2f09138afc884a952d7cc3a9a31de4216acd10e88e19e0b75f8c",
-  "blk.12.ffn_gate.weight": "e29a0186bc6c4a0720246306e922d3a83f777dadcf4ac80bad468287031cc8b5",
-  "blk.12.ffn_norm.weight": "e1ee95c6584b5cb57fcf1db8ce2bcc03aff91eb389238c094a61c00dde93d1f2",
-  "blk.12.ffn_up.weight": "2a826f06d7cdfb3edc6ae250ff44363ef77a2a9cdf96313e23a331b99ebfa17d",
-  "blk.12.post_attention_norm.weight": "4bafc7699b948d5cbc0d3e09b418b06c6abc4651a61ada9609d9a2f21c7e5607",
-  "blk.12.post_ffw_norm.weight": "bbb8c34a7176bb1a49f9fe2bacca0bd26b673d52c0835b2e90fa11f2962f077f",
-  "blk.13.attn_k.weight": "ffeefccfe8255d1b694382012ff4134eee5fec9d9491c8d0ff0a13832d1a37e8",
-  "blk.13.attn_norm.weight": "35713726529e3887c4135a88e86e8a4d7270ba5b9f2d1ab462622fbf40a7cdce",
-  "blk.13.attn_output.weight": "0d60b7c5cd71190a9ef4b873b0f516be15447c32d83914db2794b14592b0b460",
-  "blk.13.attn_q.weight": "8296069e65bef794cefc61257fc65789b3cb22955e30f3df129205e5041b2222",
-  "blk.13.attn_v.weight": "ca0f4ab9d16a748fc643a5c0c7a19826a811bf2a4e7316a8c935d4bf0ce8abc6",
-  "blk.13.ffn_down.weight": "d5514e0c8e7b3ed1cbcc1605eb5be1733b6ab3514cf8a0508fc72f7d05ed8bcb",
-  "blk.13.ffn_gate.weight": "8108e517a82e08a3aefbbd267bfa50a1668f92a76273280ce8a6bc1f6dd61521",
-  "blk.13.ffn_norm.weight": "5fcb6132d2134bf1f835b904a99820fa501dbc57d2224129f7098bf3cabc1d36",
-  "blk.13.ffn_up.weight": "6d744b7cd390a3cae3aa350dd379b81246acd056a2259996b6aaadece8465ccc",
-  "blk.13.post_attention_norm.weight": "e08b14698912509790e9575b8676971fbb0a4d82d719367e3756c0d0c4ab8cc0",
-  "blk.13.post_ffw_norm.weight": "2b196e4450fc5f1e7367b2cf7fe33a15fe919fbcdd861d11002346f16e980535",
-  "blk.14.attn_k.weight": "120e5f48d7268dfd9ab5f4bc9cc57a7cec63ea9635f56b80d435eb22936e9483",
-  "blk.14.attn_norm.weight": "146367bcce4db72cc894419a2e0145a6f533507dd68e4739c10ee480308c401f",
-  "blk.14.attn_output.weight": "720fa0165e756876c5cb6ad9e2780dd910390933f3f8849e5add5da04266650b",
-  "blk.14.attn_q.weight": "f5183466f56219ca1aca52d8b82c2d966a4198fea40fdd6b39f4d8b06ca2a6dd",
-  "blk.14.attn_v.weight": "24f8ea3d5512cd37c43c8329cb0da0c90d1895aef763ac2dcee3fe5157ec50a2",
-  "blk.14.ffn_down.weight": "e29960965b384ae5ab3d898a4dbaa8fddd28fa0e477ac28bcac49dec12a5ac67",
-  "blk.14.ffn_gate.weight": "6d0d6a74bfe9692e8f8eedff0fc34fc4fa1c8687794f35f2e2b033ab2d7510b8",
-  "blk.14.ffn_norm.weight": "f7036c1a9a71e046c9d2af16e9218fda5dbb0f7241ab44747abed1f0f9d602ca",
-  "blk.14.ffn_up.weight": "7d69ea1424007ffc9c12247dd0308c616e93ac02a59ec341cfa48f92d6ce3b10",
-  "blk.14.post_attention_norm.weight": "65b9712834d9445d4236bec362f3fb795c20d60c541b3dc6dbb7914d9b493e41",
-  "blk.14.post_ffw_norm.weight": "9c6a8da2e4e437d5cfdf3b9097e9f8b64bf07946a048badec20f4d374613f38f",
-  "blk.15.attn_k.weight": "864bc618303a0e4ee67fb1d5e751de61e936cd51e96669dd86f8cd08f2305045",
-  "blk.15.attn_norm.weight": "f9f4187da6eeadc2fc5921d8fe669741697d16c13d71e4aaeb73b82f50dc577e",
-  "blk.15.attn_output.weight": "ce2419a0b097036b2a31f2f4ad731d5814bcc2ef4c511786e24471e5eefd273b",
-  "blk.15.attn_q.weight": "9539db5a970d11ebe99722d1e13fcd635e250033630811efe583d2f97778e4a9",
-  "blk.15.attn_v.weight": "1c834b48ccd88adaeabb7d8bcb6be0bcd6d5ac1354ce88fc28f19a1a96b81ab3",
-  "blk.15.ffn_down.weight": "bc1f97a65dde6fa2c1e5397afb612266944b343f2eaa868b635ddd25829f8a42",
-  "blk.15.ffn_gate.weight": "1b14529d57056b79037f6cb5008132e62cc35992353b38dda59572274623103b",
-  "blk.15.ffn_norm.weight": "9af77458de9ee55c66f93865759f9c2c398557f94f3fa8fa6af30543d7339cde",
-  "blk.15.ffn_up.weight": "41d524a26b61a9595816b4fd53cf57ef50a702e4ef32933ff6136dca9136a267",
-  "blk.15.post_attention_norm.weight": "c60a03cd0e63a7db5c80015e58e9b97ba2208caa19f66a6fef5c4447eca900ce",
-  "blk.15.post_ffw_norm.weight": "34f7f9f96769215bbc3d17084df091864aef96a6645b7d0b3b7d9bd92f1a4b0b",
-  "blk.16.attn_k.weight": "7e27240d9f3a8c6cf0f4a980113d43234f514eadc3e3e1792b86efb29ffb1a6d",
-  "blk.16.attn_norm.weight": "af798acc0899282a30448edec48223b3e8efda177090273e612d8eca5e377301",
-  "blk.16.attn_output.weight": "79df39a3709d3d53e84146291e0944a7a653d06705293d9ccb5648dceadb432c",
-  "blk.16.attn_q.weight": "db58a1c3b83ad294804e5fd7321005719e200659173466df5a52a182b80b7165",
-  "blk.16.attn_v.weight": "2af6d48cbaeb225b5c1a704f76abd89c8ab1521417695b112b4dcc2cbd39b74d",
-  "blk.16.ffn_down.weight": "fc1c813eb5e7da3d6194569d6cb21602fc6eff2dc8e1b0eb753f2d5df148189c",
-  "blk.16.ffn_gate.weight": "7a80bcbc42464bd55df4814a6edbd7b5c153e0428323bbe49de55e2d2add33e7",
-  "blk.16.ffn_norm.weight": "2041685ee926d30f3f2ae4ec35b5688f1cd834167a6359a7d4057eac804c58b2",
-  "blk.16.ffn_up.weight": "8da4b718973ac1d43b928829bc45e062fd101984d6c98dd825bd7c5d08ebfbe3",
-  "blk.16.post_attention_norm.weight": "975c48fe680a6167438a106140a8872eee7765191f152d80e3b8ddf47693e095",
-  "blk.16.post_ffw_norm.weight": "4de2d4d483acfe4fc77860ea929025df2f4e15c10729413f36a18c94eaa6d689",
-  "blk.17.attn_k.weight": "f937e61f0af8c4cd98ee742648eb60e02e579683e21d421071295a3b70aebaad",
-  "blk.17.attn_norm.weight": "c3270583ed28b7e423f5b170c59113234f258169b93a867d9274f4c10b7cb115",
-  "blk.17.attn_output.weight": "b8c1150e81e685e539a5dcf2c19047a24eba2b281fabe166674b1d71ef4612ea",
-  "blk.17.attn_q.weight": "c255100ae2011e7dc7e3bf3bc3ccd96d859fbb98581cae993d7b82c1ba8e8b39",
-  "blk.17.attn_v.weight": "5830bb0a555984c6485348067f70b5d22ae337c011aa9248dac2ff4c95944551",
-  "blk.17.ffn_down.weight": "8ff9a7cccaa3776434a9d895aae4fb5c36c736bf2ec98784226b4c234940fbb0",
-  "blk.17.ffn_gate.weight": "1b52876739712831c272911533da206f407b46034a1a4ae8a88c1f96b6bd5747",
-  "blk.17.ffn_norm.weight": "d0e16ba5e87c91b545334e022058c7d03849665c3b1a6298771b656531366b66",
-  "blk.17.ffn_up.weight": "4dd6211d01dbebbe21052708eddc242b082a58b5f18ed16479e17987c1d3432e",
-  "blk.17.post_attention_norm.weight": "6f49c775c7417dade77ba8268a0f8441c1e5ec28b5d7e4dc5ed07a04d04600c8",
-  "blk.17.post_ffw_norm.weight": "b91a0bb2e6679e9c9be06ad323adae441d00a3d673efb19d7c4954be2aa84b27",
-  "blk.18.attn_k.weight": "22b565ace1b4da8b33865a58625be1d90beea9891f29686a69fa9cf7c93217db",
-  "blk.18.attn_norm.weight": "3e0160d7063c8753de65d2356a66648e47d921efdc5c917efb8209892120f8db",
-  "blk.18.attn_output.weight": "e3180f0bb4ca90b31e9b08158db38e332de62dfbaefe34aa94cc316409331e09",
-  "blk.18.attn_q.weight": "f3a5a83614c3ba7ea41cdd5b1b0819a241ee2a951a381ce4a9e001d3f700ed8f",
-  "blk.18.attn_v.weight": "f3350a5984fb951fc738adcf78147e6d812ff1c576670c460cafc99c253c1654",
-  "blk.18.ffn_down.weight": "9e9d09b13a33525e14bdaee6efc65c551ac7cf7680e534b940ab122a3a7c1ac9",
-  "blk.18.ffn_gate.weight": "ebaec8b4b578a2e8d815baac12f1675c208f80c68074d5a18288a2e1a60680ee",
-  "blk.18.ffn_norm.weight": "33e7687c53a242f2f8dc7093a491c97b18d4a5a8c14d183f02bd586a770f05aa",
-  "blk.18.ffn_up.weight": "78a1816662378ce56cc870e705174492781897b3afd2d4d97a51f10f2f2987c1",
-  "blk.18.post_attention_norm.weight": "a58dde3f12df3e94cbc27d87c8ea86f89af8a388a506446ff6758f05399b05fc",
-  "blk.18.post_ffw_norm.weight": "cebf90cc143577d483cca27b032dfd82031ee59bdf17c0e2cf60a0a3ad5bf996",
-  "blk.19.attn_k.weight": "4683375d0599ac9e2232196aae1e90af13a14cae26e865465de5c8e257bb2055",
-  "blk.19.attn_norm.weight": "f3eba936bfb1814bbcb0a1d62739eb66daac839df8c9c836fe0e94860df88525",
-  "blk.19.attn_output.weight": "51c0f01d38a9dcfe9bdbc4643576fab164c1d9e4b7168b7695c0ee55e6965667",
-  "blk.19.attn_q.weight": "28d15b69b8416f2e7ddc88fe381cb1e2ef2ad705fb1c268139ba96498cc74848",
-  "blk.19.attn_v.weight": "6860f1cd720638e63a981fa2c0b4db900129826bcb9823c9ddf9fb8b1b9f3383",
-  "blk.19.ffn_down.weight": "bc7f2d7827ee01c2dd41401c7b3b1700ad3a4ff620e8bb734f92630d342dcc7f",
-  "blk.19.ffn_gate.weight": "54d03ef69ba373fc410fbca8f1e34a565d58e4296d9a035ff7e48340b9c848e7",
-  "blk.19.ffn_norm.weight": "9178fc796a340ee6e8128ca74c0cb6203d1adbed6927af4e5ac7863da57affc7",
-  "blk.19.ffn_up.weight": "a77bd708026c6e83ad5c79c223278e74621bcf74a9641c7818d96b595daaad20",
-  "blk.19.post_attention_norm.weight": "ae94aa26f4c411bf9496a6fd4a6df64ee589ee1ae9a04b531d45acc95721e582",
-  "blk.19.post_ffw_norm.weight": "9ad210700edeef12133bdcff04bf1c7f62b49f6f4a9ba483c7cdc59857c24a5c",
-  "blk.20.attn_k.weight": "e35bce1e9f4a7a09ef34721f57ea38cfca68c272f52d923fe50af8308f66cfaa",
-  "blk.20.attn_norm.weight": "644800f6926fd34f233795c4dec1151a295d2138ca8cac33e3e48167d26f8b41",
-  "blk.20.attn_output.weight": "8d3758cd236471741e1ad66c0710cb79077dc8c7a3a292d35bc551c0c5abe627",
-  "blk.20.attn_q.weight": "c333b1f0f6f956b5d73891df10b1a0321e55fc31c40d623a24e1f52caa6a998b",
-  "blk.20.attn_v.weight": "8562b418d0c4868a050fb19fa3fcaf50a8cf1c669f537d666c80c7b3a04714e1",
-  "blk.20.ffn_down.weight": "97efb608ac44cc804198faec3ee66eafe56ced6b7ca5359700c6f1df75b7205e",
-  "blk.20.ffn_gate.weight": "5c61151d86f28415c73c73d90ec088c646cbe5c1640197caf58eb501ba7db293",
-  "blk.20.ffn_norm.weight": "24bbe0a701afd4bbeea65b3edde712b3cbb2281043bbc43dbf250582453116ed",
-  "blk.20.ffn_up.weight": "e170cf68e249566aa99eb6f6b265679bf9a5a6b76830ba24e7e130c2515910c4",
-  "blk.20.post_attention_norm.weight": "e092d751cfe20dbf2d348358f3b38397bd83e4ed94d6bbaa6bbaddcd902b2ac4",
-  "blk.20.post_ffw_norm.weight": "219a18a47dcba76e669e4322223a5a9227bd3db1de3fbd3d3cfb22e54a783c5a",
-  "blk.21.attn_k.weight": "c3a095ebddb42c63824f1c98da65263dc88e4d790a26aa1632840b44f5cc7cb1",
-  "blk.21.attn_norm.weight": "ef8bbaded5fbc45ad9cf3985ae02174524e7090fe6362811124f942ef643bec7",
-  "blk.21.attn_output.weight": "668f018aba72baac6252aa3ad58569ddd55ab751a0dd8d7bcc9fb9b6efb4bf53",
-  "blk.21.attn_q.weight": "e759c65663089f3bbbd51847934c185e680c82f1249065d5d487da638e519e6d",
-  "blk.21.attn_v.weight": "2ff57762686cf9ba1f5a6be76503454b97556ce67f4ac98254bd0562231197ba",
-  "blk.21.ffn_down.weight": "3fd106556fb721b1c28ae3f4026bc83eb1b08ed910f2ba5f466c6b5f327d91cb",
-  "blk.21.ffn_gate.weight": "338022d882f4b6619e8054a6fb909696fa3eef3013cf69b65c3cacdfc5b9e42c",
-  "blk.21.ffn_norm.weight": "1e77660c23a3f9653ee721a863d1960f773d87437cabc4dc0a6e17ee3d4e5e44",
-  "blk.21.ffn_up.weight": "7d31b20fbc2e6eba8f350f170069dc36f0cb12f68fbc4206ec5022a74085ebcb",
-  "blk.21.post_attention_norm.weight": "9638bae8d8bdcd7ed68da282979cd84a07c41ff9cabcaea94ebc846a1803db23",
-  "blk.21.post_ffw_norm.weight": "d622ef11115fe0cbe04b727d5a3b6371e7f39bf08c8d5eb9bc6da52e3f3cfb9d",
-  "blk.22.attn_k.weight": "5c321cb29deffbe57de200dd206a62005f1e80acb86c4fd2349dd44c8d3594fd",
-  "blk.22.attn_norm.weight": "198d949705d7170a331d75889d8c7500c3635254dac2cc6aa4dc35d556584536",
-  "blk.22.attn_output.weight": "19805cd5d7025b457e5d41d70db8b3fd63c2dd0e4a94d3ef1704d50ef4e749e8",
-  "blk.22.attn_q.weight": "177836cd583fc87405975ddc21ebfebdaa090a0363799664c72caa3da851ae2c",
-  "blk.22.attn_v.weight": "fea255692483e30d0108f9e4e250eb3ed7dbda8d83f499b06519b8c223ae6096",
-  "blk.22.ffn_down.weight": "00cb8939f03e5817d6d412de8cf2c923c9568d5493e382cec7faf5718fb034eb",
-  "blk.22.ffn_gate.weight": "b0591065b91281b2fbd8a9567f3568d40479f680e1f0a29e27ae213f37642489",
-  "blk.22.ffn_norm.weight": "96b5c5d0737c2ceb8fc869f54adb9e5f46e28cb7b177c40f49fa926b923c00f8",
-  "blk.22.ffn_up.weight": "81f472185b24344ab0594ea8246cc6e200e0dc1cab4943e74fbe4ca19d5a9701",
-  "blk.22.post_attention_norm.weight": "27fa9aa6260aa3071e0391e1a1d49322dcb6e8072315b8a9b7064087108dbd06",
-  "blk.22.post_ffw_norm.weight": "f37e1dcd7f643d9545675ffe9dc527a11eba86eb204989c2f44f636b266d896a",
-  "blk.23.attn_k.weight": "5d82f36658a56c3f94d0bb2d61f65509c966fa6568f81812e0d3e338b380ef8c",
-  "blk.23.attn_norm.weight": "b7983f88d9cad88bc88a528923e6da592ad20e699965b223ebc10840fe1f4fec",
-  "blk.23.attn_output.weight": "59f97f80f430d71606aab0158a195aed29ccd3405e6c0a5c41c809be8eb01898",
-  "blk.23.attn_q.weight": "53ac4789fe958919cc02ea4222bcd64c0ea1b4baa54304bff46635bdf42f7490",
-  "blk.23.attn_v.weight": "ec8abe09b9e84dbb52c7a068094657c6d3c62fe551ba8d7c3a3f23da622e9756",
-  "blk.23.ffn_down.weight": "3cf547eccb1b82aa64f208cee9682d7f558ca84e0aead7d9d3d1420d90f3d992",
-  "blk.23.ffn_gate.weight": "366aa2486d911ba81eb519119e13807deacf7e9908bc1975a2a63e00d6b10124",
-  "blk.23.ffn_norm.weight": "6d1d4a4af34bb7dc090ac87d6457d398c3e0fb68bd2e2b60b099dc318b6cfac3",
-  "blk.23.ffn_up.weight": "53f76692e253f5d2420b3f200c731b9f3b7a83e379920b4a067c729b4674aa4d",
-  "blk.23.post_attention_norm.weight": "7c952fa0efa76b3f048c8c4c9e8dcb5e3724d231327eda6423a34d3f3d3367de",
-  "blk.23.post_ffw_norm.weight": "7ab188cfe61f0a91b40309a0ab6bfa99f19d0ff2a37b6ac10e5f0c7f44eb5270",
-  "blk.24.attn_k.weight": "225798792f9bfdd10eff0505ebe61e0aad0209c17b431f6044ee7968ffe8c198",
-  "blk.24.attn_norm.weight": "635e3c1ebf5219bbebfc40ef164bc32d2b726ef595a94da64ac524ae878e2915",
-  "blk.24.attn_output.weight": "482f5bb2db8d9ed22b253d9a3296333b239efe698e5992e5d77e7e12dc2a5cf5",
-  "blk.24.attn_q.weight": "43805bbccddb65d58fffc4be9b5c374d4e1df1395ec1e1ffb4bcff03e98d5adb",
-  "blk.24.attn_v.weight": "fa741af54b4a3b1775d32f59134756090c5df2e7345a12a2d8db94fe289667a7",
-  "blk.24.ffn_down.weight": "83c6351e3162626b276f524a57836144625c2556dbe321b57cbd8fd486a68fab",
-  "blk.24.ffn_gate.weight": "fbe66be0d84d12cea5176cc7eaef64382ffc7324cd9d6266a3342dc43442f2ac",
-  "blk.24.ffn_norm.weight": "77c1445a8639ad24938bdf0280233eea2362d47391421833dfa72ec756dfc1e8",
-  "blk.24.ffn_up.weight": "78235ac729ee23c1cf1ae543751e3af32776d8808cee6e529c2a625a1f027654",
-  "blk.24.post_attention_norm.weight": "161f71b6d07628d43e4ae51a4c9088ec6ca2db123a17986a14505d83fdd04dad",
-  "blk.24.post_ffw_norm.weight": "cf1ba692aa683368b02ac413e69b2521b98c69a5274eacbb54165b53bf38a8b2",
-  "blk.25.attn_k.weight": "057a56bd8c8d2b41608d1f71faa3052902152ddf85e47669ad950c1c3e77c33f",
-  "blk.25.attn_norm.weight": "b7179fe02c334da556ddcf6c1b502245639a728c4cbba8b552d8e1df4565ee9d",
-  "blk.25.attn_output.weight": "4fed8b05b08a0ff75ffd022701bbeb52f17b23d09332a1ddcba737244bd0d3b0",
-  "blk.25.attn_q.weight": "c52e99f5d38bf7538d6106a0bbf38ac6dc6296bca9a3f849afa384ea67b4af01",
-  "blk.25.attn_v.weight": "c49c23d8e1cfa6a8eb971eb69942204890c6d7d830dc8774c84b108a80598912",
-  "blk.25.ffn_down.weight": "c08d4dc8412b19fdc870c164b83c341b236ec6fe7bb4a9bcfe0dc100faa20286",
-  "blk.25.ffn_gate.weight": "1a4cb3f36735d59181721471452807903006539e5e1b5ceb4f72d1d7ae134127",
-  "blk.25.ffn_norm.weight": "8fd6bd0dcec5198761525a36992a57c9ec5e9da60a22092839a84ae8c4e87f26",
-  "blk.25.ffn_up.weight": "3a00f39bdd5f31dc5e3b281d2002e1ac4f2475d49a0ac1d7720a25b377dcd04a",
-  "blk.25.post_attention_norm.weight": "e5f31a648612c859b6d21c9ee426e87a86cb1973dfdd86276c767371d9cef5ad",
-  "blk.25.post_ffw_norm.weight": "553c3bd774922c99c2384380a142d019881d30dbf0fe3bf9430dabfb3f6cbd33",
-  "output_norm.weight": "49445c4585ab0a8135717a0bdb1cda4a062a030177d0119561d91542aec5744b"
-}
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -100,21 +100,8 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 		}

 		if template, ok := p["chat_template"]; ok {
-			var s []struct {
-				Name     string `json:"name"`
-				Template string `json:"template"`
-			}
-			if err := json.Unmarshal(template, &t.Template); err == nil {
-				// noop
-			} else if err := json.Unmarshal(template, &s); err == nil {
-				for _, e := range s {
-					if e.Name == "default" {
-						t.Template = e.Template
-						break
-					}
-				}
-			} else {
-				return nil, fmt.Errorf("invalid chat_template: %w", err)
+			if err := json.Unmarshal(template, &t.Template); err != nil {
+				return nil, err
 			}
 		}

@@ -154,6 +141,7 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 }

 type tokenizer struct {
+	Version     string  `json:"version"`
 	AddedTokens []token `json:"added_tokens"`
 	Model       struct {
 		Type   string         `json:"type"`
@@ -251,7 +239,7 @@ func parseVocabulary(fsys fs.FS) (*Vocabulary, error) {
 		return pattern.Func(fsys)
 	}

-	return nil, errors.New("unknown tokenizer format")
+	return nil, errors.New("unknown tensor format")
 }

 type SpecialVocabulary struct {
--- a/convert/tokenizer_test.go
+++ b/convert/tokenizer_test.go
@@ -1,208 +0,0 @@
-package convert
-
-import (
-	"io"
-	"io/fs"
-	"os"
-	"path/filepath"
-	"strings"
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-)
-
-func createTokenizerFS(t *testing.T, dir string, files map[string]io.Reader) fs.FS {
-	t.Helper()
-
-	for k, v := range files {
-		if err := func() error {
-			f, err := os.Create(filepath.Join(dir, k))
-			if err != nil {
-				return err
-			}
-			defer f.Close()
-
-			if _, err := io.Copy(f, v); err != nil {
-				return err
-			}
-
-			return nil
-		}(); err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-	}
-
-	return os.DirFS(dir)
-}
-
-func TestParseTokenizer(t *testing.T) {
-	cases := []struct {
-		name              string
-		fsys              fs.FS
-		specialTokenTypes []string
-		want              *Tokenizer
-	}{
-		{
-			name: "string chat template",
-			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
-				"tokenizer.json": strings.NewReader(`{}`),
-				"tokenizer_config.json": strings.NewReader(`{
-					"chat_template": "<default template>"
-				}`),
-			}),
-			want: &Tokenizer{
-				Vocabulary: &Vocabulary{Model: "gpt2"},
-				Pre:        "default",
-				Template:   "<default template>",
-			},
-		},
-		{
-			name: "list chat template",
-			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
-				"tokenizer.json": strings.NewReader(`{}`),
-				"tokenizer_config.json": strings.NewReader(`{
-					"chat_template": [
-						{
-							"name": "default",
-							"template": "<default template>"
-						},
-						{
-							"name": "tools",
-							"template": "<tools template>"
-						}
-					]
-				}`),
-			}),
-			want: &Tokenizer{
-				Vocabulary: &Vocabulary{Model: "gpt2"},
-				Pre:        "default",
-				Template:   "<default template>",
-			},
-		},
-		{
-			name: "added tokens",
-			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
-				"tokenizer.json": strings.NewReader(`{
-					"added_tokens": [
-						{
-							"id": 999,
-							"content": "<unused999>",
-							"special": false
-						}
-					]
-				}`),
-			}),
-			want: &Tokenizer{
-				Vocabulary: &Vocabulary{
-					Model:  "gpt2",
-					Tokens: []string{"<unused999>"},
-					Scores: []float32{999},
-					Types:  []int32{4},
-				},
-				Pre: "default",
-			},
-		},
-		{
-			name: "added tokens overlap vocab",
-			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
-				"tokenizer.json": strings.NewReader(`{
-					"added_tokens": [
-						{
-							"id": 0,
-							"content": "<pad>",
-							"special": true
-						}
-					],
-					"model": {
-						"vocab": {
-							"<pad>": 0
-						}
-					}
-				}`),
-			}),
-			want: &Tokenizer{
-				Vocabulary: &Vocabulary{
-					Model:  "gpt2",
-					Tokens: []string{"<pad>"},
-					Scores: []float32{0},
-					Types:  []int32{3},
-				},
-				Pre: "default",
-			},
-		},
-		{
-			name: "special token types",
-			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
-				"tokenizer.json": strings.NewReader(`{
-					"added_tokens": [
-						{
-							"id": 0,
-							"content": "<pad>",
-							"special": true
-						},
-						{
-							"id": 1,
-							"content": "<eos>",
-							"special": true
-						},
-						{
-							"id": 2,
-							"content": "<bos>",
-							"special": true
-						},
-						{
-							"id": 3,
-							"content": "<unk>",
-							"special": true
-						}
-					],
-					"model": {
-						"vocab": {
-							"<pad>": 0,
-							"<eos>": 1,
-							"<bos>": 2,
-							"<unk>": 3
-						}
-					}
-				}`),
-				"tokenizer_config.json": strings.NewReader(`{
-					"add_bos_token": true,
-					"add_eos_token": false,
-					"bos_token": "<bos>",
-					"eos_token": "<eos>",
-					"pad_token": "<pad>",
-					"unk_token": "<unk>"
-				}`),
-			}),
-			specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
-			want: &Tokenizer{
-				Vocabulary: &Vocabulary{
-					Model:  "gpt2",
-					Tokens: []string{"<pad>", "<eos>", "<bos>", "<unk>"},
-					Scores: []float32{0, 1, 2, 3},
-					Types:  []int32{3, 3, 3, 3},
-				},
-				SpecialVocabulary: []*SpecialVocabulary{
-					{Type: "pad", Content: "<pad>", ID: 0, AddToken: false},
-					{Type: "eos", Content: "<eos>", ID: 1, AddToken: false},
-					{Type: "bos", Content: "<bos>", ID: 2, AddToken: true},
-					{Type: "unk", Content: "<unk>", ID: 3, AddToken: false},
-				},
-				Pre: "default",
-			},
-		},
-	}
-
-	for _, tt := range cases {
-		t.Run(tt.name, func(t *testing.T) {
-			tokenizer, err := parseTokenizer(tt.fsys, tt.specialTokenTypes)
-			if err != nil {
-				t.Fatalf("unexpected error: %v", err)
-			}
-
-			if diff := cmp.Diff(tt.want, tokenizer); diff != "" {
-				t.Errorf("unexpected tokenizer (-want +got):\n%s", diff)
-			}
-		})
-	}
-}
--- a/docs/api.md
+++ b/docs/api.md
@@ -69,7 +69,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur

 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3.1",
+  "model": "llama3",
  "prompt": "Why is the sky blue?"
 }'
 ```
@@ -80,7 +80,7 @@ A stream of JSON objects is returned:

 ```json
 {
-  "model": "llama3.1",
+  "model": "llama3",
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "response": "The",
  "done": false
@@ -102,7 +102,7 @@ To calculate how fast the response is generated in tokens per second (token/s),

 ```json
 {
-  "model": "llama3.1",
+  "model": "llama3",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "",
  "done": true,
@@ -124,7 +124,7 @@ A response can be received in one reply when streaming is off.

 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3.1",
+  "model": "llama3",
  "prompt": "Why is the sky blue?",
  "stream": false
 }'
@@ -136,7 +136,7 @@ If `stream` is set to `false`, the response will be a single JSON object:

 ```json
 {
-  "model": "llama3.1",
+  "model": "llama3",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "The sky is blue because it is the color of the sky.",
  "done": true,
@@ -194,7 +194,7 @@ curl http://localhost:11434/api/generate -d '{

 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3.1",
+  "model": "llama3",
  "prompt": "What color is the sky at different times of the day? Respond using JSON",
  "format": "json",
  "stream": false
@@ -205,7 +205,7 @@ curl http://localhost:11434/api/generate -d '{

 ```json
 {
-  "model": "llama3.1",
+  "model": "llama3",
  "created_at": "2023-11-09T21:07:55.186497Z",
  "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
  "done": true,
@@ -327,7 +327,7 @@ If you want to set custom options for the model at runtime rather than in the Mo

 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3.1",
+  "model": "llama3",
  "prompt": "Why is the sky blue?",
  "stream": false,
  "options": {
@@ -368,7 +368,7 @@ curl http://localhost:11434/api/generate -d '{

 ```json
 {
-  "model": "llama3.1",
+  "model": "llama3",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "The sky is blue because it is the color of the sky.",
  "done": true,
@@ -390,7 +390,7 @@ If an empty prompt is provided, the model will be loaded into memory.

 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3.1"
+  "model": "llama3"
 }'
 ```

@@ -400,7 +400,7 @@ A single JSON object is returned:

 ```json
 {
-  "model": "llama3.1",
+  "model": "llama3",
  "created_at": "2023-12-18T19:52:07.071755Z",
  "response": "",
  "done": true
@@ -445,7 +445,7 @@ Send a chat message with a streaming response.

 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3.1",
+  "model": "llama3",
  "messages": [
    {
      "role": "user",
@@ -461,7 +461,7 @@ A stream of JSON objects is returned:

 ```json
 {
-  "model": "llama3.1",
+  "model": "llama3",
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "message": {
    "role": "assistant",
@@ -476,7 +476,7 @@ Final response:

 ```json
 {
-  "model": "llama3.1",
+  "model": "llama3",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "done": true,
  "total_duration": 4883583458,
@@ -494,7 +494,7 @@ Final response:

 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3.1",
+  "model": "llama3",
  "messages": [
    {
      "role": "user",
@@ -509,7 +509,7 @@ curl http://localhost:11434/api/chat -d '{

 ```json
 {
-  "model": "llama3.1",
+  "model": "registry.ollama.ai/library/llama3:latest",
  "created_at": "2023-12-12T14:13:43.416799Z",
  "message": {
    "role": "assistant",
@@ -533,7 +533,7 @@ Send a chat message with a conversation history. You can use this same approach

 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3.1",
+  "model": "llama3",
  "messages": [
    {
      "role": "user",
@@ -557,7 +557,7 @@ A stream of JSON objects is returned:

 ```json
 {
-  "model": "llama3.1",
+  "model": "llama3",
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "message": {
    "role": "assistant",
@@ -571,7 +571,7 @@ Final response:

 ```json
 {
-  "model": "llama3.1",
+  "model": "llama3",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "done": true,
  "total_duration": 8113331500,
@@ -629,7 +629,7 @@ curl http://localhost:11434/api/chat -d '{

 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3.1",
+  "model": "llama3",
  "messages": [
    {
      "role": "user",
@@ -647,7 +647,7 @@ curl http://localhost:11434/api/chat -d '{

 ```json
 {
-  "model": "llama3.1",
+  "model": "registry.ollama.ai/library/llama3:latest",
  "created_at": "2023-12-12T14:13:43.416799Z",
  "message": {
    "role": "assistant",
@@ -904,7 +904,7 @@ Show information about a model including details, modelfile, template, parameter

 ```shell
 curl http://localhost:11434/api/show -d '{
-  "name": "llama3.1"
+  "name": "llama3"
 }'
 ```

@@ -965,7 +965,7 @@ Copy a model. Creates a model with another name from an existing model.

 ```shell
 curl http://localhost:11434/api/copy -d '{
-  "source": "llama3.1",
+  "source": "llama3",
  "destination": "llama3-backup"
 }'
 ```
@@ -1020,7 +1020,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where

 ```shell
 curl http://localhost:11434/api/pull -d '{
-  "name": "llama3.1"
+  "name": "llama3"
 }'
 ```

--- a/docs/faq.md
+++ b/docs/faq.md
@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:

 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3.1",
+  "model": "llama3",
  "prompt": "Why is the sky blue?",
  "options": {
    "num_ctx": 4096
@@ -194,8 +194,6 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e

 If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory.

-> Note: on Linux using the standard installer, the `ollama` user needs read and write access to the specified directory. To assign the directory to the `ollama` user run `sudo chown -R ollama:ollama <directory>`.
-
 Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.

 ## How can I use Ollama in Visual Studio Code?
@@ -247,12 +245,12 @@ The `keep_alive` parameter can be set to:

 For example, to preload a model and leave it in memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": -1}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
 ```

 To unload the model and free up memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
 ```

 Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -10,7 +10,7 @@ Check your compute compatibility to see if your card is supported:
 | 9.0                | NVIDIA              | `H100`                                                                                                      |
 | 8.9                | GeForce RTX 40xx    | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060`  |
 |                    | NVIDIA Professional | `L4` `L40` `RTX 6000`                                                                                       |
-| 8.6                | GeForce RTX 30xx    | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` `RTX 3050 Ti` `RTX 3050`   |
+| 8.6                | GeForce RTX 30xx    | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060`         |
 |                    | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2`                          |
 | 8.0                | NVIDIA              | `A100` `A30`                                                                                                |
 | 7.5                | GeForce GTX/RTX     | `GTX 1650 Ti` `TITAN RTX` `RTX 2080 Ti` `RTX 2080` `RTX 2070` `RTX 2060`                                    |
--- a/docs/images/ollama-keys.png
+++ b/docs/images/ollama-keys.png
--- a/docs/import.md
+++ b/docs/import.md
@@ -38,7 +38,7 @@ Ollama supports importing adapters based on several different model architecture

 You can create the adapter using a fine tuning framework or tool which can output adapters in the Safetensors format, such as:

-  * Hugging Face [fine tuning framework](https://huggingface.co/docs/transformers/en/training)
+  * Hugging Face [fine tuning framework](https://huggingface.co/docs/transformers/en/training)
  * [Unsloth](https://github.com/unslothai/unsloth)
  * [MLX](https://github.com/ml-explore/mlx)

@@ -158,7 +158,7 @@ You can share any model you have created by pushing it to [ollama.com](https://o

 First, use your browser to go to the [Ollama Sign-Up](https://ollama.com/signup) page. If you already have an account, you can skip this step.

-<img src="images/signup.png" alt="Sign-Up" width="40%">
+![Sign-Up](images/signup.png)

 The `Username` field will be used as part of your model's name (e.g. `jmorganca/mymodel`), so make sure you are comfortable with the username that you have selected.

@@ -166,7 +166,7 @@ Now that you have created an account and are signed-in, go to the [Ollama Keys S

 Follow the directions on the page to determine where your Ollama Public Key is located.

-<img src="images/ollama-keys.png" alt="Ollama Keys" width="80%">
+![Ollama Key](images/ollama-keys.png)

 Click on the `Add Ollama Public Key` button, and copy and paste the contents of your Ollama Public Key into the text field.

--- a/docs/linux.md
+++ b/docs/linux.md
@@ -1,59 +1,39 @@
-# Linux
+# Ollama on Linux

 ## Install

-To install Ollama, run the following command:
+Install Ollama by running this one-liner:

-```shell
+>
+
+```bash
 curl -fsSL https://ollama.com/install.sh | sh
 ```

+## AMD Radeon GPU support
+
+While AMD has contributed the `amdgpu` driver upstream to the official linux
+kernel source, the version is older and may not support all ROCm features. We
+recommend you install the latest driver from
+https://www.amd.com/en/support/linux-drivers for best support of your Radeon
+GPU.
+
 ## Manual install

-Download and extract the package:
+### Download `ollama`

-```shell
-curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
-sudo tar -C /usr -xzf ollama-linux-amd64.tgz
-```
+Download and extract the Linux package:

-Start Ollama:
-
-```shell
-ollama serve
-```
-
-In another terminal, verify that Ollama is running:
-
-```shell
-ollama -v
-```
-
-### AMD GPU install
-
-If you have an AMD GPU, also download and extract the additional ROCm package:
-
-```shell
-curl -L https://ollama.com/download/ollama-linux-amd64-rocm.tgz -o ollama-linux-amd64-rocm.tgz
-sudo tar -C /usr -xzf ollama-linux-amd64-rocm.tgz
-```
-
-### ARM64 install
-
-Download and extract the ARM64-specific package:
-
-```shell
-curl -L https://ollama.com/download/ollama-linux-arm64.tgz -o ollama-linux-arm64.tgz
-sudo tar -C /usr -xzf ollama-linux-arm64.tgz
+```bash
+curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
 ```

 ### Adding Ollama as a startup service (recommended)

-Create a user and group for Ollama:
+Create a user for Ollama:

-```shell
-sudo useradd -r -s /bin/false -U -m -d /usr/share/ollama ollama
-sudo usermod -a -G ollama $(whoami)
+```bash
+sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
 ```

 Create a service file in `/etc/systemd/system/ollama.service`:
@@ -69,7 +49,6 @@ User=ollama
 Group=ollama
 Restart=always
 RestartSec=3
-Environment="PATH=$PATH"

 [Install]
 WantedBy=default.target
@@ -77,54 +56,46 @@ WantedBy=default.target

 Then start the service:

-```shell
+```bash
 sudo systemctl daemon-reload
 sudo systemctl enable ollama
 ```

-### Install CUDA drivers (optional)
+### Install CUDA drivers (optional – for NVIDIA GPUs)

 [Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.

 Verify that the drivers are installed by running the following command, which should print details about your GPU:

-```shell
+```bash
 nvidia-smi
 ```

-### Install AMD ROCm drivers (optional)
+### Install ROCm (optional - for Radeon GPUs)
+[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html)

-[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html) ROCm v6.
+Make sure to install ROCm v6.

 ### Start Ollama

-Start Ollama and verify it is running:
+Start Ollama using `systemd`:

-```shell
+```bash
 sudo systemctl start ollama
-sudo systemctl status ollama
 ```

-> [!NOTE]
-> While AMD has contributed the `amdgpu` driver upstream to the official linux
-> kernel source, the version is older and may not support all ROCm features. We
-> recommend you install the latest driver from
-> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
-> GPU.
+## Update

-## Updating
+Update Ollama by running the install script again:

-Update Ollama by running the install script again:
-
-```shell
+```bash
 curl -fsSL https://ollama.com/install.sh | sh
 ```

-Or by re-downloading Ollama:
+Or by downloading the ollama binary:

-```shell
-curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
-sudo tar -C /usr -xzf ollama-linux-amd64.tgz
+```bash
+curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
 ```

 ## Installing specific versions
@@ -133,15 +104,15 @@ Use `OLLAMA_VERSION` environment variable with the install script to install a s

 For example:

-```shell
-curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.3.9 sh
+```
+curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.1.32 sh
 ```

 ## Viewing logs

 To view logs of Ollama running as a startup service, run:

-```shell
+```bash
 journalctl -e -u ollama
 ```

@@ -149,7 +120,7 @@ journalctl -e -u ollama

 Remove the ollama service:

-```shell
+```bash
 sudo systemctl stop ollama
 sudo systemctl disable ollama
 sudo rm /etc/systemd/system/ollama.service
@@ -157,13 +128,13 @@ sudo rm /etc/systemd/system/ollama.service

 Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr/bin`, or `/bin`):

-```shell
+```bash
 sudo rm $(which ollama)
 ```

 Remove the downloaded models and Ollama service user and group:

-```shell
+```bash
 sudo rm -r /usr/share/ollama
 sudo userdel ollama
 sudo groupdel ollama
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -11,9 +11,8 @@ A model file is the blueprint to create and share models with Ollama.
 - [Examples](#examples)
 - [Instructions](#instructions)
  - [FROM (Required)](#from-required)
-    - [Build from existing model](#build-from-existing-model)
-    - [Build from a Safetensors model](#build-from-a-safetensors-model)
-    - [Build from a GGUF file](#build-from-a-gguf-file)
+    - [Build from llama3](#build-from-llama3)
+    - [Build from a bin file](#build-from-a-bin-file)
  - [PARAMETER](#parameter)
    - [Valid Parameters and Values](#valid-parameters-and-values)
  - [TEMPLATE](#template)
@@ -50,7 +49,7 @@ INSTRUCTION arguments
 An example of a `Modelfile` creating a mario blueprint:

 ```modelfile
-FROM llama3.1
+FROM llama3
 # sets the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
@@ -72,10 +71,10 @@ More examples are available in the [examples directory](../examples).
 To view the Modelfile of a given model, use the `ollama show --modelfile` command.

  ```bash
-  > ollama show --modelfile llama3.1
+  > ollama show --modelfile llama3
  # Modelfile generated by "ollama show"
  # To build a new Modelfile based on this one, replace the FROM line with:
-  # FROM llama3.1:latest
+  # FROM llama3:latest
  FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
  TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>

@@ -100,39 +99,22 @@ The `FROM` instruction defines the base model to use when creating a model.
 FROM <model name>:<tag>
 ```

-#### Build from existing model
+#### Build from llama3

 ```modelfile
-FROM llama3.1
+FROM llama3
 ```

 A list of available base models:
 <https://github.com/ollama/ollama#model-library>
-Additional models can be found at:
-<https://ollama.com/library>

-#### Build from a Safetensors model
+#### Build from a `bin` file

 ```modelfile
-FROM <model directory>
+FROM ./ollama-model.bin
 ```

-The model directory should contain the Safetensors weights for a supported architecture.
-
-Currently supported model architectures:
-  * Llama (including Llama 2, Llama 3, and Llama 3.1)
-  * Mistral (including Mistral 1, Mistral 2, and Mixtral)
-  * Gemma (including Gemma 1 and Gemma 2)
-  * Phi3
-
-#### Build from a GGUF file
-
-```modelfile
-FROM ./ollama-model.gguf
-```
-
-The GGUF file location should be specified as an absolute path or relative to the `Modelfile` location.
-
+This bin file location should be specified as an absolute path or relative to the `Modelfile` location.

 ### PARAMETER

@@ -192,23 +174,10 @@ SYSTEM """<system message>"""

 ### ADAPTER

-The `ADAPTER` instruction specifies a fine tuned LoRA adapter that should apply to the base model. The value of the adapter should be an absolute path or a path relative to the Modelfile. The base model should be specified with a `FROM` instruction. If the base model is not the same as the base model that the adapter was tuned from the behaviour will be erratic.
-
-#### Safetensor adapter
+The `ADAPTER` instruction is an optional instruction that specifies any LoRA adapter that should apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile, and the file must be in a GGML file format. The adapter should be tuned from the base model; otherwise the behaviour is undefined.

 ```modelfile
-ADAPTER <path to safetensor adapter>
-```
-
-Currently supported Safetensor adapters:
-  * Llama (including Llama 2, Llama 3, and Llama 3.1)
-  * Mistral (including Mistral 1, Mistral 2, and Mixtral)
-  * Gemma (including Gemma 1 and Gemma 2)
-
-#### GGUF adapter
-
-```modelfile
-ADAPTER ./ollama-lora.gguf
+ADAPTER ./ollama-lora.bin
 ```

 ### LICENSE
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
            'content': 'Say this is a test',
        }
    ],
-    model='llama3.1',
+    model='llama3',
 )

 response = client.chat.completions.create(
@@ -46,13 +46,13 @@ response = client.chat.completions.create(
 )

 completion = client.completions.create(
-    model="llama3.1",
+    model="llama3",
    prompt="Say this is a test",
 )

 list_completion = client.models.list()

-model = client.models.retrieve("llama3.1")
+model = client.models.retrieve("llama3")

 embeddings = client.embeddings.create(
    model="all-minilm",
@@ -74,7 +74,7 @@ const openai = new OpenAI({

 const chatCompletion = await openai.chat.completions.create({
    messages: [{ role: 'user', content: 'Say this is a test' }],
-    model: 'llama3.1',
+    model: 'llama3',
 })

 const response = await openai.chat.completions.create({
@@ -94,13 +94,13 @@ const response = await openai.chat.completions.create({
 })

 const completion = await openai.completions.create({
-    model: "llama3.1",
+    model: "llama3",
    prompt: "Say this is a test.",
 })

 const listCompletion = await openai.models.list()

-const model = await openai.models.retrieve("llama3.1")
+const model = await openai.models.retrieve("llama3")

 const embedding = await openai.embeddings.create({
  model: "all-minilm",
@@ -114,7 +114,7 @@ const embedding = await openai.embeddings.create({
 curl http://localhost:11434/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
-        "model": "llama3.1",
+        "model": "llama3",
        "messages": [
            {
                "role": "system",
@@ -154,13 +154,13 @@ curl http://localhost:11434/v1/chat/completions \
 curl http://localhost:11434/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
-        "model": "llama3.1",
+        "model": "llama3",
        "prompt": "Say this is a test"
    }'

 curl http://localhost:11434/v1/models

-curl http://localhost:11434/v1/models/llama3.1
+curl http://localhost:11434/v1/models/llama3

 curl http://localhost:11434/v1/embeddings \
    -H "Content-Type: application/json" \
@@ -274,7 +274,7 @@ curl http://localhost:11434/v1/embeddings \
 Before using a model, pull it locally `ollama pull`:

 ```shell
-ollama pull llama3.1
+ollama pull llama3
 ```

 ### Default model names
@@ -282,7 +282,7 @@ ollama pull llama3.1
 For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:

 ```
-ollama cp llama3.1 gpt-3.5-turbo
+ollama cp llama3 gpt-3.5-turbo
 ```

 Afterwards, this new model name can be specified the `model` field:
@@ -300,28 +300,3 @@ curl http://localhost:11434/v1/chat/completions \
        ]
    }'
 ```
-
-### Setting the context size
-
-The OpenAI API does not have a way of setting the context size for a model. If you need to change the context size, create a `Modelfile` which looks like:
-
-```modelfile
-FROM <some model>
-PARAMETER num_ctx <context size>
-```
-
-Use the `ollama create mymodel` command to create a new model with the updated context size. Call the API with the updated model name:
-
-```shell
-curl http://localhost:11434/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-        "model": "mymodel",
-        "messages": [
-            {
-                "role": "user",
-                "content": "Hello!"
-            }
-        ]
-    }'
-```
--- a/docs/template.md
+++ b/docs/template.md
@@ -33,7 +33,7 @@ Omitting a template in these models puts the responsibility of correctly templat
 To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3.

 ```dockerfile
-FROM llama3.1
+FROM llama3

 TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|>

--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -91,17 +91,6 @@ If none of those resolve the problem, gather additional information and file an
 - Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`


-## AMD GPU Discovery
-
-On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device.  If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
-
-When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.
-
-If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
- `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries.  This can help show more detailed error codes that can help troubleshoot problems
- `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported
- Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`
-
 ## Windows Terminal Errors

 Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly.  This can result in a long string of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect`  To resolve this problem, please update to Win 10 22H1 or newer.
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -29,7 +29,7 @@ Ollama uses unicode characters for progress indication, which may render as unkn

 Here's a quick example showing API access from `powershell`
 ```powershell
-(Invoke-WebRequest -method POST -Body '{"model":"llama3.1", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
+(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
 ```

 ## Troubleshooting
@@ -48,9 +48,6 @@ the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %HOMEPATH%\.ollama` contains models and configuration
 - `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories

-## Uninstall
-
-The Ollama Windows installer registers an Uninstaller application.  Under `Add or remove programs` in Windows Settings, you can uninstall Ollama.

 ## Standalone CLI

--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -30,7 +30,9 @@ func Host() *url.URL {
 		defaultPort = "443"
 	}

-	hostport, path, _ := strings.Cut(hostport, "/")
+	// trim trailing slashes
+	hostport = strings.TrimRight(hostport, "/")
+
 	host, port, err := net.SplitHostPort(hostport)
 	if err != nil {
 		host, port = "127.0.0.1", defaultPort
@@ -43,13 +45,15 @@ func Host() *url.URL {

 	if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 {
 		slog.Warn("invalid port, using default", "port", port, "default", defaultPort)
-		port = defaultPort
+		return &url.URL{
+			Scheme: scheme,
+			Host:   net.JoinHostPort(host, defaultPort),
+		}
 	}

 	return &url.URL{
 		Scheme: scheme,
 		Host:   net.JoinHostPort(host, port),
-		Path:   path,
 	}
 }

@@ -112,26 +116,6 @@ func KeepAlive() (keepAlive time.Duration) {
 	return keepAlive
 }

-// LoadTimeout returns the duration for stall detection during model loads. LoadTimeout can be configured via the OLLAMA_LOAD_TIMEOUT environment variable.
-// Zero or Negative values are treated as infinite.
-// Default is 5 minutes.
-func LoadTimeout() (loadTimeout time.Duration) {
-	loadTimeout = 5 * time.Minute
-	if s := Var("OLLAMA_LOAD_TIMEOUT"); s != "" {
-		if d, err := time.ParseDuration(s); err == nil {
-			loadTimeout = d
-		} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
-			loadTimeout = time.Duration(n) * time.Second
-		}
-	}
-
-	if loadTimeout <= 0 {
-		return time.Duration(math.MaxInt64)
-	}
-
-	return loadTimeout
-}
-
 func Bool(k string) func() bool {
 	return func() bool {
 		if s := Var(k); s != "" {
@@ -179,6 +163,53 @@ var (
 	HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
 )

+func RunnersDir() (p string) {
+	if p := Var("OLLAMA_RUNNERS_DIR"); p != "" {
+		return p
+	}
+
+	if runtime.GOOS != "windows" {
+		return
+	}
+
+	defer func() {
+		if p == "" {
+			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
+		}
+	}()
+
+	// On Windows we do not carry the payloads inside the main executable
+	exe, err := os.Executable()
+	if err != nil {
+		return
+	}
+
+	cwd, err := os.Getwd()
+	if err != nil {
+		return
+	}
+
+	var paths []string
+	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} {
+		paths = append(paths,
+			root,
+			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
+			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
+		)
+	}
+
+	// Try a few variations to improve developer experience when building from source in the local tree
+	for _, path := range paths {
+		candidate := filepath.Join(path, "lib", "ollama", "runners")
+		if _, err := os.Stat(candidate); err == nil {
+			p = candidate
+			break
+		}
+	}
+
+	return p
+}
+
 func Uint(key string, defaultValue uint) func() uint {
 	return func() uint {
 		if s := Var(key); s != "" {
@@ -204,23 +235,6 @@ var (
 	MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
 )

-func Uint64(key string, defaultValue uint64) func() uint64 {
-	return func() uint64 {
-		if s := Var(key); s != "" {
-			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
-				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
-			} else {
-				return n
-			}
-		}
-
-		return defaultValue
-	}
-}
-
-// Set aside VRAM per GPU
-var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
-
 type EnvVar struct {
 	Name        string
 	Value       any
@@ -231,11 +245,9 @@ func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
-		"OLLAMA_GPU_OVERHEAD":      {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
-		"OLLAMA_LOAD_TIMEOUT":      {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", Models(), "The path to the models directory"},
@@ -243,22 +255,10 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
+		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
-
-		// Informational
-		"HTTP_PROXY":  {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
-		"HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
-		"NO_PROXY":    {"NO_PROXY", String("NO_PROXY")(), "No proxy"},
 	}
-
-	if runtime.GOOS != "windows" {
-		// Windows environment variables are case-insensitive so there's no need to duplicate them
-		ret["http_proxy"] = EnvVar{"http_proxy", String("http_proxy")(), "HTTP proxy"}
-		ret["https_proxy"] = EnvVar{"https_proxy", String("https_proxy")(), "HTTPS proxy"}
-		ret["no_proxy"] = EnvVar{"no_proxy", String("no_proxy")(), "No proxy"}
-	}
-
 	if runtime.GOOS != "darwin" {
 		ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
 		ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
@@ -267,7 +267,6 @@ func AsMap() map[string]EnvVar {
 		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
 		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
 	}
-
 	return ret
 }

@@ -283,12 +282,3 @@ func Values() map[string]string {
 func Var(key string) string {
 	return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
 }
-
-// On windows, we keep the binary at the top directory, but
-// other platforms use a "bin" directory, so this returns ".."
-func LibRelativeToExe() string {
-	if runtime.GOOS == "windows" {
-		return "."
-	}
-	return ".."
-}
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -13,35 +13,34 @@ func TestHost(t *testing.T) {
 		value  string
 		expect string
 	}{
-		"empty":               {"", "http://127.0.0.1:11434"},
-		"only address":        {"1.2.3.4", "http://1.2.3.4:11434"},
-		"only port":           {":1234", "http://:1234"},
-		"address and port":    {"1.2.3.4:1234", "http://1.2.3.4:1234"},
-		"hostname":            {"example.com", "http://example.com:11434"},
-		"hostname and port":   {"example.com:1234", "http://example.com:1234"},
-		"zero port":           {":0", "http://:0"},
-		"too large port":      {":66000", "http://:11434"},
-		"too small port":      {":-1", "http://:11434"},
-		"ipv6 localhost":      {"[::1]", "http://[::1]:11434"},
-		"ipv6 world open":     {"[::]", "http://[::]:11434"},
-		"ipv6 no brackets":    {"::1", "http://[::1]:11434"},
-		"ipv6 + port":         {"[::1]:1337", "http://[::1]:1337"},
-		"extra space":         {" 1.2.3.4 ", "http://1.2.3.4:11434"},
-		"extra quotes":        {"\"1.2.3.4\"", "http://1.2.3.4:11434"},
-		"extra space+quotes":  {" \" 1.2.3.4 \" ", "http://1.2.3.4:11434"},
-		"extra single quotes": {"'1.2.3.4'", "http://1.2.3.4:11434"},
-		"http":                {"http://1.2.3.4", "http://1.2.3.4:80"},
-		"http port":           {"http://1.2.3.4:4321", "http://1.2.3.4:4321"},
-		"https":               {"https://1.2.3.4", "https://1.2.3.4:443"},
-		"https port":          {"https://1.2.3.4:4321", "https://1.2.3.4:4321"},
-		"proxy path":          {"https://example.com/ollama", "https://example.com:443/ollama"},
+		"empty":               {"", "127.0.0.1:11434"},
+		"only address":        {"1.2.3.4", "1.2.3.4:11434"},
+		"only port":           {":1234", ":1234"},
+		"address and port":    {"1.2.3.4:1234", "1.2.3.4:1234"},
+		"hostname":            {"example.com", "example.com:11434"},
+		"hostname and port":   {"example.com:1234", "example.com:1234"},
+		"zero port":           {":0", ":0"},
+		"too large port":      {":66000", ":11434"},
+		"too small port":      {":-1", ":11434"},
+		"ipv6 localhost":      {"[::1]", "[::1]:11434"},
+		"ipv6 world open":     {"[::]", "[::]:11434"},
+		"ipv6 no brackets":    {"::1", "[::1]:11434"},
+		"ipv6 + port":         {"[::1]:1337", "[::1]:1337"},
+		"extra space":         {" 1.2.3.4 ", "1.2.3.4:11434"},
+		"extra quotes":        {"\"1.2.3.4\"", "1.2.3.4:11434"},
+		"extra space+quotes":  {" \" 1.2.3.4 \" ", "1.2.3.4:11434"},
+		"extra single quotes": {"'1.2.3.4'", "1.2.3.4:11434"},
+		"http":                {"http://1.2.3.4", "1.2.3.4:80"},
+		"http port":           {"http://1.2.3.4:4321", "1.2.3.4:4321"},
+		"https":               {"https://1.2.3.4", "1.2.3.4:443"},
+		"https port":          {"https://1.2.3.4:4321", "1.2.3.4:4321"},
 	}

 	for name, tt := range cases {
 		t.Run(name, func(t *testing.T) {
 			t.Setenv("OLLAMA_HOST", tt.value)
-			if host := Host(); host.String() != tt.expect {
-				t.Errorf("%s: expected %s, got %s", name, tt.expect, host.String())
+			if host := Host(); host.Host != tt.expect {
+				t.Errorf("%s: expected %s, got %s", name, tt.expect, host.Host)
 			}
 		})
 	}
@@ -215,40 +214,6 @@ func TestKeepAlive(t *testing.T) {
 	}
 }

-func TestLoadTimeout(t *testing.T) {
-	defaultTimeout := 5 * time.Minute
-	cases := map[string]time.Duration{
-		"":       defaultTimeout,
-		"1s":     time.Second,
-		"1m":     time.Minute,
-		"1h":     time.Hour,
-		"5m0s":   defaultTimeout,
-		"1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second,
-		"0":      time.Duration(math.MaxInt64),
-		"60":     60 * time.Second,
-		"120":    2 * time.Minute,
-		"3600":   time.Hour,
-		"-0":     time.Duration(math.MaxInt64),
-		"-1":     time.Duration(math.MaxInt64),
-		"-1m":    time.Duration(math.MaxInt64),
-		// invalid values
-		" ":   defaultTimeout,
-		"???": defaultTimeout,
-		"1d":  defaultTimeout,
-		"1y":  defaultTimeout,
-		"1w":  defaultTimeout,
-	}
-
-	for tt, expect := range cases {
-		t.Run(tt, func(t *testing.T) {
-			t.Setenv("OLLAMA_LOAD_TIMEOUT", tt)
-			if actual := LoadTimeout(); actual != expect {
-				t.Errorf("%s: expected %s, got %s", tt, expect, actual)
-			}
-		})
-	}
-}
-
 func TestVar(t *testing.T) {
 	cases := map[string]string{
 		"value":       "value",
--- a/examples/langchain-python-rag-privategpt/requirements.txt
+++ b/examples/langchain-python-rag-privategpt/requirements.txt
@@ -1,6 +1,6 @@
 langchain==0.0.274
 gpt4all==1.0.8
-chromadb==0.5.0
+chromadb==0.4.7
 llama-cpp-python==0.1.81
 urllib3==2.0.4
 PyMuPDF==1.23.5
@@ -12,4 +12,4 @@ pandoc==2.3
 pypandoc==1.11
 tqdm==4.66.1
 sentence_transformers==2.2.2
-numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability
+numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability
--- a/examples/python-loganalysis/Modelfile
+++ b/examples/python-loganalysis/Modelfile
@@ -4,5 +4,5 @@ SYSTEM """
 You are a log file analyzer. You will receive a set of lines from a log file for some software application, find the errors and other interesting aspects of the logs, and explain them so a new user can understand what they mean. If there are any steps they can do to resolve them, list the steps in your answer.
 """

-PARAMETER temperature 0.3
+PARAMETER TEMPERATURE 0.3

--- a/examples/python-loganalysis/readme.md
+++ b/examples/python-loganalysis/readme.md
@@ -21,8 +21,6 @@ You can try this with the `logtest.logfile` file included in this directory.
 2. Install the Python Requirements.

   ```bash
-   python3 -m venv .venv
-   source .venv/bin/activate
   pip install -r requirements.txt
   ```

--- a/examples/python-loganalysis/requirements.txt
+++ b/examples/python-loganalysis/requirements.txt
@@ -1 +1 @@
-Requests>=2.32.3
+Requests==2.31.0
--- a/gpu/amd_common.go
+++ b/gpu/amd_common.go
@@ -9,8 +9,6 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
-
-	"github.com/ollama/ollama/envconfig"
 )

 // Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
@@ -56,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) {
 	// Installer payload location if we're running the installed binary
 	exe, err := os.Executable()
 	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
+		rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
 		if rocmLibUsable(rocmTargetDir) {
 			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
 			return rocmTargetDir, nil
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -5,7 +5,6 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"io/fs"
 	"log/slog"
 	"os"
 	"path/filepath"
@@ -360,10 +359,6 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 	if len(resp) == 0 {
 		slog.Info("no compatible amdgpu devices detected")
 	}
-	if err := verifyKFDDriverAccess(); err != nil {
-		slog.Error("amdgpu devices detected but permission problems block access", "error", err)
-		return nil
-	}
 	return resp
 }

@@ -460,19 +455,3 @@ func getFreeMemory(usedFile string) (uint64, error) {
 	}
 	return usedMemory, nil
 }
-
-func verifyKFDDriverAccess() error {
-	// Verify we have permissions - either running as root, or we have group access to the driver
-	fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0o666)
-	if err != nil {
-		if errors.Is(err, fs.ErrPermission) {
-			return fmt.Errorf("permissions not set up properly.  Either run ollama as root, or add you user account to the render group. %w", err)
-		} else if errors.Is(err, fs.ErrNotExist) {
-			// Container runtime failure?
-			return fmt.Errorf("kfd driver not loaded.  If running in a container, remember to include '--device /dev/kfd --device /dev/dri'")
-		}
-		return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
-	}
-	fd.Close()
-	return nil
-}
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
 	// Installer payload (if we're running from some other location)
 	localAppData := os.Getenv("LOCALAPPDATA")
 	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
+	rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama")
 	if rocmLibUsable(rocmTargetDir) {
 		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
 		return rocmTargetDir, nil
--- a/gpu/assets.go
+++ b/gpu/assets.go
@@ -0,0 +1,148 @@
+package gpu
+
+import (
+	"errors"
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"strings"
+	"sync"
+	"syscall"
+	"time"
+
+	"github.com/ollama/ollama/envconfig"
+)
+
+var (
+	lock        sync.Mutex
+	payloadsDir = ""
+)
+
+func PayloadsDir() (string, error) {
+	lock.Lock()
+	defer lock.Unlock()
+	var err error
+	if payloadsDir == "" {
+		runnersDir := envconfig.RunnersDir()
+
+		if runnersDir != "" {
+			payloadsDir = runnersDir
+			return payloadsDir, nil
+		}
+
+		// The remainder only applies on non-windows where we still carry payloads in the main executable
+		cleanupTmpDirs()
+		tmpDir := envconfig.TmpDir()
+		if tmpDir == "" {
+			tmpDir, err = os.MkdirTemp("", "ollama")
+			if err != nil {
+				return "", fmt.Errorf("failed to generate tmp dir: %w", err)
+			}
+		} else {
+			err = os.MkdirAll(tmpDir, 0o755)
+			if err != nil {
+				return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
+			}
+		}
+
+		// Track our pid so we can clean up orphaned tmpdirs
+		n := filepath.Join(tmpDir, "ollama.pid")
+		if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
+			return "", fmt.Errorf("failed to write pid file %s: %w", n, err)
+		}
+
+		// We create a distinct subdirectory for payloads within the tmpdir
+		// This will typically look like /tmp/ollama3208993108/runners on linux
+		payloadsDir = filepath.Join(tmpDir, "runners")
+	}
+	return payloadsDir, nil
+}
+
+// Best effort to clean up prior tmpdirs
+func cleanupTmpDirs() {
+	matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid"))
+	if err != nil {
+		return
+	}
+
+	for _, match := range matches {
+		raw, err := os.ReadFile(match)
+		if errors.Is(err, os.ErrNotExist) {
+			slog.Debug("not a ollama runtime directory, skipping", "path", match)
+			continue
+		} else if err != nil {
+			slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
+			continue
+		}
+
+		pid, err := strconv.Atoi(string(raw))
+		if err != nil {
+			slog.Warn("invalid pid, skipping", "path", match, "error", err)
+			continue
+		}
+
+		p, err := os.FindProcess(pid)
+		if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
+			slog.Warn("process still running, skipping", "pid", pid, "path", match)
+			continue
+		}
+
+		if err := os.Remove(match); err != nil {
+			slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
+		}
+
+		runners := filepath.Join(filepath.Dir(match), "runners")
+		if err := os.RemoveAll(runners); err != nil {
+			slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
+		}
+
+		if err := os.Remove(filepath.Dir(match)); err != nil {
+			slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
+		}
+	}
+}
+
+func Cleanup() {
+	lock.Lock()
+	defer lock.Unlock()
+	runnersDir := envconfig.RunnersDir()
+	if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
+		// We want to fully clean up the tmpdir parent of the payloads dir
+		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
+		slog.Debug("cleaning up", "dir", tmpDir)
+		err := os.RemoveAll(tmpDir)
+		if err != nil {
+			// If we remove too quickly, files under the tmpdir may still be in use and fail to remove, so wait and retry once
+			time.Sleep(1000 * time.Millisecond)
+			err = os.RemoveAll(tmpDir)
+			if err != nil {
+				slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
+			}
+		}
+	}
+}
+
+// UpdatePath puts dir first in PATH on windows, dropping stale entries under dir's parent; no-op if dir is already listed.
+func UpdatePath(dir string) {
+	if runtime.GOOS != "windows" {
+		// linux and darwin rely on rpath
+		return
+	}
+	tmpDir := strings.ToLower(filepath.Dir(dir))
+	newComponents := []string{dir}
+	for _, comp := range strings.Split(os.Getenv("PATH"), ";") {
+		if strings.EqualFold(comp, dir) {
+			return
+		}
+		// Remove any other prior paths to our temp dir
+		if !strings.HasPrefix(strings.ToLower(comp), tmpDir) {
+			newComponents = append(newComponents, comp)
+		}
+	}
+	newPath := strings.Join(newComponents, ";")
+	slog.Info("updating", "PATH", newPath)
+	os.Setenv("PATH", newPath)
+}
--- a/gpu/cuda_common.go
+++ b/gpu/cuda_common.go
@@ -57,7 +57,7 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
 		}
 	}

-	if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
+	if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
 		return "v11"
 	}
 	return "v12"
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -93,9 +93,10 @@ func initCudaHandles() *cudaHandles {
 		localAppData := os.Getenv("LOCALAPPDATA")
 		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
 	}
-	libDir := LibraryDir()
-	if libDir != "" {
-		cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)}
+	tmpDir, _ := PayloadsDir()
+	if tmpDir != "" {
+		// TODO - add "payloads" for subprocess
+		cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
 	}
 	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)

@@ -652,7 +653,7 @@ func LibraryDir() string {
 		slog.Warn("failed to lookup working directory", "error", err)
 	}
 	// Scan for any of our dependeices, and pick first match
-	for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe()), cwd} {
+	for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} {
 		libDep := filepath.Join("lib", "ollama")
 		if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
 			return filepath.Join(root, libDep)
--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@@ -2,7 +2,7 @@ set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp utils.hpp httplib.h)
+add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
--- a/llm/ext_server/json.hpp
+++ b/llm/ext_server/json.hpp
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -262,7 +262,7 @@ struct server_slot {
       char buffer[512];
        double t_token = t_prompt_processing / n_prompt_tokens_processed;
        double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
-        snprintf(buffer, sizeof(buffer), "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
+        sprintf(buffer, "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
                t_prompt_processing, n_prompt_tokens_processed,
                t_token, n_tokens_second);
        LOG_DEBUG(buffer, {
@@ -276,7 +276,7 @@ struct server_slot {

        t_token = t_token_generation / n_decoded;
        n_tokens_second = 1e3 / t_token_generation * n_decoded;
-        snprintf(buffer, sizeof(buffer), "generation eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)",
+        sprintf(buffer, "generation eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)",
                t_token_generation, n_decoded,
                t_token, n_tokens_second);
        LOG_DEBUG(buffer, {
@@ -288,7 +288,7 @@ struct server_slot {
            {"n_tokens_second",    n_tokens_second},
        });

-        snprintf(buffer, sizeof(buffer), "          total time = %10.2f ms", t_prompt_processing + t_token_generation);
+        sprintf(buffer, "          total time = %10.2f ms", t_prompt_processing + t_token_generation);
        LOG_DEBUG(buffer, {
            {"slot_id",             id},
            {"task_id",             task_id},
@@ -425,7 +425,7 @@ struct llama_server_context

        n_ctx = llama_n_ctx(ctx);

-        add_bos_token = llama_add_bos_token(model);
+        add_bos_token = llama_should_add_bos_token(model);

        return true;
    }
@@ -913,9 +913,7 @@ struct llama_server_context
        slot.sampled = result.tok;

        // search stop word and delete it
-        if (!llama_token_is_eog(model, result.tok))
-            slot.generated_text += token_str;
-
+        slot.generated_text += token_str;
        slot.has_next_token = true;

        if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@@ -956,36 +954,30 @@ struct llama_server_context
        if (!incomplete)
        {
            size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
+            const std::string str_test = slot.generated_text.substr(pos);
+            bool is_stop_full = false;
+            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+            if (stop_pos != std::string::npos)
+            {
+                is_stop_full = true;
+                slot.generated_text.erase(
+                    slot.generated_text.begin() + pos + stop_pos,
+                    slot.generated_text.end());
+                pos = std::min(slot.n_sent_text, slot.generated_text.size());
+            }
+            else
+            {
+                is_stop_full = false;
+                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+            }

-            if (!llama_token_is_eog(model, result.tok)) {
-                const std::string str_test = slot.generated_text.substr(pos);
-                bool is_stop_full = false;
-                size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-                if (stop_pos != std::string::npos)
-                {
-                    is_stop_full = true;
-                    slot.generated_text.erase(
-                        slot.generated_text.begin() + pos + stop_pos,
-                        slot.generated_text.end());
-                    pos = std::min(slot.n_sent_text, slot.generated_text.size());
-                }
-                else
-                {
-                    is_stop_full = false;
-                    stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-                }
-
-                // check if there is any token to predict
-                if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-                {
-                    // no send the stop word in the response
-                    result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                    slot.n_sent_text += result.text_to_send.size();
-                    // add the token to slot queue and cache
-                }
-            } else {
-                    result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                    slot.n_sent_text += result.text_to_send.size();
+            // check if there is any token to predict
+            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
+            {
+                // do not send the stop word in the response
+                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                slot.n_sent_text += result.text_to_send.size();
+                // add the token to slot queue and cache
            }

            if (slot.params.stream)
@@ -1039,7 +1031,7 @@ struct llama_server_context
                continue;
            }

-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
                LOG_TEE("Error processing the given image");
                return false;
            }
@@ -1125,7 +1117,9 @@ struct llama_server_context
            {"multimodal", multimodal}
        };

-        res.result_json["content"] = tkn.text_to_send;
+        if (!llama_token_is_eog(model, tkn.tok)) {
+            res.result_json["content"] = tkn.text_to_send;
+        }

        if (slot.sparams.n_probs > 0)
        {
@@ -2020,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("options:\n");
    printf("  -h, --help                show this help message and exit\n");
    printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.cpuparams.n_threads);
+    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.n_threads);
    printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
    printf("  --threads-http N          number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
    printf("  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
@@ -2293,7 +2287,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                invalid_param = true;
                break;
            }
-            params.cpuparams.n_threads = std::stoi(argv[i]);
+            params.n_threads = std::stoi(argv[i]);
        }
        else if (arg == "--grp-attn-n" || arg == "-gan")
        {
@@ -2321,7 +2315,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                invalid_param = true;
                break;
            }
-            params.cpuparams_batch.n_threads = std::stoi(argv[i]);
+            params.n_threads_batch = std::stoi(argv[i]);
        }
        else if (arg == "--threads-http")
        {
@@ -2632,11 +2626,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
        params.kv_overrides.back().key[0] = 0;
    }

-    postprocess_cpu_params(params.cpuparams, nullptr);
-    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
-
    if (invalid_param)
    {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
@@ -2786,8 +2775,8 @@ int main(int argc, char **argv) {
                            {"commit", LLAMA_COMMIT}});

    LOG_INFO("system info", {
-                                {"n_threads", params.cpuparams.n_threads},
-                                {"n_threads_batch", params.cpuparams_batch.n_threads},
+                                {"n_threads", params.n_threads},
+                                {"n_threads_batch", params.n_threads_batch},
                                {"total_threads", std::thread::hardware_concurrency()},
                                {"system_info", llama_print_system_info()},
                            });
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -31,7 +31,6 @@ init_vars() {
        NO_WHOLE_ARCHIVE=""
        GCC_ARCH="-arch ${ARCH}"
        DIST_BASE=../../dist/darwin-${GOARCH}/
-        PAYLOAD_BASE=../../build/darwin/${GOARCH}
        ;;
    "Linux")
        LIB_EXT="so"
@@ -41,7 +40,6 @@ init_vars() {
        # Cross compiling not supported on linux - Use docker
        GCC_ARCH=""
        DIST_BASE=../../dist/linux-${GOARCH}/
-        PAYLOAD_BASE=../../build/linux/${GOARCH}
        ;;
    *)
        ;;
@@ -49,8 +47,7 @@ init_vars() {
    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    fi
-    GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
-    RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
+    GZIP=$(which pigz 2>/dev/null || echo "gzip")
 }

 git_module_setup() {
@@ -69,47 +66,40 @@ git_module_setup() {
 }

 apply_patches() {
-    # apply temporary patches until fix is upstream
-    for patch in ../patches/*.patch; do
-        git -c 'user.name=nobody' -c 'user.email=<>' -C ${LLAMACPP_DIR} am ${patch}
-    done
+    # Wire up our CMakefile
+    if ! grep ollama ${LLAMACPP_DIR}/CMakeLists.txt; then
+        echo 'add_subdirectory(../ext_server ext_server) # ollama' >>${LLAMACPP_DIR}/CMakeLists.txt
+    fi
+
+    if [ -n "$(ls -A ../patches/*.diff)" ]; then
+        # apply temporary patches until fix is upstream
+        for patch in ../patches/*.diff; do
+            for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
+                (cd ${LLAMACPP_DIR}; git checkout ${file})
+            done
+        done
+        for patch in ../patches/*.diff; do
+            (cd ${LLAMACPP_DIR} && git apply ${patch})
+        done
+    fi
 }

 build() {
    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
-    # remove unnecessary build artifacts
-    rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
 }

-dist() {
-    [ -z "${RUNNER}" ] && exit 1
-    mkdir -p ${RUNNER_BASE}/${RUNNER}/
-    for f in ${BUILD_DIR}/bin/* ; do
-        cp ${f} ${RUNNER_BASE}/${RUNNER}/
-    done
-    # check for lib directory
-    if [ -d ${BUILD_DIR}/lib ]; then
-        for f in ${BUILD_DIR}/lib/* ; do
-            cp ${f} ${RUNNER_BASE}/${RUNNER}/
-        done
-    fi
-}
-
-# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
 compress() {
-    [ -z "${RUNNER}" ] && exit 1
-    echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
-    rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
-    mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
+    echo "Compressing payloads to reduce overall binary size..."
+    rm -rf ${BUILD_DIR}/bin/*.gz
    for f in ${BUILD_DIR}/bin/* ; do
-        ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
+        ${GZIP} -n --best -f ${f} &
        compress_pids+=" $!"
    done
    # check for lib directory
    if [ -d ${BUILD_DIR}/lib ]; then
        for f in ${BUILD_DIR}/lib/* ; do
-            ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
+            ${GZIP} -n --best -f ${f} &
            compress_pids+=" $!"
        done
    fi
@@ -125,7 +115,7 @@ wait_for_compress() {

 install() {
    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
-    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do
+    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} ! -path "${BUILD_DIR}/bin/*"); do # skip libs already in bin/: the rm below would delete the cp source
        rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
        cp -af "${lib}" "${BUILD_DIR}/bin/"
    done
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -19,7 +19,7 @@ sign() {
    fi
 }

-COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
+COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off" # Metal min-version option keeps the GGML_ prefix, matching the other ggml options on this line

 case "${GOARCH}" in
 "amd64")
@@ -39,8 +39,7 @@ case "${GOARCH}" in
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-        RUNNER=cpu
-        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
+        BUILD_DIR="../build/darwin/${ARCH}/cpu"
        echo "Building LCD CPU"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server
@@ -52,8 +51,7 @@ case "${GOARCH}" in
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-        RUNNER=cpu_avx
-        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
+        BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
        echo "Building AVX CPU"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server
@@ -65,8 +63,7 @@ case "${GOARCH}" in
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-        RUNNER=cpu_avx2
-        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
+        BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
        echo "Building AVX2 CPU"
        EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
        build
@@ -87,8 +84,7 @@ case "${GOARCH}" in
    if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
        init_vars
        CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
-        RUNNER="metal"
-        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
+        BUILD_DIR="../build/darwin/${ARCH}/metal"
        EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -79,12 +79,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        init_vars
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
-        RUNNER="cpu"
-        BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
+        BUILD_DIR="../build/linux/${ARCH}/cpu"
        echo "Building custom CPU"
        build
        install
-        dist
        compress
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@@ -104,12 +102,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
            #
            init_vars
            CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-            RUNNER=cpu
-            BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
+            BUILD_DIR="../build/linux/${ARCH}/cpu"
            echo "Building LCD CPU"
            build
            install
-            dist
            compress
        fi

@@ -124,12 +120,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                #
                init_vars
                CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-                RUNNER=cpu_avx
-                BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
+                BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
                echo "Building AVX CPU"
                build
                install
-                dist
                compress
            fi

@@ -140,12 +134,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                #
                init_vars
                CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-                RUNNER=cpu_avx2
-                BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
+                BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
                echo "Building AVX2 CPU"
                build
                install
-                dist
                compress
            fi
        fi
@@ -195,13 +187,11 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
    fi
    export CUDAFLAGS="-t8"
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
-    RUNNER=cuda${CUDA_VARIANT}
-    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
+    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
    export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
    build
    install
-    dist
    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
    mkdir -p "${CUDA_DIST_DIR}"
    for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
@@ -222,8 +212,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
    source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
    CC=icx
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
-    RUNNER=oneapi
-    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
+    BUILD_DIR="../build/linux/${ARCH}/oneapi"
    ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
    export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
    DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
@@ -242,7 +231,6 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
    install
-    dist
    compress
 fi

@@ -271,8 +259,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
        CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
        echo "Building custom ROCM GPU"
    fi
-    RUNNER=rocm${ROCM_VARIANT}
-    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
+    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
    # ROCm dependencies are too large to fit into a unified bundle
    ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
    # TODO figure out how to disable runpath (rpath)
@@ -282,17 +269,13 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then

    # copy the ROCM dependencies
    mkdir -p "${ROCM_DIST_DIR}"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
+    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
        cp -a "${dep}"* "${ROCM_DIST_DIR}"
-        if [ $(readlink -f "${dep}") != "${dep}" ] ; then
-            cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
-        fi
    done
    install
-    dist
    compress
 fi

 cleanup
 wait_for_compress
-echo "go generate completed.  LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"
+echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -83,9 +83,29 @@ function git_module_setup {
 }

 function apply_patches {
+    # Wire up our CMakefile: append the ext_server subdirectory to llama.cpp's CMakeLists.txt unless a line matching 'ollama' is already present
+    if (!(Select-String -Path "${script:llamacppDir}/CMakeLists.txt" -Pattern 'ollama')) {
+        Add-Content -Path "${script:llamacppDir}/CMakeLists.txt" -Value 'add_subdirectory(../ext_server ext_server) # ollama'
+    }
+
     # Apply temporary patches until fix is upstream
-    foreach ($patch in $(Get-ChildItem "../patches/*.patch")) {
-        git -c 'user.name=nobody' -c 'user.email=<>' -C "${script:llamacppDir}" am $patch.FullName
+    $patches = Get-ChildItem "../patches/*.diff"
+    foreach ($patch in $patches) {
+        # Extract file paths from the patch's '+++ ' header lines, dropping the leading path component (e.g. 'b/')
+        $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
+            $parts = $_ -split ' '
+            ($parts[1] -split '/', 2)[1]
+        }
+
+        # Check out each touched file in the llama.cpp tree so the patches are applied to unmodified copies
+        foreach ($file in $filePaths) {
+            git -C "${script:llamacppDir}" checkout $file
+        }
+    }
+
+    # Apply each patch to the llama.cpp tree with 'git apply' (after all touched files were reset above)
+    foreach ($patch in $patches) {
+        git -C "${script:llamacppDir}" apply $patch.FullName
     }
 }

--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -360,13 +360,11 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui

 	switch llm.KV().Architecture() {
 	case "llama":
-		fullOffload = max(
-			4*batch*(1+4*embedding+context*(1+heads)),
-			4*batch*(embedding+vocab),
-		)
+		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))

 		partialOffload = 4 * batch * embedding
 		partialOffload += max(
+			// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
 			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/llm_darwin_amd64.go
+++ b/llm/llm_darwin_amd64.go
@@ -0,0 +1,11 @@
+package llm
+
+import (
+	"embed"
+	"syscall"
+)
+
+//go:embed build/darwin/x86_64/*/bin/*
+var libEmbed embed.FS
+
+var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
--- a/llm/llm_darwin_arm64.go
+++ b/llm/llm_darwin_arm64.go
@@ -1,7 +1,11 @@
 package llm

 import (
+	"embed"
 	"syscall"
 )

+//go:embed build/darwin/arm64/*/bin/*
+var libEmbed embed.FS
+
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
--- a/llm/llm_linux.go
+++ b/llm/llm_linux.go
@@ -1,7 +1,11 @@
 package llm

 import (
+	"embed"
 	"syscall"
 )

+//go:embed build/linux/*/*/bin/*
+var libEmbed embed.FS
+
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
--- a/llm/llm_windows.go
+++ b/llm/llm_windows.go
@@ -1,9 +1,13 @@
 package llm

 import (
+	"embed"
 	"syscall"
 )

+// libEmbed is unused on windows; it has no go:embed directive, so it stays an empty FS.
+var libEmbed embed.FS
+
 const CREATE_DEFAULT_ERROR_MODE = 0x04000000

 var LlamaServerSysProcAttr = &syscall.SysProcAttr{
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -7,7 +7,6 @@ import (
 	"strings"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
@@ -95,7 +94,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// Overflow that didn't fit into the GPU
 	var overflow uint64

-	overhead := envconfig.GpuOverhead()
 	availableList := make([]string, len(gpus))
 	for i, gpu := range gpus {
 		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
@@ -166,22 +164,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			gzo = gpuZeroOverhead
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
-		if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
-			slog.Debug("gpu has too little memory to allocate any layers",
-				"id", gpus[i].ID,
-				"library", gpus[i].Library,
-				"variant", gpus[i].Variant,
-				"compute", gpus[i].Compute,
-				"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
-				"name", gpus[i].Name,
-				"total", format.HumanBytes2(gpus[i].TotalMemory),
-				"available", format.HumanBytes2(gpus[i].FreeMemory),
-				"minimum_memory", gpus[i].MinimumMemory,
-				"layer_size", format.HumanBytes2(layerSize),
-				"gpu_zer_overhead", format.HumanBytes2(gzo),
-				"partial_offload", format.HumanBytes2(graphPartialOffload),
-				"full_offload", format.HumanBytes2(graphFullOffload),
-			)
+		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
 			continue
 		}
 		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
@@ -212,7 +196,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[i%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if (g.g.FreeMemory - overhead) > used+layerSize {
+			if g.g.FreeMemory > used+layerSize {
 				gpuAllocations[g.i] += layerSize
 				layerCounts[g.i]++
 				layerCount++
@@ -235,7 +219,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[layerCount%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
+			if g.g.FreeMemory > used+memoryLayerOutput {
 				gpuAllocations[g.i] += memoryLayerOutput
 				layerCounts[g.i]++
 				layerCount++
@@ -322,7 +306,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 }

 func (m MemoryEstimate) log() {
-	overhead := envconfig.GpuOverhead()
 	slog.Info(
 		"offload to "+m.inferenceLibrary,
 		slog.Group(
@@ -340,7 +323,6 @@ func (m MemoryEstimate) log() {
 			"memory",
 			// memory available by GPU for offloading
 			"available", m.availableList,
-			"gpu_overhead", format.HumanBytes2(overhead),
 			slog.Group(
 				"required",
 				// memory required for full offloading
--- a/llm/patches/0000-cmakelist.patch
+++ b/llm/patches/0000-cmakelist.patch
@@ -1,22 +0,0 @@
-From 8b8d83ffca775840acc5dc700f3b3703e9f5cfe4 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Fri, 23 Aug 2024 11:27:48 -0700
-Subject: [PATCH] patch cmakelist
-
---
- CMakeLists.txt | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index a3132063..6a2a9912 100644
--- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -199,3 +199,5 @@ if (LLAMA_BUILD_EXAMPLES)
-     add_subdirectory(examples)
-     add_subdirectory(pocs)
- endif()
-+
-+add_subdirectory(../ext_server ext_server) # ollama
-- 
-2.45.2
-
--- a/llm/patches/0008-solar-pro.patch
+++ b/llm/patches/0008-solar-pro.patch
@@ -1,402 +0,0 @@
-From 8313ce5f43f11f3d84f352f97f3802792e90e18c Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:16 -0700
-Subject: [PATCH] add solar-pro support
-
-solar-pro introduces block skip connections where blocks are connected
-to other, non-sequential blocks with a scale multiple
-
-this change adds 4 new keys to store the skip connections and one new
-tensor to store the scalar. the scalar is implemented a 1-dimensional
-tensor with 2 elements dervied from the model's bskcn_tv configuration.
-in general, the values are (bskcn_tv, 1 - bskcn_tv)
---
- src/llama.cpp | 267 +++++++++++++++++++++++++++++++++++++++++++++++---
- 1 file changed, 254 insertions(+), 13 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index f79bd782..b7771f53 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -213,6 +213,7 @@ enum llm_arch {
-     LLM_ARCH_NEMOTRON,
-     LLM_ARCH_EXAONE,
-     LLM_ARCH_RWKV6,
-+    LLM_ARCH_SOLAR,
-     LLM_ARCH_UNKNOWN,
- };
- 
-@@ -261,6 +262,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-     { LLM_ARCH_NEMOTRON,        "nemotron"     },
-     { LLM_ARCH_EXAONE,          "exaone"       },
-     { LLM_ARCH_RWKV6,           "rwkv6"        },
-+    { LLM_ARCH_SOLAR,           "solar"        },
-     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
- };
- 
-@@ -314,6 +316,7 @@ enum llm_kv {
-     LLM_KV_ATTENTION_KV_LORA_RANK,
-     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
-     LLM_KV_ATTENTION_SLIDING_WINDOW,
-+    LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
- 
-     LLM_KV_ROPE_DIMENSION_COUNT,
-     LLM_KV_ROPE_FREQ_BASE,
-@@ -405,19 +408,20 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-     { LLM_KV_TIME_MIX_EXTRA_DIM,                "%s.time_mix_extra_dim"                },
-     { LLM_KV_TIME_DECAY_EXTRA_DIM,              "%s.time_decay_extra_dim"              },
- 
-    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
-    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
-    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"         },
-    { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"              },
-    { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"             },
-    { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"           },
-    { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"     },
-    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon" },
-    { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                 },
-    { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"            },
-    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
-    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
-    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
-+    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"               },
-+    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"            },
-+    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"           },
-+    { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"                },
-+    { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"               },
-+    { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"             },
-+    { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"       },
-+    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon"   },
-+    { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                   },
-+    { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"              },
-+    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"             },
-+    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count"   },
-+    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"           },
-+    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,  "%s.attention.block_skip_connection.%d" },
- 
-     { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
-     { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
-@@ -589,6 +593,7 @@ enum llm_tensor {
-     LLM_TENSOR_ENC_FFN_DOWN,
-     LLM_TENSOR_ENC_FFN_UP,
-     LLM_TENSOR_ENC_OUTPUT_NORM,
-+    LLM_TENSOR_BSKCN_TV,
- };
- 
- static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
-@@ -1408,6 +1413,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
-             { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,    "blk.%d.channel_mix_receptance" },
-         },
-     },
-+    {
-+        LLM_ARCH_SOLAR,
-+        {
-+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-+            { LLM_TENSOR_OUTPUT,          "output" },
-+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-+            { LLM_TENSOR_BSKCN_TV,        "bskcn_tv" },
-+        },
-+    },
-     {
-         LLM_ARCH_UNKNOWN,
-         {
-@@ -2237,6 +2260,7 @@ enum e_model {
-     MODEL_15B,
-     MODEL_16B,
-     MODEL_20B,
-+    MODEL_22B,
-     MODEL_30B,
-     MODEL_34B,
-     MODEL_35B,
-@@ -2284,6 +2308,8 @@ struct llama_hparams {
-     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
-     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
- 
-+    std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
-+
-     uint32_t n_layer_dense_lead = 0;
-     uint32_t n_lora_q = 0;
-     uint32_t n_lora_kv = 0;
-@@ -2349,6 +2375,7 @@ struct llama_hparams {
-         if (this->n_head_arr    != other.n_head_arr)    return true;
-         if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
-         if (this->n_ff_arr      != other.n_ff_arr)      return true;
-+        if (this->n_bskcn_arr   != other.n_bskcn_arr)   return true;
- 
-         if (this->n_rel_attn_bkts    != other.n_rel_attn_bkts)    return true;
-         if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-@@ -2455,6 +2482,14 @@ struct llama_hparams {
-             return ssm_d_state * ssm_d_inner;
-         }
-     }
-+
-+    bool n_bskcn(uint32_t n, uint32_t il = 0) const {
-+        if (il < n_layer) {
-+            return n_bskcn_arr[n][il] > 0;
-+        }
-+
-+        GGML_ABORT("fatal error");
-+    }
- };
- 
- static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2635,6 +2670,8 @@ struct llama_layer {
-     struct ggml_tensor * ffn_gate_scale;
-     struct ggml_tensor * ffn_up_scale;
-     struct ggml_tensor * ffn_down_scale;
-+
-+    struct ggml_tensor * bskcn_tv;
- };
- 
- // very similar to llama_batch,
-@@ -5937,6 +5974,21 @@ static void llm_load_hparams(
-                     default: model.type = e_model::MODEL_UNKNOWN;
-                 }
-             } break;
-+        case LLM_ARCH_SOLAR:
-+            {
-+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-+
-+                for (int i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
-+                    auto & bskcn = hparams.n_bskcn_arr.at(i);
-+                    bskcn.fill(0);
-+                    ml.get_key_or_arr(::format(LLM_KV_NAMES.at(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION), LLM_ARCH_NAMES.at(ml.llm_kv.arch), i), bskcn, hparams.n_layer, false);
-+                }
-+
-+                switch (hparams.n_layer) {
-+                    case 64: model.type = e_model::MODEL_22B; break;
-+                    default: model.type = e_model::MODEL_UNKNOWN;
-+                }
-+            }
-         default: (void)0;
-     }
- 
-@@ -8420,6 +8472,38 @@ static bool llm_load_tensors(
-                     }
- 
-                 } break;
-+            case LLM_ARCH_SOLAR:
-+                {
-+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-+
-+                    // output
-+                    {
-+                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-+                    }
-+
-+                    for (int i = 0; i < n_layer; ++i) {
-+                        ggml_context * ctx_layer = ctx_for_layer(i);
-+                        ggml_context * ctx_split = ctx_for_layer_split(i);
-+
-+                        auto & layer = model.layers[i];
-+
-+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-+
-+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
-+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
-+
-+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-+
-+                        layer.bskcn_tv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-+
-+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-+                    }
-+                } break;
-             default:
-                 throw std::runtime_error("unknown architecture");
-         }
-@@ -15173,6 +15257,158 @@ struct llm_build_context {
- 
-         return gf;
-     }
-+
-+    ggml_cgraph * build_solar() {
-+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
-+
-+        // mutable variable, needed during the last layer of the computation to skip unused tokens
-+        int32_t n_tokens = this->n_tokens;
-+
-+        const int64_t n_embd_head = hparams.n_embd_head_v;
-+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-+        GGML_ASSERT(n_embd_head == hparams.n_rot);
-+
-+        struct ggml_tensor * cur;
-+        struct ggml_tensor * inpL;
-+
-+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
-+
-+        // inp_pos - contains the positions
-+        struct ggml_tensor * inp_pos = build_inp_pos();
-+
-+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-+
-+        struct ggml_tensor * bskcn_1;
-+        struct ggml_tensor * bskcn_2;
-+
-+        for (int il = 0; il < n_layer; ++il) {
-+            struct ggml_tensor * inpSA = inpL;
-+
-+            if (hparams.n_bskcn(0, il)) {
-+                bskcn_1 = inpSA;
-+            }
-+
-+            if (hparams.n_bskcn(1, il)) {
-+                bskcn_2 = inpSA;
-+            }
-+
-+            if (hparams.n_bskcn(2, il)) {
-+                inpSA = ggml_add(
-+                   ctx0,
-+                   ggml_mul(ctx0, bskcn_1, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
-+                   ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
-+            }
-+
-+            if (hparams.n_bskcn(3, il)) {
-+                inpSA = ggml_add(
-+                   ctx0,
-+                   ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
-+                   ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
-+            }
-+
-+            // norm
-+            cur = llm_build_norm(ctx0, inpL, hparams,
-+                    model.layers[il].attn_norm, NULL,
-+                    LLM_NORM_RMS, cb, il);
-+            cb(cur, "attn_norm", il);
-+
-+            // self-attention
-+            {
-+                // rope freq factors for llama3; may return nullptr for llama2 and other models
-+                struct ggml_tensor * rope_factors = build_rope_factors(il);
-+
-+                // compute Q and K and RoPE them
-+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-+                cb(Qcur, "Qcur", il);
-+                if (model.layers[il].bq) {
-+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-+                    cb(Qcur, "Qcur", il);
-+                }
-+
-+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
-+                cb(Kcur, "Kcur", il);
-+                if (model.layers[il].bk) {
-+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-+                    cb(Kcur, "Kcur", il);
-+                }
-+
-+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
-+                cb(Vcur, "Vcur", il);
-+                if (model.layers[il].bv) {
-+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-+                    cb(Vcur, "Vcur", il);
-+                }
-+
-+                Qcur = ggml_rope_ext(
-+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
-+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                    ext_factor, attn_factor, beta_fast, beta_slow
-+                );
-+                cb(Qcur, "Qcur", il);
-+
-+                Kcur = ggml_rope_ext(
-+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
-+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                    ext_factor, attn_factor, beta_fast, beta_slow
-+                );
-+                cb(Kcur, "Kcur", il);
-+
-+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
-+                        model.layers[il].wo, model.layers[il].bo,
-+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-+            }
-+
-+            if (il == n_layer - 1) {
-+                // skip computing output for unused tokens
-+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-+                n_tokens = n_outputs;
-+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
-+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-+            }
-+
-+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-+            cb(ffn_inp, "ffn_inp", il);
-+
-+            // feed-forward network
-+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
-+                    model.layers[il].ffn_norm, NULL,
-+                    LLM_NORM_RMS, cb, il);
-+            cb(cur, "ffn_norm", il);
-+
-+            cur = llm_build_ffn(ctx0, lctx, cur,
-+                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
-+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
-+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-+                    NULL,
-+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-+            cb(cur, "ffn_out", il);
-+
-+            cur = ggml_add(ctx0, cur, ffn_inp);
-+            cb(cur, "ffn_out", il);
-+
-+            cur = lctx.cvec.apply_to(ctx0, cur, il);
-+            cb(cur, "l_out", il);
-+
-+            // input for next layer
-+            inpL = cur;
-+        }
-+
-+        cur = inpL;
-+
-+        cur = llm_build_norm(ctx0, cur, hparams,
-+                model.output_norm, NULL,
-+                LLM_NORM_RMS, cb, -1);
-+        cb(cur, "result_norm", -1);
-+
-+        // lm_head
-+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-+        cb(cur, "result_output", -1);
-+
-+        ggml_build_forward_expand(gf, cur);
-+
-+        return gf;
-+    }
- };
- 
- static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
-@@ -15423,6 +15659,10 @@ static struct ggml_cgraph * llama_build_graph(
-             {
-                 result = llm.build_rwkv6();
-             } break;
-+        case LLM_ARCH_SOLAR:
-+            {
-+                result = llm.build_solar();
-+            } break;
-         default:
-             GGML_ABORT("fatal error");
-     }
-@@ -18503,6 +18743,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
-         case LLM_ARCH_ARCTIC:
-         case LLM_ARCH_DEEPSEEK2:
-         case LLM_ARCH_CHATGLM:
-+        case LLM_ARCH_SOLAR:
-             return LLAMA_ROPE_TYPE_NORM;
- 
-         // the pairs of head values are offset by n_rot/2
-- 
-2.46.0
-
--- a/llm/patches/0001-load-progress.patch
+++ b/llm/patches/0001-load-progress.patch
@@ -1,18 +1,8 @@
-From 2cfaa0a04faa9c87ba8f1ac8527eb953e69c6cde Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:10 -0700
-Subject: [PATCH] 01-load-progress.diff
-
---
- common/common.cpp | 2 ++
- common/common.h   | 7 +++++++
- 2 files changed, 9 insertions(+)
-
 diff --git a/common/common.cpp b/common/common.cpp
-index 9fa18472..48ff41e9 100644
+index 2c05a4d4..927f0e3d 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -2573,6 +2573,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
+@@ -2093,6 +2093,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
@@ -22,10 +12,10 @@ index 9fa18472..48ff41e9 100644
         mparams.kv_overrides = NULL;
     } else {
 diff --git a/common/common.h b/common/common.h
-index cb5e7f6d..d8f043f7 100644
+index 65c0ef81..ebca2c77 100644
 --- a/common/common.h
 +++ b/common/common.h
-@@ -204,6 +204,13 @@ struct gpt_params {
+@@ -184,6 +184,13 @@ struct gpt_params {
     std::string mmproj = "";        // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)
 
@@ -39,6 +29,3 @@ index cb5e7f6d..d8f043f7 100644
     // embedding
     bool embedding         = false; // get only sentence embedding
     int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
-- 
-2.46.0
-
--- a/llm/patches/0002-clip-log.patch
+++ b/llm/patches/0002-clip-log.patch
@@ -1,14 +1,5 @@
-From ba4bba80a744f76ac67b8234451c259a3c5da83b Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:11 -0700
-Subject: [PATCH] 02-clip-log.diff
-
---
- examples/llava/clip.cpp | 1 +
- 1 file changed, 1 insertion(+)
-
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 9b890571..cb51793d 100644
+index e431c7f7..f077e688 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
@@ -19,6 +10,3 @@ index 9b890571..cb51793d 100644
 #include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
-- 
-2.46.0
-
--- a/llm/patches/0003-load_exception.patch
+++ b/llm/patches/0003-load_exception.patch
@@ -1,17 +1,8 @@
-From e43bfd3f607a6dfcaba2d490d35f412a52e55e30 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:12 -0700
-Subject: [PATCH] 03-load_exception.diff
-
---
- src/llama.cpp | 25 ++++++++++++++++---------
- 1 file changed, 16 insertions(+), 9 deletions(-)
-
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 88355971..926bb71a 100644
+index 73f52435..58a00fb1 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -8635,7 +8635,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
+@@ -7241,7 +7241,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         }
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
@@ -20,7 +11,7 @@ index 88355971..926bb71a 100644
     }
 
     return 0;
-@@ -18022,16 +18022,23 @@ struct llama_model * llama_load_model_from_file(
+@@ -17564,16 +17564,23 @@ struct llama_model * llama_load_model_from_file(
         }
         model->rpc_servers.push_back(servers);
     }
@@ -52,6 +43,3 @@ index 88355971..926bb71a 100644
     }
 
     return model;
-- 
-2.46.0
-
--- a/llm/patches/0004-metal.patch
+++ b/llm/patches/0004-metal.patch
@@ -1,17 +1,8 @@
-From 29411d9a9d2b6a0af6425ffe88498f17f71f7d5d Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:12 -0700
-Subject: [PATCH] 04-metal.diff
-
---
- ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
- 1 file changed, 13 insertions(+), 17 deletions(-)
-
 diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
-index 91b5e61b..9cfa72ac 100644
+index 0207b787..b5e9884b 100644
 --- a/ggml/src/ggml-metal.m
 +++ b/ggml/src/ggml-metal.m
-@@ -1734,27 +1734,23 @@ static enum ggml_status ggml_metal_graph_compute(
+@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
                         // to the matrix-vector kernel
                         int ne11_mm_min = 1;
 
@@ -52,6 +43,3 @@ index 91b5e61b..9cfa72ac 100644
 
                         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-- 
-2.46.0
-
--- a/llm/patches/0005-default-pretokenizer.patch
+++ b/llm/patches/0005-default-pretokenizer.patch
@@ -1,17 +1,8 @@
-From b298ac8614d1e38da28f760eb1d2ae8af0fbbe62 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:13 -0700
-Subject: [PATCH] 05-default-pretokenizer.diff
-
---
- src/llama.cpp | 14 +++-----------
- 1 file changed, 3 insertions(+), 11 deletions(-)
-
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 926bb71a..d1e959fc 100644
+index a207451f..2ddf431d 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -6083,16 +6083,7 @@ static void llm_load_vocab(
+@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
             vocab.tokenizer_add_space_prefix = false;
             vocab.tokenizer_clean_spaces = true;
@@ -29,9 +20,9 @@ index 926bb71a..d1e959fc 100644
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -6188,7 +6179,8 @@ static void llm_load_vocab(
-                 tokenizer_pre == "exaone") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+@@ -5443,7 +5434,8 @@ static void llm_load_vocab(
+                 tokenizer_pre == "codeshell") {
+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
             } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
@@ -39,6 +30,3 @@ index 926bb71a..d1e959fc 100644
             }
         } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-- 
-2.46.0
-
--- a/llm/patches/0006-embeddings.patch
+++ b/llm/patches/0006-embeddings.patch
@@ -1,45 +1,37 @@
-From c9a6ca9fc039233dee746a4da9705762cd9e515d Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:14 -0700
-Subject: [PATCH] 06-embeddings.diff
-
---
- src/llama.cpp | 17 ++++++++++-------
- 1 file changed, 10 insertions(+), 7 deletions(-)
-
 diff --git a/src/llama.cpp b/src/llama.cpp
-index d1e959fc..f79bd782 100644
+index 1fe2b9f7..a43312a7 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -15898,7 +15898,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+@@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
 -    const bool has_logits = !cparams.embeddings;
 +    const bool has_logits =  cparams.causal_attn;
-     const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+     const bool has_embd   =  lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-@@ -16167,20 +16167,23 @@ static int llama_decode_internal(
+@@ -13959,17 +13959,25 @@ static int llama_decode_internal(
             // no output
             res  = nullptr;
             embd = nullptr;
 -        } else if (cparams.embeddings) {
-            res  = nullptr; // do not extract logits for embedding case
-            embd = nullptr;
+-            res = nullptr; // do not extract logits for embedding case
+-            embd = gf->nodes[gf->n_nodes - 1];
+-            if (strcmp(embd->name, "result_embd_pooled") != 0) {
+-                embd = gf->nodes[gf->n_nodes - 2];
 +        }
 +
 +        if (cparams.embeddings) {
-             for (int i = gf->n_nodes - 1; i >= 0; --i) {
-                if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
-                    embd = gf->nodes[i];
+            for (int i = gf->n_nodes - 1; i >= 0; --i) {
 +                embd = gf->nodes[i];
 +                if (strcmp(embd->name, "result_embd_pooled") == 0) {
-                     break;
-                 }
+                    break;
+                }
             }
-            GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
-         } else {
+             GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
+-        } else {
+         } else {
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
         }
@@ -47,9 +39,7 @@ index d1e959fc..f79bd782 100644
 +        if (!cparams.causal_attn) {
 +            res = nullptr; // do not extract logits when not needed
 +        }
+
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
-- 
-2.46.0
-
--- a/llm/patches/0007-clip-unicode.patch
+++ b/llm/patches/0007-clip-unicode.patch
@@ -1,17 +1,8 @@
-From ae2b188a679c83ce105aa1e823499441dfab3c57 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:15 -0700
-Subject: [PATCH] 07-clip-unicode.diff
-
---
- examples/llava/clip.cpp | 23 +++++++++++++++++++++++
- 1 file changed, 23 insertions(+)
-
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index cb51793d..8716472b 100644
+index 95fbe3d0..5a02a6ec 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
-@@ -41,6 +41,14 @@
+@@ -32,6 +33,14 @@
 #include <cinttypes>
 #include <limits>
 
@@ -26,7 +17,7 @@ index cb51793d..8716472b 100644
 //#define CLIP_DEBUG_FUNCTIONS
 
 // RGB uint8 image
-@@ -1223,7 +1231,22 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -1055,7 +1064,22 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             return nullptr;
         }
 
@@ -49,6 +40,3 @@ index cb51793d..8716472b 100644
         if (!fin) {
             LOG_TEE("cannot open model file for loading tensors\n");
             clip_free(new_clip);
-- 
-2.46.0
-
--- a/llm/patches/09-lora.diff
+++ b/llm/patches/09-lora.diff
@@ -0,0 +1,350 @@
+diff --git a/common/common.cpp b/common/common.cpp
+index 2e8374d5..70d0afde 100644
+--- a/common/common.cpp
+++ b/common/common.cpp
+@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+         loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+         if (loaded_la.adapter == nullptr) {
+             fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
+-            llama_free(lctx);
+-            llama_free_model(model);
+-            return iparams;
+
+            // if that fails, try loading as ggla for compatibility
+            int err = llama_model_apply_lora_from_file(model,
+                                                    la.path.c_str(),
+                                                    la.scale,
+                                                    nullptr,
+                                                    params.n_threads);
+            if (err != 0) {
+                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+                llama_free(lctx);
+                llama_free_model(model);
+                return iparams;
+            } else {
+                break;
+            }
+         }
+         iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+     }
+diff --git a/include/llama.h b/include/llama.h
+index 93fd77ca..b0fb37a6 100644
+--- a/include/llama.h
+++ b/include/llama.h
+@@ -1160,6 +1160,20 @@ extern "C" {
+ 
+     LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+ 
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int32_t llama_model_apply_lora_from_file(
+            const struct llama_model * model,
+                            const char * path_lora,
+                                float   scale,
+                            const char * path_base_model,
+                                int32_t   n_threads);
+
+
+ #ifdef __cplusplus
+ }
+ #endif
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 80a0dd0f..9d7b0e17 100644
+--- a/src/llama.cpp
+++ b/src/llama.cpp
+@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
+     fputs(text, stderr);
+     fflush(stderr);
+ }
+
+// Applies a legacy GGLA-format LoRA adapter file directly to the weights of a
+// loaded model, overwriting each affected tensor with w + scaling * (B * A).
+//
+//   model           - model whose tensors (model.tensors_by_name) are patched
+//   path_lora       - path to the adapter file
+//   scale           - user scale; combined with the file's alpha / r
+//   path_base_model - optional path to a model whose tensors are used as the
+//                     base for the addition; when NULL the loaded model's own
+//                     tensors are used
+//   n_threads       - thread count handed to the CPU backend
+//
+// Returns 0 on success and 1 on any validation or allocation failure.
+// File reads go through llama_file, so I/O problems may surface as
+// exceptions instead of a return code -- presumably std::runtime_error,
+// confirm against llama_file; the public wrapper catches std::exception.
+//
+// Every error return after the CPU backend has been created releases the
+// backend and any per-tensor context/buffer that is live at that point.
+static int llama_apply_lora_from_file_internal(
+    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
+) {
+    LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+    const int64_t t_start_lora_us = ggml_time_us();
+
+    llama_file fin(path_lora, "rb");
+
+    // verify magic and version
+    {
+        uint32_t magic = fin.read_u32();
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
+            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
+            return 1;
+        }
+
+        uint32_t format_version = fin.read_u32();
+        if (format_version != 1) {
+            LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
+            return 1;
+        }
+    }
+
+    // header: rank and alpha, from which the effective scaling is derived
+    int32_t lora_r = fin.read_u32();
+    int32_t lora_alpha = fin.read_u32();
+    float scaling = scale * (float)lora_alpha / (float)lora_r;
+
+    LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+    // load base model
+    std::unique_ptr<llama_model_loader> ml;
+    if (path_base_model) {
+        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
+        ml->init_mappings(/*prefetch*/ false); // no prefetching
+    }
+
+    // location and shape of one adapter tensor inside the file
+    struct tensor_meta {
+        std::string name;
+        ggml_type type;
+        int32_t ne[2];
+        size_t offset;
+    };
+    std::map<std::string, tensor_meta> tensor_meta_map;
+
+    // load all tensor meta
+    // (first pass: index every tensor record, skipping over the data itself)
+    while (true) {
+        if (fin.tell() == fin.size) {
+            // eof
+            break;
+        }
+
+        int32_t n_dims;
+        int32_t name_len;
+        int32_t ftype;
+
+        fin.read_raw(&n_dims, sizeof(n_dims));
+        fin.read_raw(&name_len, sizeof(name_len));
+        fin.read_raw(&ftype, sizeof(ftype));
+
+        if (n_dims != 1 && n_dims != 2) {
+            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
+            return 1;
+        }
+
+        // a 1-d tensor is treated as ne[0] x 1
+        int32_t ne[2] = { 1, 1 };
+        for (int i = 0; i < n_dims; ++i) {
+            fin.read_raw(&ne[i], sizeof(ne[i]));
+        }
+
+        std::string name;
+        {
+            GGML_ASSERT(name_len < GGML_MAX_NAME);
+            char buf[GGML_MAX_NAME];
+            fin.read_raw(buf, name_len);
+            name = std::string(buf, name_len);
+        }
+
+        // check for lora suffix
+        std::string lora_suffix;
+        if (name.length() > 6) {
+            lora_suffix = name.substr(name.length() - 6);
+        }
+        if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
+            LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+            return 1;
+        }
+
+        // tensor type
+        ggml_type wtype;
+        switch (ftype) {
+            case 0: wtype = GGML_TYPE_F32;  break;
+            case 1: wtype = GGML_TYPE_F16;  break;
+            default:
+                    {
+                        LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
+                                __func__, ftype);
+                        return 1;
+                    }
+        }
+
+        // data offset
+        // (tensor data starts at the next multiple of 32 bytes)
+        size_t offset = fin.tell();
+        offset = (offset + 31) & -32;
+
+        // skip tensor data
+        fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
+
+        tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
+    }
+
+    bool warned = false;
+    int n_tensors = 0;
+
+    // apply
+    ggml_backend_t backend_cpu = ggml_backend_cpu_init();
+    if (backend_cpu == nullptr) {
+        LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
+        return 1;
+    }
+    ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
+
+    // scratch buffer reused for every tensor read from the adapter file
+    std::vector<no_init<uint8_t>> read_buf;
+    for (const auto & it : model.tensors_by_name) {
+        const std::string & base_name = it.first;
+        ggml_tensor * model_t = it.second;
+
+        // only tensors that have both halves of the adapter are patched
+        if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
+            tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
+            continue;
+        }
+
+        tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
+        tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
+
+        // metadata-only context; tensor storage comes from backend buffers below
+        ggml_init_params lora_init_params = {
+            /* .mem_size   */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
+            /* .mem_buffer */ nullptr,
+            /* .no_alloc   */ true,
+        };
+        ggml_context * lora_ctx = ggml_init(lora_init_params);
+        if (lora_ctx == nullptr) {
+            LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
+            ggml_backend_free(backend_cpu);
+            return 1;
+        }
+
+        // create tensors
+        ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
+        ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
+        ggml_set_name(loraA, metaA.name.c_str());
+        ggml_set_name(loraB, metaB.name.c_str());
+
+        ggml_tensor * base_t;
+        if (ml) {
+            if (!ml->get_tensor_meta(base_name.c_str())) {
+                LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+                // release what this iteration and the function own before bailing out
+                ggml_free(lora_ctx);
+                ggml_backend_free(backend_cpu);
+                return 1;
+            }
+            base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
+        } else {
+            base_t = ggml_dup_tensor(lora_ctx, model_t);
+        }
+        ggml_set_name(base_t, base_name.c_str());
+
+        // allocate in backend buffer
+        ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
+        if (lora_buf == nullptr) {
+            LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
+            // no buffer was created, but the context and backend still need freeing
+            ggml_free(lora_ctx);
+            ggml_backend_free(backend_cpu);
+            return 1;
+        }
+
+        // load tensor data
+        auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
+            read_buf.resize(ggml_nbytes(tensor));
+            fin.seek(tensor_meta.offset, SEEK_SET);
+            fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
+            ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
+        };
+        load_tensor(metaA, loraA);
+        load_tensor(metaB, loraB);
+
+        // load base model tensor data
+        if (ml) {
+            ml->load_data_for(base_t);
+        } else {
+            ggml_backend_tensor_copy(model_t, base_t);
+        }
+
+        if (ggml_is_quantized(base_t->type) && !warned) {
+            LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+                            "use a f16 or f32 base model with --lora-base\n", __func__);
+            warned = true;
+        }
+
+        if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+            LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+                            " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+            ggml_free(lora_ctx);
+            ggml_backend_buffer_free(lora_buf);
+            ggml_backend_free(backend_cpu);
+            return 1;
+        }
+
+        auto build_lora_graph = [&]() {
+            // w = w + BA*s
+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            ggml_set_name(BA, "BA");
+
+            if (scaling != 1.0f) {
+                BA = ggml_scale(lora_ctx, BA, scaling);
+                ggml_set_name(BA, "BA_scaled");
+            }
+
+            ggml_tensor * r;
+            r = ggml_add_inplace(lora_ctx, base_t, BA);
+            ggml_set_name(r, "r_add");
+
+            if (base_t->type != model_t->type) {
+                // convert the result to the model type
+                r = ggml_cast(lora_ctx, r, model_t->type);
+                ggml_set_name(r, "r_cast");
+            }
+
+            return r;
+        };
+
+        ggml_cgraph * gf = ggml_new_graph(lora_ctx);
+        ggml_tensor * r = build_lora_graph();
+        ggml_build_forward_expand(gf, r);
+
+        // second allocation covers the tensors created while building the graph
+        ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
+        if (graph_buf == nullptr) {
+            LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
+            ggml_free(lora_ctx);
+            ggml_backend_buffer_free(lora_buf);
+            ggml_backend_free(backend_cpu);
+            return 1;
+        }
+
+        ggml_backend_graph_compute(backend_cpu, gf);
+
+        // write the patched weights back into the loaded model's tensor
+        ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
+
+#if 0
+        // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
+        //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
+
+        // sched compute
+        ggml_build_forward_expand(gf, build_graph());
+        ggml_backend_sched_init_measure(sched, gf);
+
+        // create the graph again, since the previous one was destroyed by the measure
+        ggml_graph_clear(gf);
+        ggml_build_forward_expand(gf, build_graph());
+        ggml_backend_sched_graph_compute(sched, gf);
+        ggml_backend_sched_free(sched);
+#endif
+
+        ggml_backend_buffer_free(lora_buf);
+        ggml_backend_buffer_free(graph_buf);
+        ggml_free(lora_ctx);
+
+        // progress indicator: one dot per four patched tensors
+        n_tensors++;
+        if (n_tensors % 4 == 0) {
+            LLAMA_LOG_INFO(".");
+        }
+    }
+
+    ggml_backend_free(backend_cpu);
+
+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+    LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
+
+    return 0;
+}
+
+// Public entry point for applying a GGLA LoRA adapter to a loaded model.
+// Forwards to llama_apply_lora_from_file_internal and converts any
+// std::exception it throws into a logged error and a return value of 1.
+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
+    // start from the failure code; only a completed internal call overwrites it
+    int32_t status = 1;
+    try {
+        status = llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
+    }
+    return status;
+}
+\ No newline at end of file
--- a/llm/patches/11-phi3-sliding-window.diff
+++ b/llm/patches/11-phi3-sliding-window.diff
@@ -0,0 +1,43 @@
+From 6eedae4cf2fcc8015dac79cb3f28f61fcabacab2 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Wed, 31 Jul 2024 14:57:04 -0700
+Subject: [PATCH] phi3 sliding window
+
+---
+ src/llama.cpp | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/src/llama.cpp b/src/llama.cpp
+index a207451f..f2872d4e 100644
+--- a/src/llama.cpp
+++ b/src/llama.cpp
+@@ -4893,7 +4893,7 @@ static void llm_load_hparams(
+             } break;
+         case LLM_ARCH_PHI3:
+             {
+-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ 
+                 switch (hparams.n_layer) {
+@@ -10762,7 +10762,7 @@ struct llm_build_context {
+         struct ggml_tensor * inp_pos = build_inp_pos();
+ 
+         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+-        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+        struct ggml_tensor * KQ_mask = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : build_inp_KQ_mask();
+ 
+         for (int il = 0; il < n_layer; ++il) {
+             auto residual = inpL;
+@@ -10820,7 +10820,7 @@ struct llm_build_context {
+ 
+                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                         model.layers[il].wo, model.layers[il].bo,
+-                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+             }
+ 
+             if (il == n_layer - 1) {
+-- 
+2.45.2
+
--- a/llm/payload.go
+++ b/llm/payload.go
@@ -0,0 +1,233 @@
+package llm
+
+import (
+	"compress/gzip"
+	"errors"
+	"fmt"
+	"io"
+	"io/fs"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"runtime"
+	"slices"
+	"strings"
+
+	"golang.org/x/sync/errgroup"
+
+	"github.com/ollama/ollama/gpu"
+)
+
+var errPayloadMissing = errors.New("expected payloads not included in this build of ollama")
+
+// Init prepares the runner payloads for use. On every platform except
+// Windows it extracts the embedded server binaries into the directory
+// reported by gpu.PayloadsDir, then logs the runner variants found there.
+//
+// An extraction failure is wrapped with %w, so callers can still detect
+// errPayloadMissing with errors.Is.
+func Init() error {
+	payloadsDir, err := gpu.PayloadsDir()
+	if err != nil {
+		return err
+	}
+
+	if runtime.GOOS != "windows" {
+		slog.Info("extracting embedded files", "dir", payloadsDir)
+		binGlob := "build/*/*/*/bin/*"
+
+		// extract server libraries
+		if err := extractFiles(payloadsDir, binGlob); err != nil {
+			return fmt.Errorf("extract binaries: %w", err)
+		}
+	}
+
+	servers := getAvailableServers()
+	variants := make([]string, 0, len(servers))
+	for v := range servers {
+		variants = append(variants, v)
+	}
+	// Map iteration order is random; sort so the log line is stable across runs.
+	slices.Sort(variants)
+	slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
+	slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
+
+	return nil
+}
+
+// getAvailableServers scans the payloads directory for runner binaries and
+// returns a map from the name of each directory that contains one to that
+// directory's full path. A runner binary is any file matching
+// <payloadsDir>/*/ollama_*.
+//
+// Names may carry an optional variant separated by '_', for example
+// "rocm_v6" and "rocm_v5", or "cpu" and "cpu_avx2". A name without a variant
+// is the lowest common denominator.
+//
+// It returns nil when the payloads directory cannot be determined or the
+// glob fails.
+func getAvailableServers() map[string]string {
+	root, err := gpu.PayloadsDir()
+	if err != nil {
+		slog.Error("payload lookup error", "error", err)
+		return nil
+	}
+
+	pattern := filepath.Join(root, "*", "ollama_*")
+	matches, err := filepath.Glob(pattern)
+	if err != nil {
+		slog.Debug("could not glob", "pattern", pattern, "error", err)
+		return nil
+	}
+
+	found := make(map[string]string)
+	for _, match := range matches {
+		slog.Debug("availableServers : found", "file", match)
+		// Several binaries in one directory collapse onto the same entry.
+		dir := filepath.Dir(match)
+		found[filepath.Base(dir)] = dir
+	}
+
+	return found
+}
+
+// serversForGpu returns the names of the runner variants compatible with the
+// given GPU info, in the order they should be tried. It assumes Init() has
+// been called.
+//
+// The order is: the exact "library[_variant]" match, then the remaining
+// variants of the same library in sorted order, then (everywhere except
+// darwin/arm64) a CPU runner as the fallback. A "metal" exact match is
+// returned on its own.
+// TODO - switch to metadata based mapping
+func serversForGpu(info gpu.GpuInfo) []string {
+	available := getAvailableServers()
+
+	want := info.Library
+	if info.Variant != gpu.CPUCapabilityNone.String() {
+		want += "_" + info.Variant
+	}
+
+	servers := []string{}
+
+	// exact match first
+	if _, ok := available[want]; ok {
+		servers = []string{want}
+		if want == "metal" {
+			return servers
+		}
+	}
+
+	isGPU := info.Library != "cpu"
+
+	// Then for GPUs load alternates and sort the list for consistent load ordering
+	if isGPU {
+		var alternates []string
+		for name := range available {
+			if name == want {
+				continue
+			}
+			if lib, _, _ := strings.Cut(name, "_"); lib == info.Library {
+				alternates = append(alternates, name)
+			}
+		}
+
+		slices.Sort(alternates)
+		servers = append(servers, alternates...)
+	}
+
+	// No CPU fallback is added on Apple Silicon.
+	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
+		return servers
+	}
+
+	// Load up the best CPU variant if not primary requested
+	if isGPU {
+		// If no variant, then we fall back to default
+		// If we have a variant, try that if we find an exact match
+		// Attempting to run the wrong CPU instructions will panic the
+		// process
+		capability := gpu.GetCPUCapability()
+		if capability == gpu.CPUCapabilityNone {
+			servers = append(servers, "cpu")
+		} else if name := "cpu_" + capability.String(); hasServer(available, name) {
+			servers = append(servers, name)
+		}
+	}
+
+	if len(servers) == 0 {
+		servers = []string{"cpu"}
+	}
+
+	return servers
+}
+
+// hasServer reports whether name is a key of the available-servers map.
+func hasServer(available map[string]string, name string) bool {
+	_, ok := available[name]
+	return ok
+}
+
+// serverForCpu returns the name of the runner to use for CPU inference:
+// "metal" on darwin/arm64, otherwise "cpu_<capability>" when a runner with
+// that name is available for the capability reported by
+// gpu.GetCPUCapability, and plain "cpu" in every other case.
+func serverForCpu() string {
+	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
+		return "metal"
+	}
+
+	capability := gpu.GetCPUCapability()
+	available := getAvailableServers()
+	if capability == gpu.CPUCapabilityNone {
+		return "cpu"
+	}
+
+	name := "cpu_" + capability.String()
+	if _, ok := available[name]; ok {
+		return name
+	}
+	return "cpu"
+}
+
+// extractFiles copies every embedded file matching glob into targetDir,
+// placing each one in a subdirectory named after its variant.
+//
+// Embedded paths have the form build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE.
+// Files ending in ".gz" are decompressed and written without the suffix. A
+// destination file that already exists is left untouched.
+//
+// It returns errPayloadMissing when the glob fails or matches nothing. Files
+// are extracted concurrently; if any extraction fails, gpu.Cleanup is called
+// and the error from the errgroup is returned. Underlying errors are wrapped
+// with %w so callers can inspect them with errors.Is / errors.As.
+func extractFiles(targetDir string, glob string) error {
+	files, err := fs.Glob(libEmbed, glob)
+	if err != nil || len(files) == 0 {
+		return errPayloadMissing
+	}
+
+	if err := os.MkdirAll(targetDir, 0o755); err != nil {
+		return fmt.Errorf("extractFiles could not mkdir %s: %w", targetDir, err)
+	}
+
+	g := new(errgroup.Group)
+
+	// build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
+	for _, file := range files {
+		// Per-iteration copy: the goroutine below both captures and rewrites it.
+		filename := file
+
+		variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
+
+		slog.Debug("extracting", "variant", variant, "file", filename)
+
+		g.Go(func() error {
+			srcf, err := libEmbed.Open(filename)
+			if err != nil {
+				return fmt.Errorf("open payload %s: %w", filename, err)
+			}
+			defer srcf.Close()
+
+			src := io.Reader(srcf)
+			if strings.HasSuffix(filename, ".gz") {
+				src, err = gzip.NewReader(src)
+				if err != nil {
+					return fmt.Errorf("decompress payload %s: %w", filename, err)
+				}
+				filename = strings.TrimSuffix(filename, ".gz")
+			}
+
+			variantDir := filepath.Join(targetDir, variant)
+			if err := os.MkdirAll(variantDir, 0o755); err != nil {
+				return fmt.Errorf("extractFiles could not mkdir %s: %w", variantDir, err)
+			}
+
+			base := filepath.Base(filename)
+			destFilename := filepath.Join(variantDir, base)
+
+			_, err = os.Stat(destFilename)
+			switch {
+			case errors.Is(err, os.ErrNotExist):
+				destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+				if err != nil {
+					return fmt.Errorf("write payload %s: %w", filename, err)
+				}
+				if _, err := io.Copy(destFile, src); err != nil {
+					// Best-effort close; the copy error is the one worth reporting.
+					destFile.Close()
+					return fmt.Errorf("copy payload %s: %w", filename, err)
+				}
+				// Check Close explicitly: a failure here can mean the payload
+				// was not fully written, which must not pass silently.
+				if err := destFile.Close(); err != nil {
+					return fmt.Errorf("close payload %s: %w", filename, err)
+				}
+			case err != nil:
+				return fmt.Errorf("stat payload %s: %w", filename, err)
+			}
+			return nil
+		})
+	}
+
+	if err := g.Wait(); err != nil {
+		// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
+		gpu.Cleanup()
+		return err
+	}
+	return nil
+}
--- a/llm/server.go
+++ b/llm/server.go
@@ -24,11 +24,9 @@ import (
 	"golang.org/x/sync/semaphore"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/build"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
-	"github.com/ollama/ollama/runners"
 )

 type LlamaServer interface {
@@ -100,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		systemTotalMemory = systemMemInfo.TotalMemory
 		systemFreeMemory = systemMemInfo.FreeMemory
 		systemSwapFreeMemory = systemMemInfo.FreeSwap
-		slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
+		slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
 	}

 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
@@ -108,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		gpus = gpu.GetCPUInfo()
 	}
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
-		cpuRunner = runners.ServerForCpu()
+		cpuRunner = serverForCpu()
 		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
@@ -120,7 +118,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			opts.NumGPU = 0
 		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
-			cpuRunner = runners.ServerForCpu()
+			cpuRunner = serverForCpu()
 			gpus = gpu.GetCPUInfo()
 		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 			opts.NumGPU = estimate.Layers
@@ -147,20 +145,25 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 	}

-	rDir, err := runners.Refresh(build.EmbedFS)
-	if err != nil {
-		return nil, err
-	}
-
-	availableServers := runners.GetAvailableServers(rDir)
+	availableServers := getAvailableServers()
 	if len(availableServers) == 0 {
-		return nil, finalErr
+		if runtime.GOOS != "windows" {
+			slog.Warn("llama server binary disappeared, reinitializing payloads")
+			err = Init()
+			if err != nil {
+				slog.Warn("failed to reinitialize payloads", "error", err)
+				return nil, err
+			}
+			availableServers = getAvailableServers()
+		} else {
+			return nil, finalErr
+		}
 	}
 	var servers []string
 	if cpuRunner != "" {
 		servers = []string{cpuRunner}
 	} else {
-		servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
+		servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
 	}
 	demandLib := envconfig.LLMLibrary()
 	if demandLib != "" {
@@ -271,7 +274,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--tensor-split", estimate.TensorSplit)
 	}

-	for i := range servers {
+	for i := range len(servers) {
 		dir := availableServers[servers[i]]
 		if dir == "" {
 			// Shouldn't happen
@@ -327,7 +330,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		_, err := os.Stat(server)
 		if errors.Is(err, os.ErrNotExist) {
 			slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
-			_, err = runners.Refresh(build.EmbedFS)
+			err = Init()
 			if err != nil {
 				slog.Warn("failed to reinitialize payloads", "error", err)
 				return nil, err
@@ -406,7 +409,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}

 		if err = s.cmd.Start(); err != nil {
-			// Detect permission denied and augment the message about noexec
+			// Detect permission denied and augment the message about noexec
 			if errors.Is(err, os.ErrPermission) {
 				finalErr = fmt.Errorf("unable to start server %w.  %s may have noexec set.  Set OLLAMA_TMPDIR for server to a writable executable directory", err, dir)
 				continue
@@ -581,7 +584,8 @@ func (s *llmServer) Ping(ctx context.Context) error {

 func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 	start := time.Now()
-	stallDuration := envconfig.LoadTimeout()    // If no progress happens
+	stallDuration := 5 * time.Minute            // If no progress happens
+	finalLoadDuration := 5 * time.Minute        // After we hit 100%, give the runner more time to come online
 	stallTimer := time.Now().Add(stallDuration) // give up if we stall

 	slog.Info("waiting for llama runner to start responding")
@@ -633,7 +637,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 				stallTimer = time.Now().Add(stallDuration)
 			} else if !fullyLoaded && int(s.loadProgress*100.0) >= 100 {
 				slog.Debug("model load completed, waiting for server to become available", "status", status.ToString())
-				stallTimer = time.Now().Add(stallDuration)
+				stallTimer = time.Now().Add(finalLoadDuration)
 				fullyLoaded = true
 			}
 			time.Sleep(time.Millisecond * 250)
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -79,7 +79,7 @@ type ChatCompletionRequest struct {
 	Stop             any             `json:"stop"`
 	Temperature      *float64        `json:"temperature"`
 	FrequencyPenalty *float64        `json:"frequency_penalty"`
-	PresencePenalty  *float64        `json:"presence_penalty"`
+	PresencePenalty  *float64        `json:"presence_penalty_penalty"`
 	TopP             *float64        `json:"top_p"`
 	ResponseFormat   *ResponseFormat `json:"response_format"`
 	Tools            []api.Tool      `json:"tools"`
@@ -452,7 +452,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 	}

 	if r.Temperature != nil {
-		options["temperature"] = *r.Temperature
+		options["temperature"] = *r.Temperature * 2.0
 	} else {
 		options["temperature"] = 1.0
 	}
@@ -462,11 +462,11 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 	}

 	if r.FrequencyPenalty != nil {
-		options["frequency_penalty"] = *r.FrequencyPenalty
+		options["frequency_penalty"] = *r.FrequencyPenalty * 2.0
 	}

 	if r.PresencePenalty != nil {
-		options["presence_penalty"] = *r.PresencePenalty
+		options["presence_penalty"] = *r.PresencePenalty * 2.0
 	}

 	if r.TopP != nil {
@@ -513,7 +513,7 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
 	}

 	if r.Temperature != nil {
-		options["temperature"] = *r.Temperature
+		options["temperature"] = *r.Temperature * 2.0
 	} else {
 		options["temperature"] = 1.0
 	}
@@ -522,9 +522,9 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
 		options["seed"] = *r.Seed
 	}

-	options["frequency_penalty"] = r.FrequencyPenalty
+	options["frequency_penalty"] = r.FrequencyPenalty * 2.0

-	options["presence_penalty"] = r.PresencePenalty
+	options["presence_penalty"] = r.PresencePenalty * 2.0

 	if r.TopP != 0.0 {
 		options["top_p"] = r.TopP
--- a/openai/openai_test.go
+++ b/openai/openai_test.go
@@ -22,10 +22,7 @@ const (
 	image  = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=`
 )

-var (
-	False = false
-	True  = true
-)
+var False = false

 func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc {
 	return func(c *gin.Context) {
@@ -73,44 +70,6 @@ func TestChatMiddleware(t *testing.T) {
 				Stream: &False,
 			},
 		},
-		{
-			name: "chat handler with options",
-			body: `{
-				"model": "test-model",
-				"messages": [
-					{"role": "user", "content": "Hello"}
-				],
-				"stream":            true,
-				"max_tokens":        999,
-				"seed":              123,
-				"stop":              ["\n", "stop"],
-				"temperature":       3.0,
-				"frequency_penalty": 4.0,
-				"presence_penalty":  5.0,
-				"top_p":             6.0,
-				"response_format":   {"type": "json_object"}
-			}`,
-			req: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{
-						Role:    "user",
-						Content: "Hello",
-					},
-				},
-				Options: map[string]any{
-					"num_predict":       999.0, // float because JSON doesn't distinguish between float and int
-					"seed":              123.0,
-					"stop":              []any{"\n", "stop"},
-					"temperature":       3.0,
-					"frequency_penalty": 4.0,
-					"presence_penalty":  5.0,
-					"top_p":             6.0,
-				},
-				Format: "json",
-				Stream: &True,
-			},
-		},
 		{
 			name: "chat handler with image content",
 			body: `{
@@ -227,8 +186,6 @@ func TestChatMiddleware(t *testing.T) {
 			req, _ := http.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(tc.body))
 			req.Header.Set("Content-Type", "application/json")

-			defer func() { capturedRequest = nil }()
-
 			resp := httptest.NewRecorder()
 			router.ServeHTTP(resp, req)

@@ -245,6 +202,7 @@ func TestChatMiddleware(t *testing.T) {
 			if !reflect.DeepEqual(tc.err, errResp) {
 				t.Fatal("errors did not match")
 			}
+			capturedRequest = nil
 		})
 	}
 }
@@ -275,7 +233,7 @@ func TestCompletionsMiddleware(t *testing.T) {
 				Options: map[string]any{
 					"frequency_penalty": 0.0,
 					"presence_penalty":  0.0,
-					"temperature":       0.8,
+					"temperature":       1.6,
 					"top_p":             1.0,
 					"stop":              []any{"\n", "stop"},
 				},
--- a/runners/common.go
+++ b/runners/common.go
@@ -1,384 +0,0 @@
-package runners
-
-import (
-	"compress/gzip"
-	"errors"
-	"fmt"
-	"io"
-	"io/fs"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"runtime"
-	"slices"
-	"strconv"
-	"strings"
-	"sync"
-	"syscall"
-
-	"golang.org/x/sync/errgroup"
-
-	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/gpu"
-)
-
-const (
-	binGlob = "*/*/*/*"
-)
-
-var (
-	lock       sync.Mutex
-	runnersDir = ""
-)
-
-// Return the location where runners are stored
-// If runners are payloads, this will either extract them
-// or refresh them if any have disappeared due to tmp cleaners
-func Refresh(payloadFS fs.FS) (string, error) {
-	lock.Lock()
-	defer lock.Unlock()
-	var err error
-
-	// Wire up extra logging on our first load
-	if runnersDir == "" {
-		defer func() {
-			var runners []string
-			for v := range GetAvailableServers(runnersDir) {
-				runners = append(runners, v)
-			}
-			slog.Info("Dynamic LLM libraries", "runners", runners)
-			slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
-		}()
-	}
-
-	if hasPayloads(payloadFS) {
-		if runnersDir == "" {
-			runnersDir, err = extractRunners(payloadFS)
-		} else {
-			err = refreshRunners(payloadFS, runnersDir)
-		}
-	} else if runnersDir == "" {
-		runnersDir, err = locateRunners()
-	}
-
-	return runnersDir, err
-}
-
-func Cleanup(payloadFS fs.FS) {
-	lock.Lock()
-	defer lock.Unlock()
-	if hasPayloads(payloadFS) && runnersDir != "" {
-		// We want to fully clean up the tmpdir parent of the payloads dir
-		tmpDir := filepath.Clean(filepath.Join(runnersDir, ".."))
-		slog.Debug("cleaning up", "dir", tmpDir)
-		err := os.RemoveAll(tmpDir)
-		if err != nil {
-			slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
-		}
-	}
-}
-
-func locateRunners() (string, error) {
-	exe, err := os.Executable()
-	if err != nil {
-		return "", err
-	}
-
-	cwd, err := os.Getwd()
-	if err != nil {
-		return "", err
-	}
-
-	var paths []string
-	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe()), cwd} {
-		paths = append(paths,
-			root,
-			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
-			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
-		)
-	}
-
-	// Try a few variations to improve developer experience when building from source in the local tree
-	for _, path := range paths {
-		candidate := filepath.Join(path, "lib", "ollama", "runners")
-		if _, err := os.Stat(candidate); err == nil {
-			return candidate, nil
-		}
-	}
-	return "", fmt.Errorf("unable to locate runners in any search path %v", paths)
-}
-
-// Return true if we're carying nested payloads for the runners
-func hasPayloads(payloadFS fs.FS) bool {
-	files, err := fs.Glob(payloadFS, binGlob)
-	if err != nil || len(files) == 0 || (len(files) == 1 && strings.Contains(files[0], "placeholder")) {
-		return false
-	}
-	return true
-}
-
-func extractRunners(payloadFS fs.FS) (string, error) {
-	cleanupTmpDirs()
-	tmpDir, err := os.MkdirTemp(envconfig.TmpDir(), "ollama")
-	if err != nil {
-		return "", fmt.Errorf("failed to generate tmp dir: %w", err)
-	}
-	// Track our pid so we can clean up orphaned tmpdirs
-	n := filepath.Join(tmpDir, "ollama.pid")
-	if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
-		slog.Warn("failed to write pid file", "file", n, "error", err)
-	}
-	// We create a distinct subdirectory for payloads within the tmpdir
-	// This will typically look like /tmp/ollama3208993108/runners on linux
-	rDir := filepath.Join(tmpDir, "runners")
-
-	slog.Info("extracting embedded files", "dir", rDir)
-	return rDir, refreshRunners(payloadFS, rDir)
-}
-
-func refreshRunners(payloadFS fs.FS, rDir string) error {
-	// extract or refresh server libraries
-	err := extractFiles(payloadFS, rDir, binGlob)
-	if err != nil {
-		return fmt.Errorf("extract binaries: %v", err)
-	}
-	return nil
-}
-
-// extract extracts the embedded files to the target directory
-func extractFiles(payloadFS fs.FS, targetDir string, glob string) error {
-	files, err := fs.Glob(payloadFS, glob)
-	if err != nil || len(files) == 0 {
-		// Should not happen
-		return fmt.Errorf("extractFiles called without payload present")
-	}
-
-	if err := os.MkdirAll(targetDir, 0o755); err != nil {
-		return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
-	}
-
-	g := new(errgroup.Group)
-
-	// $OS/$GOARCH/$RUNNER/$FILE
-	for _, file := range files {
-		filename := file
-
-		runner := filepath.Base(filepath.Dir(filename))
-
-		slog.Debug("extracting", "runner", runner, "payload", filename)
-
-		g.Go(func() error {
-			srcf, err := payloadFS.Open(filename)
-			if err != nil {
-				return err
-			}
-			defer srcf.Close()
-
-			src := io.Reader(srcf)
-			if strings.HasSuffix(filename, ".gz") {
-				src, err = gzip.NewReader(src)
-				if err != nil {
-					return fmt.Errorf("decompress payload %s: %v", filename, err)
-				}
-				filename = strings.TrimSuffix(filename, ".gz")
-			}
-
-			runnerDir := filepath.Join(targetDir, runner)
-			if err := os.MkdirAll(runnerDir, 0o755); err != nil {
-				return fmt.Errorf("extractFiles could not mkdir %s: %v", runnerDir, err)
-			}
-
-			base := filepath.Base(filename)
-			destFilename := filepath.Join(runnerDir, base)
-
-			_, err = os.Stat(destFilename)
-			switch {
-			case errors.Is(err, os.ErrNotExist):
-				destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-				if err != nil {
-					return fmt.Errorf("write payload %s: %v", filename, err)
-				}
-				defer destFile.Close()
-				if _, err := io.Copy(destFile, src); err != nil {
-					return fmt.Errorf("copy payload %s: %v", filename, err)
-				}
-			case err != nil:
-				return fmt.Errorf("stat payload %s: %v", filename, err)
-			}
-			return nil
-		})
-	}
-
-	err = g.Wait()
-	if err != nil {
-		slog.Error("failed to extract files", "error", err)
-		// If we fail to extract, the payload dir is most likely unusable, so cleanup whatever we extracted
-		err := os.RemoveAll(targetDir)
-		if err != nil {
-			slog.Warn("failed to cleanup incomplete payload dir", "dir", targetDir, "error", err)
-		}
-		return err
-	}
-	return nil
-}
-
-// Best effort to clean up prior tmpdirs
-func cleanupTmpDirs() {
-	tmpDir := envconfig.TmpDir()
-	if tmpDir == "" {
-		tmpDir = os.TempDir()
-	}
-	matches, err := filepath.Glob(filepath.Join(tmpDir, "ollama*", "ollama.pid"))
-	if err != nil {
-		return
-	}
-
-	for _, match := range matches {
-		raw, err := os.ReadFile(match)
-		if errors.Is(err, os.ErrNotExist) {
-			slog.Debug("not a ollama runtime directory, skipping", "path", match)
-			continue
-		} else if err != nil {
-			slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
-			continue
-		}
-
-		pid, err := strconv.Atoi(string(raw))
-		if err != nil {
-			slog.Warn("invalid pid, skipping", "path", match, "error", err)
-			continue
-		}
-
-		p, err := os.FindProcess(pid)
-		if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
-			slog.Warn("process still running, skipping", "pid", pid, "path", match)
-			continue
-		}
-
-		if err := os.Remove(match); err != nil {
-			slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
-		}
-
-		runners := filepath.Join(filepath.Dir(match), "runners")
-		if err := os.RemoveAll(runners); err != nil {
-			slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
-		}
-
-		if err := os.Remove(filepath.Dir(match)); err != nil {
-			slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
-		}
-	}
-}
-
-// directory names are the name of the runner and may contain an optional
-// variant prefixed with '_' as the separator. For example, "cuda_v11" and
-// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
-// lowest common denominator
-func GetAvailableServers(payloadsDir string) map[string]string {
-	if payloadsDir == "" {
-		slog.Error("empty runner dir")
-		return nil
-	}
-
-	// glob payloadsDir for files that start with ollama_
-	pattern := filepath.Join(payloadsDir, "*", "ollama_*")
-
-	files, err := filepath.Glob(pattern)
-	if err != nil {
-		slog.Debug("could not glob", "pattern", pattern, "error", err)
-		return nil
-	}
-
-	servers := make(map[string]string)
-	for _, file := range files {
-		slog.Debug("availableServers : found", "file", file)
-		servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
-	}
-
-	return servers
-}
-
-// serversForGpu returns a list of compatible servers give the provided GPU
-// info, ordered by performance. assumes Init() has been called
-// TODO - switch to metadata based mapping
-func ServersForGpu(info gpu.GpuInfo) []string {
-	// glob workDir for files that start with ollama_
-	availableServers := GetAvailableServers(runnersDir)
-	requested := info.Library
-	if info.Variant != gpu.CPUCapabilityNone.String() {
-		requested += "_" + info.Variant
-	}
-
-	servers := []string{}
-
-	// exact match first
-	for a := range availableServers {
-		if a == requested {
-			servers = []string{a}
-
-			if a == "metal" {
-				return servers
-			}
-
-			break
-		}
-	}
-
-	alt := []string{}
-
-	// Then for GPUs load alternates and sort the list for consistent load ordering
-	if info.Library != "cpu" {
-		for a := range availableServers {
-			if info.Library == strings.Split(a, "_")[0] && a != requested {
-				alt = append(alt, a)
-			}
-		}
-
-		slices.Sort(alt)
-		servers = append(servers, alt...)
-	}
-
-	if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
-		// Load up the best CPU variant if not primary requested
-		if info.Library != "cpu" {
-			variant := gpu.GetCPUCapability()
-			// If no variant, then we fall back to default
-			// If we have a variant, try that if we find an exact match
-			// Attempting to run the wrong CPU instructions will panic the
-			// process
-			if variant != gpu.CPUCapabilityNone {
-				for cmp := range availableServers {
-					if cmp == "cpu_"+variant.String() {
-						servers = append(servers, cmp)
-						break
-					}
-				}
-			} else {
-				servers = append(servers, "cpu")
-			}
-		}
-
-		if len(servers) == 0 {
-			servers = []string{"cpu"}
-		}
-	}
-
-	return servers
-}
-
-// Return the optimal server for this CPU architecture
-func ServerForCpu() string {
-	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-		return "metal"
-	}
-	variant := gpu.GetCPUCapability()
-	availableServers := GetAvailableServers(runnersDir)
-	if variant != gpu.CPUCapabilityNone {
-		for cmp := range availableServers {
-			if cmp == "cpu_"+variant.String() {
-				return cmp
-			}
-		}
-	}
-	return "cpu"
-}
--- a/runners/runners_test.go
+++ b/runners/runners_test.go
@@ -1,50 +0,0 @@
-package runners
-
-import (
-	"log/slog"
-	"os"
-	"path"
-	"runtime"
-	"strings"
-	"testing"
-	"testing/fstest"
-)
-
-func TestRefreshRunners(t *testing.T) {
-	slog.SetLogLoggerLevel(slog.LevelDebug)
-
-	payloadFS := fstest.MapFS{
-		path.Join(runtime.GOOS, runtime.GOARCH, "foo", "ollama_llama_server"): {Data: []byte("hello, world\n")},
-	}
-	tmpDir, err := os.MkdirTemp("", "testing")
-	if err != nil {
-		t.Fatalf("failed to make tmp dir %s", err)
-	}
-	t.Setenv("OLLAMA_TMPDIR", tmpDir)
-	rDir, err := Refresh(payloadFS)
-	if err != nil {
-		t.Fatalf("failed to extract to %s %s", tmpDir, err)
-	}
-	if !strings.Contains(rDir, tmpDir) {
-		t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
-	}
-
-	// spot check results
-	servers := GetAvailableServers(rDir)
-	if len(servers) < 1 {
-		t.Fatalf("expected at least 1 server")
-	}
-
-	// Refresh contents
-	rDir, err = extractRunners(payloadFS)
-	if err != nil {
-		t.Fatalf("failed to extract to %s %s", tmpDir, err)
-	}
-	if !strings.Contains(rDir, tmpDir) {
-		t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
-	}
-
-	cleanupTmpDirs()
-
-	Cleanup(payloadFS)
-}
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -2,7 +2,8 @@

 set -e

-. $(dirname $0)/env.sh
+export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
+export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"

 mkdir -p dist

--- a/scripts/build_docker.sh
+++ b/scripts/build_docker.sh
@@ -2,34 +2,76 @@

 set -eu

-. $(dirname $0)/env.sh
+export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
+export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
+
+# We use 2 different image repositories to handle combining architecture images into multiarch manifest
+# (The ROCm image is x86 only and is not a multiarch manifest)
+# For developers, you can override the DOCKER_ORG to generate multiarch manifests
+#  DOCKER_ORG=jdoe PUSH=1 ./scripts/build_docker.sh
+DOCKER_ORG=${DOCKER_ORG:-"ollama"}
+RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
+FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
+
+BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}

 # Set PUSH to a non-empty string to trigger push instead of load
 PUSH=${PUSH:-""}

+# In CI mode, we break things down
+OLLAMA_SKIP_MANIFEST_CREATE=${OLLAMA_SKIP_MANIFEST_CREATE:-""}
+OLLAMA_SKIP_IMAGE_BUILD=${OLLAMA_SKIP_IMAGE_BUILD:-""}
+
 if [ -z "${PUSH}" ] ; then
-    echo "Building ${FINAL_IMAGE_REPO}:$VERSION locally.  set PUSH=1 to push"
    LOAD_OR_PUSH="--load"
 else
-    echo "Will be pushing ${FINAL_IMAGE_REPO}:$VERSION"
+    echo "Will be pushing ${RELEASE_IMAGE_REPO}:$VERSION for ${BUILD_ARCH}"
    LOAD_OR_PUSH="--push"
 fi

-docker buildx build \
-    ${LOAD_OR_PUSH} \
-    --platform=${PLATFORM} \
-    ${OLLAMA_COMMON_BUILD_ARGS} \
-    -f Dockerfile \
-    -t ${FINAL_IMAGE_REPO}:$VERSION \
-    .
+if [ -z "${OLLAMA_SKIP_IMAGE_BUILD}" ]; then
+    for TARGETARCH in ${BUILD_ARCH}; do
+        docker build \
+            ${LOAD_OR_PUSH} \
+            --platform=linux/${TARGETARCH} \
+            --build-arg=VERSION \
+            --build-arg=GOFLAGS \
+            -f Dockerfile \
+            -t ${RELEASE_IMAGE_REPO}:$VERSION-${TARGETARCH} \
+            .
+    done

-if echo $PLATFORM | grep "amd64" > /dev/null; then
-    docker buildx build \
-        ${LOAD_OR_PUSH} \
-        --platform=linux/amd64 \
-        ${OLLAMA_COMMON_BUILD_ARGS} \
-        --target runtime-rocm \
-        -f Dockerfile \
-        -t ${FINAL_IMAGE_REPO}:$VERSION-rocm \
-        .
-fi
+    if echo ${BUILD_ARCH} | grep "amd64" > /dev/null; then
+        docker build \
+            ${LOAD_OR_PUSH} \
+            --platform=linux/amd64 \
+            --build-arg=VERSION \
+            --build-arg=GOFLAGS \
+            --target runtime-rocm \
+            -f Dockerfile \
+            -t ${RELEASE_IMAGE_REPO}:$VERSION-rocm \
+            .
+    fi
+fi
+
+if [ -z "${OLLAMA_SKIP_MANIFEST_CREATE}" ]; then
+    if [ -n "${PUSH}" ]; then
+        docker manifest create ${FINAL_IMAGE_REPO}:$VERSION \
+            ${RELEASE_IMAGE_REPO}:$VERSION-amd64 \
+            ${RELEASE_IMAGE_REPO}:$VERSION-arm64
+        docker manifest push ${FINAL_IMAGE_REPO}:$VERSION
+
+        # For symmetry, tag/push the rocm image
+        if [ "${RELEASE_IMAGE_REPO}" != "${FINAL_IMAGE_REPO}" ]; then
+            echo "Tagging and pushing rocm image"
+            docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm
+            docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:$VERSION-rocm
+            docker push ${FINAL_IMAGE_REPO}:$VERSION-rocm
+        fi
+    else
+        echo "Skipping manifest generation when not pushing images are available locally as "
+        echo "  ${RELEASE_IMAGE_REPO}:$VERSION-amd64"
+        echo "  ${RELEASE_IMAGE_REPO}:$VERSION-arm64"
+        echo "  ${RELEASE_IMAGE_REPO}:$VERSION-rocm"
+    fi
+fi
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -1,29 +1,37 @@
 #!/bin/sh
-#
-# Mac ARM users, rosetta can be flaky, so to use a remote x86 builder
-#
-# docker context create amd64 --docker host=ssh://mybuildhost
-# docker buildx create --name mybuilder amd64 --platform linux/amd64
-# docker buildx create --name mybuilder --append desktop-linux --platform linux/arm64
-# docker buildx use mybuilder
-

 set -eu

-. $(dirname $0)/env.sh
+export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
+export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
+GZIP=$(which pigz 2>/dev/null || echo "gzip")

+BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
+export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
 mkdir -p dist

-docker buildx build \
-        --output type=local,dest=./dist/ \
-        --platform=${PLATFORM} \
-        ${OLLAMA_COMMON_BUILD_ARGS} \
-        --target dist \
+for TARGETARCH in ${BUILD_ARCH}; do
+    docker build \
+        --platform=linux/$TARGETARCH \
+        --build-arg=GOFLAGS \
+        --build-arg=CGO_CFLAGS \
+        --build-arg=OLLAMA_CUSTOM_CPU_DEFS \
+        --build-arg=AMDGPU_TARGETS \
+        --target build-$TARGETARCH \
        -f Dockerfile \
+        -t builder:$TARGETARCH \
        .
-
-# buildx behavior changes for single vs. multiplatform
-if echo $PLATFORM | grep "," > /dev/null ; then 
-        mv -f ./dist/linux_*64/ollama* ./dist/
-        rmdir ./dist/linux_*64
-fi
+    docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
+    rm -rf ./dist/linux-$TARGETARCH
+    docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist
+    if echo ${TARGETARCH} | grep "amd64" > /dev/null; then
+        docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist
+    fi
+    docker rm builder-$TARGETARCH
+    echo "Compressing final linux bundle..."
+    rm -f ./dist/ollama-linux-$TARGETARCH.tgz
+    (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
+    if [ -d dist/linux-$TARGETARCH-rocm ]; then
+        (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz )
+    fi
+done
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -122,8 +122,8 @@ function buildOllama() {
            /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    }
-    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
-    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
+    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force
+    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\
 }

 function buildApp() {
--- a/scripts/env.sh
+++ b/scripts/env.sh
@@ -1,14 +0,0 @@
-# Common environment setup across build*.sh scripts
-
-export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
-export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
-# TODO - consider `docker buildx ls --format=json` to autodiscover platform capability
-PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"}
-DOCKER_ORG=${DOCKER_ORG:-"ollama"}
-RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
-FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
-OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION --build-arg=GOFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS --build-arg=AMDGPU_TARGETS"
-
-echo "Building Ollama"
-echo "VERSION=$VERSION"
-echo "PLATFORM=$PLATFORM"
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -38,7 +38,7 @@ IS_WSL2=false
 KERN=$(uname -r)
 case "$KERN" in
    *icrosoft*WSL2 | *icrosoft*wsl2) IS_WSL2=true;;
-    *icrosoft) error "Microsoft WSL1 is not currently supported. Please use WSL2 with 'wsl --set-version <distro> 2'" ;;
+    *icrosoft) error "Microsoft WSL1 is not currently supported. Please upgrade to WSL2 with 'wsl --set-version <distro> 2'" ;;
    *) ;;
 esac

@@ -356,12 +356,12 @@ if ! lsmod | grep -q nvidia || ! lsmod | grep -q nvidia_uvm; then
 fi

 # make sure the NVIDIA modules are loaded on boot with nvidia-persistenced
-if available nvidia-persistenced; then
+if command -v nvidia-persistenced > /dev/null 2>&1; then
    $SUDO touch /etc/modules-load.d/nvidia.conf
    MODULES="nvidia nvidia-uvm"
    for MODULE in $MODULES; do
        if ! grep -qxF "$MODULE" /etc/modules-load.d/nvidia.conf; then
-            echo "$MODULE" | $SUDO tee -a /etc/modules-load.d/nvidia.conf > /dev/null
+            echo "$MODULE" | sudo tee -a /etc/modules-load.d/nvidia.conf > /dev/null
        fi
    done
 fi
--- a/scripts/rh_linux_deps.sh
+++ b/scripts/rh_linux_deps.sh
@@ -30,7 +30,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then
        dnf install -y rh-git227-git
        ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
    fi
-    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz findutils
+    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz
 elif grep -i "rocky" /etc/system-release >/dev/null; then
    # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC)
    cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo
@@ -45,7 +45,6 @@ EOF
    dnf install -y git \
        gcc-toolset-10-gcc-10.2.1-8.2.el8 \
        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
-        findutils \
        pigz
 else
    echo "ERROR Unexpected distro"
--- a/scripts/tag_latest.sh
+++ b/scripts/tag_latest.sh
@@ -2,12 +2,32 @@

 set -eu

+# We use 2 different image repositories to handle combining architecture images into a multiarch manifest
+# (The ROCm image is x86 only and is not a multiarch manifest)
 # For developers, you can override the DOCKER_ORG to generate multiarch manifests
-#  DOCKER_ORG=jdoe VERSION=0.1.30 ./scripts/tag_latest.sh
+#  DOCKER_ORG=jdoe VERSION=0.1.30 PUSH=1 ./scripts/tag_latest.sh
 DOCKER_ORG=${DOCKER_ORG:-"ollama"}
+RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
 FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}

-echo "Updating ${FINAL_IMAGE_REPO}:latest -> ${FINAL_IMAGE_REPO}:${VERSION}"
-docker buildx imagetools create -t ${FINAL_IMAGE_REPO}:latest ${FINAL_IMAGE_REPO}:${VERSION}
-echo "Updating ${FINAL_IMAGE_REPO}:rocm -> ${FINAL_IMAGE_REPO}:${VERSION}-rocm"
-docker buildx imagetools create -t ${FINAL_IMAGE_REPO}:rocm ${FINAL_IMAGE_REPO}:${VERSION}-rocm
+# Set PUSH to a non-empty string to push the tags; when empty, the tags are assembled locally but not pushed
+PUSH=${PUSH:-""}
+
+echo "Assembling manifest and tagging latest"
+docker manifest rm ${FINAL_IMAGE_REPO}:latest || true
+docker manifest create ${FINAL_IMAGE_REPO}:latest \
+    ${RELEASE_IMAGE_REPO}:$VERSION-amd64 \
+    ${RELEASE_IMAGE_REPO}:$VERSION-arm64
+
+docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm
+docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:rocm
+
+if [ -n "${PUSH}" ]; then
+    echo "Pushing latest tags up..."
+    docker manifest push ${FINAL_IMAGE_REPO}:latest
+    docker push ${FINAL_IMAGE_REPO}:rocm
+else
+    echo "Not pushing ${FINAL_IMAGE_REPO}:latest and ${FINAL_IMAGE_REPO}:rocm"
+fi
+
+
--- a/server/download.go
+++ b/server/download.go
@@ -256,7 +256,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 				continue
 			}
 			defer resp.Body.Close()
-			if resp.StatusCode != http.StatusTemporaryRedirect && resp.StatusCode != http.StatusOK {
+			if resp.StatusCode != http.StatusTemporaryRedirect {
 				return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
 			}
 			return resp.Location()
--- a/server/model.go
+++ b/server/model.go
@@ -272,30 +272,6 @@ func detectContentType(r io.Reader) (string, error) {
 	return "unknown", nil
 }

-func parseObjects(s string) []map[string]any {
-	var objs []map[string]any
-	for offset := 0; offset < len(s); {
-		var obj map[string]any
-		decoder := json.NewDecoder(strings.NewReader(s[offset:]))
-		if err := decoder.Decode(&obj); errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
-			break
-		} else if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
-			// skip over any syntax errors
-			offset += int(syntax.Offset)
-		} else if unmarshalType := &(json.UnmarshalTypeError{}); errors.As(err, &unmarshalType) {
-			// skip over any unmarshalable types
-			offset += int(unmarshalType.Offset)
-		} else if err != nil {
-			return nil
-		} else {
-			offset += int(decoder.InputOffset())
-			objs = append(objs, obj)
-		}
-	}
-
-	return objs
-}
-
 // parseToolCalls attempts to parse a JSON string into a slice of ToolCalls.
 // mxyng: this only really works if the input contains tool calls in some JSON format
 func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
@@ -328,14 +304,16 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
 		return nil, false
 	}

-	templateObjects := parseObjects(b.String())
-	if len(templateObjects) == 0 {
+	var kv map[string]any
+	// execute the subtree with placeholders to identify the keys
+	// trim a trailing comma that might exist in the template
+	if err := json.Unmarshal(bytes.TrimSuffix(b.Bytes(), []byte(",")), &kv); err != nil {
 		return nil, false
 	}

 	// find the keys that correspond to the name and arguments fields
 	var name, arguments string
-	for k, v := range templateObjects[0] {
+	for k, v := range kv {
 		switch v.(type) {
 		case string:
 			name = k
@@ -348,32 +326,43 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
 		return nil, false
 	}

-	responseObjects := parseObjects(s)
-	if len(responseObjects) == 0 {
-		return nil, false
-	}
-
-	// collect all nested objects
-	var collect func(any) []map[string]any
-	collect = func(obj any) (all []map[string]any) {
-		switch o := obj.(type) {
-		case map[string]any:
-			all = append(all, o)
-			for _, v := range o {
-				all = append(all, collect(v)...)
-			}
-		case []any:
-			for _, v := range o {
-				all = append(all, collect(v)...)
-			}
-		}
-
-		return all
-	}
-
 	var objs []map[string]any
-	for _, p := range responseObjects {
-		objs = append(objs, collect(p)...)
+	for offset := 0; offset < len(s); {
+		var obj map[string]any
+		decoder := json.NewDecoder(strings.NewReader(s[offset:]))
+		if err := decoder.Decode(&obj); errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
+			break
+		} else if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
+			// skip over any syntax errors
+			offset += int(syntax.Offset)
+		} else if unmarshalType := &(json.UnmarshalTypeError{}); errors.As(err, &unmarshalType) {
+			// skip over any unmarshalable types
+			offset += int(unmarshalType.Offset)
+		} else if err != nil {
+			slog.Error("parseToolCalls", "error", err)
+			return nil, false
+		} else {
+			offset += int(decoder.InputOffset())
+
+			// collect all nested objects
+			var collect func(any) []map[string]any
+			collect = func(obj any) (all []map[string]any) {
+				switch o := obj.(type) {
+				case map[string]any:
+					all = append(all, o)
+					for _, v := range o {
+						all = append(all, collect(v)...)
+					}
+				case []any:
+					for _, v := range o {
+						all = append(all, collect(v)...)
+					}
+				}
+
+				return all
+			}
+			objs = append(objs, collect(obj)...)
+		}
 	}

 	var toolCalls []api.ToolCall
--- a/server/model_test.go
+++ b/server/model_test.go
@@ -69,7 +69,6 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`,
 {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}
 </tool_call>`, true},
 		{"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true},
-		{"nemotron", `<toolcall>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]} </toolcall>`, true},
 	}

 	var tools []api.Tool
@@ -140,7 +139,6 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`,

 func TestParseFromFileFromLayer(t *testing.T) {
 	tempModels := t.TempDir()
-	t.Setenv("OLLAMA_MODELS", tempModels)

 	file, err := os.CreateTemp(tempModels, "")
 	if err != nil {
@@ -191,7 +189,6 @@ func TestParseFromFileFromLayer(t *testing.T) {

 func TestParseLayerFromCopy(t *testing.T) {
 	tempModels := t.TempDir()
-	t.Setenv("OLLAMA_MODELS", tempModels)

 	file2, err := os.CreateTemp(tempModels, "")
 	if err != nil {
@@ -218,45 +215,3 @@ func TestParseLayerFromCopy(t *testing.T) {
 		t.Fatalf("got %d != want 5", len(layers))
 	}
 }
-
-func TestParseObjects(t *testing.T) {
-	tests := []struct {
-		input string
-		want  []map[string]any
-	}{
-		{
-			input: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
-			want: []map[string]any{
-				{"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}},
-				{"name": "get_current_weather", "arguments": map[string]any{"format": "celsius", "location": "Toronto, Canada"}},
-			},
-		},
-		{
-			input: `<toolcall>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </toolcall>`,
-			want: []map[string]any{
-				{"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}},
-			},
-		},
-		{
-			input: `<toolcall>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </toolcall> <toolcall>{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, ON"}} </toolcall>`,
-			want: []map[string]any{
-				{"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}},
-				{"name": "get_current_weather", "arguments": map[string]any{"format": "celsius", "location": "Toronto, ON"}},
-			},
-		},
-		{
-			input: `{"name": "get_current_weather", "arguments": `,
-			want:  nil,
-		},
-	}
-
-	for _, tc := range tests {
-		t.Run(tc.input, func(t *testing.T) {
-			got := parseObjects(tc.input)
-
-			if diff := cmp.Diff(got, tc.want); diff != "" {
-				t.Errorf("mismatch (-got +want):\n%s", diff)
-			}
-		})
-	}
-}
--- a/server/modelpath.go
+++ b/server/modelpath.go
@@ -73,6 +73,18 @@ func ParseModelPath(name string) ModelPath {

 var errModelPathInvalid = errors.New("invalid model path")

+func (mp ModelPath) Validate() error {
+	if mp.Repository == "" {
+		return fmt.Errorf("%w: model repository name is required", errModelPathInvalid)
+	}
+
+	if strings.Contains(mp.Tag, ":") {
+		return fmt.Errorf("%w: ':' (colon) is not allowed in tag names", errModelPathInvalid)
+	}
+
+	return nil
+}
+
 func (mp ModelPath) GetNamespaceRepository() string {
 	return fmt.Sprintf("%s/%s", mp.Namespace, mp.Repository)
 }
@@ -93,11 +105,7 @@ func (mp ModelPath) GetShortTagname() string {

 // GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist.
 func (mp ModelPath) GetManifestPath() (string, error) {
-	if p := filepath.Join(mp.Registry, mp.Namespace, mp.Repository, mp.Tag); filepath.IsLocal(p) {
-		return filepath.Join(envconfig.Models(), "manifests", p), nil
-	}
-
-	return "", errModelPathInvalid
+	return filepath.Join(envconfig.Models(), "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
 }

 func (mp ModelPath) BaseURL() *url.URL {
--- a/server/modelpath_test.go
+++ b/server/modelpath_test.go
@@ -1,7 +1,6 @@
 package server

 import (
-	"errors"
 	"os"
 	"path/filepath"
 	"testing"
@@ -155,10 +154,3 @@ func TestParseModelPath(t *testing.T) {
 		})
 	}
 }
-
-func TestInsecureModelpath(t *testing.T) {
-	mp := ParseModelPath("../../..:something")
-	if _, err := mp.GetManifestPath(); !errors.Is(err, errModelPathInvalid) {
-		t.Errorf("expected error: %v", err)
-	}
-}
--- a/server/routes.go
+++ b/server/routes.go
@@ -26,13 +26,11 @@ import (
 	"golang.org/x/sync/errgroup"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/build"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
-	"github.com/ollama/ollama/runners"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
@@ -119,32 +117,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}

-	// expire the runner
-	if req.Prompt == "" && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 {
-		model, err := GetModel(req.Model)
-		if err != nil {
-			switch {
-			case os.IsNotExist(err):
-				c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
-			case err.Error() == "invalid model name":
-				c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-			default:
-				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-			}
-			return
-		}
-		s.sched.expireRunner(model)
-
-		c.JSON(http.StatusOK, api.GenerateResponse{
-			Model:      req.Model,
-			CreatedAt:  time.Now().UTC(),
-			Response:   "",
-			Done:       true,
-			DoneReason: "unload",
-		})
-		return
-	}
-
 	if req.Format != "" && req.Format != "json" {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be empty or \"json\""})
 		return
@@ -491,7 +463,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, resp)
 }

-func (s *Server) PullHandler(c *gin.Context) {
+func (s *Server) PullModelHandler(c *gin.Context) {
 	var req api.PullRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -541,7 +513,7 @@ func (s *Server) PullHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }

-func (s *Server) PushHandler(c *gin.Context) {
+func (s *Server) PushModelHandler(c *gin.Context) {
 	var req api.PushRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -605,7 +577,7 @@ func checkNameExists(name model.Name) error {
 	return nil
 }

-func (s *Server) CreateHandler(c *gin.Context) {
+func (s *Server) CreateModelHandler(c *gin.Context) {
 	var r api.CreateRequest
 	if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -675,7 +647,7 @@ func (s *Server) CreateHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }

-func (s *Server) DeleteHandler(c *gin.Context) {
+func (s *Server) DeleteModelHandler(c *gin.Context) {
 	var r api.DeleteRequest
 	if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -708,7 +680,7 @@ func (s *Server) DeleteHandler(c *gin.Context) {
 	}
 }

-func (s *Server) ShowHandler(c *gin.Context) {
+func (s *Server) ShowModelHandler(c *gin.Context) {
 	var req api.ShowRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -857,7 +829,7 @@ func getKVData(digest string, verbose bool) (llm.KV, error) {
 	return kv, nil
 }

-func (s *Server) ListHandler(c *gin.Context) {
+func (s *Server) ListModelsHandler(c *gin.Context) {
 	ms, err := Manifests()
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -907,7 +879,7 @@ func (s *Server) ListHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, api.ListResponse{Models: models})
 }

-func (s *Server) CopyHandler(c *gin.Context) {
+func (s *Server) CopyModelHandler(c *gin.Context) {
 	var r api.CopyRequest
 	if err := c.ShouldBindJSON(&r); errors.Is(err, io.EOF) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -1109,33 +1081,33 @@ func (s *Server) GenerateRoutes() http.Handler {
 		allowedHostsMiddleware(s.addr),
 	)

-	r.POST("/api/pull", s.PullHandler)
+	r.POST("/api/pull", s.PullModelHandler)
 	r.POST("/api/generate", s.GenerateHandler)
 	r.POST("/api/chat", s.ChatHandler)
 	r.POST("/api/embed", s.EmbedHandler)
 	r.POST("/api/embeddings", s.EmbeddingsHandler)
-	r.POST("/api/create", s.CreateHandler)
-	r.POST("/api/push", s.PushHandler)
-	r.POST("/api/copy", s.CopyHandler)
-	r.DELETE("/api/delete", s.DeleteHandler)
-	r.POST("/api/show", s.ShowHandler)
+	r.POST("/api/create", s.CreateModelHandler)
+	r.POST("/api/push", s.PushModelHandler)
+	r.POST("/api/copy", s.CopyModelHandler)
+	r.DELETE("/api/delete", s.DeleteModelHandler)
+	r.POST("/api/show", s.ShowModelHandler)
 	r.POST("/api/blobs/:digest", s.CreateBlobHandler)
 	r.HEAD("/api/blobs/:digest", s.HeadBlobHandler)
-	r.GET("/api/ps", s.PsHandler)
+	r.GET("/api/ps", s.ProcessHandler)

 	// Compatibility endpoints
 	r.POST("/v1/chat/completions", openai.ChatMiddleware(), s.ChatHandler)
 	r.POST("/v1/completions", openai.CompletionsMiddleware(), s.GenerateHandler)
 	r.POST("/v1/embeddings", openai.EmbeddingsMiddleware(), s.EmbedHandler)
-	r.GET("/v1/models", openai.ListMiddleware(), s.ListHandler)
-	r.GET("/v1/models/:model", openai.RetrieveMiddleware(), s.ShowHandler)
+	r.GET("/v1/models", openai.ListMiddleware(), s.ListModelsHandler)
+	r.GET("/v1/models/:model", openai.RetrieveMiddleware(), s.ShowModelHandler)

 	for _, method := range []string{http.MethodGet, http.MethodHead} {
 		r.Handle(method, "/", func(c *gin.Context) {
 			c.String(http.StatusOK, "Ollama is running")
 		})

-		r.Handle(method, "/api/tags", s.ListHandler)
+		r.Handle(method, "/api/tags", s.ListModelsHandler)
 		r.Handle(method, "/api/version", func(c *gin.Context) {
 			c.JSON(http.StatusOK, gin.H{"version": version.Version})
 		})
@@ -1218,12 +1190,12 @@ func Serve(ln net.Listener) error {
 		srvr.Close()
 		schedDone()
 		sched.unloadAllRunners()
-		runners.Cleanup(build.EmbedFS)
+		gpu.Cleanup()
 		done()
 	}()

-	if _, err := runners.Refresh(build.EmbedFS); err != nil {
-		return fmt.Errorf("unable to initialize llm runners %w", err)
+	if err := llm.Init(); err != nil {
+		return fmt.Errorf("unable to initialize llm library %w", err)
 	}

 	s.sched.Run(schedCtx)
@@ -1297,7 +1269,7 @@ func streamResponse(c *gin.Context, ch chan any) {
 	})
 }

-func (s *Server) PsHandler(c *gin.Context) {
+func (s *Server) ProcessHandler(c *gin.Context) {
 	models := []api.ProcessModelResponse{}

 	for _, v := range s.sched.loaded {
@@ -1350,32 +1322,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}

-	// expire the runner
-	if len(req.Messages) == 0 && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 {
-		model, err := GetModel(req.Model)
-		if err != nil {
-			switch {
-			case os.IsNotExist(err):
-				c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
-			case err.Error() == "invalid model name":
-				c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-			default:
-				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-			}
-			return
-		}
-		s.sched.expireRunner(model)
-
-		c.JSON(http.StatusOK, api.ChatResponse{
-			Model:      req.Model,
-			CreatedAt:  time.Now().UTC(),
-			Message:    api.Message{Role: "assistant"},
-			Done:       true,
-			DoneReason: "unload",
-		})
-		return
-	}
-
 	caps := []Capability{CapabilityCompletion}
 	if len(req.Tools) > 0 {
 		caps = append(caps, CapabilityTools)
--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@@ -93,7 +93,7 @@ func TestCreateFromBin(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)

 	var s Server
-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -120,7 +120,7 @@ func TestCreateFromModel(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -134,7 +134,7 @@ func TestCreateFromModel(t *testing.T) {
 		filepath.Join(p, "manifests", "registry.ollama.ai", "library", "test", "latest"),
 	})

-	w = createRequest(t, s.CreateHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: "FROM test",
 		Stream:    &stream,
@@ -162,7 +162,7 @@ func TestCreateRemovesLayers(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -182,7 +182,7 @@ func TestCreateRemovesLayers(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-bc80b03733773e0728011b2f4adf34c458b400e1aad48cb28d61170f3a2ad2d6"),
 	})

-	w = createRequest(t, s.CreateHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -210,7 +210,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM Say hi!", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -230,7 +230,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-f29e82a8284dbdf5910b1555580ff60b04238b8da9d5e51159ada67a4d0d5851"),
 	})

-	w = createRequest(t, s.CreateHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nSYSTEM \"\"", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -267,7 +267,7 @@ func TestCreateMergeParameters(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nPARAMETER temperature 1\nPARAMETER top_k 10\nPARAMETER stop USER:\nPARAMETER stop ASSISTANT:", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -288,7 +288,7 @@ func TestCreateMergeParameters(t *testing.T) {
 	})

 	// in order to merge parameters, the second model must be created FROM the first
-	w = createRequest(t, s.CreateHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: "FROM test\nPARAMETER temperature 0.6\nPARAMETER top_p 0.7",
 		Stream:    &stream,
@@ -326,7 +326,7 @@ func TestCreateMergeParameters(t *testing.T) {
 	}

 	// slices are replaced
-	w = createRequest(t, s.CreateHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: "FROM test\nPARAMETER temperature 0.6\nPARAMETER top_p 0.7\nPARAMETER stop <|endoftext|>",
 		Stream:    &stream,
@@ -371,7 +371,7 @@ func TestCreateReplacesMessages(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nMESSAGE assistant \"What is my purpose?\"\nMESSAGE user \"You run tests.\"\nMESSAGE assistant \"Oh, my god.\"", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -391,7 +391,7 @@ func TestCreateReplacesMessages(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-e0e27d47045063ccb167ae852c51d49a98eab33fabaee4633fdddf97213e40b5"),
 	})

-	w = createRequest(t, s.CreateHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: "FROM test\nMESSAGE assistant \"You're a test, Harry.\"\nMESSAGE user \"I-I'm a what?\"\nMESSAGE assistant \"A test. And a thumping good one at that, I'd wager.\"",
 		Stream:    &stream,
@@ -448,7 +448,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt }}\nSYSTEM Say hello!\nTEMPLATE {{ .System }} {{ .Prompt }}\nSYSTEM Say bye!", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -488,7 +488,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	}

 	t.Run("incomplete template", func(t *testing.T) {
-		w := createRequest(t, s.CreateHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 			Name:      "test",
 			Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt", createBinFile(t, nil, nil)),
 			Stream:    &stream,
@@ -500,7 +500,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	})

 	t.Run("template with unclosed if", func(t *testing.T) {
-		w := createRequest(t, s.CreateHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 			Name:      "test",
 			Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ if .Prompt }}", createBinFile(t, nil, nil)),
 			Stream:    &stream,
@@ -512,7 +512,7 @@ func TestCreateTemplateSystem(t *testing.T) {
 	})

 	t.Run("template with undefined function", func(t *testing.T) {
-		w := createRequest(t, s.CreateHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 			Name:      "test",
 			Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{  Prompt }}", createBinFile(t, nil, nil)),
 			Stream:    &stream,
@@ -531,7 +531,7 @@ func TestCreateLicenses(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server

-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s\nLICENSE MIT\nLICENSE Apache-2.0", createBinFile(t, nil, nil)),
 		Stream:    &stream,
@@ -579,7 +579,7 @@ func TestCreateDetectTemplate(t *testing.T) {
 	var s Server

 	t.Run("matched", func(t *testing.T) {
-		w := createRequest(t, s.CreateHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 			Name: "test",
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
 				"tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
@@ -593,14 +593,14 @@ func TestCreateDetectTemplate(t *testing.T) {

 		checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
 			filepath.Join(p, "blobs", "sha256-0d79f567714c62c048378f2107fb332dabee0135d080c302d884317da9433cc5"),
-			filepath.Join(p, "blobs", "sha256-35360843d0c84fb1506952a131bbef13cd2bb4a541251f22535170c05b56e672"),
 			filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"),
-			filepath.Join(p, "blobs", "sha256-de3959f841e9ef6b4b6255fa41cb9e0a45da89c3066aa72bdd07a4747f848990"),
+			filepath.Join(p, "blobs", "sha256-c608dc615584cd20d9d830363dabf8a4783ae5d34245c3d8c115edb3bc7b28e4"),
+			filepath.Join(p, "blobs", "sha256-ea34c57ba5b78b740aafe2aeb74dc6507fc3ad14170b64c26a04fb9e36c88d75"),
 		})
 	})

 	t.Run("unmatched", func(t *testing.T) {
-		w := createRequest(t, s.CreateHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 			Name:      "test",
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 			Stream:    &stream,
--- a/server/routes_delete_test.go
+++ b/server/routes_delete_test.go
@@ -22,7 +22,7 @@ func TestDelete(t *testing.T) {

 	var s Server

-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test",
 		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 	})
@@ -31,7 +31,7 @@ func TestDelete(t *testing.T) {
 		t.Fatalf("expected status code 200, actual %d", w.Code)
 	}

-	w = createRequest(t, s.CreateHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Name:      "test2",
 		Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .System }} {{ .Prompt }}", createBinFile(t, nil, nil)),
 	})
@@ -52,7 +52,7 @@ func TestDelete(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-fe7ac77b725cda2ccad03f88a880ecdfd7a33192d6cae08fce2c0ee1455991ed"),
 	})

-	w = createRequest(t, s.DeleteHandler, api.DeleteRequest{Name: "test"})
+	w = createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test"})

 	if w.Code != http.StatusOK {
 		t.Fatalf("expected status code 200, actual %d", w.Code)
@@ -68,7 +68,7 @@ func TestDelete(t *testing.T) {
 		filepath.Join(p, "blobs", "sha256-fe7ac77b725cda2ccad03f88a880ecdfd7a33192d6cae08fce2c0ee1455991ed"),
 	})

-	w = createRequest(t, s.DeleteHandler, api.DeleteRequest{Name: "test2"})
+	w = createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test2"})

 	if w.Code != http.StatusOK {
 		t.Fatalf("expected status code 200, actual %d", w.Code)
@@ -102,7 +102,7 @@ func TestDeleteDuplicateLayers(t *testing.T) {
 		t.Fatal(err)
 	}

-	w := createRequest(t, s.DeleteHandler, api.DeleteRequest{Name: "test"})
+	w := createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test"})
 	if w.Code != http.StatusOK {
 		t.Errorf("expected status code 200, actual %d", w.Code)
 	}
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -84,7 +84,7 @@ func TestGenerateChat(t *testing.T) {

 	go s.sched.Run(context.TODO())

-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Model: "test",
 		Modelfile: fmt.Sprintf(`FROM %s
 		TEMPLATE """
@@ -144,7 +144,7 @@ func TestGenerateChat(t *testing.T) {
 	})

 	t.Run("missing capabilities chat", func(t *testing.T) {
-		w := createRequest(t, s.CreateHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 			Model: "bert",
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
 				"general.architecture": "bert",
@@ -270,7 +270,7 @@ func TestGenerateChat(t *testing.T) {
 		checkChatResponse(t, w.Body, "test", "Hi!")
 	})

-	w = createRequest(t, s.CreateHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Model:     "test-system",
 		Modelfile: "FROM test\nSYSTEM You are a helpful assistant.",
 	})
@@ -382,7 +382,7 @@ func TestGenerate(t *testing.T) {

 	go s.sched.Run(context.TODO())

-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
+	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Model: "test",
 		Modelfile: fmt.Sprintf(`FROM %s
 		TEMPLATE """
@@ -442,7 +442,7 @@ func TestGenerate(t *testing.T) {
 	})

 	t.Run("missing capabilities generate", func(t *testing.T) {
-		w := createRequest(t, s.CreateHandler, api.CreateRequest{
+		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
 			Model: "bert",
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
 				"general.architecture": "bert",
@@ -583,7 +583,7 @@ func TestGenerate(t *testing.T) {
 		checkGenerateResponse(t, w.Body, "test", "Hi!")
 	})

-	w = createRequest(t, s.CreateHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Model:     "test-system",
 		Modelfile: "FROM test\nSYSTEM You are a helpful assistant.",
 	})
@@ -652,7 +652,7 @@ func TestGenerate(t *testing.T) {
 		checkGenerateResponse(t, w.Body, "test-system", "Abra kadabra!")
 	})

-	w = createRequest(t, s.CreateHandler, api.CreateRequest{
+	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
 		Model: "test-suffix",
 		Modelfile: `FROM test
 TEMPLATE """{{- if .Suffix }}<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
--- a/server/routes_list_test.go
+++ b/server/routes_list_test.go
@@ -31,13 +31,13 @@ func TestList(t *testing.T) {

 	var s Server
 	for _, n := range expectNames {
-		createRequest(t, s.CreateHandler, api.CreateRequest{
+		createRequest(t, s.CreateModelHandler, api.CreateRequest{
 			Name:      n,
 			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		})
 	}

-	w := createRequest(t, s.ListHandler, nil)
+	w := createRequest(t, s.ListModelsHandler, nil)
 	if w.Code != http.StatusOK {
 		t.Fatalf("expected status code 200, actual %d", w.Code)
 	}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Patrick Devine	cb576a6b23	fix ref	2024-08-26 19:59:33 -07:00
Patrick Devine	15b7ff3a89	more comments	2024-08-26 19:56:45 -07:00
Patrick Devine	3ad243466b	comments	2024-08-26 19:54:06 -07:00
Patrick Devine	a13e583c49	cleanup whitespace	2024-08-26 18:09:21 -07:00
Patrick Devine	3c1994d0ee	small change	2024-08-26 18:07:59 -07:00
Patrick Devine	1b2da3829d	update the import docs	2024-08-26 18:04:46 -07:00
				`@@ -1 +0,0 @@`
				`This is here to make sure the build/ directory exists for the go:embed command`