Compare commits

..

14 Commits

Author SHA1 Message Date
Bruce MacDonald
c2b11611a8 Update new_runner_benchmark_test.go 2025-01-29 10:34:09 -08:00
Bruce MacDonald
90698c7d15 benchmark: new Go runner 2025-01-28 14:55:03 -08:00
Jesse Gross
4b4a5a28bf new runner 2025-01-27 13:47:13 -08:00
Jesse Gross
3c95c21ddf tensor loading iface 2025-01-23 16:04:13 -08:00
Michael Yang
8ab13e4d3e next 2025-01-22 22:10:31 -08:00
Michael Yang
144f63e2fb next build 2025-01-22 22:08:29 -08:00
frob
294b6f5a22 docs: remove tfs_z option from documentation (#8515) 2025-01-21 09:28:59 -08:00
EndoTheDev
7bb356c680 docs: update suspend header in gpu.md (#8487) 2025-01-19 18:45:35 -08:00
Jannik Maierhöfer
021817e59a readme: add link to Langfuse (#8455) 2025-01-16 22:41:12 -08:00
Patrick Devine
a420a453b4 fix default modelfile for create (#8452) 2025-01-16 01:14:04 -08:00
Jeffrey Morgan
42cf4db601 parser: fix parsing Modelfiles with multiple FROM commands (#8449) 2025-01-16 00:14:04 -08:00
Josh
93a8daf285 convert: import support for command-r models from safetensors (#6063)
---------

Co-authored-by: Patrick Devine <patrick@infrahq.com>
2025-01-15 16:31:22 -08:00
Gloryjaw
a041b4df7c docs: fix path to examples (#8438) 2025-01-15 11:49:12 -08:00
Patrick Devine
2539f2dbf9 Fix absolute path names + gguf detection (#8428) 2025-01-14 19:01:24 -08:00
98 changed files with 3738 additions and 4160 deletions

View File

@@ -478,243 +478,77 @@ jobs:
dist/OllamaSetup.exe
dist/ollama-windows-*.zip
# Linux x86 assets built using the container based build
build-linux-amd64:
build-linux:
environment: release
runs-on: linux
env:
PLATFORM: linux/amd64
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- run: |
./scripts/build_linux.sh
- uses: actions/upload-artifact@v4
with:
name: dist-linux-amd64
path: |
dist/*linux*
!dist/*-cov
# Linux ARM assets built using the container based build
# (at present, docker isn't pre-installed on arm ubunutu images)
build-linux-arm64:
environment: release
runs-on: linux-arm64
env:
PLATFORM: linux/arm64
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- name: 'Install Docker'
run: |
# Add Docker's official GPG key:
env
uname -a
sudo apt-get update
sudo apt-get install -y ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
# Add the repository to Apt sources:
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
sudo usermod -aG docker $USER
sudo apt-get install acl
sudo setfacl --modify user:$USER:rw /var/run/docker.sock
- run: |
./scripts/build_linux.sh
- uses: actions/upload-artifact@v4
with:
name: dist-linux-arm64
path: |
dist/*linux*
!dist/*-cov
# Container image build
build-container-image:
environment: release
strategy:
matrix:
runner:
- linux
- linux-arm64
runs-on: ${{ matrix.runner }}
env:
FINAL_IMAGE_REPO: ollama/ollama
include:
- os: linux
arch: amd64
targets: [archive, rocm]
- os: linux
arch: arm64
targets: [archive]
steps:
- uses: actions/checkout@v4
- uses: docker/setup-qemu-action@v3
- uses: docker/setup-buildx-action@v3
- run: |
apt-get update && apt-get install pigz
for TARGET in ${{ matrix.targets }}; do docker buildx build --platform $PLATFORM --target $TARGET --output type=local,dest=dist/$PLATFORM .; done
tar c -C dist/$PLATFORM . | pigz -9cv >dist/ollama-${PLATFORM//\//-}.tar.gz
env:
PLATFORM: ${{ matrix.os }}/${{ matrix.arch }}
- uses: actions/upload-artifact@v4
with:
submodules: recursive
- name: 'Install Docker'
if: ${{ startsWith(matrix.runner, 'linux-arm64') }}
run: |
sudo apt-get update
sudo apt-get install -y ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
sudo usermod -aG docker $USER
sudo apt-get install acl
sudo setfacl --modify user:$USER:rw /var/run/docker.sock
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
machine=$(uname -m)
case ${machine} in
x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
esac >>$GITHUB_ENV
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Build and push by digest
id: build
uses: docker/build-push-action@v6
with:
context: "."
platforms: linux/${{ env.ARCH }}
build-args: |
GOFLAGS
outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true
- name: Export digest
run: |
mkdir -p /tmp/digests
digest="${{ steps.build.outputs.digest }}"
touch "/tmp/digests/${digest#sha256:}"
- name: Upload digest
uses: actions/upload-artifact@v4
with:
name: digests-${{ env.PLATFORM_PAIR }}
path: /tmp/digests/*
if-no-files-found: error
retention-days: 1
merge:
name: dist-${{ matrix.os }}-${{ matrix.arch }}
path: |
dist/ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.gz
build-docker:
environment: release
runs-on: linux
needs:
- build-container-image
env:
FINAL_IMAGE_REPO: ollama/ollama
strategy:
matrix:
include:
- flavor: |
latest=auto
platforms: linux/amd64,linux/arm64
build-args: [GOFLAGS]
- flavor: |
suffix=-rocm,onlatest=false
platforms: linux/amd64
build-args: [GOFLAGS, FLAVOR=rocm]
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Download digests
uses: actions/download-artifact@v4
with:
path: /tmp/digests
pattern: digests-*
merge-multiple: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
machine=$(uname -m)
case ${machine} in
x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
esac >>$GITHUB_ENV
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Login to Docker Hub
uses: docker/login-action@v3
- uses: docker/setup-qemu-action@v2
- uses: docker/setup-buildx-action@v2
- uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Create manifest list and push
working-directory: /tmp/digests
run: |
docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
$(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *)
- name: Inspect image
run: |
docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }}
build-container-image-rocm:
environment: release
runs-on: linux
env:
FINAL_IMAGE_REPO: ollama/ollama
ARCH: amd64
PLATFORM_PAIR: linux-amd64
steps:
- uses: actions/checkout@v4
- id: metadata
uses: docker/metadata-action@v4
with:
submodules: recursive
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
flavor: ${{ matrix.flavor }}
images: |
ollama/ollama
tags: |
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
- uses: docker/build-push-action@v6
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Build and push by digest
id: build
uses: docker/build-push-action@v6
with:
context: "."
target: runtime-rocm
build-args: |
GOFLAGS
tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm
context: .
push: true
platforms: ${{ matrix.platforms }}
build-args: ${{ matrix.build-args }}
tags: ${{ steps.metadata.outputs.tags }}
labels: ${{ steps.metadata.outputs.labels }}
cache-from: type=registry,ref=ollama/ollama:latest
cache-to: type=inline
provenance: false
env:
GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ steps.metadata.outputs.version }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
# Aggregate all the assets and ship a release
release:

View File

@@ -55,9 +55,14 @@ jobs:
- uses: actions/checkout@v4
- run: |
apt-get update
apt-get install -y cmake pkg-config ${{ matrix.extra-packages }}
apt-get install -y cmake pkg-config ccache ${{ matrix.extra-packages }}
ccache -o cache_dir=${{ github.workspace }}\.ccache
env:
DEBIAN_FRONTEND: noninteractive
- uses: actions/cache@v4
with:
path: ${{ github.workspace }}\.ccache
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
- run: |
cmake --preset ${{ matrix.preset }}
cmake --build --preset ${{ matrix.preset }} --parallel

View File

@@ -19,11 +19,14 @@ set(GGML_CCACHE ON)
set(GGML_BACKEND_DL ON)
set(GGML_BACKEND_SHARED ON)
set(GGML_SCHED_MAX_COPIES 4)
set(GGML_LLAMAFILE ON)
set(GGML_CPU_ALL_VARIANTS ON)
set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
set(GGML_LLAMAFILE ON)
set(GGML_CUDA_GRAPHS ON)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include)
@@ -36,10 +39,16 @@ set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24" AND NOT CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES "native")
endif()
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
endif()
check_language(HIP)
if(CMAKE_HIP_COMPILER)
set(HIP_PLATFORM "amd")
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
endif()

View File

@@ -8,6 +8,10 @@
"CMAKE_BUILD_TYPE": "Release"
}
},
{
"name": "CPU",
"inherits": [ "Default" ]
},
{
"name": "CUDA",
"inherits": [ "Default" ]
@@ -42,20 +46,29 @@
},
{
"name": "ROCm",
"inherits": [ "Default" ]
"inherits": [ "Default" ],
"cacheVariables": {
"CMAKE_HIP_PLATFORM": "amd"
}
},
{
"name": "ROCm 6",
"inherits": [ "ROCm" ],
"cacheVariables": {
"CMAKE_HIP_ARCHITECTURES": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
"AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
}
}
],
"buildPresets": [
{
"name": "Default",
"configurePreset": "Default"
"configurePreset": "Default",
"configuration": "Release"
},
{
"name": "CPU",
"configurePreset": "Default",
"targets": [ "ggml-cpu" ]
},
{
"name": "CUDA",

View File

@@ -1,201 +1,161 @@
ARG GOLANG_VERSION=1.22.8
ARG CUDA_VERSION_11=11.3.1
ARG CUDA_VERSION_12=12.4.0
ARG ROCM_VERSION=6.1.2
ARG JETPACK_6=r36.2.0
ARG JETPACK_5=r35.4.1
# vim: filetype=dockerfile
### To create a local image for building linux binaries on mac or windows with efficient incremental builds
#
# docker build --platform linux/amd64 -t builder-amd64 -f Dockerfile --target unified-builder-amd64 .
# docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
#
### Then incremental builds will be much faster in this container
#
# make -j 10 dist
#
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
ARG GOLANG_VERSION
ARG CUDA_VERSION_11
ARG CUDA_VERSION_12
COPY ./scripts/rh_linux_deps.sh /
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
dnf clean all && \
dnf install -y \
zsh \
cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
# TODO intel oneapi goes here...
ENV GOARCH amd64
ENV CGO_ENABLED 1
WORKDIR /go/src/github.com/ollama/ollama/
ENTRYPOINT [ "zsh" ]
ARG FLAVOR=${TARGETARCH}
### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds
# Note: this does not contain jetson variants
#
# docker build --platform linux/arm64 -t builder-arm64 -f Dockerfile --target unified-builder-arm64 .
# docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
#
FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
ARG GOLANG_VERSION
ARG CUDA_VERSION_11
ARG CUDA_VERSION_12
COPY ./scripts/rh_linux_deps.sh /
RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
dnf config-manager --set-enabled appstream && \
dnf clean all && \
dnf install -y \
zsh \
cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
ENV GOARCH arm64
ENV CGO_ENABLED 1
WORKDIR /go/src/github.com/ollama/ollama/
ENTRYPOINT [ "zsh" ]
ARG ROCMVERSION=6.1.2
ARG JETPACK5VERSION=r35.4.1
ARG JETPACK6VERSION=r36.2.0
ARG CMAKEVERSION=3.31.2
FROM --platform=linux/amd64 unified-builder-amd64 AS build-amd64
COPY . .
ARG OLLAMA_SKIP_CUDA_GENERATE
ARG OLLAMA_SKIP_ROCM_GENERATE
ARG OLLAMA_FAST_BUILD
ARG VERSION
ARG CUSTOM_CPU_FLAGS
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCMVERSION}-complete AS base-amd64
RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \
&& yum install -y yum-utils devtoolset-10-gcc devtoolset-10-gcc-c++ \
&& yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo \
&& curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /usr/local/bin --strip-components 1
ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:/opt/rh/devtoolset-11/root/usr/bin:$PATH
FROM --platform=linux/arm64 rockylinux:8 AS base-arm64
# install epel-release for ccache
RUN yum install -y yum-utils epel-release \
&& yum install -y clang ccache \
&& yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
ENV CC=clang CXX=clang++
FROM base-${TARGETARCH} AS base
ARG CMAKEVERSION
RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
ENV LDFLAGS=-s
FROM base AS cpu
# amd64 uses gcc which requires devtoolset-11 for AVX extensions while arm64 uses clang
RUN if [ "$(uname -m)" = "x86_64" ]; then yum install -y devtoolset-11-gcc devtoolset-11-gcc-c++; fi
ENV PATH=/opt/rh/devtoolset-11/root/usr/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
make -j $(nproc) dist ; \
else \
make -j 5 dist ; \
fi
RUN cd dist/linux-$GOARCH && \
tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
cd dist/linux-$GOARCH-rocm && \
tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
fi
cmake --preset 'CPU' && cmake --build --parallel --preset 'CPU'
# Jetsons need to be built in discrete stages
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64
ARG GOLANG_VERSION
RUN apt-get update && apt-get install -y git curl ccache && \
curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
ln -s /usr/local/go/bin/go /usr/local/bin/go && \
ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
apt-get clean && rm -rf /var/lib/apt/lists/*
WORKDIR /go/src/github.com/ollama/ollama/
COPY . .
ARG CGO_CFLAGS
ENV GOARCH arm64
ARG VERSION
FROM base AS cuda-11
ARG CUDA11VERSION=11.3
RUN yum install -y cuda-toolkit-${CUDA11VERSION//./-}
ENV PATH=/usr/local/cuda-11/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
make -j 5 dist_cuda_v11 \
CUDA_ARCHITECTURES="72;87" \
GPU_RUNNER_VARIANT=_jetpack5 \
DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama \
DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama/cuda_jetpack5
cmake --preset 'CUDA 11' && cmake --build --parallel --preset 'CUDA 11'
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS runners-jetpack6-arm64
ARG GOLANG_VERSION
RUN apt-get update && apt-get install -y git curl ccache && \
curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
ln -s /usr/local/go/bin/go /usr/local/bin/go && \
ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
apt-get clean && rm -rf /var/lib/apt/lists/*
WORKDIR /go/src/github.com/ollama/ollama/
COPY . .
ARG CGO_CFLAGS
ENV GOARCH arm64
ARG VERSION
FROM base AS cuda-12
ARG CUDA12VERSION=12.4
RUN yum install -y cuda-toolkit-${CUDA12VERSION//./-}
ENV PATH=/usr/local/cuda-12/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
make -j 5 dist_cuda_v12 \
CUDA_ARCHITECTURES="87" \
GPU_RUNNER_VARIANT=_jetpack6 \
DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama \
DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama/cuda_jetpack6
cmake --preset 'CUDA 12' && cmake --build --parallel --preset 'CUDA 12'
FROM --platform=linux/arm64 unified-builder-arm64 AS build-arm64
COPY . .
ARG OLLAMA_SKIP_CUDA_GENERATE
ARG OLLAMA_FAST_BUILD
ARG VERSION
FROM base AS rocm-6
RUN --mount=type=cache,target=/root/.ccache \
make -j 5 dist
COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
RUN cd dist/linux-$GOARCH && \
tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
RUN cd dist/linux-$GOARCH-jetpack5 && \
tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz
RUN cd dist/linux-$GOARCH-jetpack6 && \
tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz
cmake --preset 'ROCm 6' && cmake --build --parallel --preset 'ROCm 6'
FROM --platform=linux/amd64 scratch AS dist-amd64
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
FROM --platform=linux/arm64 scratch AS dist-arm64
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
FROM dist-$TARGETARCH AS dist
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK5VERSION} AS jetpack-5
ARG CMAKEVERSION
RUN apt-get update && apt-get install -y curl ccache \
&& curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'JetPack 5' && cmake --build --parallel --preset 'JetPack 5'
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK6VERSION} AS jetpack-6
ARG CMAKEVERSION
RUN apt-get update && apt-get install -y curl ccache \
&& curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'JetPack 6' && cmake --build --parallel --preset 'JetPack 6'
# For amd64 container images, filter out cuda/rocm to minimize size
FROM build-amd64 AS runners-cuda-amd64
RUN rm -rf \
./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
./dist/linux-amd64/lib/ollama/runners/rocm*
FROM base AS build
ARG GOVERSION=1.23.4
RUN curl -fsSL https://golang.org/dl/go${GOVERSION}.linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
ENV PATH=/usr/local/go/bin:$PATH
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
ARG GOFLAGS="'-ldflags=-w -s'"
ENV CGO_ENABLED=1
RUN --mount=type=cache,target=/root/.cache/go-build \
go build -trimpath -buildmode=pie -o /bin/ollama .
FROM build-amd64 AS runners-rocm-amd64
RUN rm -rf \
./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
./dist/linux-amd64/lib/ollama/libcu*.so* \
./dist/linux-amd64/lib/ollama/runners/cuda*
FROM --platform=linux/amd64 scratch AS amd64
COPY --from=cuda-11 --chmod=644 \
build/lib/libggml-cuda.so \
/usr/local/cuda/lib64/libcublas.so.11 \
/usr/local/cuda/lib64/libcublasLt.so.11 \
/usr/local/cuda/lib64/libcudart.so.11.0 \
/lib/ollama/cuda_v11/
COPY --from=cuda-12 --chmod=644 \
build/lib/libggml-cuda.so \
/usr/local/cuda/lib64/libcublas.so.12 \
/usr/local/cuda/lib64/libcublasLt.so.12 \
/usr/local/cuda/lib64/libcudart.so.12 \
/lib/ollama/cuda_v12/
FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
RUN apt-get update && \
apt-get install -y ca-certificates && \
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
FROM --platform=linux/arm64 scratch AS arm64
COPY --from=cuda-11 --chmod=644 \
build/lib/libggml-cuda.so \
/usr/local/cuda/lib64/libcublas.so.11 \
/usr/local/cuda/lib64/libcublasLt.so.11 \
/usr/local/cuda/lib64/libcudart.so.11.0 \
/lib/ollama/cuda_v11/
COPY --from=cuda-12 --chmod=644 \
build/lib/libggml-cuda.so \
/usr/local/cuda/lib64/libcublas.so.12 \
/usr/local/cuda/lib64/libcublasLt.so.12 \
/usr/local/cuda/lib64/libcudart.so.12 \
/lib/ollama/cuda_v12/
COPY --from=jetpack-5 --chmod=644 \
build/lib/libggml-cuda.so \
/usr/local/cuda/lib64/libcublas.so.11 \
/usr/local/cuda/lib64/libcublasLt.so.11 \
/usr/local/cuda/lib64/libcudart.so.11.0 \
/lib/ollama/cuda_jetpack5/
COPY --from=jetpack-6 --chmod=644 \
build/lib/libggml-cuda.so \
/usr/local/cuda/lib64/libcublas.so.12 \
/usr/local/cuda/lib64/libcublasLt.so.12 \
/usr/local/cuda/lib64/libcudart.so.12 \
/lib/ollama/cuda_jetpack6/
FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
RUN apt-get update && \
apt-get install -y ca-certificates && \
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/
COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/
FROM --platform=linux/arm64 scratch AS rocm
COPY --from=rocm-6 --chmod=644 \
build/lib/libggml-hip.so \
/opt/rocm/lib/libamdhip64.so.6 \
/opt/rocm/lib/libhipblas.so.2 \
/opt/rocm/lib/librocblas.so.4 \
/opt/rocm/lib/libamd_comgr.so.2 \
/opt/rocm/lib/libhsa-runtime64.so.1 \
/opt/rocm/lib/librocprofiler-register.so.0 \
/opt/amdgpu/lib64/libdrm_amdgpu.so.1 \
/opt/amdgpu/lib64/libdrm.so.2 \
/usr/lib64/libnuma.so.1 \
/lib/ollama/rocm/
COPY --from=rocm-6 /opt/rocm/lib/rocblas/ /lib/ollama/rocm/rocblas/
FROM ${FLAVOR} AS archive
COPY --from=cpu --chmod=644 \
build/lib/libggml-base.so \
build/lib/libggml-cpu-*.so \
/lib/ollama/
COPY --from=build /bin/ollama /bin/ollama
# ROCm libraries larger so we keep it distinct from the CPU/CUDA image
FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
# Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
# across releases
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
RUN apt-get update && \
apt-get install -y ca-certificates && \
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
FROM runtime-$TARGETARCH
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
FROM ubuntu:20.04
RUN apt-get update \
&& apt-get install -y ca-certificates \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
COPY --from=archive /bin/ /usr/bin/
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
COPY --from=archive /lib/ollama/ /usr/lib/ollama/
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib/ollama
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_VISIBLE_DEVICES=all
ENV OLLAMA_HOST=0.0.0.0:11434
EXPOSE 11434
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

View File

@@ -1,66 +0,0 @@
ARG CUDA_11_VERSION=11.3
ARG CUDA_12_VERSION=12.4
ARG ROCM_VERSION=6.1.2
ARG JETPACK_5_VERSION=r35.4.1
ARG JETPACK_6_VERSION=r36.2.0
ARG CMAKE_VERSION=3.31.2
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS base
ARG CMAKE_VERSION
RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz | tar xz -C /usr --strip-components 1
RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \
&& yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
# FROM --platform=linux/arm64 rockylinux:8 AS base
# ARG CMAKE_VERSION
# RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
# RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
FROM base AS amd64
ARG CUDA_11_VERSION
ARG CUDA_12_VERSION
RUN yum install -y cuda-toolkit-${CUDA_11_VERSION//./-} \
&& yum install -y cuda-toolkit-${CUDA_12_VERSION//./-}
COPY CMakeLists.txt CMakeLists.txt
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
FROM --platform=linux/amd64 amd64 AS cuda_11
ENV PATH=/usr/local/cuda-${CUDA_11_VERSION}/bin:$PATH
RUN cmake -S . -B build -DCMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
RUN cmake --build build --target ggml-cuda -j
FROM --platform=linux/amd64 amd64 AS cuda_12
ENV PATH=/usr/local/cuda-${CUDA_12_VERSION}/bin:$PATH
RUN cmake -S . -B build -DCMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
RUN cmake --build build --target ggml-cuda -j
FROM --platform=linux/amd64 amd64 AS rocm
RUN cmake -S . -B build -DCMAKE_HIP_ARCHITECTURES="gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
RUN cmake --build build --target ggml-hip -j
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5_VERSION} AS jetpack_5
ARG CMAKE_VERSION
RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
COPY CMakeLists.txt .
COPY ml/backend/ggml/ggml .
RUN cmake -S . -B build \
-DCMAKE_CUDA_ARCHITECTURES="72;87"
RUN cmake --build build --target ggml-cuda
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6_VERSION} AS jetpack_6
ARG CMAKE_VERSION
RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
COPY CMakeLists.txt .
COPY ml/backend/ggml/ggml .
RUN cmake -S . -B build \
-DCMAKE_CUDA_ARCHITECTURES="87"
RUN cmake --build build --target ggml-cuda
FROM --platform=linux/amd64 golang:1.23
COPY --from=cuda_11 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-11.so
COPY --from=cuda_12 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-12.so
COPY --from=rocm build/ml/backend/ggml/ggml/src/ggml-hip/libggml-hip.so libggml-hip.so
# FROM --platform=linux/arm64 golang:1.23
# COPY --from=jetpack_5 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-jetpack-5.so
# COPY --from=jetpack_6 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-jetpack-6.so

View File

@@ -539,4 +539,5 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Observability
- [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
- [Langfuse](https://langfuse.com/docs/integrations/ollama) is an open source LLM observability platform that enables teams to collaboratively monitor, evaluate and debug AI applications.

25
benchmark/README.md Normal file
View File

@@ -0,0 +1,25 @@
# Benchmark
Performance benchmarking for Ollama.
## Prerequisites
- Ollama server running locally (`127.0.0.1:11434`)
- Desired models pre-downloaded (e.g., `llama3.2:1b`)
## Run Benchmark
```bash
# Run all tests
go test -bench=. -timeout 30m ./...
```
## New Runner Benchmark
```bash
go test -bench=Runner
```
or to test multiple models:
```bash
# run this from within the benchmark directory
# requires: llama3.2:1b, llama3.1:8b, llama3.3:70b
sh new_runner.sh
```

72
benchmark/new_runner.sh Normal file
View File

@@ -0,0 +1,72 @@
#!/bin/bash
kill_process_tree() {
local pid=$1
# Get all child processes using pgrep
local children=$(pgrep -P $pid)
# Kill children first
for child in $children; do
kill_process_tree $child
done
# Kill the parent process
kill -9 $pid 2>/dev/null || true
}
# Function to run the runner and benchmark for a given model
run_benchmark() {
local model=$1
echo "Starting runner with model: $model"
# Start the runner in background and save its PID
go run ../cmd/runner/main.go --new-runner -model "$model" &
runner_pid=$!
# Wait for the runner to initialize (adjust sleep time as needed)
sleep 5
echo "Running benchmark..."
# Run test and wait for it to complete
go test -bench=Runner
test_exit_code=$?
echo "Stopping runner process..."
# Kill the runner process and all its children
kill_process_tree $runner_pid
# Wait for the process to fully terminate
wait $runner_pid 2>/dev/null || true
# Make sure no processes are still listening on port 8080
lsof -t -i:8080 | xargs kill -9 2>/dev/null || true
# Additional sleep to ensure port is freed
sleep 2
# Check if test failed
if [ $test_exit_code -ne 0 ]; then
echo "Warning: Benchmark test failed with exit code $test_exit_code"
fi
echo "Benchmark complete for model: $model"
echo "----------------------------------------"
}
HOME_DIR="$HOME"
# llama3.2:1b: ~/.ollama/models/blobs/sha256-74701a8c35f6c8d9a4b91f3f3497643001d63e0c7a84e085bed452548fa88d45
# llama3.1:8b: ~/.ollama/models/blobs/sha256-667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29
# llama3.3:70b: ~/.ollama/models/blobs/sha256-4824460d29f2058aaf6e1118a63a7a197a09bed509f0e7d4e2efb1ee273b447d
models=(
"${HOME_DIR}/.ollama/models/blobs/sha256-74701a8c35f6c8d9a4b91f3f3497643001d63e0c7a84e085bed452548fa88d45"
"${HOME_DIR}/.ollama/models/blobs/sha256-667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"
# "${HOME_DIR}/.ollama/models/blobs/sha256-4824460d29f2058aaf6e1118a63a7a197a09bed509f0e7d4e2efb1ee273b447d"
)
# Run benchmarks for each model
for model in "${models[@]}"; do
run_benchmark "$model"
done
echo "All benchmarks completed!"

View File

@@ -0,0 +1,175 @@
package benchmark
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"testing"
"time"
)
const (
runnerURL = "http://localhost:8080"
warmupPrompts = 2 // Number of warm-up requests per test case
warmupTokens = 50 // Smaller token count for warm-up requests
)
var runnerMetrics []BenchmarkMetrics
// CompletionRequest represents the request body for the completion endpoint
type CompletionRequest struct {
Prompt string `json:"prompt"`
NumPredict int `json:"n_predict"`
Temperature float32 `json:"temperature"`
}
// CompletionResponse represents a single response chunk from the streaming API
type CompletionResponse struct {
Content string `json:"content"`
Stop bool `json:"stop"`
Timings struct {
PredictedN int `json:"predicted_n"`
PredictedMs int `json:"predicted_ms"`
PromptN int `json:"prompt_n"`
PromptMs int `json:"prompt_ms"`
} `json:"timings"`
}
// warmUp performs warm-up requests before the actual benchmark
func warmUp(b *testing.B, tt TestCase) {
b.Logf("Warming up for test case %s", tt.name)
warmupTest := TestCase{
name: tt.name + "_warmup",
prompt: tt.prompt,
maxTokens: warmupTokens,
}
for i := 0; i < warmupPrompts; i++ {
runCompletion(context.Background(), warmupTest, b)
time.Sleep(100 * time.Millisecond) // Brief pause between warm-up requests
}
b.Logf("Warm-up complete")
}
func BenchmarkRunnerInference(b *testing.B) {
b.Logf("Starting benchmark suite")
// Verify server availability
if _, err := http.Get(runnerURL + "/health"); err != nil {
b.Fatalf("Runner unavailable: %v", err)
}
b.Log("Runner available")
tests := []TestCase{
{
name: "short_prompt",
prompt: formatPrompt("Write a long story"),
maxTokens: 100,
},
{
name: "medium_prompt",
prompt: formatPrompt("Write a detailed economic analysis"),
maxTokens: 500,
},
{
name: "long_prompt",
prompt: formatPrompt("Write a comprehensive AI research paper"),
maxTokens: 1000,
},
}
// Register cleanup handler for results reporting
b.Cleanup(func() { reportMetrics(metrics) })
// Main benchmark loop
for _, tt := range tests {
b.Run(tt.name, func(b *testing.B) {
// Perform warm-up requests
warmUp(b, tt)
// Wait a bit after warm-up before starting the actual benchmark
time.Sleep(500 * time.Millisecond)
m := make([]BenchmarkMetrics, b.N)
for i := 0; i < b.N; i++ {
b.ResetTimer()
m[i] = runCompletion(context.Background(), tt, b)
}
metrics = append(metrics, m...)
})
}
}
func formatPrompt(text string) string {
return fmt.Sprintf("<|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", text)
}
func runCompletion(ctx context.Context, tt TestCase, b *testing.B) BenchmarkMetrics {
start := time.Now()
var ttft time.Duration
var tokens int
lastToken := start
// Create request body
reqBody := CompletionRequest{
Prompt: tt.prompt,
NumPredict: tt.maxTokens,
Temperature: 0.1,
}
jsonData, err := json.Marshal(reqBody)
if err != nil {
b.Fatalf("Failed to marshal request: %v", err)
}
// Create HTTP request
req, err := http.NewRequestWithContext(ctx, "POST", runnerURL+"/completion", bytes.NewBuffer(jsonData))
if err != nil {
b.Fatalf("Failed to create request: %v", err)
}
req.Header.Set("Content-Type", "application/json")
// Execute request
resp, err := http.DefaultClient.Do(req)
if err != nil {
b.Fatalf("Request failed: %v", err)
}
defer resp.Body.Close()
// Process streaming response
decoder := json.NewDecoder(resp.Body)
for {
var chunk CompletionResponse
if err := decoder.Decode(&chunk); err != nil {
if err == io.EOF {
break
}
b.Fatalf("Failed to decode response: %v", err)
}
if ttft == 0 && chunk.Content != "" {
ttft = time.Since(start)
}
if chunk.Content != "" {
tokens++
lastToken = time.Now()
}
if chunk.Stop {
break
}
}
totalTime := lastToken.Sub(start)
return BenchmarkMetrics{
testName: tt.name,
ttft: ttft,
totalTime: totalTime,
totalTokens: tokens,
tokensPerSecond: float64(tokens) / totalTime.Seconds(),
}
}

View File

@@ -0,0 +1,293 @@
// Package benchmark provides tools for performance testing of Ollama inference server and supported models.
package benchmark
import (
"context"
"fmt"
"net/http"
"net/url"
"os"
"testing"
"text/tabwriter"
"time"
"github.com/ollama/ollama/api"
)
// ServerURL is the default Ollama server URL for benchmarking
const serverURL = "http://127.0.0.1:11434"
// metrics collects all benchmark results for final reporting
var metrics []BenchmarkMetrics
// models contains the list of model names to benchmark
var models = []string{
"llama3.2:1b",
// "qwen2.5:7b",
// "llama3.3:70b",
}
// TestCase defines a benchmark test scenario with prompt characteristics
type TestCase struct {
name string // Human-readable test name
prompt string // Input prompt text
maxTokens int // Maximum tokens to generate
}
// BenchmarkMetrics contains performance measurements for a single test run
type BenchmarkMetrics struct {
model string // Model being tested
scenario string // cold_start or warm_start
testName string // Name of the test case
ttft time.Duration // Time To First Token (TTFT)
totalTime time.Duration // Total time for complete response
totalTokens int // Total generated tokens
tokensPerSecond float64 // Calculated throughput
}
// ScenarioType defines the initialization state for benchmarking
type ScenarioType int
const (
ColdStart ScenarioType = iota // Model is loaded from cold state
WarmStart // Model is already loaded in memory
)
// String implements fmt.Stringer for ScenarioType
func (s ScenarioType) String() string {
return [...]string{"cold_start", "warm_start"}[s]
}
// BenchmarkServerInference is the main entry point for benchmarking Ollama inference performance.
// It tests all configured models with different prompt lengths and start scenarios.
func BenchmarkServerInference(b *testing.B) {
b.Logf("Starting benchmark suite with %d models", len(models))
// Verify server availability
if _, err := http.Get(serverURL + "/api/version"); err != nil {
b.Fatalf("Server unavailable: %v", err)
}
b.Log("Server available")
tests := []TestCase{
{"short_prompt", "Write a long story", 100},
{"medium_prompt", "Write a detailed economic analysis", 500},
{"long_prompt", "Write a comprehensive AI research paper", 1000},
}
// Register cleanup handler for results reporting
b.Cleanup(func() { reportMetrics(metrics) })
// Main benchmark loop
for _, model := range models {
client := api.NewClient(mustParse(serverURL), http.DefaultClient)
// Verify model availability
if _, err := client.Show(context.Background(), &api.ShowRequest{Model: model}); err != nil {
b.Fatalf("Model unavailable: %v", err)
}
for _, tt := range tests {
testName := fmt.Sprintf("%s/%s/%s", model, ColdStart, tt.name)
b.Run(testName, func(b *testing.B) {
m := runBenchmark(b, tt, model, ColdStart, client)
metrics = append(metrics, m...)
})
}
for _, tt := range tests {
testName := fmt.Sprintf("%s/%s/%s", model, WarmStart, tt.name)
b.Run(testName, func(b *testing.B) {
m := runBenchmark(b, tt, model, WarmStart, client)
metrics = append(metrics, m...)
})
}
}
}
// runBenchmark executes multiple iterations of a specific test case and scenario.
// Returns collected metrics for all iterations.
func runBenchmark(b *testing.B, tt TestCase, model string, scenario ScenarioType, client *api.Client) []BenchmarkMetrics {
results := make([]BenchmarkMetrics, b.N)
// Run benchmark iterations
for i := 0; i < b.N; i++ {
switch scenario {
case WarmStart:
// Pre-warm the model by generating some tokens
for i := 0; i < 2; i++ {
client.Generate(
context.Background(),
&api.GenerateRequest{
Model: model,
Prompt: tt.prompt,
Options: map[string]interface{}{"num_predict": tt.maxTokens, "temperature": 0.1},
},
func(api.GenerateResponse) error { return nil },
)
}
case ColdStart:
unloadModel(client, model, b)
}
b.ResetTimer()
results[i] = runSingleIteration(context.Background(), client, tt, model, b)
results[i].scenario = scenario.String()
}
return results
}
// unloadModel forces model unloading using KeepAlive: -1 parameter.
// Includes short delay to ensure unloading completes before next test.
func unloadModel(client *api.Client, model string, b *testing.B) {
req := &api.GenerateRequest{
Model: model,
KeepAlive: &api.Duration{Duration: 0},
}
if err := client.Generate(context.Background(), req, func(api.GenerateResponse) error { return nil }); err != nil {
b.Logf("Unload error: %v", err)
}
time.Sleep(100 * time.Millisecond)
}
// runSingleIteration measures performance metrics for a single inference request.
// Captures TTFT, total generation time, and calculates tokens/second.
func runSingleIteration(ctx context.Context, client *api.Client, tt TestCase, model string, b *testing.B) BenchmarkMetrics {
start := time.Now()
var ttft time.Duration
var tokens int
lastToken := start
req := &api.GenerateRequest{
Model: model,
Prompt: tt.prompt,
Options: map[string]interface{}{"num_predict": tt.maxTokens, "temperature": 0.1},
}
if b != nil {
b.Logf("Prompt length: %d chars", len(tt.prompt))
}
// Execute generation request with metrics collection
client.Generate(ctx, req, func(resp api.GenerateResponse) error {
if ttft == 0 {
ttft = time.Since(start)
}
if resp.Response != "" {
tokens++
lastToken = time.Now()
}
return nil
})
totalTime := lastToken.Sub(start)
return BenchmarkMetrics{
model: model,
testName: tt.name,
ttft: ttft,
totalTime: totalTime,
totalTokens: tokens,
tokensPerSecond: float64(tokens) / totalTime.Seconds(),
}
}
// reportMetrics processes collected metrics and prints formatted results.
// Generates both human-readable tables and CSV output with averaged statistics.
func reportMetrics(results []BenchmarkMetrics) {
if len(results) == 0 {
return
}
// Aggregate results by test case
type statsKey struct {
model string
scenario string
testName string
}
stats := make(map[statsKey]*struct {
ttftSum time.Duration
totalTimeSum time.Duration
tokensSum int
iterations int
})
for _, m := range results {
key := statsKey{m.model, m.scenario, m.testName}
if _, exists := stats[key]; !exists {
stats[key] = &struct {
ttftSum time.Duration
totalTimeSum time.Duration
tokensSum int
iterations int
}{}
}
stats[key].ttftSum += m.ttft
stats[key].totalTimeSum += m.totalTime
stats[key].tokensSum += m.totalTokens
stats[key].iterations++
}
// Calculate averages
var averaged []BenchmarkMetrics
for key, data := range stats {
count := data.iterations
averaged = append(averaged, BenchmarkMetrics{
model: key.model,
scenario: key.scenario,
testName: key.testName,
ttft: data.ttftSum / time.Duration(count),
totalTime: data.totalTimeSum / time.Duration(count),
totalTokens: data.tokensSum / count,
tokensPerSecond: float64(data.tokensSum) / data.totalTimeSum.Seconds(),
})
}
// Print formatted results
printTableResults(averaged)
printCSVResults(averaged)
}
// printTableResults displays averaged metrics in a formatted table
func printTableResults(averaged []BenchmarkMetrics) {
w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
fmt.Fprintln(w, "\nAVERAGED BENCHMARK RESULTS")
fmt.Fprintln(w, "Model\tScenario\tTest Name\tTTFT (ms)\tTotal Time (ms)\tTokens\tTokens/sec")
for _, m := range averaged {
fmt.Fprintf(w, "%s\t%s\t%s\t%.2f\t%.2f\t%d\t%.2f\n",
m.model,
m.scenario,
m.testName,
float64(m.ttft.Milliseconds()),
float64(m.totalTime.Milliseconds()),
m.totalTokens,
m.tokensPerSecond,
)
}
w.Flush()
}
// printCSVResults outputs averaged metrics in CSV format
func printCSVResults(averaged []BenchmarkMetrics) {
fmt.Println("\nCSV OUTPUT")
fmt.Println("model,scenario,test_name,ttft_ms,total_ms,tokens,tokens_per_sec")
for _, m := range averaged {
fmt.Printf("%s,%s,%s,%.2f,%.2f,%d,%.2f\n",
m.model,
m.scenario,
m.testName,
float64(m.ttft.Milliseconds()),
float64(m.totalTime.Milliseconds()),
m.totalTokens,
m.tokensPerSecond,
)
}
}
// mustParse is a helper function to parse URLs with panic on error
func mustParse(rawURL string) *url.URL {
u, err := url.Parse(rawURL)
if err != nil {
panic(err)
}
return u
}

417
cache/cache.go vendored
View File

@@ -1,63 +1,420 @@
package cache
import (
"errors"
"fmt"
"log/slog"
"math"
"slices"
"github.com/ollama/ollama/ml"
)
type Options struct {
Position int
}
var ErrNotSupported = errors.New("model does not support operation")
type Cache interface {
// ** used by model implementations **
// Returns an instance of the cache for layer 'i'
Sub(i int) Cache
Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor)
// Returns the history of key and value tensors plus a mask
//
// The tensors are of shape embed dim, kv heads, batch size
// The mask is of shape history size, batch size
Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor)
// Stores a batch of key and value in the cache
//
// The tensors must be of shape embed dim, kv heads, batch size
Put(ctx ml.Context, key, value ml.Tensor)
// ** cache management **
// Closes the cache and frees resources associated with it
Close()
// Called before the start of the model's forward pass. For each
// token in the coming batch, there must be a corresponding entry
// in positions and seqs.
StartForward(ctx ml.Context, positions []int32, seqs []int) error
// Copies tokens in the range [0, len) from srcSeq to dstSeq
CopyPrefix(srcSeq, dstSeq int, len int32)
// Removes tokens in the range [beginIndex, endIndex) from seq. Set
// endIndex to math.MaxInt32 to remove everything starting at beginIndex
Remove(seq int, beginIndex, endIndex int32) error
}
type Simple struct {
type Causal struct {
DType ml.DType
Capacity int
Capacity int32
// current forward pass
curLayer int
curLoc int
curBatchSize int
curMask ml.Tensor
curCellRange cellRange
// metadata
cells []cacheCell
cellRanges map[int]cellRange
// cache data storage
backend ml.Backend
cacheCtx ml.Context
keys, values []ml.Tensor
}
func (c *Simple) Sub(i int) Cache {
type seqCell struct {
seq int
pos int32
}
type cacheCell struct {
sequences []seqCell
}
type cellRange struct {
min int
max int
}
func (cell cacheCell) findSeq(seq int) *seqCell {
for i := range cell.sequences {
if cell.sequences[i].seq == seq {
return &cell.sequences[i]
}
}
return nil
}
func NewCausalCache(backend ml.Backend, dtype ml.DType, capacity int32) Cache {
return &Causal{
Capacity: capacity,
DType: dtype,
cells: make([]cacheCell, capacity),
cellRanges: make(map[int]cellRange),
backend: backend,
cacheCtx: backend.NewContext(),
}
}
func (c *Causal) Close() {
c.cacheCtx.Close()
}
var ErrKvCacheFull = errors.New("could not find a kv cache slot")
func (c *Causal) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
if len(positions) != len(seqs) {
return fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(positions), len(seqs))
}
c.curBatchSize = len(positions)
if c.curBatchSize < 1 {
return errors.New("batch size cannot be less than 1")
}
var err error
c.curLoc, err = c.findStartLoc()
if errors.Is(err, ErrKvCacheFull) {
c.defrag()
c.curLoc, err = c.findStartLoc()
}
if err != nil {
return err
}
c.curCellRange = newRange()
for i, pos := range positions {
seq := seqs[i]
c.cells[c.curLoc+i] = cacheCell{sequences: []seqCell{{seq: seq, pos: pos}}}
ranges, ok := c.cellRanges[seq]
if !ok {
ranges = newRange()
}
if c.curLoc+i > ranges.max {
ranges.max = c.curLoc + i
}
if ranges.max > c.curCellRange.max {
c.curCellRange.max = ranges.max
}
if c.curLoc+i < ranges.min {
ranges.min = c.curLoc + i
}
if ranges.min < c.curCellRange.min {
c.curCellRange.min = ranges.min
}
c.cellRanges[seq] = ranges
}
c.curMask, err = c.buildMask(ctx, positions, seqs)
return err
}
func newRange() cellRange {
return cellRange{
min: math.MaxInt,
max: 0,
}
}
func (c *Causal) findStartLoc() (int, error) {
var start, count int
for i := range c.cells {
if len(c.cells[i].sequences) == 0 {
count++
if count >= c.curBatchSize {
return start, nil
}
} else {
start = i + 1
count = 0
}
}
return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, c.Capacity)
}
func (c *Causal) buildMask(ctx ml.Context, positions []int32, seqs []int) (ml.Tensor, error) {
// TODO(jessegross): This makes a number of simplifications such as no padding,
// which could be an issue for CUDA graphs and/or flash attention
len := c.curCellRange.max - c.curCellRange.min + 1
mask := make([]float32, c.curBatchSize*len)
for i := range c.curBatchSize {
for j := c.curCellRange.min; j <= c.curCellRange.max; j++ {
cellSeq := c.cells[j].findSeq(seqs[i])
if cellSeq == nil || cellSeq.pos > positions[i] {
mask[i*len+(j-c.curCellRange.min)] = float32(math.Inf(-1))
}
}
}
return ctx.FromFloatSlice(mask, len, c.curBatchSize)
}
func moveCell(ctx ml.Context, objs []ml.Tensor, src, dst, len int) {
for _, obj := range objs {
srcView := obj.View(ctx, int(obj.Stride(2))*src, int(obj.Dim(0)*obj.Dim(1))*len)
dstView := obj.View(ctx, int(obj.Stride(2))*dst, int(obj.Dim(0)*obj.Dim(1))*len)
ctx.Forward(srcView.Copy(ctx, dstView))
}
}
func (c *Causal) defrag() {
slog.Debug("defragmenting kv cache")
// Defrag strategy:
// - Search for empty holes at the beginning of the cache,
// filling them with active data starting at the end
// - If there are contiguous elements that need to be moved,
// combine them into a single operation by holding new moves
// until we see the next one is non-contiguous
// - Fill up the context with the maximum number of operations it
// can hold then compute that and continue with a new context
//
// We could try to optimize placement by grouping blocks from
// the same sequences together but most likely the next forward
// pass will disrupt this anyways, so the real world benefit
// seems limited as this time.
ctx := c.backend.NewContext()
// For every move, 6 tensors are required per layer (2 views and a
// copy for each of k and v). For efficiency, we try to group
// multiple contiguous blocks into a single move. However, if we
// exceed the maximum number of tensors then we need to compute
// what we have and start a new batch.
maxMoves := ctx.MaxTensors() / (6 * len(c.keys))
moves := 0
var pendingSrc, pendingDst, pendingLen int
for dst := range c.cells {
if len(c.cells[dst].sequences) == 0 {
for src := len(c.cells) - 1; src > dst; src-- {
if len(c.cells[src].sequences) != 0 {
c.cells[dst] = c.cells[src]
c.cells[src] = cacheCell{}
if pendingLen > 0 {
if src == pendingSrc-pendingLen && dst == pendingDst+pendingLen {
pendingSrc = src
pendingLen++
break
} else {
moveCell(ctx, c.keys, pendingSrc, pendingDst, pendingLen)
moveCell(ctx, c.values, pendingSrc, pendingDst, pendingLen)
moves++
}
}
pendingSrc = src
pendingDst = dst
pendingLen = 1
break
}
}
}
if moves >= maxMoves {
ctx.Compute(nil)
ctx.Close()
ctx = c.backend.NewContext()
moves = 0
}
}
if pendingLen > 0 {
moveCell(ctx, c.keys, pendingSrc, pendingDst, pendingLen)
moveCell(ctx, c.values, pendingSrc, pendingDst, pendingLen)
moves++
}
if moves > 0 {
ctx.Compute(nil)
}
ctx.Close()
for seq := range c.cellRanges {
seqRange := newRange()
for i, cell := range c.cells {
if cell.findSeq(seq) != nil {
if i < seqRange.min {
seqRange.min = i
}
if i > seqRange.max {
seqRange.max = i
}
}
}
c.cellRanges[seq] = seqRange
}
}
func (c *Causal) Sub(i int) Cache {
if i >= len(c.keys) {
c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
}
return &Simple{
keys: c.keys[i : i+1],
values: c.values[i : i+1],
Capacity: c.Capacity,
DType: c.DType,
}
c.curLayer = i
return c
}
func (c *Simple) Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor) {
if c.keys[0] == nil || c.values[0] == nil {
c.keys[0] = ctx.Zeros(c.DType, int(key.Dim(0)*key.Dim(1))*c.Capacity)
c.values[0] = ctx.Zeros(c.DType, int(value.Dim(0)*value.Dim(1))*c.Capacity)
}
func (c *Causal) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
key := c.keys[c.curLayer]
value := c.values[c.curLayer]
ctx.Forward(key.Copy(ctx, c.keys[0].View(ctx, int(key.Stride(2))*opts.Position, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
ctx.Forward(value.Copy(ctx, c.values[0].View(ctx, int(value.Stride(2))*opts.Position, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))
n := min(c.Capacity, int(key.Dim(2))+opts.Position)
key = c.keys[0].View(ctx, 0,
key = key.View(ctx, int(key.Stride(2))*c.curCellRange.min,
int(key.Dim(0)), int(key.Stride(1)),
int(key.Dim(1)), int(key.Stride(2)),
n,
int(c.curMask.Dim(0)),
)
value = c.values[0].View(ctx, 0,
value = value.View(ctx, int(key.Stride(2))*c.curCellRange.min,
int(value.Dim(0)), int(value.Stride(1)),
int(value.Dim(1)), int(value.Stride(2)),
n,
int(c.curMask.Dim(0)),
)
// TODO shift context if necessary
return key, value
return key, value, c.curMask
}
func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
if c.curBatchSize != int(key.Dim(2)) {
panic(fmt.Errorf("inconsistent batch sizes (layer: %v, batch size: %v layer batch size: %v)", c.curLayer, c.curBatchSize, int(key.Dim(2))))
}
if c.keys[c.curLayer] == nil || c.values[c.curLayer] == nil {
c.keys[c.curLayer] = c.cacheCtx.Zeros(c.DType, key.Dim(0), key.Dim(1), int64(c.Capacity))
c.values[c.curLayer] = c.cacheCtx.Zeros(c.DType, value.Dim(0), value.Dim(1), int64(c.Capacity))
}
ctx.Forward(key.Copy(ctx, c.keys[c.curLayer].View(ctx, int(key.Stride(2))*c.curLoc, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, int(value.Stride(2))*c.curLoc, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))
}
func (c *Causal) CopyPrefix(srcSeq, dstSeq int, len int32) {
seqRange := newRange()
for i := range c.cells {
srcCellSeq := c.cells[i].findSeq(srcSeq)
dstCellSeq := c.cells[i].findSeq(dstSeq)
if dstCellSeq != nil {
c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s seqCell) bool { return s.seq == dstSeq })
}
if srcCellSeq != nil && srcCellSeq.pos < len {
c.cells[i].sequences = append(c.cells[i].sequences, seqCell{seq: dstSeq, pos: srcCellSeq.pos})
if i < seqRange.min {
seqRange.min = i
}
if i > seqRange.max {
seqRange.max = i
}
}
}
c.cellRanges[dstSeq] = seqRange
}
func (c *Causal) shift(seq int, beginIndex, offset int32) error {
panic("Shift not yet implemented")
}
func (c *Causal) Remove(seq int, beginIndex, endIndex int32) error {
var offset int32
if endIndex != math.MaxInt32 {
offset = beginIndex - endIndex
}
seqRange := newRange()
for i := range c.cells {
cellSeq := c.cells[i].findSeq(seq)
if cellSeq != nil {
if cellSeq.pos >= beginIndex && cellSeq.pos < endIndex {
c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s seqCell) bool { return s.seq == seq })
} else {
if cellSeq.pos >= endIndex {
cellSeq.pos += offset
}
if i < seqRange.min {
seqRange.min = i
}
if i > seqRange.max {
seqRange.max = i
}
}
}
}
if endIndex != math.MaxInt32 {
err := c.shift(seq, endIndex, offset)
if err != nil {
return err
}
}
c.cellRanges[seq] = seqRange
return nil
}

47
cache/tensor.go vendored Normal file
View File

@@ -0,0 +1,47 @@
package cache
import (
"github.com/ollama/ollama/ml"
)
type TensorCache struct {
curLayer int
cacheCtx ml.Context
keys, values []ml.Tensor
}
func NewTensorCache(backend ml.Backend) *TensorCache {
return &TensorCache{
cacheCtx: backend.NewContext(),
}
}
func (c *TensorCache) Close() {
c.cacheCtx.Close()
}
func (c *TensorCache) Sub(i int) *TensorCache {
if i >= len(c.keys) {
c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
}
c.curLayer = i
return c
}
func (c *TensorCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
return c.keys[c.curLayer], c.values[c.curLayer], nil
}
func (c *TensorCache) Put(ctx ml.Context, key, value ml.Tensor) {
if c.keys[c.curLayer] == nil || c.values[c.curLayer] == nil {
c.keys[c.curLayer] = c.cacheCtx.Zeros(key.DType(), key.Shape()...)
c.values[c.curLayer] = c.cacheCtx.Zeros(value.DType(), value.Shape()...)
}
ctx.Forward(key.Copy(ctx, c.keys[c.curLayer]))
ctx.Forward(value.Copy(ctx, c.values[c.curLayer]))
}

View File

@@ -35,9 +35,9 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/llama/runner"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/runner"
"github.com/ollama/ollama/server"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
@@ -59,7 +59,7 @@ func getModelfileName(cmd *cobra.Command) (string, error) {
_, err = os.Stat(absName)
if err != nil {
return filename, err
return "", err
}
return absName, nil
@@ -338,7 +338,10 @@ func RunHandler(cmd *cobra.Command, args []string) error {
return err
}
opts.MultiModal = len(info.ProjectorInfo) != 0
// TODO(jessegross): We should either find another way to know if this is
// a vision model or remove the logic. Also consider that other modalities will
// need different behavior anyways.
opts.MultiModal = true
opts.ParentModel = info.Details.ParentModel
if interactive {

View File

@@ -279,7 +279,7 @@ func TestGetModelfileName(t *testing.T) {
name: "no modelfile specified, no modelfile exists",
modelfileName: "",
fileExists: false,
expectedName: "Modelfile",
expectedName: "",
expectedErr: os.ErrNotExist,
},
{
@@ -293,7 +293,7 @@ func TestGetModelfileName(t *testing.T) {
name: "modelfile specified, no modelfile exists",
modelfileName: "crazyfile",
fileExists: false,
expectedName: "crazyfile",
expectedName: "",
expectedErr: os.ErrNotExist,
},
{

View File

@@ -4,7 +4,7 @@ import (
"fmt"
"os"
"github.com/ollama/ollama/llama/runner"
"github.com/ollama/ollama/runner"
)
func main() {

View File

@@ -191,6 +191,8 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
conv = &qwen2Model{}
case "BertModel":
conv = &bertModel{}
case "CohereForCausalLM":
conv = &commandrModel{}
default:
return errors.New("unsupported architecture")
}

View File

@@ -0,0 +1,76 @@
package convert
import (
"cmp"
"github.com/ollama/ollama/fs/ggml"
)
type commandrModel struct {
ModelParameters
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"`
HiddenLayers uint32 `json:"num_hidden_layers"`
IntermediateSize uint32 `json:"intermediate_size"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
LayerNormEPS float32 `json:"layer_norm_eps"`
RopeTheta float32 `json:"rope_theta"`
UseQKNorm bool `json:"use_qk_norm"`
MaxLength uint32 `json:"model_max_length"`
LogitScale float32 `json:"logit_scale"`
NCtx uint32 `json:"n_ctx"`
}
var _ ModelConverter = (*commandrModel)(nil)
func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "command-r"
kv["general.name"] = "command-r"
kv["command-r.context_length"] = cmp.Or(p.MaxLength, p.MaxPositionEmbeddings, p.NCtx)
kv["command-r.embedding_length"] = p.HiddenSize
kv["command-r.block_count"] = p.HiddenLayers
kv["command-r.feed_forward_length"] = p.IntermediateSize
kv["command-r.attention.head_count"] = p.NumAttentionHeads
kv["command-r.attention.head_count_kv"] = p.NumKeyValueHeads
kv["command-r.attention.layer_norm_epsilon"] = p.LayerNormEPS
kv["command-r.rope.freq_base"] = p.RopeTheta
kv["command-r.max_position_embeddings"] = cmp.Or(p.MaxLength, p.MaxPositionEmbeddings)
kv["command-r.logit_scale"] = p.LogitScale
kv["command-r.rope.scaling.type"] = "none"
return kv
}
func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
for _, t := range ts {
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
}
return out
}
func (p *commandrModel) Replacements() []string {
return []string{
"self_attn.q_norm", "attn_q_norm",
"self_attn.k_norm", "attn_k_norm",
"model.layers", "blk",
"input_layernorm", "attn_norm",
"mlp.down_proj", "ffn_down",
"mlp.gate_proj", "ffn_gate",
"mlp.up_proj", "ffn_up",
"self_attn.k_proj", "attn_k",
"self_attn.o_proj", "attn_output",
"self_attn.q_proj", "attn_q",
"self_attn.v_proj", "attn_v",
"model.norm", "output_norm",
"model.embed_tokens", "token_embd",
}
}

View File

@@ -109,6 +109,7 @@ func TestConvertModel(t *testing.T) {
"all-MiniLM-L6-v2",
"gemma-2-9b-it",
"Qwen2.5-0.5B-Instruct",
"c4ai-command-r-v01",
}
for i := range cases {

344
convert/testdata/c4ai-command-r-v01.json vendored Normal file
View File

@@ -0,0 +1,344 @@
{
"general.architecture": "command-r",
"general.name": "command-r",
"command-r.attention.head_count": "64",
"command-r.attention.head_count_kv": "64",
"command-r.attention.layer_norm_epsilon": "1e-05",
"command-r.block_count": "40",
"command-r.context_length": "131072",
"command-r.embedding_length": "8192",
"command-r.feed_forward_length": "22528",
"command-r.logit_scale": "0.0625",
"command-r.rope.freq_base": "8e+06",
"command-r.rope.scaling.type": "none",
"tokenizer.ggml.add_bos_token": "true",
"tokenizer.ggml.add_eos_token": "false",
"tokenizer.ggml.bos_token_id": "5",
"tokenizer.ggml.eos_token_id": "255001",
"tokenizer.ggml.merges": "902a060cac8884a5793d2a857dd2e53a259de46c8d08c4deb243c239671e1350",
"tokenizer.ggml.model": "gpt2",
"tokenizer.ggml.padding_token_id": "0",
"tokenizer.ggml.token_type": "b7a352ccd1c99d4413bcf452c2db707b0526d0e1216616b865560fab80296462",
"tokenizer.ggml.tokens": "815ac90ff23565081522d7258f46648c8a0619eb847a9c7c31b238a9b984e4ae",
"blk.0.attn_k.weight": "6fcfdb466f9ceb1229404ce4ec4e480751b8d00da12707a11783dad7256cb864",
"blk.0.attn_norm.weight": "6063317f731371864049c7704a70772f1eb632194201ebdc2ed0f8e483507c72",
"blk.0.attn_output.weight": "920f49716a1e2fc73b6794ec777947f1c122701e63ed302422ac89e90f06e9da",
"blk.0.attn_q.weight": "ddbcd7cde197e632564ac58e4f25d9e3a8ca52917329eeb6081eb41a797932ab",
"blk.0.attn_v.weight": "318fc02a189d87420f0cbf57f47f11e00c21ec1ed472ce0a2a895b44f7fa0fca",
"blk.0.ffn_down.weight": "aa71975b6eb1f4c77b03d2ac4a194cf8d95718efac741bb12f0f3ff79a27f9bc",
"blk.0.ffn_gate.weight": "42967702fa0bc738b88dc50007ace26dbe74a5a9e0978124dd093f818241a9e1",
"blk.0.ffn_up.weight": "5282c8788b086bd30f46525e7995a17464882a72703fd27165491afdd8bfd4af",
"blk.1.attn_k.weight": "cd248882e64fd2c3402c44790ebe12440133dc671b6893fdad0564c461973adc",
"blk.1.attn_norm.weight": "ba84e1c8fd30af6ec94208db4078befac8c921aad3acb887812887f3282ea2be",
"blk.1.attn_output.weight": "2efa3ef7c5666ccceb05e339b83ad680cc0d2c3ec78203f5da5959f23a80e14f",
"blk.1.attn_q.weight": "5106f2e255358a1303c22e8b5f0ec044852bb30a866c52cabefd30017a7a6b7d",
"blk.1.attn_v.weight": "a211a634a1a5df1d5f973645438be0461dd922210f9747c6b04e386c7f1ebe95",
"blk.1.ffn_down.weight": "37093afe48d32c578ec956c9ed85242cd000d6aa979e60526aafa10c822dbb10",
"blk.1.ffn_gate.weight": "469860819e9159caefb1aad0bc66db790f3393f05fd87b08e52256a7ed256543",
"blk.1.ffn_up.weight": "736742c97d35d1a011f9cafd3c0ce947ad559bb2fba6da73c816f6bfd0fa9aeb",
"blk.2.attn_k.weight": "92c219d92804d832ab404bd6dc7339c90877bb7cf405dd030c121f8b27757739",
"blk.2.attn_norm.weight": "61e4466069474b76b6d1e702566420eb669faf3556b00ff7b824784aca13a2d6",
"blk.2.attn_output.weight": "d2fb38a2b2171fd91caf037faa585a62225819aa232d86fd4f7f9d2c3c8a45e9",
"blk.2.attn_q.weight": "f6faf5cc6844e3daa4f9f68d90f5458c64879de68a7728860e38374e30c3429d",
"blk.2.attn_v.weight": "f340ef8f7341d987a6f37c0e9afe0aef5be67be00c0ce5f57612daf73319cce1",
"blk.2.ffn_down.weight": "c7be61a701d779860b621b143fb6365b607bf99ec7c0f153b07908ac8120885a",
"blk.2.ffn_gate.weight": "b64f0878187bd3392abfa4c3e8ad2f8b4c133903e54246747ff8f3b4639ad83e",
"blk.2.ffn_up.weight": "50b11c712652e90ee7428dbb45cffebb80662ac982bc72bd9eafff361b5eb5a8",
"blk.3.attn_k.weight": "2b7bcbe9ee5c9c630c8c8d7483887e78b73581016f4cbb6933db2a147a25f431",
"blk.3.attn_norm.weight": "0181dac7f4eee7252980323e8032cf339bef2046ce0a16c0fd72af7c98a8a37b",
"blk.3.attn_output.weight": "aef8843b636ce231da9e7c9acbee197883cc15df0e2887709324c6a50f16da7b",
"blk.3.attn_q.weight": "55404130fa10e81322d33eb378aa0de31a92990ce7730f1338c0ace0406bb1b1",
"blk.3.attn_v.weight": "76f7fb8040d82b957d689ce34fea2302a6640ad5bbaa0052ad2b7ebce270c33d",
"blk.3.ffn_down.weight": "648628933eff3b357c3729c33c5b1ae51c28e59b9c19acd1601a2ff7c5d5d9a5",
"blk.3.ffn_gate.weight": "6a588885d16e98d5f50ebed05af089154f680085ca9c97691e5b489088630a4a",
"blk.3.ffn_up.weight": "e12455a1d702f4986e1a663493e3d5102b367af74d45557522002a35d63ecac2",
"blk.4.attn_k.weight": "40d943380a8a85e4eab147934bf6e16f23cc8ab753f6636526382c074d182288",
"blk.4.attn_norm.weight": "4ab2c098983d4599fe540eef624c4df954adb7473faebda7471ef0ba4134814c",
"blk.4.attn_output.weight": "d14b91e40f58bf4a3c8c2eca0b12bb541de406574af39027d56f6c588a147082",
"blk.4.attn_q.weight": "e1224960a3562107488589f883fa32414bae41712fa8dbd47c5f3e3a7801452f",
"blk.4.attn_v.weight": "063f297bc4aa6e709fc32c4c32e35af7d07d80e83cb939b76adbba858006c03d",
"blk.4.ffn_down.weight": "f88a18020c5e1caaa29596895eb348e76ee5bfad27ed57651a86cd8cd1f9b5aa",
"blk.4.ffn_gate.weight": "48e7e1eed3fb52e92e61d3557dd0ec002418327090e034ce4322fd68542266f8",
"blk.4.ffn_up.weight": "1ca8a7aa17355b6ce0d9ad5539fdad3899fa47fd359c285fbfb31f19f47bf073",
"blk.5.attn_k.weight": "2bdf15f8e73d068d972380f25d207004cf0bf3b5bfa46946803ba6fba07d9175",
"blk.5.attn_norm.weight": "60448d7cde6e1b6467aa31bdea012e39cdb08c88081cee7d102dca4f93f766ef",
"blk.5.attn_output.weight": "f9f687d7c457537f9fca8a4087a59f1c3bebfaf5537b94e42c831a13224f7799",
"blk.5.attn_q.weight": "987db7a2ad68657a92625e1980effbb1f79697c2183f2b9f3b3a0570c51b0ab9",
"blk.5.attn_v.weight": "cf696891148f3e4783ad1d20f93462ae091eb8651c656bba9b662253b6263e02",
"blk.5.ffn_down.weight": "c0662b0bd0929136005fb9d691fdd9b2c33867d9ce9622339a6a456b720b059a",
"blk.5.ffn_gate.weight": "200bbdfab615d7a3a84719b6ced7751e3ce52757ef212d96f87798bc1de5e987",
"blk.5.ffn_up.weight": "df5d23e7e035fb1b9d163da7ddfdfe38da6a37e86e96534dc02ad20f011b55b3",
"blk.6.attn_k.weight": "c0dae2d272a7c5a2fa004bbb8475dbab362fc1f6d008e73d5a4434a9382ac6ba",
"blk.6.attn_norm.weight": "51c57ac8b55e04354d5dca6bb9c0cf4177639d3b038e80209e33036209688f64",
"blk.6.attn_output.weight": "229d97892c62f85bcdf431675250e01c976ad69ffa450b01fb543bf88f14a2fb",
"blk.6.attn_q.weight": "c20e49621821bd46ed156e6823864a5bda4f317750e71ab8dc54e44eb48cf7c2",
"blk.6.attn_v.weight": "53ceb1a2ee43fce3c7b5b33c58a9fc5ee7f44dc1c6f29bc9dbefc37582102dc9",
"blk.6.ffn_down.weight": "7923c943b7629d560a032d1efa210d1d75c6692140f1be94464ee7ed24f44ed0",
"blk.6.ffn_gate.weight": "57593d350361af753a6a39f53b066282634c0fb44f396f6f2966a574b01d8f8c",
"blk.6.ffn_up.weight": "327b6a7a387098b8899d3ded04a4d4e7c658ca61b80d4e7b17594be232721602",
"blk.7.attn_k.weight": "9ca48b87a10116fd8868e62b76f211d4bb91f166096be9061439ee2e1c3a5c20",
"blk.7.attn_norm.weight": "cd56cfcc4e2ad6b96e23ea7b0d32b4caf236107d99a0b22c56760b62e63c8cfd",
"blk.7.attn_output.weight": "7352b509a03cae2491ffc060e577d189341a0f861233f18c96f9d275dc4234bf",
"blk.7.attn_q.weight": "2b3791c8c008c33ddbe12bedba8191322ceea2dcce5cf0eb7a93d40ad254e672",
"blk.7.attn_v.weight": "3ae721d52466487a3d48150581e57f6d64ea1e83ab929f23b28c3d777422eeb6",
"blk.7.ffn_down.weight": "3b6fa8ececdb3c34af3a5363863d6f94289c1c95bf47fce3a3ddcf184c5f0848",
"blk.7.ffn_gate.weight": "dbd7df6c5ae5eb4adb859f0d36453813a4e289a359a1ba8f72d67fcbf21c3e22",
"blk.7.ffn_up.weight": "de68380a334b4c5cfd4c318b0e9854aec59bd79aa0f0c30af3f56414f83482b0",
"blk.8.attn_k.weight": "7303c4e4480abc72a7ee271811311199245fb5c2ea27a2bd3b8cad3a53a03c27",
"blk.8.attn_norm.weight": "2e3d1921898d1b943ce1a1b6818546c8b471d6d542da24f51a8b514b8c3dd4ef",
"blk.8.attn_output.weight": "30421520887b66bf97a18dbcdc283bc8d0b60590b612fd638a319a6eae923227",
"blk.8.attn_q.weight": "73e064d5433c9b500068a1c31744dbd53f4ade298fb450a0e8c97f62cf1f8a8d",
"blk.8.attn_v.weight": "27e21f8b9a9a8533e8178ca34a72aa1d786393d57302b7806dcdf3e51de511a8",
"blk.8.ffn_down.weight": "bf694bd8e00047982108000e7b3dee7b225db8b19abc595e5697b6bbefd92e7c",
"blk.8.ffn_gate.weight": "d55fdbf8606d9141b774b0500c58944fd1253b9e69d1f765eaa9a680b9f2ca40",
"blk.8.ffn_up.weight": "1ae3f580655e7c8e8dd6c34fa4ac574fdfc5e3f1a8536da0c5442d3a2976f0e7",
"blk.9.attn_k.weight": "b18080626012d8aabcf78542d6c7bf31c712bf55a70172fbfe173fcf34481036",
"blk.9.attn_norm.weight": "2e3620620dc09998c6d3063a7d5de5433fbbae8c11e5b00d13f145d39140e162",
"blk.9.attn_output.weight": "69c3c0e27ef1c0fc933eeb7b612b70909f18cde238873c0d576a2ba9714ef174",
"blk.9.attn_q.weight": "68330e5aa28a28873c9a6e67f032186ef651df2df5844e0f27094ba349fbe4ab",
"blk.9.attn_v.weight": "3df8d45a102be082d0793a51cb82aa62a43cd0e9d047ba4115ca0f2414b39325",
"blk.9.ffn_down.weight": "1d6cc162b73745b135b4f040a0aac3c06d5135a3dc5b2421e7ee2af48662fd7f",
"blk.9.ffn_gate.weight": "034a9d40fb1e32b534b45f4bccd65cbe43c4a6a3f5d01132bd245ca0005de5fc",
"blk.9.ffn_up.weight": "c838c38d0e1a0ac0da17eb2a66023ed31929f07d8fcfe1cc546df26096c91f0c",
"blk.10.attn_k.weight": "a78507cb72f744b86ceaa032596e74e5571c822d0226d334881169addb32cbd5",
"blk.10.attn_norm.weight": "35f48d0b28ee0e6b4cad4e983925737562d64824be5b168b3e26df3d6b260cf1",
"blk.10.attn_output.weight": "53712db06796de39b131323e7abf9a58551b6d52da6db66a471580386d396252",
"blk.10.attn_q.weight": "efe08429ba196026b81cd1c471e1c7418afd9e966659feb3936b674aa0803b58",
"blk.10.attn_v.weight": "7ec6055e134f89da0cbe79ec9f13ef2e442ac584b1f03c3e13e7d0cdad0078bd",
"blk.10.ffn_down.weight": "37e66af4bcd1f3079e841e892255b8255070655901864ea3a8c602a7f681a640",
"blk.10.ffn_gate.weight": "1825282bc34830d371c6edcc3c1e73e6ecc1e10f4aea0122dbb7acc1d6f7b1bc",
"blk.10.ffn_up.weight": "819b3b276a4d4c14a35ed6682d5ef18a5e8ed468e5ce3f12e8c75ec18ac20ec4",
"blk.11.attn_k.weight": "5327e6a2af82dfff0619a14971f5864a15553c36fead84e1af42c7630f2729c6",
"blk.11.attn_norm.weight": "fec363b3c4a43036d2c635fb8aa9e122dd87ee79811839f2f6cd955be3373e7b",
"blk.11.attn_output.weight": "ccf7b38f18ee8798b8a6a35018e2df3eb3e007de62876befb68025dd66c79763",
"blk.11.attn_q.weight": "da8c4a1c824ffe174e39f126cd72f7ef83c56aff1259d452a1212de80f98f5e9",
"blk.11.attn_v.weight": "d17ae6bb77f03982b55d341eb67acb5969e9ad3da5994b96eafc09793dcfe3a0",
"blk.11.ffn_down.weight": "a6bac521e2791345f22c57205fa1c2f2f687794dfd24d0e98d50ae0d0eb6088a",
"blk.11.ffn_gate.weight": "5ed902c488cb51ba5635f3df08258c5f84f31a679a00211ea5f9d8b824ef6d9d",
"blk.11.ffn_up.weight": "ee9f1437eb890d2cf9df2574afa1cecf20aafdd847cd75b152d7eb74419afd34",
"blk.12.attn_k.weight": "5a069c06e1019b0f889088e67458f7a11ec77fa190ada6069e46211f62219947",
"blk.12.attn_norm.weight": "194d7e5fcc8c49aea62daf1940532419cf3c505afdce6be377286b677db5db8f",
"blk.12.attn_output.weight": "6534995fd4d6fecb55e317add4b1723aba4d825e1e9471d0b08813dfdc247176",
"blk.12.attn_q.weight": "4ab51ca519b5995581fa34f846276feca3b907ef2b51f192f6cc0b3263c3f5a2",
"blk.12.attn_v.weight": "5652ca3fa81ef9a1ac1543d71fc6813f8517f8ec54b25c701f6f98061614830f",
"blk.12.ffn_down.weight": "4b2c263f54c88516b8eb273bb8d9615b01c5c8b484dc70358adb91b50b300edd",
"blk.12.ffn_gate.weight": "8f50c3c3e3e8568991d6c1b0e74b500cf4f208e7700bbb8e87c3f6a6d359b6b5",
"blk.12.ffn_up.weight": "1c1a581fec1fbe959e1427fa513f400100b5e1ee9d83932630be9905fb49c231",
"blk.13.attn_k.weight": "efd7a38c46f08d8376d82974f33c644e3a02220e142d63b1704718699a8a884c",
"blk.13.attn_norm.weight": "d28fa4f1bd75abbd063b0e622e08f579c89cd0c0c5ce63c1952ec9f944f8ee13",
"blk.13.attn_output.weight": "71e0068a639288718bdb70a6cfdefd50bc8b3ec3993347a65129e70001ca5827",
"blk.13.attn_q.weight": "b97077adc92cff07a2e07d80ee38f214ad8713571c69cd5c70ebd43dc501ac87",
"blk.13.attn_v.weight": "79b3e2749ab4b459c81e96e322b215f1e8af645eb346e176c326bd00cf6ed2fd",
"blk.13.ffn_down.weight": "9f8687d11effa1db7cfecf7bec5631734bcf2962aad74a9f519144491e08ec85",
"blk.13.ffn_gate.weight": "7d14dfa0543852e7777fe8fff29ca533744cbcf1ebcf10067e5adfc4eb345e65",
"blk.13.ffn_up.weight": "852b9527b97fdab211ff3f832a660ee1d93ccb56906144c50f01319a6e8ee615",
"blk.14.attn_k.weight": "79e926b20f36f66d58226cb358881f2f68ae7b468787d33cafae5110287a14a0",
"blk.14.attn_norm.weight": "97d481b63deb0df6142c2c6cd23043720c62eb609e390f47a7113751c79974ec",
"blk.14.attn_output.weight": "aa6e94d7176d5c79fbb89b96e5f13ce75702ce3dd23ee52986446da436a6c3d6",
"blk.14.attn_q.weight": "214becb6d1bb460da9fb8ace0f99b9a5afa9edf7aa7acc19606c7401b11d6305",
"blk.14.attn_v.weight": "488b0e6d7f1a7a2ed0972aaa6d10ef9c775ee5373460324efcf5b3e3da9311df",
"blk.14.ffn_down.weight": "29c7ad16cf9542e30996a1a01ab95b844533b28051f04cc7949c371afb796471",
"blk.14.ffn_gate.weight": "b7ef208f2b054803665b377f5a5980c122c026841809cf855c6ba06d1c3a885a",
"blk.14.ffn_up.weight": "76a5cc28100748d79c4398ce7b9176aab4d661548b6293a82f99144812e5b70e",
"blk.15.attn_k.weight": "a6b8f9e98ab878fa7ebc5d080978ebf2d050acc2ab2fa8ea9188eb10e27702c8",
"blk.15.attn_norm.weight": "a26d07a9752d6dccb68e3a8a2a49fd0752cdd0a415e05547819bc37d9ba63d5e",
"blk.15.attn_output.weight": "c63616c69048ccbee801e05be4f56d21fda21aa0cc470f41d57c31b4d9283a4d",
"blk.15.attn_q.weight": "fd595a67bf96c6ba16eb148a9d02fa52fa3c1d33ed10be28a08f851409fd6e64",
"blk.15.attn_v.weight": "1c5c9d33fa07c05d5f4ed0032c6c4aa83d863f0d31c94a66109d239dcd03cea3",
"blk.15.ffn_down.weight": "585ea62ab8aff7d7d212ea5c1a03226fda6b68370c890b776834af70c948dcbc",
"blk.15.ffn_gate.weight": "a13c63f86f879b03a573d5dd2a25cfd1f4dc73e8132e6454ecc23e538b4cdf6f",
"blk.15.ffn_up.weight": "f7112450f57c12fcd511f049e0dc0b541625a107a7901c3261ed9e984299f65c",
"blk.16.attn_k.weight": "2d2c8b11dd71fba6d1c106aa1673c113a5448653cca7eab897c8739212ed5003",
"blk.16.attn_norm.weight": "95c2ec7be9469690e18a9a1779684acb3e9da44b13e263a0da840305646fbf8a",
"blk.16.attn_output.weight": "31a65046e677f54dae654ded4e733479fcc0f7283d83076b7dc7cbcae8528230",
"blk.16.attn_q.weight": "bfc6292b9c6d49b7118d08060242a138182eb182d136ba5dfaf469437c16081d",
"blk.16.attn_v.weight": "68f81d037340217d87c7853ff4d6edfbc46d9e827ee6d5bff7c3f6238e3a95ad",
"blk.16.ffn_down.weight": "bbd6629691950cef4d5113e1c6670e91b216a9b872cb92cee02dfda4d6c4f7b8",
"blk.16.ffn_gate.weight": "63cb56f282b7401ed6c76e5bb6fdf1bf68a64f9af0c82c014209b55bcb5191d0",
"blk.16.ffn_up.weight": "b54f39a2541063cbfb6f713aa81c3b69a04100e999aa2ebbeec195dc382eceec",
"blk.17.attn_k.weight": "3d9ba49799cc56664ec30a002bcad61eb651294212a68c3ddb573eb042aef5a4",
"blk.17.attn_norm.weight": "42ee0db4b9d63257bca0012a30b12737ead1caafeb5ed3d93c8f48ffec4b46de",
"blk.17.attn_output.weight": "a38fd100f05c9041c592bc739e287de0b10d08ef2bda41a879225bdca9002f71",
"blk.17.attn_q.weight": "8a3bee285b0180a9eb35662e449ee4cbe16d992bdd48fb3a94bc4a347728cfa2",
"blk.17.attn_v.weight": "d7f8f1b8b863494ed4392a1656775912e9b264ad36016547b12e832a1d6757d6",
"blk.17.ffn_down.weight": "bb7ee58f61da8630972e25b621996fbe8ec06f4dc9ab1e268ab5b120c526ca28",
"blk.17.ffn_gate.weight": "6b652dbf167fee09a45ebfd78d500ff6548fb2756dbe5343ffec3f7e6207179f",
"blk.17.ffn_up.weight": "3b67f727e55e742715de978fab80457781e7a3762bc48f79d13b45dcb8de664c",
"blk.18.attn_k.weight": "ff7fe57c57b90c6fcc0aefc39ec24593c3a7d1ea1c23770480075a015450e0f5",
"blk.18.attn_norm.weight": "1d40faca082d2633ef0ccf19e121870dd6c7c3e2154607c7f3543fa96e99cb2d",
"blk.18.attn_output.weight": "9adfecaaa397a92db4687efd5fcabfa0daef9e6b0493763b7ff5ebc185c43a6c",
"blk.18.attn_q.weight": "ad1803eb9b291948639277afe981e666b07167eb3fcae903ba5b73bf86d8f50b",
"blk.18.attn_v.weight": "308cf23399adccf27401a4ab60d74dac6fb9d4cd4b9c5940d9145118d1881b34",
"blk.18.ffn_down.weight": "7de4ac9a561fb580619b745687dfd7ca8a69ef70471dee978741b80e9ff7bead",
"blk.18.ffn_gate.weight": "0c66970f696b33bd5ee8f1f2fbcb41fd78fa5ccabdc927e11a4d5a4089f19c69",
"blk.18.ffn_up.weight": "66a42e988e8a1f468fabf976c48e9e4bb045eaac6916ef16555ac101cd674abc",
"blk.19.attn_k.weight": "a928ab50390bacbcebe2e4b66922498134ce22d7b93beaa87d6cf4ab52eb7174",
"blk.19.attn_norm.weight": "b4a02c55b46c2a96aec9c64a254087cf48e6c1d4b6f31782c77a46fc4daebad1",
"blk.19.attn_output.weight": "b768319c641dff1eac5d1f8ceb960c9899c795bf2b24c1d6bf70aa24fda45f77",
"blk.19.attn_q.weight": "79ef3f57d187d3954a26362096e1b6c222d76f537dff73e034d6e9999935b8bc",
"blk.19.attn_v.weight": "ce13d6b13e24fcb2d5bc6a2662e5bd295b31b12db10a6d0307f86cf29b8d5001",
"blk.19.ffn_down.weight": "cf90d7e2137482cfd50934a8223ad774621d08554969da80a9712df5e6227eb0",
"blk.19.ffn_gate.weight": "71ce30150f003b6eeb3bf7464e05b6ae615f135110d8e47f0a47fd973e537c0f",
"blk.19.ffn_up.weight": "7f92aca0cc29866633feec701ec01a85a8ee2fd4e2b9630173a6cffb1d9d50ee",
"blk.20.attn_k.weight": "a2df23159d6fb74ef28e14b61028fe8b00a693a2fc9234a980be74f20b958682",
"blk.20.attn_norm.weight": "c6cd5f1b096fc5efa4eb59ca1c8c4bd28730f3dcedd59a63601663eccc6724ed",
"blk.20.attn_output.weight": "896a8a166d0f006d4b09867ae4345426303cbc3fb13a18d3d4e1bde00f16dbdf",
"blk.20.attn_q.weight": "01eb79588fe61baea0da43e99f4dc5939590e1bafd01e12dadb8326f102bfea2",
"blk.20.attn_v.weight": "bd39630fdd5a7c859ac1addaf53e63faf524c3f32f5f4896d86b6e746b1d5c06",
"blk.20.ffn_down.weight": "0304a5d39957a0e3f031c4bcc4549a135d396c8d97c8d276fd1c823ce86560c2",
"blk.20.ffn_gate.weight": "117b79d595b1dca0c8b37586beaecc4d84411507276212dc286cde7fc36c9bef",
"blk.20.ffn_up.weight": "6e799346db145c125f01783539749d3828fcc451cd4f10c5352f047a47e28714",
"blk.21.attn_k.weight": "1c37e4c0664147e775bb006b226b9553e3421140cd96288ea755f81731ab80ba",
"blk.21.attn_norm.weight": "00ae783a29000ccda5e4bdbff03df0752fb82805dc3f9b987500ebd80714476e",
"blk.21.attn_output.weight": "7588b84f9fb19f15095b5265c60b4a4e7ae74bcc47d4607dfa5d0bfab6f136cb",
"blk.21.attn_q.weight": "a65f1c0dd06d45bb97532d3e932689c1eecfe7359089b39174a96a149335cbc1",
"blk.21.attn_v.weight": "4220b77e7d5e8709b4eef33a679b5dad11f297085ef44c9977f9e54ef08f7a2d",
"blk.21.ffn_down.weight": "b8c082a0530d4b5328e67db0df84c5498f2af956de23c639fa0198ffea853950",
"blk.21.ffn_gate.weight": "cd1b656ee72d00e9835ef667c19ef89a88de261eb8eb7c0e936e0f9ddf83ef9f",
"blk.21.ffn_up.weight": "dc445f73e36ec7a3bd86884186b728f8e0187f32848c3b8b69d4d41f8571bf31",
"blk.22.attn_k.weight": "e37cf0b893ec8b9ee8c78dd139b8d9c45cb997a3bc0c3d93a70ca1c3f6af8859",
"blk.22.attn_norm.weight": "248a27838d3c46cc03a5c312facc84e2e0e2c990ef8401e93da25918497f88d1",
"blk.22.attn_output.weight": "fc191a18f6d18332c66761f7ab28008bfe295dd1f5c8741a2488442f9e00d0f5",
"blk.22.attn_q.weight": "4b193a2ab8bc2b085db18f2bf3eeba26e02b537b2cdd738160c8f14b165d0f5a",
"blk.22.attn_v.weight": "7a60ce5ccac7e045e55ba1e1e85bd2a0f93f8c781daee96c5223665e22f0c666",
"blk.22.ffn_down.weight": "e0a34fb4244e2c7168f3dbaa1904c15d339ec39999cdf27128bbaf619ee0a237",
"blk.22.ffn_gate.weight": "8bac872d4b8549c8812f927efa309f1792b524f33601095fff61b826de5a5615",
"blk.22.ffn_up.weight": "b67fa2b94dd901b6ec64c0853ce8ca2d86fe9cb1cc6d2f15fbbbe0e691c0c648",
"blk.23.attn_k.weight": "2c32e66ad01942b819ac09a197c71579fe66f02226a264fdd72ad1e02c67a27e",
"blk.23.attn_norm.weight": "825fdc94deb439cb93c713eeb077c1052b90ed658d6d464fc4ad3d611e911d48",
"blk.23.attn_output.weight": "95ca6707a95b8750b0c7c5d379d368f0f2e7ebef631954e7d4d8ec0f41f13a3a",
"blk.23.attn_q.weight": "6eccc84faca5fac015d1b26e2854501edcfd292a302228fe14cf99f5eb59a34b",
"blk.23.attn_v.weight": "b343ac3d226040f1033ee049668aa1d89b1774bc18431965682e5dbdce78ccdc",
"blk.23.ffn_down.weight": "9fc599befea8d3b1e342d564a110074f66d2542df406c4b90b6bdc5828fbb2b2",
"blk.23.ffn_gate.weight": "488556c1b0c9f0b20b0c99b4bac2e0f4046b81edb601d7b91e7e5b3bab47d667",
"blk.23.ffn_up.weight": "1088e291d7008dd9c7c2dd6830af686a8a84b724d123a016209bd5156d6898f1",
"blk.24.attn_k.weight": "a923fbe35e61e009a53927d7828818e0592bb737d6a1106c4b0b5a1efc367e07",
"blk.24.attn_norm.weight": "9b51aaaa939cefafdd9b13a7e5b74ac7fa2d603427e55a16a909d6f3f353750a",
"blk.24.attn_output.weight": "1beb2baba56f8409466434b037771248c2f620ec5f53e15f44c271d5a2d9ecf4",
"blk.24.attn_q.weight": "4b0194fe5bfae0c6bf6131dcf8cb6e2b994f6ea10b27cb03574f0f4f8cc0c950",
"blk.24.attn_v.weight": "6ac34b1ab0f66226d85bca1194a7c212cd93d384ecbc8b8395de48aec0970a61",
"blk.24.ffn_down.weight": "5508f74cb732a662c2936b32ac5e90742d172b9f961a747b0e5cba0e5906a89d",
"blk.24.ffn_gate.weight": "095e39b8584403835f9bb1ac33e0e81f54175575e4800273d281b845bff381e7",
"blk.24.ffn_up.weight": "2d43ec21637dda12973de367b0113ee9840b0d815bf6fce042f7c3f270b0b530",
"blk.25.attn_k.weight": "9e2aee029f3d2c7f67dfc7926e72c8228fb978382c8e5a4701bbf82c93801419",
"blk.25.attn_norm.weight": "220cd7164fb4cdbe22d26058e4153b26c27c7b5ce2bec8e95bf2c0ea08d23103",
"blk.25.attn_output.weight": "a17f4a5dc6aa51f03dbd75602d98e9491767c205cdc2c3a5f8667fc54bbf7c64",
"blk.25.attn_q.weight": "f60827496835c440c794bf57ce9780704d10a59d8229886bf75ebb18900ba4ef",
"blk.25.attn_v.weight": "9cac217e9e9f4f4c85f14ee51165a77c580165bd4a34b202389169bbe61a1ced",
"blk.25.ffn_down.weight": "a0f36949b663e80849581dfb71e7babcc73580793bbcb0c80ab26d5a6e000359",
"blk.25.ffn_gate.weight": "df4d1be4d50d6afe5ad3ef0d0e0fac76a33e85c963dea769641d612dd53e7d13",
"blk.25.ffn_up.weight": "992da76be762632e25ebc5ef4d03728eece1b43f7c4e31827df19ca724aea694",
"blk.26.attn_k.weight": "34199ff856ac32a500c754539d070258574192a34ecba87a182897cb59fdff52",
"blk.26.attn_norm.weight": "a8e9dfb2dae5d22b5c0aec5f3675991c0e3c3e6a44153db2579136b73f456e00",
"blk.26.attn_output.weight": "1c4f257ffb0d7db0f11cfb275e38b4af736917b43ad82de1badce3f1d227da4d",
"blk.26.attn_q.weight": "33d55786274c2e718cf61e8fbecf3dfa5ee0c208f0b716d42b061f55459acb3c",
"blk.26.attn_v.weight": "684b636939cd4ffcfec5a6238a0790ffa43d853c95783af9b9e8275e74071a7a",
"blk.26.ffn_down.weight": "89d0bf066db154e6d312b5433aed1714f6a28b40f4c52e3e1530ee07703303c8",
"blk.26.ffn_gate.weight": "393d649bebe5e2940e1b043649f6c860b4b8b9f380f30e9da1744a830f358156",
"blk.26.ffn_up.weight": "179edc85ababd9d8440cc6093eecd1004290aa1cb96434b26ecf7585b6cca17b",
"blk.27.attn_k.weight": "334841445a7f1e14731b08f56eb0b1f0938c63823d28bc6d078c4c5f05b36f19",
"blk.27.attn_norm.weight": "57344471bbda2e9deffdfdb2dd05a07aa47f8761e24de53525588639145bf551",
"blk.27.attn_output.weight": "506126af9ee54b535d49f97e36f630e74834f480329f098d6d62e96246d8d65a",
"blk.27.attn_q.weight": "dd984df1acb4783849e25ba7ae378bfd385cd9efc540fb798cd5bdd873f0118f",
"blk.27.attn_v.weight": "b4b3fe9a4455d34c297ff20a2f537b647cef424741d840a747b265f23d320ac0",
"blk.27.ffn_down.weight": "621fdb185ba0d35ba5476dae73d2c81ec1482a0e878d5bfd5c3b29fe837af013",
"blk.27.ffn_gate.weight": "e4fbab45f2ec506fa374103251a0bdb7baa6f576080bdd796f3e9db92098e08f",
"blk.27.ffn_up.weight": "a0c57e463e988002bbd6a6c6792baa21a65e6f89ae303a2c301951b0ae6e4bbe",
"blk.28.attn_k.weight": "bac36cbd52ec5056841663865e1291ddab4b47ef9a2544dd285d4503bfb0e4a0",
"blk.28.attn_norm.weight": "5774a9df2bbb2e86d1f70179c7b92d81e1f401160148b3328fb64db6646a5425",
"blk.28.attn_output.weight": "e8712622d1569557000c75f26c3f55fad267fd300463c2c2cfe3afbfa1c8f908",
"blk.28.attn_q.weight": "11677751fddee52cc739699c02836f7be54d96038be4240be5d4f53d00161608",
"blk.28.attn_v.weight": "e5ee459b8958d65e1445997b9aa1e90e2f5d17761ebcf5357313119a45322507",
"blk.28.ffn_down.weight": "3934518f9f85292da8475fe38a8edcbfc4e24ac56c351b472d6351f98750871e",
"blk.28.ffn_gate.weight": "6ba735d57e98d0847e487f25ffaa25256deaa8abec76f428cb70bd9774279d83",
"blk.28.ffn_up.weight": "977fae6e1e5353114fc645dd98429464749758765cbc6e6457593d596e57850c",
"blk.29.attn_k.weight": "8122a457307d580ad6f1e0acea09a2f593d97f595ba0d6737f5fea16d2433642",
"blk.29.attn_norm.weight": "d626f721e05aa1202439b01027031d4caf1adace61ed37870a277cb6297c77cc",
"blk.29.attn_output.weight": "7fb7122ab1b6b1e6615ca746897da27bc52c92cb70d3147183cdde61795b72b3",
"blk.29.attn_q.weight": "be43e94ff6b6e391024dc824101efa0ddf4005d5b002ac26cb03765c0c73c2fa",
"blk.29.attn_v.weight": "af93c85ebff908f74f9935b81bde0516ca487c84139868a1ce079c3ae20036b1",
"blk.29.ffn_down.weight": "39dae12340ed3120bd19c495fe0872b559613641e41fde69d02d8631900b84c0",
"blk.29.ffn_gate.weight": "36fd482439840ef197c9f3b8905d86acfcea49bcf018544106ca465d4bf8d5c7",
"blk.29.ffn_up.weight": "5243fbdfdc1e2a1dd84b6210a9869d18a014db9088897e345240cdc99990bd5d",
"blk.30.attn_k.weight": "948f263616bd3788b2b968baafd69b9c5bd1b77578665f096c4b7e247b4cea42",
"blk.30.attn_norm.weight": "e168df981e744874ff303faf2eb470e5f6868c2040ba5f383f6c5148669975e7",
"blk.30.attn_output.weight": "4cf0ccca04b792573b756655a24fc89cfb1f272da8305633f0bc66ef14990b93",
"blk.30.attn_q.weight": "21e07d6cba6c50d65350289258209717174a13c42be57e8141d69712cbaf32c1",
"blk.30.attn_v.weight": "65a8ca29c7237b3182ccf03e2fc94e84f9a53d0e160fb679ab401c853170dd9c",
"blk.30.ffn_down.weight": "8b00500a6d00d84058f6658ee1d6f06fb4fcae2f90d4341792259362923b3c13",
"blk.30.ffn_gate.weight": "5bc0e19ab7a31b50ac2118ad1b36e31055271a322cd8ff661d47c3ac0210703c",
"blk.30.ffn_up.weight": "f37a0561955725bd59ee2d064fa9f4e00a12a1b620b624db3bc3add5330bc321",
"blk.31.attn_k.weight": "9a5663edda227f5d87533897146764f8e8a7481b9e71fae197c39204f8463221",
"blk.31.attn_norm.weight": "060a4f438a1ee5e220b5b5278ad2f5c085a428bf38c515766781815597c87529",
"blk.31.attn_output.weight": "6ada5d3cad9dea4780ffbb43302bb6ccc2f24eddd0fc4f5f84c9ce0fc0c6e5dd",
"blk.31.attn_q.weight": "bb5d08c08603907981ad388d5d8b70fcc9b98034ba264b8474c8890cc0297af0",
"blk.31.attn_v.weight": "e01b4252ea9c6a889c32b21144b441a347464d04536ef4f6572425be55759796",
"blk.31.ffn_down.weight": "8ba4d679c36e93ba65ba03180385ef35ea86b3b7cdf2fded9df59369f1c09630",
"blk.31.ffn_gate.weight": "e5b41dc93645f8b5e8eebae3ada3ea43a18f97ce2654228655170b07b463ccb0",
"blk.31.ffn_up.weight": "25b88cdddc8b547af294ed107d3d1312e90b983cae87936fa6062ecd8ea02539",
"blk.32.attn_k.weight": "4bcf86dc0858c8ca2fbdf6aa76674d43eb698f78979fdc1a38f556a7af1facc4",
"blk.32.attn_norm.weight": "cdcc12f3b8b9773c6722736bfb748a2729230b21478cbcc4104859d3148df815",
"blk.32.attn_output.weight": "d43f1196822995ed89a9365c97054753a8b30ce20b6e273c8edcc42673a1e141",
"blk.32.attn_q.weight": "ebf2972bb3865cbc5be4840113a322089752038344beab2a0122c7cb4fb399b6",
"blk.32.attn_v.weight": "714db81704ff34fa137512903c1013acee7877467473e46600728b9240582eb7",
"blk.32.ffn_down.weight": "2cde3da1258bb170a79d5d3cdfe10c86a71eb34b77da46b74c5ed71e7f4fe274",
"blk.32.ffn_gate.weight": "c7e1ed792532613ff9d4e5834b6536e2e0f47df2303bc0fdaa90aac0c1f4e8db",
"blk.32.ffn_up.weight": "d8d6f13fe66a716e28f79101a29817f0c0d6f99969a6f017d51bafd1a16c600c",
"blk.33.attn_k.weight": "a0a28f6cbca88da00cab2ca37094d9b0503bf9defdae77b91895b911c408cbb6",
"blk.33.attn_norm.weight": "0251200c24cc8445607ace6dc8c5aa0566567997262b7cca53a11ac23cc564b2",
"blk.33.attn_output.weight": "b2423205bdf6a1096d43c44d8d12f1a84fcd4e1bb70fcf6dc8542b8b8a71a13c",
"blk.33.attn_q.weight": "00b425c3ef71065ce5e0234e702bf38143b4952da78a85f52ab2c2e3073d97ab",
"blk.33.attn_v.weight": "035edd2335df816c42c765a5e66b9d9b9e15a822a8dc1863508145499c942c14",
"blk.33.ffn_down.weight": "4894a923a3db75bae4496ba3ce5f28796ad31fe33996a066271fb8654964310e",
"blk.33.ffn_gate.weight": "8f6c819b8bbfbe3357fae89e1ac5a3d58be85b3b04be3bacf7b62775869046ff",
"blk.33.ffn_up.weight": "257c3544b5b544fd5d839665bf5caf107a329b59dbc3751efcaa24ae63c56179",
"blk.34.attn_k.weight": "b6cd8bba892e38dac4a2ebc3ba1bce49e71b967fc436fde30c6d76f54a18935f",
"blk.34.attn_norm.weight": "2b3c8e60a064cba9955752bbbbdd92c71ba5c2f1bd721097bdbe88b5abc68787",
"blk.34.attn_output.weight": "8cc272551c9aaca9db5a660c6927bab94a0243d74a30b2bc165f06bd577714ea",
"blk.34.attn_q.weight": "74b561eb4792484e6a94b58fe2583848c3ae28ff2f1bf3d02939a0cfdfa49990",
"blk.34.attn_v.weight": "dba19e24ff05154dc5a1f55c023729303a583d13d68732ce22ea74d4410dc8f0",
"blk.34.ffn_down.weight": "76eca5dfeb274c35774e0bf9f22ee420ed9085c8e99aa2cd5a236e4918b44c61",
"blk.34.ffn_gate.weight": "9af0862d5fcbc24732846488e653db8242a467765c0cdbc00332b3a40256b4a6",
"blk.34.ffn_up.weight": "2a03126bf73587eaba99ece2066103d12e47bcd4ce30ff6c17b2f383b81d40df",
"blk.35.attn_k.weight": "52513fc0cd4e997a842729af7d21dd09399bce0a339558374738be266d0fa2f0",
"blk.35.attn_norm.weight": "e5281fa911964263ccf1630b14762edbd41d0b9472d6ec695fc600fed4892c35",
"blk.35.attn_output.weight": "b391d6705d5dc6f48326b5fd16573f679edf64109d86fb729a498819676590ca",
"blk.35.attn_q.weight": "d16446921966db9b0e0539626ad22a2511ace780e59379d6a4162d8c5441440b",
"blk.35.attn_v.weight": "9d8cdf23ffdb0c5c74106843390b94b24c9f33ef0eb9998d39f78c73390101ea",
"blk.35.ffn_down.weight": "938eb6301f7bbf162d7dd965682a5ed11d0a4a530c6fedd7e5469ce80012fc17",
"blk.35.ffn_gate.weight": "5ad84f5a0c8edcfea1ecf1a3e3d21d85ceda0c4ad9e3c6ca68885eeff8ed3c2f",
"blk.35.ffn_up.weight": "1c4330d9dc71bf4c98812c34356c51f520f47610a534152aa6d29284b758090d",
"blk.36.attn_k.weight": "ef720655e5ca2465f13db2dfc4732fb4ef2c9d53acde52f514fd4f301e974081",
"blk.36.attn_norm.weight": "88f4b9310b3c8c2644e3029160cd35678c79dfa59280430e03f5c29a6fe84a58",
"blk.36.attn_output.weight": "aec6f915fffd7bb72cd783273e871b4f09605950089d45e72059d1316b6c4b01",
"blk.36.attn_q.weight": "72f9408a2405d42f8db6ce5fcf1d26a3660b6f225fc60e77d0277109cfcb82ed",
"blk.36.attn_v.weight": "0f3b3d851dc44b3893ef53f6cca5b4acc9658bacfe1cc2d13c3d704ddd409b67",
"blk.36.ffn_down.weight": "470aec48ce8c5129a6654d9fd26fcae72776f9fc1429a8bb05818072a876475d",
"blk.36.ffn_gate.weight": "7f5f296d09cf55679767b5d15de3eff489c456782119f25204be4b1647f18dcf",
"blk.36.ffn_up.weight": "b7ef74a1f7ffb4982711d93f1787be3a70edc3d2358d5203c41d8900508037d4",
"blk.37.attn_k.weight": "c4ffa5412e4ff2dcfe1aed991c1f54169fd171a4c7638e4b9f21a1ca64c5e1d6",
"blk.37.attn_norm.weight": "4eb6c888d841cccfacf5b963f8611120f6ff24b84af0b5714fd9ab36dcda422f",
"blk.37.attn_output.weight": "db2a7bbf9682f9f6eea672dae8e150738f1bf74dbc80edc7022017a3f040c8ac",
"blk.37.attn_q.weight": "e38c0462aff139afcbab289189823527e453abc9e541154adde5e7af88cacf0b",
"blk.37.attn_v.weight": "952eb2492ed452a72f96bcc12d4b2affad9dfdf46ee39ce4a5d7b57a5dc301e5",
"blk.37.ffn_down.weight": "25f23a8fbc44febf6dc4848fd7fe03a580e2822bd3b3b5a51f4990826bfe3e4e",
"blk.37.ffn_gate.weight": "707da5eb40118b035305d3262444382351f170a20a537386a70e90c5a83a7817",
"blk.37.ffn_up.weight": "d2d2ba5cfc4ef47338dd7384219e22bf030a5a2209e0354d88f5bbaaafd20e87",
"blk.38.attn_k.weight": "abc4bb189dedf7ce661e79028427623a4f91ac091c2cd60e31b58bc62b1cda71",
"blk.38.attn_norm.weight": "9f4803a7d03fd40fcb83d85f84eb1d5682ea4e5bb084f210c02850675d804c3d",
"blk.38.attn_output.weight": "77cb66007f1a41df7135d0e7f900ceb499c2f667dfc3f1a6ac01a3203bbd3ccf",
"blk.38.attn_q.weight": "d94a8b26cd375bf2bcaa76597e314aa8268ee50a479d00931e5e0e021feadb5d",
"blk.38.attn_v.weight": "660c907888bc5016dc69b7d35fe6f55c7ded697c93be0e2d332a2f17aff88758",
"blk.38.ffn_down.weight": "6f06173bae5b00ffaf88ef383619a8b9c6a8d0d5c6494695d17f6c1de1a68a13",
"blk.38.ffn_gate.weight": "89f99be149d03f116527bfcabe073c50001c874de40fb6e817f6619027f3cd05",
"blk.38.ffn_up.weight": "8d57557c8d5e2d2688b73f01dddf1ce8d5194990cda6358153320aea88aac7f8",
"blk.39.attn_k.weight": "21be09c988b46c8393e6c2ec9230f3b5136eb7607dd1953ba92d0811c2f0dd75",
"blk.39.attn_norm.weight": "ba7c1912dd1c4e2d16917201f62396fd0600e4a451137eaddff255548c209abd",
"blk.39.attn_output.weight": "acfaf4abb3fd27fd899b5563c3877f176b597d8f6cdb2f2fd3f3a0bd4da15ed6",
"blk.39.attn_q.weight": "e8adbc140d4c8f0db2a27ca584c5531d5b1e080555fe627e34d80d0814a92bed",
"blk.39.attn_v.weight": "92f96b0e1f724e73a0f90a76c145654418844c04a6d4b14c05eb5af8a62bf8dc",
"blk.39.ffn_down.weight": "4d9ee7c65fc16fe95d10c47b79ac6a525741947600a64b5fcea5d300a82c50de",
"blk.39.ffn_gate.weight": "7e18507989f39b32191133d2657c2ee3b74f42f070579204d727eb72215793d1",
"blk.39.ffn_up.weight": "22cda752269c9757ba918abede1df95bb0f83a5c772dea13c8deea3d5f2723d9",
"output_norm.weight": "2858cf0e39d32caf52b7861378ace076000241e147f10b9eb21d8a5cd149e3cb"
}

View File

@@ -2,7 +2,7 @@
### Getting Started
* [Quickstart](../README.md#quickstart)
* [Examples](../examples)
* [Examples](./examples.md)
* [Importing models](./import.md)
* [Linux Documentation](./linux.md)
* [Windows Documentation](./windows.md)

View File

@@ -38,7 +38,7 @@ Numeric IDs may be used, however ordering may vary, so UUIDs are more reliable.
You can discover the UUID of your GPUs by running `nvidia-smi -L` If you want to
ignore the GPUs and force CPU usage, use an invalid GPU ID (e.g., "-1")
### Laptop Suspend Resume
### Linux Suspend Resume
On linux, after a suspend/resume cycle, sometimes Ollama will fail to discover
your NVIDIA GPU, and fallback to running on the CPU. You can workaround this

View File

@@ -155,7 +155,6 @@ PARAMETER <parameter> <parametervalue>
| temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 |
| seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) | int | seed 42 |
| stop | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile. | string | stop "AI assistant:" |
| tfs_z | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) | float | tfs_z 1 |
| num_predict | Maximum number of tokens to predict when generating text. (Default: -1, infinite generation) | int | num_predict 42 |
| top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 |
| top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 |

View File

@@ -165,6 +165,8 @@ var (
IntelGPU = Bool("OLLAMA_INTEL_GPU")
// MultiUserCache optimizes prompt caching for multi-user scenarios
MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
// Enable the new Ollama engine
NewRunners = Bool("OLLAMA_NEW_RUNNERS")
)
func String(s string) func() string {
@@ -250,6 +252,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_NEW_RUNNERS": {"OLLAMA_NEW_RUNNERS", NewRunners(), "Enable the new Ollama engine"},
// Informational
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},

3
go.mod
View File

@@ -24,7 +24,6 @@ require (
github.com/nlpodyssey/gopickle v0.3.0
github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
golang.org/x/image v0.22.0
golang.org/x/tools v0.28.0
gonum.org/v1/gonum v0.15.0
)
@@ -72,7 +71,7 @@ require (
golang.org/x/arch v0.8.0 // indirect
golang.org/x/crypto v0.31.0
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa
golang.org/x/net v0.32.0 // indirect
golang.org/x/net v0.25.0 // indirect
golang.org/x/sys v0.28.0
golang.org/x/term v0.27.0
golang.org/x/text v0.21.0

6
go.sum
View File

@@ -257,8 +257,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI=
golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs=
golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -309,8 +309,6 @@ golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapK
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
golang.org/x/tools v0.28.0 h1:WuB6qZ4RPCQo5aP3WdKZS7i595EdWqWR8vqJTlwTVK8=
golang.org/x/tools v0.28.0/go.mod h1:dcIOrVd3mfQKTgrDVQHqCPMWy6lnhfhtX3hLXYVLfRw=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

View File

@@ -1,22 +0,0 @@
//go:build go1.24
package grammar
import "testing"
func BenchmarkFromSchema(b *testing.B) {
for tt := range testCases(b) {
b.Run("", func(b *testing.B) {
s := []byte(tt.schema)
b.ReportAllocs()
for b.Loop() {
_, err := FromSchema(nil, s)
if err != nil {
b.Fatalf("GrammarFromSchema: %v", err)
}
}
})
return
}
}

View File

@@ -1,227 +0,0 @@
package grammar
import (
"bytes"
"encoding/json"
"fmt"
"iter"
"strconv"
"github.com/ollama/ollama/grammar/jsonschema"
)
const jsonTerms = `
# Unicode
#
# Unicode characters can be specified directly in the grammar, for example
# hiragana ::= [ぁ-ゟ], or with escapes: 8-bit (\xXX), 16-bit (\uXXXX) or 32-bit
# (\UXXXXXXXX).
unicode ::= \x{hex}{2} | \u{hex}{4} | \U{hex}{8}
# JSON grammar from RFC 7159
null ::= "null"
object ::= "{" (kv ("," kv)*)? "}"
array ::= "[" (value ("," value)*)? "]"
kv ::= string ":" value
integer ::= "0" | [1-9] [0-9]*
number ::= "-"? integer frac? exp?
frac ::= "." [0-9]+
exp ::= ("e" | "E") ("+" | "-") [0-9]+
string ::= "\"" char* "\""
escape ::= ["/" | "b" | "f" | "n" | "r" | "t" | unicode]
char ::= [^"\\] | escape
space ::= (" " | "\t" | "\n" | "\r")*
hex ::= [0-9] | [a-f] | [A-F]
boolean ::= "true" | "false"
value ::= object | array | string | number | boolean | "null"
# User-defined
`
// FromSchema generates a grammar from a JSON schema.
func FromSchema(buf []byte, jsonSchema []byte) ([]byte, error) {
var s *jsonschema.Schema
if err := json.Unmarshal(jsonSchema, &s); err != nil {
return nil, err
}
var g builder
// "root" is the only rule that is guaranteed to exist, so we start
// with its length for padding, and then adjust it as we go.
g.pad = len("root")
for id := range dependencies("root", s) {
g.pad = max(g.pad, len(id))
}
g.b.WriteString(jsonTerms)
ids := make(map[*jsonschema.Schema]string)
for id, s := range dependencies("root", s) {
ids[s] = id
g.define(id)
if err := fromSchema(&g, ids, s); err != nil {
return nil, err
}
}
g.define("root")
if err := fromSchema(&g, ids, s); err != nil {
return nil, err
}
g.define("") // finalize the last rule
return g.b.Bytes(), nil
}
func fromSchema(g *builder, ids map[*jsonschema.Schema]string, s *jsonschema.Schema) error {
switch typ := s.EffectiveType(); typ {
case "array":
if len(s.PrefixItems) == 0 && s.Items == nil {
g.u("array")
} else {
g.q("[")
for i, s := range s.PrefixItems {
if i > 0 {
g.q(",")
}
g.u(ids[s])
}
if s.Items != nil {
g.u("(")
if len(s.PrefixItems) > 0 {
g.q(",")
}
g.u(ids[s.Items])
g.u(")*")
}
g.q("]")
}
case "object":
if len(s.Properties) == 0 {
g.u("object")
} else {
g.q("{")
for i, p := range s.Properties {
name := ids[p]
if i > 0 {
g.q(",")
}
g.q(p.Name)
g.q(":")
g.u(name)
}
g.q("}")
}
case "number":
buildConstrainedNumber(g, s)
case "string":
if len(s.Enum) == 0 {
g.u("string")
} else {
g.u("(")
for i, e := range s.Enum {
if i > 0 {
g.q("|")
}
g.q(string(e))
}
g.u(")")
}
case "boolean", "value", "null", "integer":
g.u(typ)
default:
return fmt.Errorf("%s: unsupported type %q", s.Name, typ)
}
return nil
}
// dependencies returns a sequence of all child dependencies of the schema in
// post-order.
//
// The first value is the id/pointer to the dependency, and the second value
// is the schema.
func dependencies(id string, s *jsonschema.Schema) iter.Seq2[string, *jsonschema.Schema] {
return func(yield func(string, *jsonschema.Schema) bool) {
for i, p := range s.Properties {
id := fmt.Sprintf("%s_%d", id, i)
for did, d := range dependencies(id, p) {
if !yield(did, d) {
return
}
}
if !yield(id, p) {
return
}
}
for i, p := range s.PrefixItems {
id := fmt.Sprintf("tuple_%d", i)
for did, d := range dependencies(id, p) {
id := fmt.Sprintf("%s_%s", id, did)
if !yield(id, d) {
return
}
}
if !yield(id, p) {
return
}
}
if s.Items != nil {
id := fmt.Sprintf("%s_tuple_%d", id, len(s.PrefixItems))
for did, d := range dependencies(id, s.Items) {
if !yield(did, d) {
return
}
}
if !yield(id, s.Items) {
return
}
}
}
}
type builder struct {
b bytes.Buffer
pad int
rules int
items int
}
// define terminates the current rule, if any, and then either starts a new
// rule or does nothing else if the name is empty.
func (b *builder) define(name string) {
if b.rules > 0 {
b.b.WriteString(";\n")
}
if name == "" {
return
}
fmt.Fprintf(&b.b, "% -*s", b.pad, name)
b.b.WriteString(" ::=")
b.rules++
b.items = 0
}
// quote appends a terminal to the current rule.
func (b *builder) q(s string) {
if b.items > 0 {
b.b.WriteString(" ")
}
b.b.WriteString(" ")
b.b.WriteString(strconv.Quote(s))
}
// u appends a non-terminal to the current rule.
func (b *builder) u(s string) {
if b.items > 0 {
b.b.WriteString(" ")
}
b.b.WriteString(" ")
b.b.WriteString(s)
}
func buildConstrainedNumber(b *builder, s *jsonschema.Schema) {
if s.Minimum == 0 && s.Maximum == 0 {
b.u("TODO")
} else {
b.u("number")
}
}

View File

@@ -1,75 +0,0 @@
package grammar
import (
"bufio"
"cmp"
"iter"
"strings"
"testing"
_ "embed"
"github.com/ollama/ollama/grammar/internal/diff"
)
func TestFromSchema(t *testing.T) {
for tt := range testCases(t) {
t.Run(tt.name, func(t *testing.T) {
g, err := FromSchema(nil, []byte(tt.schema))
if err != nil {
t.Fatalf("FromSchema: %v", err)
}
got := string(g)
got = strings.TrimPrefix(got, jsonTerms)
if got != tt.want {
t.Logf("schema:\n%s", tt.schema)
t.Fatal(string(diff.Diff("got", []byte(got), "want", []byte(tt.want))))
}
})
}
}
type testCase struct {
name string
schema string
want string
}
//go:embed testdata/schemas.txt
var tests string
func testCases(t testing.TB) iter.Seq[testCase] {
t.Helper()
return func(yield func(testCase) bool) {
t.Helper()
sc := bufio.NewScanner(strings.NewReader(tests))
name := ""
for sc.Scan() {
line := strings.TrimSpace(sc.Text())
if line == "" {
name = ""
continue
}
if line[0] == '#' {
name = cmp.Or(name, strings.TrimSpace(line[1:]))
continue
}
s := sc.Text()
g := ""
for sc.Scan() {
line = strings.TrimSpace(sc.Text())
if line == "" || line[0] == '#' {
break
}
g += sc.Text() + "\n"
}
if !yield(testCase{name, s, g}) {
return
}
name = strings.TrimSpace(strings.TrimPrefix(line, "#"))
}
if err := sc.Err(); err != nil {
t.Fatalf("error reading tests: %v", err)
}
}
}

View File

@@ -1,261 +0,0 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package diff
import (
"bytes"
"fmt"
"sort"
"strings"
)
// A pair is a pair of values tracked for both the x and y side of a diff.
// It is typically a pair of line indexes.
type pair struct{ x, y int }
// Diff returns an anchored diff of the two texts old and new
// in the “unified diff” format. If old and new are identical,
// Diff returns a nil slice (no output).
//
// Unix diff implementations typically look for a diff with
// the smallest number of lines inserted and removed,
// which can in the worst case take time quadratic in the
// number of lines in the texts. As a result, many implementations
// either can be made to run for a long time or cut off the search
// after a predetermined amount of work.
//
// In contrast, this implementation looks for a diff with the
// smallest number of “unique” lines inserted and removed,
// where unique means a line that appears just once in both old and new.
// We call this an “anchored diff” because the unique lines anchor
// the chosen matching regions. An anchored diff is usually clearer
// than a standard diff, because the algorithm does not try to
// reuse unrelated blank lines or closing braces.
// The algorithm also guarantees to run in O(n log n) time
// instead of the standard O(n²) time.
//
// Some systems call this approach a “patience diff,” named for
// the “patience sorting” algorithm, itself named for a solitaire card game.
// We avoid that name for two reasons. First, the name has been used
// for a few different variants of the algorithm, so it is imprecise.
// Second, the name is frequently interpreted as meaning that you have
// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm,
// when in fact the algorithm is faster than the standard one.
func Diff(oldName string, old []byte, newName string, new []byte) []byte {
if bytes.Equal(old, new) {
return nil
}
x := lines(old)
y := lines(new)
// Print diff header.
var out bytes.Buffer
fmt.Fprintf(&out, "diff %s %s\n", oldName, newName)
fmt.Fprintf(&out, "--- %s\n", oldName)
fmt.Fprintf(&out, "+++ %s\n", newName)
// Loop over matches to consider,
// expanding each match to include surrounding lines,
// and then printing diff chunks.
// To avoid setup/teardown cases outside the loop,
// tgs returns a leading {0,0} and trailing {len(x), len(y)} pair
// in the sequence of matches.
var (
done pair // printed up to x[:done.x] and y[:done.y]
chunk pair // start lines of current chunk
count pair // number of lines from each side in current chunk
ctext []string // lines for current chunk
)
for _, m := range tgs(x, y) {
if m.x < done.x {
// Already handled scanning forward from earlier match.
continue
}
// Expand matching lines as far as possible,
// establishing that x[start.x:end.x] == y[start.y:end.y].
// Note that on the first (or last) iteration we may (or definitely do)
// have an empty match: start.x==end.x and start.y==end.y.
start := m
for start.x > done.x && start.y > done.y && x[start.x-1] == y[start.y-1] {
start.x--
start.y--
}
end := m
for end.x < len(x) && end.y < len(y) && x[end.x] == y[end.y] {
end.x++
end.y++
}
// Emit the mismatched lines before start into this chunk.
// (No effect on first sentinel iteration, when start = {0,0}.)
for _, s := range x[done.x:start.x] {
ctext = append(ctext, "-"+s)
count.x++
}
for _, s := range y[done.y:start.y] {
ctext = append(ctext, "+"+s)
count.y++
}
// If we're not at EOF and have too few common lines,
// the chunk includes all the common lines and continues.
const C = 3 // number of context lines
if (end.x < len(x) || end.y < len(y)) &&
(end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) {
for _, s := range x[start.x:end.x] {
ctext = append(ctext, " "+s)
count.x++
count.y++
}
done = end
continue
}
// End chunk with common lines for context.
if len(ctext) > 0 {
n := end.x - start.x
if n > C {
n = C
}
for _, s := range x[start.x : start.x+n] {
ctext = append(ctext, " "+s)
count.x++
count.y++
}
done = pair{start.x + n, start.y + n}
// Format and emit chunk.
// Convert line numbers to 1-indexed.
// Special case: empty file shows up as 0,0 not 1,0.
if count.x > 0 {
chunk.x++
}
if count.y > 0 {
chunk.y++
}
fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y)
for _, s := range ctext {
out.WriteString(s)
}
count.x = 0
count.y = 0
ctext = ctext[:0]
}
// If we reached EOF, we're done.
if end.x >= len(x) && end.y >= len(y) {
break
}
// Otherwise start a new chunk.
chunk = pair{end.x - C, end.y - C}
for _, s := range x[chunk.x:end.x] {
ctext = append(ctext, " "+s)
count.x++
count.y++
}
done = end
}
return out.Bytes()
}
// lines returns the lines in the file x, including newlines.
// If the file does not end in a newline, one is supplied
// along with a warning about the missing newline.
func lines(x []byte) []string {
l := strings.SplitAfter(string(x), "\n")
if l[len(l)-1] == "" {
l = l[:len(l)-1]
} else {
// Treat last line as having a message about the missing newline attached,
// using the same text as BSD/GNU diff (including the leading backslash).
l[len(l)-1] += "\n\\ No newline at end of file\n"
}
return l
}
// tgs returns the pairs of indexes of the longest common subsequence
// of unique lines in x and y, where a unique line is one that appears
// once in x and once in y.
//
// The longest common subsequence algorithm is as described in
// Thomas G. Szymanski, “A Special Case of the Maximal Common
// Subsequence Problem,” Princeton TR #170 (January 1975),
// available at https://research.swtch.com/tgs170.pdf.
func tgs(x, y []string) []pair {
// Count the number of times each string appears in a and b.
// We only care about 0, 1, many, counted as 0, -1, -2
// for the x side and 0, -4, -8 for the y side.
// Using negative numbers now lets us distinguish positive line numbers later.
m := make(map[string]int)
for _, s := range x {
if c := m[s]; c > -2 {
m[s] = c - 1
}
}
for _, s := range y {
if c := m[s]; c > -8 {
m[s] = c - 4
}
}
// Now unique strings can be identified by m[s] = -1+-4.
//
// Gather the indexes of those strings in x and y, building:
// xi[i] = increasing indexes of unique strings in x.
// yi[i] = increasing indexes of unique strings in y.
// inv[i] = index j such that x[xi[i]] = y[yi[j]].
var xi, yi, inv []int
for i, s := range y {
if m[s] == -1+-4 {
m[s] = len(yi)
yi = append(yi, i)
}
}
for i, s := range x {
if j, ok := m[s]; ok && j >= 0 {
xi = append(xi, i)
inv = append(inv, j)
}
}
// Apply Algorithm A from Szymanski's paper.
// In those terms, A = J = inv and B = [0, n).
// We add sentinel pairs {0,0}, and {len(x),len(y)}
// to the returned sequence, to help the processing loop.
J := inv
n := len(xi)
T := make([]int, n)
L := make([]int, n)
for i := range T {
T[i] = n + 1
}
for i := range n {
k := sort.Search(n, func(k int) bool {
return T[k] >= J[i]
})
T[k] = J[i]
L[i] = k + 1
}
k := 0
for _, v := range L {
if k < v {
k = v
}
}
seq := make([]pair, 2+k)
seq[1+k] = pair{len(x), len(y)} // sentinel at end
lastj := n
for i := n - 1; i >= 0; i-- {
if L[i] == k && J[i] < lastj {
seq[k] = pair{xi[i], yi[J[i]]}
k--
}
}
seq[0] = pair{0, 0} // sentinel at start
return seq
}

View File

@@ -1,44 +0,0 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package diff
import (
"bytes"
"path/filepath"
"testing"
"golang.org/x/tools/txtar"
)
func clean(text []byte) []byte {
text = bytes.ReplaceAll(text, []byte("$\n"), []byte("\n"))
text = bytes.TrimSuffix(text, []byte("^D\n"))
return text
}
func Test(t *testing.T) {
files, _ := filepath.Glob("testdata/*.txt")
if len(files) == 0 {
t.Fatalf("no testdata")
}
for _, file := range files {
t.Run(filepath.Base(file), func(t *testing.T) {
a, err := txtar.ParseFile(file)
if err != nil {
t.Fatal(err)
}
if len(a.Files) != 3 || a.Files[2].Name != "diff" {
t.Fatalf("%s: want three files, third named \"diff\"", file)
}
diffs := Diff(a.Files[0].Name, clean(a.Files[0].Data), a.Files[1].Name, clean(a.Files[1].Data))
want := clean(a.Files[2].Data)
if !bytes.Equal(diffs, want) {
t.Fatalf("%s: have:\n%s\nwant:\n%s\n%s", file,
diffs, want, Diff("have", diffs, "want", want))
}
})
}
}

View File

@@ -1,13 +0,0 @@
-- old --
-- new --
a
b
c
-- diff --
diff old new
--- old
+++ new
@@ -0,0 +1,3 @@
+a
+b
+c

View File

@@ -1,13 +0,0 @@
-- old --
a
b
c
-- new --
-- diff --
diff old new
--- old
+++ new
@@ -1,3 +0,0 @@
-a
-b
-c

View File

@@ -1,35 +0,0 @@
Example from Hunt and McIlroy, “An Algorithm for Differential File Comparison.”
https://www.cs.dartmouth.edu/~doug/diff.pdf
-- old --
a
b
c
d
e
f
g
-- new --
w
a
b
x
y
z
e
-- diff --
diff old new
--- old
+++ new
@@ -1,7 +1,7 @@
+w
a
b
-c
-d
+x
+y
+z
e
-f
-g

View File

@@ -1,40 +0,0 @@
-- old --
a
b
c
d
e
f
-- new --
a
B
C
d
e
f
-- diff --
diff old new
--- old
+++ new
@@ -1,8 +1,8 @@
a
$
-b
-
-c
+B
+
+C
$
d
$

View File

@@ -1,38 +0,0 @@
-- old --
1
2
3
4
5
6
7
eight
nine
ten
eleven
-- new --
1
2
3
4
5
6
7
8
9
10
-- diff --
diff old new
--- old
+++ new
@@ -5,7 +5,6 @@
5
6
7
-eight
-nine
-ten
-eleven
+8
+9
+10

View File

@@ -1,9 +0,0 @@
-- old --
a
b
c^D
-- new --
a
b
c^D
-- diff --

View File

@@ -1,18 +0,0 @@
-- old --
a
b
c
-- new --
a
b
c^D
-- diff --
diff old new
--- old
+++ new
@@ -1,3 +1,3 @@
a
b
-c
+c
\ No newline at end of file

View File

@@ -1,18 +0,0 @@
-- old --
a
b
c^D
-- new --
a
b
c
-- diff --
diff old new
--- old
+++ new
@@ -1,3 +1,3 @@
a
b
-c
\ No newline at end of file
+c

View File

@@ -1,62 +0,0 @@
-- old --
1
2
3
4
5
6
7
8
9
10
11
12
13
14
14½
15
16
17
18
19
20
-- new --
1
2
3
4
5
6
8
9
10
11
12
13
14
17
18
19
20
-- diff --
diff old new
--- old
+++ new
@@ -4,7 +4,6 @@
4
5
6
-7
8
9
10
@@ -12,9 +11,6 @@
12
13
14
-14½
-15
-16
17
18
19

View File

@@ -1,5 +0,0 @@
-- old --
hello world
-- new --
hello world
-- diff --

View File

@@ -1,34 +0,0 @@
-- old --
e
pi
4
5
6
7
8
9
10
-- new --
1
2
3
4
5
6
7
8
9
10
-- diff --
diff old new
--- old
+++ new
@@ -1,5 +1,6 @@
-e
-pi
+1
+2
+3
4
5
6

View File

@@ -1,40 +0,0 @@
Another example from Hunt and McIlroy,
“An Algorithm for Differential File Comparison.”
https://www.cs.dartmouth.edu/~doug/diff.pdf
Anchored diff gives up on finding anything,
since there are no unique lines.
-- old --
a
b
c
a
b
b
a
-- new --
c
a
b
a
b
c
-- diff --
diff old new
--- old
+++ new
@@ -1,7 +1,6 @@
-a
-b
-c
-a
-b
-b
-a
+c
+a
+b
+a
+b
+c

View File

@@ -1,171 +0,0 @@
package jsonschema
import (
"bytes"
"encoding/json"
"errors"
)
// Schema holds a JSON schema.
type Schema struct {
// Name is the name of the property. For the parent/root property, this
// is "root". For child properties, this is the name of the property.
Name string `json:"-"`
// Type is the type of the property.
//
// TODO: Union types (e.g. make this a []string).
Type string
// PrefixItems is a list of schemas for each item in a tuple. By
// default, the tuple is "closed." unless Items is set to true or a
// valid Schema.
PrefixItems []*Schema
// Items is the schema for each item in a list.
//
// If it is missing, or its JSON value is "null" or "false", it is nil.
// If the JSON value is "true", it is set to the empty Schema. If the
// JSON value is an object, it will be decoded as a Schema.
Items *Schema
// MinItems specifies the minimum number of items allowed in a list.
MinItems int
// MaxItems specifies the maximum number of items allowed in a list.
MaxItems int
// Properties is the schema for each property of an object.
Properties []*Schema
// Format is the format of the property. This is used to validate the
// property against a specific format.
//
// It is the callers responsibility to validate the property against
// the format.
Format string
// Minimum specifies the minimum value for numeric properties.
Minimum float64
// Maximum specifies the maximum value for numeric properties.
Maximum float64
// Enum is a list of valid values for the property.
Enum []json.RawMessage
}
func (s *Schema) UnmarshalJSON(data []byte) error {
type S Schema
w := struct {
Properties props
Items items
*S
}{
S: (*S)(s),
}
if err := json.Unmarshal(data, &w); err != nil {
return err
}
if w.Items.set {
s.Items = &w.Items.Schema
}
s.Properties = w.Properties
return nil
}
type items struct {
Schema
set bool
}
func (s *items) UnmarshalJSON(data []byte) error {
switch b := data[0]; b {
case 't':
*s = items{set: true}
case '{':
type I items
if err := json.Unmarshal(data, (*I)(s)); err != nil {
return err
}
s.set = true
case 'n', 'f':
default:
return errors.New("invalid Items")
}
return nil
}
// EffectiveType returns the effective type of the schema. If the Type field is
// not empty, it is returned; otherwise:
//
// - If the schema has both Properties and Items, it returns an empty string.
// - If the schema has Properties, it returns "object".
// - If the schema has Items, it returns "array".
// - If the schema has neither Properties nor Items, it returns "value".
//
// The returned string is never empty.
func (d *Schema) EffectiveType() string {
if d.Type == "" {
if len(d.Properties) > 0 {
return "object"
}
if len(d.PrefixItems) > 0 || d.Items != nil {
return "array"
}
return "value"
}
return d.Type
}
// props is an ordered list of properties. The order of the properties
// is the order in which they were defined in the schema.
type props []*Schema
var _ json.Unmarshaler = (*props)(nil)
func (v *props) UnmarshalJSON(data []byte) error {
if len(data) == 0 {
return nil
}
if data[0] != '{' {
return errors.New("expected object")
}
d := json.NewDecoder(bytes.NewReader(data))
// TODO(bmizerany): Consider DisallowUnknownFields. Currently, we, like
// llama.cpp, ignore unknown fields, which could be lead to unexpected
// behavior for clients of this package, since they may not be aware
// that "additionalFields", "itemsPrefix", etc, are being ignored.
//
// For now, just do what llama.cpp does.
t, err := d.Token()
if err != nil {
return err
}
if t != json.Delim('{') {
return errors.New("expected object")
}
for d.More() {
// Use the first token (map key) as the property name, then
// decode the rest of the object fields into a Schema and
// append.
t, err := d.Token()
if err != nil {
return err
}
if t == json.Delim('}') {
return nil
}
s := &Schema{
Name: t.(string),
}
if err := d.Decode(s); err != nil {
return err
}
*v = append(*v, s)
}
return nil
}

View File

@@ -1,104 +0,0 @@
package jsonschema
import (
"encoding/json"
"reflect"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
)
const testSchemaBasic = `
{
"properties": {
"tupleClosedEmpty": { "prefixItems": [] },
"tupleClosedMissing": { "prefixItems": [{}] },
"tupleClosedNull": { "prefixItems": [{}], "items": null },
"tupleClosedFalse": { "prefixItems": [{}], "items": false },
"tupleOpenTrue": { "prefixItems": [{}], "items": true },
"tupleOpenEmpty": { "prefixItems": [{}], "items": {} },
"tupleOpenTyped": { "prefixItems": [{}], "items": {"type": "boolean"} },
"tupleOpenMax": { "prefixItems": [{}], "items": true, "maxItems": 3},
"array": { "items": {"type": "number"} },
"null": { "type": "null" },
"string": { "type": "string" },
"boolean": { "type": "boolean" }
}
}
`
func TestSchemaUnmarshal(t *testing.T) {
var got *Schema
if err := json.Unmarshal([]byte(testSchemaBasic), &got); err != nil {
t.Fatalf("Unmarshal: %v", err)
}
want := &Schema{
Properties: []*Schema{
{Name: "tupleClosedEmpty", PrefixItems: []*Schema{}, Items: nil},
{Name: "tupleClosedMissing", PrefixItems: []*Schema{{}}, Items: nil},
{Name: "tupleClosedNull", PrefixItems: []*Schema{{}}, Items: nil},
{Name: "tupleClosedFalse", PrefixItems: []*Schema{{}}, Items: nil},
{Name: "tupleOpenTrue", PrefixItems: []*Schema{{}}, Items: &Schema{}},
{Name: "tupleOpenEmpty", PrefixItems: []*Schema{{}}, Items: &Schema{}},
{Name: "tupleOpenTyped", PrefixItems: []*Schema{{}}, Items: &Schema{Type: "boolean"}},
{Name: "tupleOpenMax", PrefixItems: []*Schema{{}}, Items: &Schema{}, MaxItems: 3},
{Name: "array", Items: &Schema{Type: "number"}},
{Name: "null", Type: "null"},
{Name: "string", Type: "string"},
{Name: "boolean", Type: "boolean"},
},
}
if diff := cmp.Diff(want, got); diff != "" {
t.Errorf("(-want, +got)\n%s", diff)
}
}
func TestEffectiveType(t *testing.T) {
const schema = `
{"properties": {
"o": {"type": "object"},
"a": {"type": "array"},
"n": {"type": "number"},
"s": {"type": "string"},
"z": {"type": "null"},
"b": {"type": "boolean"},
"t0": {"prefixItems": [{}], "items": {"type": "number"}},
"t1": {"items": {"type": "number"}, "maxItems": 3},
"v": {"maxItems": 3}
}}
`
var s *Schema
if err := json.Unmarshal([]byte(schema), &s); err != nil {
t.Fatalf("json.Unmarshal: %v", err)
}
var got []string
for _, p := range s.Properties {
got = append(got, p.EffectiveType())
}
want := strings.Fields(`
object
array
number
string
null
boolean
array
array
value
`)
if !reflect.DeepEqual(want, got) {
t.Errorf("\ngot:\n\t%v\nwant:\n\t%v", got, want)
}
}

View File

@@ -1,76 +0,0 @@
# This file holds tests for JSON schema to EBNF grammar conversions.
#
# The format is a JSON schema, followed by the expected EBNF grammar. Each test
# MAY be preceded by a comment that describes the test (e.g. the test name), followed by
# the JSON schema and the expected EBNF grammar. If no comment is present, the test
# name the tests number in the file (e.g. "#0", "#1", etc.)
#
# Blank lines signify the end or start of a new test. Comments can be added
# anywhere in the file, but they must be preceded by a '#' character and start at
# the beginning of the line.
# default
{}
root ::= value;
{"properties": {}}
root ::= value;
# array
{"properties": {"a": {"type": "array", "items": {"type": "string"}}}}
root_0_tuple_0 ::= string;
root_0 ::= "[" ( root_0_tuple_0 )* "]";
root ::= "{" "a" ":" root_0 "}";
# array with nested array
{"type": "array", "items": {"type": "array", "items": {"type": "string"}}}
root_tuple_0_tuple_0 ::= string;
root_tuple_0 ::= "[" ( root_tuple_0_tuple_0 )* "]";
root ::= "[" ( root_tuple_0 )* "]";
# object
{"properties": {"e": {}}}
root_0 ::= value;
root ::= "{" "e" ":" root_0 "}";
# object with nested object
{"properties": {"o": {"type": "object", "properties": {"e": {}}}}}
root_0_0 ::= value;
root_0 ::= "{" "e" ":" root_0_0 "}";
root ::= "{" "o" ":" root_0 "}";
# boolean
{"type": "boolean"}
root ::= boolean;
# number
{"properties": {"n": {"type": "number", "minimum": 123, "maximum": 4567}}}
root_0 ::= number;
root ::= "{" "n" ":" root_0 "}";
# string
{"type": "string"}
root ::= string;
# string with enum
{"type": "string", "enum": ["a", "b", "c"]}
root ::= ( "\"a\"" "|" "\"b\"" "|" "\"c\"" );
# spaces in key
{"properties": {"a b": {}}}
root_0 ::= value;
root ::= "{" "a b" ":" root_0 "}";
# issue7978
{ "type": "object", "properties": { "steps": { "type": "array", "items": { "type": "object", "properties": { "explanation": { "type": "string" }, "output": { "type": "string" } }, "required": [ "explanation", "output" ], "additionalProperties": false } }, "final_answer": { "type": "string" } }, "required": [ "steps", "final_answer" ], "additionalProperties": false }
root_0_tuple_0_0 ::= string;
root_0_tuple_0_1 ::= string;
root_0_tuple_0 ::= "{" "explanation" ":" root_0_tuple_0_0 "," "output" ":" root_0_tuple_0_1 "}";
root_0 ::= "[" ( root_0_tuple_0 )* "]";
root_1 ::= string;
root ::= "{" "steps" ":" root_0 "," "final_answer" ":" root_1 "}";
# !! # special characters in key
# !! {"properties": {"a!b": {}}}
# !! !invalid character '!' in key
# !!

View File

@@ -3,5 +3,6 @@ package llama
// #cgo CXXFLAGS: -std=c++17
// #cgo CPPFLAGS: -I${SRCDIR}/../include
// #cgo CPPFLAGS: -I${SRCDIR}/../../../ml/backend/ggml/ggml/include
// #cgo windows CPPFLAGS: -D_WIN32_WINNT=0x0602
import "C"
import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src"

View File

@@ -0,0 +1,29 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 14 Jan 2025 15:59:04 -0800
Subject: [PATCH] add phony target ggml-cpu for all cpu variants
---
ggml/src/CMakeLists.txt | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 84101c32..72b488dd 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
+ add_dependencies(ggml-cpu ggml-cpu-${tag_name})
endfunction()
ggml_add_backend(CPU)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
+ add_custom_target(ggml-cpu)
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)

View File

@@ -29,7 +29,6 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/grammar"
"github.com/ollama/ollama/llama"
)
@@ -253,6 +252,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
finalParams := []string{"runner"}
if envconfig.NewRunners() {
finalParams = append(finalParams, "--new-runner")
}
finalParams = append(finalParams, params...)
finalParams = append(finalParams, "--port", strconv.Itoa(port))
@@ -661,9 +663,9 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
// User provided a JSON schema
g, err := grammar.FromSchema(nil, req.Format)
if err != nil {
return fmt.Errorf("invalid JSON schema in format: %w", err)
g := llama.SchemaToGrammar(req.Format)
if g == nil {
return fmt.Errorf("invalid JSON schema in format")
}
request["grammar"] = string(g)
}
@@ -683,6 +685,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
req.Options.NumPredict = 10 * s.options.NumCtx
}
// Make sure the server is ready
status, err := s.getServerStatusRetry(ctx)
if err != nil {

View File

@@ -43,12 +43,13 @@ func NewBackend(f *os.File) (Backend, error) {
}
type Context interface {
Zeros(dtype DType, shape ...int) Tensor
Zeros(dtype DType, shape ...int64) Tensor
FromFloatSlice(s []float32, shape ...int) (Tensor, error)
FromIntSlice(s []int32, shape ...int) (Tensor, error)
Forward(Tensor)
Compute(Tensor) Tensor
MaxTensors() int
Close() error
}

View File

@@ -23,7 +23,7 @@ import (
"github.com/ollama/ollama/ml"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/ml/backend/ggml/ggml/src"
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
)
type device struct {
@@ -198,10 +198,9 @@ func (b *Backend) Get(name string) ml.Tensor {
func (b *Backend) NewContext() ml.Context {
nodes := max(8192, len(b.meta.Tensors().Items())*5)
bts := make([]byte, C.size_t(nodes)*C.ggml_tensor_overhead()+C.ggml_graph_overhead_custom(C.size_t(nodes), false))
c := C.ggml_init(C.struct_ggml_init_params{
mem_buffer: unsafe.Pointer(&bts[0]),
mem_size: C.size_t(len(bts)),
mem_buffer: nil,
mem_size: C.size_t(nodes)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(nodes), false),
no_alloc: true,
})
@@ -244,17 +243,23 @@ func (c *Context) Forward(t ml.Tensor) {
}
func (c *Context) Compute(t ml.Tensor) ml.Tensor {
c.Forward(t)
C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
backend := C.ggml_backend_sched_get_tensor_backend(c.sched, t.(*Tensor).t)
if t != nil && C.ggml_nbytes(t.(*Tensor).t) != 0 {
backend := C.ggml_backend_sched_get_tensor_backend(c.sched, t.(*Tensor).t)
t.(*Tensor).data = make([]byte, C.ggml_nbytes(t.(*Tensor).t))
C.ggml_backend_tensor_get_async(backend, t.(*Tensor).t, unsafe.Pointer(&t.(*Tensor).data[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}
t.(*Tensor).data = make([]byte, C.ggml_nbytes(t.(*Tensor).t))
C.ggml_backend_tensor_get_async(backend, t.(*Tensor).t, unsafe.Pointer(&t.(*Tensor).data[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
return t
}
func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
func (c *Context) MaxTensors() int {
return c.nodes
}
func (c Context) Zeros(dtype ml.DType, shape ...int64) ml.Tensor {
if len(shape) < 1 || len(shape) > 4 {
panic("unsupported number of dimensions")
}
@@ -283,6 +288,13 @@ func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype uint32) (ml.Tensor, error) {
n := len(s)
if n == 0 {
shape := 0
t := C.ggml_new_tensor(ctx.ctx, dtype, 1, (*C.int64_t)(unsafe.Pointer(&shape)))
return &Tensor{t: t}, nil
}
for _, v := range shape {
n /= v
}

View File

@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
add_dependencies(ggml-cpu ggml-cpu-${tag_name})
endfunction()
ggml_add_backend(CPU)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
add_custom_target(ggml-cpu)
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)

View File

@@ -3,6 +3,7 @@ package ggml
// #cgo CXXFLAGS: -std=c++17
// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_CPU
// #cgo CPPFLAGS: -I${SRCDIR}/../include -I${SRCDIR}/ggml-cpu
// #cgo windows LDFLAGS: -lmsvcrt -static -static-libgcc -static-libstdc++
// #include <stdlib.h>
// #include "ggml-backend.h"
// extern void sink(int level, char *text, void *user_data);

View File

@@ -1,212 +0,0 @@
package main
import (
"errors"
"flag"
"fmt"
"image"
"io"
"log/slog"
"os"
"path/filepath"
"strings"
"time"
"github.com/ollama/ollama/cache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
_ "github.com/ollama/ollama/model/llama"
_ "github.com/ollama/ollama/model/mllama"
"github.com/ollama/ollama/sample"
)
var args struct {
n int
debug bool
image string
cache bool
}
func temp() error {
// start := time.Now()
flag.IntVar(&args.n, "n", 10, "number of samples")
flag.BoolVar(&args.debug, "debug", false, "enable debug logging")
flag.StringVar(&args.image, "image", "", "path to image file")
flag.BoolVar(&args.cache, "cache", false, "enable KV cache")
flag.Parse()
var prompt string
if n := len(flag.Args()); n == 1 {
bts, err := io.ReadAll(os.Stdin)
if err != nil {
return err
}
prompt = string(bts)
} else if n > 1 {
prompt = strings.Join(flag.Args()[1:], " ")
} else {
return fmt.Errorf("usage: %s path/to/file <prompt\n", filepath.Base(os.Args[0]))
}
level := slog.LevelInfo
if args.debug {
level = slog.LevelDebug
}
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: level,
AddSource: true,
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
if attr.Key == slog.SourceKey {
source := attr.Value.Any().(*slog.Source)
source.File = filepath.Base(source.File)
}
return attr
},
})))
m, err := model.New(flag.Arg(0))
if err != nil {
return err
}
inputIDs, err := m.(model.TextProcessor).Encode(prompt)
if err != nil {
return err
}
var opts []model.OptionsFunc
if args.cache {
opts = append(opts, model.WithCache(&cache.Simple{
Capacity: 2048,
DType: ml.DTypeF32,
}))
}
if args.image != "" {
if err := func() error {
f, err := os.Open(args.image)
if err != nil {
return err
}
defer f.Close()
img, _, err := image.Decode(f)
if err != nil {
return err
}
opts = append(opts, model.WithImage(img))
return nil
}(); err != nil {
return err
}
}
// Schema for a list of friends with their info
// Maps to JSON like:
// {
// "name": "string",
// "age": integer,
// "is_available": boolean
// }
schema := &sample.Schema{
Name: "root",
Type: "object",
Properties: []*sample.Schema{
{Name: "name", Type: "string"},
{Name: "age", Type: "integer"},
{Name: "is_available", Type: "boolean"},
},
}
// fmt.Println("schema", schema)
// schema = nil
jsonTransform, err := sample.NewJSONSampler(m.(model.TextProcessor), schema)
if err != nil {
return err
}
transforms := []sample.Transform{
jsonTransform,
}
var offset int
var stringBuffer string
// var ttft time.Duration
var totalSamplingTime time.Duration
count := 0
for range args.n {
logits, err := model.Forward(m, append(opts, model.WithInputIDs(inputIDs), model.WithOffset(offset))...)
if err != nil {
return err
}
samplingStart := time.Now()
sampler := sample.Greedy()
sampledIdx, err := sampler.Sample(logits.Floats(), transforms...)
if err != nil {
return err
}
samplingTime := time.Since(samplingStart)
totalSamplingTime += samplingTime
// fmt.Println("sampling time", samplingTime)
// fmt.Printf("Sample time: %vms\n", finishTime.Sub(sampleTime).Milliseconds())
var outputIDs []int32
if !m.(model.TextProcessor).Is(uint32(sampledIdx), model.SpecialEOS) {
outputIDs = append(outputIDs, int32(sampledIdx))
}
if len(outputIDs) == 0 {
break
}
s, err := m.(model.TextProcessor).Decode(outputIDs)
if errors.Is(err, io.EOF) {
break
} else if err != nil {
return err
}
// if ttft == 0 {
// ttft = time.Since(start)
// fmt.Printf("Time to first token: %vms\n", ttft.Milliseconds())
// }
// fmt.Printf("--- token: %q\n", s)
// fmt.Printf("--- outputIDs: %v\n", outputIDs)
stringBuffer += s
count++
fmt.Println("--- stringBuffer", stringBuffer)
outputIDs, err = jsonTransform.UpdateState(outputIDs)
if err != nil {
return err
}
// can do fun shifting stuff here if needed
inputIDs = append(inputIDs, outputIDs...)
if args.cache {
offset = len(inputIDs) - 1
}
}
fmt.Println("\n------ Output: ------")
fmt.Println(stringBuffer)
fmt.Println("--------------------")
fmt.Println("sample average time", totalSamplingTime/time.Duration(count))
return nil
}
func main() {
if err := temp(); err != nil {
fmt.Println("err", err)
os.Exit(1)
}
}

View File

@@ -1 +0,0 @@
package main

View File

@@ -3,6 +3,7 @@ package llama
import (
"math"
"github.com/ollama/ollama/cache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
@@ -59,7 +60,7 @@ type SelfAttention struct {
Output *nn.Linear `gguf:"attn_output"`
}
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache cache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
@@ -74,7 +75,8 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k, v = cache.Put(ctx, k, v, cache.Options)
cache.Put(ctx, k, v)
k, v, mask := cache.Get(ctx)
q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
@@ -82,6 +84,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
kq := k.Mulmat(ctx, q)
kq = kq.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
kq = kq.Add(ctx, mask)
kq = kq.Softmax(ctx)
kqv := v.Mulmat(ctx, kq)
@@ -109,7 +112,7 @@ type Layer struct {
MLP *MLP
}
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache cache.Cache, opts *Options) ml.Tensor {
residual := hiddenState
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@@ -142,7 +145,7 @@ func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
hiddenState = m.Output.Forward(ctx, hiddenState)
outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
outputs, err := ctx.FromIntSlice(opts.Outputs(), len(opts.Outputs()))
if err != nil {
return nil, err
}

View File

@@ -1,6 +1,9 @@
package mllama
import (
"sync"
"github.com/ollama/ollama/cache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
@@ -16,6 +19,9 @@ type Model struct {
ImageProcessor
TextProcessor
start sync.Once
tCache *cache.TensorCache
}
func New(c ml.Config) (model.Model, error) {
@@ -28,6 +34,10 @@ func New(c ml.Config) (model.Model, error) {
}
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
m.start.Do(func() {
m.tCache = cache.NewTensorCache(m.Backend())
})
var crossAttentionStates ml.Tensor
if opts.Images != nil {
f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(opts.Images[0])
@@ -75,9 +85,9 @@ func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
}
// TODO: attention mask, cross attention mask
hiddenState := m.TextModel.Forward(ctx, inputs, positions, nil, crossAttentionStates, nil, opts.Cache)
hiddenState := m.TextModel.Forward(ctx, inputs, positions, nil, crossAttentionStates, nil, opts.Cache, m.tCache)
outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
outputs, err := ctx.FromIntSlice(opts.Outputs(), len(opts.Outputs()))
if err != nil {
return nil, err
}

View File

@@ -4,9 +4,9 @@ import (
"math"
"slices"
"github.com/ollama/ollama/cache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
)
type TextSelfAttention struct {
@@ -16,7 +16,7 @@ type TextSelfAttention struct {
Output *nn.Linear `gguf:"attn_output"`
}
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, mask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache cache.Cache, opts *TextModelOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
@@ -31,7 +31,8 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, mas
value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
key, value = cache.Put(ctx, key, value, cache.Options)
cache.Put(ctx, key, value)
key, value, mask := cache.Get(ctx)
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
@@ -39,11 +40,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, mas
scores := key.Mulmat(ctx, query)
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
if mask != nil {
scores = scores.Add(ctx, mask)
}
scores = scores.Add(ctx, mask)
scores = scores.Softmax(ctx)
attention := value.Mulmat(ctx, scores)
@@ -72,7 +69,7 @@ type TextSelfAttentionDecoderLayer struct {
MLP *TextMLP
}
func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, mask, _, _ ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, mask, _, _ ml.Tensor, cache cache.Cache, _ *cache.TensorCache, opts *TextModelOptions) ml.Tensor {
residual := hiddenState
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@@ -85,6 +82,10 @@ func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, pos
return hiddenState.Add(ctx, residual)
}
func (d *TextSelfAttentionDecoderLayer) Run() bool {
return true
}
type TextCrossAttention struct {
QueryNorm *nn.RMSNorm `gguf:"cross_attn_q_norm"`
Query *nn.Linear `gguf:"cross_attn_q_proj"`
@@ -94,23 +95,29 @@ type TextCrossAttention struct {
Output *nn.Linear `gguf:"cross_attn_o_proj"`
}
func (ca *TextCrossAttention) Forward(ctx ml.Context, hiddenState, crossAttentionStates ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
func (ca *TextCrossAttention) Forward(ctx ml.Context, hiddenState, crossAttentionStates ml.Tensor, _ cache.Cache, tCache *cache.TensorCache, opts *TextModelOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
numVisionTokens, numTiles := crossAttentionStates.Dim(1), crossAttentionStates.Dim(2)
query := ca.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
query = ca.QueryNorm.Forward(ctx, query, opts.eps)
key := ca.Key.Forward(ctx, crossAttentionStates)
key = key.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
key = ca.KeyNorm.Forward(ctx, key, opts.eps)
var key, value ml.Tensor
if crossAttentionStates != nil {
numVisionTokens, numTiles := crossAttentionStates.Dim(1), crossAttentionStates.Dim(2)
value := ca.Value.Forward(ctx, crossAttentionStates)
value = value.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
key = ca.Key.Forward(ctx, crossAttentionStates)
key = key.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
key = ca.KeyNorm.Forward(ctx, key, opts.eps)
// TODO cache key, value
value = ca.Value.Forward(ctx, crossAttentionStates)
value = value.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
tCache.Put(ctx, key, value)
} else {
key, value, _ = tCache.Get(ctx)
}
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
@@ -135,13 +142,17 @@ type TextCrossAttentionDecoderLayer struct {
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
MLP *TextMLP
MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
run bool
}
func (d TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache cache.Cache, tCache *cache.TensorCache, opts *TextModelOptions) ml.Tensor {
d.run = true
residual := hiddenState
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = d.CrossAttention.Forward(ctx, hiddenState, crossAttentionStates, cache, opts)
hiddenState = d.CrossAttention.Forward(ctx, hiddenState, crossAttentionStates, cache, tCache, opts)
hiddenState = hiddenState.Mul(ctx, d.AttentionGate.Tanh(ctx))
hiddenState = hiddenState.Add(ctx, residual)
residual = hiddenState
@@ -152,18 +163,23 @@ func (d TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _,
return hiddenState.Add(ctx, residual)
}
func (d *TextCrossAttentionDecoderLayer) Run() bool {
return d.run
}
type TextDecoderLayer interface {
Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor
Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache cache.Cache, tCache *cache.TensorCache, opts *TextModelOptions) ml.Tensor
Run() bool
}
type TextDecoder struct {
Layers []TextDecoderLayer
}
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache cache.Cache, tCache *cache.TensorCache, opts *TextModelOptions) ml.Tensor {
for i, layer := range d.Layers {
if !slices.Contains(opts.crossAttentionLayers, uint32(i)) || crossAttentionStates != nil {
hiddenState = layer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache.Sub(i), opts)
if layer.Run() || crossAttentionStates != nil {
hiddenState = layer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache.Sub(i), tCache.Sub(i), opts)
}
}
@@ -189,9 +205,9 @@ type TextModel struct {
*TextModelOptions
}
func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache) ml.Tensor {
func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache cache.Cache, tCache *cache.TensorCache) ml.Tensor {
hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache, tCache, m.TextModelOptions)
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
return m.Output.Forward(ctx, hiddenState)
}

View File

@@ -20,51 +20,28 @@ import (
_ "github.com/ollama/ollama/ml/backend"
)
type Cache struct {
cache.Cache
cache.Options
}
func (c Cache) Sub(i int) Cache {
if c.Cache != nil {
return Cache{
Cache: c.Cache.Sub(i),
Options: c.Options,
}
}
return c
}
func (c Cache) Put(ctx ml.Context, key, value ml.Tensor, opts cache.Options) (ml.Tensor, ml.Tensor) {
if c.Cache != nil {
return c.Cache.Put(ctx, key, value, opts)
}
return key, value
}
type Options struct {
inputs []int32
inputs []int32
positions []int32
outputs []int32
Offset int
sequences []int
Images []image.Image
Cache
cache.Cache
}
func (opts Options) Inputs() []int32 {
return opts.inputs[opts.Offset:]
return opts.inputs
}
func (opts Options) Positions() []int32 {
positions := make([]int32, len(opts.inputs)-opts.Offset)
for i := range positions {
positions[i] = int32(opts.Offset + i)
}
return opts.positions
}
return positions
func (opts Options) Outputs() []int32 {
return opts.outputs
}
type OptionsFunc func(Model, *Options)
@@ -75,10 +52,21 @@ func WithInputIDs(ids []int32) OptionsFunc {
}
}
func WithOffset(offset int) OptionsFunc {
func WithPositions(pos []int32) OptionsFunc {
return func(m Model, opts *Options) {
opts.Offset = offset
opts.Cache.Position = offset
opts.positions = pos
}
}
func WithOutputs(outputs []int32) OptionsFunc {
return func(m Model, opts *Options) {
opts.outputs = outputs
}
}
func WithSequences(seqs []int) OptionsFunc {
return func(m Model, opts *Options) {
opts.sequences = seqs
}
}
@@ -90,12 +78,7 @@ func WithImage(img image.Image) OptionsFunc {
func WithCache(c cache.Cache) OptionsFunc {
return func(m Model, opts *Options) {
opts.Cache = Cache{
Cache: c,
Options: cache.Options{
Position: opts.Offset,
},
}
opts.Cache = c
}
}
@@ -152,6 +135,12 @@ func New(s string) (Model, error) {
}
func populateFields(b ml.Backend, v reflect.Value, tags ...Tag) reflect.Value {
var iface bool
if v.Kind() == reflect.Interface {
iface = true
v = v.Elem()
}
t := v.Type()
if t.Kind() == reflect.Pointer {
t, v = t.Elem(), v.Elem()
@@ -230,6 +219,10 @@ func populateFields(b ml.Backend, v reflect.Value, tags ...Tag) reflect.Value {
}
}
if iface {
return v.Addr()
}
return v
}
@@ -262,18 +255,22 @@ func canNil(t reflect.Type) bool {
t.Kind() == reflect.Slice
}
func Forward(m Model, optsFuncs ...OptionsFunc) (ml.Tensor, error) {
func Forward(ctx ml.Context, m Model, optsFuncs ...OptionsFunc) (ml.Tensor, error) {
var opts Options
for _, optsFunc := range optsFuncs {
optsFunc(m, &opts)
}
ctx := m.Backend().NewContext()
err := opts.Cache.StartForward(ctx, opts.positions, opts.sequences)
if err != nil {
return nil, err
}
t, err := m.Forward(ctx, opts)
if err != nil {
return nil, err
}
defer ctx.Close()
ctx.Forward(t)
return ctx.Compute(t), nil
}

View File

@@ -21,8 +21,6 @@ type TextProcessor interface {
Encode(string) ([]int32, error)
Decode([]int32) (string, error)
Is(uint32, Special) bool
GetVocabulary() *Vocabulary
}
type Vocabulary struct {
@@ -100,10 +98,6 @@ func (v *Vocabulary) Merge(left, right string) int {
return -1
}
func (v *Vocabulary) GetVocabulary() *Vocabulary {
return v
}
type BytePairEncoding struct {
Pretokenizer string

View File

@@ -4,6 +4,7 @@ import (
"os"
"os/user"
"path/filepath"
"runtime"
"testing"
)
@@ -11,14 +12,29 @@ func TestExpandPath(t *testing.T) {
mockCurrentUser := func() (*user.User, error) {
return &user.User{
Username: "testuser",
HomeDir: "/home/testuser",
HomeDir: func() string {
if os.PathSeparator == '\\' {
return filepath.FromSlash("D:/home/testuser")
}
return "/home/testuser"
}(),
}, nil
}
mockLookupUser := func(username string) (*user.User, error) {
fakeUsers := map[string]string{
"testuser": "/home/testuser",
"anotheruser": "/home/anotheruser",
"testuser": func() string {
if os.PathSeparator == '\\' {
return filepath.FromSlash("D:/home/testuser")
}
return "/home/testuser"
}(),
"anotheruser": func() string {
if os.PathSeparator == '\\' {
return filepath.FromSlash("D:/home/anotheruser")
}
return "/home/anotheruser"
}(),
}
if homeDir, ok := fakeUsers[username]; ok {
@@ -30,30 +46,78 @@ func TestExpandPath(t *testing.T) {
return nil, os.ErrNotExist
}
tests := []struct {
path string
relativeDir string
expected string
windowsExpected string
shouldErr bool
}{
{"~", "", "/home/testuser", "D:\\home\\testuser", false},
{"~/myfolder/myfile.txt", "", "/home/testuser/myfolder/myfile.txt", "D:\\home\\testuser\\myfolder\\myfile.txt", false},
{"~anotheruser/docs/file.txt", "", "/home/anotheruser/docs/file.txt", "D:\\home\\anotheruser\\docs\\file.txt", false},
{"~nonexistentuser/file.txt", "", "", "", true},
{"relative/path/to/file", "", filepath.Join(os.Getenv("PWD"), "relative/path/to/file"), "relative\\path\\to\\file", false},
{"/absolute/path/to/file", "", "/absolute/path/to/file", "D:\\absolute\\path\\to\\file", false},
{".", os.Getenv("PWD"), "", os.Getenv("PWD"), false},
{"somefile", "somedir", filepath.Join(os.Getenv("PWD"), "somedir", "somefile"), "somedir\\somefile", false},
pwd, err := os.Getwd()
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
result, err := expandPathImpl(test.path, test.relativeDir, mockCurrentUser, mockLookupUser)
if (err != nil) != test.shouldErr {
t.Errorf("expandPathImpl(%q) returned error: %v, expected error: %v", test.path, err != nil, test.shouldErr)
t.Run("unix tests", func(t *testing.T) {
if runtime.GOOS == "windows" {
return
}
if result != test.expected && result != test.windowsExpected && !test.shouldErr {
t.Errorf("expandPathImpl(%q) = %q, want %q", test.path, result, test.expected)
tests := []struct {
path string
relativeDir string
expected string
shouldErr bool
}{
{"~", "", "/home/testuser", false},
{"~/myfolder/myfile.txt", "", "/home/testuser/myfolder/myfile.txt", false},
{"~anotheruser/docs/file.txt", "", "/home/anotheruser/docs/file.txt", false},
{"~nonexistentuser/file.txt", "", "", true},
{"relative/path/to/file", "", filepath.Join(pwd, "relative/path/to/file"), false},
{"/absolute/path/to/file", "", "/absolute/path/to/file", false},
{"/absolute/path/to/file", "someotherdir/", "/absolute/path/to/file", false},
{".", pwd, pwd, false},
{".", "", pwd, false},
{"somefile", "somedir", filepath.Join(pwd, "somedir", "somefile"), false},
}
}
for _, test := range tests {
result, err := expandPathImpl(test.path, test.relativeDir, mockCurrentUser, mockLookupUser)
if (err != nil) != test.shouldErr {
t.Errorf("expandPathImpl(%q) returned error: %v, expected error: %v", test.path, err != nil, test.shouldErr)
}
if result != test.expected && !test.shouldErr {
t.Errorf("expandPathImpl(%q) = %q, want %q", test.path, result, test.expected)
}
}
})
t.Run("windows tests", func(t *testing.T) {
if runtime.GOOS != "windows" {
return
}
tests := []struct {
path string
relativeDir string
expected string
shouldErr bool
}{
{"~", "", "D:\\home\\testuser", false},
{"~/myfolder/myfile.txt", "", "D:\\home\\testuser\\myfolder\\myfile.txt", false},
{"~anotheruser/docs/file.txt", "", "D:\\home\\anotheruser\\docs\\file.txt", false},
{"~nonexistentuser/file.txt", "", "", true},
{"relative\\path\\to\\file", "", filepath.Join(pwd, "relative\\path\\to\\file"), false},
{"D:\\absolute\\path\\to\\file", "", "D:\\absolute\\path\\to\\file", false},
{"D:\\absolute\\path\\to\\file", "someotherdir/", "D:\\absolute\\path\\to\\file", false},
{".", pwd, pwd, false},
{".", "", pwd, false},
{"somefile", "somedir", filepath.Join(pwd, "somedir", "somefile"), false},
}
for _, test := range tests {
result, err := expandPathImpl(test.path, test.relativeDir, mockCurrentUser, mockLookupUser)
if (err != nil) != test.shouldErr {
t.Errorf("expandPathImpl(%q) returned error: %v, expected error: %v", test.path, err != nil, test.shouldErr)
}
if result != test.expected && !test.shouldErr {
t.Errorf("expandPathImpl(%q) = %q, want %q", test.path, result, test.expected)
}
}
})
}

View File

@@ -62,7 +62,13 @@ func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error)
return nil, err
}
req.Files = digestMap
if req.Files == nil {
req.Files = digestMap
} else {
for k, v := range digestMap {
req.Files[k] = v
}
}
case "adapter":
path, err := expandPath(c.Args, relativeDir)
if err != nil {
@@ -564,7 +570,9 @@ func isValidCommand(cmd string) bool {
}
func expandPathImpl(path, relativeDir string, currentUserFunc func() (*user.User, error), lookupUserFunc func(string) (*user.User, error)) (string, error) {
if strings.HasPrefix(path, "~") {
if filepath.IsAbs(path) || strings.HasPrefix(path, "\\") || strings.HasPrefix(path, "/") {
return filepath.Abs(path)
} else if strings.HasPrefix(path, "~") {
var homeDir string
if path == "~" || strings.HasPrefix(path, "~/") {

View File

@@ -490,7 +490,6 @@ func TestParseFileParameters(t *testing.T) {
"top_k 1": {"top_k", "1"},
"top_p 1.0": {"top_p", "1.0"},
"min_p 0.05": {"min_p", "0.05"},
"tfs_z 1.0": {"tfs_z", "1.0"},
"typical_p 1.0": {"typical_p", "1.0"},
"repeat_last_n 1": {"repeat_last_n", "1"},
"temperature 1.0": {"temperature", "1.0"},
@@ -793,15 +792,20 @@ func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, s
}
func TestCreateRequestFiles(t *testing.T) {
name, digest := createBinFile(t, nil, nil)
n1, d1 := createBinFile(t, nil, nil)
n2, d2 := createBinFile(t, map[string]any{"foo": "bar"}, nil)
cases := []struct {
input string
expected *api.CreateRequest
}{
{
fmt.Sprintf("FROM %s", name),
&api.CreateRequest{Files: map[string]string{name: digest}},
fmt.Sprintf("FROM %s", n1),
&api.CreateRequest{Files: map[string]string{n1: d1}},
},
{
fmt.Sprintf("FROM %s\nFROM %s", n1, n2),
&api.CreateRequest{Files: map[string]string{n1: d1, n2: d2}},
},
}

View File

View File

@@ -1,10 +1,10 @@
package runner
package common
import (
"strings"
)
func findStop(sequence string, stops []string) (bool, string) {
func FindStop(sequence string, stops []string) (bool, string) {
for _, stop := range stops {
if strings.Contains(sequence, stop) {
return true, stop
@@ -14,7 +14,7 @@ func findStop(sequence string, stops []string) (bool, string) {
return false, ""
}
func containsStopSuffix(sequence string, stops []string) bool {
func ContainsStopSuffix(sequence string, stops []string) bool {
for _, stop := range stops {
for i := 1; i <= len(stop); i++ {
if strings.HasSuffix(sequence, stop[:i]) {
@@ -29,7 +29,7 @@ func containsStopSuffix(sequence string, stops []string) bool {
// truncateStop removes the provided stop string from pieces,
// returning the partial pieces with stop removed, including truncating
// the last piece if required (and signalling if this was the case)
func truncateStop(pieces []string, stop string) ([]string, bool) {
func TruncateStop(pieces []string, stop string) ([]string, bool) {
joined := strings.Join(pieces, "")
index := strings.Index(joined, stop)
@@ -65,7 +65,7 @@ func truncateStop(pieces []string, stop string) ([]string, bool) {
return result, tokenTruncated
}
func incompleteUnicode(token string) bool {
func IncompleteUnicode(token string) bool {
incomplete := false
// check if there is incomplete UTF-8 character at the end

View File

@@ -1,4 +1,4 @@
package runner
package common
import (
"reflect"
@@ -52,7 +52,7 @@ func TestTruncateStop(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result, resultTrunc := truncateStop(tt.pieces, tt.stop)
result, resultTrunc := TruncateStop(tt.pieces, tt.stop)
if !reflect.DeepEqual(result, tt.expected) || resultTrunc != tt.expectedTrunc {
t.Errorf("truncateStop(%v, %s): have %v (%v); want %v (%v)", tt.pieces, tt.stop, result, resultTrunc, tt.expected, tt.expectedTrunc)
}
@@ -120,7 +120,7 @@ func TestIncompleteUnicode(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := incompleteUnicode(tt.input)
result := IncompleteUnicode(tt.input)
if result != tt.expected {
t.Errorf("incompleteUnicode(%s): have %v; want %v", tt.input, result, tt.expected)
}

261
runner/newrunner/cache.go Normal file
View File

@@ -0,0 +1,261 @@
package newrunner
import (
"errors"
"fmt"
"log/slog"
"math"
"reflect"
"time"
"github.com/ollama/ollama/cache"
"github.com/ollama/ollama/ml"
)
type InputCache struct {
// context window size (per slot)
numCtx int32
// individual KV caches
slots []InputCacheSlot
// optimize cache eviction for multiple users
multiUserCache bool
cache cache.Cache
}
func NewInputCache(backend ml.Backend, kvCacheType string, kvSize int32, numSlots int, multiUserCache bool) (*InputCache, error) {
if kvSize/int32(numSlots) < 1 {
return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
}
slots := make([]InputCacheSlot, numSlots)
for i := range slots {
slots[i] = InputCacheSlot{
Id: i,
Inputs: make([]input, 0),
}
}
return &InputCache{
numCtx: kvSize / int32(numSlots),
slots: slots,
multiUserCache: multiUserCache,
cache: cache.NewCausalCache(backend, kvCacheTypeFromStr(kvCacheType), kvSize),
}, nil
}
func kvCacheTypeFromStr(s string) ml.DType {
switch s {
case "q8_0":
panic("kv cache quantization not yet implemented")
case "q4_0":
panic("kv cache quantization not yet implemented")
default:
return ml.DTypeF32
}
}
// Locking: Operations on InputCacheSlot (including finding one
// through LoadCacheSlot) require a lock to be be held that serializes
// these operations with each other and processBatch
type InputCacheSlot struct {
// Index in the KV cache
Id int
// Inputs that are stored in the KV cache
Inputs []input
// is this cache actively being processed as part of a sequence?
InUse bool
// last time this cache was used (as of start of processing)
lastUsed time.Time
}
func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
var slot *InputCacheSlot
var numPast int32
var err error
// In single-user scenarios, the longest cache slot works fine for getting good input
// cache hit rates and it keeps the footprint of the cache small, which improves throughput.
// For multiple users, the "best" cache slot produces better input cache hit rates
// at the cost of worse performance when we miss the input cache.
if !c.multiUserCache {
slot, numPast, err = c.findLongestCacheSlot(prompt)
} else {
slot, numPast, err = c.findBestCacheSlot(prompt)
}
if err != nil {
return nil, nil, err
}
if !cachePrompt {
numPast = 0
}
slot.InUse = true
slot.lastUsed = time.Now()
if numPast == int32(len(prompt)) {
// Leave one input to sample so we can get a response
numPast--
}
err = c.cache.Remove(slot.Id, numPast, math.MaxInt32)
if err != nil {
// Some models don't support partial erasure
err = c.cache.Remove(slot.Id, 0, math.MaxInt32)
if err != nil {
return nil, nil, err
}
numPast = 0
}
slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
"used", numPast, "remaining", int32(len(prompt))-numPast)
prompt = prompt[numPast:]
slot.Inputs = slot.Inputs[:numPast]
return slot, prompt, nil
}
func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int32, error) {
longest := int32(-1)
var longestSlot *InputCacheSlot
for i, s := range c.slots {
if s.InUse {
continue
}
count := countCommonPrefix(s.Inputs, prompt)
if count > longest {
longest = count
longestSlot = &c.slots[i]
}
}
if longestSlot == nil {
return nil, 0, errors.New("no available cache slots")
}
return longestSlot, longest, nil
}
func (c *InputCache) findBestCacheSlot(prompt []input) (*InputCacheSlot, int32, error) {
oldest := time.Now()
var oldestSlot *InputCacheSlot
longest := int32(-1)
var longestSlot *InputCacheSlot
for i, s := range c.slots {
count := countCommonPrefix(s.Inputs, prompt)
if count > longest {
longest = count
longestSlot = &c.slots[i]
}
if s.lastUsed.Compare(oldest) < 0 && !s.InUse {
oldest = s.lastUsed
oldestSlot = &c.slots[i]
}
}
if longest == int32(len(longestSlot.Inputs)) && !longestSlot.InUse {
return longestSlot, longest, nil
}
if oldestSlot.InUse {
return nil, 0, errors.New("no available cache slots")
}
if len(oldestSlot.Inputs) != 0 {
slog.Debug("evicting cache slot", "id", oldestSlot.Id, "inputs", len(oldestSlot.Inputs),
"used", oldestSlot.lastUsed)
}
if longest > 0 && longestSlot != oldestSlot {
slog.Debug("forking cache slot", "src", longestSlot.Id, "dst", oldestSlot.Id, "inputs", longest, "total",
len(longestSlot.Inputs))
oldestSlot.Inputs = make([]input, longest)
copy(oldestSlot.Inputs, longestSlot.Inputs[:longest])
// This is only nil for unit tests
if c.cache != nil {
c.cache.CopyPrefix(longestSlot.Id, oldestSlot.Id, longest)
}
}
return oldestSlot, longest, nil
}
func countCommonPrefix(a []input, b []input) int32 {
var count int32
for i := range a {
if i >= len(b) {
break
}
if !reflect.DeepEqual(a[i], b[i]) {
break
}
count++
}
return count
}
func (c *InputCache) ShiftDiscard(inputLen int32, numKeep int32) int32 {
targetFree := (c.numCtx - numKeep) / 2
targetFree = max(targetFree, 1)
currentFree := c.numCtx - inputLen
discard := targetFree - currentFree
if discard < 0 {
discard = 0
}
return discard
}
// Frees up space in the KV cache by deleting the oldest half of history and shifting
// the newest half into that space (saving numKeep inputs at the beginning).
//
// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error {
if numKeep >= c.numCtx {
return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx)
}
inputLen := int32(len(slot.Inputs))
discard := c.ShiftDiscard(inputLen, numKeep)
if discard <= 0 {
return nil
}
slog.Debug("context limit hit - shifting", "id", slot.Id, "limit", c.numCtx, "input", len(slot.Inputs),
"keep", numKeep, "discard", discard)
// TODO (jessegross): KV cache removal can fail for certain types of models
err := c.cache.Remove(slot.Id, numKeep, numKeep+discard)
if err != nil {
return fmt.Errorf("unable to remove old kv cache entries (id: %v, keep: %v discard: %v): %w", slot.Id, numKeep, discard, err)
}
for i := numKeep + discard; i < inputLen; i++ {
slot.Inputs[i-discard] = slot.Inputs[i]
}
slot.Inputs = slot.Inputs[:inputLen-discard]
return nil
}

View File

@@ -0,0 +1,291 @@
package newrunner
import (
"image"
"testing"
"time"
)
func TestCountCommon(t *testing.T) {
imgA := image.NewRGBA(image.Rect(0, 0, 100, 100))
imgB := image.NewRGBA(image.Rect(0, 0, 50, 50))
imgC := image.NewRGBA(image.Rect(50, 50, 100, 100))
tests := []struct {
name string
t1 []input
t2 []input
expected int32
}{
{
name: "Equal",
t1: []input{{token: 1}, {token: 2}, {token: 3}},
t2: []input{{token: 1}, {token: 2}, {token: 3}},
expected: 3,
},
{
name: "Prefix",
t1: []input{{token: 1}},
t2: []input{{token: 1}, {token: 2}, {token: 3}},
expected: 1,
},
{
name: "Image Prefix",
t1: []input{{image: imgA}},
t2: []input{{image: imgA}, {image: imgB}, {image: imgC}},
expected: 1,
},
{
name: "Mixed",
t1: []input{{token: 1}, {image: imgA}},
t2: []input{{token: 1}, {image: imgA}, {token: 5}},
expected: 2,
},
{
name: "Empty",
t1: []input{},
t2: []input{{token: 1}, {token: 2}, {token: 3}},
expected: 0,
},
{
name: "Both Empty",
t1: []input{},
t2: []input{},
expected: 0,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := countCommonPrefix(tt.t1, tt.t2)
if result != tt.expected {
t.Errorf("countCommonPrefix(%v, %v): have %v; want %v", tt.t1, tt.t2, result, tt.expected)
}
})
}
}
func TestFindCacheSlot(t *testing.T) {
type expected struct {
result int
len int32
}
tests := []struct {
name string
cache InputCache
prompt []input
longest expected
best expected
}{
{
name: "Empty",
cache: InputCache{slots: []InputCacheSlot{
{
Id: 0,
Inputs: []input{},
InUse: false,
lastUsed: time.Time{},
},
{
Id: 1,
Inputs: []input{},
InUse: false,
lastUsed: time.Time{},
},
}},
prompt: []input{{token: 1}},
longest: expected{result: 0, len: 0},
best: expected{result: 0, len: 0},
},
{
name: "Extend",
cache: InputCache{slots: []InputCacheSlot{
{
Id: 0,
Inputs: []input{{token: 1}},
InUse: false,
lastUsed: time.Now().Add(-time.Second),
},
{
Id: 1,
Inputs: []input{{token: 1}, {token: 2}},
InUse: false,
lastUsed: time.Now().Add(-2 * time.Second),
},
}},
prompt: []input{{token: 1}, {token: 2}},
longest: expected{result: 1, len: 2},
best: expected{result: 1, len: 2},
},
{
name: "New",
cache: InputCache{slots: []InputCacheSlot{
{
Id: 0,
Inputs: []input{{token: 1}, {token: 2}},
InUse: false,
lastUsed: time.Now().Add(-time.Second),
},
{
Id: 1,
Inputs: []input{},
InUse: false,
lastUsed: time.Time{},
},
}},
prompt: []input{{token: 2}},
longest: expected{result: 0, len: 0},
best: expected{result: 1, len: 0},
},
{
name: "Fork",
cache: InputCache{
slots: []InputCacheSlot{
{
Id: 0,
Inputs: []input{{token: 1}, {token: 2}},
InUse: false,
lastUsed: time.Now().Add(-time.Second),
},
{
Id: 1,
Inputs: []input{},
InUse: false,
lastUsed: time.Time{},
},
},
},
prompt: []input{{token: 1}},
longest: expected{result: 0, len: 1},
best: expected{result: 1, len: 1},
},
{
name: "Evict",
cache: InputCache{slots: []InputCacheSlot{
{
Id: 0,
Inputs: []input{{token: 1}},
InUse: false,
lastUsed: time.Now().Add(-time.Second),
},
{
Id: 1,
Inputs: []input{{token: 1}, {token: 2}},
InUse: false,
lastUsed: time.Now().Add(-2 * time.Second),
},
}},
prompt: []input{{token: 2}, {token: 3}},
longest: expected{result: 0, len: 0},
best: expected{result: 1, len: 0},
},
{
name: "In use",
cache: InputCache{slots: []InputCacheSlot{
{
Id: 0,
Inputs: []input{{token: 1}, {token: 2}},
InUse: true,
lastUsed: time.Now().Add(-time.Second),
},
{
Id: 1,
Inputs: []input{{token: 1}},
InUse: false,
lastUsed: time.Now().Add(-2 * time.Second),
},
}},
prompt: []input{{token: 1}, {token: 2}},
longest: expected{result: 1, len: 1},
best: expected{result: 1, len: 2},
},
}
for _, tt := range tests {
t.Run("Longest-"+tt.name, func(t *testing.T) {
result, resultLen, err := tt.cache.findLongestCacheSlot(tt.prompt)
if err != nil {
t.Errorf("findLongestCacheSlot: err %v", err)
} else if result.Id != tt.longest.result || resultLen != tt.longest.len {
t.Errorf("findLongestCacheSlot: slot have %v, want %v len have %v, want %v",
result.Id, tt.longest.result, resultLen, tt.longest.len)
}
})
}
for _, tt := range tests {
t.Run("Best-"+tt.name, func(t *testing.T) {
result, resultLen, err := tt.cache.findBestCacheSlot(tt.prompt)
if err != nil {
t.Errorf("findBestCacheSlot: err %v", err)
} else if result.Id != tt.best.result || resultLen != tt.best.len {
t.Errorf("findBestCacheSlot: slot have %v, want %v len have %v, want %v",
result.Id, tt.best.result, resultLen, tt.best.len)
}
})
}
}
func TestShiftDiscard(t *testing.T) {
tests := []struct {
name string
numCtx int32
numKeep int32
inputLen int32
expected int32
}{
{
name: "Shift",
numCtx: 2048,
numKeep: 5,
inputLen: 2048,
expected: 1021,
},
{
name: "Max Keep",
numCtx: 2048,
numKeep: 2047,
inputLen: 2048,
expected: 1,
},
{
name: "No Keep",
numCtx: 2048,
numKeep: 0,
inputLen: 2048,
expected: 1024,
},
{
name: "Truncate",
numCtx: 2048,
numKeep: 5,
inputLen: 5000,
expected: 3973,
},
{
name: "Truncate Keep",
numCtx: 2048,
numKeep: 2047,
inputLen: 5000,
expected: 2953,
},
{
name: "No Op",
numCtx: 2048,
numKeep: 5,
inputLen: 512,
expected: 0,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
c := InputCache{numCtx: tt.numCtx}
result := c.ShiftDiscard(tt.inputLen, tt.numKeep)
if result != tt.expected {
t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", tt.numCtx, tt.numKeep, tt.inputLen, result, tt.expected)
}
})
}
}

941
runner/newrunner/runner.go Normal file
View File

@@ -0,0 +1,941 @@
package newrunner
import (
"bytes"
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"image"
"io"
"log"
"log/slog"
"net"
"net/http"
"os"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"sync"
"time"
"unicode/utf8"
"golang.org/x/sync/semaphore"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/runner/common"
"github.com/ollama/ollama/sample"
_ "github.com/ollama/ollama/model/llama"
_ "github.com/ollama/ollama/model/mllama"
)
// input is an element of the prompt to process, either a token or an image
type input struct {
token int32
image image.Image
}
type Sequence struct {
// batch index
iBatch int
// prompt inputs left to evaluate
inputs []input
// inputs that have been added to a batch but not yet submitted to Forward
pendingInputs []input
// tokens that have been generated but not returned yet (e.g. for stop sequences)
pendingResponses []string
// input cache being used by this sequence
cache *InputCacheSlot
// channel to send responses over
responses chan string
// channel to stop decoding (such as if the remote connection is closed)
quit chan bool
// number of tokens to predict
numPredict int
// set of samplers to run on generated logits
samplers []sample.Sampler
// channel to send back the embedding if embedding only
embedding chan []float32
// stop sequences
stop []string
// number of inputs to keep at the beginning when shifting context window
numKeep int32
// true if an embedding are to be returned instead of text generation
embeddingOnly bool
doneReason string
// Metrics
startProcessingTime time.Time
startGenerationTime time.Time
numPredicted int
numPromptInputs int
}
type NewSequenceParams struct {
numPredict int
stop []string
numKeep int32
samplers []sample.Sampler
embedding bool
}
func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequenceParams) (*Sequence, error) {
s.ready.Wait()
startTime := time.Now()
inputs, err := s.inputs(prompt, images)
if err != nil {
return nil, fmt.Errorf("failed to process inputs: %w", err)
} else if len(inputs) == 0 {
return nil, errors.New("no input provided")
}
if params.numKeep < 0 {
params.numKeep = int32(len(inputs))
}
// Ensure that at least 1 input can be discarded during shift
params.numKeep = min(params.numKeep, s.cache.numCtx-1)
if int32(len(inputs)) > s.cache.numCtx {
discard := int32(len(inputs)) - s.cache.numCtx
newInputs := inputs[:params.numKeep]
newInputs = append(newInputs, inputs[params.numKeep+discard:]...)
slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "keep", params.numKeep, "new", len(newInputs))
inputs = newInputs
}
// TODO(jessegross): Ingest cached history for grammar
return &Sequence{
inputs: inputs,
numPromptInputs: len(inputs),
startProcessingTime: startTime,
numPredict: params.numPredict,
pendingResponses: make([]string, 0),
responses: make(chan string, 100),
quit: make(chan bool, 1),
embedding: make(chan []float32, 1),
samplers: params.samplers,
embeddingOnly: params.embedding,
stop: params.stop,
numKeep: params.numKeep,
}, nil
}
// inputs processes the prompt and images into a list of inputs
// by splitting the prompt on [img-<n>] tags, tokenizing text and
// decoding images
func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
var inputs []input
var parts []string
var matches [][]string
// TODO(jessegross): This can sometimes trigger for matching text in the
// user's prompt. We previously tried to avoid it by only looking for images
// on image models. We don't have a clear indication now but it would be better
// to properly escape it in any case.
re := regexp.MustCompile(`\[img-(\d+)\]`)
parts = re.Split(prompt, -1)
matches = re.FindAllStringSubmatch(prompt, -1)
for i, part := range parts {
// text - tokenize
tokens, err := s.model.(model.TextProcessor).Encode(part)
if err != nil {
return nil, err
}
for _, t := range tokens {
inputs = append(inputs, input{token: t})
}
// image - decode and store
if i < len(matches) {
n, _ := strconv.Atoi(matches[i][1])
imageIndex := -1
for j := range images {
if images[j].ID == n {
imageIndex = j
break
}
}
if imageIndex < 0 {
return nil, fmt.Errorf("invalid image index: %d", n)
}
image, _, err := image.Decode(bytes.NewReader(images[imageIndex].Data))
if err != nil {
return nil, err
}
inputs = append(inputs, input{image: image})
}
}
return inputs, nil
}
type Server struct {
// is the server ready to process requests?
// protects access to model and image
ready sync.WaitGroup
// loaded model
model model.Model
// status for external health reporting - loading, ready to serve, etc.
status ServerStatus
// current progress on loading the model
progress float32
// number of simultaneous requests to handle
parallel int
// maximum number of elements in a batch (per sequence)
// TODO (jmorganca): make this n_batch
batchSize int
// protects access to everything below this line
// this is context state needed for decoding
mu sync.Mutex
// indicates that data is ready for processing
cond *sync.Cond
// the list of simultaneous sequences being evaluated
seqs []*Sequence
// seqs can have a maximum of parallel entries, which
// is enfoced by seqSem
seqsSem *semaphore.Weighted
// KV cache
cache *InputCache
}
func (s *Server) allNil() bool {
for _, item := range s.seqs {
if item != nil {
return false
}
}
return true
}
func flushPending(seq *Sequence) bool {
joined := strings.Join(seq.pendingResponses, "")
seq.pendingResponses = []string{}
// Check if there are any partial UTF-8 characters remaining.
// We already check and queue as we are generating but some may
// still make it here:
// - Sequence is ending, e.g. generation limit has been hit
// - Invalid characters in the middle of a string
// This is a stricter check to ensure we never output invalid Unicode.
for !utf8.ValidString(joined) {
joined = joined[:len(joined)-1]
}
if len(joined) == 0 {
return true
}
select {
case seq.responses <- joined:
return true
case <-seq.quit:
return false
}
}
func (s *Server) removeSequence(seqIndex int, reason string) {
seq := s.seqs[seqIndex]
flushPending(seq)
seq.doneReason = reason
close(seq.responses)
close(seq.embedding)
seq.cache.InUse = false
s.seqs[seqIndex] = nil
s.seqsSem.Release(1)
}
func (s *Server) run(ctx context.Context) {
s.ready.Wait()
for {
select {
case <-ctx.Done():
return
default:
err := s.processBatch()
if err != nil {
panic(err)
}
}
}
}
func (s *Server) processBatch() error {
s.mu.Lock()
for s.allNil() {
s.cond.Wait() // Wait until an item is added
}
defer s.mu.Unlock()
var inputIDs []int32
var pos []int32
var outputs []int32
var seqs []int
var image image.Image
for i, seq := range s.seqs {
if seq == nil {
continue
}
// if past the num predict limit
if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
s.removeSequence(i, "limit")
continue
}
for j, input := range seq.inputs {
if int32(len(seq.cache.Inputs)+len(seq.pendingInputs)+1) > s.cache.numCtx {
if len(seq.pendingInputs) == 0 {
err := s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
if err != nil {
return err
}
} else {
break
}
}
if j >= s.batchSize {
break
}
if input.image != nil {
if image != nil {
break
}
image = input.image
seq.pendingInputs = append(seq.pendingInputs, input)
continue
}
inputIDs = append(inputIDs, input.token)
pos = append(pos, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
seqs = append(seqs, seq.cache.Id)
seq.iBatch = len(outputs)
if j+1 == len(seq.inputs) {
outputs = append(outputs, int32(len(inputIDs)-1))
}
seq.pendingInputs = append(seq.pendingInputs, input)
}
seq.inputs = seq.inputs[len(seq.pendingInputs):]
}
if len(inputIDs) == 0 {
return nil
}
var options []model.OptionsFunc
if image != nil {
options = append(options, model.WithImage(image))
}
ctx := s.model.Backend().NewContext()
defer ctx.Close()
logit, err := model.Forward(ctx, s.model, append(options, model.WithCache(s.cache.cache), model.WithInputIDs(inputIDs), model.WithPositions(pos), model.WithOutputs(outputs), model.WithSequences(seqs))...)
if err != nil {
return err
}
f32s := logit.Floats()
for i, seq := range s.seqs {
if seq == nil {
continue
}
// After calling Forward, pending inputs are now in the cache
if len(seq.pendingInputs) > 0 {
seq.cache.Inputs = append(seq.cache.Inputs, seq.pendingInputs...)
seq.pendingInputs = []input{}
}
// don't sample prompt processing
if len(seq.inputs) != 0 {
continue
}
seq.numPredicted++
if seq.numPredicted == 1 {
seq.startGenerationTime = time.Now()
}
// if done processing the prompt, generate an embedding and return
if seq.embeddingOnly {
/*embed := s.lc.GetEmbeddingsSeq(seq.cache.Id)
if embed == nil {
embed = s.lc.GetEmbeddingsIth(seq.iBatch)
}
seq.embedding <- embed*/
s.removeSequence(i, "")
continue
}
vocabSize := len(f32s) / len(outputs)
seqLogits := f32s[seq.iBatch*vocabSize : (seq.iBatch+1)*vocabSize]
// TODO(jessegross): The data type and number of outputs for the samplers seem inconsistent
f64s := make([]float64, vocabSize)
for j, f32 := range seqLogits {
f64s[j] = float64(f32)
}
// do sampling
f64s, err = sample.Sample(f64s, seq.samplers...)
if err != nil {
return err
}
var outputIDs []int32
for _, f64 := range f64s {
if !s.model.(model.TextProcessor).Is(uint32(f64), model.SpecialEOS) {
outputIDs = append(outputIDs, int32(f64))
} else {
s.removeSequence(i, "stop")
continue
}
}
if len(outputIDs) == 0 {
continue
}
piece, err := s.model.(model.TextProcessor).Decode(outputIDs)
if errors.Is(err, io.EOF) {
continue
} else if err != nil {
return err
}
for _, id := range outputIDs {
seq.inputs = append(seq.inputs, input{token: id})
}
seq.pendingResponses = append(seq.pendingResponses, piece)
sequence := strings.Join(seq.pendingResponses, "")
if ok, stop := common.FindStop(sequence, seq.stop); ok {
slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
var tokenTruncated bool
origLen := len(seq.pendingResponses)
seq.pendingResponses, tokenTruncated = common.TruncateStop(seq.pendingResponses, stop)
newLen := len(seq.pendingResponses)
// Update the cache based on the tokens that will be returned:
// - We have more tokens than are currently in the cache because
// the last ones generated weren't submitted to Forward
// - Remove any stop sequences that we stripped out
// - If truncateStop removed a portion of a token, drop that
// - As defense-in-depth, if truncatedToken didn't find a stop token
// remove the extra ones that we added to the cache len
tokenLen := len(seq.cache.Inputs) + len(outputIDs)
tokenLen -= origLen - newLen
if tokenTruncated {
tokenLen--
}
if origLen == newLen {
tokenLen = len(seq.cache.Inputs)
}
seq.cache.Inputs = seq.cache.Inputs[:tokenLen]
s.removeSequence(i, "stop")
continue
}
if common.ContainsStopSuffix(sequence, seq.stop) {
continue
}
if common.IncompleteUnicode(sequence) {
continue
}
if !flushPending(seq) {
s.removeSequence(i, "connection")
}
}
return nil
}
// TODO (jmorganca): use structs from the api package to avoid duplication
// this way the api acts as a proxy instead of using a different api for the
// runner
type Options struct {
api.Runner
NumKeep int `json:"n_keep"`
Seed int `json:"seed"`
NumPredict int `json:"n_predict"`
TopK int `json:"top_k"`
TopP float32 `json:"top_p"`
MinP float32 `json:"min_p"`
TypicalP float32 `json:"typical_p"`
RepeatLastN int `json:"repeat_last_n"`
Temperature float32 `json:"temperature"`
RepeatPenalty float32 `json:"repeat_penalty"`
PresencePenalty float32 `json:"presence_penalty"`
FrequencyPenalty float32 `json:"frequency_penalty"`
Mirostat int `json:"mirostat"`
MirostatTau float32 `json:"mirostat_tau"`
MirostatEta float32 `json:"mirostat_eta"`
Stop []string `json:"stop"`
}
type ImageData struct {
Data []byte `json:"data"`
ID int `json:"id"`
AspectRatioID int `json:"aspect_ratio_id"`
}
type CompletionRequest struct {
Prompt string `json:"prompt"`
Images []ImageData `json:"image_data"`
Grammar string `json:"grammar"`
CachePrompt bool `json:"cache_prompt"`
Options
}
type Timings struct {
PredictedN int `json:"predicted_n"`
PredictedMS float64 `json:"predicted_ms"`
PromptN int `json:"prompt_n"`
PromptMS float64 `json:"prompt_ms"`
}
type CompletionResponse struct {
Content string `json:"content"`
Stop bool `json:"stop"`
Model string `json:"model,omitempty"`
Prompt string `json:"prompt,omitempty"`
StoppedLimit bool `json:"stopped_limit,omitempty"`
PredictedN int `json:"predicted_n,omitempty"`
PredictedMS float64 `json:"predicted_ms,omitempty"`
PromptN int `json:"prompt_n,omitempty"`
PromptMS float64 `json:"prompt_ms,omitempty"`
Timings Timings `json:"timings"`
}
func getSamplers(_ CompletionRequest) []sample.Sampler {
// TODO(jessegross): Waiting for sampling code
/*var samplingParams llama.SamplingParams
samplingParams.TopK = req.TopK
samplingParams.TopP = req.TopP
samplingParams.MinP = req.MinP
samplingParams.TypicalP = req.TypicalP
samplingParams.Temp = req.Temperature
samplingParams.RepeatLastN = req.RepeatLastN
samplingParams.PenaltyRepeat = req.RepeatPenalty
samplingParams.PenaltyFreq = req.FrequencyPenalty
samplingParams.PenaltyPresent = req.PresencePenalty
samplingParams.Mirostat = req.Mirostat
samplingParams.MirostatTau = req.MirostatTau
samplingParams.MirostatEta = req.MirostatEta
samplingParams.Seed = uint32(req.Seed)
samplingParams.Grammar = req.Grammar*/
return []sample.Sampler{sample.Greedy()}
}
func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
var req CompletionRequest
req.Options = Options(api.DefaultOptions())
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "Bad request", http.StatusBadRequest)
return
}
// Set the headers to indicate streaming
w.Header().Set("Content-Type", "application/json")
w.Header().Set("Transfer-Encoding", "chunked")
flusher, ok := w.(http.Flusher)
if !ok {
http.Error(w, "Streaming not supported", http.StatusInternalServerError)
return
}
seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
numPredict: req.NumPredict,
stop: req.Stop,
numKeep: int32(req.NumKeep),
samplers: getSamplers(req),
embedding: false,
})
if err != nil {
http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
return
}
// Ensure there is a place to put the sequence, released when removed from s.seqs
if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
if errors.Is(err, context.Canceled) {
slog.Info("aborting completion request due to client closing the connection")
} else {
slog.Error("Failed to acquire semaphore", "error", err)
}
return
}
s.mu.Lock()
found := false
for i, sq := range s.seqs {
if sq == nil {
seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
if err != nil {
s.mu.Unlock()
http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
return
}
s.seqs[i] = seq
s.cond.Signal()
found = true
break
}
}
s.mu.Unlock()
if !found {
http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
return
}
for {
select {
case <-r.Context().Done():
close(seq.quit)
return
case content, ok := <-seq.responses:
if ok {
if err := json.NewEncoder(w).Encode(&CompletionResponse{
Content: content,
}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
close(seq.quit)
return
}
flusher.Flush()
} else {
// Send the final response
if err := json.NewEncoder(w).Encode(&CompletionResponse{
Stop: true,
StoppedLimit: seq.doneReason == "limit",
Timings: Timings{
PromptN: seq.numPromptInputs,
PromptMS: float64(seq.startGenerationTime.Sub(seq.startProcessingTime).Milliseconds()),
PredictedN: seq.numPredicted,
PredictedMS: float64(time.Since(seq.startGenerationTime).Milliseconds()),
},
}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode final response: %v", err), http.StatusInternalServerError)
}
return
}
}
}
}
type EmbeddingRequest struct {
Content string `json:"content"`
CachePrompt bool `json:"cache_prompt"`
}
type EmbeddingResponse struct {
Embedding []float32 `json:"embedding"`
}
func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
var req EmbeddingRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, fmt.Sprintf("bad request: %s", err), http.StatusBadRequest)
return
}
w.Header().Set("Content-Type", "application/json")
slog.Debug("embedding request", "content", req.Content)
seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{embedding: true})
if err != nil {
http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
return
}
// Ensure there is a place to put the sequence, released when removed from s.seqs
if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
if errors.Is(err, context.Canceled) {
slog.Info("aborting embeddings request due to client closing the connection")
} else {
slog.Error("Failed to acquire semaphore", "error", err)
}
return
}
s.mu.Lock()
found := false
for i, sq := range s.seqs {
if sq == nil {
seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
if err != nil {
s.mu.Unlock()
http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
return
}
s.seqs[i] = seq
s.cond.Signal()
found = true
break
}
}
s.mu.Unlock()
if !found {
http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
return
}
embedding := <-seq.embedding
if err := json.NewEncoder(w).Encode(&EmbeddingResponse{
Embedding: embedding,
}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}
}
type HealthResponse struct {
Status string `json:"status"`
Progress float32 `json:"progress"`
}
type ServerStatus int
const (
ServerStatusReady ServerStatus = iota
ServerStatusLoadingModel
ServerStatusError
)
func (s ServerStatus) ToString() string {
switch s {
case ServerStatusReady:
return "ok"
case ServerStatusLoadingModel:
return "loading model"
default:
return "server error"
}
}
func (s *Server) health(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
if err := json.NewEncoder(w).Encode(&HealthResponse{
Status: s.status.ToString(),
Progress: s.progress,
}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}
}
type multiLPath []string
func (m *multiLPath) Set(value string) error {
*m = append(*m, value)
return nil
}
func (m *multiLPath) String() string {
return strings.Join(*m, ", ")
}
func (s *Server) loadModel(
mpath string,
lpath multiLPath,
kvCacheType string,
kvSize int,
multiUserCache bool,
) {
var err error
s.model, err = model.New(mpath)
if err != nil {
panic(err)
}
// TODO(jessegross): LoRA loading
if lpath.String() != "" {
panic("loras are not yet implemented")
}
s.cache, err = NewInputCache(s.model.Backend(), kvCacheType, int32(kvSize), s.parallel, multiUserCache)
if err != nil {
panic(err)
}
s.status = ServerStatusReady
s.ready.Done()
}
func Execute(args []string) error {
fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file")
parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
batchSize := fs.Int("batch-size", 512, "Batch size")
_ = fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
_ = fs.Int("main-gpu", 0, "Main GPU")
_ = fs.Bool("flash-attn", false, "Enable flash attention")
kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
port := fs.Int("port", 8080, "Port to expose the server on")
_ = fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
_ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
_ = fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
var lpaths multiLPath
fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
fs.Usage = func() {
fmt.Fprintf(fs.Output(), "Runner usage\n")
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
return err
}
level := slog.LevelInfo
if *verbose {
level = slog.LevelDebug
}
handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: level,
AddSource: true,
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
if attr.Key == slog.SourceKey {
source := attr.Value.Any().(*slog.Source)
source.File = filepath.Base(source.File)
}
return attr
},
})
slog.SetDefault(slog.New(handler))
slog.Info("starting ollama engine")
// TODO(jessegross): Some system info would be useful
server := &Server{
batchSize: *batchSize,
parallel: *parallel,
seqs: make([]*Sequence, *parallel),
seqsSem: semaphore.NewWeighted(int64(*parallel)),
status: ServerStatusLoadingModel,
}
// TODO(jessegross): Parameters that need to be implemented:
// n-gpu-layers
// main-gpu
// flash-attn
// threads
// no-mmap
// mlock
// tensor-split
/*var tensorSplitFloats []float32
if *tensorSplit != "" {
stringFloats := regexp.MustCompile(",").Split(*tensorSplit, -1)
tensorSplitFloats = make([]float32, 0, len(stringFloats))
for _, s := range stringFloats {
f, _ := strconv.ParseFloat(s, 32)
tensorSplitFloats = append(tensorSplitFloats, float32(f))
}
}*/
server.ready.Add(1)
go server.loadModel(*mpath, lpaths, *kvCacheType, *kvSize, *multiUserCache)
server.cond = sync.NewCond(&server.mu)
ctx, cancel := context.WithCancel(context.Background())
go server.run(ctx)
addr := "127.0.0.1:" + strconv.Itoa(*port)
listener, err := net.Listen("tcp", addr)
if err != nil {
fmt.Println("Listen error:", err)
cancel()
return err
}
defer listener.Close()
mux := http.NewServeMux()
mux.HandleFunc("/embedding", server.embeddings)
mux.HandleFunc("/completion", server.completion)
mux.HandleFunc("/health", server.health)
httpServer := http.Server{
Handler: mux,
}
log.Println("Server listening on", addr)
if err := httpServer.Serve(listener); err != nil {
log.Fatal("server error:", err)
return err
}
cancel()
return nil
}

View File

@@ -1,4 +1,4 @@
package runner
package oldrunner
import (
"errors"

View File

@@ -1,4 +1,4 @@
package runner
package oldrunner
import (
"testing"

View File

@@ -1,4 +1,4 @@
package runner
package oldrunner
import (
"errors"

View File

@@ -1,4 +1,4 @@
package runner
package oldrunner
import (
"reflect"

View File

@@ -1,4 +1,4 @@
package runner
package oldrunner
import (
"context"
@@ -24,6 +24,7 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/runner/common"
)
// input is an element of the prompt to process, either
@@ -443,7 +444,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
s.lc.Synchronize()
}
var totalSamplingTime time.Duration
for i, seq := range s.seqs {
if seq == nil {
continue
@@ -478,12 +478,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
}
// sample a token
samplingStart := time.Now()
token := seq.samplingCtx.Sample(s.lc, seq.iBatch)
seq.samplingCtx.Accept(token, true)
samplingTime := time.Since(samplingStart)
totalSamplingTime += samplingTime
slog.Info("sampling time", "time", samplingTime)
piece := s.model.TokenToPiece(token)
seq.numPredicted++
@@ -503,12 +499,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
seq.pendingResponses = append(seq.pendingResponses, piece)
sequence := strings.Join(seq.pendingResponses, "")
if ok, stop := findStop(sequence, seq.stop); ok {
if ok, stop := common.FindStop(sequence, seq.stop); ok {
slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
var tokenTruncated bool
origLen := len(seq.pendingResponses)
seq.pendingResponses, tokenTruncated = truncateStop(seq.pendingResponses, stop)
seq.pendingResponses, tokenTruncated = common.TruncateStop(seq.pendingResponses, stop)
newLen := len(seq.pendingResponses)
// Update the cache based on the tokens that will be returned:
@@ -529,11 +525,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
continue
}
if containsStopSuffix(sequence, seq.stop) {
if common.ContainsStopSuffix(sequence, seq.stop) {
continue
}
if incompleteUnicode(sequence) {
if common.IncompleteUnicode(sequence) {
continue
}
@@ -640,7 +636,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
samplingParams.Seed = uint32(req.Seed)
samplingParams.Grammar = req.Grammar
start := time.Now()
seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
numPredict: req.NumPredict,
stop: req.Stop,
@@ -648,7 +643,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
samplingParams: &samplingParams,
embedding: false,
})
slog.Info("new sequence created", "duration", time.Since(start))
if err != nil {
http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
return
@@ -892,9 +886,6 @@ func (s *Server) loadModel(
}
func Execute(args []string) error {
if args[0] == "runner" {
args = args[1:]
}
fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file")
ppath := fs.String("mmproj", "", "Path to projector binary file")

24
runner/runner.go Normal file
View File

@@ -0,0 +1,24 @@
package runner
import (
"github.com/ollama/ollama/runner/newrunner"
"github.com/ollama/ollama/runner/oldrunner"
)
func Execute(args []string) error {
if args[0] == "runner" {
args = args[1:]
}
var newRunner bool
if args[0] == "--new-runner" {
args = args[1:]
newRunner = true
}
if newRunner {
return newrunner.Execute(args)
} else {
return oldrunner.Execute(args)
}
}

View File

@@ -1,171 +0,0 @@
package sample
import (
"bytes"
"encoding/json"
"errors"
)
// Schema holds a JSON schema.
type Schema struct {
// Name is the name of the property. For the parent/root property, this
// is "root". For child properties, this is the name of the property.
Name string `json:"-"`
// Type is the type of the property.
//
// TODO: Union types (e.g. make this a []string).
Type string
// PrefixItems is a list of schemas for each item in a tuple. By
// default, the tuple is "closed." unless Items is set to true or a
// valid Schema.
PrefixItems []*Schema
// Items is the schema for each item in a list.
//
// If it is missing, or its JSON value is "null" or "false", it is nil.
// If the JSON value is "true", it is set to the empty Schema. If the
// JSON value is an object, it will be decoded as a Schema.
Items *Schema
// MinItems specifies the minimum number of items allowed in a list.
MinItems int
// MaxItems specifies the maximum number of items allowed in a list.
MaxItems int
// Properties is the schema for each property of an object.
Properties []*Schema
// Format is the format of the property. This is used to validate the
// property against a specific format.
//
// It is the callers responsibility to validate the property against
// the format.
Format string
// Minimum specifies the minimum value for numeric properties.
Minimum float64
// Maximum specifies the maximum value for numeric properties.
Maximum float64
// Enum is a list of valid values for the property.
Enum []json.RawMessage
}
func (s *Schema) UnmarshalJSON(data []byte) error {
type S Schema
w := struct {
Properties props
Items items
*S
}{
S: (*S)(s),
}
if err := json.Unmarshal(data, &w); err != nil {
return err
}
if w.Items.set {
s.Items = &w.Items.Schema
}
s.Properties = w.Properties
return nil
}
type items struct {
Schema
set bool
}
func (s *items) UnmarshalJSON(data []byte) error {
switch b := data[0]; b {
case 't':
*s = items{set: true}
case '{':
type I items
if err := json.Unmarshal(data, (*I)(s)); err != nil {
return err
}
s.set = true
case 'n', 'f':
default:
return errors.New("invalid Items")
}
return nil
}
// EffectiveType returns the effective type of the schema. If the Type field is
// not empty, it is returned; otherwise:
//
// - If the schema has both Properties and Items, it returns an empty string.
// - If the schema has Properties, it returns "object".
// - If the schema has Items, it returns "array".
// - If the schema has neither Properties nor Items, it returns "value".
//
// The returned string is never empty.
func (d *Schema) EffectiveType() string {
if d.Type == "" {
if len(d.Properties) > 0 {
return "object"
}
if len(d.PrefixItems) > 0 || d.Items != nil {
return "array"
}
return "value"
}
return d.Type
}
// props is an ordered list of properties. The order of the properties
// is the order in which they were defined in the schema.
type props []*Schema
var _ json.Unmarshaler = (*props)(nil)
func (v *props) UnmarshalJSON(data []byte) error {
if len(data) == 0 {
return nil
}
if data[0] != '{' {
return errors.New("expected object")
}
d := json.NewDecoder(bytes.NewReader(data))
// TODO(bmizerany): Consider DisallowUnknownFields. Currently, we, like
// llama.cpp, ignore unknown fields, which could be lead to unexpected
// behavior for clients of this package, since they may not be aware
// that "additionalFields", "itemsPrefix", etc, are being ignored.
//
// For now, just do what llama.cpp does.
t, err := d.Token()
if err != nil {
return err
}
if t != json.Delim('{') {
return errors.New("expected object")
}
for d.More() {
// Use the first token (map key) as the property name, then
// decode the rest of the object fields into a Schema and
// append.
t, err := d.Token()
if err != nil {
return err
}
if t == json.Delim('}') {
return nil
}
s := &Schema{
Name: t.(string),
}
if err := d.Decode(s); err != nil {
return err
}
*v = append(*v, s)
}
return nil
}

View File

@@ -1,32 +0,0 @@
// Feedback from code review:
// pushdown_automata.go:
// 1. The BuildGraph function is quite long and could be split into smaller, more focused functions
// 2. Consider using constants instead of magic runes like rune(-1) for sentinel values
// 3. The state machine transitions could be defined more declaratively, perhaps in a config
// 4. The stringInvalidRunes list needs to handle escape sequences properly
// 5. The graph building could be optimized to avoid duplicate nodes/transitions
// 6. Consider adding validation for max nesting depth of braces/brackets
// 7. The CreateMask function is doing a lot - could be split into smaller pieces
// 8. isRuneValid has a "garbage interface" per TODO - needs cleaner design
// pushdown_runner.go:
// 1. The Apply method has a lot of duplicated logic around EOS handling
// 2. The UpdateState method could use more granular error messages
// 3. The braceStack validation could be moved to a separate validator
// 4. Consider adding max length limits for strings/numbers
// 5. The stateCounter isn't being used effectively yet
// 6. Need to add penalties for staying in same state too long
// 7. The maskLogits function could be optimized to avoid allocations
// 8. Missing proper cleanup/reset functionality
// 9. Error handling could be more consistent throughout
// 10. Consider adding debug logging levels instead of raw fmt.Println
// General improvements needed:
// - More comprehensive testing, especially edge cases
// - Better documentation of state machine transitions
// - Performance optimization for large inputs
// - Memory usage optimization for the graph structure
// - Cleaner interfaces between components
// - More robust error handling and recovery

View File

@@ -1,11 +0,0 @@
package sample
// type fusedMaskSampler struct{}
// func FusedMaskSampler() Sampler {
// return fusedMaskSampler{}
// }
// func (f fusedMaskSampler) Sample(logits []float64) (int, error) {
// return int(logits[0]), nil
// }

View File

@@ -8,19 +8,6 @@ func Greedy() Sampler {
return greedy{}
}
func (s greedy) Sample(logits []float32, transforms ...Transform) (int, error) {
logits64 := make([]float64, len(logits))
for i, v := range logits {
logits64[i] = float64(v)
}
var err error
for _, t := range transforms {
logits64, err = t.Apply(logits64)
if err != nil {
return -1, err
}
}
return floats.MaxIdx(logits64), nil
func (s greedy) Sample(t []float64) ([]float64, error) {
return []float64{float64(floats.MaxIdx(t))}, nil
}

View File

@@ -1,296 +0,0 @@
package sample
import (
"slices"
"github.com/ollama/ollama/model"
)
var stringInvalidRunes = []rune{'\\', '\n', '\t', '{', '}', ':', ','}
var intInvalidRunes = []rune{'e', 'E', ' ', '\n', '\t', '{', '}', ':', ',', '"'}
var validIntRunes = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-'}
var validNumberRunes = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', '-', '+', 'e', 'E'}
var validBoolRunes = []rune{'t', 'r', 'u', 'e', 'f', 'a', 'l', 's', 'e'}
var validNullRunes = []rune{'n', 'u', 'l', 'l'}
type PDANode struct {
State JSONState
TransitionEdges map[rune]*PDANode
MaskTokenIDToNode map[int32]JSONState
}
func NewPDANode(state JSONState) *PDANode {
return &PDANode{
State: state,
TransitionEdges: make(map[rune]*PDANode),
MaskTokenIDToNode: make(map[int32]JSONState),
}
}
func BuildGraph(proc model.TextProcessor) (*PDANode, map[JSONState]*PDANode, error) {
stateToNodeMap := make(map[JSONState]*PDANode)
startNode := NewPDANode(StateStart)
stateToNodeMap[StateStart] = startNode
objNode := NewPDANode(StateInObject)
stateToNodeMap[StateInObject] = objNode
objEndNode := NewPDANode(StateInObjectEnd)
stateToNodeMap[StateInObjectEnd] = objEndNode
objKeyNode := NewPDANode(StateInObjectKey)
stateToNodeMap[StateInObjectKey] = objKeyNode
objKeyEndNode := NewPDANode(StateInObjectKeyEnd)
stateToNodeMap[StateInObjectKeyEnd] = objKeyEndNode
colonNode := NewPDANode(StateInColon)
stateToNodeMap[StateInColon] = colonNode
commaNode := NewPDANode(StateInComma)
stateToNodeMap[StateInComma] = commaNode
newlineNode := NewPDANode(StateInNewline)
stateToNodeMap[StateInNewline] = newlineNode
spaceNode := NewPDANode(StateInSpace)
stateToNodeMap[StateInSpace] = spaceNode
spaceObjNode := NewPDANode(StateInObjSpace)
stateToNodeMap[StateInObjSpace] = spaceObjNode
tabNode := NewPDANode(StateInTab)
stateToNodeMap[StateInTab] = tabNode
stringNode := NewPDANode(StateInString)
stateToNodeMap[StateInString] = stringNode
stringEndNode := NewPDANode(StateInStringEnd)
stateToNodeMap[StateInStringEnd] = stringEndNode
listNode := NewPDANode(StateInList)
stateToNodeMap[StateInList] = listNode
listCommaNode := NewPDANode(StateInListComma)
stateToNodeMap[StateInListComma] = listCommaNode
listEndNode := NewPDANode(StateListEnd)
stateToNodeMap[StateListEnd] = listEndNode
numberNode := NewPDANode(StateInNumber)
stateToNodeMap[StateInNumber] = numberNode
boolNode := NewPDANode(StateInBool)
stateToNodeMap[StateInBool] = boolNode
nullNode := NewPDANode(StateInNull)
stateToNodeMap[StateInNull] = nullNode
// Defined with structured outputs only
intNode := NewPDANode(StateInInt)
stateToNodeMap[StateInInt] = intNode
// TODO:
// consider adding a node to just point to values, could be good to compute that
// mask rather than many different nodes
// Connect nodes
// TODO: if all are single tokens then this can just be connected instead of defining the token
startNode.TransitionEdges['{'] = objNode
objNode.TransitionEdges['"'] = objKeyNode
objNode.TransitionEdges['\n'] = newlineNode
// objNode.TransitionEdges['\t'] = tabNode
newlineNode.TransitionEdges['"'] = objKeyNode
newlineNode.TransitionEdges['\t'] = tabNode
tabNode.TransitionEdges['"'] = objKeyNode
// tabNode.TransitionEdges['\t'] = tabNode
objKeyNode.TransitionEdges[rune(-1)] = objKeyNode
objKeyNode.TransitionEdges['"'] = objKeyEndNode
objKeyEndNode.TransitionEdges[':'] = colonNode
objEndNode.TransitionEdges[' '] = spaceNode
// where values should be
// this could be combined but the probs might change, we're alr doing a skip ahead
colonNode.TransitionEdges[' '] = spaceNode
// Leads to a value
spaceNode.TransitionEdges['"'] = stringNode
spaceNode.TransitionEdges['['] = listNode
spaceNode.TransitionEdges['{'] = objNode
for _, r := range validNumberRunes {
spaceNode.TransitionEdges[r] = numberNode
}
for _, r := range validBoolRunes {
spaceNode.TransitionEdges[r] = boolNode
}
for _, r := range validNullRunes {
spaceNode.TransitionEdges[r] = nullNode
}
// Values
// string node
stringNode.TransitionEdges[rune(-1)] = stringNode
stringNode.TransitionEdges['"'] = stringEndNode
stringEndNode.TransitionEdges[','] = commaNode
stringEndNode.TransitionEdges['}'] = objEndNode
stringEndNode.TransitionEdges[']'] = listEndNode
// TODO: add counters for allowable number of decimals, e, E, etc
// number node
for _, r := range validNumberRunes {
numberNode.TransitionEdges[r] = numberNode
}
numberNode.TransitionEdges[','] = commaNode
numberNode.TransitionEdges['}'] = objEndNode
numberNode.TransitionEdges[']'] = listEndNode
for _, r := range validBoolRunes {
boolNode.TransitionEdges[r] = boolNode
}
// list node
listNode.TransitionEdges[','] = commaNode
listNode.TransitionEdges['"'] = stringNode
// squash states to a value
for _, r := range validNumberRunes {
listNode.TransitionEdges[r] = numberNode
}
for _, r := range validBoolRunes {
listNode.TransitionEdges[r] = boolNode
}
for _, r := range validNullRunes {
listNode.TransitionEdges[r] = nullNode
}
// null node
for _, r := range validNullRunes {
nullNode.TransitionEdges[r] = nullNode
}
nullNode.TransitionEdges[','] = commaNode
nullNode.TransitionEdges['}'] = objEndNode
nullNode.TransitionEdges[']'] = listEndNode
// list comma
// should point to values
listCommaNode.TransitionEdges['"'] = stringNode
listCommaNode.TransitionEdges[' '] = listCommaNode
listCommaNode.TransitionEdges['{'] = objNode
listCommaNode.TransitionEdges['\n'] = newlineNode
for _, r := range validNumberRunes {
listCommaNode.TransitionEdges[r] = numberNode
}
for _, r := range validBoolRunes {
listCommaNode.TransitionEdges[r] = boolNode
}
for _, r := range validNullRunes {
listCommaNode.TransitionEdges[r] = nullNode
}
// bool node
for _, r := range validBoolRunes {
boolNode.TransitionEdges[r] = boolNode
}
boolNode.TransitionEdges['}'] = objEndNode
boolNode.TransitionEdges[']'] = listEndNode
boolNode.TransitionEdges[','] = commaNode
listEndNode.TransitionEdges['}'] = objEndNode
listEndNode.TransitionEdges[','] = commaNode
commaNode.TransitionEdges['{'] = objNode
commaNode.TransitionEdges['\n'] = newlineNode
commaNode.TransitionEdges['\t'] = tabNode
commaNode.TransitionEdges['"'] = objKeyNode
commaNode.TransitionEdges[' '] = spaceObjNode
spaceObjNode.TransitionEdges['"'] = objKeyNode
return startNode, stateToNodeMap, nil
}
func PreComputeValidStates(stateToNodeMap map[JSONState]*PDANode, proc model.TextProcessor) error {
vocab := proc.GetVocabulary()
decodedToks := make([]string, len(vocab.Values))
for i := range vocab.Values {
token, err := proc.Decode([]int32{int32(i)})
if err != nil {
return err
}
decodedToks[i] = token
}
var err error
for _, node := range stateToNodeMap {
for i := range vocab.Values {
token := decodedToks[i]
// Skip EOS/BOS tokens and empty tokens since they are not valid in JSON
if proc.Is(uint32(i), model.SpecialEOS) || proc.Is(uint32(i), model.SpecialBOS) || token == "" {
continue
}
valid := true
curNode := node
consumedSpecialRunes := make(map[rune]bool)
for _, r := range token {
valid, curNode, err = isRuneValid(r, curNode, consumedSpecialRunes)
if err != nil {
return err
}
if !valid {
break
}
}
if valid {
node.MaskTokenIDToNode[int32(i)] = curNode.State
}
}
}
return nil
}
func isRuneValid(r rune, curNode *PDANode, consumedSpecialRunes map[rune]bool) (bool, *PDANode, error) {
if consumedSpecialRunes[r] {
return false, nil, nil
}
specialRune := slices.Contains(stringInvalidRunes, r)
if specialRune {
if curNode.State == StateInString || curNode.State == StateInObjectKey {
return false, nil, nil
}
}
// Check for specific rune transition
if nextNode, ok := curNode.TransitionEdges[r]; ok {
if specialRune {
if curNode.State == nextNode.State {
return false, nil, nil
}
// fmt.Println("special rune", r, "consumed")
consumedSpecialRunes[r] = true
}
return true, nextNode, nil
}
// Check for sentinel value - if present, any rune is valid
if nextNode, ok := curNode.TransitionEdges[rune(-1)]; ok {
return true, nextNode, nil
}
return false, nil, nil
}

View File

@@ -1,160 +0,0 @@
package sample
import (
"fmt"
)
type JSONState int
const (
StateStart JSONState = iota
StateInObject
StateInObjectKey
StateInStructuredKey
StateInStructuredValue
StateNewline
StateTab
StateSpace
StateInString
StateInInt
StateInFloat
StateInBool
StateInNull
StateInColon
StateInComma
StateInTab
StateInSpaceToValue
StateInSpaceEndValue
StateInNewlineEndValue
StateInObjSpace
StateInList
StateInListComma
StateInValue
StateInValueEnd
StateInListEnd
StateInListObjectEnd
StateInNewline
StateInNumber
StateInNumberEnd
StateInStringEnd
StateInObjectKeyEnd
StateTerminate
StateInObjectEnd
StateTransitioningToTerminate
StateInListStartJSON
)
var JSONStates = []JSONState{
StateStart,
StateInObject,
StateInObjectKey,
StateInStructuredKey,
StateInStructuredValue,
StateNewline,
StateTab,
StateSpace,
StateInString,
StateInInt,
StateInFloat,
StateInBool,
StateInNull,
StateInColon,
StateInComma,
StateInTab,
StateInSpaceToValue,
StateInSpaceEndValue,
StateInNewlineEndValue,
StateInObjSpace,
StateInListStartJSON,
StateInList,
StateInListComma,
StateInValue,
StateInValueEnd,
StateInListEnd,
StateInListObjectEnd,
StateInNewline,
StateInNumber,
StateInNumberEnd,
StateInStringEnd,
StateInObjectKeyEnd,
StateTerminate,
StateInObjectEnd,
StateTransitioningToTerminate,
}
func (s JSONState) String() string {
switch s {
case StateStart:
return "StateStart"
case StateInObject:
return "StateInObject"
case StateInObjectKey:
return "StateInObjectKey"
case StateInStructuredKey:
return "StateInStructuredKey"
case StateInStructuredValue:
return "StateInStructuredValue"
case StateNewline:
return "StateNewline"
case StateTab:
return "StateTab"
case StateSpace:
return "StateSpace"
case StateInString:
return "StateInString"
case StateInInt:
return "StateInInt"
case StateInFloat:
return "StateInFloat"
case StateInBool:
return "StateInBool"
case StateInNull:
return "StateInNull"
case StateInColon:
return "StateInColon"
case StateInComma:
return "StateInComma"
case StateInTab:
return "StateInTab"
case StateInSpaceToValue:
return "StateInSpaceToValue"
case StateInSpaceEndValue:
return "StateInSpaceEndValue"
case StateInNewlineEndValue:
return "StateInNewlineEndValue"
case StateInObjSpace:
return "StateInObjSpace"
case StateInList:
return "StateInList"
case StateInListComma:
return "StateInListComma"
case StateInValue:
return "StateInValue"
case StateInValueEnd:
return "StateInValueEnd"
case StateInListEnd:
return "StateInListEnd"
case StateInListObjectEnd:
return "StateInListObjectEnd"
case StateInNewline:
return "StateInNewline"
case StateInNumber:
return "StateInNumber"
case StateInNumberEnd:
return "StateInNumberEnd"
case StateInStringEnd:
return "StateInStringEnd"
case StateInObjectKeyEnd:
return "StateInObjectKeyEnd"
case StateTerminate:
return "StateTerminate"
case StateInObjectEnd:
return "StateInObjectEnd"
case StateTransitioningToTerminate:
return "StateTransitioningToTerminate"
case StateInListStartJSON:
return "StateInListStartJSON"
default:
return fmt.Sprintf("Unknown state: %d", s)
}
}

View File

@@ -1,326 +0,0 @@
package sample
import (
"fmt"
"slices"
"github.com/ollama/ollama/model"
)
/*
Key JSON rules to consider:
1. Whitespace handling:
- Need to handle all valid JSON whitespace characters (\r, spaces between tokens)
- Current code only handles some whitespace cases
2. Number validation:
- Need proper validation for special number cases like -0
- Should handle .5 style decimals
- Need limits on scientific notation (e, E)
3. String escaping:
- Currently marks \ as invalid but should allow escaped sequences:
- \"
- \n
- \u1234 unicode escapes
4. Empty object/array transitions:
- Direct {} and [] cases could be more explicit
- Need clear transitions for these edge cases
5. Nested depth limits:
- No protection against excessive nesting
- Could cause stack overflow with deeply nested structures
*/
// TODO: / should be valid but an escape character
var stringInvalidRunes = []rune{'\\', '\n', '\t', '{', '}', ':', ',', '/'}
var (
intInvalidRunes = []rune{'e', 'E', ' ', '\n', '\t', '{', '}', ':', ',', '"'}
validIntRunes = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-'}
)
var validNumberRunes = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', '-', '+', 'e', 'E'}
var validBoolRunes = []rune{'t', 'r', 'u', 'e', 'f', 'a', 'l', 's', 'e'}
var validNullRunes = []rune{'n', 'u', 'l', 'l'}
type PDA struct {
State JSONState
TransitionEdges map[rune]*PDA
MaskTokenIDToNode map[int32]*PDA
}
func NewPDANode(state JSONState) *PDA {
return &PDA{
State: state,
TransitionEdges: make(map[rune]*PDA),
MaskTokenIDToNode: make(map[int32]*PDA),
}
}
type PDAGraphBuilder struct {
proc model.TextProcessor
decodedToks []string
stateToNodeMap map[JSONState]*PDA
tokenToStatesMap map[int32][]JSONState
}
func (b *PDAGraphBuilder) BuildGraph() error {
stateToNodeMap := make(map[JSONState]*PDA)
for _, state := range JSONStates {
stateToNodeMap[state] = NewPDANode(state)
}
stateToNodeMap[StateStart].TransitionEdges['{'] = stateToNodeMap[StateInObject]
stateToNodeMap[StateStart].TransitionEdges['['] = stateToNodeMap[StateInListStartJSON]
// TODO: update naming here - and revisit values
stateToNodeMap[StateInListStartJSON].TransitionEdges['{'] = stateToNodeMap[StateInObject]
stateToNodeMap[StateInListStartJSON].TransitionEdges['['] = stateToNodeMap[StateInListStartJSON]
stateToNodeMap[StateInObject].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
stateToNodeMap[StateInObject].TransitionEdges['\n'] = stateToNodeMap[StateInNewline]
stateToNodeMap[StateInObject].TransitionEdges[' '] = stateToNodeMap[StateInObjSpace]
stateToNodeMap[StateInObject].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
// new line
stateToNodeMap[StateInNewline].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
stateToNodeMap[StateInNewline].TransitionEdges['\t'] = stateToNodeMap[StateInTab]
stateToNodeMap[StateInNewline].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
stateToNodeMap[StateInNewline].TransitionEdges[' '] = stateToNodeMap[StateInObjSpace]
// stateToNodeMap[StateInNewline].TransitionEdges['{'] = stateToNodeMap[StateInObject]
// new line end value
// stateToNodeMap[StateInNewlineEndValue].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
stateToNodeMap[StateInNewlineEndValue].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
stateToNodeMap[StateInNewlineEndValue].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
stateToNodeMap[StateInObjSpace].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
stateToNodeMap[StateInObjSpace].TransitionEdges['\n'] = stateToNodeMap[StateInNewline]
// TODO: see if this is needed for formatting
stateToNodeMap[StateInObjSpace].TransitionEdges[' '] = stateToNodeMap[StateInObjSpace]
stateToNodeMap[StateInTab].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
stateToNodeMap[StateInObjectKey].TransitionEdges[rune(-1)] = stateToNodeMap[StateInObjectKey]
stateToNodeMap[StateInObjectKey].TransitionEdges['"'] = stateToNodeMap[StateInObjectKeyEnd]
stateToNodeMap[StateInObjectKeyEnd].TransitionEdges[':'] = stateToNodeMap[StateInColon]
stateToNodeMap[StateInObjectEnd].TransitionEdges[','] = stateToNodeMap[StateInComma]
stateToNodeMap[StateInObjectEnd].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
// where values should be
// this could be combined but the probl might change, we're alr doing a skip ahead
stateToNodeMap[StateInColon].TransitionEdges[' '] = stateToNodeMap[StateInSpaceToValue]
stateToNodeMap[StateInColon].TransitionEdges['\n'] = stateToNodeMap[StateInSpaceToValue]
stateToNodeMap[StateInColon].TransitionEdges['['] = stateToNodeMap[StateInList]
stateToNodeMap[StateInColon].TransitionEdges['{'] = stateToNodeMap[StateInObject]
addValueConnections(stateToNodeMap[StateInColon], stateToNodeMap)
// Leads to a value
stateToNodeMap[StateInSpaceToValue].TransitionEdges['['] = stateToNodeMap[StateInList]
stateToNodeMap[StateInSpaceToValue].TransitionEdges['{'] = stateToNodeMap[StateInObject]
addValueConnections(stateToNodeMap[StateInSpaceToValue], stateToNodeMap)
stateToNodeMap[StateInSpaceToValue].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
stateToNodeMap[StateInSpaceToValue].TransitionEdges['\n'] = stateToNodeMap[StateInSpaceToValue]
// Values
// string node
stateToNodeMap[StateInString].TransitionEdges[rune(-1)] = stateToNodeMap[StateInString]
stateToNodeMap[StateInString].TransitionEdges['"'] = stateToNodeMap[StateInStringEnd]
// String end node
addEnds(stateToNodeMap[StateInStringEnd], stateToNodeMap)
// stateToNodeMap[StateInStringEnd].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
stateToNodeMap[StateInStringEnd].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
// TODO: add counters for allowable number of decimals, e, E, etc
// number node
for _, r := range validNumberRunes {
stateToNodeMap[StateInNumber].TransitionEdges[r] = stateToNodeMap[StateInNumber]
}
addEnds(stateToNodeMap[StateInNumber], stateToNodeMap)
// stateToNodeMap[StateInNumber].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
stateToNodeMap[StateInNumber].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
// list node
stateToNodeMap[StateInList].TransitionEdges[','] = stateToNodeMap[StateInComma]
stateToNodeMap[StateInList].TransitionEdges['{'] = stateToNodeMap[StateInObject]
stateToNodeMap[StateInList].TransitionEdges[' '] = stateToNodeMap[StateInList]
stateToNodeMap[StateInList].TransitionEdges['\n'] = stateToNodeMap[StateInList]
// early end
stateToNodeMap[StateInList].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
// list end node
stateToNodeMap[StateInListEnd].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
// stateToNodeMap[StateInListEnd].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
stateToNodeMap[StateInListEnd].TransitionEdges[','] = stateToNodeMap[StateInComma]
stateToNodeMap[StateInListEnd].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
// empty list
stateToNodeMap[StateInList].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
addValueConnections(stateToNodeMap[StateInList], stateToNodeMap)
// null node
for _, r := range validNullRunes {
stateToNodeMap[StateInNull].TransitionEdges[r] = stateToNodeMap[StateInNull]
}
addEnds(stateToNodeMap[StateInNull], stateToNodeMap)
stateToNodeMap[StateInNull].TransitionEdges[' '] = stateToNodeMap[StateInSpaceToValue]
stateToNodeMap[StateInNull].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
// list comma
// should point to values
stateToNodeMap[StateInListComma].TransitionEdges[' '] = stateToNodeMap[StateInListComma]
stateToNodeMap[StateInListComma].TransitionEdges['{'] = stateToNodeMap[StateInObject]
stateToNodeMap[StateInListComma].TransitionEdges['\n'] = stateToNodeMap[StateInList]
stateToNodeMap[StateInListComma].TransitionEdges[' '] = stateToNodeMap[StateInList]
stateToNodeMap[StateInListComma].TransitionEdges['\t'] = stateToNodeMap[StateInList]
addValueConnections(stateToNodeMap[StateInListComma], stateToNodeMap)
// list object end
stateToNodeMap[StateInListObjectEnd].TransitionEdges[','] = stateToNodeMap[StateInListComma]
stateToNodeMap[StateInListObjectEnd].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
// TODO: not sure if this is needed
stateToNodeMap[StateInListObjectEnd].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
// bool node
for _, r := range validBoolRunes {
stateToNodeMap[StateInBool].TransitionEdges[r] = stateToNodeMap[StateInBool]
}
stateToNodeMap[StateInBool].TransitionEdges['\n'] = stateToNodeMap[StateInNewline]
addEnds(stateToNodeMap[StateInBool], stateToNodeMap)
// stateToNodeMap[StateInBool].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
stateToNodeMap[StateInBool].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
// comma node
stateToNodeMap[StateInComma].TransitionEdges['{'] = stateToNodeMap[StateInObject]
stateToNodeMap[StateInComma].TransitionEdges['\n'] = stateToNodeMap[StateInNewline]
stateToNodeMap[StateInComma].TransitionEdges['"'] = stateToNodeMap[StateInObjectKey]
stateToNodeMap[StateInComma].TransitionEdges[' '] = stateToNodeMap[StateInObjSpace]
// todo: review this space transition
// stateToNodeMap[StateInComma].TransitionEdges[' '] = stateToNodeMap[StateInSpaceToValue]
// space end value
// stateToNodeMap[StateInSpaceEndValue].TransitionEdges[' '] = stateToNodeMap[StateInSpaceEndValue]
stateToNodeMap[StateInSpaceEndValue].TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
stateToNodeMap[StateInSpaceEndValue].TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
stateToNodeMap[StateInSpaceEndValue].TransitionEdges['\n'] = stateToNodeMap[StateInNewlineEndValue]
b.stateToNodeMap = stateToNodeMap
if err := b.preComputeValidStates(); err != nil {
return err
}
return nil
}
func addEnds(node *PDA, stateToNodeMap map[JSONState]*PDA) {
node.TransitionEdges[','] = stateToNodeMap[StateInComma]
node.TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
node.TransitionEdges[']'] = stateToNodeMap[StateInListEnd]
}
func addValueConnections(node *PDA, stateToNodeMap map[JSONState]*PDA) {
node.TransitionEdges['"'] = stateToNodeMap[StateInString]
for _, r := range validNumberRunes {
node.TransitionEdges[r] = stateToNodeMap[StateInNumber]
}
// TODO(parthsareen): force the output and shift similar to structured outputs
node.TransitionEdges['t'] = stateToNodeMap[StateInBool]
node.TransitionEdges['f'] = stateToNodeMap[StateInBool]
node.TransitionEdges['n'] = stateToNodeMap[StateInNull]
}
func (b *PDAGraphBuilder) preComputeValidStates() error {
for _, node := range b.stateToNodeMap {
// if node.State == StateInObjectKey {
// if len(b.stateToNodeMap[StateInString].MaskTokenIDToNode) > 0 {
// b.stateToNodeMap[StateInObjectKey].MaskTokenIDToNode = b.stateToNodeMap[StateInString].MaskTokenIDToNode
// fmt.Println("copying string mask to object key mask")
// }
// }
if err := b.CreateMask(node); err != nil {
return err
}
}
return nil
}
func (b *PDAGraphBuilder) preComputeTokenToStatesMap() error {
// TODO: make can be somewhere else too
b.tokenToStatesMap = make(map[int32][]JSONState)
for i, t := range b.decodedToks {
for _, r := range t {
if r == '"' {
b.tokenToStatesMap[int32(i)] = append(b.tokenToStatesMap[int32(i)], StateInString)
}
}
}
return nil
}
// TODO: the mask for obj key and string should be the same?
func (b *PDAGraphBuilder) CreateMask(node *PDA) error {
if node == nil {
return fmt.Errorf("node cannot be nil")
}
for i := range b.decodedToks {
token := b.decodedToks[i]
// Skip EOS/BOS tokens and empty tokens since they are not valid in JSON
if b.proc.Is(uint32(i), model.SpecialEOS) || b.proc.Is(uint32(i), model.SpecialBOS) || token == "" || token == "\"\"" {
continue
}
curNode := node
valid := true
consumedSpecialRunes := make(map[rune]bool)
for _, r := range token {
curNode, valid = isRuneValid(r, curNode, consumedSpecialRunes)
if curNode == nil || !valid {
break
}
}
if valid {
node.MaskTokenIDToNode[int32(i)] = curNode
}
}
return nil
}
func isRuneValid(r rune, curNode *PDA, consumedSpecialRunes map[rune]bool) (*PDA, bool) {
if consumedSpecialRunes[r] {
return nil, false
}
specialRune := slices.Contains(stringInvalidRunes, r)
if specialRune {
if curNode.State == StateInString || curNode.State == StateInObjectKey {
return nil, false
}
}
// Check for specific rune transition
if nextNode, ok := curNode.TransitionEdges[r]; ok {
// fmt.Println("next node", nextNode)
if specialRune {
if curNode.State == nextNode.State {
return nil, false
}
consumedSpecialRunes[r] = true
}
return nextNode, true
}
// Check for sentinel value - if present, any rune is valid
if nextNode, ok := curNode.TransitionEdges[rune(-1)]; ok {
return nextNode, true
}
return nil, false
}

View File

@@ -1,255 +0,0 @@
package sample
import (
"fmt"
"math"
"runtime"
"time"
"github.com/ollama/ollama/model"
)
// TODO: safety in case of invalid json
// TODO: partial JSON matching?
// TODO: interfaces to cleanup with return values
// TODO this interface shouldn't be the sampler - should just use Sampler
// TODO: add penalties for string \n stuff
// TODO: minimize number of fwd passes if there is only one match
// TODO: greedy sample initially and then backtrack if no match
type PushdownSampler struct {
PDAGraphBuilder
curNode *PDA
braceStack []rune
stateCounter uint32
}
// graph should be built once and reused per tokenizer
func NewPushdownSampler(proc model.TextProcessor) (*PushdownSampler, error) {
start := time.Now()
fmt.Println("--------------------------------")
fmt.Println("PDA sampler")
fmt.Println("--------------------------------")
var m runtime.MemStats
runtime.ReadMemStats(&m)
before := m.Alloc
fmt.Printf("Alloc = %.2f MB\n", float64(before)/(1024*1024))
vocab := proc.GetVocabulary()
decodedToks := make([]string, len(vocab.Values))
for i := range vocab.Values {
token, err := proc.Decode([]int32{int32(i)})
if err != nil {
return nil, err
}
decodedToks[i] = token
}
gb := &PDAGraphBuilder{
proc: proc,
decodedToks: decodedToks,
}
if err := gb.BuildGraph(); err != nil {
return nil, err
}
runtime.ReadMemStats(&m)
after := m.Alloc
fmt.Printf("Alloc = %.2f MB\n", float64(after)/(1024*1024))
fmt.Printf("Graph memory usage = %.2f MB\n", float64(after-before)/(1024*1024))
fmt.Printf("Graph build time = %v\n", time.Since(start))
// TODO: this can be simplified
return &PushdownSampler{
curNode: gb.stateToNodeMap[StateStart],
PDAGraphBuilder: *gb,
braceStack: []rune{},
stateCounter: 0,
}, nil
}
// TODO: need to add resampling logic if the first sample was not good
// greedy sample + backtrack?
func (s *PushdownSampler) Apply(logits []float64) ([]float64, error) {
switch s.curNode.State {
case StateInString:
return s.maskLogits(logits, s.curNode)
case StateInListEnd:
// force finish if no braces left
if len(s.braceStack) == 0 {
s.curNode = NewPDANode(StateTerminate)
return forceFinish(s, logits)
}
logits, err := s.maskLogits(logits, s.curNode)
if err != nil {
return nil, err
}
return logits, nil
case StateTerminate:
return forceFinish(s, logits)
case StateInObjectEnd:
// force finish if no braces left
if len(s.braceStack) == 0 {
s.curNode = NewPDANode(StateTerminate)
return forceFinish(s, logits)
}
peek := s.braceStack[len(s.braceStack)-1]
if peek == rune('[') {
s.curNode = s.stateToNodeMap[StateInListObjectEnd]
}
logits, err := s.maskLogits(logits, s.curNode)
if err != nil {
return nil, err
}
return logits, nil
case StateInComma:
peek := s.braceStack[len(s.braceStack)-1]
if peek == rune('[') {
s.curNode = s.stateToNodeMap[StateInListComma]
}
logits, err := s.maskLogits(logits, s.curNode)
if err != nil {
return nil, err
}
return logits, nil
default:
fmt.Println("masking logits current state", s.curNode.State)
logits, err := s.maskLogits(logits, s.curNode)
if err != nil {
return nil, err
}
return logits, nil
}
}
func forceFinish(s *PushdownSampler, logits []float64) ([]float64, error) {
for i := range logits {
if s.proc.Is(uint32(i), model.SpecialEOS) {
logits[i] = 1.0
} else {
logits[i] = math.Inf(-1)
}
}
return logits, nil
}
func (s *PushdownSampler) UpdateState(tokenSlice []int32) ([]int32, error) {
fmt.Println("current state - updating", s.curNode.State)
mappedString, err := s.proc.Decode(tokenSlice)
if err != nil {
return nil, err
}
fmt.Printf(">>> mappedString: %q\n", mappedString)
// flag := -1
// endBraceRunes := []rune{'}', ']'}
for _, r := range mappedString {
// TODO: if this is enabled again, make sure to appropriately handle the state transitions
// if slices.Contains(endBraceRunes, r) && len(s.braceStack) == 0 {
// fmt.Printf("stack is empty, extra closing brace %c\n", r)
// // flag = i
// break
// }
if r == rune('{') {
s.braceStack = append(s.braceStack, r)
}
if r == rune('[') {
s.braceStack = append(s.braceStack, r)
}
if r == rune('}') {
if len(s.braceStack) == 0 {
return nil, fmt.Errorf("stack is empty, extra closing brace %c", r)
}
top := s.braceStack[len(s.braceStack)-1]
if top != rune('{') {
return nil, fmt.Errorf("unmatched closing brace, got%c, want%c", top, '{')
}
s.braceStack = s.braceStack[:len(s.braceStack)-1]
}
if r == rune(']') {
if len(s.braceStack) == 0 {
return nil, fmt.Errorf("stack is empty, extra closing brace %c", r)
}
top := s.braceStack[len(s.braceStack)-1]
if top != rune('[') {
return nil, fmt.Errorf("unmatched closing brace, got%c, want%c", top, '[')
}
s.braceStack = s.braceStack[:len(s.braceStack)-1]
}
}
// if flag != -1 {
// tokenSlice = tokenSlice[:flag]
// }
// fmt.Println("flag!", flag)
for _, tokenID := range tokenSlice {
// transition to the next node
nextNode, ok := s.curNode.MaskTokenIDToNode[tokenID]
if !ok {
return nil, fmt.Errorf("invalid token: %q", mappedString)
}
fmt.Println("transitioning to", nextNode.State)
// TODO: add a penalty for staying in the same state too long
if nextNode.State == s.curNode.State {
s.stateCounter++
} else {
s.stateCounter = 0
}
s.curNode = nextNode
fmt.Println("updated curNode state", s.curNode.State)
}
return tokenSlice, nil
}
// greedy sample + backtrack?
func (s *PushdownSampler) maskLogits(logits []float64, node *PDA) ([]float64, error) {
// Create a new slice with same length as logits, initialized to -Inf
maskedLogits := make([]float64, len(logits))
for i := range maskedLogits {
maskedLogits[i] = math.Inf(-1)
}
// Only update values for valid token IDs from the mask map
for tokenID := range node.MaskTokenIDToNode {
if int(tokenID) < len(logits) {
maskedLogits[tokenID] = logits[tokenID]
}
}
return maskedLogits, nil
}
func (s *PushdownSampler) fastMaskLogits(logits []float64, node *PDA) ([]float64, error) {
maxLogit := math.Inf(-1)
maxIndex := -1
// Find the maximum logit value among valid tokens
for tokenID := range node.MaskTokenIDToNode {
if int(tokenID) < len(logits) && logits[tokenID] > maxLogit {
maxLogit = logits[tokenID]
maxIndex = int(tokenID)
}
}
if maxIndex == -1 {
return nil, fmt.Errorf("no valid tokens found in mask")
}
logits[0] = float64(maxIndex)
return logits, nil
// return maxIndex, nil
}

View File

@@ -1,190 +1,74 @@
package sample
import (
"cmp"
"errors"
"math"
"slices"
pq "github.com/emirpasic/gods/v2/queues/priorityqueue"
"golang.org/x/exp/rand"
"gonum.org/v1/gonum/floats"
"gonum.org/v1/gonum/stat/sampleuv"
)
type Transform interface {
Apply([]float64) ([]float64, error)
}
type Sampler interface {
Sample([]float32, ...Transform) (int, error)
}
// TODO(parthsareen): potentially cache softmax values
func softmax(logits []float64) []float64 {
var sum float64
tt := make([]float64, len(logits))
for i, v := range logits {
tt[i] = math.Exp(v)
sum += tt[i]
}
floats.Scale(1/sum, tt)
return tt
Sample([]float64) ([]float64, error)
}
type Temperature float64
func (t Temperature) Apply(logits []float64) ([]float64, error) {
if t == 0 {
return nil, errors.New("use Greedy sampler instead of Temperature(0)")
}
if t < 0 || t > 2 {
return nil, errors.New("temperature must be between 0 and 2")
}
temp := math.Max(float64(t), 1e-7)
// subtracting max logit to avoid under/overflow
maxLogit := slices.Max(logits)
for i := range logits {
logits[i] = (logits[i] - maxLogit) / temp
}
return logits, nil
func (s Temperature) Sample(t []float64) ([]float64, error) {
floats.Div(t, slices.Repeat([]float64{float64(s)}, len(t)))
return t, nil
}
type logitMap struct {
index int
logit float64
type softmax struct{}
func Softmax() Sampler {
return softmax{}
}
func logitMapComparator(a, b logitMap) int {
return -cmp.Compare(a.logit, b.logit)
func (softmax) Sample(t []float64) ([]float64, error) {
return t, nil
}
type TopK int
// TODO(parthsareen): avoid having to check all logits after this transform
func (k TopK) Apply(logits []float64) ([]float64, error) {
if k <= 0 {
return nil, errors.New("k must be greater than 0")
}
if int(k) >= len(logits) {
return logits, nil
}
q := pq.NewWith(logitMapComparator)
for i, logit := range logits {
q.Enqueue(logitMap{index: i, logit: logit})
}
validLogits := make(map[int]float64)
for range k {
logitMap, _ := q.Dequeue()
validLogits[logitMap.index] = logitMap.logit
}
for i := range logits {
if _, ok := validLogits[i]; !ok {
logits[i] = math.Inf(-1)
}
}
return logits, nil
func (s TopK) Sample(t []float64) ([]float64, error) {
return t, nil
}
type TopP float64
type TopP float32
func (p TopP) Apply(logits []float64) ([]float64, error) {
if p <= 0 || p >= 1 {
return nil, errors.New("p must be between 0 and 1")
}
probs := softmax(logits)
indices := make([]int, len(probs))
for i := range indices {
indices[i] = i
}
// sort in descending order
slices.SortFunc(indices, func(i, j int) int {
return cmp.Compare(probs[j], probs[i])
})
var cumSum float64
for i, idx := range indices {
cumSum += probs[idx]
if cumSum > float64(p) {
for _, idx := range indices[i+1:] {
logits[idx] = math.Inf(-1)
}
break
}
}
return logits, nil
func (s TopP) Sample(t []float64) ([]float64, error) {
return t, nil
}
type MinP float64
type MinP float32
func (p MinP) Apply(logits []float64) ([]float64, error) {
if p <= 0 || p >= 1 {
return nil, errors.New("p must be between 0 and 1")
}
probs := softmax(logits)
threshold := slices.Max(probs) * float64(p)
for i, prob := range probs {
if prob < threshold {
logits[i] = math.Inf(-1)
}
}
return logits, nil
func (s MinP) Sample(t []float64) ([]float64, error) {
return t, nil
}
type weighted struct {
src rand.Source
type weighed struct{}
func Weighed() Sampler {
return weighed{}
}
func Weighted(seed *int64) Sampler {
var src rand.Source
if seed != nil {
src = rand.NewSource(uint64(*seed))
func (s weighed) Sample(t []float64) ([]float64, error) {
w := sampleuv.NewWeighted(t, nil)
if v, ok := w.Take(); ok {
return []float64{float64(v)}, nil
}
return weighted{src: src}
return t, nil
}
func (s weighted) Sample(logits []float32, transforms ...Transform) (int, error) {
logits64 := make([]float64, len(logits))
for i, v := range logits {
logits64[i] = float64(v)
}
func Sample(floats []float64, samplers ...Sampler) ([]float64, error) {
var err error
for _, t := range transforms {
logits64, err = t.Apply(logits64)
for _, sampler := range samplers {
floats, err = sampler.Sample(floats)
if err != nil {
return -1, err
return nil, err
}
}
logitsCopy := make([]float64, 0, len(logits))
indices := make([]int, 0, len(logits))
for i, logit := range logits64 {
if !math.IsInf(logit, -1) {
logitsCopy = append(logitsCopy, logit)
indices = append(indices, i)
}
}
if len(logitsCopy) == 0 {
return -1, errors.New("no valid logits found for weighed sampling")
}
probs := softmax(logitsCopy)
w := sampleuv.NewWeighted(probs, s.src)
if idx, ok := w.Take(); ok {
return indices[idx], nil
}
return -1, errors.New("weighed sampler failed, no valid token found")
return floats, nil
}

View File

@@ -1,242 +0,0 @@
package sample
import (
"fmt"
"math"
"math/rand/v2"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestTemperature(t *testing.T) {
logits, err := Temperature(0.5).Apply([]float64{2, -1, 4, -3, 1, -2, 0})
if err != nil {
t.Error(err)
return
}
want := []float64{-4, -10, 0, -14, -6, -12, -8}
if diff := cmp.Diff(want, logits); diff != "" {
t.Errorf("logits mismatch (-want +got):\n%s", diff)
}
logits, err = Temperature(-1).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
if err == nil {
t.Errorf("expected error for temperature=-1, got %v", logits)
}
logits, err = Temperature(0).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
if err == nil {
t.Errorf("expected error for temperature=0, got %v", logits)
}
logits, err = Temperature(2.1).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
if err == nil {
t.Errorf("expected error for temperature=2.1, got %v", logits)
}
}
func TestSoftmax(t *testing.T) {
probs := softmax([]float64{-3, -2, -1, 0, 1, 2, 4})
expectedProbs := []float64{0.000751406628089903, 0.0020425349829204676, 0.005552185728064613, 0.015092405572827691, 0.04102541181635154, 0.11151863144543739, 0.8240174238263085}
if diff := cmp.Diff(expectedProbs, probs); diff != "" {
t.Errorf("probs mismatch (-want +got):\n%s", diff)
}
}
func TestTopK(t *testing.T) {
logits, err := TopK(3).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
if err != nil {
t.Error(err)
return
}
expectedlogits := []float64{math.Inf(-1), math.Inf(-1), math.Inf(-1), math.Inf(-1), 1, 2, 4}
if diff := cmp.Diff(expectedlogits, logits); diff != "" {
t.Errorf("logits mismatch (-want +got):\n%s", diff)
}
_, err = TopK(0).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
if err == nil {
t.Errorf("expected error for k=0, got %v", err)
}
logits, err = TopK(10).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
if err != nil {
t.Error(err)
return
}
expectedlogits = []float64{-3, -2, -1, 0, 1, 2, 4}
if diff := cmp.Diff(expectedlogits, logits); diff != "" {
t.Errorf("logits mismatch (-want +got):\n%s", diff)
}
}
func TestTopP(t *testing.T) {
logits, err := TopP(0.9).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
if err != nil {
t.Error(err)
return
}
want := []float64{math.Inf(-1), math.Inf(-1), math.Inf(-1), math.Inf(-1), math.Inf(-1), 2, 4}
if diff := cmp.Diff(want, logits); diff != "" {
t.Errorf("logits mismatch (-want +got):\n%s", diff)
}
_, err = TopP(1.0).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
if err == nil {
t.Error("expected error for p=1.0")
}
_, err = TopP(0.0).Apply([]float64{-3, -2, -1, 0, 1, 2, 4})
if err == nil {
t.Error("expected error for p=0.0")
}
}
func TestMinP(t *testing.T) {
logits, err := MinP(0.2).Apply([]float64{-3, -2, -1, 0, 1, 2, 4, 3})
if err != nil {
t.Error(err)
return
}
want := []float64{math.Inf(-1), math.Inf(-1), math.Inf(-1), math.Inf(-1), math.Inf(-1), math.Inf(-1), 4, 3}
if diff := cmp.Diff(want, logits); diff != "" {
t.Errorf("logits mismatch (-want +got):\n%s", diff)
}
_, err = MinP(1.0).Apply([]float64{-3, -2, -1, 0, 1, 2, 3, 4})
if err == nil {
t.Error("expected error for p=1.0")
}
_, err = MinP(0.0).Apply([]float64{-3, -2, -1, 0, 1, 2, 3, 4})
if err == nil {
t.Error("expected error for p=0.0")
}
}
func TestWeighed(t *testing.T) {
idx, err := Weighted(nil).Sample([]float32{float32(math.Inf(-1)), 2, float32(math.Inf(-1)), float32(math.Inf(-1))})
if err != nil {
t.Error(err)
return
}
want := 1
if diff := cmp.Diff(want, idx); diff != "" {
t.Errorf("index mismatch (-want +got):\n%s", diff)
}
idx, err = Weighted(nil).Sample([]float32{float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1))})
if err == nil {
t.Error("expected error for no valid tokens, got index", idx)
}
}
func TestSample(t *testing.T) {
input := []float32{1, 2, 3, 4}
var callOrder []int
mock1 := &testTransform{
id: 1,
callOrder: &callOrder,
}
mock2 := &testTransform{
id: 2,
callOrder: &callOrder,
}
mock3 := &testTransform{
id: 3,
callOrder: &callOrder,
}
got, err := Greedy().Sample(input, mock1, mock2, mock3)
if err != nil {
t.Error(err)
return
}
want := 3 // Greedy sampler should pick highest logit
if diff := cmp.Diff(want, got); diff != "" {
t.Errorf("sampled index mismatch (-want +got):\n%s", diff)
}
_, err = Weighted(nil).Sample(input, mock1, mock2, mock3)
if err != nil {
t.Error(err)
return
}
wantOrder := []int{1, 2, 3}
if diff := cmp.Diff(wantOrder, callOrder); diff != "" {
t.Errorf("call order mismatch (-want +got):\n%s", diff)
}
errMock := &testTransform{
returnErr: fmt.Errorf("mock error"),
}
_, err = Weighted(nil).Sample(input, mock1, errMock, mock2)
if err == nil {
t.Error("Expected error from sampler")
}
}
type testTransform struct {
id int
callOrder *[]int
returnErr error
}
func (ts *testTransform) Apply(logits []float64) ([]float64, error) {
if ts.callOrder != nil {
*ts.callOrder = append(*ts.callOrder, ts.id)
}
if ts.returnErr != nil {
return nil, ts.returnErr
}
return logits, nil
}
func BenchmarkTransform(b *testing.B) {
transforms := map[string]Transform{
"Temperature": Temperature(0.5),
"TopK": TopK(10),
"TopP": TopP(0.9),
"MinP": MinP(0.2),
}
logits := make([]float64, 1<<16)
for i := range logits {
logits[i] = rand.Float64()
}
for name, transform := range transforms {
b.Run(name, func(b *testing.B) {
b.ResetTimer()
for range b.N {
_, err := transform.Apply(logits)
if err != nil {
b.Error(err)
}
}
})
}
}
func BenchmarkSample(b *testing.B) {
samplers := map[string]Sampler{
"Greedy": Greedy(),
"Weighted": Weighted(nil),
}
logits := make([]float32, 1<<16)
for i := range logits {
logits[i] = rand.Float32()
}
for name, s := range samplers {
b.Run(name, func(b *testing.B) {
b.ResetTimer()
for range b.N {
if _, err := s.Sample(logits); err != nil {
b.Error(err)
}
}
})
}
}

View File

@@ -1,296 +0,0 @@
package sample
import (
"fmt"
"runtime"
"time"
"github.com/ollama/ollama/model"
)
type JSONSampler struct {
schema *Schema
propIdx int
propToNodeMap map[string]*PDA
pdaSampler *PushdownSampler
decodedToks []string
}
func NewJSONSampler(proc model.TextProcessor, schema *Schema) (*JSONSampler, error) {
if proc == nil {
return nil, fmt.Errorf("TextProcessor cannot be nil")
}
pdaSampler, err := NewPushdownSampler(proc)
if err != nil {
return nil, fmt.Errorf("failed to create PushdownSampler: %w", err)
}
if schema == nil {
return &JSONSampler{
schema: nil,
propIdx: -1,
propToNodeMap: nil,
pdaSampler: pdaSampler,
}, nil
}
// fmt.Println("schema not nil")
so := &JSONSampler{
schema: schema,
propIdx: -1,
propToNodeMap: make(map[string]*PDA),
pdaSampler: pdaSampler,
}
so.schemaToGraph()
// Benchmark token decoding
start := time.Now()
var m runtime.MemStats
runtime.ReadMemStats(&m)
before := m.Alloc
vocab := proc.GetVocabulary()
decodedToks := make([]string, len(vocab.Values))
for i := range vocab.Values {
token, err := proc.Decode([]int32{int32(i)})
if err != nil {
return nil, err
}
decodedToks[i] = token
}
so.decodedToks = decodedToks
runtime.ReadMemStats(&m)
after := m.Alloc
fmt.Printf("Token decode memory usage = %.2f MB\n", float64(after-before)/(1024*1024))
fmt.Printf("Token decode time = %v\n", time.Since(start))
fmt.Println("--------------------------------")
fmt.Println("SOSampler")
fmt.Println("--------------------------------")
// Benchmark this section
start = time.Now()
runtime.ReadMemStats(&m)
before = m.Alloc
// TODO: still messed up
// TODO: recursion use case
// key masks
for _, prop := range so.schema.Properties {
node := so.propToNodeMap[prop.Name]
// propName -> node
curState := node.State
fromNode := node
so.pdaSampler.CreateMask(fromNode)
for curState == StateInStructuredKey {
// there is only one edge
for r, toNode := range fromNode.TransitionEdges {
fmt.Println("rune", r, "edge", toNode.State)
so.pdaSampler.CreateMask(toNode)
fmt.Printf("created mask for %c\n", r)
curState = toNode.State
fmt.Println("next state", curState)
// TODO: theres an extra gen for " right now
fromNode = toNode
}
}
if curState != StateInColon {
return nil, fmt.Errorf("expected state to be StateInColon, got %v", curState)
}
// so.pdaSampler.CreateMask(fromNode)
fromNode = fromNode.TransitionEdges[' ']
so.pdaSampler.CreateMask(fromNode)
curState = fromNode.State
for _, toNode := range fromNode.TransitionEdges {
fmt.Println("toNode", toNode.State)
}
}
// runtime.ReadMemStats(&m)
// after = m.Alloc
// fmt.Printf("Mask creation memory usage = %.2f MB\n", float64(after-before)/(1024*1024))
// fmt.Printf("Mask creation time = %v\n", time.Since(start))
// fmt.Println("--------------------------------")
return so, nil
}
func (s *JSONSampler) schemaToGraph() {
schemaType := s.schema.EffectiveType()
switch schemaType {
case "object":
// TODO: see if we need to connect these to the JSON graph
// each prop is a key
for _, prop := range s.schema.Properties {
// name of key
name := prop.Name
keyNode := &PDA{
State: StateInStructuredKey, // this is unchanging, will impact sampling
TransitionEdges: make(map[rune]*PDA),
MaskTokenIDToNode: make(map[int32]*PDA),
}
prevNode := keyNode
for _, r := range name {
runeNode := &PDA{
State: StateInStructuredKey, // this is unchanging, will impact sampling
TransitionEdges: make(map[rune]*PDA),
MaskTokenIDToNode: make(map[int32]*PDA),
}
// fmt.Println("runeNode created", runeNode.State)
// fmt.Printf("runeNode created %c\n", r)
// since alloc on heap connections wil still map
prevNode.TransitionEdges[r] = runeNode
prevNode = runeNode
}
// point to end of object key node after all chars are done
// prevNode.TransitionEdges['"'] = s.pdaSampler.stateToNodeMap[StateInObjectKeyEnd]
// link to value node
// Create a node for the end of the key (after the closing quote)
stringEndNode := &PDA{
State: StateInStructuredKey,
TransitionEdges: make(map[rune]*PDA),
MaskTokenIDToNode: make(map[int32]*PDA),
}
prevNode.TransitionEdges['"'] = stringEndNode
prevNode = stringEndNode
// Add transition for colon after key
colonNode := &PDA{
State: StateInColon,
TransitionEdges: make(map[rune]*PDA),
MaskTokenIDToNode: make(map[int32]*PDA),
}
prevNode.TransitionEdges[':'] = colonNode
prevNode = colonNode
// Add transition for space after colon
spaceNode := &PDA{
State: StateInSpaceToValue,
TransitionEdges: make(map[rune]*PDA),
MaskTokenIDToNode: make(map[int32]*PDA),
}
prevNode.TransitionEdges[' '] = spaceNode
prevNode = spaceNode
value := prop.Type
switch value {
case "object":
fmt.Println("object under key: ", name)
case "array":
fmt.Println("array under key: ", name)
case "string":
fmt.Println("string under key: ", name)
prevNode.TransitionEdges['"'] = s.pdaSampler.stateToNodeMap[StateInString]
case "number":
fmt.Println("number under key: ", name)
for _, r := range validNumberRunes {
prevNode.TransitionEdges[r] = s.pdaSampler.stateToNodeMap[StateInNumber]
}
case "boolean":
fmt.Println("boolean under key: ", name)
prevNode.TransitionEdges['t'] = s.pdaSampler.stateToNodeMap[StateInBool]
prevNode.TransitionEdges['f'] = s.pdaSampler.stateToNodeMap[StateInBool]
prevNode.TransitionEdges['n'] = s.pdaSampler.stateToNodeMap[StateInNull]
}
// points to start of the key
s.propToNodeMap[name] = keyNode
fmt.Println("name", name, "keyNode", keyNode.State)
}
}
// TODO: do values + recursion
}
func (s *JSONSampler) Apply(logits []float64) ([]float64, error) {
if s.schema == nil {
return s.pdaSampler.Apply(logits)
}
switch s.pdaSampler.curNode.State {
// TODO: doesnt account for multi rune case
case StateInObjectKey:
if s.propIdx > len(s.schema.Properties)-1 {
return nil, fmt.Errorf("propIdx out of bounds")
}
// fmt.Println("in object key - structured outputs")
// TODO: this tracking should probably be coming from a stack to track nested objects
// simple case
s.propIdx++
fmt.Println("propIdx", s.propIdx)
prop := s.schema.Properties[s.propIdx]
fmt.Println("prop", prop.Name)
s.pdaSampler.curNode = s.propToNodeMap[prop.Name]
fmt.Println("changed curNode state to", s.pdaSampler.curNode.State)
logits, err := s.pdaSampler.maskLogits(logits, s.pdaSampler.curNode)
if err != nil {
return nil, err
}
return logits, nil
default:
// Will only happen for the last prop - can also be precomputed.
if s.propIdx == len(s.schema.Properties)-1 {
// todo: if i incremenet propidx then i know im in last value as well
switch s.pdaSampler.curNode.State {
case StateInObjectEnd:
fmt.Println("<<<<< in obj end - generating mask for", s.pdaSampler.curNode.State)
s.pdaSampler.curNode.TransitionEdges = make(map[rune]*PDA)
s.pdaSampler.curNode = NewPDANode(StateTerminate)
s.propIdx++
// TODO: this needs to be optimized in some way, computing mask on the fly is expensive
case StateInNumber, StateInString, StateInBool, StateInNull, StateInListEnd:
fmt.Println("<<<<< last prop - generating mask for", s.pdaSampler.curNode.State)
delete(s.pdaSampler.curNode.TransitionEdges, ',')
s.pdaSampler.curNode.MaskTokenIDToNode = make(map[int32]*PDA)
s.pdaSampler.CreateMask(s.pdaSampler.curNode)
s.propIdx++
}
}
return s.pdaSampler.Apply(logits)
}
}
func (s *JSONSampler) UpdateState(tokenSlice []int32) ([]int32, error) {
tokenSlice, err := s.pdaSampler.UpdateState(tokenSlice)
if err != nil {
return nil, err
}
if s.schema == nil {
// Don't need to update state for unconstrained JSON sampling
return tokenSlice, nil
}
switch s.pdaSampler.curNode.State {
case StateInObjectKey:
s.propIdx++
fmt.Println("propIdx", s.propIdx)
prop := s.schema.Properties[s.propIdx]
fmt.Println("prop", prop.Name)
s.pdaSampler.curNode = s.propToNodeMap[prop.Name]
// TODO: this does not work - mike
// str, err := s.pdaSampler.proc.Decode(tokenSlice)
// if err != nil {
// return nil, err
// }
// fmt.Println("str", str)
return tokenSlice, nil
default:
return tokenSlice, nil
}
}

View File

Binary file not shown.

View File

@@ -18,7 +18,7 @@ docker buildx build \
--output type=local,dest=./dist/ \
--platform=${PLATFORM} \
${OLLAMA_COMMON_BUILD_ARGS} \
--target dist \
--target archive \
-f Dockerfile \
.
@@ -26,4 +26,4 @@ docker buildx build \
if echo $PLATFORM | grep "," > /dev/null ; then
mv -f ./dist/linux_*64/ollama* ./dist/
rmdir ./dist/linux_*64
fi
fi

View File

@@ -178,12 +178,37 @@ func convertModelFromFiles(files map[string]string, baseLayers []*layerGGML, isA
}
func detectModelTypeFromFiles(files map[string]string) string {
// todo make this more robust by actually introspecting the files
for fn := range files {
if strings.HasSuffix(fn, ".safetensors") {
return "safetensors"
} else if strings.HasSuffix(fn, ".bin") || strings.HasSuffix(fn, ".gguf") {
} else if strings.HasSuffix(fn, ".gguf") {
return "gguf"
} else {
// try to see if we can find a gguf file even without the file extension
blobPath, err := GetBlobsPath(files[fn])
if err != nil {
slog.Error("error getting blobs path", "file", fn)
return ""
}
f, err := os.Open(blobPath)
if err != nil {
slog.Error("error reading file", "error", err)
return ""
}
defer f.Close()
buf := make([]byte, 4)
_, err = f.Read(buf)
if err != nil {
slog.Error("error reading file", "error", err)
return ""
}
ct := ggml.DetectContentType(buf)
if ct == "gguf" {
return "gguf"
}
}
}

View File

@@ -10,6 +10,7 @@ import (
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/model/mllama"
"github.com/ollama/ollama/template"
@@ -92,26 +93,33 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
var imgData llm.ImageData
if isMllama {
data, opts, err := mllama.Preprocess(bytes.NewReader(i))
if err != nil {
return "", nil, err
}
if envconfig.NewRunners() {
imgData = llm.ImageData{
ID: len(images),
Data: i,
}
} else {
data, opts, err := mllama.Preprocess(bytes.NewReader(i))
if err != nil {
return "", nil, err
}
buf := new(bytes.Buffer)
err = binary.Write(buf, binary.LittleEndian, data)
if err != nil {
return "", nil, err
}
buf := new(bytes.Buffer)
err = binary.Write(buf, binary.LittleEndian, data)
if err != nil {
return "", nil, err
}
ar, ok := opts["aspectRatioIndex"].(int)
if !ok {
return "", nil, fmt.Errorf("missing aspect ratio for image")
}
ar, ok := opts["aspectRatioIndex"].(int)
if !ok {
return "", nil, fmt.Errorf("missing aspect ratio for image")
}
imgData = llm.ImageData{
ID: len(images),
Data: buf.Bytes(),
AspectRatioID: ar,
imgData = llm.ImageData{
ID: len(images),
Data: buf.Bytes(),
AspectRatioID: ar,
}
}
imgPrompt = "<|image|>"
} else {

View File

@@ -203,7 +203,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
images := make([]llm.ImageData, len(req.Images))
for i := range req.Images {
if isMllama {
if isMllama && !envconfig.NewRunners() {
data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})

View File

@@ -3,6 +3,7 @@ package server
import (
"bytes"
"cmp"
"crypto/sha256"
"encoding/json"
"fmt"
"io"
@@ -710,3 +711,100 @@ func TestCreateDetectTemplate(t *testing.T) {
})
})
}
func TestDetectModelTypeFromFiles(t *testing.T) {
t.Run("gguf file", func(t *testing.T) {
_, digest := createBinFile(t, nil, nil)
files := map[string]string{
"model.gguf": digest,
}
modelType := detectModelTypeFromFiles(files)
if modelType != "gguf" {
t.Fatalf("expected model type 'gguf', got %q", modelType)
}
})
t.Run("gguf file w/o extension", func(t *testing.T) {
_, digest := createBinFile(t, nil, nil)
files := map[string]string{
fmt.Sprintf("%x", digest): digest,
}
modelType := detectModelTypeFromFiles(files)
if modelType != "gguf" {
t.Fatalf("expected model type 'gguf', got %q", modelType)
}
})
t.Run("safetensors file", func(t *testing.T) {
files := map[string]string{
"model.safetensors": "sha256:abc123",
}
modelType := detectModelTypeFromFiles(files)
if modelType != "safetensors" {
t.Fatalf("expected model type 'safetensors', got %q", modelType)
}
})
t.Run("unsupported file type", func(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
data := []byte("12345678")
digest := fmt.Sprintf("sha256:%x", sha256.Sum256(data))
if err := os.MkdirAll(filepath.Join(p, "blobs"), 0o755); err != nil {
t.Fatal(err)
}
f, err := os.Create(filepath.Join(p, "blobs", fmt.Sprintf("sha256-%s", strings.TrimPrefix(digest, "sha256:"))))
if err != nil {
t.Fatal(err)
}
defer f.Close()
if _, err := f.Write(data); err != nil {
t.Fatal(err)
}
files := map[string]string{
"model.bin": digest,
}
modelType := detectModelTypeFromFiles(files)
if modelType != "" {
t.Fatalf("expected empty model type for unsupported file, got %q", modelType)
}
})
t.Run("file with less than 4 bytes", func(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
data := []byte("123")
digest := fmt.Sprintf("sha256:%x", sha256.Sum256(data))
if err := os.MkdirAll(filepath.Join(p, "blobs"), 0o755); err != nil {
t.Fatal(err)
}
f, err := os.Create(filepath.Join(p, "blobs", fmt.Sprintf("sha256-%s", strings.TrimPrefix(digest, "sha256:"))))
if err != nil {
t.Fatal(err)
}
defer f.Close()
if _, err := f.Write(data); err != nil {
t.Fatal(err)
}
files := map[string]string{
"noext": digest,
}
modelType := detectModelTypeFromFiles(files)
if modelType != "" {
t.Fatalf("expected empty model type for small file, got %q", modelType)
}
})
}

67
template/command-r.gotmpl Normal file
View File

@@ -0,0 +1,67 @@
{{- if or .Tools .System }}<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
{{- if .Tools }}# Safety Preamble
The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
# System Preamble
## Basic Rules
You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.
{{ if .System }}# User Preamble
{{ .System }}
{{- end }}
## Available Tools
Here is a list of tools that you have available to you:
{{- range .Tools }}
```python
def {{ .Function.Name }}(
{{- range $name, $property := .Function.Parameters.Properties }}{{ $name }}: {{ $property.Type }}, {{ end }}) -> List[Dict]:
'''{{ .Function.Description }}
{{- if .Function.Parameters.Properties }}
Args:
{{- range $name, $property := .Function.Parameters.Properties }}
{{ $name }} ({{ $property.Type }}): {{ $property.Description }}
{{- end }}
{{- end }}
'''
pass
```
{{- end }}
{{- else if .System }}{{ .System }}
{{- end }}<|END_OF_TURN_TOKEN|>
{{- end }}
{{- range .Messages }}
{{- if eq .Role "system" }}
{{- continue }}
{{- end }}<|START_OF_TURN_TOKEN|>
{{- if eq .Role "user" }}<|USER_TOKEN|>{{ .Content }}
{{- if $.Tools }}<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example:
```json
[
{
"tool_name": title of the tool in the specification,
"parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters
}
]```
{{- end }}
{{- else if eq .Role "assistant" }}<|CHATBOT_TOKEN|>
{{- if .Content }}{{ .Content }}
{{- else if .ToolCalls }}
Action: ```json
[
{{- range .ToolCalls }}
{
"tool_name": "{{ .Function.Name }}",
"parameters": {{ .Function.Arguments }}
}
{{- end }}
]```
{{- end }}
{{- else if eq .Role "tool" }}<|SYSTEM_TOKEN|><results>
console_output: {{ .Content }}
</results>
{{- end }}<|END_OF_TURN_TOKEN|>
{{- end }}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>

6
template/command-r.json Normal file
View File

@@ -0,0 +1,6 @@
{
"stop": [
"<|START_OF_TURN_TOKEN|>",
"<|END_OF_TURN_TOKEN|>"
]
}

View File

@@ -138,5 +138,9 @@
{
"template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}",
"name": "solar-instruct"
},
{
"template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
"name": "command-r"
}
]

View File

@@ -0,0 +1 @@
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I'm doing great. How can I help you today?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>I'd like to show off how chat templating works!<|END_OF_TURN_TOKEN|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>

View File

@@ -0,0 +1 @@
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>

View File

@@ -0,0 +1 @@
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I'm doing great. How can I help you today?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>I'd like to show off how chat templating works!<|END_OF_TURN_TOKEN|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>