Compare commits

..

2 Commits

Author SHA1 Message Date
Daniel Hiltgen
dc011d16b9 Backport MacOS SDK fix from main 2024-04-04 11:17:48 -07:00
Daniel Hiltgen
8d08676a99 Apply 01-cache.diff 2024-04-04 10:11:23 -07:00
102 changed files with 3263 additions and 5869 deletions

View File

@@ -19,7 +19,7 @@ body:
label: What did you expect to see?
description: What did you expect to see/happen instead?
validations:
required: false
required: true
- type: textarea
id: steps
attributes:

View File

@@ -1,24 +0,0 @@
name: latest
on:
release:
types: [released]
jobs:
update-latest:
environment: release
runs-on: linux
steps:
- uses: actions/checkout@v4
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Tag images as latest
env:
PUSH: "1"
shell: bash
run: |
export "VERSION=${GITHUB_REF_NAME#v}"
./scripts/tag_latest.sh

View File

@@ -30,7 +30,7 @@ jobs:
security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k password build.keychain
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
go-version: '1.22'
cache: true
- name: Build Darwin
env:
@@ -42,7 +42,7 @@ jobs:
DEVELOPER_DIR: /Applications/Xcode_13.4.1.app/Contents/Developer
run: |
./scripts/build_darwin.sh
- uses: actions/upload-artifact@v4
with:
name: dist-darwin
@@ -50,6 +50,7 @@ jobs:
dist/*arwin*
!dist/*-cov
# Windows builds take a long time to both install the dependencies and build, so parallelize
# CPU generation step
generate-windows-cpu:
@@ -86,7 +87,7 @@ jobs:
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
go-version: '1.22'
cache: true
- run: go get ./...
- run: |
@@ -100,9 +101,7 @@ jobs:
- uses: actions/upload-artifact@v4
with:
name: generate-windows-cpu
path: |
llm/build/**/bin/*
llm/build/**/*.a
path: llm/llama.cpp/build/**/lib/*
# ROCm generation step
generate-windows-rocm:
@@ -139,9 +138,9 @@ jobs:
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
go-version: '1.22'
cache: true
- name: 'Install ROCm'
- name: "Install ROCm"
run: |
$ErrorActionPreference = "Stop"
write-host "downloading AMD HIP Installer"
@@ -149,7 +148,7 @@ jobs:
write-host "Installing AMD HIP"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP"
- name: 'Verify ROCm'
- name: "Verify ROCm"
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- run: go get ./...
@@ -163,7 +162,7 @@ jobs:
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
go generate -x ./...
name: go generate
- name: 'gather rocm dependencies'
- name: "gather rocm dependencies"
run: |
$HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
md "dist\deps\bin\rocblas\library"
@@ -173,7 +172,7 @@ jobs:
- uses: actions/upload-artifact@v4
with:
name: generate-windows-rocm
path: llm/build/**/bin/*
path: llm/llama.cpp/build/**/lib/*
- uses: actions/upload-artifact@v4
with:
name: windows-rocm-deps
@@ -214,36 +213,29 @@ jobs:
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
go-version: '1.22'
cache: true
- name: 'Install CUDA'
run: |
$ErrorActionPreference = "Stop"
write-host "downloading CUDA Installer"
Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
write-host "Installing CUDA"
Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
write-host "Completed CUDA"
$cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
$cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2'
echo "$cudaPath\bin" >> $env:GITHUB_PATH
echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
- name: 'Verify CUDA'
# TODO - consider replacing this action with a ps1 snippet to install
# This actions seems to fail sometimes with "no tools in cache" but a re-run of the failed job clears it
# https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
- name: "Install CUDA"
uses: Jimver/cuda-toolkit@v0.2.14
id: cuda-toolkit
with:
cuda: '11.3.1'
- name: "Verify CUDA"
run: nvcc -V
- run: go get ./...
- name: go generate
run: |
$gopath=(get-command go).source | split-path -parent
$cudabin=(get-command nvcc).source | split-path
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$cudabin;$env:PATH"
$env:PATH="$gopath;$env:PATH"
$env:OLLAMA_SKIP_CPU_GENERATE="1"
go generate -x ./...
- name: 'gather cuda dependencies'
- name: "gather cuda dependencies"
run: |
$NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0]
md "dist\deps"
@@ -253,7 +245,7 @@ jobs:
- uses: actions/upload-artifact@v4
with:
name: generate-windows-cuda
path: llm/build/**/bin/*
path: llm/llama.cpp/build/**/lib/*
- uses: actions/upload-artifact@v4
with:
name: windows-cuda-deps
@@ -300,17 +292,17 @@ jobs:
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
go-version: '1.22'
cache: true
- run: go get
- uses: actions/download-artifact@v4
with:
name: generate-windows-cpu
path: llm/build
path: llm/llama.cpp/build
- uses: actions/download-artifact@v4
with:
name: generate-windows-cuda
path: llm/build
path: llm/llama.cpp/build
- uses: actions/download-artifact@v4
with:
name: windows-cuda-deps
@@ -322,8 +314,8 @@ jobs:
- uses: actions/download-artifact@v4
with:
name: generate-windows-rocm
path: llm/build
- run: dir llm/build
path: llm/llama.cpp/build
- run: dir llm/llama.cpp/build
- run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
@@ -339,14 +331,14 @@ jobs:
name: dist-windows
path: dist/*.exe
# Linux x86 assets built using the container based build
# Linux x86 assets built using the container based build
build-linux-amd64:
environment: release
runs-on: linux
env:
OLLAMA_SKIP_MANIFEST_CREATE: '1'
OLLAMA_SKIP_MANIFEST_CREATE: "1"
BUILD_ARCH: amd64
PUSH: '1'
PUSH: "1"
steps:
- uses: actions/checkout@v4
with:
@@ -376,9 +368,9 @@ jobs:
environment: release
runs-on: linux-arm64
env:
OLLAMA_SKIP_MANIFEST_CREATE: '1'
OLLAMA_SKIP_MANIFEST_CREATE: "1"
BUILD_ARCH: arm64
PUSH: '1'
PUSH: "1"
steps:
- uses: actions/checkout@v4
with:
@@ -386,7 +378,7 @@ jobs:
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- name: 'Install Docker'
- name: "Install Docker"
run: |
# Add Docker's official GPG key:
env
@@ -423,7 +415,7 @@ jobs:
!dist/*-cov
# Aggregate all the assets and ship a release
release:
release:
needs:
- build-darwin
- build-windows
@@ -434,8 +426,8 @@ jobs:
permissions:
contents: write
env:
OLLAMA_SKIP_IMAGE_BUILD: '1'
PUSH: '1'
OLLAMA_SKIP_IMAGE_BUILD: "1"
PUSH: "1"
steps:
- uses: actions/checkout@v4
- name: Set Version
@@ -463,11 +455,11 @@ jobs:
with:
name: ${{ env.RELEASE_VERSION }}
allowUpdates: true
artifacts: 'dist/*'
artifacts: "dist/*"
draft: true
prerelease: true
omitBodyDuringUpdate: true
generateReleaseNotes: true
omitDraftDuringUpdate: true
omitPrereleaseDuringUpdate: true
replacesArtifacts: true
replacesArtifacts: true

View File

@@ -5,13 +5,13 @@ on:
paths:
- '**/*'
- '!docs/**'
- '!examples/**'
- '!README.md'
jobs:
changes:
runs-on: ubuntu-latest
outputs:
GENERATE: ${{ steps.changes.outputs.GENERATE }}
GENERATE_CUDA: ${{ steps.changes.outputs.GENERATE_CUDA }}
GENERATE_ROCM: ${{ steps.changes.outputs.GENERATE_ROCM }}
steps:
@@ -50,29 +50,25 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
go-version: '1.22'
cache: true
- run: go get ./...
- run: |
$gopath=(get-command go).source | split-path -parent
$gccpath=(get-command gcc).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$gccpath;$env:PATH"
echo $env:PATH
$env:PATH="$gopath;$env:PATH"
go generate -x ./...
if: ${{ startsWith(matrix.os, 'windows-') }}
name: 'Windows Go Generate'
name: "Windows Go Generate"
- run: go generate -x ./...
if: ${{ ! startsWith(matrix.os, 'windows-') }}
name: 'Unix Go Generate'
name: "Unix Go Generate"
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
path: |
llm/build/**/bin/*
llm/build/**/*.a
path: llm/llama.cpp/build/**/lib/*
generate-cuda:
needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
@@ -92,7 +88,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version-file: go.mod
go-version: '1.22'
cache: true
- run: go get ./...
- run: |
@@ -103,14 +99,14 @@ jobs:
- uses: actions/upload-artifact@v4
with:
name: cuda-${{ matrix.cuda-version }}-libraries
path: llm/build/**/bin/*
path: llm/llama.cpp/build/**/lib/*
generate-rocm:
needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
strategy:
matrix:
rocm-version:
- '6.0.2'
- '6.0'
runs-on: linux
container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
steps:
@@ -123,7 +119,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version-file: go.mod
go-version: '1.22'
cache: true
- run: go get ./...
- run: |
@@ -134,87 +130,7 @@ jobs:
- uses: actions/upload-artifact@v4
with:
name: rocm-${{ matrix.rocm-version }}-libraries
path: llm/build/**/bin/*
# ROCm generation step
generate-windows-rocm:
needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
runs-on: windows
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: true
- name: 'Install ROCm'
run: |
$ErrorActionPreference = "Stop"
write-host "downloading AMD HIP Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP"
- name: 'Verify ROCm'
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- run: go get ./...
- run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH"
$env:OLLAMA_SKIP_CPU_GENERATE="1"
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
go generate -x ./...
name: go generate
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
# TODO - do we need any artifacts?
# CUDA generation step
generate-windows-cuda:
needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
runs-on: windows
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: true
- name: 'Install CUDA'
run: |
$ErrorActionPreference = "Stop"
write-host "downloading CUDA Installer"
Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
write-host "Installing CUDA"
Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
write-host "Completed CUDA"
$cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
$cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2'
echo "$cudaPath\bin" >> $env:GITHUB_PATH
echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
- name: 'Verify CUDA'
run: nvcc -V
- run: go get ./...
- name: go generate
run: |
$gopath=(get-command go).source | split-path -parent
$cudabin=(get-command nvcc).source | split-path
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$cudabin;$env:PATH"
$env:OLLAMA_SKIP_CPU_GENERATE="1"
go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
# TODO - do we need any artifacts?
path: llm/llama.cpp/build/**/lib/*
lint:
strategy:
matrix:
@@ -237,7 +153,7 @@ jobs:
submodules: recursive
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
go-version: '1.22'
cache: false
- run: |
case ${{ matrix.arch }} in
@@ -246,21 +162,19 @@ jobs:
esac >>$GITHUB_ENV
shell: bash
- run: |
mkdir -p llm/build/linux/$ARCH/stub/bin
touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/
touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so
if: ${{ startsWith(matrix.os, 'ubuntu-') }}
- run: |
mkdir -p llm/build/darwin/$ARCH/stub/bin
touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/
touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib
touch llm/llama.cpp/ggml-metal.metal
if: ${{ startsWith(matrix.os, 'macos-') }}
- run: |
mkdir -p llm/build/windows/$ARCH/stub/bin
touch llm/build/windows/$ARCH/stub/bin/ollama_llama_server
mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/
touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll
if: ${{ startsWith(matrix.os, 'windows-') }}
shell: bash
- uses: golangci/golangci-lint-action@v4
with:
args: --timeout 8m0s
- uses: golangci/golangci-lint-action@v3
test:
strategy:
matrix:
@@ -275,14 +189,13 @@ jobs:
env:
GOARCH: ${{ matrix.arch }}
CGO_ENABLED: '1'
OLLAMA_CPU_TARGET: 'static'
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
go-version: '1.22'
cache: true
- run: go get
- run: |
@@ -292,19 +205,18 @@ jobs:
esac >>$GITHUB_ENV
shell: bash
- run: |
mkdir -p llm/build/linux/$ARCH/stub/bin
touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/
touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so
if: ${{ startsWith(matrix.os, 'ubuntu-') }}
- run: |
mkdir -p llm/build/darwin/$ARCH/stub/bin
touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/
touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib
touch llm/llama.cpp/ggml-metal.metal
if: ${{ startsWith(matrix.os, 'macos-') }}
- run: |
mkdir -p llm/build/windows/$ARCH/stub/bin
touch llm/build/windows/$ARCH/stub/bin/ollama_llama_server
mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/
touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll
if: ${{ startsWith(matrix.os, 'windows-') }}
shell: bash
- run: go generate ./...
- run: go build
- run: go test -v ./...
- uses: actions/upload-artifact@v4

3
.gitignore vendored
View File

@@ -10,5 +10,4 @@ ggml-metal.metal
*.exe
.idea
test_data
*.crt
llm/build
*.crt

View File

@@ -15,3 +15,13 @@ linters:
- misspell
- nilerr
- unused
linters-settings:
errcheck:
# exclude the following functions since we don't generally
# need to be concerned with the returned errors
exclude-functions:
- encoding/binary.Read
- (*os.File).Seek
- (*bufio.Writer).WriteString
- (*github.com/spf13/pflag.FlagSet).Set
- (*github.com/ollama/ollama/llm.readSeekOffset).Seek

View File

@@ -2,7 +2,7 @@ ARG GOLANG_VERSION=1.22.1
ARG CMAKE_VERSION=3.22.1
# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
ARG CUDA_VERSION=11.3.1
ARG ROCM_VERSION=6.0.2
ARG ROCM_VERSION=6.0
# Copy the minimal context we need to run the generate scripts
FROM scratch AS llm-code
@@ -42,7 +42,7 @@ ARG CGO_CFLAGS
ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
RUN mkdir /tmp/scratch && \
for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \
for dep in $(cat /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/x86_64/rocm*/lib/deps.txt) ; do \
cp ${dep} /tmp/scratch/ || exit 1 ; \
done && \
(cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
@@ -61,8 +61,6 @@ ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
@@ -70,33 +68,28 @@ RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
FROM --platform=linux/arm64 centos:7 AS cpu-builder-arm64
FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
ARG CMAKE_VERSION
ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
# Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
# Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
ENV CGO_ENABLED 1
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
ARG GOFLAGS
ARG CGO_CFLAGS
@@ -108,8 +101,8 @@ ENV CGO_ENABLED 1
ARG GOLANG_VERSION
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
RUN mkdir -p /go/src/github.com/ollama/ollama/dist/deps/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build -trimpath .

View File

@@ -64,7 +64,6 @@ Here are some example models that can be downloaded:
| LLaVA | 7B | 4.5GB | `ollama run llava` |
| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
| Solar | 10.7B | 6.1GB | `ollama run solar` |
> Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -260,7 +259,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Web & Desktop
- [Lollms-Webui](https://github.com/ParisNeo/lollms-webui)
- [LibreChat](https://github.com/danny-avila/LibreChat)
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
@@ -291,9 +289,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
- [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
- [ChatOllama: Open Source Chatbot based on Ollama with Knowledge Bases](https://github.com/sugarforever/chat-ollama)
- [CRAG Ollama Chat: Simple Web Search with Corrective RAG](https://github.com/Nagi-ovo/CRAG-Ollama-Chat)
- [RAGFlow: Open-source Retrieval-Augmented Generation engine based on deep document understanding](https://github.com/infiniflow/ragflow)
### Terminal
@@ -317,8 +312,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Database
- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
- [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md)
### Package managers
@@ -377,4 +371,3 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)

View File

@@ -1,9 +1,3 @@
// Package api implements the client-side API for code wishing to interact
// with the ollama service. The methods of the [Client] type correspond to
// the ollama REST API as described in https://github.com/ollama/ollama/blob/main/docs/api.md
//
// The ollama command-line client itself uses this package to interact with
// the backend service.
package api
import (
@@ -11,6 +5,7 @@ import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net"
@@ -24,8 +19,6 @@ import (
"github.com/ollama/ollama/version"
)
// Client encapsulates client state for interacting with the ollama
// service. Use [ClientFromEnvironment] to create new Clients.
type Client struct {
base *url.URL
http *http.Client
@@ -47,15 +40,6 @@ func checkError(resp *http.Response, body []byte) error {
return apiError
}
// ClientFromEnvironment creates a new [Client] using configuration from the
// environment variable OLLAMA_HOST, which points to the network host and
// port on which the ollama service is listenting. The format of this variable
// is:
//
// <scheme>://<host>:<port>
//
// If the variable is not specified, a default ollama host and port will be
// used.
func ClientFromEnvironment() (*Client, error) {
defaultPort := "11434"
@@ -207,14 +191,8 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
return nil
}
// GenerateResponseFunc is a function that [Client.Generate] invokes every time
// a response is received from the service. If this function returns an error,
// [Client.Generate] will stop generating and return this error.
type GenerateResponseFunc func(GenerateResponse) error
// Generate generates a response for a given prompt. The req parameter should
// be populated with prompt details. fn is called for each response (there may
// be multiple responses, e.g. in case streaming is enabled).
func (c *Client) Generate(ctx context.Context, req *GenerateRequest, fn GenerateResponseFunc) error {
return c.stream(ctx, http.MethodPost, "/api/generate", req, func(bts []byte) error {
var resp GenerateResponse
@@ -226,15 +204,8 @@ func (c *Client) Generate(ctx context.Context, req *GenerateRequest, fn Generate
})
}
// ChatResponseFunc is a function that [Client.Chat] invokes every time
// a response is received from the service. If this function returns an error,
// [Client.Chat] will stop generating and return this error.
type ChatResponseFunc func(ChatResponse) error
// Chat generates the next message in a chat. [ChatRequest] may contain a
// sequence of messages which can be used to maintain chat history with a model.
// fn is called for each response (there may be multiple responses, e.g. if case
// streaming is enabled).
func (c *Client) Chat(ctx context.Context, req *ChatRequest, fn ChatResponseFunc) error {
return c.stream(ctx, http.MethodPost, "/api/chat", req, func(bts []byte) error {
var resp ChatResponse
@@ -246,14 +217,8 @@ func (c *Client) Chat(ctx context.Context, req *ChatRequest, fn ChatResponseFunc
})
}
// PullProgressFunc is a function that [Client.Pull] invokes every time there
// is progress with a "pull" request sent to the service. If this function
// returns an error, [Client.Pull] will stop the process and return this error.
type PullProgressFunc func(ProgressResponse) error
// Pull downloads a model from the ollama library. fn is called each time
// progress is made on the request and can be used to display a progress bar,
// etc.
func (c *Client) Pull(ctx context.Context, req *PullRequest, fn PullProgressFunc) error {
return c.stream(ctx, http.MethodPost, "/api/pull", req, func(bts []byte) error {
var resp ProgressResponse
@@ -336,7 +301,18 @@ func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*Embedd
}
func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
return c.do(ctx, http.MethodPost, fmt.Sprintf("/api/blobs/%s", digest), r, nil)
if err := c.do(ctx, http.MethodHead, fmt.Sprintf("/api/blobs/%s", digest), nil, nil); err != nil {
var statusError StatusError
if !errors.As(err, &statusError) || statusError.StatusCode != http.StatusNotFound {
return err
}
if err := c.do(ctx, http.MethodPost, fmt.Sprintf("/api/blobs/%s", digest), r, nil); err != nil {
return err
}
}
return nil
}
func (c *Client) Version(ctx context.Context) (string, error) {

View File

@@ -33,46 +33,18 @@ func (e StatusError) Error() string {
type ImageData []byte
// GenerateRequest describes a request sent by [Client.Generate]. While you
// have to specify the Model and Prompt fields, all the other fields have
// reasonable defaults for basic uses.
type GenerateRequest struct {
// Model is the model name; it should be a name familiar to Ollama from
// the library at https://ollama.com/library
Model string `json:"model"`
Model string `json:"model"`
Prompt string `json:"prompt"`
System string `json:"system"`
Template string `json:"template"`
Context []int `json:"context,omitempty"`
Stream *bool `json:"stream,omitempty"`
Raw bool `json:"raw,omitempty"`
Format string `json:"format"`
KeepAlive *Duration `json:"keep_alive,omitempty"`
Images []ImageData `json:"images,omitempty"`
// Prompt is the textual prompt to send to the model.
Prompt string `json:"prompt"`
// System overrides the model's default system message/prompt.
System string `json:"system"`
// Template overrides the model's default prompt template.
Template string `json:"template"`
// Context is the context parameter returned from a previous call to
// Generate call. It can be used to keep a short conversational memory.
Context []int `json:"context,omitempty"`
// Stream specifies whether the response is streaming; it is true by default.
Stream *bool `json:"stream,omitempty"`
// Raw set to true means that no formatting will be applied to the prompt.
Raw bool `json:"raw,omitempty"`
// Format specifies the format to return a response in.
Format string `json:"format"`
// KeepAlive controls how long the model will stay loaded in memory following
// this request.
KeepAlive *Duration `json:"keep_alive,omitempty"`
// Images is an optional list of base64-encoded images accompanying this
// request, for multimodal models.
Images []ImageData `json:"images,omitempty"`
// Options lists model-specific options. For example, temperature can be
// set through this field, if the model supports it.
Options map[string]interface{} `json:"options"`
}
@@ -137,24 +109,21 @@ type Options struct {
// Runner options which must be set when the model is loaded into memory
type Runner struct {
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGQA int `json:"num_gqa,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
NumThread int `json:"num_thread,omitempty"`
// Unused: RopeFrequencyBase is ignored. Instead the value in the model will be used
RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
// Unused: RopeFrequencyScale is ignored. Instead the value in the model will be used
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGQA int `json:"num_gqa,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
NumThread int `json:"num_thread,omitempty"`
}
type EmbeddingRequest struct {
@@ -170,11 +139,10 @@ type EmbeddingResponse struct {
}
type CreateRequest struct {
Model string `json:"model"`
Path string `json:"path"`
Modelfile string `json:"modelfile"`
Stream *bool `json:"stream,omitempty"`
Quantization string `json:"quantization,omitempty"`
Model string `json:"model"`
Path string `json:"path"`
Modelfile string `json:"modelfile"`
Stream *bool `json:"stream,omitempty"`
// Name is deprecated, see Model
Name string `json:"name"`
@@ -414,16 +382,18 @@ func DefaultOptions() Options {
Runner: Runner{
// options set when the model is loaded
NumCtx: 2048,
NumBatch: 512,
NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
NumGQA: 1,
NumThread: 0, // let the runtime decide
LowVRAM: false,
F16KV: true,
UseMLock: false,
UseMMap: true,
UseNUMA: false,
NumCtx: 2048,
RopeFrequencyBase: 10000.0,
RopeFrequencyScale: 1.0,
NumBatch: 512,
NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
NumGQA: 1,
NumThread: 0, // let the runtime decide
LowVRAM: false,
F16KV: true,
UseMLock: false,
UseMMap: true,
UseNUMA: false,
},
}
}

View File

@@ -83,38 +83,6 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
io.Copy(logFile, stderr) //nolint:errcheck
}()
// Re-wire context done behavior to attempt a graceful shutdown of the server
cmd.Cancel = func() error {
if cmd.Process != nil {
err := terminate(cmd)
if err != nil {
slog.Warn("error trying to gracefully terminate server", "err", err)
return cmd.Process.Kill()
}
tick := time.NewTicker(10 * time.Millisecond)
defer tick.Stop()
for {
select {
case <-tick.C:
exited, err := isProcessExited(cmd.Process.Pid)
if err != nil {
return err
}
if exited {
return nil
}
case <-time.After(5 * time.Second):
slog.Warn("graceful server shutdown timeout, killing", "pid", cmd.Process.Pid)
return cmd.Process.Kill()
}
}
}
return nil
}
// run the command and wait for it to finish
if err := cmd.Start(); err != nil {
return done, fmt.Errorf("failed to start server %w", err)
@@ -137,7 +105,7 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
select {
case <-ctx.Done():
slog.Info(fmt.Sprintf("server shutdown with exit code %d", code))
slog.Debug(fmt.Sprintf("server shutdown with exit code %d", code))
done <- code
return
default:

View File

@@ -4,35 +4,9 @@ package lifecycle
import (
"context"
"errors"
"fmt"
"os"
"os/exec"
"syscall"
)
func getCmd(ctx context.Context, cmd string) *exec.Cmd {
return exec.CommandContext(ctx, cmd, "serve")
}
func terminate(cmd *exec.Cmd) error {
return cmd.Process.Signal(os.Interrupt)
}
func isProcessExited(pid int) (bool, error) {
proc, err := os.FindProcess(pid)
if err != nil {
return false, fmt.Errorf("failed to find process: %v", err)
}
err = proc.Signal(syscall.Signal(0))
if err != nil {
if errors.Is(err, os.ErrProcessDone) || errors.Is(err, syscall.ESRCH) {
return true, nil
}
return false, fmt.Errorf("error signaling process: %v", err)
}
return false, nil
}

View File

@@ -2,88 +2,12 @@ package lifecycle
import (
"context"
"fmt"
"os/exec"
"syscall"
"golang.org/x/sys/windows"
)
func getCmd(ctx context.Context, exePath string) *exec.Cmd {
cmd := exec.CommandContext(ctx, exePath, "serve")
cmd.SysProcAttr = &syscall.SysProcAttr{
HideWindow: true,
CreationFlags: windows.CREATE_NEW_PROCESS_GROUP,
}
cmd.SysProcAttr = &syscall.SysProcAttr{HideWindow: true, CreationFlags: 0x08000000}
return cmd
}
func terminate(cmd *exec.Cmd) error {
dll, err := windows.LoadDLL("kernel32.dll")
if err != nil {
return err
}
defer dll.Release() // nolint: errcheck
pid := cmd.Process.Pid
f, err := dll.FindProc("AttachConsole")
if err != nil {
return err
}
r1, _, err := f.Call(uintptr(pid))
if r1 == 0 && err != syscall.ERROR_ACCESS_DENIED {
return err
}
f, err = dll.FindProc("SetConsoleCtrlHandler")
if err != nil {
return err
}
r1, _, err = f.Call(0, 1)
if r1 == 0 {
return err
}
f, err = dll.FindProc("GenerateConsoleCtrlEvent")
if err != nil {
return err
}
r1, _, err = f.Call(windows.CTRL_BREAK_EVENT, uintptr(pid))
if r1 == 0 {
return err
}
r1, _, err = f.Call(windows.CTRL_C_EVENT, uintptr(pid))
if r1 == 0 {
return err
}
return nil
}
const STILL_ACTIVE = 259
func isProcessExited(pid int) (bool, error) {
hProcess, err := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION, false, uint32(pid))
if err != nil {
return false, fmt.Errorf("failed to open process: %v", err)
}
defer windows.CloseHandle(hProcess) // nolint: errcheck
var exitCode uint32
err = windows.GetExitCodeProcess(hProcess, &exitCode)
if err != nil {
return false, fmt.Errorf("failed to get exit code: %v", err)
}
if exitCode == STILL_ACTIVE {
return false, nil
}
return true, nil
}

View File

@@ -24,5 +24,10 @@ func NewTray() (commontray.OllamaTray, error) {
return nil, fmt.Errorf("failed to load icon %s: %w", iconName, err)
}
return InitPlatformTray(icon, updateIcon)
tray, err := InitPlatformTray(icon, updateIcon)
if err != nil {
return nil, err
}
return tray, nil
}

View File

@@ -105,48 +105,24 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
zf := zip.NewWriter(tf)
files := []string{}
tfiles, err := filepath.Glob(filepath.Join(path, "pytorch_model-*.bin"))
files, err := filepath.Glob(filepath.Join(path, "model-*.safetensors"))
if err != nil {
return err
} else if len(tfiles) == 0 {
tfiles, err = filepath.Glob(filepath.Join(path, "model-*.safetensors"))
if err != nil {
return err
}
}
files = append(files, tfiles...)
if len(files) == 0 {
return fmt.Errorf("no models were found in '%s'", path)
return fmt.Errorf("no safetensors files were found in '%s'", path)
}
// add the safetensor/torch config file + tokenizer
// add the safetensor config file + tokenizer
files = append(files, filepath.Join(path, "config.json"))
files = append(files, filepath.Join(path, "params.json"))
files = append(files, filepath.Join(path, "added_tokens.json"))
files = append(files, filepath.Join(path, "tokenizer.model"))
for _, fn := range files {
f, err := os.Open(fn)
// just skip whatever files aren't there
if os.IsNotExist(err) {
if strings.HasSuffix(fn, "tokenizer.model") {
// try the parent dir before giving up
parentDir := filepath.Dir(path)
newFn := filepath.Join(parentDir, "tokenizer.model")
f, err = os.Open(newFn)
if os.IsNotExist(err) {
continue
} else if err != nil {
return err
}
} else {
continue
}
if os.IsNotExist(err) && strings.HasSuffix(fn, "added_tokens.json") {
continue
} else if err != nil {
return err
}
@@ -218,9 +194,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
return nil
}
quantization, _ := cmd.Flags().GetString("quantization")
request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile), Quantization: quantization}
request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile)}
if err := client.Create(cmd.Context(), &request, fn); err != nil {
return err
}
@@ -239,10 +213,7 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
if _, err := io.Copy(hash, bin); err != nil {
return "", err
}
if _, err := bin.Seek(0, io.SeekStart); err != nil {
return "", err
}
bin.Seek(0, io.SeekStart)
digest := fmt.Sprintf("sha256:%x", hash.Sum(nil))
if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil {
@@ -961,7 +932,6 @@ func NewCLI() *cobra.Command {
}
createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")")
createCmd.Flags().StringP("quantization", "q", "", "Quantization level.")
showCmd := &cobra.Command{
Use: "show MODEL",
@@ -1003,7 +973,6 @@ Environment Variables:
OLLAMA_ORIGINS A comma separated list of allowed origins.
OLLAMA_MODELS The path to the models directory (default is "~/.ollama/models")
OLLAMA_KEEP_ALIVE The duration that models stay loaded in memory (default is "5m")
OLLAMA_DEBUG Set to 1 to enable additional debug logging
`)
pullCmd := &cobra.Command{

View File

@@ -295,14 +295,10 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.WordWrap = false
fmt.Println("Set 'nowordwrap' mode.")
case "verbose":
if err := cmd.Flags().Set("verbose", "true"); err != nil {
return err
}
cmd.Flags().Set("verbose", "true")
fmt.Println("Set 'verbose' mode.")
case "quiet":
if err := cmd.Flags().Set("verbose", "false"); err != nil {
return err
}
cmd.Flags().Set("verbose", "false")
fmt.Println("Set 'quiet' mode.")
case "format":
if len(args) < 3 || args[2] != "json" {

View File

@@ -1,16 +1,19 @@
package convert
import (
"bytes"
"cmp"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"regexp"
"slices"
"strings"
"github.com/mitchellh/mapstructure"
"google.golang.org/protobuf/proto"
"github.com/ollama/ollama/convert/sentencepiece"
@@ -27,58 +30,137 @@ type Params struct {
AttentionHeads int `json:"num_attention_heads"` // n_head
KeyValHeads int `json:"num_key_value_heads"`
NormEPS float64 `json:"rms_norm_eps"`
RopeFreqBase float64 `json:"rope_theta"`
BoSTokenID int `json:"bos_token_id"`
EoSTokenID int `json:"eos_token_id"`
HeadDimension int `json:"head_dim"`
PaddingTokenID int `json:"pad_token_id"`
ByteOrder
}
type ByteOrder interface {
binary.ByteOrder
binary.AppendByteOrder
type MetaData struct {
Type string `mapstructure:"dtype"`
Shape []int `mapstructure:"shape"`
Offsets []int `mapstructure:"data_offsets"`
}
type ModelArch interface {
GetTensors() error
LoadVocab() error
WriteGGUF() (string, error)
func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
f, err := os.Open(fn)
if err != nil {
return []llm.Tensor{}, 0, err
}
defer f.Close()
var jsonSize uint64
binary.Read(f, binary.LittleEndian, &jsonSize)
buf := make([]byte, jsonSize)
_, err = io.ReadFull(f, buf)
if err != nil {
return []llm.Tensor{}, 0, err
}
d := json.NewDecoder(bytes.NewBuffer(buf))
d.UseNumber()
var parsed map[string]interface{}
if err = d.Decode(&parsed); err != nil {
return []llm.Tensor{}, 0, err
}
var keys []string
for k := range parsed {
keys = append(keys, k)
}
slices.Sort(keys)
slog.Info("converting layers")
var tensors []llm.Tensor
for _, k := range keys {
vals := parsed[k].(map[string]interface{})
var data MetaData
if err = mapstructure.Decode(vals, &data); err != nil {
return []llm.Tensor{}, 0, err
}
var size uint64
var kind uint32
switch len(data.Shape) {
case 0:
// metadata
continue
case 1:
// convert to float32
kind = 0
size = uint64(data.Shape[0] * 4)
case 2:
// convert to float16
kind = 1
size = uint64(data.Shape[0] * data.Shape[1] * 2)
}
ggufName, err := GetTensorName(k)
if err != nil {
slog.Error("%v", err)
return []llm.Tensor{}, 0, err
}
shape := []uint64{0, 0, 0, 0}
for i := range data.Shape {
shape[i] = uint64(data.Shape[i])
}
t := llm.Tensor{
Name: ggufName,
Kind: kind,
Offset: offset,
Shape: shape[:],
FileName: fn,
OffsetPadding: 8 + jsonSize,
FileOffsets: []uint64{uint64(data.Offsets[0]), uint64(data.Offsets[1])},
}
slog.Debug(fmt.Sprintf("%v", t))
tensors = append(tensors, t)
offset += size
}
return tensors, offset, nil
}
type ModelFormat interface {
GetLayerName(string) (string, error)
GetTensors(string, *Params) ([]llm.Tensor, error)
GetParams(string) (*Params, error)
GetModelArch(string, string, *Params) (ModelArch, error)
func GetSafeTensors(dirpath string) ([]llm.Tensor, error) {
var tensors []llm.Tensor
files, err := filepath.Glob(filepath.Join(dirpath, "/model-*.safetensors"))
if err != nil {
return []llm.Tensor{}, err
}
var offset uint64
for _, f := range files {
var t []llm.Tensor
var err error
t, offset, err = ReadSafeTensors(f, offset)
if err != nil {
slog.Error("%v", err)
return []llm.Tensor{}, err
}
tensors = append(tensors, t...)
}
return tensors, nil
}
type ModelData struct {
Path string
Name string
Params *Params
Vocab *Vocab
Tensors []llm.Tensor
Format ModelFormat
}
func GetParams(dirpath string) (*Params, error) {
f, err := os.Open(filepath.Join(dirpath, "config.json"))
if err != nil {
return nil, err
}
defer f.Close()
func GetModelFormat(dirname string) (ModelFormat, error) {
files, err := filepath.Glob(filepath.Join(dirname, "*"))
var params Params
d := json.NewDecoder(f)
err = d.Decode(&params)
if err != nil {
return nil, err
}
for _, fn := range files {
slog.Debug(fmt.Sprintf("file = %s", fn))
if strings.HasSuffix(fn, ".safetensors") {
return &SafetensorFormat{}, nil
} else if strings.HasSuffix(fn, ".bin") {
slog.Debug("model is torch")
return &TorchFormat{}, nil
}
}
return nil, fmt.Errorf("couldn't determine model format")
return &params, nil
}
// Details on gguf's tokenizer can be found at:
@@ -89,7 +171,7 @@ type Vocab struct {
Types []int32
}
func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
func LoadTokens(dirpath string) (*Vocab, error) {
slog.Info(fmt.Sprintf("reading vocab from %s", filepath.Join(dirpath, "tokenizer.model")))
in, err := os.ReadFile(filepath.Join(dirpath, "tokenizer.model"))
if err != nil {
@@ -114,14 +196,6 @@ func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
v.Tokens = append(v.Tokens, p.GetPiece())
v.Scores = append(v.Scores, p.GetScore())
t := p.GetType()
switch t {
case sentencepiece.ModelProto_SentencePiece_UNKNOWN:
case sentencepiece.ModelProto_SentencePiece_CONTROL:
case sentencepiece.ModelProto_SentencePiece_UNUSED:
case sentencepiece.ModelProto_SentencePiece_BYTE:
default:
t = sentencepiece.ModelProto_SentencePiece_NORMAL
}
v.Types = append(v.Types, int32(t))
}
@@ -169,15 +243,89 @@ func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
}
slog.Info(fmt.Sprintf("vocab size w/ extra tokens: %d", len(v.Tokens)))
if params.VocabSize > len(v.Tokens) {
missingTokens := params.VocabSize - len(v.Tokens)
slog.Warn(fmt.Sprintf("vocab is missing %d tokens", missingTokens))
for cnt := 0; cnt < missingTokens; cnt++ {
v.Tokens = append(v.Tokens, fmt.Sprintf("<dummy%05d>", cnt+1))
v.Scores = append(v.Scores, -1)
v.Types = append(v.Types, int32(llm.GGUFTokenUserDefined))
return v, nil
}
func GetTensorName(n string) (string, error) {
tMap := map[string]string{
"model.embed_tokens.weight": "token_embd.weight",
"model.layers.(\\d+).input_layernorm.weight": "blk.$1.attn_norm.weight",
"model.layers.(\\d+).mlp.down_proj.weight": "blk.$1.ffn_down.weight",
"model.layers.(\\d+).mlp.gate_proj.weight": "blk.$1.ffn_gate.weight",
"model.layers.(\\d+).mlp.up_proj.weight": "blk.$1.ffn_up.weight",
"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
"model.layers.(\\d+).self_attn.k_proj.weight": "blk.$1.attn_k.weight",
"model.layers.(\\d+).self_attn.o_proj.weight": "blk.$1.attn_output.weight",
"model.layers.(\\d+).self_attn.q_proj.weight": "blk.$1.attn_q.weight",
"model.layers.(\\d+).self_attn.v_proj.weight": "blk.$1.attn_v.weight",
"lm_head.weight": "output.weight",
"model.norm.weight": "output_norm.weight",
}
v, ok := tMap[n]
if ok {
return v, nil
}
// quick hack to rename the layers to gguf format
for k, v := range tMap {
re := regexp.MustCompile(k)
newName := re.ReplaceAllString(n, v)
if newName != n {
return newName, nil
}
}
return v, nil
return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
}
func WriteGGUF(name string, tensors []llm.Tensor, params *Params, vocab *Vocab) (string, error) {
c := llm.ContainerGGUF{
ByteOrder: binary.LittleEndian,
}
m := llm.NewGGUFModel(&c)
m.Tensors = tensors
m.KV["general.architecture"] = "llama"
m.KV["general.name"] = name
m.KV["llama.context_length"] = uint32(params.ContextSize)
m.KV["llama.embedding_length"] = uint32(params.HiddenSize)
m.KV["llama.block_count"] = uint32(params.HiddenLayers)
m.KV["llama.feed_forward_length"] = uint32(params.IntermediateSize)
m.KV["llama.rope.dimension_count"] = uint32(128)
m.KV["llama.attention.head_count"] = uint32(params.AttentionHeads)
m.KV["llama.attention.head_count_kv"] = uint32(params.KeyValHeads)
m.KV["llama.attention.layer_norm_rms_epsilon"] = float32(params.NormEPS)
m.KV["llama.rope.freq_base"] = float32(params.RopeFreqBase)
m.KV["general.file_type"] = uint32(1)
m.KV["tokenizer.ggml.model"] = "llama"
m.KV["tokenizer.ggml.tokens"] = vocab.Tokens
m.KV["tokenizer.ggml.scores"] = vocab.Scores
m.KV["tokenizer.ggml.token_type"] = vocab.Types
m.KV["tokenizer.ggml.bos_token_id"] = uint32(params.BoSTokenID)
m.KV["tokenizer.ggml.eos_token_id"] = uint32(params.EoSTokenID)
m.KV["tokenizer.ggml.unknown_token_id"] = uint32(0)
m.KV["tokenizer.ggml.add_bos_token"] = true
m.KV["tokenizer.ggml.add_eos_token"] = false
// llamacpp sets the chat template, however we don't need to set it since we pass it in through a layer
// m.KV["tokenizer.chat_template"] = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" // XXX removeme
c.V3.NumTensor = uint64(len(tensors))
c.V3.NumKV = uint64(len(m.KV))
f, err := os.CreateTemp("", "ollama-gguf")
if err != nil {
return "", err
}
defer f.Close()
err = m.Encode(f)
if err != nil {
return "", err
}
return f.Name(), nil
}

View File

@@ -1,137 +0,0 @@
package convert
import (
"encoding/binary"
"fmt"
"io"
"log/slog"
"os"
"strings"
"github.com/d4l3k/go-bfloat16"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/llm"
)
type GemmaModel struct {
ModelData
}
func gemmaLayerHandler(w io.Writer, r safetensorWriterTo, f *os.File) error {
slog.Debug(fmt.Sprintf("converting '%s'", r.t.Name))
data := make([]byte, r.end-r.start)
if err := binary.Read(f, r.bo, data); err != nil {
return err
}
tDataF32 := bfloat16.DecodeFloat32(data)
var err error
tDataF32, err = addOnes(tDataF32, int(r.t.Shape[0]))
if err != nil {
return err
}
if err := binary.Write(w, r.bo, tDataF32); err != nil {
return err
}
return nil
}
func addOnes(data []float32, vectorSize int) ([]float32, error) {
n := tensor.New(tensor.WithShape(vectorSize), tensor.WithBacking(data))
ones := tensor.Ones(tensor.Float32, vectorSize)
var err error
n, err = n.Add(ones)
if err != nil {
return []float32{}, err
}
newN, err := native.SelectF32(n, 0)
if err != nil {
return []float32{}, err
}
var fullTensor []float32
for _, v := range newN {
fullTensor = append(fullTensor, v...)
}
return fullTensor, nil
}
func (m *GemmaModel) GetTensors() error {
t, err := m.Format.GetTensors(m.Path, m.Params)
if err != nil {
return err
}
slog.Debug(fmt.Sprintf("Total tensors: %d", len(t)))
m.Tensors = []llm.Tensor{}
for _, l := range t {
if strings.HasSuffix(l.Name, "norm.weight") {
wt := l.WriterTo.(safetensorWriterTo)
wt.handler = gemmaLayerHandler
l.WriterTo = wt
}
m.Tensors = append(m.Tensors, l)
}
return nil
}
func (m *GemmaModel) LoadVocab() error {
v, err := LoadSentencePieceTokens(m.Path, m.Params)
if err != nil {
return err
}
m.Vocab = v
return nil
}
func (m *GemmaModel) WriteGGUF() (string, error) {
kv := llm.KV{
"general.architecture": "gemma",
"general.name": m.Name,
"gemma.context_length": uint32(m.Params.ContextSize),
"gemma.embedding_length": uint32(m.Params.HiddenSize),
"gemma.block_count": uint32(m.Params.HiddenLayers),
"gemma.feed_forward_length": uint32(m.Params.IntermediateSize),
"gemma.attention.head_count": uint32(m.Params.AttentionHeads),
"gemma.attention.head_count_kv": uint32(m.Params.KeyValHeads),
"gemma.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
"gemma.attention.key_length": uint32(m.Params.HeadDimension),
"gemma.attention.value_length": uint32(m.Params.HeadDimension),
"general.file_type": uint32(1),
"tokenizer.ggml.model": "llama",
"tokenizer.ggml.tokens": m.Vocab.Tokens,
"tokenizer.ggml.scores": m.Vocab.Scores,
"tokenizer.ggml.token_type": m.Vocab.Types,
"tokenizer.ggml.bos_token_id": uint32(m.Params.BoSTokenID),
"tokenizer.ggml.eos_token_id": uint32(m.Params.EoSTokenID),
"tokenizer.ggml.padding_token_id": uint32(m.Params.PaddingTokenID),
"tokenizer.ggml.unknown_token_id": uint32(3),
"tokenizer.ggml.add_bos_token": true,
"tokenizer.ggml.add_eos_token": false,
}
f, err := os.CreateTemp("", "ollama-gguf")
if err != nil {
return "", err
}
defer f.Close()
mod := llm.NewGGUFV3(m.Params.ByteOrder)
if err := mod.Encode(f, kv, m.Tensors); err != nil {
return "", err
}
return f.Name(), nil
}

View File

@@ -1,176 +0,0 @@
package convert
import (
"encoding/binary"
"fmt"
"io"
"log/slog"
"os"
"regexp"
"strings"
"github.com/nlpodyssey/gopickle/pytorch"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/x448/float16"
"github.com/ollama/ollama/llm"
)
type LlamaModel struct {
ModelData
}
func llamaLayerHandler(w io.Writer, r torchWriterTo) error {
slog.Debug(fmt.Sprintf("repacking layer '%s'", r.t.Name))
data := r.storage.(*pytorch.HalfStorage).Data
tData := make([]uint16, len(data))
for cnt, v := range data {
tData[cnt] = uint16(float16.Fromfloat32(v))
}
var err error
var heads uint32
if strings.Contains(r.t.Name, "attn_q") {
heads = uint32(r.params.AttentionHeads)
} else if strings.Contains(r.t.Name, "attn_k") {
heads = uint32(r.params.KeyValHeads)
if heads == 0 {
heads = uint32(r.params.AttentionHeads)
}
} else {
return fmt.Errorf("unknown layer type")
}
slog.Debug(fmt.Sprintf("heads = %d", heads))
tData, err = llamaRepack(tData, int(heads), r.t.Shape)
if err != nil {
return err
}
if err = binary.Write(w, r.bo, tData); err != nil {
return err
}
return nil
}
func llamaRepack(data []uint16, heads int, shape []uint64) ([]uint16, error) {
n := tensor.New(tensor.WithShape(int(shape[0]), int(shape[1])), tensor.WithBacking(data))
origShape := n.Shape().Clone()
// reshape the tensor and swap axes 1 and 2 to unpack the layer for gguf
if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil {
return nil, err
}
if err := n.T(0, 2, 1, 3); err != nil {
return nil, err
}
if err := n.Reshape(origShape...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
newN, err := native.SelectU16(n, 1)
if err != nil {
return nil, err
}
var fullTensor []uint16
for _, v := range newN {
fullTensor = append(fullTensor, v...)
}
return fullTensor, nil
}
func (m *LlamaModel) GetTensors() error {
t, err := m.Format.GetTensors(m.Path, m.Params)
if err != nil {
return err
}
m.Tensors = []llm.Tensor{}
pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
re, err := regexp.Compile(pattern)
if err != nil {
return err
}
for _, l := range t {
matches := re.FindAllStringSubmatch(l.Name, -1)
if len(matches) > 0 {
slog.Debug(fmt.Sprintf("setting handler for: %s", l.Name))
wt := l.WriterTo.(torchWriterTo)
wt.handler = llamaLayerHandler
l.WriterTo = wt
}
m.Tensors = append(m.Tensors, l)
}
return nil
}
func (m *LlamaModel) LoadVocab() error {
var v *Vocab
var err error
slog.Debug("loading vocab")
v, err = LoadSentencePieceTokens(m.Path, m.Params)
if err != nil {
return err
}
slog.Debug("vocab loaded")
m.Vocab = v
return nil
}
func (m *LlamaModel) WriteGGUF() (string, error) {
kv := llm.KV{
"general.architecture": "llama",
"general.name": m.Name,
"llama.vocab_size": uint32(len(m.Vocab.Tokens)),
"llama.context_length": uint32(m.Params.ContextSize),
"llama.embedding_length": uint32(m.Params.HiddenSize),
"llama.block_count": uint32(m.Params.HiddenLayers),
"llama.feed_forward_length": uint32(m.Params.IntermediateSize),
"llama.rope.dimension_count": uint32(m.Params.HiddenSize / m.Params.AttentionHeads),
"llama.attention.head_count": uint32(m.Params.AttentionHeads),
"llama.attention.head_count_kv": uint32(m.Params.KeyValHeads),
"llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
"general.file_type": uint32(1),
"tokenizer.ggml.model": "llama",
"tokenizer.ggml.tokens": m.Vocab.Tokens,
"tokenizer.ggml.scores": m.Vocab.Scores,
"tokenizer.ggml.token_type": m.Vocab.Types,
"tokenizer.ggml.bos_token_id": uint32(m.Params.BoSTokenID),
"tokenizer.ggml.eos_token_id": uint32(m.Params.EoSTokenID),
"tokenizer.ggml.unknown_token_id": uint32(0),
"tokenizer.ggml.add_bos_token": true,
"tokenizer.ggml.add_eos_token": false,
}
f, err := os.CreateTemp("", "ollama-gguf")
if err != nil {
return "", err
}
defer f.Close()
mod := llm.NewGGUFV3(m.Params.ByteOrder)
if err := mod.Encode(f, kv, m.Tensors); err != nil {
return "", err
}
slog.Debug(fmt.Sprintf("gguf file = %s", f.Name()))
return f.Name(), nil
}

View File

@@ -1,173 +0,0 @@
package convert
import (
"encoding/binary"
"fmt"
"io"
"os"
"regexp"
"strings"
"github.com/d4l3k/go-bfloat16"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/x448/float16"
"github.com/ollama/ollama/llm"
)
type MistralModel struct {
ModelData
}
func mistralLayerHandler(w io.Writer, r safetensorWriterTo, f *os.File) error {
layerSize := r.end - r.start
var err error
tData := make([]uint16, layerSize/2)
if err = binary.Read(f, r.bo, tData); err != nil {
return err
}
var heads uint32
if strings.Contains(r.t.Name, "attn_q") {
heads = uint32(r.params.AttentionHeads)
} else if strings.Contains(r.t.Name, "attn_k") {
heads = uint32(r.params.KeyValHeads)
if heads == 0 {
heads = uint32(r.params.AttentionHeads)
}
} else {
return fmt.Errorf("unknown layer type")
}
tData, err = repack(tData, int(heads), r.t.Shape)
if err != nil {
return err
}
var buf []byte
for _, n := range tData {
buf = r.bo.AppendUint16(buf, n)
}
tempBuf := make([]uint16, len(tData))
tDataF32 := bfloat16.DecodeFloat32(buf)
for cnt, v := range tDataF32 {
tDataF16 := float16.Fromfloat32(v)
tempBuf[cnt] = uint16(tDataF16)
}
if err = binary.Write(w, r.bo, tempBuf); err != nil {
return err
}
return nil
}
func repack(data []uint16, heads int, shape []uint64) ([]uint16, error) {
n := tensor.New(tensor.WithShape(int(shape[0]), int(shape[1])), tensor.WithBacking(data))
origShape := n.Shape().Clone()
// reshape the tensor and swap axes 1 and 2 to unpack the layer for gguf
if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil {
return nil, err
}
if err := n.T(0, 2, 1, 3); err != nil {
return nil, err
}
if err := n.Reshape(origShape...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
newN, err := native.SelectU16(n, 1)
if err != nil {
return nil, err
}
var fullTensor []uint16
for _, v := range newN {
fullTensor = append(fullTensor, v...)
}
return fullTensor, nil
}
func (m *MistralModel) GetTensors() error {
t, err := m.Format.GetTensors(m.Path, m.Params)
if err != nil {
return err
}
m.Tensors = []llm.Tensor{}
pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
re, err := regexp.Compile(pattern)
if err != nil {
return err
}
for _, l := range t {
matches := re.FindAllStringSubmatch(l.Name, -1)
if len(matches) > 0 {
wt := l.WriterTo.(safetensorWriterTo)
wt.handler = mistralLayerHandler
l.WriterTo = wt
}
m.Tensors = append(m.Tensors, l)
}
return nil
}
func (m *MistralModel) LoadVocab() error {
v, err := LoadSentencePieceTokens(m.Path, m.Params)
if err != nil {
return err
}
m.Vocab = v
return nil
}
func (m *MistralModel) WriteGGUF() (string, error) {
kv := llm.KV{
"general.architecture": "llama",
"general.name": m.Name,
"llama.context_length": uint32(m.Params.ContextSize),
"llama.embedding_length": uint32(m.Params.HiddenSize),
"llama.block_count": uint32(m.Params.HiddenLayers),
"llama.feed_forward_length": uint32(m.Params.IntermediateSize),
"llama.rope.dimension_count": uint32(m.Params.HiddenSize / m.Params.AttentionHeads),
"llama.attention.head_count": uint32(m.Params.AttentionHeads),
"llama.attention.head_count_kv": uint32(m.Params.KeyValHeads),
"llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
"general.file_type": uint32(1),
"tokenizer.ggml.model": "llama",
"tokenizer.ggml.tokens": m.Vocab.Tokens,
"tokenizer.ggml.scores": m.Vocab.Scores,
"tokenizer.ggml.token_type": m.Vocab.Types,
"tokenizer.ggml.bos_token_id": uint32(m.Params.BoSTokenID),
"tokenizer.ggml.eos_token_id": uint32(m.Params.EoSTokenID),
"tokenizer.ggml.add_bos_token": true,
"tokenizer.ggml.add_eos_token": false,
"tokenizer.ggml.unknown_token_id": uint32(0),
}
f, err := os.CreateTemp("", "ollama-gguf")
if err != nil {
return "", err
}
defer f.Close()
mod := llm.NewGGUFV3(m.Params.ByteOrder)
if err := mod.Encode(f, kv, m.Tensors); err != nil {
return "", err
}
return f.Name(), nil
}

View File

@@ -1,304 +0,0 @@
package convert
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"regexp"
"slices"
"github.com/d4l3k/go-bfloat16"
"github.com/mitchellh/mapstructure"
"github.com/x448/float16"
"github.com/ollama/ollama/llm"
)
type safetensorWriterTo struct {
t *llm.Tensor
params *Params
bo ByteOrder
filename string
start, end, padding uint64
handler func(w io.Writer, r safetensorWriterTo, f *os.File) error
}
type tensorMetaData struct {
Type string `mapstructure:"dtype"`
Shape []int `mapstructure:"shape"`
Offsets []int `mapstructure:"data_offsets"`
}
type SafetensorFormat struct{}
func (m *SafetensorFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, error) {
slog.Debug("getting tensor data")
var tensors []llm.Tensor
files, err := filepath.Glob(filepath.Join(dirpath, "/model-*.safetensors"))
if err != nil {
return nil, err
}
var offset uint64
for _, f := range files {
var t []llm.Tensor
var err error
t, offset, err = m.readTensors(f, offset, params)
if err != nil {
slog.Error("%v", err)
return nil, err
}
tensors = append(tensors, t...)
}
slog.Debug(fmt.Sprintf("all tensors = %d", len(tensors)))
return tensors, nil
}
func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params) ([]llm.Tensor, uint64, error) {
f, err := os.Open(fn)
if err != nil {
return nil, 0, err
}
defer f.Close()
var jsonSize uint64
if err := binary.Read(f, binary.LittleEndian, &jsonSize); err != nil {
return nil, 0, err
}
buf := make([]byte, jsonSize)
_, err = io.ReadFull(f, buf)
if err != nil {
return nil, 0, err
}
d := json.NewDecoder(bytes.NewBuffer(buf))
d.UseNumber()
var parsed map[string]interface{}
if err = d.Decode(&parsed); err != nil {
return nil, 0, err
}
var keys []string
for k := range parsed {
keys = append(keys, k)
}
slices.Sort(keys)
slog.Info("converting layers")
var tensors []llm.Tensor
for _, k := range keys {
vals := parsed[k].(map[string]interface{})
var data tensorMetaData
if err = mapstructure.Decode(vals, &data); err != nil {
slog.Error("couldn't decode properly")
return nil, 0, err
}
slog.Debug(fmt.Sprintf("metadata = %#v", data))
var size uint64
var kind uint32
switch len(data.Shape) {
case 0:
// metadata
continue
case 1:
// convert to float32
kind = 0
size = uint64(data.Shape[0] * 4)
case 2:
// convert to float16
kind = 1
size = uint64(data.Shape[0] * data.Shape[1] * 2)
}
ggufName, err := m.GetLayerName(k)
if err != nil {
slog.Error("%v", err)
return nil, 0, err
}
shape := []uint64{0, 0, 0, 0}
for i := range data.Shape {
shape[i] = uint64(data.Shape[i])
}
t := llm.Tensor{
Name: ggufName,
Kind: kind,
Offset: offset,
Shape: shape[:],
}
t.WriterTo = safetensorWriterTo{
t: &t,
params: params,
bo: params.ByteOrder,
filename: fn,
start: uint64(data.Offsets[0]),
end: uint64(data.Offsets[1]),
padding: 8 + jsonSize,
}
tensors = append(tensors, t)
offset += size
}
slog.Debug(fmt.Sprintf("total tensors for file = %d", len(tensors)))
slog.Debug(fmt.Sprintf("offset = %d", offset))
return tensors, offset, nil
}
func (m *SafetensorFormat) GetParams(dirpath string) (*Params, error) {
f, err := os.Open(filepath.Join(dirpath, "config.json"))
if err != nil {
return nil, err
}
defer f.Close()
var params Params
d := json.NewDecoder(f)
err = d.Decode(&params)
if err != nil {
return nil, err
}
params.ByteOrder = binary.LittleEndian
return &params, nil
}
func (m *SafetensorFormat) GetLayerName(n string) (string, error) {
directMap := map[string]string{
"model.embed_tokens.weight": "token_embd.weight",
"lm_head.weight": "output.weight",
"model.norm.weight": "output_norm.weight",
}
tMap := map[string]string{
"model.layers.(\\d+).input_layernorm.weight": "blk.$1.attn_norm.weight",
"model.layers.(\\d+).mlp.down_proj.weight": "blk.$1.ffn_down.weight",
"model.layers.(\\d+).mlp.gate_proj.weight": "blk.$1.ffn_gate.weight",
"model.layers.(\\d+).mlp.up_proj.weight": "blk.$1.ffn_up.weight",
"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
"model.layers.(\\d+).self_attn.k_proj.weight": "blk.$1.attn_k.weight",
"model.layers.(\\d+).self_attn.o_proj.weight": "blk.$1.attn_output.weight",
"model.layers.(\\d+).self_attn.q_proj.weight": "blk.$1.attn_q.weight",
"model.layers.(\\d+).self_attn.v_proj.weight": "blk.$1.attn_v.weight",
}
v, ok := directMap[n]
if ok {
return v, nil
}
// quick hack to rename the layers to gguf format
for k, v := range tMap {
re := regexp.MustCompile(k)
newName := re.ReplaceAllString(n, v)
if newName != n {
return newName, nil
}
}
return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
}
func (r safetensorWriterTo) WriteTo(w io.Writer) (n int64, err error) {
f, err := os.Open(r.filename)
if err != nil {
return 0, err
}
defer f.Close()
if _, err = f.Seek(int64(r.padding+r.start), 0); err != nil {
return 0, err
}
// use the handler if one is present
if r.handler != nil {
return 0, r.handler(w, r, f)
}
remaining := r.end - r.start
bufSize := uint64(10240)
var finished bool
for {
data := make([]byte, min(bufSize, remaining))
b, err := io.ReadFull(f, data)
remaining -= uint64(b)
if err == io.EOF || remaining <= 0 {
finished = true
} else if err != nil {
return 0, err
}
// convert bfloat16 -> ieee float32
tDataF32 := bfloat16.DecodeFloat32(data)
switch r.t.Kind {
case 0:
if err := binary.Write(w, r.bo, tDataF32); err != nil {
return 0, err
}
case 1:
// convert float32 -> float16
tempBuf := make([]uint16, len(data)/2)
for cnt, v := range tDataF32 {
tDataF16 := float16.Fromfloat32(v)
tempBuf[cnt] = uint16(tDataF16)
}
if err := binary.Write(w, r.bo, tempBuf); err != nil {
return 0, err
}
}
if finished {
break
}
}
return 0, nil
}
func (m *SafetensorFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) {
switch len(params.Architectures) {
case 0:
return nil, fmt.Errorf("No architecture specified to convert")
case 1:
switch params.Architectures[0] {
case "MistralForCausalLM":
return &MistralModel{
ModelData{
Name: name,
Path: dirPath,
Params: params,
Format: m,
},
}, nil
case "GemmaForCausalLM":
return &GemmaModel{
ModelData{
Name: name,
Path: dirPath,
Params: params,
Format: m,
},
}, nil
default:
return nil, fmt.Errorf("Models based on '%s' are not yet supported", params.Architectures[0])
}
}
return nil, fmt.Errorf("Unknown error")
}

View File

@@ -1,286 +0,0 @@
package convert
import (
"encoding/binary"
"encoding/json"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"regexp"
"strings"
"github.com/nlpodyssey/gopickle/pytorch"
"github.com/nlpodyssey/gopickle/types"
"github.com/x448/float16"
"github.com/ollama/ollama/llm"
)
type torchWriterTo struct {
t *llm.Tensor
params *Params
bo ByteOrder
storage pytorch.StorageInterface
handler func(w io.Writer, r torchWriterTo) error
}
type TorchFormat struct{}
func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, error) {
slog.Debug("getting torch tensors")
files, err := filepath.Glob(filepath.Join(dirpath, "pytorch_model-*.bin"))
if err != nil {
slog.Error("didn't find any torch files")
return nil, err
}
var offset uint64
var tensors []llm.Tensor
for _, fn := range files {
m, err := pytorch.Load(fn)
if err != nil {
slog.Error(fmt.Sprintf("error unpickling: %q", err))
return []llm.Tensor{}, err
}
for _, k := range m.(*types.Dict).Keys() {
if strings.HasSuffix(k.(string), "self_attn.rotary_emb.inv_freq") {
continue
}
t, _ := m.(*types.Dict).Get(k)
tshape := t.(*pytorch.Tensor).Size
var size uint64
var kind uint32
switch len(tshape) {
case 0:
continue
case 1:
// convert to float32
kind = 0
size = uint64(tshape[0] * 4)
case 2:
// convert to float16
kind = 1
size = uint64(tshape[0] * tshape[1] * 2)
}
ggufName, err := tf.GetLayerName(k.(string))
if err != nil {
slog.Error("%v", err)
return nil, err
}
slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", k.(string), ggufName))
shape := []uint64{0, 0, 0, 0}
for i := range tshape {
shape[i] = uint64(tshape[i])
}
tensor := llm.Tensor{
Name: ggufName,
Kind: kind,
Offset: offset, // calculate the offset
Shape: shape[:],
}
tensor.WriterTo = torchWriterTo{
t: &tensor,
params: params,
bo: params.ByteOrder,
storage: t.(*pytorch.Tensor).Source,
}
tensors = append(tensors, tensor)
offset += size
}
}
return tensors, nil
}
func getAltParams(dirpath string) (*Params, error) {
f, err := os.Open(filepath.Join(dirpath, "params.json"))
if err != nil {
slog.Error("no params.json")
return nil, err
}
defer f.Close()
type TorchParams struct {
HiddenSize int `json:"dim"`
AttentionHeads int `json:"n_heads"`
KeyValHeads int `json:"n_kv_heads"`
HiddenLayers int `json:"n_layers"`
RopeTheta int `json:"rope_theta"`
NormEPS float64 `json:"norm_eps"`
}
var tparams TorchParams
d := json.NewDecoder(f)
err = d.Decode(&tparams)
if err != nil {
return nil, err
}
params := &Params{
HiddenSize: tparams.HiddenSize,
AttentionHeads: tparams.AttentionHeads,
KeyValHeads: tparams.KeyValHeads,
HiddenLayers: tparams.HiddenLayers,
NormEPS: tparams.NormEPS,
}
switch {
case tparams.RopeTheta == 1000000:
// Codellama
params.ContextSize = 16384
case tparams.NormEPS == 1e-06:
// llama2
slog.Debug("Found llama2 - setting context size to 4096")
params.ContextSize = 4096
default:
params.ContextSize = 2048
}
params.ByteOrder = binary.LittleEndian
return params, nil
}
func (m *TorchFormat) GetParams(dirpath string) (*Params, error) {
f, err := os.Open(filepath.Join(dirpath, "config.json"))
if err != nil {
if os.IsNotExist(err) {
// try params.json instead
return getAltParams(dirpath)
} else {
return nil, err
}
}
var params Params
d := json.NewDecoder(f)
err = d.Decode(&params)
if err != nil {
return nil, err
}
params.ByteOrder = binary.LittleEndian
return &params, nil
}
func (m *TorchFormat) GetLayerName(n string) (string, error) {
directMap := map[string]string{
"tok_embeddings.weight": "token_embd.weight",
"output.weight": "output.weight",
"norm.weight": "output_norm.weight",
"rope.freqs": "rope_freqs.weight",
"model.embed_tokens.weight": "token_embd.weight",
"lm_head.weight": "output.weight",
"model.norm.weight": "output_norm.weight",
}
lMap := map[string]string{
"layers.(\\d+).attention_norm.weight": "blk.$1.attn_norm.weight",
"layers.(\\d+).attention_output_norm.weight": "blk.$1.attn_norm.weight",
"layers.(\\d+).feed_forward.w2.weight": "blk.$1.ffn_down.weight",
"layers.(\\d+).feed_forward.w1.weight": "blk.$1.ffn_gate.weight",
"layers.(\\d+).feed_forward.w3.weight": "blk.$1.ffn_up.weight",
"layers.(\\d+).ffn_norm.weight": "blk.$1.ffn_norm.weight",
"layers.(\\d+).attention.wk.weight": "blk.$1.attn_k.weight",
"layers.(\\d+).attention.wo.weight": "blk.$1.attn_output.weight",
"layers.(\\d+).attention.wq.weight": "blk.$1.attn_q.weight",
"layers.(\\d+).attention.wv.weight": "blk.$1.attn_v.weight",
"model.layers.(\\d+).input_layernorm.weight": "blk.$1.attn_norm.weight",
"model.layers.(\\d+).mlp.down_proj.weight": "blk.$1.ffn_down.weight",
"model.layers.(\\d+).mlp.gate_proj.weight": "blk.$1.ffn_gate.weight",
"model.layers.(\\d+).mlp.up_proj.weight": "blk.$1.ffn_up.weight",
"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
"model.layers.(\\d+).self_attn.k_proj.weight": "blk.$1.attn_k.weight",
"model.layers.(\\d+).self_attn.o_proj.weight": "blk.$1.attn_output.weight",
"model.layers.(\\d+).self_attn.q_proj.weight": "blk.$1.attn_q.weight",
"model.layers.(\\d+).self_attn.v_proj.weight": "blk.$1.attn_v.weight",
}
v, ok := directMap[n]
if ok {
return v, nil
}
// quick hack to rename the layers to gguf format
for k, v := range lMap {
re := regexp.MustCompile(k)
newName := re.ReplaceAllString(n, v)
if newName != n {
return newName, nil
}
}
return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
}
func (r torchWriterTo) WriteTo(w io.Writer) (n int64, err error) {
// use the handler if one is present
if r.handler != nil {
return 0, r.handler(w, r)
}
switch r.storage.(type) {
case *pytorch.FloatStorage:
slog.Warn(fmt.Sprintf("unexpected storage found for layer '%s'; skipping", r.t.Name))
return 0, nil
case *pytorch.HalfStorage:
switch r.t.Kind {
case 0:
data := r.storage.(*pytorch.HalfStorage).Data
slog.Debug(fmt.Sprintf("%35s F32 (%d)", r.t.Name, len(data)))
if err := binary.Write(w, r.bo, data); err != nil {
return 0, err
}
case 1:
data := r.storage.(*pytorch.HalfStorage).Data
tData := make([]uint16, len(data))
for cnt, v := range data {
tData[cnt] = uint16(float16.Fromfloat32(v))
}
slog.Debug(fmt.Sprintf("%35s F16 (%d)", r.t.Name, len(tData)))
if err := binary.Write(w, r.bo, tData); err != nil {
return 0, err
}
}
}
return 0, nil
}
func (m *TorchFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) {
switch len(params.Architectures) {
case 0:
return nil, fmt.Errorf("No architecture specified to convert")
case 1:
switch params.Architectures[0] {
case "LlamaForCausalLM":
return &LlamaModel{
ModelData{
Name: name,
Path: dirPath,
Params: params,
Format: m,
},
}, nil
default:
return nil, fmt.Errorf("Models based on '%s' are not yet supported", params.Architectures[0])
}
}
return nil, fmt.Errorf("Unknown error")
}

View File

@@ -394,6 +394,7 @@ Advanced parameters (optional):
- `format`: the format to return a response in. Currently the only accepted value is `json`
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)

View File

@@ -139,6 +139,9 @@ PARAMETER <parameter> <parametervalue>
| mirostat_eta | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1) | float | mirostat_eta 0.1 |
| mirostat_tau | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0) | float | mirostat_tau 5.0 |
| num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 |
| num_gqa | The number of GQA groups in the transformer layer. Required for some models, for example it is 8 for llama2:70b | int | num_gqa 1 |
| num_gpu | The number of layers to send to the GPU(s). On macOS it defaults to 1 to enable metal support, 0 to disable. | int | num_gpu 50 |
| num_thread | Sets the number of threads to use during computation. By default, Ollama will detect this for optimal performance. It is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). | int | num_thread 8 |
| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |
| temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 |

View File

@@ -76,10 +76,3 @@ install script which version to install.
```sh
curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
```
## Linux tmp noexec
If your system is configured with the "noexec" flag where Ollama stores its
temporary executable files, you can specify an alternate location by setting
OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example
OLLAMA_TMPDIR=/usr/share/ollama/

View File

@@ -18,7 +18,7 @@ const ollama = new Ollama({
model: "llama2",
});
const answer = await ollama.invoke(`why is the sky blue?`);
const answer = await ollama.call(`why is the sky blue?`);
console.log(answer);
```

View File

@@ -1,51 +0,0 @@
package main
import (
"context"
"fmt"
"log"
"github.com/ollama/ollama/api"
)
func main() {
client, err := api.ClientFromEnvironment()
if err != nil {
log.Fatal(err)
}
messages := []api.Message{
api.Message{
Role: "system",
Content: "Provide very brief, concise responses",
},
api.Message{
Role: "user",
Content: "Name some unusual animals",
},
api.Message{
Role: "assistant",
Content: "Monotreme, platypus, echidna",
},
api.Message{
Role: "user",
Content: "which of these is the most dangerous?",
},
}
ctx := context.Background()
req := &api.ChatRequest{
Model: "llama2",
Messages: messages,
}
respFunc := func(resp api.ChatResponse) error {
fmt.Print(resp.Message.Content)
return nil
}
err = client.Chat(ctx, req, respFunc)
if err != nil {
log.Fatal(err)
}
}

View File

@@ -1,40 +0,0 @@
package main
import (
"context"
"fmt"
"log"
"github.com/ollama/ollama/api"
)
func main() {
client, err := api.ClientFromEnvironment()
if err != nil {
log.Fatal(err)
}
// By default, GenerateRequest is streaming.
req := &api.GenerateRequest{
Model: "gemma",
Prompt: "how many planets are there?",
}
ctx := context.Background()
respFunc := func(resp api.GenerateResponse) error {
// Only print the response here; GenerateResponse has a number of other
// interesting fields you want to examine.
// In streaming mode, responses are partial so we call fmt.Print (and not
// Println) in order to avoid spurious newlines being introduced. The
// model will insert its own newlines if it wants.
fmt.Print(resp.Response)
return nil
}
err = client.Generate(ctx, req, respFunc)
if err != nil {
log.Fatal(err)
}
fmt.Println()
}

View File

@@ -1,37 +0,0 @@
package main
import (
"context"
"fmt"
"log"
"github.com/ollama/ollama/api"
)
func main() {
client, err := api.ClientFromEnvironment()
if err != nil {
log.Fatal(err)
}
req := &api.GenerateRequest{
Model: "gemma",
Prompt: "how many planets are there?",
// set streaming to false
Stream: new(bool),
}
ctx := context.Background()
respFunc := func(resp api.GenerateResponse) error {
// Only print the response here; GenerateResponse has a number of other
// interesting fields you want to examine.
fmt.Println(resp.Response)
return nil
}
err = client.Generate(ctx, req, respFunc)
if err != nil {
log.Fatal(err)
}
}

View File

@@ -1,47 +0,0 @@
package main
import (
"context"
"fmt"
"log"
"os"
"github.com/ollama/ollama/api"
)
func main() {
if len(os.Args) <= 1 {
log.Fatal("usage: <image name>")
}
imgData, err := os.ReadFile(os.Args[1])
if err != nil {
log.Fatal(err)
}
client, err := api.ClientFromEnvironment()
if err != nil {
log.Fatal(err)
}
req := &api.GenerateRequest{
Model: "llava",
Prompt: "describe this image",
Images: []api.ImageData{imgData},
}
ctx := context.Background()
respFunc := func(resp api.GenerateResponse) error {
// In streaming mode, responses are partial so we call fmt.Print (and not
// Println) in order to avoid spurious newlines being introduced. The
// model will insert its own newlines if it wants.
fmt.Print(resp.Response)
return nil
}
err = client.Generate(ctx, req, respFunc)
if err != nil {
log.Fatal(err)
}
fmt.Println()
}

View File

@@ -1,31 +0,0 @@
package main
import (
"context"
"fmt"
"log"
"github.com/ollama/ollama/api"
)
func main() {
client, err := api.ClientFromEnvironment()
if err != nil {
log.Fatal(err)
}
ctx := context.Background()
req := &api.PullRequest{
Model: "mistral",
}
progressFunc := func(resp api.ProgressResponse) error {
fmt.Printf("Progress: status=%v, total=%v, completed=%v\n", resp.Status, resp.Total, resp.Completed)
return nil
}
err = client.Pull(ctx, req, progressFunc)
if err != nil {
log.Fatal(err)
}
}

View File

@@ -6,15 +6,11 @@ import (
)
const (
Byte = 1
Byte = 1
KiloByte = Byte * 1000
MegaByte = KiloByte * 1000
GigaByte = MegaByte * 1000
TeraByte = GigaByte * 1000
KibiByte = Byte * 1024
MebiByte = KibiByte * 1024
)
func HumanBytes(b int64) string {
@@ -49,14 +45,3 @@ func HumanBytes(b int64) string {
return fmt.Sprintf("%d %s", int(value), unit)
}
}
func HumanBytes2(b uint64) string {
switch {
case b >= MebiByte:
return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
case b >= KibiByte:
return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
default:
return fmt.Sprintf("%d B", b)
}
}

9
go.mod
View File

@@ -9,7 +9,7 @@ require (
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/emirpasic/gods v1.18.1
github.com/gin-gonic/gin v1.9.1
github.com/golang/protobuf v1.5.0 // indirect
github.com/golang/protobuf v1.5.0
github.com/google/uuid v1.0.0
github.com/mitchellh/mapstructure v1.5.0
github.com/olekukonko/tablewriter v0.0.5
@@ -19,10 +19,7 @@ require (
golang.org/x/sync v0.3.0
)
require (
github.com/nlpodyssey/gopickle v0.3.0
github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9
)
require github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9
require (
github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc // indirect
@@ -71,7 +68,7 @@ require (
golang.org/x/net v0.17.0 // indirect
golang.org/x/sys v0.13.0
golang.org/x/term v0.13.0
golang.org/x/text v0.14.0 // indirect
golang.org/x/text v0.13.0 // indirect
google.golang.org/protobuf v1.30.0
gopkg.in/yaml.v3 v3.0.1 // indirect
)

6
go.sum
View File

@@ -122,8 +122,6 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/nlpodyssey/gopickle v0.3.0 h1:BLUE5gxFLyyNOPzlXxt6GoHEMMxD0qhsE4p0CIQyoLw=
github.com/nlpodyssey/gopickle v0.3.0/go.mod h1:f070HJ/yR+eLi5WmM1OXJEGaTpuJEUiib19olXgYha0=
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9 h1:DV4iXjNn6fGeDl1AkZ1I0QB/0DBjrc7kPpxHrmuDzW4=
@@ -238,8 +236,8 @@ golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

View File

@@ -100,8 +100,6 @@ func AMDGetGPUInfo(resp *GpuInfo) {
return
}
updateLibPath(libDir)
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
if gfxOverride == "" {
supported, err := GetSupportedGFX(libDir)
@@ -115,7 +113,7 @@ func AMDGetGPUInfo(resp *GpuInfo) {
if !slices.Contains[[]string, string](supported, v.ToGFXString()) {
slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, v.ToGFXString(), libDir, supported))
// TODO - consider discrete markdown just for ROCM troubleshooting?
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
skip[i] = struct{}{}
} else {
slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, v.ToGFXString()))
@@ -145,21 +143,6 @@ func AMDGetGPUInfo(resp *GpuInfo) {
}
}
func updateLibPath(libDir string) {
ldPaths := []string{}
if val, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
ldPaths = strings.Split(val, ":")
}
for _, d := range ldPaths {
if d == libDir {
return
}
}
val := strings.Join(append(ldPaths, libDir), ":")
slog.Debug("updated lib path", "LD_LIBRARY_PATH", val)
os.Setenv("LD_LIBRARY_PATH", val)
}
// Walk the sysfs nodes for the available GPUs and gather information from them
// skipping over any devices in the skip map
func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {

View File

@@ -11,7 +11,6 @@ import (
"strings"
"sync"
"syscall"
"time"
)
var (
@@ -22,20 +21,11 @@ var (
func PayloadsDir() (string, error) {
lock.Lock()
defer lock.Unlock()
var err error
if payloadsDir == "" {
cleanupTmpDirs()
tmpDir := os.Getenv("OLLAMA_TMPDIR")
if tmpDir == "" {
tmpDir, err = os.MkdirTemp("", "ollama")
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
} else {
err = os.MkdirAll(tmpDir, 0755)
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
}
tmpDir, err := os.MkdirTemp("", "ollama")
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
// Track our pid so we can clean up orphaned tmpdirs
@@ -94,12 +84,7 @@ func Cleanup() {
slog.Debug("cleaning up", "dir", tmpDir)
err := os.RemoveAll(tmpDir)
if err != nil {
// On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
time.Sleep(1000 * time.Millisecond)
err = os.RemoveAll(tmpDir)
if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
}
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
}
}
}

View File

@@ -20,8 +20,6 @@ import (
"strings"
"sync"
"unsafe"
"github.com/ollama/ollama/format"
)
type handles struct {
@@ -29,12 +27,8 @@ type handles struct {
cudart *C.cudart_handle_t
}
const (
cudaMinimumMemory = 457 * format.MebiByte
rocmMinimumMemory = 457 * format.MebiByte
)
var gpuMutex sync.Mutex
var gpuHandles *handles = nil
// With our current CUDA compile flags, older than 5.0 will not work properly
var CudaComputeMin = [2]C.int{5, 0}
@@ -84,11 +78,11 @@ var CudartWindowsGlobs = []string{
var CudaTegra string = os.Getenv("JETSON_JETPACK")
// Note: gpuMutex must already be held
func initGPUHandles() *handles {
func initGPUHandles() {
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
gpuHandles := &handles{nil, nil}
gpuHandles = &handles{nil, nil}
var nvmlMgmtName string
var nvmlMgmtPatterns []string
var cudartMgmtName string
@@ -115,7 +109,7 @@ func initGPUHandles() *handles {
}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
default:
return gpuHandles
return
}
slog.Info("Detecting GPU type")
@@ -125,7 +119,7 @@ func initGPUHandles() *handles {
if cudart != nil {
slog.Info("Nvidia GPU detected via cudart")
gpuHandles.cudart = cudart
return gpuHandles
return
}
}
@@ -136,10 +130,10 @@ func initGPUHandles() *handles {
if nvml != nil {
slog.Info("Nvidia GPU detected via nvidia-ml")
gpuHandles.nvml = nvml
return gpuHandles
return
}
}
return gpuHandles
}
func GetGPUInfo() GpuInfo {
@@ -147,16 +141,9 @@ func GetGPUInfo() GpuInfo {
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
gpuMutex.Lock()
defer gpuMutex.Unlock()
gpuHandles := initGPUHandles()
defer func() {
if gpuHandles.nvml != nil {
C.nvml_release(*gpuHandles.nvml)
}
if gpuHandles.cudart != nil {
C.cudart_release(*gpuHandles.cudart)
}
}()
if gpuHandles == nil {
initGPUHandles()
}
// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
cpuVariant := GetCPUVariant()
@@ -181,7 +168,6 @@ func GetGPUInfo() GpuInfo {
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
resp.MinimumMemory = cudaMinimumMemory
} else {
slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
@@ -201,7 +187,6 @@ func GetGPUInfo() GpuInfo {
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
resp.MinimumMemory = cudaMinimumMemory
} else {
slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
@@ -209,7 +194,6 @@ func GetGPUInfo() GpuInfo {
} else {
AMDGetGPUInfo(&resp)
if resp.Library != "" {
resp.MinimumMemory = rocmMinimumMemory
return resp
}
}
@@ -243,7 +227,7 @@ func getCPUMem() (memInfo, error) {
return ret, nil
}
func CheckVRAM() (uint64, error) {
func CheckVRAM() (int64, error) {
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
if userLimit != "" {
avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -251,11 +235,24 @@ func CheckVRAM() (uint64, error) {
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
}
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
return uint64(avail), nil
return avail, nil
}
gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
return gpuInfo.FreeMemory, nil
// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
overhead := gpuInfo.FreeMemory / 10
gpus := uint64(gpuInfo.DeviceCount)
if overhead < gpus*1024*1024*1024 {
overhead = gpus * 1024 * 1024 * 1024
}
// Assigning full reported free memory for Tegras due to OS controlled caching.
if CudaTegra != "" {
// Setting overhead for non-Tegra devices
overhead = 0
}
avail := int64(gpuInfo.FreeMemory - overhead)
slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
return avail, nil
}
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation

View File

@@ -17,7 +17,7 @@ import (
)
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (uint64, error) {
func CheckVRAM() (int64, error) {
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
if userLimit != "" {
avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -25,15 +25,15 @@ func CheckVRAM() (uint64, error) {
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
}
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
return uint64(avail), nil
return avail, nil
}
if runtime.GOARCH == "amd64" {
// gpu not supported, this may not be metal
return 0, nil
}
return uint64(C.getRecommendedMaxVRAM()), nil
recommendedMaxVRAM := int64(C.getRecommendedMaxVRAM())
return recommendedMaxVRAM, nil
}
func GetGPUInfo() GpuInfo {
@@ -53,7 +53,7 @@ func GetGPUInfo() GpuInfo {
func getCPUMem() (memInfo, error) {
return memInfo{
TotalMemory: uint64(C.getPhysicalMemory()),
TotalMemory: 0,
FreeMemory: 0,
DeviceCount: 0,
}, nil

View File

@@ -62,10 +62,6 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
resp->err = strdup("your nvidia driver is too old or missing, please upgrade to run ollama");
return;
}
snprintf(buf, buflen, "cudart init failure: %d", ret);
resp->err = strdup(buf);
return;
@@ -191,10 +187,4 @@ void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *r
}
}
void cudart_release(cudart_handle_t h) {
LOG(h.verbose, "releasing cudart library\n");
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
#endif // __APPLE__

View File

@@ -7,7 +7,6 @@
typedef enum cudartReturn_enum {
CUDART_SUCCESS = 0,
CUDART_UNSUPPORTED = 1,
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
// Other values omitted for now...
} cudartReturn_t;
@@ -55,7 +54,6 @@ typedef struct cudart_compute_capability {
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);
void cudart_release(cudart_handle_t ch);
#endif // __GPU_INFO_CUDART_H__
#endif // __APPLE__

View File

@@ -1,4 +1,3 @@
#import <Metal/Metal.h>
#include <stdint.h>
uint64_t getRecommendedMaxVRAM();
uint64_t getPhysicalMemory();

View File

@@ -1,13 +1,11 @@
// go:build darwin
//go:build darwin
#include "gpu_info_darwin.h"
uint64_t getRecommendedMaxVRAM() {
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
uint64_t result = device.recommendedMaxWorkingSetSize;
CFRelease(device);
return result;
uint64_t getRecommendedMaxVRAM()
{
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
uint64_t result = device.recommendedMaxWorkingSetSize;
CFRelease(device);
return result;
}
uint64_t getPhysicalMemory() {
return [[NSProcessInfo processInfo] physicalMemory];
}

View File

@@ -211,11 +211,4 @@ void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
}
}
}
void nvml_release(nvml_handle_t h) {
LOG(h.verbose, "releasing nvml library\n");
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
#endif // __APPLE__

View File

@@ -51,7 +51,6 @@ typedef struct nvml_compute_capability {
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
void nvml_release(nvml_handle_t ch);
#endif // __GPU_INFO_NVML_H__
#endif // __APPLE__

View File

@@ -14,9 +14,6 @@ type GpuInfo struct {
// Optional variant to select (e.g. versions, cpu feature flags)
Variant string `json:"variant,omitempty"`
// MinimumMemory represents the minimum memory required to use the GPU
MinimumMemory uint64 `json:"-"`
// TODO add other useful attributes about the card here for discovery information
}

View File

@@ -24,5 +24,5 @@ func TestOrcaMiniBlueSky(t *testing.T) {
"seed": 123,
},
}
GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"rayleigh", "scattering"})
GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"rayleigh"})
}

View File

@@ -1,29 +0,0 @@
//go:build integration
package integration
import (
"context"
"net/http"
"testing"
"time"
"github.com/ollama/ollama/api"
)
func TestContextExhaustion(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
defer cancel()
// Set up the test data
req := api.GenerateRequest{
Model: "llama2",
Prompt: "Write me a story with a ton of emojis?",
Stream: &stream,
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
"num_ctx": 128,
},
}
GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"once", "upon", "lived"})
}

View File

@@ -15,6 +15,10 @@ import (
// TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server
// package to avoid circular dependencies
// WARNING - these tests will fail on mac if you don't manually copy ggml-metal.metal to this dir (./server)
//
// TODO - Fix this ^^
var (
stream = false
req = [2]api.GenerateRequest{

View File

@@ -126,7 +126,7 @@ func StartServer(ctx context.Context, ollamaHost string) error {
}
func PullIfMissing(ctx context.Context, client *http.Client, scheme, testEndpoint, modelName string) error {
slog.Info("checking status of model", "model", modelName)
slog.Debug("checking status of model", "model", modelName)
showReq := &api.ShowRequest{Name: modelName}
requestJSON, err := json.Marshal(showReq)
if err != nil {
@@ -174,51 +174,36 @@ func PullIfMissing(ctx context.Context, client *http.Client, scheme, testEndpoin
return nil
}
var serverProcMutex sync.Mutex
func GenerateTestHelper(ctx context.Context, t *testing.T, client *http.Client, genReq api.GenerateRequest, anyResp []string) {
// TODO maybe stuff in an init routine?
lifecycle.InitLogging()
requestJSON, err := json.Marshal(genReq)
if err != nil {
t.Fatalf("Error serializing request: %v", err)
}
defer func() {
if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
defer serverProcMutex.Unlock()
if t.Failed() {
fp, err := os.Open(lifecycle.ServerLogFile)
if err != nil {
slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err)
return
}
data, err := io.ReadAll(fp)
if err != nil {
slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err)
return
}
slog.Warn("SERVER LOG FOLLOWS")
os.Stderr.Write(data)
slog.Warn("END OF SERVER")
if t.Failed() && os.Getenv("OLLAMA_TEST_EXISTING") == "" {
// TODO
fp, err := os.Open(lifecycle.ServerLogFile)
if err != nil {
slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err)
return
}
err = os.Remove(lifecycle.ServerLogFile)
if err != nil && !os.IsNotExist(err) {
slog.Warn("failed to cleanup", "logfile", lifecycle.ServerLogFile, "error", err)
data, err := io.ReadAll(fp)
if err != nil {
slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err)
return
}
slog.Warn("SERVER LOG FOLLOWS")
os.Stderr.Write(data)
slog.Warn("END OF SERVER")
}
err = os.Remove(lifecycle.ServerLogFile)
if err != nil && !os.IsNotExist(err) {
slog.Warn("failed to cleanup", "logfile", lifecycle.ServerLogFile, "error", err)
}
}()
scheme, testEndpoint := GetTestEndpoint()
if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
serverProcMutex.Lock()
fp, err := os.CreateTemp("", "ollama-server-*.log")
if err != nil {
t.Fatalf("failed to generate log file: %s", err)
}
lifecycle.ServerLogFile = fp.Name()
fp.Close()
assert.NoError(t, StartServer(ctx, testEndpoint))
}

142
llm/dyn_ext_server.c Normal file
View File

@@ -0,0 +1,142 @@
#include "dyn_ext_server.h"
#include <stdio.h>
#include <string.h>
#ifdef __linux__
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#elif _WIN32
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
#define LOAD_ERR() ({\
LPSTR messageBuffer = NULL; \
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
char *resp = strdup(messageBuffer); \
LocalFree(messageBuffer); \
resp; \
})
#else
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#endif
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err) {
int i = 0;
struct lookup {
char *s;
void **p;
} l[] = {
{"llama_server_init", (void *)&s->llama_server_init},
{"llama_server_start", (void *)&s->llama_server_start},
{"llama_server_stop", (void *)&s->llama_server_stop},
{"llama_server_completion", (void *)&s->llama_server_completion},
{"llama_server_completion_next_result",
(void *)&s->llama_server_completion_next_result},
{"llama_server_completion_cancel",
(void *)&s->llama_server_completion_cancel},
{"llama_server_release_task_result",
(void *)&s->llama_server_release_task_result},
{"llama_server_tokenize", (void *)&s->llama_server_tokenize},
{"llama_server_detokenize", (void *)&s->llama_server_detokenize},
{"llama_server_embedding", (void *)&s->llama_server_embedding},
{"llama_server_release_json_resp",
(void *)&s->llama_server_release_json_resp},
{"", NULL},
};
printf("loading library %s\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
if (!s->handle) {
err->id = -1;
char *msg = LOAD_ERR();
snprintf(err->msg, err->msg_len,
"Unable to load dynamic server library: %s", msg);
free(msg);
return;
}
for (i = 0; l[i].p != NULL; i++) {
*l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(s->handle);
err->id = -1;
char *msg = LOAD_ERR();
snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
l[i].s, msg);
free(msg);
return;
}
}
}
inline void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err) {
s.llama_server_init(sparams, err);
}
inline void dyn_llama_server_start(struct dynamic_llama_server s) {
s.llama_server_start();
}
inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
s.llama_server_stop();
}
inline void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp) {
s.llama_server_completion(json_req, resp);
}
inline void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result) {
s.llama_server_completion_next_result(task_id, result);
}
inline void dyn_llama_server_completion_cancel(
struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
s.llama_server_completion_cancel(task_id, err);
}
inline void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result) {
s.llama_server_release_task_result(result);
}
inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_tokenize(json_req, json_resp, err);
}
inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_detokenize(json_req, json_resp, err);
}
inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_embedding(json_req, json_resp, err);
}
inline void dyn_llama_server_release_json_resp(
struct dynamic_llama_server s, char **json_resp) {
s.llama_server_release_json_resp(json_resp);
}

388
llm/dyn_ext_server.go Normal file
View File

@@ -0,0 +1,388 @@
package llm
/*
#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
#cgo linux windows LDFLAGS: -lpthread
#include <stdlib.h>
#include "dyn_ext_server.h"
*/
import "C"
import (
"bytes"
"context"
"encoding/json"
"fmt"
"log/slog"
"os"
"path/filepath"
"strings"
"sync"
"time"
"unsafe"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/gpu"
)
type dynExtServer struct {
s C.struct_dynamic_llama_server
options api.Options
}
// Note: current implementation does not support concurrent instantiations
var mutex sync.Mutex
func newExtServerResp(len C.size_t) C.ext_server_resp_t {
var resp C.ext_server_resp_t
resp.msg_len = len
bytes := make([]byte, len)
resp.msg = (*C.char)(C.CBytes(bytes))
return resp
}
func freeExtServerResp(resp C.ext_server_resp_t) {
if resp.msg_len == 0 {
return
}
C.free(unsafe.Pointer(resp.msg))
}
func extServerResponseToErr(resp C.ext_server_resp_t) error {
return fmt.Errorf(C.GoString(resp.msg))
}
func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if !mutex.TryLock() {
slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock()
}
gpu.UpdatePath(filepath.Dir(library))
libPath := C.CString(library)
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(512)
defer freeExtServerResp(resp)
var srv C.struct_dynamic_llama_server
C.dyn_init(libPath, &srv, &resp)
if resp.id < 0 {
mutex.Unlock()
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
}
llm := dynExtServer{
s: srv,
options: opts,
}
slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
var sparams C.ext_server_params_t
sparams.model = C.CString(model)
defer C.free(unsafe.Pointer(sparams.model))
sparams.embedding = true
sparams.n_ctx = C.uint(opts.NumCtx)
sparams.n_batch = C.uint(opts.NumBatch)
sparams.n_gpu_layers = C.int(opts.NumGPU)
sparams.main_gpu = C.int(opts.MainGPU)
sparams.n_parallel = 1 // TODO - wire up concurrency
// Always use the value encoded in the model
sparams.rope_freq_base = 0.0
sparams.rope_freq_scale = 0.0
sparams.memory_f16 = C.bool(opts.F16KV)
sparams.use_mlock = C.bool(opts.UseMLock)
sparams.use_mmap = C.bool(opts.UseMMap)
if opts.UseNUMA {
sparams.numa = C.int(1)
} else {
sparams.numa = C.int(0)
}
sparams.lora_adapters = nil
for i := 0; i < len(adapters); i++ {
la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
defer C.free(unsafe.Pointer(la))
la.adapter = C.CString(adapters[i])
defer C.free(unsafe.Pointer(la.adapter))
la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
la.next = nil
if i == 0 {
sparams.lora_adapters = la
} else {
tmp := sparams.lora_adapters
for ; tmp.next != nil; tmp = tmp.next {
}
tmp.next = la
}
}
if len(projectors) > 0 {
// TODO: applying multiple projectors is not supported by the llama.cpp server yet
sparams.mmproj = C.CString(projectors[0])
defer C.free(unsafe.Pointer(sparams.mmproj))
} else {
sparams.mmproj = nil
}
sparams.n_threads = C.uint(opts.NumThread)
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
sparams.verbose_logging = C.bool(true)
} else {
sparams.verbose_logging = C.bool(false)
}
slog.Info("Initializing llama server")
slog.Debug(fmt.Sprintf("server params: %+v", sparams))
initResp := newExtServerResp(512)
defer freeExtServerResp(initResp)
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
if initResp.id < 0 {
mutex.Unlock()
err := extServerResponseToErr(initResp)
slog.Debug(fmt.Sprintf("failure during initialization: %s", err))
return nil, err
}
slog.Info("Starting llama main loop")
C.dyn_llama_server_start(llm.s)
return &llm, nil
}
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
if len(predict.Images) > 0 {
slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
}
request := map[string]any{
"prompt": predict.Prompt,
"stream": true,
"n_predict": predict.Options.NumPredict,
"n_keep": predict.Options.NumKeep,
"temperature": predict.Options.Temperature,
"top_k": predict.Options.TopK,
"top_p": predict.Options.TopP,
"tfs_z": predict.Options.TFSZ,
"typical_p": predict.Options.TypicalP,
"repeat_last_n": predict.Options.RepeatLastN,
"repeat_penalty": predict.Options.RepeatPenalty,
"presence_penalty": predict.Options.PresencePenalty,
"frequency_penalty": predict.Options.FrequencyPenalty,
"mirostat": predict.Options.Mirostat,
"mirostat_tau": predict.Options.MirostatTau,
"mirostat_eta": predict.Options.MirostatEta,
"penalize_nl": predict.Options.PenalizeNewline,
"seed": predict.Options.Seed,
"stop": predict.Options.Stop,
"image_data": predict.Images,
"cache_prompt": true,
}
if predict.Format == "json" {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
}
retryDelay := 100 * time.Microsecond
for retries := 0; retries < maxRetries; retries++ {
if retries > 0 {
time.Sleep(retryDelay) // wait before retrying
retryDelay *= 2 // exponential backoff
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
if err := enc.Encode(request); err != nil {
return fmt.Errorf("failed to marshal data: %w", err)
}
req := C.CString(buffer.String())
defer C.free(unsafe.Pointer(req))
C.dyn_llama_server_completion(llm.s, req, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
}
retryNeeded := false
// keep track of the last token generated, this is used to abort if the model starts looping
var lastToken string
var tokenRepeat int
out:
for {
select {
case <-ctx.Done():
return cancelCompletion(llm, resp)
default:
var result C.ext_server_task_result_t
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
json_resp := C.GoString(result.json_resp)
C.dyn_llama_server_release_task_result(llm.s, &result)
var p prediction
if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
} else {
return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
}
}
if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
retryNeeded = true
// task will already be canceled
break out
}
switch {
case strings.TrimSpace(p.Content) == lastToken:
tokenRepeat++
default:
lastToken = strings.TrimSpace(p.Content)
tokenRepeat = 0
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
if tokenRepeat > 30 {
slog.Debug("prediction aborted, token repeat limit reached")
return cancelCompletion(llm, resp)
}
if p.Content != "" {
fn(PredictResult{
Content: p.Content,
})
}
if p.Stop || bool(result.stop) {
fn(PredictResult{
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
}
}
}
if !retryNeeded {
return nil // success
}
}
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
}
func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
}
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err)
}
req := C.CString(string(data))
defer C.free(unsafe.Pointer(req))
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
}
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var encoded TokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
return nil, fmt.Errorf("unmarshal encode response: %w", err2)
}
return encoded.Tokens, err
}
func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
if len(tokens) == 0 {
return "", nil
}
data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
if err != nil {
return "", fmt.Errorf("marshaling decode data: %w", err)
}
req := C.CString(string(data))
defer C.free(unsafe.Pointer(req))
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return "", extServerResponseToErr(resp)
}
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var decoded DetokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
return "", fmt.Errorf("unmarshal encode response: %w", err2)
}
return decoded.Content, err
}
func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
data, err := json.Marshal(TokenizeRequest{Content: input})
if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err)
}
req := C.CString(string(data))
defer C.free(unsafe.Pointer(req))
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
}
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var embedding EmbeddingResponse
if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
}
return embedding.Embedding, nil
}
func (llm *dynExtServer) Close() {
C.dyn_llama_server_stop(llm.s)
mutex.Unlock()
}

74
llm/dyn_ext_server.h Normal file
View File

@@ -0,0 +1,74 @@
#include <stdlib.h>
#include "ext_server.h"
#ifdef __cplusplus
extern "C" {
#endif
struct dynamic_llama_server {
void *handle;
void (*llama_server_init)(ext_server_params_t *sparams,
ext_server_resp_t *err);
void (*llama_server_start)();
void (*llama_server_stop)();
void (*llama_server_completion)(const char *json_req,
ext_server_resp_t *resp);
void (*llama_server_completion_next_result)(const int task_id,
ext_server_task_result_t *result);
void (*llama_server_completion_cancel)(const int task_id,
ext_server_resp_t *err);
void (*llama_server_release_task_result)(ext_server_task_result_t *result);
void (*llama_server_tokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_detokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_embedding)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_release_json_resp)(char **json_resp);
};
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection
void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err);
void dyn_llama_server_start(struct dynamic_llama_server s);
void dyn_llama_server_stop(struct dynamic_llama_server s);
void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp);
void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result);
void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
const int task_id,
ext_server_resp_t *err);
void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result);
void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
char **json_resp);
#ifdef __cplusplus
}
#endif

View File

@@ -1,14 +1,21 @@
set(TARGET ollama_llama_server)
set(TARGET ext_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
add_library(${TARGET} SHARED ext_server.cpp ../llama.cpp/llama.cpp)
else()
add_library(${TARGET} STATIC ext_server.cpp ../llama.cpp/llama.cpp)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
target_link_libraries(${TARGET} PRIVATE ggml llava common )
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
install(TARGETS ext_server LIBRARY)
if (CUDAToolkit_FOUND)
target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
if (WIN32)
target_link_libraries(${TARGET} PRIVATE nvml)
endif()
endif()

18
llm/ext_server/README.md vendored Normal file
View File

@@ -0,0 +1,18 @@
# Extern C Server
This directory contains a thin facade we layer on top of the Llama.cpp server to
expose `extern C` interfaces to access the functionality through direct API
calls in-process. The llama.cpp code uses compile time macros to configure GPU
type along with other settings. During the `go generate ./...` execution, the
build will generate one or more copies of the llama.cpp `extern C` server based
on what GPU libraries are detected to support multiple GPU types as well as CPU
only support. The Ollama go build then embeds these different servers to support
different GPUs and settings at runtime.
If you are making changes to the code in this directory, make sure to disable
caching during your go build to ensure you pick up your changes. A typical
iteration cycle from the top of the source tree looks like:
```
go generate ./... && go build -a .
```

377
llm/ext_server/ext_server.cpp vendored Normal file
View File

@@ -0,0 +1,377 @@
#include "ext_server.h"
#include <atomic>
// Necessary evil since the server types are not defined in a header
#include "server.cpp"
// Low level API access to verify GPU access
#if defined(GGML_USE_CUBLAS)
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define cudaGetDevice hipGetDevice
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#else
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif // defined(GGML_USE_HIPBLAS)
#endif // GGML_USE_CUBLAS
// Expose the llama server as a callable extern "C" API
llama_server_context *llama = NULL;
std::thread ext_server_thread;
bool shutting_down = false;
std::atomic_int recv_counter;
// RAII wrapper for tracking in-flight recv calls
class atomicRecv {
public:
atomicRecv(std::atomic<int> &atomic) : atomic(atomic) {
++this->atomic;
}
~atomicRecv() {
--this->atomic;
}
private:
std::atomic<int> &atomic;
};
void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
recv_counter = 0;
assert(err != NULL && sparams != NULL);
log_set_target(stderr);
if (!sparams->verbose_logging) {
server_verbose = true;
log_disable();
}
LOG_TEE("system info: %s\n", llama_print_system_info());
err->id = 0;
err->msg[0] = '\0';
try {
llama = new llama_server_context;
gpt_params params;
params.n_ctx = sparams->n_ctx;
params.n_batch = sparams->n_batch;
if (sparams->n_threads > 0) {
params.n_threads = sparams->n_threads;
}
params.n_parallel = sparams->n_parallel;
params.rope_freq_base = sparams->rope_freq_base;
params.rope_freq_scale = sparams->rope_freq_scale;
if (sparams->memory_f16) {
params.cache_type_k = "f16";
params.cache_type_v = "f16";
} else {
params.cache_type_k = "f32";
params.cache_type_v = "f32";
}
params.n_gpu_layers = sparams->n_gpu_layers;
params.main_gpu = sparams->main_gpu;
params.use_mlock = sparams->use_mlock;
params.use_mmap = sparams->use_mmap;
params.numa = (ggml_numa_strategy)sparams->numa;
params.embedding = sparams->embedding;
if (sparams->model != NULL) {
params.model = sparams->model;
}
if (sparams->lora_adapters != NULL) {
for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
la = la->next) {
params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
}
params.use_mmap = false;
}
if (sparams->mmproj != NULL) {
params.mmproj = std::string(sparams->mmproj);
}
#if defined(GGML_USE_CUBLAS)
// Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
LOG_TEE("Performing pre-initialization of GPU\n");
int id;
cudaError_t cudaErr = cudaGetDevice(&id);
if (cudaErr != cudaSuccess) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
return;
}
#endif
llama_backend_init();
llama_numa_init(params.numa);
if (!llama->load_model(params)) {
// an error occurred that was not thrown
err->id = -1;
snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
return;
}
llama->initialize();
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len,
"Unknown exception initializing llama server");
}
}
void llama_server_start() {
assert(llama != NULL);
// TODO mutex to protect thread creation
ext_server_thread = std::thread([&]() {
try {
LOG_TEE("llama server main loop starting\n");
ggml_time_init();
llama->queue_tasks.on_new_task(std::bind(
&llama_server_context::process_single_task, llama, std::placeholders::_1));
llama->queue_tasks.on_finish_multitask(std::bind(
&llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
llama->queue_tasks.on_run_slots(std::bind(
&llama_server_context::update_slots, llama));
llama->queue_results.on_multitask_update(std::bind(
&llama_server_queue::update_multitask,
&llama->queue_tasks,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3
));
llama->queue_tasks.start_loop();
} catch (std::exception &e) {
LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
} catch (...) {
LOG_TEE("caught unknown exception in llama server main loop\n");
}
LOG_TEE("\nllama server shutting down\n");
llama_backend_free();
});
}
void llama_server_stop() {
assert(llama != NULL);
// Shutdown any in-flight requests and block incoming requests.
LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
shutting_down = true;
while (recv_counter.load() > 0) {
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
// This may take a while for any pending tasks to drain
// TODO - consider a timeout to cancel tasks if it's taking too long
llama->queue_tasks.terminate();
ext_server_thread.join();
delete llama;
llama = NULL;
LOG_TEE("llama server shutdown complete\n");
shutting_down = false;
}
void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
assert(llama != NULL && json_req != NULL && resp != NULL);
resp->id = -1;
resp->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
json data = json::parse(json_req);
resp->id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(resp->id);
llama->request_completion(resp->id, data, false, false, -1);
} catch (std::exception &e) {
snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
} catch (...) {
snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
}
}
void llama_server_completion_next_result(const int task_id,
ext_server_task_result_t *resp) {
assert(llama != NULL && resp != NULL);
resp->id = -1;
resp->stop = false;
resp->error = false;
resp->json_resp = NULL;
std::string result_json;
try {
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
result_json =
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
resp->id = result.id;
resp->stop = result.stop;
resp->error = result.error;
if (result.error) {
LOG_TEE("next result cancel on error\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting tak ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (result.stop) {
LOG_TEE("next result cancel on stop\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting task ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (shutting_down) {
LOG_TEE("aborting completion due to shutdown %d\n", task_id);
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
resp->stop = true;
}
} catch (std::exception &e) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
LOG_TEE("llama server completion exception %s\n", e.what());
} catch (...) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"Unknown exception during completion\"}";
LOG_TEE("llama server completion unknown exception\n");
}
const std::string::size_type size = result_json.size() + 1;
resp->json_resp = new char[size];
snprintf(resp->json_resp, size, "%s", result_json.c_str());
}
void llama_server_release_task_result(ext_server_task_result_t *result) {
if (result == NULL || result->json_resp == NULL) {
return;
}
delete[] result->json_resp;
}
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
assert(llama != NULL && err != NULL);
err->id = 0;
err->msg[0] = '\0';
try {
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len,
"Unknown exception completion cancel in llama server");
}
}
void llama_server_tokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::vector<llama_token> tokens;
if (body.count("content") != 0) {
tokens = llama->tokenize(body["content"], false);
}
const json data = format_tokenizer_response(tokens);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
}
}
void llama_server_release_json_resp(char **json_resp) {
if (json_resp == NULL || *json_resp == NULL) {
return;
}
delete[] *json_resp;
}
void llama_server_detokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::string content;
if (body.count("tokens") != 0) {
const std::vector<llama_token> tokens = body["tokens"];
content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
}
const json data = format_detokenized_response(content);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
}
}
void llama_server_embedding(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
json prompt;
if (body.count("content") != 0) {
prompt = body["content"];
} else {
prompt = "";
}
const int task_id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(task_id);
llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
std::string result_json = result.result_json.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
}
}

95
llm/ext_server/ext_server.h vendored Normal file
View File

@@ -0,0 +1,95 @@
#if defined(LLAMA_SERVER_LIBRARY)
#ifndef LLAMA_SERVER_H
#define LLAMA_SERVER_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
int __main(int argc, char **argv);
// This exposes extern C entrypoints into the llama_server
// To enable the server compile with LLAMA_SERVER_LIBRARY
#ifdef __cplusplus
extern "C" {
#endif
typedef struct ext_server_resp {
int id; // < 0 on error
size_t msg_len; // caller must allocate msg and set msg_len
char *msg;
} ext_server_resp_t;
// Allocated and freed by caller
typedef struct ext_server_lora_adapter {
char *adapter;
float scale;
struct ext_server_lora_adapter *next;
} ext_server_lora_adapter_t;
// Allocated and freed by caller
typedef struct ext_server_params {
char *model;
uint32_t n_ctx; // token context window, 0 = from model
uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation
int32_t n_parallel; // number of parallel sequences to decodewra
float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
bool memory_f16; // use f16 instead of f32 for memory kv
int32_t n_gpu_layers; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu; // the GPU that is used for scratch and small tensors
bool use_mlock; // force system to keep model in RAM
bool use_mmap; // use mmap if possible
int numa; // attempt optimizations that help on some NUMA systems
bool embedding; // get only sentence embedding
ext_server_lora_adapter_t *lora_adapters;
char *mmproj;
bool verbose_logging; // Enable verbose logging of the server
} ext_server_params_t;
typedef struct ext_server_task_result {
int id;
bool stop;
bool error;
char *json_resp; // null terminated, memory managed by ext_server
} ext_server_task_result_t;
// Initialize the server once per process
// err->id = 0 for success and err->msg[0] = NULL
// err->id != 0 for failure, and err->msg contains error message
void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
// Run the main loop, called once per init
void llama_server_start();
// Stop the main loop and free up resources allocated in init and start. Init
// must be called again to reuse
void llama_server_stop();
// json_req null terminated string, memory managed by caller
// resp->id >= 0 on success (task ID)
// resp->id < 0 on error, and resp->msg contains error message
void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
// Caller must call llama_server_release_task_result to free resp->json_resp
void llama_server_completion_next_result(const int task_id,
ext_server_task_result_t *result);
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
void llama_server_release_task_result(ext_server_task_result_t *result);
// Caller must call llama_server_releaes_json_resp to free json_resp if err.id <
// 0
void llama_server_tokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_detokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_embedding(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_release_json_resp(char **json_resp);
#ifdef __cplusplus
}
#endif
#endif
#endif // LLAMA_SERVER_LIBRARY

View File

@@ -2770,7 +2770,7 @@ inline void signal_handler(int signal) {
shutdown_handler(signal);
}
int main(int argc, char **argv)
int _main(int argc, char **argv)
{
#if SERVER_VERBOSE != 1
log_disable();

View File

@@ -14,7 +14,7 @@ init_vars() {
LLAMACPP_DIR=../llama.cpp
CMAKE_DEFS=""
CMAKE_TARGETS="--target ollama_llama_server"
CMAKE_TARGETS="--target ext_server"
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
else
@@ -81,24 +81,27 @@ apply_patches() {
build() {
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
mkdir -p ${BUILD_DIR}/lib/
ls ${BUILD_DIR}
g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
${GCC_ARCH} \
${WHOLE_ARCHIVE} ${BUILD_DIR}/ext_server/libext_server.a ${NO_WHOLE_ARCHIVE} \
${BUILD_DIR}/common/libcommon.a \
${BUILD_DIR}/libllama.a \
-Wl,-rpath,\$ORIGIN \
-lpthread -ldl -lm \
${EXTRA_LIBS}
}
compress() {
compress_libs() {
echo "Compressing payloads to reduce overall binary size..."
pids=""
rm -rf ${BUILD_DIR}/bin/*.gz
for f in ${BUILD_DIR}/bin/* ; do
gzip -n --best -f ${f} &
rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz
for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
gzip -n --best -f ${lib} &
pids+=" $!"
done
# check for lib directory
if [ -d ${BUILD_DIR}/lib ]; then
for f in ${BUILD_DIR}/lib/* ; do
gzip -n --best -f ${f} &
pids+=" $!"
done
fi
echo
echo
for pid in ${pids}; do
wait $pid
done

View File

@@ -18,31 +18,21 @@ sign() {
fi
}
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"
case "${GOARCH}" in
"amd64")
COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib
compress_libs
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
@@ -50,11 +40,11 @@ case "${GOARCH}" in
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib
compress_libs
#
# ~2013 CPU Dynamic library
@@ -62,30 +52,20 @@ case "${GOARCH}" in
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib
compress_libs
;;
"arm64")
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
init_vars
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/metal"
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
compress
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
compress_libs
;;
*)
echo "GOARCH must be set"
@@ -95,4 +75,3 @@ case "${GOARCH}" in
esac
cleanup
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"

View File

@@ -57,31 +57,16 @@ init_vars
git_module_setup
apply_patches
init_vars
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}_static"
echo "Building static library"
build
fi
# Users building from source can tune the exact flags we pass to cmake for configuring
# llama.cpp, and we'll build only 1 CPU variant in that case as the default.
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
init_vars
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
echo "Building custom CPU"
build
compress
compress_libs
else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
@@ -98,12 +83,11 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
echo "Building LCD CPU"
build
compress
compress_libs
fi
if [ "${ARCH}" == "x86_64" ]; then
@@ -117,10 +101,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
compress
compress_libs
fi
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
@@ -130,10 +114,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
build
compress
compress_libs
fi
fi
fi
@@ -172,8 +156,8 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
# Disabling has minimal performance effect while maintaining compatibility.
ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
fi
CMAKE_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
build
@@ -181,20 +165,20 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
#
# TODO - in the future we may shift to packaging these separately and conditionally
# downloading them in the install script.
DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
for lib in libcudart.so libcublas.so libcublasLt.so ; do
DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/"
cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/"
else
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/"
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
fi
done
compress
compress_libs
fi
@@ -217,24 +201,23 @@ if [ -d "${ROCM_PATH}" ]; then
fi
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
build
# Record the ROCM dependencies
rm -f "${BUILD_DIR}/bin/deps.txt"
touch "${BUILD_DIR}/bin/deps.txt"
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
rm -f "${BUILD_DIR}/lib/deps.txt"
touch "${BUILD_DIR}/lib/deps.txt"
for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
done
# bomb out if for some reason we didn't get a few deps
if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
cat "${BUILD_DIR}/bin/deps.txt"
if [ $(cat "${BUILD_DIR}/lib/deps.txt" | wc -l ) -lt 8 ] ; then
cat "${BUILD_DIR}/lib/deps.txt"
echo "ERROR: deps file short"
exit 1
fi
compress
compress_libs
fi
cleanup
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"

View File

@@ -33,7 +33,7 @@ function init_vars {
"-DBUILD_SHARED_LIBS=on",
"-DLLAMA_NATIVE=off"
)
$script:cmakeTargets = @("ollama_llama_server")
$script:cmakeTargets = @("ext_server")
$script:ARCH = "amd64" # arm not yet supported.
if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
@@ -97,14 +97,16 @@ function apply_patches {
}
# Checkout each file
Set-Location -Path ${script:llamacppDir}
foreach ($file in $filePaths) {
git -C "${script:llamacppDir}" checkout $file
git checkout $file
}
}
# Apply each patch
foreach ($patch in $patches) {
git -C "${script:llamacppDir}" apply $patch.FullName
Set-Location -Path ${script:llamacppDir}
git apply $patch.FullName
}
}
@@ -113,41 +115,41 @@ function build {
& cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
# Rearrange output to be consistent between different generators
if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
remove-item "${script:buildDir}/bin/${script:config}"
}
function install {
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
# Display the dll dependencies in the build log
if ($script:DUMPBIN -ne $null) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
}
}
function sign {
if ("${env:KEY_CONTAINER}") {
write-host "Signing ${script:buildDir}/bin/*.exe ${script:buildDir}/bin/*.dll"
foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
write-host "Signing ${script:buildDir}/lib/*.dll"
foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){
& "${script:SignTool}" sign /v /debug /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function compress {
function compress_libs {
if ($script:GZIP -eq $null) {
write-host "gzip not installed, not compressing files"
return
}
write-host "Compressing binaries..."
$binaries = dir "${script:buildDir}/bin/*.exe"
foreach ($file in $binaries) {
& "$script:GZIP" --best -f $file
}
write-host "Compressing dlls..."
$dlls = dir "${script:buildDir}/bin/*.dll"
foreach ($file in $dlls) {
$libs = dir "${script:buildDir}/lib/*.dll"
foreach ($file in $libs) {
& "$script:GZIP" --best -f $file
}
}
@@ -162,11 +164,14 @@ function cleanup {
}
# Checkout each file
Set-Location -Path ${script:llamacppDir}
foreach ($file in $filePaths) {
git -C "${script:llamacppDir}" checkout $file
git checkout $file
}
git -C "${script:llamacppDir}" checkout CMakeLists.txt
}
Set-Location "${script:llamacppDir}/"
git checkout CMakeLists.txt
}
init_vars
@@ -174,6 +179,7 @@ git_module_setup
apply_patches
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
@@ -181,54 +187,32 @@ $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
# GCC build for direct linking into the Go binary
init_vars
# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
# as we need this to be compiled by gcc for golang to be able to link with itx
write-host "Checking for MinGW..."
# error action ensures we exit on failure
get-command gcc
get-command mingw32-make
$script:cmakeTargets = @("llama", "ggml")
$script:cmakeDefs = @(
"-G", "MinGW Makefiles"
"-DCMAKE_C_COMPILER=gcc.exe",
"-DCMAKE_CXX_COMPILER=g++.exe",
"-DBUILD_SHARED_LIBS=off",
"-DLLAMA_NATIVE=off",
"-DLLAMA_AVX=off",
"-DLLAMA_AVX2=off",
"-DLLAMA_AVX512=off",
"-DLLAMA_F16C=off",
"-DLLAMA_FMA=off")
$script:buildDir="../build/windows/${script:ARCH}_static"
write-host "Building static library"
build
# remaining llama.cpp builds use MSVC
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu"
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
write-host "Building LCD CPU"
build
install
sign
compress
compress_libs
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
write-host "Building AVX CPU"
build
install
sign
compress
compress_libs
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
write-host "Building AVX2 CPU"
build
install
sign
compress
compress_libs
} else {
write-host "Skipping CPU generation step as requested"
}
@@ -241,11 +225,13 @@ if ($null -ne $script:CUDA_LIB_DIR) {
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
}
init_vars
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
write-host "Building CUDA"
build
install
sign
compress
compress_libs
}
if ($null -ne $env:HIP_PATH) {
@@ -255,13 +241,12 @@ if ($null -ne $env:HIP_PATH) {
}
init_vars
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
$script:cmakeDefs += @(
"-G", "Ninja",
"-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe",
"-DLLAMA_HIPBLAS=on",
"-DHIP_PLATFORM=amd",
"-DLLAMA_AVX=on",
"-DLLAMA_AVX2=off",
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
@@ -279,13 +264,13 @@ if ($null -ne $env:HIP_PATH) {
build
# Ninja doesn't prefix with config name
${script:config}=""
install
if ($null -ne $script:DUMPBIN) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
}
sign
compress
compress_libs
}
cleanup
write-host "`ngo generate completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\build\windows\${script:ARCH})"
write-host "`ngo generate completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\llama.cpp\build\windows\${script:ARCH})"

View File

@@ -1,3 +1,3 @@
package generate
//go:generate bash ./gen_darwin.sh
//go:generate sh ./gen_darwin.sh

View File

@@ -7,18 +7,16 @@ import (
"slices"
)
type containerGGLA struct {
type ContainerGGLA struct {
version uint32
}
func (c *containerGGLA) Name() string {
func (c *ContainerGGLA) Name() string {
return "ggla"
}
func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {
if err := binary.Read(rs, binary.LittleEndian, &c.version); err != nil {
return nil, err
}
func (c *ContainerGGLA) Decode(rs io.ReadSeeker) (model, error) {
binary.Read(rs, binary.LittleEndian, &c.version)
switch c.version {
case 1:
@@ -26,45 +24,37 @@ func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {
return nil, errors.New("invalid version")
}
model := newGGLA(c)
model := newModelGGLA(c)
err := model.decode(rs)
return model, err
}
type ggla struct {
*containerGGLA
type ModelGGLA struct {
*ContainerGGLA
kv KV
tensors []*Tensor
tensors []Tensor
}
func newGGLA(container *containerGGLA) *ggla {
return &ggla{
containerGGLA: container,
func newModelGGLA(container *ContainerGGLA) *ModelGGLA {
return &ModelGGLA{
ContainerGGLA: container,
kv: make(KV),
}
}
func (llm *ggla) KV() KV {
return llm.kv
}
func (llm *ggla) Tensors() Tensors {
return llm.tensors
}
func (llm *ggla) decode(rs io.ReadSeeker) error {
func (m *ModelGGLA) decode(rs io.ReadSeeker) error {
var r uint32
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
return err
}
llm.kv["r"] = r
m.kv["r"] = r
var alpha uint32
if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {
return err
}
llm.kv["alpha"] = alpha
m.kv["alpha"] = alpha
for {
var dims uint32
@@ -119,10 +109,54 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
t.Offset = uint64(offset)
if _, err := rs.Seek(int64(t.size()), io.SeekCurrent); err != nil {
if _, err := rs.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
return err
}
llm.tensors = append(llm.tensors, &t)
m.tensors = append(m.tensors, t)
}
}
func (m *ModelGGLA) KV() KV {
return m.kv
}
func (m *ModelGGLA) Tensor() []Tensor {
return m.tensors
}
func (*ModelGGLA) ModelFamily() string {
return "ggla"
}
func (*ModelGGLA) ModelType() string {
panic("not implemented")
}
func (*ModelGGLA) FileType() string {
panic("not implemented")
}
func (*ModelGGLA) NumLayers() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumGQA() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumEmbed() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumHead() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumHeadKv() uint32 {
panic("not implemented")
}
func (*ModelGGLA) NumCtx() uint32 {
panic("not implemented")
}

View File

@@ -3,14 +3,14 @@ package llm
import (
"encoding/binary"
"errors"
"fmt"
"io"
"strings"
)
type GGML struct {
container
model
Size int64
}
const (
@@ -90,178 +90,15 @@ func fileType(fileType uint32) string {
}
type model interface {
KV() KV
Tensors() Tensors
}
type KV map[string]any
func (kv KV) u64(key string) uint64 {
switch v := kv[key].(type) {
case uint64:
return v
case uint32:
return uint64(v)
case float64:
return uint64(v)
default:
return 0
}
}
func (kv KV) Architecture() string {
if s, ok := kv["general.architecture"].(string); ok {
return s
}
return "unknown"
}
func (kv KV) ParameterCount() uint64 {
return kv.u64("general.parameter_count")
}
func (kv KV) FileType() string {
if u64 := kv.u64("general.file_type"); u64 > 0 {
return fileType(uint32(u64))
}
return "unknown"
}
func (kv KV) BlockCount() uint64 {
return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
}
func (kv KV) HeadCount() uint64 {
return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
}
func (kv KV) HeadCountKV() uint64 {
if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
return headCountKV
}
return 1
}
func (kv KV) GQA() uint64 {
return kv.HeadCount() / kv.HeadCountKV()
}
func (kv KV) EmbeddingLength() uint64 {
return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
}
func (kv KV) ContextLength() uint64 {
return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
}
type Tensors []*Tensor
func (ts Tensors) Layers() map[string]Layer {
layers := make(map[string]Layer)
for _, t := range ts {
parts := strings.Split(t.Name, ".")
if parts[0] == "blk" {
parts = parts[1:]
}
if _, ok := layers[parts[0]]; !ok {
layers[parts[0]] = make(Layer)
}
layers[parts[0]][strings.Join(parts[1:], ".")] = t
}
return layers
}
type Layer map[string]*Tensor
func (l Layer) size() (size uint64) {
for _, t := range l {
size += t.size()
}
return size
}
type Tensor struct {
Name string `json:"name"`
Kind uint32 `json:"kind"`
Offset uint64 `json:"-"`
// Shape is the number of elements in each dimension
Shape []uint64 `json:"shape"`
io.WriterTo `json:"-"`
}
func (t Tensor) blockSize() uint64 {
switch {
case t.Kind < 2:
return 1
case t.Kind < 10:
return 32
default:
return 256
}
}
func (t Tensor) typeSize() uint64 {
blockSize := t.blockSize()
switch t.Kind {
case 0: // FP32
return 4
case 1: // FP16
return 2
case 2: // Q4_0
return 2 + blockSize/2
case 3: // Q4_1
return 2 + 2 + blockSize/2
case 6: // Q5_0
return 2 + 4 + blockSize/2
case 7: // Q5_1
return 2 + 2 + 4 + blockSize/2
case 8: // Q8_0
return 2 + blockSize
case 9: // Q8_1
return 4 + 4 + blockSize
case 10: // Q2_K
return blockSize/16 + blockSize/4 + 2 + 2
case 11: // Q3_K
return blockSize/8 + blockSize/4 + 12 + 2
case 12: // Q4_K
return 2 + 2 + 12 + blockSize/2
case 13: // Q5_K
return 2 + 2 + 12 + blockSize/8 + blockSize/2
case 14: // Q6_K
return blockSize/2 + blockSize/4 + blockSize/16 + 2
case 15: // Q8_K
return 2 + blockSize + 2*blockSize/16
case 16: // IQ2_XXS
return 2 + 2*blockSize/8
case 17: // IQ2_XS
return 2 + 2*blockSize/8 + blockSize/32
case 18: // IQ3_XXS
return 2 + 3*blockSize/8
default:
return 0
}
}
func (t Tensor) parameters() uint64 {
var count uint64 = 1
for _, n := range t.Shape {
count *= n
}
return count
}
func (t Tensor) size() uint64 {
return t.parameters() * t.typeSize() / t.blockSize()
ModelFamily() string
ModelType() string
FileType() string
NumLayers() uint32
NumGQA() uint32
NumEmbed() uint32
NumHead() uint32
NumHeadKv() uint32
NumCtx() uint32
}
type container interface {
@@ -285,102 +122,42 @@ const (
var ErrUnsupportedFormat = errors.New("unsupported model format")
func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
func DecodeGGML(rs io.ReadSeeker) (*GGML, error) {
var magic uint32
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
return nil, 0, err
return nil, err
}
var c container
switch magic {
case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
return nil, 0, ErrUnsupportedFormat
return nil, ErrUnsupportedFormat
case FILE_MAGIC_GGLA:
c = &containerGGLA{}
c = &ContainerGGLA{}
case FILE_MAGIC_GGUF_LE:
c = &containerGGUF{ByteOrder: binary.LittleEndian}
c = &ContainerGGUF{ByteOrder: binary.LittleEndian}
case FILE_MAGIC_GGUF_BE:
c = &containerGGUF{ByteOrder: binary.BigEndian}
c = &ContainerGGUF{ByteOrder: binary.BigEndian}
default:
return nil, 0, errors.New("invalid file magic")
return nil, errors.New("invalid file magic")
}
model, err := c.Decode(rs)
if errors.Is(err, io.EOF) {
// noop
} else if err != nil {
return nil, 0, err
return nil, err
}
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return nil, 0, err
return nil, err
}
// final model type
return &GGML{
container: c,
model: model,
}, offset, nil
}
func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
embedding := llm.KV().EmbeddingLength()
heads := llm.KV().HeadCount()
headsKV := llm.KV().HeadCountKV()
vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
layers := llm.Tensors().Layers()
switch llm.KV().Architecture() {
case "llama":
fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
partialOffload = 4 * batch * embedding
partialOffload += max(
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
4*batch*(embedding+vocab)+embedding*vocab*105/128,
)
if ffnGateWeight, ok := layers["0"]["ffn_gate.0.weight"]; ok {
ffnGateWeight1 := ffnGateWeight.Shape[1]
fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
partialOffload = max(
4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
)
}
case "gemma":
fullOffload = 4 * batch * (embedding + vocab)
partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
case "command-r":
fullOffload = max(
4*batch*(embedding+vocab),
4*batch*(2+4*embedding+context*(1+heads)),
)
partialOffload = max(
4*batch*(embedding+vocab)+embedding*vocab*105/128,
4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
)
case "qwen2":
fullOffload = max(
4*batch*(embedding+vocab),
4*batch*(1+2*embedding+context+context*heads),
)
partialOffload = max(
4*batch*(embedding+vocab)+embedding*vocab*105/128,
4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
)
case "phi2":
fullOffload = max(
4*batch*(embedding+vocab),
4*batch*(1+4*embedding+context+context*heads),
)
partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
}
return
Size: offset,
}, nil
}

View File

File diff suppressed because it is too large Load Diff

100
llm/llama.go Normal file
View File

@@ -0,0 +1,100 @@
package llm
import (
_ "embed"
"fmt"
"time"
"github.com/ollama/ollama/api"
)
const jsonGrammar = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
`
type ImageData struct {
Data []byte `json:"data"`
ID int `json:"id"`
}
var payloadMissing = fmt.Errorf("expected dynamic library payloads not included in this build of ollama")
type prediction struct {
Content string `json:"content"`
Model string `json:"model"`
Prompt string `json:"prompt"`
Stop bool `json:"stop"`
Timings struct {
PredictedN int `json:"predicted_n"`
PredictedMS float64 `json:"predicted_ms"`
PromptN int `json:"prompt_n"`
PromptMS float64 `json:"prompt_ms"`
}
}
const maxRetries = 3
type PredictOpts struct {
Prompt string
Format string
Images []ImageData
Options api.Options
}
type PredictResult struct {
Content string
Done bool
PromptEvalCount int
PromptEvalDuration time.Duration
EvalCount int
EvalDuration time.Duration
}
type TokenizeRequest struct {
Content string `json:"content"`
}
type TokenizeResponse struct {
Tokens []int `json:"tokens"`
}
type DetokenizeRequest struct {
Tokens []int `json:"tokens"`
}
type DetokenizeResponse struct {
Content string `json:"content"`
}
type EmbeddingRequest struct {
Content string `json:"content"`
}
type EmbeddingResponse struct {
Embedding []float64 `json:"embedding"`
}

View File

@@ -1,86 +1,175 @@
package llm
// #cgo CFLAGS: -Illama.cpp
// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
// #include <stdlib.h>
// #include "llama.h"
import "C"
import (
"context"
"fmt"
"unsafe"
"log/slog"
"os"
"runtime"
"slices"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/gpu"
)
// SystemInfo is an unused example of calling llama.cpp functions using CGo
func SystemInfo() string {
return C.GoString(C.llama_print_system_info())
type LLM interface {
Predict(context.Context, PredictOpts, func(PredictResult)) error
Embedding(context.Context, string) ([]float64, error)
Encode(context.Context, string) ([]int, error)
Decode(context.Context, []int) (string, error)
Close()
}
func Quantize(infile, outfile, filetype string) error {
cinfile := C.CString(infile)
defer C.free(unsafe.Pointer(cinfile))
var cpuOnlyFamilies = []string{
"mamba",
}
coutfile := C.CString(outfile)
defer C.free(unsafe.Pointer(coutfile))
func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
params := C.llama_model_quantize_default_params()
params.nthread = -1
f, err := os.Open(model)
if err != nil {
return nil, err
}
defer f.Close()
switch filetype {
case "F32":
params.ftype = fileTypeF32
case "F16":
params.ftype = fileTypeF16
case "Q4_0":
params.ftype = fileTypeQ4_0
case "Q4_1":
params.ftype = fileTypeQ4_1
case "Q4_1_F16":
params.ftype = fileTypeQ4_1_F16
case "Q8_0":
params.ftype = fileTypeQ8_0
case "Q5_0":
params.ftype = fileTypeQ5_0
case "Q5_1":
params.ftype = fileTypeQ5_1
case "Q2_K":
params.ftype = fileTypeQ2_K
case "Q3_K_S":
params.ftype = fileTypeQ3_K_S
case "Q3_K_M":
params.ftype = fileTypeQ3_K_M
case "Q3_K_L":
params.ftype = fileTypeQ3_K_L
case "Q4_K_S":
params.ftype = fileTypeQ4_K_S
case "Q4_K_M":
params.ftype = fileTypeQ4_K_M
case "Q5_K_S":
params.ftype = fileTypeQ5_K_S
case "Q5_K_M":
params.ftype = fileTypeQ5_K_M
case "Q6_K":
params.ftype = fileTypeQ6_K
case "IQ2_XXS":
params.ftype = fileTypeIQ2_XXS
case "IQ2_XS":
params.ftype = fileTypeIQ2_XS
case "Q2_K_S":
params.ftype = fileTypeQ2_K_S
case "Q3_K_XS":
params.ftype = fileTypeQ3_K_XS
case "IQ3_XXS":
params.ftype = fileTypeIQ3_XXS
ggml, err := DecodeGGML(f)
if err != nil {
return nil, err
}
if opts.NumCtx > int(ggml.NumCtx()) {
slog.Warn(fmt.Sprintf("requested context length is greater than model's max context length (%d > %d), using %d instead", opts.NumCtx, ggml.NumCtx(), ggml.NumCtx()))
opts.NumCtx = int(ggml.NumCtx())
}
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
vram, _ := gpu.CheckVRAM()
size := ggml.Size
// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1))
// this amount is the overhead + tensors in memory
// TODO: get this from the llama.cpp's graph calculations instead of
// estimating it's 1/6 * kv_cache_size * num_gqa
graph := int64(ggml.NumGQA()) * kv / 6
// certain model architectures don't support gpu inference yet
if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) {
opts.NumGPU = 0
}
info := gpu.GetGPUInfo()
switch runtime.GOOS {
case "darwin":
if opts.NumGPU == 0 {
break
}
if size+kv+graph > vram {
slog.Info("not enough vram available, setting num_gpu=0")
opts.NumGPU = 0
break
}
// TODO: implement layer splitting on macOS
opts.NumGPU = 999
default:
return fmt.Errorf("unknown filetype: %s", filetype)
if info.Library == "cpu" {
slog.Info("GPU not available, falling back to CPU")
opts.NumGPU = 0
break
}
// don't use GPU at all if no layers are loaded
if opts.NumGPU == 0 {
info.Library = "cpu"
info.Variant = gpu.GetCPUVariant()
break
}
// user-defined GPU count
if opts.NumGPU != -1 {
break
}
// the "main" GPU needs the most memory and determines the limit
// of how many layers can be loaded. It needs to fit:
// 1. the full compute graph allocation for all devices (graph)
// 2. the proportional kv cache for all devices (kv * % layers)
// 3. the proportional model (size * % layers / # devices)
// This estimates the number of layers
maxlayers := int64(ggml.NumLayers()) + 1
devices := int64(info.DeviceCount)
avg := vram / devices
layers := maxlayers * (avg - graph) / (kv + size/devices)
if layers > maxlayers {
layers = maxlayers
}
// 1 + 2 must fit on the main gpu
min := graph + kv*layers/maxlayers
if layers <= 0 || min > avg {
slog.Info("not enough vram available, falling back to CPU only")
info.Library = "cpu"
info.Variant = gpu.GetCPUVariant()
opts.NumGPU = 0
break
}
opts.NumGPU = int(layers)
}
if retval := C.llama_model_quantize(cinfile, coutfile, &params); retval != 0 {
return fmt.Errorf("llama_model_quantize: %d", retval)
}
return nil
opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0
return newLlmServer(info, model, adapters, projectors, opts)
}
// Give any native cgo implementations an opportunity to initialize
func Init() error {
return nativeInit()
}
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
dynLibs := getDynLibs(gpuInfo)
// Check to see if the user has requested a specific library instead of auto-detecting
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
if demandLib != "" {
libPath := availableDynLibs[demandLib]
if libPath == "" {
slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
} else {
slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
dynLibs = []string{libPath}
}
}
// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
_, err := os.Stat(dynLibs[0])
if err != nil {
slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
err = nativeInit()
if err != nil {
return nil, err
}
}
err2 := fmt.Errorf("unable to locate suitable llm library")
for _, dynLib := range dynLibs {
srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
if err == nil {
return srv, nil
}
slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
err2 = err
}
return nil, err2
}

View File

@@ -1,6 +0,0 @@
package llm
import "embed"
//go:embed build/linux/*/*/bin/*
var libEmbed embed.FS

View File

@@ -1,6 +0,0 @@
package llm
import "embed"
//go:embed build/windows/*/*/bin/*
var libEmbed embed.FS

View File

@@ -0,0 +1,13 @@
diff --git a/llama.cpp b/llama.cpp
index b27aa272..99372f9c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9360,7 +9360,7 @@ struct llm_tokenizer_wpm {
}
uint32_t to_lower(uint32_t code) {
- static const std::locale locale("en_US.UTF-8");
+ static const std::locale locale("");
#if defined(_WIN32)
if (code > 0xFFFF) {
return code;

View File

@@ -1,211 +0,0 @@
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"strings"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
)
var errPayloadMissing = fmt.Errorf("expected payloads not included in this build of ollama")
func Init() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
slog.Info("extracting embedded files", "dir", payloadsDir)
binGlob := "build/*/*/*/bin/*"
// extract server libraries
err = extractFiles(payloadsDir, binGlob)
if err != nil {
return fmt.Errorf("extract binaries: %v", err)
}
var variants []string
for v := range availableServers() {
variants = append(variants, v)
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
// binary names may contain an optional variant separated by '_'
// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
// Any library without a variant is the lowest common denominator
func availableServers() map[string]string {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
slog.Error("payload lookup error", "error", err)
return nil
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*")
files, err := filepath.Glob(pattern)
if err != nil {
slog.Debug("could not glob", "pattern", pattern, "error", err)
return nil
}
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(file)] = file
}
return servers
}
// serversForGpu returns a list of compatible servers give the provided GPU
// info, ordered by performance. assumes Init() has been called
// TODO - switch to metadata based mapping
func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := availableServers()
requested := info.Library
if info.Variant != "" {
requested += "_" + info.Variant
}
servers := []string{}
// exact match first
for a := range availableServers {
if a == requested {
servers = []string{a}
if a == "metal" {
return servers
}
break
}
}
alt := []string{}
// Then for GPUs load alternates and sort the list for consistent load ordering
if info.Library != "cpu" {
for a := range availableServers {
if info.Library == strings.Split(a, "_")[0] && a != requested {
alt = append(alt, a)
}
}
slices.Sort(alt)
servers = append(servers, alt...)
}
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableServers {
if cmp == "cpu_"+variant {
servers = append(servers, cmp)
break
}
}
} else {
servers = append(servers, "cpu")
}
}
if len(servers) == 0 {
servers = []string{"cpu"}
}
return servers
}
// extract extracts the embedded files to the target directory
func extractFiles(targetDir string, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return errPayloadMissing
}
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
}
g := new(errgroup.Group)
// build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
for _, file := range files {
filename := file
variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
slog.Debug("extracting", "variant", variant, "file", filename)
g.Go(func() error {
srcf, err := libEmbed.Open(filename)
if err != nil {
return err
}
defer srcf.Close()
src := io.Reader(srcf)
if strings.HasSuffix(filename, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", filename, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
variantDir := filepath.Join(targetDir, variant)
if err := os.MkdirAll(variantDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err)
}
base := filepath.Base(filename)
destFilename := filepath.Join(variantDir, base)
_, err = os.Stat(destFilename)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", filename, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", filename, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", filename, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return err
}
return nil
}

233
llm/payload_common.go Normal file
View File

@@ -0,0 +1,233 @@
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
)
// Libraries names may contain an optional variant separated by '_'
// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
// Any library without a variant is the lowest common denominator
var availableDynLibs = map[string]string{}
const pathComponentCount = 7
// getDynLibs returns an ordered list of LLM libraries to try, starting with the best
func getDynLibs(gpuInfo gpu.GpuInfo) []string {
// Short circuit if we know we're using the default built-in (darwin only)
if gpuInfo.Library == "default" {
return []string{"default"}
}
// TODO - temporary until we have multiple CPU variations for Darwin
// Short circuit on darwin with metal only
if len(availableDynLibs) == 1 {
if _, onlyMetal := availableDynLibs["metal"]; onlyMetal {
return []string{availableDynLibs["metal"]}
}
}
exactMatch := ""
dynLibs := []string{}
altDynLibs := []string{}
requested := gpuInfo.Library
if gpuInfo.Variant != "" {
requested += "_" + gpuInfo.Variant
}
// Try to find an exact match
for cmp := range availableDynLibs {
if requested == cmp {
exactMatch = cmp
dynLibs = []string{availableDynLibs[cmp]}
break
}
}
// Then for GPUs load alternates and sort the list for consistent load ordering
if gpuInfo.Library != "cpu" {
for cmp := range availableDynLibs {
if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
altDynLibs = append(altDynLibs, cmp)
}
}
slices.Sort(altDynLibs)
for _, altDynLib := range altDynLibs {
dynLibs = append(dynLibs, availableDynLibs[altDynLib])
}
}
// Load up the best CPU variant if not primary requested
if gpuInfo.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableDynLibs {
if cmp == "cpu_"+variant {
dynLibs = append(dynLibs, availableDynLibs[cmp])
break
}
}
} else {
dynLibs = append(dynLibs, availableDynLibs["cpu"])
}
}
// Finally, if we didn't find any matches, LCD CPU FTW
if len(dynLibs) == 0 {
dynLibs = []string{availableDynLibs["cpu"]}
}
slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
return dynLibs
}
func rocmDynLibPresent() bool {
for dynLibName := range availableDynLibs {
if strings.HasPrefix(dynLibName, "rocm") {
return true
}
}
return false
}
func nativeInit() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir))
libs, err := extractDynamicLibs(payloadsDir, "llama.cpp/build/*/*/*/lib/*")
if err != nil {
if errors.Is(err, payloadMissing) {
slog.Info(fmt.Sprintf("%s", payloadMissing))
return nil
}
return err
}
for _, lib := range libs {
// The last dir component is the variant name
variant := filepath.Base(filepath.Dir(lib))
availableDynLibs[variant] = lib
}
if err := verifyDriverAccess(); err != nil {
return err
}
// Report which dynamic libraries we have loaded to assist troubleshooting
variants := make([]string, len(availableDynLibs))
i := 0
for variant := range availableDynLibs {
variants[i] = variant
i++
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return nil, payloadMissing
}
var mu sync.Mutex
var libs []string
var g errgroup.Group
for _, file := range files {
pathComps := strings.Split(file, "/")
if len(pathComps) != pathComponentCount {
slog.Error(fmt.Sprintf("unexpected payload components: %v", pathComps))
continue
}
file := file
g.Go(func() error {
// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
// Include the variant in the path to avoid conflicts between multiple server libs
targetDir := filepath.Join(payloadsDir, pathComps[pathComponentCount-3])
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("create payload lib dir %s: %v", payloadsDir, err)
}
src := io.Reader(srcFile)
filename := file
if strings.HasSuffix(file, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", file, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
destFile := filepath.Join(targetDir, filepath.Base(filename))
if strings.Contains(destFile, "server") {
mu.Lock()
libs = append(libs, destFile)
mu.Unlock()
}
destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFp.Close()
if _, err := io.Copy(destFp, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return nil, err
}
return libs, nil
}
func verifyDriverAccess() error {
if runtime.GOOS != "linux" {
return nil
}
// Only check ROCm access if we have the dynamic lib loaded
if rocmDynLibPresent() {
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
} else if errors.Is(err, fs.ErrNotExist) {
// expected behavior without a radeon card
return nil
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
}
return nil
}

View File

@@ -0,0 +1,8 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
var libEmbed embed.FS

View File

@@ -0,0 +1,8 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
var libEmbed embed.FS

View File

@@ -4,5 +4,5 @@ import (
"embed"
)
//go:embed build/darwin/x86_64/*/bin/*
//go:embed llama.cpp/build/linux/*/*/lib/*
var libEmbed embed.FS

58
llm/payload_test.go Normal file
View File

@@ -0,0 +1,58 @@
package llm
import (
"testing"
"github.com/ollama/ollama/gpu"
"github.com/stretchr/testify/assert"
)
func TestGetDynLibs(t *testing.T) {
availableDynLibs = map[string]string{
"cpu": "X_cpu",
}
assert.Equal(t, false, rocmDynLibPresent())
res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"], res[0])
variant := gpu.GetCPUVariant()
if variant != "" {
variant = "_" + variant
}
availableDynLibs = map[string]string{
"rocm_v5": "X_rocm_v5",
"rocm_v6": "X_rocm_v6",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"+variant], res[0])
res = getDynLibs(gpu.GpuInfo{Library: "default"})
assert.Len(t, res, 1)
assert.Equal(t, "default", res[0])
availableDynLibs = map[string]string{
"rocm": "X_rocm_v5",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 2)
assert.Equal(t, availableDynLibs["rocm"], res[0])
assert.Equal(t, availableDynLibs["cpu"+variant], res[1])
}

View File

@@ -4,5 +4,5 @@ import (
"embed"
)
//go:embed build/darwin/arm64/*/bin/*
//go:embed llama.cpp/build/windows/*/*/lib/*.dll*
var libEmbed embed.FS

View File

@@ -1,843 +0,0 @@
package llm
import (
"bufio"
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"log/slog"
"math/rand"
"net"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"time"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
)
// LlamaServer is an instance of the llama.cpp server
type LlamaServer struct {
port int
cmd *exec.Cmd
done chan error // Channel to signal when the process exits
status *StatusWriter
options api.Options
}
func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
f, err := os.Open(model)
if err != nil {
return nil, err
}
defer f.Close()
ggml, _, err := DecodeGGML(f)
if err != nil {
return nil, err
}
if opts.NumCtx > int(ggml.KV().ContextLength()) {
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
opts.NumCtx = int(ggml.KV().ContextLength())
}
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
memoryAvailable, _ := gpu.CheckVRAM()
info := gpu.GetGPUInfo()
memoryMinimum := info.MinimumMemory
for _, projector := range projectors {
memoryMinimum += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
if graphPartialOffload == 0 {
graphPartialOffload = ggml.KV().GQA() * kv / 6
}
if graphFullOffload == 0 {
graphFullOffload = graphPartialOffload
}
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
memoryRequiredTotal := memoryMinimum + graphFullOffload
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
memoryRequiredPartial := memoryMinimum + graphPartialOffload
if info.Library != "metal" {
if memoryRequiredPartial > memoryAvailable {
info.Library = "cpu"
}
}
var layerCount int
layers := ggml.Tensors().Layers()
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
memoryLayer := layers[fmt.Sprintf("%d", i)].size()
// KV is proportional to the number of layers
memoryLayer += kv / ggml.KV().BlockCount()
memoryRequiredTotal += memoryLayer
if memoryAvailable > memoryRequiredPartial+memoryLayer {
memoryRequiredPartial += memoryLayer
layerCount++
}
}
memoryLayerOutput := layers["output"].size()
memoryRequiredTotal += memoryLayerOutput
if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
// disable partial offloading when model is greater than total system memory
opts.NumGPU = 0
} else if memoryAvailable > memoryRequiredTotal {
layerCount = int(ggml.KV().BlockCount()) + 1
memoryRequiredPartial = memoryRequiredTotal
}
if opts.NumGPU < 0 {
opts.NumGPU = layerCount
}
slog.Info(
"offload to gpu",
"reallayers", opts.NumGPU,
"layers", layerCount,
"required", format.HumanBytes2(memoryRequiredTotal),
"used", format.HumanBytes2(memoryRequiredPartial),
"available", format.HumanBytes2(memoryAvailable),
"kv", format.HumanBytes2(kv),
"fulloffload", format.HumanBytes2(graphFullOffload),
"partialoffload", format.HumanBytes2(graphPartialOffload),
)
if len(adapters) > 1 {
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
}
availableServers := availableServers()
servers := serversForGpu(info)
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
if demandLib != "" {
serverPath := availableServers[demandLib]
if serverPath == "" {
slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
} else {
slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
servers = []string{demandLib}
}
}
if len(servers) == 0 {
return nil, fmt.Errorf("no servers found for %v", info)
}
params := []string{
"--model", model,
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--embedding",
}
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
params = append(params, "--log-format", "json")
} else {
params = append(params, "--log-disable")
}
if opts.NumGPU >= 0 {
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
}
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
params = append(params, "--verbose")
}
if opts.MainGPU > 0 {
params = append(params, "--main-gpu", fmt.Sprintf("%d", opts.MainGPU))
}
if len(adapters) > 0 {
// TODO: applying multiple adapters is not supported by the llama.cpp server yet
params = append(params, "--lora", adapters[0])
}
if len(projectors) > 0 {
// TODO: applying multiple projectors is not supported by the llama.cpp server yet
params = append(params, "--mmproj", projectors[0])
}
if opts.NumThread > 0 {
params = append(params, "--threads", fmt.Sprintf("%d", opts.NumThread))
}
if !opts.F16KV {
params = append(params, "--memory-f32")
}
if opts.UseMLock {
params = append(params, "--mlock")
}
if !opts.UseMMap {
params = append(params, "--no-mmap")
}
if opts.UseNUMA {
params = append(params, "--numa")
}
// Loop through potential servers
var finalErr error
for i := 0; i < len(servers); i++ {
dir := availableServers[servers[i]]
// Find an availableServers port, retry on each iterration in case the failure was a port conflict race
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
if l, err = net.ListenTCP("tcp", a); err == nil {
port = l.Addr().(*net.TCPAddr).Port
l.Close()
}
}
if port == 0 {
slog.Debug("ResolveTCPAddr failed ", "error", err)
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
finalParams := append(params, "--port", strconv.Itoa(port))
pathEnv := "LD_LIBRARY_PATH"
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
// append the server directory to LD_LIBRARY_PATH/PATH
libraryPaths := []string{dir}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// Append our runner directory to the path
// This will favor system libraries over our bundled library dependencies
libraryPaths = append(filepath.SplitList(libraryPath), libraryPaths...)
}
server := filepath.Join(dir, "ollama_llama_server")
if runtime.GOOS == "windows" {
server = server + ".exe"
}
s := &LlamaServer{
port: port,
cmd: exec.Command(server, finalParams...),
status: NewStatusWriter(os.Stderr),
options: opts,
}
libEnv := fmt.Sprintf("%s=%s", pathEnv, strings.Join(libraryPaths, string(filepath.ListSeparator)))
slog.Debug(libEnv)
s.cmd.Env = append(os.Environ(), libEnv)
s.cmd.Stdout = os.Stdout
s.cmd.Stderr = s.status
slog.Info("starting llama server", "cmd", s.cmd.String())
if err = s.cmd.Start(); err != nil {
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
err = fmt.Errorf("error starting the external llama server: %v %s", err, msg)
finalErr = err
continue
}
// reap subprocess when it exits
go func() {
// Exit status managed via getServerStatus
_ = s.cmd.Wait()
}()
return s, nil
}
slog.Error("unable to load any llama server", "error", finalErr)
return nil, finalErr
}
func projectorMemoryRequirements(filename string) uint64 {
file, err := os.Open(filename)
if err != nil {
return 0
}
defer file.Close()
ggml, _, err := DecodeGGML(file)
if err != nil {
return 0
}
var mem uint64
for _, layer := range ggml.Tensors().Layers() {
mem += layer.size()
}
return mem
}
type ServerStatus int
const ( // iota is reset to 0
ServerStatusReady ServerStatus = iota
ServerStatusNoSlotsAvaialble
ServerStatusLoadingModel
ServerStatusNotResponding
ServerStatusError
)
type ServerStatusResp struct {
Status string `json:"status"`
SlotsIdle int `json:"slots_idle"`
SlotsProcessing int `json:"slots_processing"`
Error string `json:"error"`
}
func (s *LlamaServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
// Fail fast if its exited
if s.cmd.ProcessState != nil {
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return ServerStatusError, fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/health", s.port), nil)
if err != nil {
return ServerStatusError, fmt.Errorf("error creating GET request: %v", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
if errors.Is(err, context.DeadlineExceeded) {
return ServerStatusNotResponding, fmt.Errorf("server not responding")
}
return ServerStatusError, fmt.Errorf("health resp: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return ServerStatusError, fmt.Errorf("read health request: %w", err)
}
var status ServerStatusResp
if err := json.Unmarshal(body, &status); err != nil {
return ServerStatusError, fmt.Errorf("health unmarshal encode response: %w", err)
}
switch status.Status {
case "ok":
return ServerStatusReady, nil
case "no slot available":
return ServerStatusNoSlotsAvaialble, nil
case "loading model":
return ServerStatusLoadingModel, nil
default:
return ServerStatusError, fmt.Errorf("server error: %+v", status)
}
}
func (s *LlamaServer) Ping(ctx context.Context) error {
_, err := s.getServerStatus(ctx)
if err != nil {
slog.Debug("server unhealthy", "error", err)
return err
}
return nil
}
func (s *LlamaServer) WaitUntilRunning() error {
start := time.Now()
// TODO we need to wire up a better way to detect hangs during model load and startup of the server
expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load
ticker := time.NewTicker(50 * time.Millisecond)
defer ticker.Stop()
slog.Info("waiting for llama runner to start responding")
var lastStatus ServerStatus = -1
for {
select {
case err := <-s.done:
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
case <-ticker.C:
if time.Now().After(expiresAt) {
// timeout
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
}
if s.cmd.ProcessState != nil {
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
}
ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
defer cancel()
status, err := s.getServerStatus(ctx)
if err != nil && lastStatus != status {
slog.Debug("server not yet available", "error", err)
lastStatus = status
continue
}
switch status {
case ServerStatusLoadingModel:
// TODO - this state never seems to happen with the current server.cpp code (bug?)
// it doesn't respond to the health endpoint until after the model is loaded
slog.Debug("loading model")
case ServerStatusReady:
slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds()))
return nil
}
}
}
}
const jsonGrammar = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
`
const maxBufferSize = 512 * format.KiloByte
const maxRetries = 3
type ImageData struct {
Data []byte `json:"data"`
ID int `json:"id"`
}
type completion struct {
Content string `json:"content"`
Model string `json:"model"`
Prompt string `json:"prompt"`
Stop bool `json:"stop"`
Timings struct {
PredictedN int `json:"predicted_n"`
PredictedMS float64 `json:"predicted_ms"`
PromptN int `json:"prompt_n"`
PromptMS float64 `json:"prompt_ms"`
}
}
type CompletionRequest struct {
Prompt string
Format string
Images []ImageData
Options api.Options
}
type CompletionResponse struct {
Content string
Done bool
PromptEvalCount int
PromptEvalDuration time.Duration
EvalCount int
EvalDuration time.Duration
}
func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
request := map[string]any{
"prompt": req.Prompt,
"stream": true,
"n_predict": req.Options.NumPredict,
"n_keep": req.Options.NumKeep,
"main_gpu": req.Options.MainGPU,
"temperature": req.Options.Temperature,
"top_k": req.Options.TopK,
"top_p": req.Options.TopP,
"tfs_z": req.Options.TFSZ,
"typical_p": req.Options.TypicalP,
"repeat_last_n": req.Options.RepeatLastN,
"repeat_penalty": req.Options.RepeatPenalty,
"presence_penalty": req.Options.PresencePenalty,
"frequency_penalty": req.Options.FrequencyPenalty,
"mirostat": req.Options.Mirostat,
"mirostat_tau": req.Options.MirostatTau,
"mirostat_eta": req.Options.MirostatEta,
"penalize_nl": req.Options.PenalizeNewline,
"seed": req.Options.Seed,
"stop": req.Options.Stop,
"image_data": req.Images,
"cache_prompt": true,
}
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return err
} else if status != ServerStatusReady {
return fmt.Errorf("unexpected server status: %d", status)
}
if req.Format == "json" {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(req.Prompt), "json") {
slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
}
retryDelay := 100 * time.Microsecond
for retries := 0; retries < maxRetries; retries++ {
if retries > 0 {
time.Sleep(retryDelay) // wait before retrying
retryDelay *= 2 // exponential backoff
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
if err := enc.Encode(request); err != nil {
return fmt.Errorf("failed to marshal data: %v", err)
}
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port)
req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
if err != nil {
return fmt.Errorf("error creating POST request: %v", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return fmt.Errorf("POST predict: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed reading llm error response: %w", err)
}
log.Printf("llm predict error: %s", bodyBytes)
return fmt.Errorf("%s", bodyBytes)
}
scanner := bufio.NewScanner(resp.Body)
buf := make([]byte, 0, maxBufferSize)
scanner.Buffer(buf, maxBufferSize)
retryNeeded := false
// keep track of the last token generated, this is used to abort if the model starts looping
var lastToken string
var tokenRepeat int
for scanner.Scan() {
select {
case <-ctx.Done():
// This handles the request cancellation
return ctx.Err()
default:
line := scanner.Bytes()
if len(line) == 0 {
continue
}
// try again on slot unavailable
if bytes.Contains(line, []byte("slot unavailable")) {
retryNeeded = true
break
}
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
return fmt.Errorf("error parsing llm response stream: %s", line)
}
var c completion
if err := json.Unmarshal(evt, &c); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
switch {
case strings.TrimSpace(c.Content) == lastToken:
tokenRepeat++
default:
lastToken = strings.TrimSpace(c.Content)
tokenRepeat = 0
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
if tokenRepeat > 30 {
slog.Debug("prediction aborted, token repeat limit reached")
return ctx.Err()
}
if c.Content != "" {
fn(CompletionResponse{
Content: c.Content,
})
}
if c.Stop {
fn(CompletionResponse{
Done: true,
PromptEvalCount: c.Timings.PromptN,
PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
EvalCount: c.Timings.PredictedN,
EvalDuration: parseDurationMs(c.Timings.PredictedMS),
})
return nil
}
}
}
if err := scanner.Err(); err != nil {
if strings.Contains(err.Error(), "unexpected EOF") {
s.Close()
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
}
return fmt.Errorf("error reading llm response: %v", err)
}
if !retryNeeded {
return nil // success
}
}
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
}
type EmbeddingRequest struct {
Content string `json:"content"`
}
type EmbeddingResponse struct {
Embedding []float64 `json:"embedding"`
}
func (s *LlamaServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return nil, err
} else if status != ServerStatusReady {
return nil, fmt.Errorf("unexpected server status: %d", status)
}
data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data))
if err != nil {
return nil, fmt.Errorf("error creating embed request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, fmt.Errorf("do embedding request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("error reading embed response: %w", err)
}
if resp.StatusCode >= 400 {
log.Printf("llm encode error: %s", body)
return nil, fmt.Errorf("%s", body)
}
var embedding EmbeddingResponse
if err := json.Unmarshal(body, &embedding); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
}
return embedding.Embedding, nil
}
type TokenizeRequest struct {
Content string `json:"content"`
}
type TokenizeResponse struct {
Tokens []int `json:"tokens"`
}
func (s *LlamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return nil, err
} else if status != ServerStatusReady {
return nil, fmt.Errorf("unexpected server status: %d", status)
}
data, err := json.Marshal(TokenizeRequest{Content: content})
if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data))
if err != nil {
return nil, fmt.Errorf("encode request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, fmt.Errorf("do encode request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("read encode request: %w", err)
}
if resp.StatusCode >= 400 {
log.Printf("llm encode error: %s", body)
return nil, fmt.Errorf("%s", body)
}
var encoded TokenizeResponse
if err := json.Unmarshal(body, &encoded); err != nil {
return nil, fmt.Errorf("unmarshal encode response: %w", err)
}
return encoded.Tokens, nil
}
type DetokenizeRequest struct {
Tokens []int `json:"tokens"`
}
type DetokenizeResponse struct {
Content string `json:"content"`
}
func (s *LlamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return "", err
} else if status != ServerStatusReady {
return "", fmt.Errorf("unexpected server status: %d", status)
}
data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
if err != nil {
return "", fmt.Errorf("marshaling decode data: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data))
if err != nil {
return "", fmt.Errorf("decode request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return "", fmt.Errorf("do decode request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("read decode request: %w", err)
}
if resp.StatusCode >= 400 {
log.Printf("llm decode error: %s", body)
return "", fmt.Errorf("%s", body)
}
var decoded DetokenizeResponse
if err := json.Unmarshal(body, &decoded); err != nil {
return "", fmt.Errorf("unmarshal encode response: %w", err)
}
return decoded.Content, nil
}
func (s *LlamaServer) Close() error {
if s.cmd != nil {
slog.Debug("stopping llama server")
return s.cmd.Process.Kill()
}
return nil
}
func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil {
panic(err)
}
return dur
}

View File

@@ -1,42 +0,0 @@
package llm
import (
"bytes"
"os"
)
// StatusWriter is a writer that captures error messages from the llama runner process
type StatusWriter struct {
LastErrMsg string
out *os.File
}
func NewStatusWriter(out *os.File) *StatusWriter {
return &StatusWriter{
out: out,
}
}
// TODO - regex matching to detect errors like
// libcublasLt.so.11: cannot open shared object file: No such file or directory
var errorPrefixes = []string{
"error:",
"CUDA error",
"cudaMalloc failed",
"\"ERR\"",
}
func (w *StatusWriter) Write(b []byte) (int, error) {
var errMsg string
for _, prefix := range errorPrefixes {
if _, after, ok := bytes.Cut(b, []byte(prefix)); ok {
errMsg = prefix + string(bytes.TrimSpace(after))
}
}
if errMsg != "" {
w.LastErrMsg = errMsg
}
return w.out.Write(b)
}

15
llm/utils.go Normal file
View File

@@ -0,0 +1,15 @@
package llm
import (
"fmt"
"time"
)
func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil {
panic(err)
}
return dur
}

View File

@@ -14,7 +14,7 @@ go build .
Then run the desktop app with `npm start`:
```
cd macapp
cd app
npm install
npm start
```

View File

@@ -142,9 +142,7 @@ func (h *History) Save() error {
for cnt := 0; cnt < h.Size(); cnt++ {
v, _ := h.Buf.Get(cnt)
line, _ := v.([]rune)
if _, err := buf.WriteString(string(line) + "\n"); err != nil {
return err
}
buf.WriteString(string(line) + "\n")
}
buf.Flush()
f.Close()

View File

@@ -10,7 +10,7 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$V
# For developers, you can override the DOCKER_ORG to generate multiarch manifests
# DOCKER_ORG=jdoe PUSH=1 ./scripts/build_docker.sh
DOCKER_ORG=${DOCKER_ORG:-"ollama"}
RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
ARCH_IMAGE_REPO=${ARCH_IMAGE_REPO:-"${DOCKER_ORG}/release"}
FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
@@ -25,7 +25,7 @@ OLLAMA_SKIP_IMAGE_BUILD=${OLLAMA_SKIP_IMAGE_BUILD:-""}
if [ -z "${PUSH}" ] ; then
LOAD_OR_PUSH="--load"
else
echo "Will be pushing ${RELEASE_IMAGE_REPO}:$VERSION for ${BUILD_ARCH}"
echo "Will be pushing ${ARCH_IMAGE_REPO}:$VERSION for ${BUILD_ARCH}"
LOAD_OR_PUSH="--push"
fi
@@ -37,7 +37,7 @@ if [ -z "${OLLAMA_SKIP_IMAGE_BUILD}" ]; then
--build-arg=VERSION \
--build-arg=GOFLAGS \
-f Dockerfile \
-t ${RELEASE_IMAGE_REPO}:$VERSION-${TARGETARCH} \
-t ${ARCH_IMAGE_REPO}:$VERSION-${TARGETARCH} \
.
done
@@ -49,7 +49,7 @@ if [ -z "${OLLAMA_SKIP_IMAGE_BUILD}" ]; then
--build-arg=GOFLAGS \
--target runtime-rocm \
-f Dockerfile \
-t ${RELEASE_IMAGE_REPO}:$VERSION-rocm \
-t ${ARCH_IMAGE_REPO}:$VERSION-rocm \
.
fi
fi
@@ -57,21 +57,21 @@ fi
if [ -z "${OLLAMA_SKIP_MANIFEST_CREATE}" ]; then
if [ -n "${PUSH}" ]; then
docker manifest create ${FINAL_IMAGE_REPO}:$VERSION \
${RELEASE_IMAGE_REPO}:$VERSION-amd64 \
${RELEASE_IMAGE_REPO}:$VERSION-arm64
${ARCH_IMAGE_REPO}:$VERSION-amd64 \
${ARCH_IMAGE_REPO}:$VERSION-arm64
docker manifest push ${FINAL_IMAGE_REPO}:$VERSION
# For symmetry, tag/push the rocm image
if [ "${RELEASE_IMAGE_REPO}" != "${FINAL_IMAGE_REPO}" ]; then
if [ "${ARCH_IMAGE_REPO}" != "${FINAL_IMAGE_REPO}" ]; then
echo "Tagging and pushing rocm image"
docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm
docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:$VERSION-rocm
docker pull ${ARCH_IMAGE_REPO}:$VERSION-rocm
docker tag ${ARCH_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:$VERSION-rocm
docker push ${FINAL_IMAGE_REPO}:$VERSION-rocm
fi
else
echo "Skipping manifest generation when not pushing images are available locally as "
echo " ${RELEASE_IMAGE_REPO}:$VERSION-amd64"
echo " ${RELEASE_IMAGE_REPO}:$VERSION-arm64"
echo " ${RELEASE_IMAGE_REPO}:$VERSION-rocm"
echo " ${ARCH_IMAGE_REPO}:$VERSION-amd64"
echo " ${ARCH_IMAGE_REPO}:$VERSION-arm64"
echo " ${ARCH_IMAGE_REPO}:$VERSION-rocm"
fi
fi

View File

@@ -1,33 +0,0 @@
#!/bin/sh
set -eu
# We use 2 different image repositories to handle combining architecture images into multiarch manifest
# (The ROCm image is x86 only and is not a multiarch manifest)
# For developers, you can override the DOCKER_ORG to generate multiarch manifests
# DOCKER_ORG=jdoe VERSION=0.1.30 PUSH=1 ./scripts/tag_latest.sh
DOCKER_ORG=${DOCKER_ORG:-"ollama"}
RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
# Set PUSH to a non-empty string to trigger push instead of load
PUSH=${PUSH:-""}
echo "Assembling manifest and tagging latest"
docker manifest rm ${FINAL_IMAGE_REPO}:latest || true
docker manifest create ${FINAL_IMAGE_REPO}:latest \
${RELEASE_IMAGE_REPO}:$VERSION-amd64 \
${RELEASE_IMAGE_REPO}:$VERSION-arm64
docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm
docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:rocm
if [ -n "${PUSH}" ]; then
echo "Pushing latest tags up..."
docker manifest push ${FINAL_IMAGE_REPO}:latest
docker push ${FINAL_IMAGE_REPO}:rocm
else
echo "Not pushing ${FINAL_IMAGE_REPO}:latest and ${FINAL_IMAGE_REPO}:rocm"
fi

View File

@@ -247,8 +247,7 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
}
if !part.lastUpdated.IsZero() && time.Since(part.lastUpdated) > 5*time.Second {
const msg = "%s part %d stalled; retrying. If this persists, press ctrl-c to exit, then 'ollama pull' to find a faster connection."
slog.Info(fmt.Sprintf(msg, b.Digest[7:19], part.N))
slog.Info(fmt.Sprintf("%s part %d stalled; retrying", b.Digest[7:19], part.N))
// reset last updated
part.lastUpdated = time.Time{}
return errPartStalled

View File

@@ -26,7 +26,6 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/convert"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/version"
@@ -284,7 +283,7 @@ func realpath(mfDir, from string) string {
return abspath
}
func CreateModel(ctx context.Context, name, modelFileDir, quantization string, commands []parser.Command, fn func(resp api.ProgressResponse)) error {
func CreateModel(ctx context.Context, name, modelFileDir string, commands []parser.Command, fn func(resp api.ProgressResponse)) error {
deleteMap := make(map[string]struct{})
if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil {
for _, layer := range append(manifest.Layers, manifest.Config) {
@@ -322,7 +321,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
pathName := realpath(modelFileDir, c.Args)
ggufName, err := convertModel(name, pathName, fn)
ggufName, err := convertSafetensors(name, pathName)
if err != nil {
var pathErr *fs.PathError
switch {
@@ -338,26 +337,6 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
if ggufName != "" {
pathName = ggufName
defer os.RemoveAll(ggufName)
if quantization != "" {
quantization = strings.ToUpper(quantization)
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", "F16", quantization)})
tempfile, err := os.CreateTemp(filepath.Dir(ggufName), quantization)
if err != nil {
return err
}
defer os.RemoveAll(tempfile.Name())
if err := llm.Quantize(ggufName, tempfile.Name(), quantization); err != nil {
return err
}
if err := tempfile.Close(); err != nil {
return err
}
pathName = tempfile.Name()
}
}
bin, err := os.Open(pathName)
@@ -440,32 +419,34 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
defer bin.Close()
var offset int64
CREATE:
for {
fn(api.ProgressResponse{Status: "creating model layer"})
if _, err := bin.Seek(offset, io.SeekStart); err != nil {
return err
}
ggml, size, err := llm.DecodeGGML(bin)
if errors.Is(err, io.EOF) {
break
} else if errors.Is(err, llm.ErrUnsupportedFormat) {
return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
} else if err != nil {
return err
bin.Seek(offset, io.SeekStart)
ggml, err := llm.DecodeGGML(bin)
if err != nil {
switch {
case errors.Is(err, io.EOF):
break CREATE
case errors.Is(err, llm.ErrUnsupportedFormat):
return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
default:
return err
}
}
config.SetModelFormat(ggml.Name())
config.SetModelFamily(ggml.KV().Architecture())
config.SetModelType(format.HumanNumber(ggml.KV().ParameterCount()))
config.SetFileType(ggml.KV().FileType())
config.SetModelFamily(ggml.ModelFamily())
config.SetModelType(ggml.ModelType())
config.SetFileType(ggml.FileType())
mediatype := mediatype
if ggml.KV().Architecture() == "clip" {
if ggml.ModelFamily() == "clip" {
mediatype = "application/vnd.ollama.image.projector"
}
sr := io.NewSectionReader(bin, offset, size)
sr := io.NewSectionReader(bin, offset, ggml.Size)
layer, err := NewLayer(sr, mediatype)
if err != nil {
return err
@@ -473,7 +454,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
layers.Add(layer)
offset += size
offset += ggml.Size
}
case "adapter":
if strings.HasPrefix(c.Args, "@") {
@@ -492,12 +473,12 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
}
defer bin.Close()
_, size, err := llm.DecodeGGML(bin)
ggml, err := llm.DecodeGGML(bin)
if err != nil {
return err
}
sr := io.NewSectionReader(bin, 0, size)
sr := io.NewSectionReader(bin, 0, ggml.Size)
layer, err := NewLayer(sr, mediatype)
if err != nil {
return err
@@ -569,6 +550,13 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
}
}
// xxx - can this be removed?
if config.ModelType == "65B" {
if gqa, ok := formattedParams["gqa"].(int); ok && gqa == 8 {
config.ModelType = "70B"
}
}
var b bytes.Buffer
if err := json.NewEncoder(&b).Encode(formattedParams); err != nil {
return err
@@ -633,8 +621,8 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
return nil
}
func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string, error) {
r, err := zip.OpenReader(path)
func convertSafetensors(name, fn string) (string, error) {
r, err := zip.OpenReader(fn)
if err != nil {
return "", err
}
@@ -646,7 +634,6 @@ func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string
}
defer os.RemoveAll(tempDir)
fn(api.ProgressResponse{Status: "unpacking model metadata"})
for _, f := range r.File {
fpath := filepath.Join(tempDir, f.Name)
outFile, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode())
@@ -668,37 +655,37 @@ func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string
rc.Close()
}
mf, err := convert.GetModelFormat(tempDir)
params, err := convert.GetParams(tempDir)
if err != nil {
return "", err
}
params, err := mf.GetParams(tempDir)
SupportedArchs := []string{
"MistralForCausalLM",
}
for _, arch := range params.Architectures {
if !slices.Contains(SupportedArchs, arch) {
return "", fmt.Errorf("this safetensors model is not yet supported")
}
}
t, err := convert.GetSafeTensors(tempDir)
if err != nil {
return "", err
}
mArch, err := mf.GetModelArch(name, tempDir, params)
vocab, err := convert.LoadTokens(tempDir)
if err != nil {
return "", err
}
fn(api.ProgressResponse{Status: "processing tensors"})
if err := mArch.GetTensors(); err != nil {
return "", err
}
if err := mArch.LoadVocab(); err != nil {
return "", err
}
fn(api.ProgressResponse{Status: "converting model"})
path, err = mArch.WriteGGUF()
fn, err = convert.WriteGGUF(name, t, params, vocab)
if err != nil {
return "", err
}
return path, nil
return fn, nil
}
func CopyModel(src, dest string) error {

View File

@@ -56,49 +56,34 @@ func init() {
var loaded struct {
mu sync.Mutex
llama *llm.LlamaServer
runner llm.LLM
expireAt time.Time
expireTimer *time.Timer
model string
adapters []string
projectors []string
*Model
*api.Options
}
var defaultSessionDuration = 5 * time.Minute
func unload() {
if loaded.llama != nil {
loaded.llama.Close()
}
loaded.llama = nil
loaded.model = ""
loaded.adapters = nil
loaded.projectors = nil
loaded.Options = nil
}
// load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
ctx, cancel := context.WithTimeout(c, 10*time.Second)
defer cancel()
needLoad := loaded.llama == nil || // is there a model loaded?
loaded.model != model.ModelPath || // has the base model changed?
!reflect.DeepEqual(loaded.adapters, model.AdapterPaths) || // have the adapters changed?
!reflect.DeepEqual(loaded.projectors, model.ProjectorPaths) || // have the adapters changed?
!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) || // have the runner options changed?
loaded.llama.Ping(ctx) != nil
needLoad := loaded.runner == nil || // is there a model loaded?
loaded.ModelPath != model.ModelPath || // has the base model changed?
!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) // have the runner options changed?
if needLoad {
if loaded.llama != nil {
if loaded.runner != nil {
slog.Info("changing loaded model")
unload()
loaded.runner.Close()
loaded.runner = nil
loaded.Model = nil
loaded.Options = nil
}
llama, err := llm.NewLlamaServer(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
llmRunner, err := llm.New(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to
@@ -110,24 +95,29 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D
return err
}
loaded.model = model.ModelPath
loaded.adapters = model.AdapterPaths
loaded.projectors = model.ProjectorPaths
loaded.llama = llama
loaded.Model = model
loaded.runner = llmRunner
loaded.Options = &opts
if err = llama.WaitUntilRunning(); err != nil {
slog.Error("error loading llama server", "error", err)
unload()
return err
}
}
loaded.expireAt = time.Now().Add(sessionDuration)
if loaded.expireTimer == nil {
loaded.expireTimer = time.AfterFunc(sessionDuration, func() {
loaded.mu.Lock()
defer loaded.mu.Unlock()
unload()
if time.Now().Before(loaded.expireAt) {
return
}
if loaded.runner != nil {
loaded.runner.Close()
}
loaded.runner = nil
loaded.Model = nil
loaded.Options = nil
})
}
@@ -275,7 +265,7 @@ func GenerateHandler(c *gin.Context) {
sb.Reset()
if req.Context != nil {
prev, err := loaded.llama.Detokenize(c.Request.Context(), req.Context)
prev, err := loaded.runner.Decode(c.Request.Context(), req.Context)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
@@ -296,8 +286,9 @@ func GenerateHandler(c *gin.Context) {
go func() {
defer close(ch)
fn := func(r llm.CompletionResponse) {
fn := func(r llm.PredictResult) {
// Update model expiration
loaded.expireAt = time.Now().Add(sessionDuration)
loaded.expireTimer.Reset(sessionDuration)
// Build up the full response
@@ -331,7 +322,7 @@ func GenerateHandler(c *gin.Context) {
}
// TODO (jmorganca): encode() should not strip special tokens
tokens, err := loaded.llama.Tokenize(c.Request.Context(), p)
tokens, err := loaded.runner.Encode(c.Request.Context(), p)
if err != nil {
ch <- gin.H{"error": err.Error()}
return
@@ -353,13 +344,13 @@ func GenerateHandler(c *gin.Context) {
}
// Start prediction
req := llm.CompletionRequest{
predictReq := llm.PredictOpts{
Prompt: prompt,
Format: req.Format,
Images: images,
Options: opts,
}
if err := loaded.llama.Completion(c.Request.Context(), req, fn); err != nil {
if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()
@@ -480,7 +471,7 @@ func EmbeddingsHandler(c *gin.Context) {
return
}
embedding, err := loaded.llama.Embedding(c.Request.Context(), req.Prompt)
embedding, err := loaded.runner.Embedding(c.Request.Context(), req.Prompt)
if err != nil {
slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
@@ -651,7 +642,7 @@ func CreateModelHandler(c *gin.Context) {
ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel()
if err := CreateModel(ctx, model, filepath.Dir(req.Path), req.Quantization, commands, fn); err != nil {
if err := CreateModel(ctx, model, filepath.Dir(req.Path), commands, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()
@@ -917,24 +908,6 @@ func HeadBlobHandler(c *gin.Context) {
}
func CreateBlobHandler(c *gin.Context) {
path, err := GetBlobsPath(c.Param("digest"))
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
_, err = os.Stat(path)
switch {
case errors.Is(err, os.ErrNotExist):
// noop
case err != nil:
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
default:
c.Status(http.StatusOK)
return
}
layer, err := NewLayer(c.Request.Body, "")
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -1040,14 +1013,16 @@ func allowedHostsMiddleware(addr net.Addr) gin.HandlerFunc {
}
func (s *Server) GenerateRoutes() http.Handler {
var origins []string
if o := os.Getenv("OLLAMA_ORIGINS"); o != "" {
origins = strings.Split(o, ",")
}
config := cors.DefaultConfig()
config.AllowWildcard = true
config.AllowBrowserExtensions = true
if allowedOrigins := strings.Trim(os.Getenv("OLLAMA_ORIGINS"), "\"'"); allowedOrigins != "" {
config.AllowOrigins = strings.Split(allowedOrigins, ",")
}
config.AllowOrigins = origins
for _, allowOrigin := range defaultAllowOrigins {
config.AllowOrigins = append(config.AllowOrigins,
fmt.Sprintf("http://%s", allowOrigin),
@@ -1150,7 +1125,9 @@ func Serve(ln net.Listener) error {
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-signals
unload()
if loaded.runner != nil {
loaded.runner.Close()
}
gpu.Cleanup()
os.Exit(0)
}()
@@ -1221,7 +1198,7 @@ func streamResponse(c *gin.Context, ch chan any) {
// ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
func chatPrompt(ctx context.Context, template string, messages []api.Message, numCtx int) (string, error) {
encode := func(s string) ([]int, error) {
return loaded.llama.Tokenize(ctx, s)
return loaded.runner.Encode(ctx, s)
}
prompt, err := ChatPrompt(template, messages, numCtx, encode)
@@ -1351,8 +1328,9 @@ func ChatHandler(c *gin.Context) {
go func() {
defer close(ch)
fn := func(r llm.CompletionResponse) {
fn := func(r llm.PredictResult) {
// Update model expiration
loaded.expireAt = time.Now().Add(sessionDuration)
loaded.expireTimer.Reset(sessionDuration)
resp := api.ChatResponse{
@@ -1376,12 +1354,14 @@ func ChatHandler(c *gin.Context) {
ch <- resp
}
if err := loaded.llama.Completion(c.Request.Context(), llm.CompletionRequest{
// Start prediction
predictReq := llm.PredictOpts{
Prompt: prompt,
Format: req.Format,
Images: images,
Options: opts,
}, fn); err != nil {
}
if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
ch <- gin.H{"error": err.Error()}
}
}()

View File

@@ -3,7 +3,6 @@ package server
import (
"bytes"
"context"
"encoding/binary"
"encoding/json"
"fmt"
"io"
@@ -17,6 +16,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/version"
)
@@ -31,22 +31,13 @@ func Test_Routes(t *testing.T) {
}
createTestFile := func(t *testing.T, name string) string {
t.Helper()
f, err := os.CreateTemp(t.TempDir(), name)
assert.Nil(t, err)
defer f.Close()
err = binary.Write(f, binary.LittleEndian, []byte("GGUF"))
_, err = f.Write([]byte("GGUF"))
assert.Nil(t, err)
err = binary.Write(f, binary.LittleEndian, uint32(3))
assert.Nil(t, err)
err = binary.Write(f, binary.LittleEndian, uint64(0))
assert.Nil(t, err)
err = binary.Write(f, binary.LittleEndian, uint64(0))
_, err = f.Write([]byte{0x2, 0})
assert.Nil(t, err)
return f.Name()
@@ -61,7 +52,7 @@ func Test_Routes(t *testing.T) {
fn := func(resp api.ProgressResponse) {
t.Logf("Status: %s", resp.Status)
}
err = CreateModel(context.TODO(), name, "", "", commands, fn)
err = CreateModel(context.TODO(), name, "", commands, fn)
assert.Nil(t, err)
}
@@ -210,7 +201,7 @@ func Test_Routes(t *testing.T) {
},
}
s := &Server{}
s := Server{}
router := s.GenerateRoutes()
httpSrv := httptest.NewServer(router)
@@ -241,3 +232,27 @@ func Test_Routes(t *testing.T) {
}
}
type MockLLM struct {
encoding []int
}
func (llm *MockLLM) Predict(ctx context.Context, pred llm.PredictOpts, fn func(llm.PredictResult)) error {
return nil
}
func (llm *MockLLM) Encode(ctx context.Context, prompt string) ([]int, error) {
return llm.encoding, nil
}
func (llm *MockLLM) Decode(ctx context.Context, tokens []int) (string, error) {
return "", nil
}
func (llm *MockLLM) Embedding(ctx context.Context, input string) ([]float64, error) {
return []float64{}, nil
}
func (llm *MockLLM) Close() {
// do nothing
}

View File

@@ -1,83 +0,0 @@
package model
import (
"fmt"
"log/slog"
"strings"
"unicode"
)
// Digest represents a digest of a model Manifest. It is a comparable value
// type and is immutable.
//
// The zero Digest is not a valid digest.
type Digest struct {
s string
}
// Type returns the digest type of the digest.
//
// Example:
//
// ParseDigest("sha256-1234").Type() // returns "sha256"
func (d Digest) Type() string {
typ, _, _ := strings.Cut(d.s, "-")
return typ
}
// String returns the digest in the form of "<digest-type>-<digest>", or the
// empty string if the digest is invalid.
func (d Digest) String() string { return d.s }
// IsValid returns true if the digest is valid (not zero).
//
// A valid digest may be created only by ParseDigest, or
// ParseName(name).Digest().
func (d Digest) IsValid() bool { return d.s != "" }
// LogValue implements slog.Value.
func (d Digest) LogValue() slog.Value {
return slog.StringValue(d.String())
}
var (
_ slog.LogValuer = Digest{}
)
// ParseDigest parses a string in the form of "<digest-type>-<digest>" into a
// Digest.
func ParseDigest(s string) Digest {
typ, digest, ok := strings.Cut(s, "-")
if !ok {
typ, digest, ok = strings.Cut(s, ":")
}
if ok && isValidDigestType(typ) && isValidHex(digest) {
return Digest{s: fmt.Sprintf("%s-%s", typ, digest)}
}
return Digest{}
}
func isValidDigestType(s string) bool {
if len(s) == 0 {
return false
}
for _, r := range s {
if !unicode.IsLower(r) && !unicode.IsDigit(r) {
return false
}
}
return true
}
func isValidHex(s string) bool {
if len(s) == 0 {
return false
}
for i := range s {
c := s[i]
if c < '0' || c > '9' && c < 'a' || c > 'f' {
return false
}
}
return true
}

View File

@@ -1,46 +0,0 @@
package model
import "testing"
var testDigests = map[string]Digest{
"": {},
"sha256-1234": {s: "sha256-1234"},
"sha256-5678": {s: "sha256-5678"},
"blake2-9abc": {s: "blake2-9abc"},
"-1234": {},
"sha256-": {},
"sha256-1234-5678": {},
"sha256-P": {}, // invalid hex
"sha256-1234P": {},
"---": {},
}
func TestDigestParse(t *testing.T) {
// Test cases.
for s, want := range testDigests {
got := ParseDigest(s)
t.Logf("ParseDigest(%q) = %#v", s, got)
if got != want {
t.Errorf("ParseDigest(%q) = %q; want %q", s, got, want)
}
}
}
func TestDigestString(t *testing.T) {
// Test cases.
for s, d := range testDigests {
want := s
if !d.IsValid() {
want = ""
}
got := d.String()
if got != want {
t.Errorf("ParseDigest(%q).String() = %q; want %q", s, got, want)
}
got = ParseDigest(s).String()
if got != want {
t.Errorf("roundtrip ParseDigest(%q).String() = %q; want %q", s, got, want)
}
}
}

View File

@@ -1,688 +0,0 @@
package model
import (
"cmp"
"errors"
"fmt"
"hash/maphash"
"io"
"log/slog"
"path/filepath"
"slices"
"strings"
"sync"
"github.com/ollama/ollama/types/structs"
)
// Errors
var (
// ErrInvalidName, ErrIncompleteName, and ErrInvalidDigest are not
// used by this package, but are exported so that other packages can
// use them, instead of defining their own errors for them.
ErrInvalidName = errors.New("invalid model name")
ErrIncompleteName = errors.New("incomplete model name")
ErrInvalidDigest = errors.New("invalid digest")
)
// Defaults
const (
// MaskDefault is the default mask used by [Name.DisplayShortest].
MaskDefault = "registry.ollama.ai/library/?:latest"
// MaskNothing is a mask that masks nothing.
MaskNothing = "?/?/?:?"
// DefaultFill is the default fill used by [ParseName].
FillDefault = "registry.ollama.ai/library/?:latest+Q4_0"
// FillNothing is a fill that fills nothing.
FillNothing = "?/?/?:?+?"
)
const MaxNamePartLen = 128
type PartKind int
// Levels of concreteness
const (
// Each value aligns with its index in the Name.parts array.
PartHost PartKind = iota
PartNamespace
PartModel
PartTag
PartBuild
PartDigest
// NumParts is the number of parts in a Name. In this list, it must
// follow the final part.
NumParts
PartExtraneous = -1
)
var kindNames = map[PartKind]string{
PartHost: "Host",
PartNamespace: "Namespace",
PartModel: "Name",
PartTag: "Tag",
PartBuild: "Build",
PartDigest: "Digest",
}
func (k PartKind) String() string {
return cmp.Or(kindNames[k], "Unknown")
}
// Name is an opaque reference to a model. It holds the parts of a model
// with the case preserved, but is not directly comparable with other Names
// since model names can be represented with different casing depending on
// the use case. For instance, "Mistral" and "mistral" are the same model
// but each version may have come from different sources (e.g. copied from a
// Web page, or from a file path).
//
// Valid Names can ONLY be constructed by calling [ParseName].
//
// A Name is valid if and only if is have a valid Model part. The other parts
// are optional.
//
// A Name is considered "complete" if it has all parts present. To check if a
// Name is complete, use [Name.IsComplete].
//
// To compare two names in a case-insensitive manner, use [Name.EqualFold].
//
// The parts of a Name are:
//
// - Host: the domain of the model (optional)
// - Namespace: the namespace of the model (optional)
// - Model: the name of the model (required)
// - Tag: the tag of the model (optional)
// - Build: the build of the model; usually the quantization or "file type" (optional)
//
// The parts can be obtained in their original form by calling [Name.Parts].
//
// To check if a Name has at minimum a valid model part, use [Name.IsValid].
type Name struct {
_ structs.Incomparable
parts [NumParts]string // host, namespace, model, tag, build, digest
// TODO(bmizerany): track offsets and hold s (raw string) here? We
// could pack the offsets all into a single uint64 since the first
// parts take less bits since their max offset is less than the max
// offset of the next part. This would save a ton of bytes per Name
// and mean zero allocations for String.
}
// ParseName parses s into a Name, and returns the result of filling it with
// defaults. The input string must be a valid string
// representation of a model name in the form:
//
// [host/][namespace/]<model>[:tag][+build][@<digest-type>-<digest>]
//
// The name part is required, all others are optional. If a part is missing,
// it is left empty in the returned Name. If a part is invalid, the zero Ref
// value is returned.
//
// The build part is normalized to uppercase.
//
// Examples of valid paths:
//
// "example.com/library/mistral:7b+x"
// "example.com/eva/mistral:7b+Q4_0"
// "mistral:7b+x"
// "example.com/mike/mistral:latest+Q4_0"
// "example.com/bruce/mistral:latest"
// "example.com/pdevine/thisisfine:7b+Q4_0@sha256-1234567890abcdef"
//
// Examples of invalid paths:
//
// "example.com/mistral:7b+"
// "example.com/mistral:7b+Q4_0+"
// "x/y/z/z:8n+I"
// ""
//
// It returns the zero value if any part is invalid.
//
// # Fills
//
// For any valid s, the fill string is used to fill in missing parts of the
// Name. The fill string must be a valid Name with the exception that any part
// may be the string ("?"), which will not be considered for filling.
func ParseName(s, fill string) Name {
var r Name
parts(s)(func(kind PartKind, part string) bool {
if kind == PartDigest && !ParseDigest(part).IsValid() {
r = Name{}
return false
}
if kind == PartExtraneous || !isValidPart(kind, part) {
r = Name{}
return false
}
r.parts[kind] = part
return true
})
if r.IsValid() || r.IsResolved() {
return fillName(r, fill)
}
return Name{}
}
func parseMask(s string) Name {
var r Name
parts(s)(func(kind PartKind, part string) bool {
if part == "?" {
// mask part; treat as empty but valid
return true
}
if !isValidPart(kind, part) {
panic(fmt.Errorf("invalid mask part %s: %q", kind, part))
}
r.parts[kind] = part
return true
})
return r
}
func MustParseName(s, fill string) Name {
r := ParseName(s, fill)
if !r.IsValid() {
panic("invalid Name: " + s)
}
return r
}
// fillName fills in the missing parts of dst with the parts of src.
//
// The returned Name will only be valid if dst is valid.
//
// It skipps fill parts that are "?".
func fillName(r Name, fill string) Name {
fill = cmp.Or(fill, FillDefault)
f := parseMask(fill)
if fill != FillNothing && f.IsZero() {
panic("invalid fill")
}
for i := range r.parts {
if f.parts[i] == "?" {
continue
}
r.parts[i] = cmp.Or(r.parts[i], f.parts[i])
}
return r
}
// WithBuild returns a copy of r with the build set to the given string.
func (r Name) WithBuild(build string) Name {
r.parts[PartBuild] = build
return r
}
func (r Name) WithDigest(digest Digest) Name {
r.parts[PartDigest] = digest.String()
return r
}
var mapHashSeed = maphash.MakeSeed()
// MapHash returns a case insensitive hash for use in maps and equality
// checks. For a convenient way to compare names, use [Name.EqualFold].
//
//nolint:errcheck
func (r Name) MapHash() uint64 {
// correctly hash the parts with case insensitive comparison
var h maphash.Hash
h.SetSeed(mapHashSeed)
for _, part := range r.parts {
// downcase the part for hashing
for i := range part {
c := part[i]
if c >= 'A' && c <= 'Z' {
c = c - 'A' + 'a'
}
h.WriteByte(c)
}
}
return h.Sum64()
}
func (r Name) slice(from, to PartKind) Name {
var v Name
copy(v.parts[from:to+1], r.parts[from:to+1])
return v
}
// DisplayShortest returns the shortest possible, masked display string in form:
//
// [host/][<namespace>/]<model>[:<tag>]
//
// # Masks
//
// The mask is a string that specifies which parts of the name to omit based
// on case-insensitive comparison. [Name.DisplayShortest] omits parts of the name
// that are the same as the mask, moving from left to right until the first
// unequal part is found. It then moves right to left until the first unequal
// part is found. The result is the shortest possible display string.
//
// Unlike a [Name] the mask can contain "?" characters which are treated as
// wildcards. A "?" will never match a part of the name, since a valid name
// can never contain a "?" character.
//
// For example: Given a Name ("registry.ollama.ai/library/mistral:latest") masked
// with ("registry.ollama.ai/library/?:latest") will produce the display string
// ("mistral").
//
// If mask is the empty string, then [MaskDefault] is used.
//
// DisplayShortest panics if the mask is not the empty string, MaskNothing, and
// invalid.
//
// # Builds
//
// For now, DisplayShortest does consider the build or return one in the
// result. We can lift this restriction when needed.
func (r Name) DisplayShortest(mask string) string {
mask = cmp.Or(mask, MaskDefault)
d := parseMask(mask)
if mask != MaskNothing && r.IsZero() {
panic("invalid Name")
}
for i := range PartTag {
if !strings.EqualFold(r.parts[i], d.parts[i]) {
break
}
r.parts[i] = ""
}
for i := PartTag; i >= 0; i-- {
if !strings.EqualFold(r.parts[i], d.parts[i]) {
break
}
r.parts[i] = ""
}
return r.slice(PartHost, PartTag).DisplayLong()
}
// DisplayLongest returns the result of r.DisplayShortest(MaskNothing).
func (r Name) DisplayLongest() string {
return r.DisplayShortest(MaskNothing)
}
var seps = [...]string{
PartHost: "/",
PartNamespace: "/",
PartModel: ":",
PartTag: "+",
PartBuild: "@",
PartDigest: "",
}
// WriteTo implements io.WriterTo. It writes the fullest possible display
// string in form:
//
// <host>/<namespace>/<model>:<tag>+<build>@<digest-type>-<digest>
//
// Missing parts and their separators are not written.
//
// The full digest is always prefixed with "@". That is if [Name.IsValid]
// reports false and [Name.IsResolved] reports true, then the string is
// returned as "@<digest-type>-<digest>".
func (r Name) writeTo(w io.StringWriter) error {
var partsWritten int
for i := range r.parts {
if r.parts[i] == "" {
continue
}
if partsWritten > 0 || i == int(PartDigest) {
if _, err := w.WriteString(seps[i-1]); err != nil {
return err
}
}
if _, err := w.WriteString(r.parts[i]); err != nil {
return err
}
partsWritten++
}
return nil
}
var builderPool = sync.Pool{
New: func() interface{} {
return &strings.Builder{}
},
}
// DisplayLong returns the fullest possible display string in form:
//
// <host>/<namespace>/<model>:<tag>+<build>
//
// If any part is missing, it is omitted from the display string.
func (r Name) DisplayLong() string {
b := builderPool.Get().(*strings.Builder)
defer builderPool.Put(b)
b.Reset()
b.Grow(50) // arbitrarily long enough for most names
_ = r.writeTo(b)
return b.String()
}
// GoString implements fmt.GoStringer. It returns a string suitable for
// debugging and logging. It is similar to [Name.DisplayLong] but it always
// returns a string that includes all parts of the Name, with missing parts
// replaced with a ("?").
func (r Name) GoString() string {
for i := range r.parts {
r.parts[i] = cmp.Or(r.parts[i], "?")
}
return r.DisplayLong()
}
// LogValue implements slog.Valuer.
func (r Name) LogValue() slog.Value {
return slog.StringValue(r.GoString())
}
// IsComplete reports whether the Name is fully qualified. That is it has a
// domain, namespace, name, tag, and build.
func (r Name) IsComplete() bool {
return !slices.Contains(r.parts[:PartDigest], "")
}
// IsCompleteNoBuild is like [Name.IsComplete] but it does not require the
// build part to be present.
func (r Name) IsCompleteNoBuild() bool {
return !slices.Contains(r.parts[:PartBuild], "")
}
// IsResolved reports true if the Name has a valid digest.
//
// It is possible to have a valid Name, or a complete Name that is not
// resolved.
func (r Name) IsResolved() bool {
return r.Digest().IsValid()
}
// Digest returns the digest part of the Name, if any.
//
// If Digest returns a non-empty string, then [Name.IsResolved] will return
// true, and digest is considered valid.
func (r Name) Digest() Digest {
// This was already validated by ParseName, so we can just return it.
return Digest{r.parts[PartDigest]}
}
// EqualFold reports whether r and o are equivalent model names, ignoring
// case.
func (r Name) EqualFold(o Name) bool {
return r.CompareFold(o) == 0
}
// CompareFold performs a case-insensitive cmp.Compare on r and o.
//
// This can be used with [slices.SortFunc].
//
// For simple equality checks, use [Name.EqualFold].
func (r Name) CompareFold(o Name) int {
return slices.CompareFunc(r.parts[:], o.parts[:], compareFold)
}
func compareFold(a, b string) int {
return slices.CompareFunc([]rune(a), []rune(b), func(a, b rune) int {
return cmp.Compare(downcase(a), downcase(b))
})
}
func downcase(r rune) rune {
if r >= 'A' && r <= 'Z' {
return r - 'A' + 'a'
}
return r
}
func (r Name) Host() string { return r.parts[PartHost] }
func (r Name) Namespace() string { return r.parts[PartNamespace] }
func (r Name) Model() string { return r.parts[PartModel] }
func (r Name) Build() string { return r.parts[PartBuild] }
func (r Name) Tag() string { return r.parts[PartTag] }
// iter_Seq2 is a iter.Seq2 defined here to avoid the current build
// restrictions in the go1.22 iter package requiring the
// goexperiment.rangefunc tag to be set via the GOEXPERIMENT=rangefunc flag,
// which we are not yet ready to support.
//
// Once we are ready to support rangefunc, this can be removed and replaced
// with the iter.Seq2 type.
type iter_Seq2[A, B any] func(func(A, B) bool)
// Parts returns a sequence of the parts of a Name string from most specific
// to least specific.
//
// It normalizes the input string by removing "http://" and "https://" only.
// No other normalizations are performed.
func parts(s string) iter_Seq2[PartKind, string] {
return func(yield func(PartKind, string) bool) {
if strings.HasPrefix(s, "http://") {
s = strings.TrimPrefix(s, "http://")
} else {
s = strings.TrimPrefix(s, "https://")
}
if len(s) > MaxNamePartLen || len(s) == 0 {
return
}
numConsecutiveDots := 0
partLen := 0
state, j := PartDigest, len(s)
for i := len(s) - 1; i >= 0; i-- {
if partLen++; partLen > MaxNamePartLen {
// catch a part that is too long early, so
// we don't keep spinning on it, waiting for
// an isInValidPart check which would scan
// over it again.
yield(state, s[i+1:j])
return
}
switch s[i] {
case '@':
switch state {
case PartDigest:
if !yield(PartDigest, s[i+1:j]) {
return
}
if i == 0 {
// This is the form
// "@<digest>" which is valid.
//
// We're done.
return
}
state, j, partLen = PartBuild, i, 0
default:
yield(PartExtraneous, s[i+1:j])
return
}
case '+':
switch state {
case PartBuild, PartDigest:
if !yield(PartBuild, s[i+1:j]) {
return
}
state, j, partLen = PartTag, i, 0
default:
yield(PartExtraneous, s[i+1:j])
return
}
case ':':
switch state {
case PartTag, PartBuild, PartDigest:
if !yield(PartTag, s[i+1:j]) {
return
}
state, j, partLen = PartModel, i, 0
default:
yield(PartExtraneous, s[i+1:j])
return
}
case '/':
switch state {
case PartModel, PartTag, PartBuild, PartDigest:
if !yield(PartModel, s[i+1:j]) {
return
}
state, j = PartNamespace, i
case PartNamespace:
if !yield(PartNamespace, s[i+1:j]) {
return
}
state, j, partLen = PartHost, i, 0
default:
yield(PartExtraneous, s[i+1:j])
return
}
default:
if s[i] == '.' {
if numConsecutiveDots++; numConsecutiveDots > 1 {
yield(state, "")
return
}
} else {
numConsecutiveDots = 0
}
}
}
if state <= PartNamespace {
yield(state, s[:j])
} else {
yield(PartModel, s[:j])
}
}
}
func (r Name) IsZero() bool {
return r.parts == [NumParts]string{}
}
// IsValid reports if a model has at minimum a valid model part.
func (r Name) IsValid() bool {
// Parts ensures we only have valid parts, so no need to validate
// them here, only check if we have a name or not.
return r.parts[PartModel] != ""
}
// ParseNameFromURLPath parses forms of a URL path into a Name. Specifically,
// it trims any leading "/" and then calls [ParseName] with fill.
func ParseNameFromURLPath(s, fill string) Name {
s = strings.TrimPrefix(s, "/")
return ParseName(s, fill)
}
// URLPath returns a complete, canonicalized, relative URL path using the parts of a
// complete Name.
//
// The parts maintain their original case.
//
// Example:
//
// ParseName("example.com/namespace/model:tag+build").URLPath() // returns "/example.com/namespace/model:tag"
func (r Name) URLPath() string {
return r.DisplayShortest(MaskNothing)
}
// ParseNameFromFilepath parses a file path into a Name. The input string must be a
// valid file path representation of a model name in the form:
//
// host/namespace/model/tag/build
//
// The zero valid is returned if s does not contain all path elements
// leading up to the model part, or if any path element is an invalid part
// for the its corresponding part kind.
//
// The fill string is used to fill in missing parts of any constructed Name.
// See [ParseName] for more information on the fill string.
func ParseNameFromFilepath(s, fill string) Name {
var r Name
for i := range PartBuild + 1 {
part, rest, _ := strings.Cut(s, string(filepath.Separator))
if !isValidPart(i, part) {
return Name{}
}
r.parts[i] = part
s = rest
if s == "" {
break
}
}
if s != "" {
return Name{}
}
if !r.IsValid() {
return Name{}
}
return fillName(r, fill)
}
// Filepath returns a complete, canonicalized, relative file path using the
// parts of a complete Name.
//
// Each parts is downcased, except for the build part which is upcased.
//
// Example:
//
// ParseName("example.com/namespace/model:tag+build").Filepath() // returns "example.com/namespace/model/tag/BUILD"
func (r Name) Filepath() string {
for i := range r.parts {
if PartKind(i) == PartBuild {
r.parts[i] = strings.ToUpper(r.parts[i])
} else {
r.parts[i] = strings.ToLower(r.parts[i])
}
}
return filepath.Join(r.parts[:]...)
}
// FilepathNoBuild returns a complete, canonicalized, relative file path using
// the parts of a complete Name, but without the build part.
func (r Name) FilepathNoBuild() string {
for i := range PartBuild {
r.parts[i] = strings.ToLower(r.parts[i])
}
return filepath.Join(r.parts[:PartBuild]...)
}
// isValidPart reports if s contains all valid characters for the given
// part kind.
func isValidPart(kind PartKind, s string) bool {
if s == "" {
return false
}
var consecutiveDots int
for _, c := range []byte(s) {
if c == '.' {
if consecutiveDots++; consecutiveDots >= 2 {
return false
}
} else {
consecutiveDots = 0
}
if !isValidByteFor(kind, c) {
return false
}
}
return true
}
func isValidByteFor(kind PartKind, c byte) bool {
if kind == PartNamespace && c == '.' {
return false
}
if c == '.' || c == '-' {
return true
}
if c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9' || c == '_' {
return true
}
return false
}

View File

@@ -1,708 +0,0 @@
package model
import (
"bytes"
"cmp"
"fmt"
"log/slog"
"path/filepath"
"slices"
"strings"
"testing"
)
type fields struct {
host, namespace, model, tag, build string
digest string
}
func fieldsFromName(p Name) fields {
return fields{
host: p.parts[PartHost],
namespace: p.parts[PartNamespace],
model: p.parts[PartModel],
tag: p.parts[PartTag],
build: p.parts[PartBuild],
digest: p.parts[PartDigest],
}
}
var testNames = map[string]fields{
"mistral:latest": {model: "mistral", tag: "latest"},
"mistral": {model: "mistral"},
"mistral:30B": {model: "mistral", tag: "30B"},
"mistral:7b": {model: "mistral", tag: "7b"},
"mistral:7b+Q4_0": {model: "mistral", tag: "7b", build: "Q4_0"},
"mistral+KQED": {model: "mistral", build: "KQED"},
"mistral.x-3:7b+Q4_0": {model: "mistral.x-3", tag: "7b", build: "Q4_0"},
"mistral:7b+q4_0": {model: "mistral", tag: "7b", build: "q4_0"},
"llama2": {model: "llama2"},
"user/model": {namespace: "user", model: "model"},
"example.com/ns/mistral:7b+Q4_0": {host: "example.com", namespace: "ns", model: "mistral", tag: "7b", build: "Q4_0"},
"example.com/ns/mistral:7b+X": {host: "example.com", namespace: "ns", model: "mistral", tag: "7b", build: "X"},
// invalid digest
"mistral:latest@invalid256-": {},
"mistral:latest@-123": {},
"mistral:latest@!-123": {},
"mistral:latest@1-!": {},
"mistral:latest@": {},
// resolved
"x@sha123-1": {model: "x", digest: "sha123-1"},
"@sha456-2": {digest: "sha456-2"},
"@@sha123-1": {},
// preserves case for build
"x+b": {model: "x", build: "b"},
// invalid (includes fuzzing trophies)
" / / : + ": {},
" / : + ": {},
" : + ": {},
" + ": {},
" : ": {},
" / ": {},
" /": {},
"/ ": {},
"/": {},
":": {},
"+": {},
// (".") in namepsace is not allowed
"invalid.com/7b+x": {},
"invalid:7b+Q4_0:latest": {},
"in valid": {},
"invalid/y/z/foo": {},
"/0": {},
"0 /0": {},
"0 /": {},
"0/": {},
":/0": {},
"+0/00000": {},
"0+.\xf2\x80\xf6\x9d00000\xe5\x99\xe6\xd900\xd90\xa60\x91\xdc0\xff\xbf\x99\xe800\xb9\xdc\xd6\xc300\x970\xfb\xfd0\xe0\x8a\xe1\xad\xd40\x9700\xa80\x980\xdd0000\xb00\x91000\xfe0\x89\x9b\x90\x93\x9f0\xe60\xf7\x84\xb0\x87\xa5\xff0\xa000\x9a\x85\xf6\x85\xfe\xa9\xf9\xe9\xde00\xf4\xe0\x8f\x81\xad\xde00\xd700\xaa\xe000000\xb1\xee0\x91": {},
"0//0": {},
"m+^^^": {},
"file:///etc/passwd": {},
"file:///etc/passwd:latest": {},
"file:///etc/passwd:latest+u": {},
":x": {},
"+x": {},
"x+": {},
// Disallow ("\.+") in any part to prevent path traversal anywhere
// we convert the name to a path.
"../etc/passwd": {},
".../etc/passwd": {},
"./../passwd": {},
"./0+..": {},
strings.Repeat("a", MaxNamePartLen): {model: strings.Repeat("a", MaxNamePartLen)},
strings.Repeat("a", MaxNamePartLen+1): {},
}
// TestConsecutiveDots tests that consecutive dots are not allowed in any
// part, to avoid path traversal. There also are some tests in testNames, but
// this test is more exhaustive and exists to emphasize the importance of
// preventing path traversal.
func TestNameConsecutiveDots(t *testing.T) {
for i := 1; i < 10; i++ {
s := strings.Repeat(".", i)
if i > 1 {
if g := ParseName(s, FillNothing).DisplayLong(); g != "" {
t.Errorf("ParseName(%q) = %q; want empty string", s, g)
}
} else {
if g := ParseName(s, FillNothing).DisplayLong(); g != s {
t.Errorf("ParseName(%q) = %q; want %q", s, g, s)
}
}
}
}
func TestNameParts(t *testing.T) {
var p Name
if w, g := int(NumParts), len(p.parts); w != g {
t.Errorf("Parts() = %d; want %d", g, w)
}
}
func TestNamePartString(t *testing.T) {
if g := PartKind(-2).String(); g != "Unknown" {
t.Errorf("Unknown part = %q; want %q", g, "Unknown")
}
for kind, name := range kindNames {
if g := kind.String(); g != name {
t.Errorf("%s = %q; want %q", kind, g, name)
}
}
}
func TestParseName(t *testing.T) {
for baseName, want := range testNames {
for _, prefix := range []string{"", "https://", "http://"} {
// We should get the same results with or without the
// http(s) prefixes
s := prefix + baseName
t.Run(s, func(t *testing.T) {
name := ParseName(s, FillNothing)
got := fieldsFromName(name)
if got != want {
t.Errorf("ParseName(%q) = %q; want %q", s, got, want)
}
// test round-trip
if !ParseName(name.DisplayLong(), FillNothing).EqualFold(name) {
t.Errorf("ParseName(%q).String() = %s; want %s", s, name.DisplayLong(), baseName)
}
})
}
}
}
func TestParseNameFill(t *testing.T) {
cases := []struct {
in string
fill string
want string
}{
{"mistral", "example.com/library/?:latest+Q4_0", "example.com/library/mistral:latest+Q4_0"},
{"mistral", "example.com/library/?:latest", "example.com/library/mistral:latest"},
{"llama2:x", "example.com/library/?:latest+Q4_0", "example.com/library/llama2:x+Q4_0"},
// Invalid
{"", "example.com/library/?:latest+Q4_0", ""},
{"llama2:?", "example.com/library/?:latest+Q4_0", ""},
}
for _, tt := range cases {
t.Run(tt.in, func(t *testing.T) {
name := ParseName(tt.in, tt.fill)
if g := name.DisplayLong(); g != tt.want {
t.Errorf("ParseName(%q, %q) = %q; want %q", tt.in, tt.fill, g, tt.want)
}
})
}
t.Run("invalid fill", func(t *testing.T) {
defer func() {
if recover() == nil {
t.Fatal("expected panic")
}
}()
ParseName("x", "^")
})
}
func TestParseNameHTTPDoublePrefixStrip(t *testing.T) {
cases := []string{
"http://https://valid.com/valid/valid:latest",
"https://http://valid.com/valid/valid:latest",
}
for _, s := range cases {
t.Run(s, func(t *testing.T) {
name := ParseName(s, FillNothing)
if name.IsValid() {
t.Errorf("expected invalid path; got %#v", name)
}
})
}
}
func TestCompleteWithAndWithoutBuild(t *testing.T) {
cases := []struct {
in string
complete bool
completeNoBuild bool
}{
{"", false, false},
{"incomplete/mistral:7b+x", false, false},
{"incomplete/mistral:7b+Q4_0", false, false},
{"incomplete:7b+x", false, false},
{"complete.com/x/mistral:latest+Q4_0", true, true},
{"complete.com/x/mistral:latest", false, true},
}
for _, tt := range cases {
t.Run(tt.in, func(t *testing.T) {
p := ParseName(tt.in, FillNothing)
t.Logf("ParseName(%q) = %#v", tt.in, p)
if g := p.IsComplete(); g != tt.complete {
t.Errorf("Complete(%q) = %v; want %v", tt.in, g, tt.complete)
}
if g := p.IsCompleteNoBuild(); g != tt.completeNoBuild {
t.Errorf("CompleteNoBuild(%q) = %v; want %v", tt.in, g, tt.completeNoBuild)
}
})
}
// Complete uses Parts which returns a slice, but it should be
// inlined when used in Complete, preventing any allocations or
// escaping to the heap.
allocs := testing.AllocsPerRun(1000, func() {
keep(ParseName("complete.com/x/mistral:latest+Q4_0", FillNothing).IsComplete())
})
if allocs > 0 {
t.Errorf("Complete allocs = %v; want 0", allocs)
}
}
func TestNameLogValue(t *testing.T) {
cases := []string{
"example.com/library/mistral:latest+Q4_0",
"mistral:latest",
"mistral:7b+Q4_0",
}
for _, s := range cases {
t.Run(s, func(t *testing.T) {
var b bytes.Buffer
log := slog.New(slog.NewTextHandler(&b, nil))
name := ParseName(s, FillNothing)
log.Info("", "name", name)
want := fmt.Sprintf("name=%s", name.GoString())
got := b.String()
if !strings.Contains(got, want) {
t.Errorf("expected log output to contain %q; got %q", want, got)
}
})
}
}
func TestNameGoString(t *testing.T) {
cases := []struct {
name string
in string
wantString string
wantGoString string // default is tt.in
}{
{
name: "Complete Name",
in: "example.com/library/mistral:latest+Q4_0",
wantGoString: "example.com/library/mistral:latest+Q4_0@?",
},
{
name: "Short Name",
in: "mistral:latest",
wantGoString: "?/?/mistral:latest+?@?",
},
{
name: "Long Name",
in: "library/mistral:latest",
wantGoString: "?/library/mistral:latest+?@?",
},
{
name: "Case Preserved",
in: "Library/Mistral:Latest",
wantGoString: "?/Library/Mistral:Latest+?@?",
},
{
name: "With digest",
in: "Library/Mistral:Latest@sha256-123456",
wantGoString: "?/Library/Mistral:Latest+?@sha256-123456",
},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
p := ParseName(tt.in, FillNothing)
tt.wantGoString = cmp.Or(tt.wantGoString, tt.in)
if g := fmt.Sprintf("%#v", p); g != tt.wantGoString {
t.Errorf("GoString() = %q; want %q", g, tt.wantGoString)
}
})
}
}
func TestDisplayLongest(t *testing.T) {
g := ParseName("example.com/library/mistral:latest+Q4_0", FillNothing).DisplayLongest()
if g != "example.com/library/mistral:latest" {
t.Errorf("got = %q; want %q", g, "example.com/library/mistral:latest")
}
}
func TestDisplayShortest(t *testing.T) {
cases := []struct {
in string
mask string
want string
wantPanic bool
}{
{"example.com/library/mistral:latest+Q4_0", "example.com/library/_:latest", "mistral", false},
{"example.com/library/mistral:latest+Q4_0", "example.com/_/_:latest", "library/mistral", false},
{"example.com/library/mistral:latest+Q4_0", "", "example.com/library/mistral", false},
{"example.com/library/mistral:latest+Q4_0", "", "example.com/library/mistral", false},
// case-insensitive
{"Example.com/library/mistral:latest+Q4_0", "example.com/library/_:latest", "mistral", false},
{"example.com/Library/mistral:latest+Q4_0", "example.com/library/_:latest", "mistral", false},
{"example.com/library/Mistral:latest+Q4_0", "example.com/library/_:latest", "Mistral", false},
{"example.com/library/mistral:Latest+Q4_0", "example.com/library/_:latest", "mistral", false},
{"example.com/library/mistral:Latest+q4_0", "example.com/library/_:latest", "mistral", false},
// zero value
{"", MaskDefault, "", true},
// invalid mask
{"example.com/library/mistral:latest+Q4_0", "example.com/mistral", "", true},
// DefaultMask
{"registry.ollama.ai/library/mistral:latest+Q4_0", MaskDefault, "mistral", false},
// Auto-Fill
{"x", "example.com/library/_:latest", "x", false},
{"x", "example.com/library/_:latest+Q4_0", "x", false},
{"x/y:z", "a.com/library/_:latest+Q4_0", "x/y:z", false},
{"x/y:z", "a.com/library/_:latest+Q4_0", "x/y:z", false},
}
for _, tt := range cases {
t.Run("", func(t *testing.T) {
defer func() {
if tt.wantPanic {
if recover() == nil {
t.Errorf("expected panic")
}
}
}()
p := ParseName(tt.in, FillNothing)
t.Logf("ParseName(%q) = %#v", tt.in, p)
if g := p.DisplayShortest(tt.mask); g != tt.want {
t.Errorf("got = %q; want %q", g, tt.want)
}
})
}
}
func TestParseNameAllocs(t *testing.T) {
allocs := testing.AllocsPerRun(1000, func() {
keep(ParseName("example.com/mistral:7b+Q4_0", FillNothing))
})
if allocs > 0 {
t.Errorf("ParseName allocs = %v; want 0", allocs)
}
}
func BenchmarkParseName(b *testing.B) {
b.ReportAllocs()
for range b.N {
keep(ParseName("example.com/mistral:7b+Q4_0", FillNothing))
}
}
func FuzzParseNameFromFilepath(f *testing.F) {
f.Add("example.com/library/mistral/7b/Q4_0")
f.Add("example.com/../mistral/7b/Q4_0")
f.Add("example.com/x/../7b/Q4_0")
f.Add("example.com/x/../7b")
f.Fuzz(func(t *testing.T, s string) {
name := ParseNameFromFilepath(s, FillNothing)
if strings.Contains(s, "..") && !name.IsZero() {
t.Fatalf("non-zero value for path with '..': %q", s)
}
if name.IsValid() == name.IsZero() {
t.Errorf("expected valid path to be non-zero value; got %#v", name)
}
})
}
func FuzzParseName(f *testing.F) {
f.Add("example.com/mistral:7b+Q4_0")
f.Add("example.com/mistral:7b+q4_0")
f.Add("example.com/mistral:7b+x")
f.Add("x/y/z:8n+I")
f.Add(":x")
f.Add("@sha256-123456")
f.Add("example.com/mistral:latest+Q4_0@sha256-123456")
f.Add(":@!@")
f.Add("...")
f.Fuzz(func(t *testing.T, s string) {
r0 := ParseName(s, FillNothing)
if strings.Contains(s, "..") && !r0.IsZero() {
t.Fatalf("non-zero value for path with '..': %q", s)
}
if !r0.IsValid() && !r0.IsResolved() {
if !r0.EqualFold(Name{}) {
t.Errorf("expected invalid path to be zero value; got %#v", r0)
}
t.Skipf("invalid path: %q", s)
}
for _, p := range r0.parts {
if len(p) > MaxNamePartLen {
t.Errorf("part too long: %q", p)
}
}
if !strings.EqualFold(r0.DisplayLong(), s) {
t.Errorf("String() did not round-trip with case insensitivity: %q\ngot = %q\nwant = %q", s, r0.DisplayLong(), s)
}
r1 := ParseName(r0.DisplayLong(), FillNothing)
if !r0.EqualFold(r1) {
t.Errorf("round-trip mismatch: %+v != %+v", r0, r1)
}
})
}
func TestNameStringAllocs(t *testing.T) {
name := ParseName("example.com/ns/mistral:latest+Q4_0", FillNothing)
allocs := testing.AllocsPerRun(1000, func() {
keep(name.DisplayLong())
})
if allocs > 1 {
t.Errorf("String allocs = %v; want 0", allocs)
}
}
func TestNamePath(t *testing.T) {
cases := []struct {
in string
want string
}{
{"example.com/library/mistral:latest+Q4_0", "example.com/library/mistral:latest"},
// incomplete
{"example.com/library/mistral:latest", "example.com/library/mistral:latest"},
{"", ""},
}
for _, tt := range cases {
t.Run(tt.in, func(t *testing.T) {
p := ParseName(tt.in, FillNothing)
t.Logf("ParseName(%q) = %#v", tt.in, p)
if g := p.URLPath(); g != tt.want {
t.Errorf("got = %q; want %q", g, tt.want)
}
})
}
}
func TestNameFilepath(t *testing.T) {
cases := []struct {
in string
want string
wantNoBuild string
}{
{
in: "example.com/library/mistral:latest+Q4_0",
want: "example.com/library/mistral/latest/Q4_0",
wantNoBuild: "example.com/library/mistral/latest",
},
{
in: "Example.Com/Library/Mistral:Latest+Q4_0",
want: "example.com/library/mistral/latest/Q4_0",
wantNoBuild: "example.com/library/mistral/latest",
},
{
in: "Example.Com/Library/Mistral:Latest+Q4_0",
want: "example.com/library/mistral/latest/Q4_0",
wantNoBuild: "example.com/library/mistral/latest",
},
{
in: "example.com/library/mistral:latest",
want: "example.com/library/mistral/latest",
wantNoBuild: "example.com/library/mistral/latest",
},
{
in: "",
want: "",
wantNoBuild: "",
},
}
for _, tt := range cases {
t.Run(tt.in, func(t *testing.T) {
p := ParseName(tt.in, FillNothing)
t.Logf("ParseName(%q) = %#v", tt.in, p)
g := p.Filepath()
g = filepath.ToSlash(g)
if g != tt.want {
t.Errorf("got = %q; want %q", g, tt.want)
}
g = p.FilepathNoBuild()
g = filepath.ToSlash(g)
if g != tt.wantNoBuild {
t.Errorf("got = %q; want %q", g, tt.wantNoBuild)
}
})
}
}
func TestParseNameFilepath(t *testing.T) {
cases := []struct {
in string
fill string // default is FillNothing
want string
}{
{
in: "example.com/library/mistral/latest/Q4_0",
want: "example.com/library/mistral:latest+Q4_0",
},
{
in: "example.com/library/mistral/latest",
fill: "?/?/?:latest+Q4_0",
want: "example.com/library/mistral:latest+Q4_0",
},
{
in: "example.com/library/mistral",
fill: "?/?/?:latest+Q4_0",
want: "example.com/library/mistral:latest+Q4_0",
},
{
in: "example.com/library",
want: "",
},
{
in: "example.com/",
want: "",
},
{
in: "example.com/^/mistral/latest/Q4_0",
want: "",
},
{
in: "example.com/library/mistral/../Q4_0",
want: "",
},
{
in: "example.com/library/mistral/latest/Q4_0/extra",
want: "",
},
}
for _, tt := range cases {
t.Run(tt.in, func(t *testing.T) {
in := strings.ReplaceAll(tt.in, "/", string(filepath.Separator))
fill := cmp.Or(tt.fill, FillNothing)
want := ParseName(tt.want, fill)
if g := ParseNameFromFilepath(in, fill); !g.EqualFold(want) {
t.Errorf("got = %q; want %q", g.DisplayLong(), tt.want)
}
})
}
}
func TestParseNameFromPath(t *testing.T) {
cases := []struct {
in string
want string
fill string // default is FillNothing
}{
{
in: "example.com/library/mistral:latest+Q4_0",
want: "example.com/library/mistral:latest+Q4_0",
},
{
in: "/example.com/library/mistral:latest+Q4_0",
want: "example.com/library/mistral:latest+Q4_0",
},
{
in: "/example.com/library/mistral",
want: "example.com/library/mistral",
},
{
in: "/example.com/library/mistral",
fill: "?/?/?:latest+Q4_0",
want: "example.com/library/mistral:latest+Q4_0",
},
{
in: "/example.com/library",
want: "",
},
{
in: "/example.com/",
want: "",
},
{
in: "/example.com/^/mistral/latest",
want: "",
},
}
for _, tt := range cases {
t.Run(tt.in, func(t *testing.T) {
fill := cmp.Or(tt.fill, FillNothing)
if g := ParseNameFromURLPath(tt.in, fill); g.DisplayLong() != tt.want {
t.Errorf("got = %q; want %q", g.DisplayLong(), tt.want)
}
})
}
}
func ExampleName_MapHash() {
m := map[uint64]bool{}
// key 1
m[ParseName("mistral:latest+q4", FillNothing).MapHash()] = true
m[ParseName("miSTRal:latest+Q4", FillNothing).MapHash()] = true
m[ParseName("mistral:LATest+Q4", FillNothing).MapHash()] = true
// key 2
m[ParseName("mistral:LATest", FillNothing).MapHash()] = true
fmt.Println(len(m))
// Output:
// 2
}
func ExampleName_CompareFold_sort() {
names := []Name{
ParseName("mistral:latest", FillNothing),
ParseName("mistRal:7b+q4", FillNothing),
ParseName("MIstral:7b", FillNothing),
}
slices.SortFunc(names, Name.CompareFold)
for _, n := range names {
fmt.Println(n.DisplayLong())
}
// Output:
// MIstral:7b
// mistRal:7b+q4
// mistral:latest
}
func ExampleName_completeAndResolved() {
for _, s := range []string{
"x/y/z:latest+q4_0@sha123-1",
"x/y/z:latest+q4_0",
"@sha123-1",
} {
name := ParseName(s, FillNothing)
fmt.Printf("complete:%v resolved:%v digest:%s\n", name.IsComplete(), name.IsResolved(), name.Digest())
}
// Output:
// complete:true resolved:true digest:sha123-1
// complete:true resolved:false digest:
// complete:false resolved:true digest:sha123-1
}
func ExampleName_DisplayShortest() {
name := ParseName("example.com/jmorganca/mistral:latest+Q4_0", FillNothing)
fmt.Println(name.DisplayShortest("example.com/jmorganca/_:latest"))
fmt.Println(name.DisplayShortest("example.com/_/_:latest"))
fmt.Println(name.DisplayShortest("example.com/_/_:_"))
fmt.Println(name.DisplayShortest("_/_/_:_"))
// Default
name = ParseName("registry.ollama.ai/library/mistral:latest+Q4_0", FillNothing)
fmt.Println(name.DisplayShortest(""))
// Output:
// mistral
// jmorganca/mistral
// jmorganca/mistral:latest
// example.com/jmorganca/mistral:latest
// mistral
}
func keep[T any](v T) T { return v }

View File

@@ -1,2 +0,0 @@
go test fuzz v1
string("/0")

View File

@@ -1,2 +0,0 @@
go test fuzz v1
string("0//0")

View File

@@ -1,2 +0,0 @@
go test fuzz v1
string("0 /0")

View File

@@ -1,2 +0,0 @@
go test fuzz v1
string("+0/00000")

View File

@@ -1,2 +0,0 @@
go test fuzz v1
string(":")

Some files were not shown because too many files have changed in this diff Show More